{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}

{-|
Adult (AKA Census Income) dataset.

UCI ML Repository link <http://archive.ics.uci.edu/ml/datasets/Adult>
-}
module Numeric.Datasets.Adult where

import Numeric.Datasets

import Data.Csv
import GHC.Generics
import Control.Applicative
import Data.Text (Text, strip)
import Network.HTTP.Req ((/:), Scheme(..))

-- | Work class of a record (CSV column 1).
--
-- Fields are decoded with @parseDashToCamelField@, a helper that is not
-- defined in this module.
data WorkClass
  = Private
  | SelfEmpNotInc
  | SelfEmpInc
  | FederalGov
  | LocalGov
  | StateGov
  | WithoutPay
  | NeverWorked
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField WorkClass where
  parseField = parseDashToCamelField

-- | Marital status of a record (CSV column 5).
data MaritalStatus
  = MarriedCivSpouse
  | Divorced
  | NeverMarried
  | Separated
  | Widowed
  | MarriedSpouseAbsent
  | MarriedAFSpouse
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField MaritalStatus where
  -- Disabled special case, kept from the original source for reference:
  -- parseField "Married-AF-spouse" = pure MarriedAFSpouse
  parseField = parseDashToCamelField

-- | Occupation of a record (CSV column 6).
data Occupation
  = TechSupport
  | CraftRepair
  | OtherService
  | Sales
  | ExecManagerial
  | ProfSpecialty
  | HandlersCleaners
  | MachineOpInspct
  | AdmClerical
  | FarmingFishing
  | TransportMoving
  | PrivHouseServ
  | ProtectiveServ
  | ArmedForces
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Occupation where
  parseField = parseDashToCamelField

-- | Relationship of a record (CSV column 7).
data Relationship
  = Wife
  | OwnChild
  | Husband
  | NotInFamily
  | OtherRelative
  | Unmarried
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Relationship where
  parseField = parseDashToCamelField

-- | Race of a record (CSV column 8).
data Race
  = White
  | AsianPacIslander
  | AmerIndianEskimo
  | Other
  | Black
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Race where
  parseField = parseDashToCamelField

-- | Sex of a record (CSV column 9).
data Sex
  = Female
  | Male
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Sex where
  parseField = parseDashToCamelField

-- | Income bracket of a record (CSV column 14): 'GT50K' for the label
-- meaning more than 50K, 'LE50K' for the label meaning at most 50K.
data Income
  = GT50K
  | LE50K
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

-- The field is decoded as Text and surrounding whitespace is stripped before
-- matching, so the label is recognised with or without padding. Both spellings
-- of each label are accepted: plain (">50K", "<=50K") and with a trailing
-- period (">50K.", "<=50K."). Anything else fails with "unknown income".
instance FromField Income where
  parseField raw = do
    label <- strip <$> parseField raw
    case label of
      ">50K"   -> pure GT50K
      ">50K."  -> pure GT50K
      "<=50K"  -> pure LE50K
      "<=50K." -> pure LE50K
      _        -> fail "unknown income"

-- | One row of the dataset. Fields are listed in CSV column order; see the
-- 'FromRecord' instance for how each column is decoded.
data Adult = Adult
  { age :: Int
    -- ^ Column 0.
  , workClass :: Maybe WorkClass
    -- ^ Column 1; 'Nothing' when the field cannot be parsed.
  , finalWeight :: Int
    -- ^ Column 2.
  , education :: Text
    -- ^ Column 3, with surrounding whitespace stripped.
  , educationNum :: Int
    -- ^ Column 4.
  , maritalStatus :: MaritalStatus
    -- ^ Column 5.
  , occupation :: Maybe Occupation
    -- ^ Column 6; 'Nothing' when the field cannot be parsed.
  , relationship :: Relationship
    -- ^ Column 7.
  , race :: Race
    -- ^ Column 8.
  , sex :: Sex
    -- ^ Column 9.
  , capitalGain :: Int
    -- ^ Column 10.
  , capitalLoss :: Int
    -- ^ Column 11.
  , hoursPerWeek :: Int
    -- ^ Column 12.
  , nativeCountry :: Text
    -- ^ Column 13, kept exactly as it appears in the file (not stripped).
  , income :: Income
    -- ^ Column 14.
  } deriving (Show, Read, Generic)

-- Positional decoding of the 15 columns of a row.
instance FromRecord Adult where
  parseRecord v =
    Adult
      <$> v .! 0
      -- A work class that fails to parse becomes Nothing instead of
      -- failing the whole record.
      <*> (v .! 1 <|> return Nothing)
      <*> v .! 2
      <*> (strip <$> v .! 3)
      <*> v .! 4
      <*> v .! 5
      -- Same best-effort fallback for the occupation column.
      <*> (v .! 6 <|> return Nothing)
      <*> v .! 7
      <*> v .! 8
      <*> v .! 9
      <*> v .! 10
      <*> v .! 11
      <*> v .! 12
      -- NOTE(review): unlike column 3 this Text column is not stripped, so any
      -- padding present in the file is preserved. Confirm whether callers rely
      -- on that before normalising it.
      <*> v .! 13
      <*> v .! 14

-- | The main data file, @adult.data@, read as CSV from the @adult@ directory
-- under @umassMLDB@.
adult :: Dataset Adult
adult = csvDataset $ URL $ umassMLDB /: "adult" /: "adult.data"

-- | The test split, @adult.test@, read as CSV from the same directory after
-- the @dropLines 1@ preprocessing step (presumably skipping a non-CSV first
-- line; confirm against the file).
adultTestSet :: Dataset Adult
adultTestSet =
  withPreprocess (dropLines 1) $
    csvDataset $ URL $ umassMLDB /: "adult" /: "adult.test"

-- "http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test"