{-# LANGUAGE OverloadedStrings, GADTs, DataKinds #-}
module Numeric.Datasets (getDataset, Dataset(..), Source(..),
readDataset, ReadAs(..), csvRecord,
csvDataset, csvHdrDataset, csvHdrDatasetSep, csvDatasetSkipHdr,
jsonDataset,
withPreprocess, withTempDir,
dropLines, fixedWidthToCSV, removeEscQuotes, fixAmericanDecimals,
parseReadField, parseDashToCamelField,
yearToUTCTime,
umassMLDB, uciMLDB) where
import Data.Csv
import System.FilePath
import System.Directory
import Data.Hashable
import Data.Monoid
import qualified Data.ByteString.Lazy as BL
import qualified Data.Vector as V
import Data.Aeson as JSON
import Control.Applicative
import Data.Time
import Data.Default.Class (Default(..))
import Network.HTTP.Req (req, runReq, Url, (/:), http, https, Scheme(..), LbsResponse, lbsResponse, responseBody, GET(..), NoReqBody(..), HttpMethod(..))
import Data.Char (ord, toUpper)
import Text.Read (readMaybe)
import Data.Maybe (fromMaybe)
import Data.ByteString.Char8 (unpack)
import qualified Data.ByteString.Lazy.Char8 as BL8
import Data.ByteString.Lazy.Search (replace)
getDataset :: Dataset h a -> IO [a]
getDataset ds = do
dir <- tempDirForDataset ds
bs <- fmap (fromMaybe id $ preProcess ds) $ getFileFromSource dir $ source ds
return $ readDataset (readAs ds) bs
getFileFromSource ::
FilePath
-> Source h
-> IO BL.ByteString
getFileFromSource cacheDir (URL url) = do
createDirectoryIfMissing True cacheDir
let fnm = cacheDir </> "ds" <> show (hash $ show url)
ex <- doesFileExist fnm
if ex
then BL.readFile fnm
else do
rsp <- runReq def $ req GET url NoReqBody lbsResponse mempty
let bs = responseBody rsp
BL.writeFile fnm bs
return bs
getFileFromSource _ (File fnm) =
BL.readFile fnm
readDataset ::
ReadAs a
-> BL.ByteString
-> [a]
readDataset JSON bs =
case JSON.decode bs of
Just theData -> theData
Nothing -> error "failed to parse json"
readDataset (CSVRecord hhdr opts) bs =
case decodeWith opts hhdr bs of
Right theData -> V.toList theData
Left err -> error err
readDataset (CSVNamedRecord opts) bs =
case decodeByNameWith opts bs of
Right (_,theData) -> V.toList theData
Left err -> error err
tempDirForDataset :: Dataset h a -> IO FilePath
tempDirForDataset ds =
case temporaryDirectory ds of
Nothing -> getTemporaryDirectory
Just tdir -> return tdir
data Source h = URL (Url h)
| File FilePath
data Dataset h a = Dataset
{ source :: Source h
, temporaryDirectory :: Maybe FilePath
, preProcess :: Maybe (BL.ByteString -> BL.ByteString)
, readAs :: ReadAs a
}
data ReadAs a where
JSON :: FromJSON a => ReadAs a
CSVRecord :: FromRecord a => HasHeader -> DecodeOptions -> ReadAs a
CSVNamedRecord :: FromNamedRecord a => DecodeOptions -> ReadAs a
csvRecord :: FromRecord a => ReadAs a
csvRecord = CSVRecord NoHeader defaultDecodeOptions
csvDataset :: FromRecord a => Source h -> Dataset h a
csvDataset src = Dataset src Nothing Nothing csvRecord
csvDatasetSkipHdr :: FromRecord a => Source h -> Dataset h a
csvDatasetSkipHdr src = Dataset src Nothing Nothing $ CSVRecord HasHeader defaultDecodeOptions
csvHdrDataset :: FromNamedRecord a => Source h -> Dataset h a
csvHdrDataset src = Dataset src Nothing Nothing $ CSVNamedRecord defaultDecodeOptions
csvHdrDatasetSep :: FromNamedRecord a => Char -> Source h -> Dataset h a
csvHdrDatasetSep sepc src
= Dataset src Nothing Nothing
$ CSVNamedRecord defaultDecodeOptions { decDelimiter = fromIntegral (ord sepc)}
jsonDataset :: FromJSON a => Source h -> Dataset h a
jsonDataset src = Dataset src Nothing Nothing JSON
withPreprocess :: (BL8.ByteString -> BL8.ByteString) -> Dataset h a -> Dataset h a
withPreprocess preF ds = ds { preProcess = Just preF}
withTempDir :: FilePath -> Dataset h a -> Dataset h a
withTempDir dir ds = ds { temporaryDirectory = Just dir }
dashToCamelCase :: String -> String
dashToCamelCase ('-':c:cs) = toUpper c : dashToCamelCase cs
dashToCamelCase (c:cs) = c : dashToCamelCase cs
dashToCamelCase [] = []
parseDashToCamelField :: Read a => Field -> Parser a
parseDashToCamelField s =
case readMaybe (dashToCamelCase $ unpack s) of
Just wc -> pure wc
Nothing -> fail "unknown"
parseReadField :: Read a => Field -> Parser a
parseReadField s =
case readMaybe (unpack s) of
Just wc -> pure wc
Nothing -> fail "unknown"
dropLines :: Int -> BL.ByteString -> BL.ByteString
dropLines 0 s = s
dropLines n s = dropLines (n-1) $ BL.tail $ BL8.dropWhile (/='\n') s
fixAmericanDecimals :: BL.ByteString -> BL.ByteString
fixAmericanDecimals = replace ",." (",0."::BL.ByteString)
fixedWidthToCSV :: BL.ByteString -> BL.ByteString
fixedWidthToCSV = BL8.pack . fnl . BL8.unpack where
f [] = []
f (' ':cs) = ',':f (chomp cs)
f ('\n':cs) = '\n':fnl cs
f (c:cs) = c:f cs
fnl cs = f (chomp cs)
chomp (' ':cs) = chomp cs
chomp (c:cs) = c:cs
chomp [] = []
removeEscQuotes :: BL8.ByteString -> BL8.ByteString
removeEscQuotes = BL8.filter (/= '\"')
yearToUTCTime :: Double -> UTCTime
yearToUTCTime yearDbl =
let (yearn,yearFrac) = properFraction yearDbl
dayYearBegin = fromGregorian yearn 1 1
(dayn, dayFrac) = properFraction $ yearFrac * (if isLeapYear yearn then 366 else 365)
day = addDays dayn dayYearBegin
dt = secondsToDiffTime $ round $ dayFrac * 86400
in UTCTime day dt
umassMLDB :: Url 'Http
umassMLDB = http "mlr.cs.umass.edu" /: "ml" /: "machine-learning-databases"
uciMLDB :: Url 'Https
uciMLDB = https "archive.ics.uci.edu" /: "ml" /: "machine-learning-databases"