{-# LANGUAGE OverloadedStrings, GADTs, DataKinds #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Numeric.Datasets (getDataset, Dataset(..), Source(..), getDatavec, defaultTempDir, getFileFromSource,
readDataset, safeReadDataset, ReadAs(..), csvRecord,
csvDataset, csvHdrDataset, csvHdrDatasetSep, csvDatasetSkipHdr,
jsonDataset,
withPreprocess, withTempDir,
dropLines, fixedWidthToCSV, removeEscQuotes, fixAmericanDecimals,
parseReadField, parseDashToCamelField,
yearToUTCTime,
umassMLDB, uciMLDB) where
import Data.Csv
import Data.Monoid
import Data.Foldable
import Data.List (isSuffixOf)
import Data.List.NonEmpty (NonEmpty((:|)))
import qualified Data.List.NonEmpty as NE
import System.FilePath (takeExtensions, (</>))
import System.Directory
import Data.Hashable
import Data.Monoid
import qualified Data.ByteString.Lazy as BL
import Data.Aeson as JSON
import Control.Applicative
import Data.Time
import Network.HTTP.Req (req, runReq, Url, (/:), http, https, Scheme(..), LbsResponse, lbsResponse, responseBody, GET(..), NoReqBody(..), HttpMethod(..), defaultHttpConfig)
import Control.Exception.Safe
import Data.Char (ord, toUpper)
import Text.Read (readMaybe)
import Data.Maybe (fromMaybe)
import Data.ByteString.Char8 (unpack)
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy.Char8 as BL8
import Data.ByteString.Lazy.Search (replace)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Vector.Generic (Vector)
import qualified Data.Vector as VB
import qualified Data.Vector.Generic as V
import qualified Data.Attoparsec.ByteString as Atto'
import qualified Data.Attoparsec.ByteString.Lazy as Atto
-- | Fetch, preprocess and decode a dataset, returning the rows as a list.
-- List-returning convenience wrapper around 'getDatavec'.
getDataset :: (MonadThrow io, MonadIO io) => Dataset a -> io [a]
getDataset = fmap VB.toList . getDatavec
-- | Fetch, preprocess and decode a dataset into any generic 'Vector'.
--
-- Resolves the cache directory, pulls the raw file(s) from the 'Source',
-- applies the optional 'preProcess' transform to each file, then decodes
-- with the dataset's 'ReadAs' strategy.
getDatavec :: (MonadThrow io, MonadIO io, Vector v a) => Dataset a -> io (v a)
getDatavec ds = liftIO $ do
  cacheDir <- tempDirForDataset ds
  rawFiles <- getFileFromSource cacheDir (source ds)
  let prep = fromMaybe id (preProcess ds)
  safeReadDataset (readAs ds) (prep <$> rawFiles)
-- | Fetch the raw bytes backing a 'Source', one lazy 'BL.ByteString' per file.
--
-- * 'URL': the download is cached on disk under the given cache directory;
--   the cache file name is derived from the hash of the URL's 'show'
--   representation, so a changed URL maps to a fresh cache entry, and the
--   cached copy is reused on subsequent calls.
-- * 'File': reads the file directly; the cache directory is ignored.
-- * 'ImgFolder': reads no image data at all — it produces one
--   @label\<\<.\>\>path@ record per image file, to be decoded later by the
--   'ImageFolder' branch of 'safeReadDataset'.
--
-- NOTE(review): in the 'ImgFolder' case 'NE.fromList' is partial — if every
-- label directory is empty (or contains no recognised image extensions) this
-- crashes; confirm callers guarantee at least one matching image.
getFileFromSource
  :: FilePath            -- ^ cache directory for downloaded files
  -> Source              -- ^ where the data lives
  -> IO (NonEmpty BL.ByteString)
getFileFromSource cacheDir (URL url) = do
  createDirectoryIfMissing True cacheDir
  -- Cache key: hash of the URL's show-representation.
  let fnm = cacheDir </> "ds" <> show (hash $ show url)
  ex <- doesFileExist fnm
  if ex
    then (:|[]) <$> BL.readFile fnm
    else do
      rsp <- runReq defaultHttpConfig $ req GET url NoReqBody lbsResponse mempty
      let bs = responseBody rsp
      -- Populate the cache before returning the body.
      BL.writeFile fnm bs
      return (bs:|[])
getFileFromSource _ (File fnm) = (:|[]) <$> BL.readFile fnm
getFileFromSource _ (ImgFolder root labels) =
  NE.fromList <$> foldrM allImFolderData [] labels
  where
    -- Prepend the records of one label's directory to the running list.
    allImFolderData :: String -> [BL.ByteString] -> IO [BL.ByteString]
    allImFolderData label agg = (agg ++) <$> toImFolderData label
    -- List the image files under @root/label@ and tag each with its label.
    toImFolderData :: String -> IO [BL.ByteString]
    toImFolderData l = map (asBytes l) . filter hasValidExt <$> listDirectory (root </> l)
    -- Encode a (label, path) pair as @label\<\<.\>\>root/label/file@.
    asBytes :: String -> FilePath -> BL8.ByteString
    asBytes label fp = BL8.pack $ label ++ "<<.>>" ++ (root </> label </> fp)
    -- Keep only recognised raster-image extensions.
    hasValidExt :: FilePath -> Bool
    hasValidExt fp = any (`isExtensionOf` fp) ["png", "jpeg", "bitmap", "tiff"]
    -- Local equivalent of 'System.FilePath.isExtensionOf' (presumably kept
    -- for compatibility with older filepath versions); accepts the extension
    -- with or without a leading dot.
    isExtensionOf :: String -> FilePath -> Bool
    isExtensionOf ext@('.':_) = isSuffixOf ext . takeExtensions
    isExtensionOf ext = isSuffixOf ('.':ext) . takeExtensions
-- | Decode a single in-memory file with the given 'ReadAs' strategy,
-- returning the rows as a list.
--
-- Unsafe variant of 'safeReadDataset': any decoding failure is surfaced
-- via 'error'.
readDataset
  :: ReadAs a
  -> BL.ByteString
  -> [a]
readDataset ra bs =
  either (error . show) VB.toList (safeReadDataset ra (bs :| []))
-- | Decode one or more raw files according to a 'ReadAs' strategy, in any
-- 'MonadThrow'.  Decoding failures are rethrown via 'throwString'.
--
-- Every format except 'ImageFolder' accepts exactly one input file; handing
-- such a format several files produces the "more than one file" error below.
safeReadDataset :: (Vector v a, MonadThrow m) => ReadAs a -> NonEmpty BL.ByteString -> m (v a)
safeReadDataset ra bss = either throwString pure $
  case (ra, bss) of
    (JSON, bs:|[]) -> V.fromList <$> JSON.eitherDecode' bs
    (CSVRecord hhdr opts, bs:|[]) -> V.convert <$> decodeWith opts hhdr bs
    -- decodeByNameWith yields (header, rows); the header itself is dropped.
    (CSVNamedRecord opts, bs:|[]) -> V.convert . snd <$> decodeByNameWith opts bs
    (Parsable psr, bs:|[]) -> V.fromList <$> Atto.eitherResult (Atto.parse (Atto.many' psr) bs)
    -- ImageFolder is the one format fed by several inputs: each ByteString
    -- is a single @label<<.>>path@ record (see 'getFileFromSource').
    (ImageFolder labels, _) -> do
      ds <- mapM (getImFiles labels) bss
      pure $ V.fromList (toList ds)
    _ -> Left $
      if length bss > 1
        -- Only reachable for the single-file formats above.
        then "Cannot parse more than one file for this data format"
        else "impossible: logic has changed, please file this issue on dh-core"
  where
    -- Decode one @label<<.>>path@ record back into a (label, path) pair.
    getImFiles :: NonEmpty String -> BL.ByteString -> Either String (String, FilePath)
    getImFiles labels bs' = Atto.eitherResult (Atto.parse (parseTaggedFile labels) bs')
    -- Parser for @label<<.>>path@; the label must be one of the known ones.
    parseTaggedFile :: NonEmpty String -> Atto.Parser (String, FilePath)
    parseTaggedFile (l0:|ls) = do
      lbl <- Atto.choice $ Atto.string . B8.pack <$> (l0:ls)
      _ <- Atto.string "<<.>>"
      fp <- Atto.takeByteString
      pure (B8.unpack lbl, B8.unpack fp)
-- | Resolve the cache directory for a dataset: its 'temporaryDirectory'
-- if set, otherwise the system temporary directory.
tempDirForDataset :: Dataset a -> IO FilePath
tempDirForDataset ds = defaultTempDir (temporaryDirectory ds)
-- | Use the given directory when present, otherwise fall back to the
-- system temporary directory.
defaultTempDir :: Maybe FilePath -> IO FilePath
defaultTempDir = maybe getTemporaryDirectory pure
-- | Where a dataset's raw bytes come from.
data Source
  = forall h . URL (Url h)
    -- ^ Remote file; the scheme (@http@/@https@) is existentially hidden.
  | File FilePath
    -- ^ Local file on disk.
  | ImgFolder FilePath (NonEmpty String)
    -- ^ Root directory plus the label sub-directories to scan for images.
-- | A dataset: where to fetch it, where to cache it, how to transform the
-- raw bytes, and how to decode them into values of type @a@.
data Dataset a = Dataset
  { source :: Source
    -- ^ Origin of the raw data.
  , temporaryDirectory :: Maybe FilePath
    -- ^ Cache directory; 'Nothing' means the system temporary directory.
  , preProcess :: Maybe (BL.ByteString -> BL.ByteString)
    -- ^ Optional raw-bytes transform applied to each file before decoding.
  , readAs :: ReadAs a
    -- ^ Decoding strategy.
  }
-- | How to decode raw bytes into values of type @a@.
data ReadAs a where
  -- | Decode a single JSON document holding a list of @a@.
  JSON :: FromJSON a => ReadAs a
  -- | Decode CSV rows positionally, with the given header policy and options.
  CSVRecord :: FromRecord a => HasHeader -> DecodeOptions -> ReadAs a
  -- | Decode CSV rows by the names in the header row.
  CSVNamedRecord :: FromNamedRecord a => DecodeOptions -> ReadAs a
  -- | Run an attoparsec parser repeatedly over the input.
  Parsable :: Atto.Parser a -> ReadAs a
  -- | Decode @label\<\<.\>\>path@ records produced by 'ImgFolder' sources.
  ImageFolder
    :: NonEmpty String
    -> ReadAs (String, FilePath)
-- | Decode CSV rows positionally ('FromRecord'), with no header row and
-- cassava's default options.
csvRecord :: FromRecord a => ReadAs a
csvRecord = CSVRecord NoHeader defaultDecodeOptions
-- | A headerless CSV dataset with no preprocessing and the default cache
-- directory.
csvDataset :: FromRecord a => Source -> Dataset a
csvDataset src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = csvRecord
  }
-- | A CSV dataset whose first row is a header to be skipped; rows are
-- still decoded positionally via 'FromRecord'.
csvDatasetSkipHdr :: FromRecord a => Source -> Dataset a
csvDatasetSkipHdr src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = CSVRecord HasHeader defaultDecodeOptions
  }
-- | A CSV dataset whose rows are decoded by header-column names
-- ('FromNamedRecord'), using the default options.
csvHdrDataset :: FromNamedRecord a => Source -> Dataset a
csvHdrDataset src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = CSVNamedRecord defaultDecodeOptions
  }
-- | Like 'csvHdrDataset', but with a custom field separator (e.g. @';'@
-- or @'\t'@).
csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a
csvHdrDatasetSep sepChar src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = CSVNamedRecord opts
  }
  where
    -- cassava takes the delimiter as a byte, hence the ord/fromIntegral.
    opts = defaultDecodeOptions { decDelimiter = fromIntegral (ord sepChar) }
-- | A JSON dataset with no preprocessing and the default cache directory.
jsonDataset :: FromJSON a => Source -> Dataset a
jsonDataset src = Dataset
  { source = src
  , temporaryDirectory = Nothing
  , preProcess = Nothing
  , readAs = JSON
  }
-- | Attach a raw-bytes preprocessing step to a dataset.
--
-- NOTE(review): this /replaces/ any previously-set preprocessor rather than
-- composing with it.
withPreprocess :: (BL8.ByteString -> BL8.ByteString) -> Dataset a -> Dataset a
withPreprocess f ds = ds { preProcess = Just f }
-- | Set the cache directory used when fetching this dataset.
withTempDir :: FilePath -> Dataset a -> Dataset a
withTempDir d ds = ds { temporaryDirectory = Just d }
-- | Turn a dash-separated name into camelCase: each dash followed by a
-- character is dropped and that character upper-cased (a lone trailing
-- dash is kept as-is).
dashToCamelCase :: String -> String
dashToCamelCase [] = []
dashToCamelCase (x:xs)
  | x == '-', (c:rest) <- xs = toUpper c : dashToCamelCase rest
  | otherwise = x : dashToCamelCase xs
-- | cassava field parser that camelCases a dash-separated field before
-- 'read'ing it (useful for dashed enum labels); fails the parser otherwise.
parseDashToCamelField :: Read a => Field -> Parser a
parseDashToCamelField fld =
  maybe (fail "unknown") pure (readMaybe (dashToCamelCase (unpack fld)))
-- | cassava field parser that decodes a field via its 'Read' instance;
-- fails the parser when the field does not parse.
parseReadField :: Read a => Field -> Parser a
parseReadField fld = maybe (fail "unknown") pure (readMaybe (unpack fld))
-- | Drop the first @n@ lines of a lazy ByteString.
--
-- Total: asking for more lines than the input contains yields the empty
-- string (the previous implementation used 'BL.tail', which throws on an
-- empty string — e.g. when the input has no trailing newline), and a
-- non-positive @n@ returns the input unchanged (previously a negative @n@
-- recursed forever).
dropLines :: Int -> BL.ByteString -> BL.ByteString
dropLines n s
  | n <= 0 = s
    -- BL.drop 1 (instead of BL.tail) is total on the empty string.
  | otherwise = dropLines (n - 1) (BL.drop 1 (BL8.dropWhile (/= '\n') s))
-- | Give fields that start with a bare decimal point a leading zero:
-- every occurrence of @",."@ becomes @",0."@ (e.g. @1,.5@ -> @1,0.5@),
-- presumably so the numeric CSV parser accepts such fields — confirm
-- against the datasets that use this preprocessor.
fixAmericanDecimals :: BL.ByteString -> BL.ByteString
fixAmericanDecimals input = replace ",." (",0." :: BL.ByteString) input
-- | Convert fixed-width (space-aligned) data to CSV: each run of spaces
-- becomes a single comma, and spaces at the start of a line are discarded
-- without emitting a comma.
fixedWidthToCSV :: BL.ByteString -> BL.ByteString
fixedWidthToCSV = BL8.pack . atLineStart . BL8.unpack
  where
    -- Walk the characters, collapsing space runs into commas.
    translate [] = []
    translate (' ':rest) = ',' : translate (skipSpaces rest)
    translate ('\n':rest) = '\n' : atLineStart rest
    translate (ch:rest) = ch : translate rest
    -- At the start of a line, leading spaces produce no comma.
    atLineStart = translate . skipSpaces
    -- Drop a (possibly empty) run of spaces.
    skipSpaces (' ':rest) = skipSpaces rest
    skipSpaces rest = rest
-- | Strip every double-quote character from the input.
removeEscQuotes :: BL8.ByteString -> BL8.ByteString
removeEscQuotes bs = BL8.filter (\c -> c /= '"') bs
-- | Interpret a fractional year (e.g. @2000.5@) as a 'UTCTime': the whole
-- part selects the year, and the fraction is scaled by the year's length
-- (365 or 366 days) to pick the day and time of day.
yearToUTCTime :: Double -> UTCTime
yearToUTCTime y = UTCTime date timeOfDay
  where
    (wholeYear, yearFraction) = properFraction y
    jan1 = fromGregorian wholeYear 1 1
    daysInYear = if isLeapYear wholeYear then 366 else 365
    (wholeDays, dayFraction) = properFraction (yearFraction * daysInYear)
    date = addDays wholeDays jan1
    -- Remaining fraction of a day, rounded to whole seconds.
    timeOfDay = secondsToDiffTime (round (dayFraction * 86400))
-- | Base URL of the UMass mirror of the UCI machine-learning-databases
-- repository (plain HTTP).  NOTE(review): this mirror may no longer be
-- reachable — verify before relying on it; 'uciMLDB' is the primary archive.
umassMLDB :: Url 'Http
umassMLDB = http "mlr.cs.umass.edu" /: "ml" /: "machine-learning-databases"
-- | Base URL of the UCI machine-learning-databases repository (HTTPS).
uciMLDB :: Url 'Https
uciMLDB = https "archive.ics.uci.edu" /: "ml" /: "machine-learning-databases"