module Numeric.Datasets where
import Data.Csv
import System.FilePath
import System.Directory
import Data.Hashable
import Data.Monoid
import qualified Data.ByteString.Lazy as BL
import qualified Data.Vector as V
import Data.Aeson as JSON
import Control.Applicative
import Data.Time
import Data.Char (ord)
import qualified Network.Wreq as Wreq
import Lens.Micro ((^.))
import Data.Char (toUpper)
import Text.Read (readMaybe)
import Data.Maybe (fromMaybe)
import Data.ByteString.Char8 (unpack)
import qualified Data.ByteString.Lazy.Char8 as BL8
import Data.ByteString.Lazy.Search (replace)
getDataset :: Dataset a -> IO [a]
getDataset ds = do
dir <- case temporaryDirectory ds of
Nothing -> getTemporaryDirectory
Just tdir -> return tdir
bs <- fmap (fromMaybe id $ preProcess ds) $ getFileFromSource dir $ source ds
return $ readDataset (readAs ds) bs
readDataset :: ReadAs a -> BL.ByteString -> [a]
readDataset JSON bs =
case JSON.decode bs of
Just theData -> theData
Nothing -> error "failed to parse json"
readDataset (CSVRecord hhdr opts) bs =
case decodeWith opts hhdr bs of
Right theData -> V.toList theData
Left err -> error err
readDataset (CSVNamedRecord opts) bs =
case decodeByNameWith opts bs of
Right (_,theData) -> V.toList theData
Left err -> error err
data Source = URL String
data Dataset a = Dataset
{ source :: Source
, temporaryDirectory :: Maybe FilePath
, preProcess :: Maybe (BL.ByteString -> BL.ByteString)
, readAs :: ReadAs a
}
data ReadAs a where
JSON :: FromJSON a => ReadAs a
CSVRecord :: FromRecord a => HasHeader -> DecodeOptions -> ReadAs a
CSVNamedRecord :: FromNamedRecord a => DecodeOptions -> ReadAs a
csvRecord :: FromRecord a => ReadAs a
csvRecord = CSVRecord NoHeader defaultDecodeOptions
csvDatasetPreprocess :: FromRecord a => (BL.ByteString -> BL.ByteString) -> Source -> Dataset a
csvDatasetPreprocess preF src = (csvDataset src) { preProcess = Just preF }
csvDataset :: FromRecord a => Source -> Dataset a
csvDataset src = Dataset src Nothing Nothing $ CSVRecord NoHeader defaultDecodeOptions
csvHdrDataset :: FromNamedRecord a => Source -> Dataset a
csvHdrDataset src = Dataset src Nothing Nothing $ CSVNamedRecord defaultDecodeOptions
csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a
csvHdrDatasetSep sepc src
= Dataset src Nothing Nothing
$ CSVNamedRecord defaultDecodeOptions { decDelimiter = fromIntegral (ord sepc)}
jsonDataset :: FromJSON a => Source -> Dataset a
jsonDataset src = Dataset src Nothing Nothing JSON
getFileFromSource :: FilePath -> Source -> IO (BL.ByteString)
getFileFromSource cacheDir (URL url) = do
createDirectoryIfMissing True cacheDir
let fnm = cacheDir </> "ds" <> show (hash url)
ex <- doesFileExist fnm
if ex
then BL.readFile fnm
else do
rsp <- Wreq.get url
let bs = rsp ^. Wreq.responseBody
BL.writeFile fnm bs
return bs
dashToCamelCase :: String -> String
dashToCamelCase ('-':c:cs) = toUpper c : dashToCamelCase cs
dashToCamelCase (c:cs) = c : dashToCamelCase cs
dashToCamelCase [] = []
parseDashToCamelField :: Read a => Field -> Parser a
parseDashToCamelField s =
case readMaybe (dashToCamelCase $ unpack s) of
Just wc -> pure wc
Nothing -> fail "unknown"
parseReadField :: Read a => Field -> Parser a
parseReadField s =
case readMaybe (unpack s) of
Just wc -> pure wc
Nothing -> fail "unknown"
dropLines :: Int -> BL.ByteString -> BL.ByteString
dropLines 0 s = s
dropLines n s = dropLines (n1) $ BL.tail $ BL8.dropWhile (/='\n') s
fixAmericanDecimals :: BL.ByteString -> BL.ByteString
fixAmericanDecimals = replace ",." (",0."::BL.ByteString)
fixedWidthToCSV :: BL.ByteString -> BL.ByteString
fixedWidthToCSV = BL8.pack . fnl . BL8.unpack where
f [] = []
f (' ':cs) = ',':f (chomp cs)
f ('\n':cs) = '\n':fnl cs
f (c:cs) = c:f cs
fnl cs = f (chomp cs) --newline
chomp (' ':cs) = chomp cs
chomp (c:cs) = c:cs
chomp [] = []
yearToUTCTime :: Double -> UTCTime
yearToUTCTime yearDbl =
let (yearn,yearFrac) = properFraction yearDbl
dayYearBegin = fromGregorian yearn 1 1
(dayn, dayFrac) = properFraction $ yearFrac * (if isLeapYear yearn then 366 else 365)
day = addDays dayn dayYearBegin
dt = secondsToDiffTime $ round $ dayFrac * 86400
in UTCTime day dt