Safe Haskell | None |
---|---|
Language | Haskell2010 |
The datasets package defines two different kinds of datasets:
- small data sets which are directly (or indirectly with `file-embed`) embedded in the package as pure values and do not require network or IO to download the data set.
- other data sets which need to be fetched over the network with
getDataset
and are cached in a local temporary directory.
This module defines the getDataset
function for fetching datasets
and utilities for defining new data sets. It is only necessary to import
this module when using fetched data sets. Embedded data sets can be
imported directly.
- getDataset :: Dataset a -> IO [a]
- readDataset :: ReadAs a -> ByteString -> [a]
- data Source = URL String
- data Dataset a = Dataset {
- source :: Source
- temporaryDirectory :: Maybe FilePath
- preProcess :: Maybe (ByteString -> ByteString)
- readAs :: ReadAs a
- data ReadAs a where
- JSON :: FromJSON a => ReadAs a
- CSVRecord :: FromRecord a => HasHeader -> DecodeOptions -> ReadAs a
- CSVNamedRecord :: FromNamedRecord a => DecodeOptions -> ReadAs a
- csvRecord :: FromRecord a => ReadAs a
- csvDatasetPreprocess :: FromRecord a => (ByteString -> ByteString) -> Source -> Dataset a
- csvDataset :: FromRecord a => Source -> Dataset a
- csvHdrDataset :: FromNamedRecord a => Source -> Dataset a
- csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a
- jsonDataset :: FromJSON a => Source -> Dataset a
- getFileFromSource :: FilePath -> Source -> IO ByteString
- dashToCamelCase :: String -> String
- parseDashToCamelField :: Read a => Field -> Parser a
- parseReadField :: Read a => Field -> Parser a
- dropLines :: Int -> ByteString -> ByteString
- fixAmericanDecimals :: ByteString -> ByteString
- fixedWidthToCSV :: ByteString -> ByteString
- yearToUTCTime :: Double -> UTCTime
Using datasets
getDataset :: Dataset a -> IO [a] Source #
Load a dataset, using the system temporary directory as a cache
readDataset :: ReadAs a -> ByteString -> [a] Source #
Read a ByteString into a Haskell value
A dataset is a record telling us how to load the data
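Dataset | |
- source :: Source
- temporaryDirectory :: Maybe FilePath
- preProcess :: Maybe (ByteString -> ByteString)
- readAs :: ReadAs a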
ReadAs is a datatype to describe data formats that hold data sets
JSON :: FromJSON a => ReadAs a | |
CSVRecord :: FromRecord a => HasHeader -> DecodeOptions -> ReadAs a | |
CSVNamedRecord :: FromNamedRecord a => DecodeOptions -> ReadAs a |
csvRecord :: FromRecord a => ReadAs a Source #
Defining datasets
csvDatasetPreprocess :: FromRecord a => (ByteString -> ByteString) -> Source -> Dataset a Source #
Define a dataset from a pre-processing function and a source for a CSV file
csvDataset :: FromRecord a => Source -> Dataset a Source #
Define a dataset from a source for a CSV file
csvHdrDataset :: FromNamedRecord a => Source -> Dataset a Source #
Define a dataset from a source for a CSV file with a known header
csvHdrDatasetSep :: FromNamedRecord a => Char -> Source -> Dataset a Source #
Define a dataset from a source for a CSV file with a known header and separator
jsonDataset :: FromJSON a => Source -> Dataset a Source #
Define a dataset from a source for a JSON file. The data file must be accessible over HTTP, not HTTPS.
getFileFromSource :: FilePath -> Source -> IO ByteString Source #
Get a ByteString from the specified Source
Helper functions for parsing
dashToCamelCase :: String -> String Source #
Turn dashes to CamelCase
parseDashToCamelField :: Read a => Field -> Parser a Source #
Parse a field, first turning dashes to CamelCase
dropLines :: Int -> ByteString -> ByteString Source #
Drop lines from a bytestring
fixAmericanDecimals :: ByteString -> ByteString Source #
Turn US-style decimals starting with a period (e.g. .2) into something Haskell can parse (e.g. 0.2)
fixedWidthToCSV :: ByteString -> ByteString Source #
Convert a Fixed-width format to a CSV
Helper functions for data analysis
yearToUTCTime :: Double -> UTCTime Source #
Convert a fractional year to a UTCTime. The result is only accurate to about a second, because leap seconds are not taken into account.