{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedLabels #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE NoImplicitPrelude #-} -- | A csv process based on attoparsec and the box library -- module Box.Csv ( CsvConfig (..), defaultCsvConfig, file, Header (..), rowEmitter, rowCommitter, runCsv, -- * parsers sep, field_, field, skipField_, skipField, int, int', A.double, double', fields, scis, ints, doubles, ) where import Box import Control.Lens import qualified Data.Attoparsec.Text as A import Data.Generics.Labels () import Data.Scientific import qualified Data.Text as Text import NumHask.Prelude -- $setup -- >>> :set -XOverloadedStrings -- | csv file configuration data CsvConfig = CsvConfig { -- | file name stem name :: Text, -- | file suffix suffix :: Text, -- | directory dir :: Text, -- | field separator fsep :: Char, -- | nature of header row(s) header :: Header } deriving (Show, Generic, Eq) -- | default csv file details -- -- >>> defaultCsvConfig -- CsvConfig {name = "time_series_covid19_deaths_global_narrow", suffix = ".csv", dir = "./other", fsep = ',', header = HasHXL} -- -- test data from https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases defaultCsvConfig :: CsvConfig defaultCsvConfig = CsvConfig "time_series_covid19_deaths_global_narrow" ".csv" "./other" ',' HasHXL -- | filepath for the config. -- -- >>> file defaultCsvConfig -- "./other/time_series_covid19_deaths_global_narrow.csv" file :: CsvConfig -> FilePath file cfg = cfg ^. #dir <> "/" <> cfg ^. #name <> cfg ^. #suffix & unpack -- | Type of header rows. Note the modern propensity for multiple header rows. data Header = HasHeader | HasHXL | NoHeader deriving (Show, Eq) -- | A continuation emitter of parsed csv rows from a CsvConfig, returning the original text on failure -- >>> rowEmitter defaultCsvConfig fields `with` emit -- Just (Right ["Province/State","Country/Region","Lat","Long","Date","Value","ISO 3166-1 Alpha 3-Codes","Region Code","Sub-region Code","Intermediate Region Code\r"]) rowEmitter :: CsvConfig -> (Char -> A.Parser a) -> Cont IO (Emitter IO (Either Text a)) rowEmitter cfg p = parseE (p (view #fsep cfg)) <$> fileE (file cfg) -- | commits printed csv rows -- -- >>> let testConfig = CsvConfig "test" ".csv" "./test" ',' NoHeader -- >>> let ctest = rowCommitter testConfig (fmap (Text.intercalate "," . fmap show)) -- >>> ctest `with` (\c -> commit c [[1..10::Int]]) -- True -- -- >>> rowEmitter testConfig ints `with` emit -- Just (Right [1,2,3,4,5,6,7,8,9,10]) -- rowCommitter :: CsvConfig -> (a -> [Text]) -> Cont IO (Committer IO a) rowCommitter cfg f = contramap (Text.intercalate (Text.singleton $ view #fsep cfg) . f) <$> fileWriteC (file cfg) -- | Run a parser across all lines of a file. -- -- >>> r1 <- runCsv defaultCsvConfig fields -- >>> length r1 -- 42562 -- -- >>> length [x | (Left x) <- r1] -- 0 -- -- >>> take 2 $ drop 2 [x | (Right x) <- r1] -- [["","Afghanistan","33.0","65.0","2020-06-29","733","AFG","142","34","\r"],["","Afghanistan","33.0","65.0","2020-06-28","721","AFG","142","34","\r"]] runCsv :: CsvConfig -> (Char -> A.Parser a) -> IO [Either Text a] runCsv cfg p = with (rowEmitter cfg p) toListE -- * low-level generic csv parser helpers -- | Most parsing and building routines implicity assume a character acting as a separator of fields, and newlines separating rows. -- -- >>> A.parse (sep ',') ",ok" -- Done "ok" () sep :: Char -> A.Parser () sep c = void (A.char c) -- * single field parsers -- | an unquoted field -- Does not consume the separator token -- -- >>> A.parse (field_ ',') "field,ok" -- Done ",ok" "field" field_ :: Char -> A.Parser Text field_ c = A.takeWhile (/= c) -- | an unquoted field -- Consume the separator token -- -- >>> A.parse (field ',') "field,ok" -- Done "ok" "field" field :: Char -> A.Parser Text field c = A.takeWhile (/= c) <* A.char c -- | skipping a field -- -- >>> A.parse (skipField_ ',') "field,ok" -- Done ",ok" () skipField_ :: Char -> A.Parser () skipField_ c = A.skipWhile (/= c) -- | skipping a field -- -- >>> A.parse (skipField ',') "field,ok" -- Done "ok" () skipField :: Char -> A.Parser () skipField c = A.skipWhile (/= c) <* A.char ',' -- | int parser -- -- >>> A.parse int "234,ok" -- Done ",ok" 234 int :: A.Parser Int int = A.decimal -- | int parser, consumes separator -- -- >>> A.parse (int' ',') "234,ok" -- Done "ok" 234 int' :: Char -> A.Parser Int int' c = A.decimal <* A.char c -- | double parser, consumes separator -- -- >>> A.parse (double' ',') "234.000,ok" -- Done "ok" 234.0 double' :: Char -> A.Parser Double double' c = A.double <* A.char c -- * Block list parsers -- | Parser for a csv row of [Text]. -- TODO: deal with potential for an extra '\r' -- -- >>> A.parseOnly (fields ',') "field1,field2\r" -- Right ["field1","field2\r"] -- fields :: Char -> A.Parser [Text] fields c = field_ c `A.sepBy1` (sep c) -- | parser for a csv row of [Scientific] -- -- >>> A.parseOnly (scis ',') "1,2.2,3.3" -- Right [1.0,2.2,3.3] scis :: Char -> A.Parser [Scientific] scis c = A.scientific `A.sepBy1` sep c -- | parser for a csv row of [Double] -- -- >>> A.parseOnly (doubles ',') "1,2,3" -- Right [1.0,2.0,3.0] doubles :: Char -> A.Parser [Double] doubles c = A.double `A.sepBy1` sep c -- | parser for a csv row of [Int] -- -- >>> A.parseOnly (ints ',') "1,2,3" -- Right [1,2,3] ints :: Char -> A.Parser [Int] ints c = A.signed A.decimal `A.sepBy1` sep c