{-# LANGUAGE BangPatterns, CPP, DataKinds, FlexibleInstances, KindSignatures, LambdaCase, MultiParamTypeClasses, OverloadedStrings, QuasiQuotes, RankNTypes, RecordWildCards, ScopedTypeVariables, TemplateHaskell, TypeOperators #-} -- | Infer row types from comma-separated values (CSV) data and read -- that data from files. Template Haskell is used to generate the -- necessary types so that you can write type safe programs referring -- to those types. module Frames.CSV where #if __GLASGOW_HASKELL__ < 710 import Control.Applicative ((<$>), pure, (<*>)) import Data.Foldable (foldMap) import Data.Traversable (sequenceA) import Data.Monoid (Monoid) #endif import Control.Arrow (first, second) import Control.Monad (when, void) import Data.Char (isAlpha, isAlphaNum, toLower, toUpper) import qualified Data.Foldable as F import Data.List (intercalate) import Data.Maybe (isNothing, fromMaybe) import Data.Monoid ((<>)) import Data.Proxy import qualified Data.Text as T import Data.Vinyl (RElem, Rec) import Data.Vinyl.TypeLevel (RecAll, RIndex) import Data.Vinyl.Functor (Identity) import Frames.Col import Frames.ColumnTypeable import Frames.ColumnUniverse import Frames.Rec import Frames.RecF import Frames.RecLens import Language.Haskell.TH import Language.Haskell.TH.Syntax import Pipes ((>->)) import qualified Pipes as P import qualified Pipes.Prelude as P import qualified Pipes.ByteString import qualified Pipes.Group import qualified Pipes.Parse as P import qualified Pipes.Prelude.Text as PT import qualified Pipes.Text as PT import qualified Pipes.Text.Encoding as PT import qualified Pipes.Safe as P import qualified Pipes.Safe.Prelude import System.IO (IOMode(ReadMode)) type Separator = T.Text type QuoteChar = Char data QuotingMode -- | No quoting enabled. The separator may not appear in values = NoQuoting -- | Quoted values with the given quoting character. Quotes are escaped by doubling them. -- Mostly RFC4180 compliant, except doesn't support newlines in values | RFC4180Quoting QuoteChar deriving (Eq, Show) data ParserOptions = ParserOptions { headerOverride :: Maybe [T.Text] , columnSeparator :: Separator , quotingMode :: QuotingMode } deriving (Eq, Show) instance Lift QuotingMode where lift NoQuoting = [|NoQuoting|] lift (RFC4180Quoting char) = [|RFC4180Quoting $(litE . charL $ char)|] instance Lift ParserOptions where lift (ParserOptions Nothing sep quoting) = [|ParserOptions Nothing $sep' $quoting'|] where sep' = [|T.pack $(stringE $ T.unpack sep)|] quoting' = lift quoting lift (ParserOptions (Just hs) sep quoting) = [|ParserOptions (Just $hs') $sep' $quoting'|] where sep' = [|T.pack $(stringE $ T.unpack sep)|] hs' = [|map T.pack $(listE $ map (stringE . T.unpack) hs)|] quoting' = lift quoting -- | Default 'ParseOptions' get column names from a header line, and -- use commas to separate columns. defaultParser :: ParserOptions defaultParser = ParserOptions Nothing defaultSep (RFC4180Quoting '\"') -- | Default separator string. defaultSep :: Separator defaultSep = T.pack "," -- * Parsing -- | Helper to split a 'T.Text' on commas and strip leading and -- trailing whitespace from each resulting chunk. tokenizeRow :: ParserOptions -> T.Text -> [T.Text] tokenizeRow options = handleQuoting . T.splitOn sep where sep = columnSeparator options quoting = quotingMode options handleQuoting = case quoting of NoQuoting -> id RFC4180Quoting quote -> reassembleRFC4180QuotedParts sep quote -- | Post processing applied to a list of tokens split by the -- separator which should have quoted sections reassembeld reassembleRFC4180QuotedParts :: Separator -> QuoteChar -> [T.Text] -> [T.Text] reassembleRFC4180QuotedParts sep quoteChar = go where go [] = [] go (part:parts) | T.null part = T.empty : go parts | prefixQuoted part = if suffixQuoted part then unescape (T.drop 1 . T.dropEnd 1 $ part) : go parts else case break suffixQuoted parts of (h,[]) -> [unescape (T.intercalate sep (T.drop 1 part : h))] (h,t:ts) -> unescape (T.intercalate sep (T.drop 1 part : h ++ [T.dropEnd 1 t])) : go ts | otherwise = T.strip part : go parts prefixQuoted t = T.head t == quoteChar && T.length (T.takeWhile (== quoteChar) t) `rem` 2 == 1 suffixQuoted t = quoteText `T.isSuffixOf` t && T.length (T.takeWhileEnd (== quoteChar) t) `rem` 2 == 1 quoteText = T.singleton quoteChar unescape :: T.Text -> T.Text unescape = T.replace q2 quoteText where q2 = quoteText <> quoteText --tokenizeRow :: Separator -> T.Text -> [T.Text] --tokenizeRow sep = map (unquote . T.strip) . T.splitOn sep -- where unquote txt -- | quoted txt = case T.dropEnd 1 (T.drop 1 txt) of -- txt' | T.null txt' -> "Col" -- | numish txt' -> txt -- | otherwise -> txt' -- | otherwise = txt -- numish = T.all (`elem` ("-+.0123456789"::String)) -- quoted txt = case T.uncons txt of -- Just ('"', rst) -- | not (T.null rst) -> T.last rst == '"' -- _ -> False -- | Infer column types from a prefix (up to 1000 lines) of a CSV -- file. prefixInference :: (ColumnTypeable a, Monoid a, Monad m) => ParserOptions -> P.Parser T.Text m [a] prefixInference opts = P.draw >>= \case Nothing -> return [] Just row1 -> P.foldAll (\ts -> zipWith (<>) ts . inferCols) (inferCols row1) id where inferCols = map inferType . tokenizeRow opts -- | Extract column names and inferred types from a CSV file. readColHeaders :: (ColumnTypeable a, Monoid a, Monad m) => ParserOptions -> P.Producer T.Text m () -> m [(T.Text, a)] readColHeaders opts = P.evalStateT $ do headerRow <- maybe ((tokenizeRow opts . fromMaybe (error "Empty Producer has no header row")) <$> P.draw) pure (headerOverride opts) colTypes <- prefixInference opts return (zip headerRow colTypes) -- * Loading Data -- | Parsing each component of a 'RecF' from a list of text chunks, -- one chunk per record component. class ReadRec (rs :: [*]) where readRec :: [T.Text] -> Rec Maybe rs instance ReadRec '[] where readRec _ = Nil instance (Parseable t, ReadRec ts) => ReadRec (s :-> t ': ts) where readRec [] = frameCons Nothing (readRec []) readRec (h:t) = frameCons (parse' h) (readRec t) -- | Produce the lines of a latin1 (or ISO8859 Part 1) encoded file as -- ’T.Text’ values. Similar to ’PT.readFileLn’ that uses the system -- locale for decoding, but built on the ’PT.decodeIso8859_1’ decoder. readFileLatin1Ln :: P.MonadSafe m => FilePath -> P.Producer T.Text m () readFileLatin1Ln fp = Pipes.Safe.Prelude.withFile fp ReadMode $ \h -> let latinText = void (PT.decodeIso8859_1 (Pipes.ByteString.fromHandle h)) latinLines = PT.decode PT.lines latinText in Pipes.Group.concats latinLines -- | Read a 'RecF' from one line of CSV. readRow :: ReadRec rs => ParserOptions -> T.Text -> Rec Maybe rs readRow = (readRec .) . tokenizeRow -- | Produce rows where any given entry can fail to parse. readTableMaybeOpt :: (P.MonadSafe m, ReadRec rs) => ParserOptions -> FilePath -> P.Producer (Rec Maybe rs) m () readTableMaybeOpt opts csvFile = PT.readFileLn csvFile >-> pipeTableMaybeOpt opts {-# INLINE readTableMaybeOpt #-} -- | Stream lines of CSV data into rows of ’Rec’ values values where -- any given entry can fail to parse. pipeTableMaybeOpt :: (Monad m, ReadRec rs) => ParserOptions -> P.Pipe T.Text (Rec Maybe rs) m () pipeTableMaybeOpt opts = do when (isNothing (headerOverride opts)) (() <$ P.await) P.map (readRow opts) {-# INLINE pipeTableMaybeOpt #-} -- | Produce rows where any given entry can fail to parse. readTableMaybe :: (P.MonadSafe m, ReadRec rs) => FilePath -> P.Producer (Rec Maybe rs) m () readTableMaybe = readTableMaybeOpt defaultParser {-# INLINE readTableMaybe #-} -- | Stream lines of CSV data into rows of ’Rec’ values where any -- given entry can fail to parse. pipeTableMaybe :: (Monad m, ReadRec rs) => P.Pipe T.Text (Rec Maybe rs) m () pipeTableMaybe = pipeTableMaybeOpt defaultParser {-# INLINE pipeTableMaybe #-} -- -- | Returns a `MonadPlus` producer of rows for which each column was -- -- successfully parsed. This is typically slower than 'readTableOpt'. -- readTableOpt' :: forall m rs. -- (MonadPlus m, MonadIO m, ReadRec rs) -- => ParserOptions -> FilePath -> m (Record rs) -- readTableOpt' opts csvFile = -- do h <- liftIO $ do -- h <- openFile csvFile ReadMode -- when (isNothing $ headerOverride opts) (void $ T.hGetLine h) -- return h -- let go = liftIO (hIsEOF h) >>= \case -- True -> mzero -- False -> let r = recMaybe . readRow opts <$> T.hGetLine h -- in liftIO r >>= maybe go (flip mplus go . return) -- go -- {-# INLINE readTableOpt' #-} -- -- | Returns a `MonadPlus` producer of rows for which each column was -- -- successfully parsed. This is typically slower than 'readTable'. -- readTable' :: forall m rs. (P.MonadSafe m, ReadRec rs) -- => FilePath -> m (Record rs) -- readTable' = readTableOpt' defaultParser -- {-# INLINE readTable' #-} -- | Returns a producer of rows for which each column was successfully -- parsed. readTableOpt :: forall m rs. (P.MonadSafe m, ReadRec rs) => ParserOptions -> FilePath -> P.Producer (Record rs) m () readTableOpt opts csvFile = readTableMaybeOpt opts csvFile P.>-> go where go = P.await >>= maybe go (\x -> P.yield x >> go) . recMaybe {-# INLINE readTableOpt #-} -- | Pipe lines of CSV text into rows for which each column was -- successfully parsed. pipeTableOpt :: (ReadRec rs, Monad m) => ParserOptions -> P.Pipe T.Text (Record rs) m () pipeTableOpt opts = pipeTableMaybeOpt opts >-> P.map recMaybe >-> P.concat {-# INLINE pipeTableOpt #-} -- | Returns a producer of rows for which each column was successfully -- parsed. readTable :: forall m rs. (P.MonadSafe m, ReadRec rs) => FilePath -> P.Producer (Record rs) m () readTable = readTableOpt defaultParser {-# INLINE readTable #-} -- | Pipe lines of CSV text into rows for which each column was -- successfully parsed. pipeTable :: (ReadRec rs, Monad m) => P.Pipe T.Text (Record rs) m () pipeTable = pipeTableOpt defaultParser {-# INLINE pipeTable #-} -- * Template Haskell -- | Generate a column type. recDec :: [(T.Text, Q Type)] -> Q Type recDec = appT [t|Record|] . go where go [] = return PromotedNilT go ((n,t):cs) = [t|($(litT $ strTyLit (T.unpack n)) :-> $(t)) ': $(go cs) |] -- | Capitalize the first letter of a 'T.Text'. capitalize1 :: T.Text -> T.Text capitalize1 = foldMap (onHead toUpper) . T.split (not . isAlphaNum) where onHead f = maybe mempty (uncurry T.cons . first f) . T.uncons -- | Massage a column name from a CSV file into a valid Haskell type -- identifier. sanitizeTypeName :: T.Text -> T.Text sanitizeTypeName = unreserved . fixupStart . T.concat . T.split (not . valid) . capitalize1 where valid c = isAlphaNum c || c == '\'' || c == '_' unreserved t | t `elem` ["Type", "Class"] = "Col" <> t | otherwise = t fixupStart t = case T.uncons t of Nothing -> "Col" Just (c,_) | isAlpha c -> t | otherwise -> "Col" <> t -- | Declare a type synonym for a column. mkColTDec :: TypeQ -> Name -> DecQ mkColTDec colTypeQ colTName = tySynD colTName [] colTypeQ -- | Declare a singleton value of the given column type and lenses for -- working with that column. mkColPDec :: Name -> TypeQ -> T.Text -> DecsQ mkColPDec colTName colTy colPName = sequenceA [tySig, val, tySig', val'] where nm = mkName $ T.unpack colPName nm' = mkName $ T.unpack colPName <> "'" -- tySig = sigD nm [t|Proxy $(conT colTName)|] tySig = sigD nm [t|forall f rs. (Functor f, RElem $(conT colTName) rs (RIndex $(conT colTName) rs)) => ($colTy -> f $colTy) -> Record rs -> f (Record rs) |] tySig' = sigD nm' [t|forall f g rs. (Functor f, Functor g, RElem $(conT colTName) rs (RIndex $(conT colTName) rs)) => (g $(conT colTName) -> f (g $(conT colTName))) -> Rec g rs -> f (Rec g rs) |] val = valD (varP nm) (normalB [e|rlens (Proxy :: Proxy $(conT colTName))|]) [] val' = valD (varP nm') (normalB [e|rlens' (Proxy :: Proxy $(conT colTName))|]) [] lowerHead :: T.Text -> Maybe T.Text lowerHead = fmap aux . T.uncons where aux (c,t) = T.cons (toLower c) t -- | For each column, we declare a type synonym for its type, and a -- Proxy value of that type. colDec :: ColumnTypeable a => T.Text -> T.Text -> a -> DecsQ colDec prefix colName colTy = (:) <$> mkColTDec colTypeQ colTName' <*> mkColPDec colTName' colTyQ colPName where colTName = sanitizeTypeName (prefix <> capitalize1 colName) colPName = fromMaybe "colDec impossible" (lowerHead colTName) colTName' = mkName $ T.unpack colTName colTyQ = colType colTy colTypeQ = [t|$(litT . strTyLit $ T.unpack colName) :-> $colTyQ|] -- | Splice for manually declaring a column of a given type. For -- example, @declareColumn "x2" ''Double@ will declare a type synonym -- @type X2 = "x2" :-> Double@ and a lens @x2@. declareColumn :: T.Text -> Name -> DecsQ declareColumn colName colTy = (:) <$> mkColTDec colTypeQ colTName' <*> mkColPDec colTName' colTyQ colPName where colTName = sanitizeTypeName colName colPName = maybe "colDec impossible" (\(c,t) -> T.cons (toLower c) t) (T.uncons colTName) colTName' = mkName $ T.unpack colTName colTyQ = return (ConT colTy) colTypeQ = [t|$(litT . strTyLit $ T.unpack colName) :-> $colTyQ|] -- * Default CSV Parsing -- | Control how row and named column types are generated. data RowGen a = RowGen { columnNames :: [String] -- ^ Use these column names. If empty, expect a -- header row in the data file to provide -- column names. , tablePrefix :: String -- ^ A common prefix to use for every generated -- declaration. , separator :: Separator -- ^ The string that separates the columns on a -- row. , rowTypeName :: String -- ^ The row type that enumerates all -- columns. , columnUniverse :: Proxy a -- ^ A type that identifies all the types that -- can be used to classify a column. This is -- essentially a type-level list of types. See -- 'colQ'. , lineReader :: P.Producer T.Text (P.SafeT IO) () -- ^ A producer of lines of ’T.Text’xs } -- | Shorthand for a 'Proxy' value of 'ColumnUniverse' applied to the -- given type list. colQ :: Name -> Q Exp colQ n = [e| (Proxy :: Proxy (ColumnUniverse $(conT n))) |] -- | A default 'RowGen'. This instructs the type inference engine to -- get column names from the data file, use the default column -- separator (a comma), infer column types from the default 'Columns' -- set of types, and produce a row type with name @Row@. rowGen :: FilePath -> RowGen Columns rowGen = RowGen [] "" defaultSep "Row" Proxy . PT.readFileLn -- | Generate a type for each row of a table. This will be something -- like @Record ["x" :-> a, "y" :-> b, "z" :-> c]@. tableType :: String -> FilePath -> DecsQ tableType n fp = tableType' (rowGen fp) { rowTypeName = n } -- | Like 'tableType', but additionally generates a type synonym for -- each column, and a proxy value of that type. If the CSV file has -- column names \"foo\", \"bar\", and \"baz\", then this will declare -- @type Foo = "foo" :-> Int@, for example, @foo = rlens (Proxy :: Proxy -- Foo)@, and @foo' = rlens' (Proxy :: Proxy Foo)@. tableTypes :: String -> FilePath -> DecsQ tableTypes n fp = tableTypes' (rowGen fp) { rowTypeName = n } -- * Customized Data Set Parsing -- | Inspect no more than this many lines when inferring column types. prefixSize :: Int prefixSize = 1000 -- | Generate a type for a row of a table. This will be something like -- @Record ["x" :-> a, "y" :-> b, "z" :-> c]@. Column type synonyms -- are /not/ generated (see 'tableTypes''). tableType' :: forall a. (ColumnTypeable a, Monoid a) => RowGen a -> DecsQ tableType' (RowGen {..}) = pure . TySynD (mkName rowTypeName) [] <$> (runIO (P.runSafeT (readColHeaders opts lineSource)) >>= recDec') where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type colNames' | null columnNames = Nothing | otherwise = Just (map T.pack columnNames) opts = ParserOptions colNames' separator (RFC4180Quoting '\"') lineSource = lineReader >-> P.take prefixSize -- | Tokenize the first line of a ’P.Producer’. colNamesP :: Monad m => ParserOptions -> P.Producer T.Text m () -> m [T.Text] colNamesP opts src = either (const []) (tokenizeRow opts . fst) <$> P.next src -- | Generate a type for a row of a table all of whose columns remain -- unparsed 'Text' values. tableTypesText' :: forall a. (ColumnTypeable a, Monoid a) => RowGen a -> DecsQ tableTypesText' (RowGen {..}) = do colNames <- runIO . P.runSafeT $ maybe (colNamesP opts lineReader) pure (headerOverride opts) let headers = zip colNames (repeat (inferType " ")) recTy <- tySynD (mkName rowTypeName) [] (recDec' headers) let optsName = case rowTypeName of [] -> error "Row type name shouldn't be empty" h:t -> mkName $ toLower h : t ++ "Parser" optsTy <- sigD optsName [t|ParserOptions|] optsDec <- valD (varP optsName) (normalB $ lift opts) [] colDecs <- concat <$> mapM (uncurry $ colDec (T.pack tablePrefix)) headers return (recTy : optsTy : optsDec : colDecs) where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type colNames' | null columnNames = Nothing | otherwise = Just (map T.pack columnNames) opts = ParserOptions colNames' separator (RFC4180Quoting '\"') -- | Like 'tableType'', but additionally generates a type synonym for -- each column, and a proxy value of that type. If the CSV file has -- column names \"foo\", \"bar\", and \"baz\", then this will declare -- @type Foo = "foo" :-> Int@, for example, @foo = rlens (Proxy :: -- Proxy Foo)@, and @foo' = rlens' (Proxy :: Proxy Foo)@. tableTypes' :: forall a. (ColumnTypeable a, Monoid a) => RowGen a -> DecsQ tableTypes' (RowGen {..}) = do headers <- runIO . P.runSafeT $ readColHeaders opts lineSource recTy <- tySynD (mkName rowTypeName) [] (recDec' headers) let optsName = case rowTypeName of [] -> error "Row type name shouldn't be empty" h:t -> mkName $ toLower h : t ++ "Parser" optsTy <- sigD optsName [t|ParserOptions|] optsDec <- valD (varP optsName) (normalB $ lift opts) [] colDecs <- concat <$> mapM (uncurry mkColDecs) headers return (recTy : optsTy : optsDec : colDecs) -- (:) <$> (tySynD (mkName n) [] (recDec' headers)) -- <*> (concat <$> mapM (uncurry $ colDec (T.pack prefix)) headers) where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type colNames' | null columnNames = Nothing | otherwise = Just (map T.pack columnNames) opts = ParserOptions colNames' separator (RFC4180Quoting '\"') lineSource = lineReader >-> P.take prefixSize mkColDecs colNm colTy = do let safeName = tablePrefix ++ (T.unpack . sanitizeTypeName $ colNm) mColNm <- lookupTypeName safeName case mColNm of Just _ -> pure [] Nothing -> colDec (T.pack tablePrefix) colNm colTy -- * Writing CSV Data -- | 'P.yield' a header row with column names followed by a line of -- text for each 'Record' with each field separated by a comma. produceCSV :: forall f ts m. (ColumnHeaders ts, AsVinyl ts, Foldable f, Monad m, RecAll Identity (UnColumn ts) Show) => f (Record ts) -> P.Producer String m () produceCSV recs = do P.yield (intercalate "," (columnHeaders (Proxy :: Proxy (Record ts)))) F.mapM_ (P.yield . intercalate "," . showFields) recs -- | Write a header row with column names followed by a line of text -- for each 'Record' to the given file. writeCSV :: (ColumnHeaders ts, AsVinyl ts, Foldable f, RecAll Identity (UnColumn ts) Show) => FilePath -> f (Record ts) -> IO () writeCSV fp recs = P.runSafeT . P.runEffect $ produceCSV recs >-> P.map T.pack >-> PT.writeFileLn fp