{-# LANGUAGE CPP, DataKinds, DeriveLift, FlexibleContexts, FlexibleInstances, GADTs, LambdaCase, OverloadedStrings, RankNTypes, ScopedTypeVariables, TemplateHaskell, TypeApplications, TypeOperators #-} -- | Infer row types from comma-separated values (CSV) data and read -- that data from files. Template Haskell is used to generate the -- necessary types so that you can write type safe programs referring -- to those types. module Frames.CSV where import Control.Exception (try, IOException) import Control.Monad (when, unless) import qualified Data.ByteString.Char8 as B8 import qualified Data.Foldable as F import Data.List (intercalate) import Data.Maybe (isNothing, fromMaybe) #if __GLASGOW_HASKELL__ < 808 import Data.Monoid ((<>)) #endif import Data.Proxy import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.IO as T import Data.Vinyl (recordToList, Rec(..), ElField(..), RecordToList) import Data.Vinyl (RecMapMethod, rmapMethod, RMap, rmap) import Data.Vinyl.Class.Method (PayloadType) import Data.Vinyl.Functor (Const(..), (:.), Compose(..)) import Frames.Col import Frames.ColumnTypeable import Frames.Rec import Frames.RecF import Frames.ShowCSV import GHC.TypeLits (KnownSymbol) import Language.Haskell.TH import Language.Haskell.TH.Syntax import Pipes ((>->)) import qualified Pipes as P import qualified Pipes.Prelude as P import qualified Pipes.Parse as P import qualified Pipes.Safe as P import qualified Pipes.Safe.Prelude as Safe import System.IO (Handle, IOMode(ReadMode, WriteMode)) -- * Parsing type Separator = T.Text type QuoteChar = Char data QuotingMode -- | No quoting enabled. The separator may not appear in values = NoQuoting -- | Quoted values with the given quoting character. Quotes are escaped by doubling them. -- Mostly RFC4180 compliant, except doesn't support newlines in values | RFC4180Quoting QuoteChar deriving (Eq, Show, Lift) data ParserOptions = ParserOptions { headerOverride :: Maybe [T.Text] , columnSeparator :: Separator , quotingMode :: QuotingMode } deriving (Eq, Show) instance Lift ParserOptions where lift (ParserOptions Nothing sep quoting) = [|ParserOptions Nothing $sep' $quoting'|] where sep' = [|T.pack $(stringE $ T.unpack sep)|] quoting' = lift quoting lift (ParserOptions (Just hs) sep quoting) = [|ParserOptions (Just $hs') $sep' $quoting'|] where sep' = [|T.pack $(stringE $ T.unpack sep)|] hs' = [|map T.pack $(listE $ map (stringE . T.unpack) hs)|] quoting' = lift quoting #if MIN_VERSION_template_haskell(2,16,0) liftTyped = unsafeTExpCoerce . lift #endif -- | Default 'ParseOptions' get column names from a header line, and -- use commas to separate columns. defaultParser :: ParserOptions defaultParser = ParserOptions Nothing defaultSep (RFC4180Quoting '\"') -- | Default separator string. defaultSep :: Separator defaultSep = T.pack "," -- | Helper to split a 'T.Text' on commas and strip leading and -- trailing whitespace from each resulting chunk. tokenizeRow :: ParserOptions -> T.Text -> [T.Text] tokenizeRow options = handleQuoting . T.splitOn sep where sep = columnSeparator options quoting = quotingMode options handleQuoting = case quoting of NoQuoting -> id RFC4180Quoting quote -> reassembleRFC4180QuotedParts sep quote -- | Post processing applied to a list of tokens split by the -- separator which should have quoted sections reassembeld reassembleRFC4180QuotedParts :: Separator -> QuoteChar -> [T.Text] -> [T.Text] reassembleRFC4180QuotedParts sep quoteChar = go where go [] = [] go (part:parts) | T.null part = T.empty : go parts | prefixQuoted part = if suffixQuoted part then unescape (T.drop 1 . T.dropEnd 1 $ part) : go parts else case break suffixQuoted parts of (h,[]) -> [unescape (T.intercalate sep (T.drop 1 part : h))] (h,t:ts) -> unescape (T.intercalate sep (T.drop 1 part : h ++ [T.dropEnd 1 t])) : go ts | otherwise = T.strip part : go parts prefixQuoted t = T.head t == quoteChar-- && -- T.length (T.takeWhile (== quoteChar) t) `rem` 2 == 1 suffixQuoted t = quoteText `T.isSuffixOf` t-- && -- T.length (T.takeWhileEnd (== quoteChar) t) `rem` 2 == 1 quoteText = T.singleton quoteChar unescape :: T.Text -> T.Text unescape = T.replace q2 quoteText where q2 = quoteText <> quoteText --tokenizeRow :: Separator -> T.Text -> [T.Text] --tokenizeRow sep = map (unquote . T.strip) . T.splitOn sep -- where unquote txt -- | quoted txt = case T.dropEnd 1 (T.drop 1 txt) of -- txt' | T.null txt' -> "Col" -- | numish txt' -> txt -- | otherwise -> txt' -- | otherwise = txt -- numish = T.all (`elem` ("-+.0123456789"::String)) -- quoted txt = case T.uncons txt of -- Just ('"', rst) -- | not (T.null rst) -> T.last rst == '"' -- _ -> False -- | Infer column types from a prefix (up to 1000 lines) of a CSV -- file. prefixInference :: (ColumnTypeable a, Monoid a, Monad m) => P.Parser [T.Text] m [a] prefixInference = P.draw >>= \case Nothing -> return [] Just row1 -> P.foldAll (\ts -> zipWith (<>) ts . inferCols) (inferCols row1) id where inferCols = map inferType -- | Extract column names and inferred types from a CSV file. readColHeaders :: (ColumnTypeable a, Monoid a, Monad m) => ParserOptions -> P.Producer [T.Text] m () -> m [(T.Text, a)] readColHeaders opts = P.evalStateT $ do headerRow <- maybe (fromMaybe err <$> P.draw) pure (headerOverride opts) colTypes <- prefixInference unless (length headerRow == length colTypes) (error errNumColumns) return (zip headerRow colTypes) where err = error "Empty Producer has no header row" errNumColumns = unlines [ "" , "Error parsing CSV: " , " Number of columns in header differs from number of columns" , " found in the remaining file. This may be due to newlines" , " being present within the data itself (not just separating" , " rows). If support for embedded newlines is required, " , " consider using the Frames-dsv package in conjunction with" , " Frames to make use of a different CSV parser."] -- * Loading CSV Data -- | Parsing each component of a 'RecF' from a list of text chunks, -- one chunk per record component. class ReadRec rs where readRec :: [T.Text] -> Rec (Either T.Text :. ElField) rs instance ReadRec '[] where readRec _ = RNil instance (Parseable t, ReadRec ts, KnownSymbol s) => ReadRec (s :-> t ': ts) where readRec [] = Compose (Left mempty) :& readRec [] readRec (h:t) = maybe (Compose (Left (T.copy h))) (Compose . Right . Field) (parse' h) :& readRec t -- | Opens a file (in 'P.MonadSafe') and repeatedly applies the given -- function to the 'Handle' to obtain lines to yield. Adapted from the -- moribund pipes-text package. pipeLines :: P.MonadSafe m => (Handle -> IO (Either IOException T.Text)) -> FilePath -> P.Producer T.Text m () pipeLines pgetLine fp = Safe.withFile fp ReadMode $ \h -> let loop = do txt <- P.liftIO (pgetLine h) case txt of Left _e -> return () Right y -> P.yield y >> loop in loop -- | Produce lines of 'T.Text'. produceTextLines :: P.MonadSafe m => FilePath -> P.Producer T.Text m () produceTextLines = pipeLines (try . T.hGetLine) -- | Produce lines of tokens that were separated by the given -- separator. produceTokens :: P.MonadSafe m => FilePath -> Separator -> P.Producer [T.Text] m () produceTokens fp sep = produceTextLines fp >-> P.map tokenize where tokenize = tokenizeRow popts popts = defaultParser { columnSeparator = sep } -- | Consume lines of 'T.Text', writing them to a file. consumeTextLines :: P.MonadSafe m => FilePath -> P.Consumer T.Text m r consumeTextLines fp = Safe.withFile fp WriteMode $ \h -> let loop = P.await >>= P.liftIO . T.hPutStrLn h >> loop in loop -- | Produce the lines of a latin1 (or ISO8859 Part 1) encoded file as -- ’T.Text’ values. readFileLatin1Ln :: P.MonadSafe m => FilePath -> P.Producer [T.Text] m () readFileLatin1Ln fp = pipeLines (try . fmap T.decodeLatin1 . B8.hGetLine) fp >-> P.map (tokenizeRow defaultParser) -- | Read a 'RecF' from one line of CSV. readRow :: ReadRec rs => ParserOptions -> T.Text -> Rec (Either T.Text :. ElField) rs readRow = (readRec .) . tokenizeRow -- | Produce rows where any given entry can fail to parse. readTableMaybeOpt :: (P.MonadSafe m, ReadRec rs, RMap rs) => ParserOptions -> FilePath -> P.Producer (Rec (Maybe :. ElField) rs) m () readTableMaybeOpt opts csvFile = produceTokens csvFile (columnSeparator opts) >-> pipeTableMaybeOpt opts {-# INLINE readTableMaybeOpt #-} -- | Stream lines of CSV data into rows of ’Rec’ values values where -- any given entry can fail to parse. pipeTableMaybeOpt :: (Monad m, ReadRec rs, RMap rs) => ParserOptions -> P.Pipe [T.Text] (Rec (Maybe :. ElField) rs) m () pipeTableMaybeOpt opts = do when (isNothing (headerOverride opts)) (() <$ P.await) P.map (rmap (either (const (Compose Nothing)) (Compose . Just) . getCompose) . readRec) {-# INLINE pipeTableMaybeOpt #-} -- | Stream lines of CSV data into rows of ’Rec’ values values where -- any given entry can fail to parse. In the case of a parse failure, the -- raw 'T.Text' of that entry is retained. pipeTableEitherOpt :: (Monad m, ReadRec rs) => ParserOptions -> P.Pipe T.Text (Rec (Either T.Text :. ElField) rs) m () pipeTableEitherOpt opts = do when (isNothing (headerOverride opts)) (() <$ P.await) P.map (readRow opts) {-# INLINE pipeTableEitherOpt #-} -- | Produce rows where any given entry can fail to parse. readTableMaybe :: (P.MonadSafe m, ReadRec rs, RMap rs) => FilePath -> P.Producer (Rec (Maybe :. ElField) rs) m () readTableMaybe = readTableMaybeOpt defaultParser {-# INLINE readTableMaybe #-} -- | Stream lines of CSV data into rows of ’Rec’ values where any -- given entry can fail to parse. pipeTableMaybe :: (Monad m, ReadRec rs, RMap rs) => P.Pipe [T.Text] (Rec (Maybe :. ElField) rs) m () pipeTableMaybe = pipeTableMaybeOpt defaultParser {-# INLINE pipeTableMaybe #-} -- | Stream lines of CSV data into rows of ’Rec’ values where any -- given entry can fail to parse. In the case of a parse failure, the -- raw 'T.Text' of that entry is retained. pipeTableEither :: (Monad m, ReadRec rs) => P.Pipe T.Text (Rec (Either T.Text :. ElField) rs) m () pipeTableEither = pipeTableEitherOpt defaultParser {-# INLINE pipeTableEither #-} -- -- | Returns a `MonadPlus` producer of rows for which each column was -- -- successfully parsed. This is typically slower than 'readTableOpt'. -- readTableOpt' :: forall m rs. -- (MonadPlus m, MonadIO m, ReadRec rs) -- => ParserOptions -> FilePath -> m (Record rs) -- readTableOpt' opts csvFile = -- do h <- liftIO $ do -- h <- openFile csvFile ReadMode -- when (isNothing $ headerOverride opts) (void $ T.hGetLine h) -- return h -- let go = liftIO (hIsEOF h) >>= \case -- True -> mzero -- False -> let r = recMaybe . readRow opts <$> T.hGetLine h -- in liftIO r >>= maybe go (flip mplus go . return) -- go -- {-# INLINE readTableOpt' #-} -- -- | Returns a `MonadPlus` producer of rows for which each column was -- -- successfully parsed. This is typically slower than 'readTable'. -- readTable' :: forall m rs. (P.MonadSafe m, ReadRec rs) -- => FilePath -> m (Record rs) -- readTable' = readTableOpt' defaultParser -- {-# INLINE readTable' #-} -- | Returns a producer of rows for which each column was successfully -- parsed. readTableOpt :: (P.MonadSafe m, ReadRec rs, RMap rs) => ParserOptions -> FilePath -> P.Producer (Record rs) m () readTableOpt opts csvFile = readTableMaybeOpt opts csvFile P.>-> go where go = P.await >>= maybe go (\x -> P.yield x >> go) . recMaybe {-# INLINE readTableOpt #-} -- | Pipe lines of CSV text into rows for which each column was -- successfully parsed. pipeTableOpt :: (ReadRec rs, RMap rs, Monad m) => ParserOptions -> P.Pipe [T.Text] (Record rs) m () pipeTableOpt opts = pipeTableMaybeOpt opts >-> P.map recMaybe >-> P.concat {-# INLINE pipeTableOpt #-} -- | Returns a producer of rows for which each column was successfully -- parsed. readTable :: (P.MonadSafe m, ReadRec rs, RMap rs) => FilePath -> P.Producer (Record rs) m () readTable = readTableOpt defaultParser {-# INLINE readTable #-} -- | Pipe lines of CSV text into rows for which each column was -- successfully parsed. pipeTable :: (ReadRec rs, RMap rs, Monad m) => P.Pipe [T.Text] (Record rs) m () pipeTable = pipeTableOpt defaultParser {-# INLINE pipeTable #-} -- * Writing CSV Data showFieldsCSV :: (RecMapMethod ShowCSV ElField ts, RecordToList ts) => Record ts -> [T.Text] showFieldsCSV = recordToList . rmapMethod @ShowCSV aux where aux :: (ShowCSV (PayloadType ElField a)) => ElField a -> Const T.Text a aux (Field x) = Const (showCSV x) -- | 'P.yield' a header row with column names followed by a line of -- text for each 'Record' with each field separated by a comma. If -- your source of 'Record' values is a 'P.Producer', consider using -- 'pipeToCSV' to keep everything streaming. produceCSV :: forall f ts m. (ColumnHeaders ts, Foldable f, Monad m, RecordToList ts, RecMapMethod ShowCSV ElField ts) => f (Record ts) -> P.Producer String m () produceCSV recs = do P.yield (intercalate "," (columnHeaders (Proxy :: Proxy (Record ts)))) F.mapM_ (P.yield . T.unpack . T.intercalate "," . showFieldsCSV) recs -- | 'P.yield' a header row with column names followed by a line of -- text for each 'Record' with each field separated by a comma. This -- is the same as 'produceCSV', but adapated for cases where you have -- streaming input that you wish to use to produce streaming output. pipeToCSV :: forall ts m. (Monad m, ColumnHeaders ts, RecordToList ts, RecMapMethod ShowCSV ElField ts) => P.Pipe (Record ts) T.Text m () pipeToCSV = P.yield (T.intercalate "," (map T.pack header)) >> go where header = columnHeaders (Proxy :: Proxy (Record ts)) go :: P.Pipe (Record ts) T.Text m () go = P.map (T.intercalate "," . showFieldsCSV) -- | Write a header row with column names followed by a line of text -- for each 'Record' to the given file. writeCSV :: (ColumnHeaders ts, Foldable f, RecordToList ts, RecMapMethod ShowCSV ElField ts) => FilePath -> f (Record ts) -> IO () writeCSV fp recs = P.runSafeT . P.runEffect $ produceCSV recs >-> P.map T.pack >-> consumeTextLines fp