{-# LANGUAGE TypeOperators #-} -- | CSV parsers for use with the @Frames@ package. -- -- Most commonly used are 'dsvTableTypes' for generating type -- definitions at compile time based on a CSV file, and 'readDsvTable' -- to load the table at run time. These are comparable to @tableTypes@ -- and @readTable@ from the @Frames@ package, but use an alternative -- CSV parser. module Frames.Dsv where import Control.Monad (when) import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as LBS import Data.Maybe (isNothing) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Vector as V import Data.Vinyl import Data.Vinyl.Functor ((:.), Compose(..)) import Data.Word (Word8) import Frames (recMaybe, Record) import Frames.CSV (defaultParser, pipeTableMaybeOpt) import Frames.CSV (ReadRec(..), Separator, ParserOptions(..)) import Frames.TH (rowGen, RowGen(..), tableTypes') import qualified HaskellWorks.Data.Dsv.Lazy.Cursor as SVL import qualified HaskellWorks.Data.Dsv.Lazy.Cursor.Strict as SVLS import Pipes (MonadIO, Producer, (>->), yield) import qualified Pipes as P import qualified Pipes.Prelude as P import Language.Haskell.TH -- * Row Reading -- | Produce one DSV row at a time. rowLoop :: Monad m => SVL.DsvCursor -> Producer [BS.ByteString] m () rowLoop c = if SVL.dsvCursorPosition d > SVL.dsvCursorPosition c && not (SVL.atEnd c) then do yield (SVLS.getRowListBetween c d dEnd) rowLoop (SVL.trim d) else return () where nr = SVL.nextRow c d = SVL.nextPosition nr dEnd = SVL.atEnd nr -- | Produce rows of raw 'LBS.ByteString' values. dsvRowsByte :: MonadIO m => FilePath -> Word8 -> Producer [BS.ByteString] m () dsvRowsByte fp columnSeparator = do bs <- P.liftIO (LBS.readFile fp) rowLoop (SVL.makeCursor columnSeparator bs) -- | Produce rows of UTF-8 encoded 'Text' values. dsvRows' :: MonadIO m => FilePath -> Word8 -> Producer [Text] m () dsvRows' fp = (>-> P.map (map T.decodeUtf8)) . dsvRowsByte fp -- | Produce rows of Latin-1 (aka ISO-8859-1) encoded 'Text' values. dsvRowsLatin1' :: MonadIO m => FilePath -> Word8 -> Producer [Text] m () dsvRowsLatin1' fp = (>-> P.map (map T.decodeLatin1)) . dsvRowsByte fp -- | Call 'error' indicating the problem with a separator intended for -- use with the @hw-dsv@ library. dsvSepErr :: DsvParserOptionsError -> a dsvSepErr = error . ("DSV separator must be a single character: "++) . show -- | Produce rows of UTF-8 encoded 'Text' values. dsvRows :: MonadIO m => FilePath -> Separator -> Producer [Text] m () dsvRows fp = either dsvSepErr (dsvRows' fp) . separatorWord8 -- | Produce rows of Latin-1 (aka ISO-8859-1) encoded 'Text' values. dsvRowsLatin1 :: MonadIO m => FilePath -> Separator -> Producer [Text] m () dsvRowsLatin1 fp = either dsvSepErr (dsvRowsLatin1' fp) . separatorWord8 -- | The ways in which an arbitrary 'Text' value may be unsuitable for -- use as a separator for the @hw-dsv@ package. data DsvParserOptionsError = SeparatorIsNull | SeparatorCharIsMoreThanOneByte | SeparatorIsMoreThanOneChar deriving (Eq, Show) -- | The @Frames@ library supports column separators that can be -- arbitrary 'Text' values, but the @hw-dsv@ library requires a single -- byte be used to demarcate values. If the given 'Text' can be -- losslessly represented as a single byte, we sue it, otherwise we -- return an error indicating the problem. separatorWord8 :: Separator -> Either DsvParserOptionsError Word8 separatorWord8 sep = case T.uncons sep of Nothing -> Left SeparatorIsNull Just (h, t) | T.null t -> let i = fromEnum h in if i < 256 then Right (fromIntegral i) else Left SeparatorCharIsMoreThanOneByte | otherwise -> Left SeparatorIsMoreThanOneChar -- * Whole Table Reading -- | Produce rows where any given entry can fail to parse. readDsvTableMaybeOpt :: (MonadIO m, ReadRec rs, RMap rs) => ParserOptions -> FilePath -> P.Producer (Rec (Maybe :. ElField) rs) m () readDsvTableMaybeOpt opts csvFile = dsvRows csvFile (columnSeparator opts) >-> pipeTableMaybeOpt opts {-# INLINE readDsvTableMaybeOpt #-} -- | Returns a producer of rows for which each column was successfully -- parsed. readDsvTableOpt :: (MonadIO m, ReadRec rs, RMap rs) => ParserOptions -> FilePath -> P.Producer (Record rs) m () readDsvTableOpt opts csvFile = readDsvTableMaybeOpt opts csvFile P.>-> go where go = P.await >>= maybe go (\x -> P.yield x >> go) . recMaybe {-# INLINE readDsvTableOpt #-} -- | Returns a producer of rows for which each column was successfully -- parsed. readDsvTable :: (MonadIO m, ReadRec rs, RMap rs) => FilePath -> P.Producer (Record rs) m () readDsvTable = readDsvTableOpt defaultParser {-# INLINE readDsvTable #-} -- * Template Haskell -- | Like 'tableType', but additionally generates a type synonym for -- each column, and a proxy value of that type. If the CSV file has -- column names \"foo\", \"bar\", and \"baz\", then this will declare -- @type Foo = "foo" :-> Int@, for example, @foo = rlens \@Foo@, and -- @foo' = rlens' \@Foo@. dsvTableTypes :: String -> FilePath -> DecsQ dsvTableTypes n fp = tableTypes' (rowGen fp) { rowTypeName = n , lineReader = dsvRows fp }