{-# LANGUAGE BangPatterns, CPP, DataKinds, FlexibleInstances,
KindSignatures, LambdaCase, MultiParamTypeClasses,
OverloadedStrings, QuasiQuotes, RankNTypes,
RecordWildCards, ScopedTypeVariables, TemplateHaskell,
TypeOperators #-}
module Frames.CSV where
#if __GLASGOW_HASKELL__ < 710
import Control.Applicative ((<$>), pure, (<*>))
import Data.Foldable (foldMap)
import Data.Traversable (sequenceA)
import Data.Monoid (Monoid)
#endif
import Control.Arrow (first, second)
import Control.Monad (when, void)
import Data.Char (isAlpha, isAlphaNum, toLower, toUpper)
import qualified Data.Foldable as F
import Data.List (intercalate)
import Data.Maybe (isNothing, fromMaybe)
import Data.Monoid ((<>))
import Data.Proxy
import qualified Data.Text as T
import Data.Vinyl (rmap, RElem, Rec)
import Data.Vinyl.TypeLevel (RecAll, RIndex)
import Data.Vinyl.Functor (Identity)
import Frames.Col
import Frames.ColumnTypeable
import Frames.ColumnUniverse
import Frames.Rec
import Frames.RecF
import Frames.RecLens
import Language.Haskell.TH
import Language.Haskell.TH.Syntax
import Pipes ((>->))
import qualified Pipes as P
import qualified Pipes.Prelude as P
import qualified Pipes.ByteString
import qualified Pipes.Group
import qualified Pipes.Parse as P
import qualified Pipes.Prelude.Text as PT
import qualified Pipes.Text as PT
import qualified Pipes.Text.Encoding as PT
import qualified Pipes.Safe as P
import qualified Pipes.Safe.Prelude
import System.IO (IOMode(ReadMode))
type Separator = T.Text
type QuoteChar = Char
data QuotingMode
= NoQuoting
| RFC4180Quoting QuoteChar
deriving (Eq, Show)
data ParserOptions = ParserOptions { headerOverride :: Maybe [T.Text]
, columnSeparator :: Separator
, quotingMode :: QuotingMode }
deriving (Eq, Show)
instance Lift QuotingMode where
lift NoQuoting = [|NoQuoting|]
lift (RFC4180Quoting char) = [|RFC4180Quoting $(litE . charL $ char)|]
instance Lift ParserOptions where
lift (ParserOptions Nothing sep quoting) = [|ParserOptions Nothing $sep' $quoting'|]
where sep' = [|T.pack $(stringE $ T.unpack sep)|]
quoting' = lift quoting
lift (ParserOptions (Just hs) sep quoting) = [|ParserOptions (Just $hs') $sep' $quoting'|]
where sep' = [|T.pack $(stringE $ T.unpack sep)|]
hs' = [|map T.pack $(listE $ map (stringE . T.unpack) hs)|]
quoting' = lift quoting
defaultParser :: ParserOptions
defaultParser = ParserOptions Nothing defaultSep (RFC4180Quoting '\"')
defaultSep :: Separator
defaultSep = T.pack ","
tokenizeRow :: ParserOptions -> T.Text -> [T.Text]
tokenizeRow options =
handleQuoting . T.splitOn sep
where sep = columnSeparator options
quoting = quotingMode options
handleQuoting = case quoting of
NoQuoting -> id
RFC4180Quoting quote -> reassembleRFC4180QuotedParts sep quote
reassembleRFC4180QuotedParts :: Separator -> QuoteChar -> [T.Text] -> [T.Text]
reassembleRFC4180QuotedParts sep quoteChar = go
where go [] = []
go (part:parts)
| T.null part = T.empty : go parts
| prefixQuoted part =
if suffixQuoted part
then unescape (T.drop 1 . T.dropEnd 1 $ part) : go parts
else case break suffixQuoted parts of
(h,[]) -> [unescape (T.intercalate sep (T.drop 1 part : h))]
(h,t:ts) -> unescape
(T.intercalate
sep
(T.drop 1 part : h ++ [T.dropEnd 1 t]))
: go ts
| otherwise = T.strip part : go parts
prefixQuoted t =
T.head t == quoteChar &&
T.length (T.takeWhile (== quoteChar) t) `rem` 2 == 1
suffixQuoted t =
quoteText `T.isSuffixOf` t &&
T.length (T.takeWhileEnd (== quoteChar) t) `rem` 2 == 1
quoteText = T.singleton quoteChar
unescape :: T.Text -> T.Text
unescape = T.replace q2 quoteText
where q2 = quoteText <> quoteText
prefixInference :: (ColumnTypeable a, Monoid a, Monad m)
=> ParserOptions
-> P.Parser T.Text m [a]
prefixInference opts = P.draw >>= \case
Nothing -> return []
Just row1 -> P.foldAll (\ts -> zipWith (<>) ts . inferCols)
(inferCols row1)
id
where inferCols = map inferType . tokenizeRow opts
readColHeaders :: (ColumnTypeable a, Monoid a, Monad m)
=> ParserOptions -> P.Producer T.Text m () -> m [(T.Text, a)]
readColHeaders opts = P.evalStateT $
do headerRow <- maybe ((tokenizeRow opts
. fromMaybe (error "Empty Producer has no header row")) <$> P.draw)
pure
(headerOverride opts)
colTypes <- prefixInference opts
return (zip headerRow colTypes)
class ReadRec (rs :: [*]) where
readRec :: [T.Text] -> Rec (Either T.Text) rs
instance ReadRec '[] where
readRec _ = Nil
instance (Parseable t, ReadRec ts) => ReadRec (s :-> t ': ts) where
readRec [] = frameCons (Left mempty) (readRec [])
readRec (h:t) = frameCons (maybe (Left (T.copy h)) Right (parse' h))
(readRec t)
readFileLatin1Ln :: P.MonadSafe m => FilePath -> P.Producer T.Text m ()
readFileLatin1Ln fp = Pipes.Safe.Prelude.withFile fp ReadMode $ \h ->
let latinText = void (PT.decodeIso8859_1 (Pipes.ByteString.fromHandle h))
latinLines = PT.decode PT.lines latinText
in Pipes.Group.concats latinLines
readRow :: ReadRec rs => ParserOptions -> T.Text -> Rec (Either T.Text) rs
readRow = (readRec .) . tokenizeRow
readTableMaybeOpt :: (P.MonadSafe m, ReadRec rs)
=> ParserOptions -> FilePath -> P.Producer (Rec Maybe rs) m ()
readTableMaybeOpt opts csvFile =
PT.readFileLn csvFile >-> pipeTableMaybeOpt opts
{-# INLINE readTableMaybeOpt #-}
pipeTableMaybeOpt :: (Monad m, ReadRec rs)
=> ParserOptions
-> P.Pipe T.Text (Rec Maybe rs) m ()
pipeTableMaybeOpt opts = do
when (isNothing (headerOverride opts)) (() <$ P.await)
P.map (rmap (either (const Nothing) Just) . readRow opts)
{-# INLINE pipeTableMaybeOpt #-}
pipeTableEitherOpt :: (Monad m, ReadRec rs)
=> ParserOptions
-> P.Pipe T.Text (Rec (Either T.Text) rs) m ()
pipeTableEitherOpt opts = do
when (isNothing (headerOverride opts)) (() <$ P.await)
P.map (readRow opts)
{-# INLINE pipeTableEitherOpt #-}
readTableMaybe :: (P.MonadSafe m, ReadRec rs)
=> FilePath -> P.Producer (Rec Maybe rs) m ()
readTableMaybe = readTableMaybeOpt defaultParser
{-# INLINE readTableMaybe #-}
pipeTableMaybe :: (Monad m, ReadRec rs) => P.Pipe T.Text (Rec Maybe rs) m ()
pipeTableMaybe = pipeTableMaybeOpt defaultParser
{-# INLINE pipeTableMaybe #-}
pipeTableEither :: (Monad m, ReadRec rs)
=> P.Pipe T.Text (Rec (Either T.Text) rs) m ()
pipeTableEither = pipeTableEitherOpt defaultParser
{-# INLINE pipeTableEither #-}
readTableOpt :: forall m rs.
(P.MonadSafe m, ReadRec rs)
=> ParserOptions -> FilePath -> P.Producer (Record rs) m ()
readTableOpt opts csvFile = readTableMaybeOpt opts csvFile P.>-> go
where go = P.await >>= maybe go (\x -> P.yield x >> go) . recMaybe
{-# INLINE readTableOpt #-}
pipeTableOpt :: (ReadRec rs, Monad m)
=> ParserOptions -> P.Pipe T.Text (Record rs) m ()
pipeTableOpt opts = pipeTableMaybeOpt opts >-> P.map recMaybe >-> P.concat
{-# INLINE pipeTableOpt #-}
readTable :: forall m rs. (P.MonadSafe m, ReadRec rs)
=> FilePath -> P.Producer (Record rs) m ()
readTable = readTableOpt defaultParser
{-# INLINE readTable #-}
pipeTable :: (ReadRec rs, Monad m) => P.Pipe T.Text (Record rs) m ()
pipeTable = pipeTableOpt defaultParser
{-# INLINE pipeTable #-}
recDec :: [(T.Text, Q Type)] -> Q Type
recDec = appT [t|Record|] . go
where go [] = return PromotedNilT
go ((n,t):cs) =
[t|($(litT $ strTyLit (T.unpack n)) :-> $(t)) ': $(go cs) |]
capitalize1 :: T.Text -> T.Text
capitalize1 = foldMap (onHead toUpper) . T.split (not . isAlphaNum)
where onHead f = maybe mempty (uncurry T.cons . first f) . T.uncons
sanitizeTypeName :: T.Text -> T.Text
sanitizeTypeName = unreserved . fixupStart
. T.concat . T.split (not . valid) . capitalize1
where valid c = isAlphaNum c || c == '\'' || c == '_'
unreserved t
| t `elem` ["Type", "Class"] = "Col" <> t
| otherwise = t
fixupStart t = case T.uncons t of
Nothing -> "Col"
Just (c,_) | isAlpha c -> t
| otherwise -> "Col" <> t
mkColTDec :: TypeQ -> Name -> DecQ
mkColTDec colTypeQ colTName = tySynD colTName [] colTypeQ
mkColPDec :: Name -> TypeQ -> T.Text -> DecsQ
mkColPDec colTName colTy colPName = sequenceA [tySig, val, tySig', val']
where nm = mkName $ T.unpack colPName
nm' = mkName $ T.unpack colPName <> "'"
tySig = sigD nm [t|forall f rs. (Functor f,
RElem $(conT colTName) rs (RIndex $(conT colTName) rs))
=> ($colTy -> f $colTy)
-> Record rs
-> f (Record rs)
|]
tySig' = sigD nm' [t|forall f g rs. (Functor f, Functor g,
RElem $(conT colTName) rs (RIndex $(conT colTName) rs))
=> (g $(conT colTName) -> f (g $(conT colTName)))
-> Rec g rs
-> f (Rec g rs)
|]
val = valD (varP nm)
(normalB [e|rlens (Proxy :: Proxy $(conT colTName))|])
[]
val' = valD (varP nm')
(normalB [e|rlens' (Proxy :: Proxy $(conT colTName))|])
[]
lowerHead :: T.Text -> Maybe T.Text
lowerHead = fmap aux . T.uncons
where aux (c,t) = T.cons (toLower c) t
colDec :: ColumnTypeable a => T.Text -> T.Text -> a -> DecsQ
colDec prefix colName colTy = (:) <$> mkColTDec colTypeQ colTName'
<*> mkColPDec colTName' colTyQ colPName
where colTName = sanitizeTypeName (prefix <> capitalize1 colName)
colPName = fromMaybe "colDec impossible" (lowerHead colTName)
colTName' = mkName $ T.unpack colTName
colTyQ = colType colTy
colTypeQ = [t|$(litT . strTyLit $ T.unpack colName) :-> $colTyQ|]
declareColumn :: T.Text -> Name -> DecsQ
declareColumn colName colTy = (:) <$> mkColTDec colTypeQ colTName'
<*> mkColPDec colTName' colTyQ colPName
where colTName = sanitizeTypeName colName
colPName = maybe "colDec impossible"
(\(c,t) -> T.cons (toLower c) t)
(T.uncons colTName)
colTName' = mkName $ T.unpack colTName
colTyQ = return (ConT colTy)
colTypeQ = [t|$(litT . strTyLit $ T.unpack colName) :-> $colTyQ|]
data RowGen a = RowGen { columnNames :: [String]
, tablePrefix :: String
, separator :: Separator
, rowTypeName :: String
, columnUniverse :: Proxy a
, lineReader :: P.Producer T.Text (P.SafeT IO) ()
}
colQ :: Name -> Q Exp
colQ n = [e| (Proxy :: Proxy (ColumnUniverse $(conT n))) |]
rowGen :: FilePath -> RowGen Columns
rowGen = RowGen [] "" defaultSep "Row" Proxy . PT.readFileLn
tableType :: String -> FilePath -> DecsQ
tableType n fp = tableType' (rowGen fp) { rowTypeName = n }
tableTypes :: String -> FilePath -> DecsQ
tableTypes n fp = tableTypes' (rowGen fp) { rowTypeName = n }
prefixSize :: Int
prefixSize = 1000
tableType' :: forall a. (ColumnTypeable a, Monoid a)
=> RowGen a -> DecsQ
tableType' (RowGen {..}) =
pure . TySynD (mkName rowTypeName) [] <$>
(runIO (P.runSafeT (readColHeaders opts lineSource)) >>= recDec')
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
lineSource = lineReader >-> P.take prefixSize
colNamesP :: Monad m
=> ParserOptions -> P.Producer T.Text m () -> m [T.Text]
colNamesP opts src = either (const []) (tokenizeRow opts . fst) <$> P.next src
tableTypesText' :: forall a. (ColumnTypeable a, Monoid a)
=> RowGen a -> DecsQ
tableTypesText' (RowGen {..}) =
do colNames <- runIO . P.runSafeT $
maybe (colNamesP opts lineReader)
pure
(headerOverride opts)
let headers = zip colNames (repeat (inferType " "))
recTy <- tySynD (mkName rowTypeName) [] (recDec' headers)
let optsName = case rowTypeName of
[] -> error "Row type name shouldn't be empty"
h:t -> mkName $ toLower h : t ++ "Parser"
optsTy <- sigD optsName [t|ParserOptions|]
optsDec <- valD (varP optsName) (normalB $ lift opts) []
colDecs <- concat <$> mapM (uncurry mkColDecs) headers
return (recTy : optsTy : optsDec : colDecs)
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
mkColDecs colNm colTy = do
let safeName = tablePrefix ++ (T.unpack . sanitizeTypeName $ colNm)
mColNm <- lookupTypeName safeName
case mColNm of
Just _ -> pure []
Nothing -> colDec (T.pack tablePrefix) colNm colTy
tableTypes' :: forall a. (ColumnTypeable a, Monoid a)
=> RowGen a -> DecsQ
tableTypes' (RowGen {..}) =
do headers <- runIO . P.runSafeT $ readColHeaders opts lineSource
recTy <- tySynD (mkName rowTypeName) [] (recDec' headers)
let optsName = case rowTypeName of
[] -> error "Row type name shouldn't be empty"
h:t -> mkName $ toLower h : t ++ "Parser"
optsTy <- sigD optsName [t|ParserOptions|]
optsDec <- valD (varP optsName) (normalB $ lift opts) []
colDecs <- concat <$> mapM (uncurry mkColDecs) headers
return (recTy : optsTy : optsDec : colDecs)
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
lineSource = lineReader >-> P.take prefixSize
mkColDecs colNm colTy = do
let safeName = tablePrefix ++ (T.unpack . sanitizeTypeName $ colNm)
mColNm <- lookupTypeName safeName
case mColNm of
Just _ -> pure []
Nothing -> colDec (T.pack tablePrefix) colNm colTy
produceCSV :: forall f ts m.
(ColumnHeaders ts, AsVinyl ts, Foldable f, Monad m,
RecAll Identity (UnColumn ts) Show)
=> f (Record ts) -> P.Producer String m ()
produceCSV recs = do
P.yield (intercalate "," (columnHeaders (Proxy :: Proxy (Record ts))))
F.mapM_ (P.yield . intercalate "," . showFields) recs
pipeToCSV :: forall ts m.
(Monad m, ColumnHeaders ts, AsVinyl ts,
RecAll Identity (UnColumn ts) Show)
=> P.Pipe (Record ts) T.Text m ()
pipeToCSV = P.yield (T.intercalate "," (map T.pack header)) >> go
where header = columnHeaders (Proxy :: Proxy (Record ts))
go :: P.Pipe (Record ts) T.Text m ()
go = P.map (T.intercalate "," . map T.pack . showFields)
writeCSV :: (ColumnHeaders ts, AsVinyl ts, Foldable f,
RecAll Identity (UnColumn ts) Show)
=> FilePath -> f (Record ts) -> IO ()
writeCSV fp recs = P.runSafeT . P.runEffect $
produceCSV recs >-> P.map T.pack >-> PT.writeFileLn fp