module Frames.CSV where
#if __GLASGOW_HASKELL__ < 710
import Control.Applicative ((<$>), pure, (<*>))
import Data.Foldable (foldMap)
import Data.Traversable (sequenceA)
import Data.Monoid (Monoid)
#endif
import Control.Arrow (first, second)
import Control.Monad (when, void)
import Data.Char (isAlpha, isAlphaNum, toLower, toUpper)
import qualified Data.Foldable as F
import Data.List (intercalate)
import Data.Maybe (isNothing, fromMaybe)
import Data.Monoid ((<>))
import Data.Proxy
import qualified Data.Text as T
import Data.Vinyl (RElem, Rec)
import Data.Vinyl.TypeLevel (RecAll, RIndex)
import Data.Vinyl.Functor (Identity)
import Frames.Col
import Frames.ColumnTypeable
import Frames.ColumnUniverse
import Frames.Rec
import Frames.RecF
import Frames.RecLens
import Language.Haskell.TH
import Language.Haskell.TH.Syntax
import Pipes ((>->))
import qualified Pipes as P
import qualified Pipes.Prelude as P
import qualified Pipes.ByteString
import qualified Pipes.Group
import qualified Pipes.Parse as P
import qualified Pipes.Prelude.Text as PT
import qualified Pipes.Text as PT
import qualified Pipes.Text.Encoding as PT
import qualified Pipes.Safe as P
import qualified Pipes.Safe.Prelude
import System.IO (IOMode(ReadMode))
type Separator = T.Text
type QuoteChar = Char
data QuotingMode
= NoQuoting
| RFC4180Quoting QuoteChar
deriving (Eq, Show)
data ParserOptions = ParserOptions { headerOverride :: Maybe [T.Text]
, columnSeparator :: Separator
, quotingMode :: QuotingMode }
deriving (Eq, Show)
instance Lift QuotingMode where
lift NoQuoting = [|NoQuoting|]
lift (RFC4180Quoting char) = [|RFC4180Quoting $(litE . charL $ char)|]
instance Lift ParserOptions where
lift (ParserOptions Nothing sep quoting) = [|ParserOptions Nothing $sep' $quoting'|]
where sep' = [|T.pack $(stringE $ T.unpack sep)|]
quoting' = lift quoting
lift (ParserOptions (Just hs) sep quoting) = [|ParserOptions (Just $hs') $sep' $quoting'|]
where sep' = [|T.pack $(stringE $ T.unpack sep)|]
hs' = [|map T.pack $(listE $ map (stringE . T.unpack) hs)|]
quoting' = lift quoting
defaultParser :: ParserOptions
defaultParser = ParserOptions Nothing defaultSep (RFC4180Quoting '\"')
defaultSep :: Separator
defaultSep = T.pack ","
tokenizeRow :: ParserOptions -> T.Text -> [T.Text]
tokenizeRow options =
handleQuoting . T.splitOn sep
where sep = columnSeparator options
quoting = quotingMode options
handleQuoting = case quoting of
NoQuoting -> id
RFC4180Quoting quote -> reassembleRFC4180QuotedParts sep quote
reassembleRFC4180QuotedParts :: Separator -> QuoteChar -> [T.Text] -> [T.Text]
reassembleRFC4180QuotedParts sep quoteChar = go
where go [] = []
go (part:parts)
| T.null part = T.empty : go parts
| prefixQuoted part =
if suffixQuoted part
then unescape (T.drop 1 . T.dropEnd 1 $ part) : go parts
else case break suffixQuoted parts of
(h,[]) -> [unescape (T.intercalate sep (T.drop 1 part : h))]
(h,t:ts) -> unescape
(T.intercalate
sep
(T.drop 1 part : h ++ [T.dropEnd 1 t]))
: go ts
| otherwise = T.strip part : go parts
prefixQuoted t =
T.head t == quoteChar &&
T.length (T.takeWhile (== quoteChar) t) `rem` 2 == 1
suffixQuoted t =
quoteText `T.isSuffixOf` t &&
T.length (T.takeWhileEnd (== quoteChar) t) `rem` 2 == 1
quoteText = T.singleton quoteChar
unescape :: T.Text -> T.Text
unescape = T.replace q2 quoteText
where q2 = quoteText <> quoteText
prefixInference :: (ColumnTypeable a, Monoid a, Monad m)
=> ParserOptions
-> P.Parser T.Text m [a]
prefixInference opts = P.draw >>= \case
Nothing -> return []
Just row1 -> P.foldAll (\ts -> zipWith (<>) ts . inferCols)
(inferCols row1)
id
where inferCols = map inferType . tokenizeRow opts
readColHeaders :: (ColumnTypeable a, Monoid a, Monad m)
=> ParserOptions -> P.Producer T.Text m () -> m [(T.Text, a)]
readColHeaders opts = P.evalStateT $
do headerRow <- maybe ((tokenizeRow opts
. fromMaybe (error "Empty Producer has no header row")) <$> P.draw)
pure
(headerOverride opts)
colTypes <- prefixInference opts
return (zip headerRow colTypes)
class ReadRec (rs :: [*]) where
readRec :: [T.Text] -> Rec Maybe rs
instance ReadRec '[] where
readRec _ = Nil
instance (Parseable t, ReadRec ts) => ReadRec (s :-> t ': ts) where
readRec [] = frameCons Nothing (readRec [])
readRec (h:t) = frameCons (parse' h) (readRec t)
readFileLatin1Ln :: P.MonadSafe m => FilePath -> P.Producer T.Text m ()
readFileLatin1Ln fp = Pipes.Safe.Prelude.withFile fp ReadMode $ \h ->
let latinText = void (PT.decodeIso8859_1 (Pipes.ByteString.fromHandle h))
latinLines = PT.decode PT.lines latinText
in Pipes.Group.concats latinLines
readRow :: ReadRec rs => ParserOptions -> T.Text -> Rec Maybe rs
readRow = (readRec .) . tokenizeRow
readTableMaybeOpt :: (P.MonadSafe m, ReadRec rs)
=> ParserOptions -> FilePath -> P.Producer (Rec Maybe rs) m ()
readTableMaybeOpt opts csvFile =
PT.readFileLn csvFile >-> pipeTableMaybeOpt opts
pipeTableMaybeOpt :: (Monad m, ReadRec rs)
=> ParserOptions
-> P.Pipe T.Text (Rec Maybe rs) m ()
pipeTableMaybeOpt opts = do
when (isNothing (headerOverride opts)) (() <$ P.await)
P.map (readRow opts)
readTableMaybe :: (P.MonadSafe m, ReadRec rs)
=> FilePath -> P.Producer (Rec Maybe rs) m ()
readTableMaybe = readTableMaybeOpt defaultParser
pipeTableMaybe :: (Monad m, ReadRec rs) => P.Pipe T.Text (Rec Maybe rs) m ()
pipeTableMaybe = pipeTableMaybeOpt defaultParser
readTableOpt :: forall m rs.
(P.MonadSafe m, ReadRec rs)
=> ParserOptions -> FilePath -> P.Producer (Record rs) m ()
readTableOpt opts csvFile = readTableMaybeOpt opts csvFile P.>-> go
where go = P.await >>= maybe go (\x -> P.yield x >> go) . recMaybe
pipeTableOpt :: (ReadRec rs, Monad m)
=> ParserOptions -> P.Pipe T.Text (Record rs) m ()
pipeTableOpt opts = pipeTableMaybeOpt opts >-> P.map recMaybe >-> P.concat
readTable :: forall m rs. (P.MonadSafe m, ReadRec rs)
=> FilePath -> P.Producer (Record rs) m ()
readTable = readTableOpt defaultParser
pipeTable :: (ReadRec rs, Monad m) => P.Pipe T.Text (Record rs) m ()
pipeTable = pipeTableOpt defaultParser
recDec :: [(T.Text, Q Type)] -> Q Type
recDec = appT [t|Record|] . go
where go [] = return PromotedNilT
go ((n,t):cs) =
[t|($(litT $ strTyLit (T.unpack n)) :-> $(t)) ': $(go cs) |]
capitalize1 :: T.Text -> T.Text
capitalize1 = foldMap (onHead toUpper) . T.split (not . isAlphaNum)
where onHead f = maybe mempty (uncurry T.cons . first f) . T.uncons
sanitizeTypeName :: T.Text -> T.Text
sanitizeTypeName = unreserved . fixupStart
. T.concat . T.split (not . valid) . capitalize1
where valid c = isAlphaNum c || c == '\'' || c == '_'
unreserved t
| t `elem` ["Type", "Class"] = "Col" <> t
| otherwise = t
fixupStart t = case T.uncons t of
Nothing -> "Col"
Just (c,_) | isAlpha c -> t
| otherwise -> "Col" <> t
mkColTDec :: TypeQ -> Name -> DecQ
mkColTDec colTypeQ colTName = tySynD colTName [] colTypeQ
mkColPDec :: Name -> TypeQ -> T.Text -> DecsQ
mkColPDec colTName colTy colPName = sequenceA [tySig, val, tySig', val']
where nm = mkName $ T.unpack colPName
nm' = mkName $ T.unpack colPName <> "'"
tySig = sigD nm [t|forall f rs. (Functor f,
RElem $(conT colTName) rs (RIndex $(conT colTName) rs))
=> ($colTy -> f $colTy)
-> Record rs
-> f (Record rs)
|]
tySig' = sigD nm' [t|forall f g rs. (Functor f, Functor g,
RElem $(conT colTName) rs (RIndex $(conT colTName) rs))
=> (g $(conT colTName) -> f (g $(conT colTName)))
-> Rec g rs
-> f (Rec g rs)
|]
val = valD (varP nm)
(normalB [e|rlens (Proxy :: Proxy $(conT colTName))|])
[]
val' = valD (varP nm')
(normalB [e|rlens' (Proxy :: Proxy $(conT colTName))|])
[]
lowerHead :: T.Text -> Maybe T.Text
lowerHead = fmap aux . T.uncons
where aux (c,t) = T.cons (toLower c) t
colDec :: ColumnTypeable a => T.Text -> T.Text -> a -> DecsQ
colDec prefix colName colTy = (:) <$> mkColTDec colTypeQ colTName'
<*> mkColPDec colTName' colTyQ colPName
where colTName = sanitizeTypeName (prefix <> capitalize1 colName)
colPName = fromMaybe "colDec impossible" (lowerHead colTName)
colTName' = mkName $ T.unpack colTName
colTyQ = colType colTy
colTypeQ = [t|$(litT . strTyLit $ T.unpack colName) :-> $colTyQ|]
declareColumn :: T.Text -> Name -> DecsQ
declareColumn colName colTy = (:) <$> mkColTDec colTypeQ colTName'
<*> mkColPDec colTName' colTyQ colPName
where colTName = sanitizeTypeName colName
colPName = maybe "colDec impossible"
(\(c,t) -> T.cons (toLower c) t)
(T.uncons colTName)
colTName' = mkName $ T.unpack colTName
colTyQ = return (ConT colTy)
colTypeQ = [t|$(litT . strTyLit $ T.unpack colName) :-> $colTyQ|]
data RowGen a = RowGen { columnNames :: [String]
, tablePrefix :: String
, separator :: Separator
, rowTypeName :: String
, columnUniverse :: Proxy a
, lineReader :: P.Producer T.Text (P.SafeT IO) ()
}
colQ :: Name -> Q Exp
colQ n = [e| (Proxy :: Proxy (ColumnUniverse $(conT n))) |]
rowGen :: FilePath -> RowGen Columns
rowGen = RowGen [] "" defaultSep "Row" Proxy . PT.readFileLn
tableType :: String -> FilePath -> DecsQ
tableType n fp = tableType' (rowGen fp) { rowTypeName = n }
tableTypes :: String -> FilePath -> DecsQ
tableTypes n fp = tableTypes' (rowGen fp) { rowTypeName = n }
prefixSize :: Int
prefixSize = 1000
tableType' :: forall a. (ColumnTypeable a, Monoid a)
=> RowGen a -> DecsQ
tableType' (RowGen {..}) =
pure . TySynD (mkName rowTypeName) [] <$>
(runIO (P.runSafeT (readColHeaders opts lineSource)) >>= recDec')
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
lineSource = lineReader >-> P.take prefixSize
colNamesP :: Monad m
=> ParserOptions -> P.Producer T.Text m () -> m [T.Text]
colNamesP opts src = either (const []) (tokenizeRow opts . fst) <$> P.next src
tableTypesText' :: forall a. (ColumnTypeable a, Monoid a)
=> RowGen a -> DecsQ
tableTypesText' (RowGen {..}) =
do colNames <- runIO . P.runSafeT $
maybe (colNamesP opts lineReader)
pure
(headerOverride opts)
let headers = zip colNames (repeat (inferType " "))
recTy <- tySynD (mkName rowTypeName) [] (recDec' headers)
let optsName = case rowTypeName of
[] -> error "Row type name shouldn't be empty"
h:t -> mkName $ toLower h : t ++ "Parser"
optsTy <- sigD optsName [t|ParserOptions|]
optsDec <- valD (varP optsName) (normalB $ lift opts) []
colDecs <- concat <$> mapM (uncurry $ colDec (T.pack tablePrefix)) headers
return (recTy : optsTy : optsDec : colDecs)
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
tableTypes' :: forall a. (ColumnTypeable a, Monoid a)
=> RowGen a -> DecsQ
tableTypes' (RowGen {..}) =
do headers <- runIO . P.runSafeT $ readColHeaders opts lineSource
recTy <- tySynD (mkName rowTypeName) [] (recDec' headers)
let optsName = case rowTypeName of
[] -> error "Row type name shouldn't be empty"
h:t -> mkName $ toLower h : t ++ "Parser"
optsTy <- sigD optsName [t|ParserOptions|]
optsDec <- valD (varP optsName) (normalB $ lift opts) []
colDecs <- concat <$> mapM (uncurry mkColDecs) headers
return (recTy : optsTy : optsDec : colDecs)
where recDec' = recDec . map (second colType) :: [(T.Text, a)] -> Q Type
colNames' | null columnNames = Nothing
| otherwise = Just (map T.pack columnNames)
opts = ParserOptions colNames' separator (RFC4180Quoting '\"')
lineSource = lineReader >-> P.take prefixSize
mkColDecs colNm colTy = do
let safeName = tablePrefix ++ (T.unpack . sanitizeTypeName $ colNm)
mColNm <- lookupTypeName safeName
case mColNm of
Just _ -> pure []
Nothing -> colDec (T.pack tablePrefix) colNm colTy
produceCSV :: forall f ts m.
(ColumnHeaders ts, AsVinyl ts, Foldable f, Monad m,
RecAll Identity (UnColumn ts) Show)
=> f (Record ts) -> P.Producer String m ()
produceCSV recs = do
P.yield (intercalate "," (columnHeaders (Proxy :: Proxy (Record ts))))
F.mapM_ (P.yield . intercalate "," . showFields) recs
writeCSV :: (ColumnHeaders ts, AsVinyl ts, Foldable f,
RecAll Identity (UnColumn ts) Show)
=> FilePath -> f (Record ts) -> IO ()
writeCSV fp recs = P.runSafeT . P.runEffect $
produceCSV recs >-> P.map T.pack >-> PT.writeFileLn fp