{-# LANGUAGE CPP #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveLift #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeOperators #-}

-- | Infer row types from comma-separated values (CSV) data and read
-- that data from files. Template Haskell is used to generate the
-- necessary types so that you can write type-safe programs referring
-- to those types.
module Frames.CSV where

import Control.Exception (IOException, try)
import Control.Monad (unless, when)
import qualified Data.ByteString.Char8 as B8
import qualified Data.Foldable as F
import Data.List (intercalate)
import Data.Maybe (fromMaybe, isNothing)
#if __GLASGOW_HASKELL__ < 808
import Data.Monoid ((<>))
#endif
import Data.Proxy
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import Data.Vinyl (ElField (..), RMap, Rec (..), RecMapMethod, RecordToList,
                   recordToList, rmap, rmapMethod, rtraverse)
import Data.Vinyl.Class.Method (PayloadType)
import Data.Vinyl.Functor (Compose (..), Const (..), (:.))
import Frames.Col
import Frames.ColumnTypeable
import Frames.Rec
import Frames.RecF
import Frames.ShowCSV
import GHC.TypeLits (KnownSymbol)
import Language.Haskell.TH
import Language.Haskell.TH.Syntax
import Pipes ((>->))
import qualified Pipes as P
import qualified Pipes.Parse as P
import qualified Pipes.Prelude as P
import qualified Pipes.Safe as P
import qualified Pipes.Safe.Prelude as Safe
import System.IO (Handle, IOMode (ReadMode, WriteMode), hPrint, stderr)

-- * Parsing

type Separator = T.Text

type QuoteChar = Char

data QuotingMode
  = -- | No quoting enabled. The separator may not appear in values.
    NoQuoting
  | -- | Quoted values with the given quoting character. Quotes are escaped by doubling them.
    -- Mostly RFC 4180 compliant, except that newlines within values are not supported.
    RFC4180Quoting QuoteChar
  deriving (Eq, Show, Lift)

data ParserOptions = ParserOptions
  { headerOverride :: Maybe [T.Text]
  , columnSeparator :: Separator
  , quotingMode :: QuotingMode
  }
  deriving (Eq, Show)

instance Lift ParserOptions where
  lift (ParserOptions Nothing sep quoting) =
    [|ParserOptions Nothing $sep' $quoting'|]
    where
      sep' = [|T.pack $(stringE $ T.unpack sep)|]
      quoting' = lift quoting
  lift (ParserOptions (Just hs) sep quoting) =
    [|ParserOptions (Just $hs') $sep' $quoting'|]
    where
      sep' = [|T.pack $(stringE $ T.unpack sep)|]
      hs' = [|map T.pack $(listE $ map (stringE . T.unpack) hs)|]
      quoting' = lift quoting
#if MIN_VERSION_template_haskell(2,16,0)
#if MIN_VERSION_template_haskell(2,17,0)
  liftTyped = liftCode . unsafeTExpCoerce . lift
#else
  liftTyped = unsafeTExpCoerce . lift
#endif
#endif

-- | Default 'ParserOptions' get column names from a header line, and
-- use commas to separate columns.
defaultParser :: ParserOptions
defaultParser = ParserOptions Nothing defaultSep (RFC4180Quoting '\"')

-- | Default separator string.
defaultSep :: Separator
defaultSep = T.pack ","

-- | Helper to split a 'T.Text' row into tokens on the configured
-- separator. When quoting is enabled, quoted sections containing the
-- separator are reassembled and unquoted chunks are stripped of
-- leading and trailing whitespace.
tokenizeRow :: ParserOptions -> T.Text -> [T.Text]
tokenizeRow options = handleQuoting . T.splitOn sep
  where
    sep = columnSeparator options
    quoting = quotingMode options
    handleQuoting = case quoting of
      NoQuoting -> id
      RFC4180Quoting quote -> reassembleRFC4180QuotedParts sep quote
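-- A minimal usage sketch (illustrative only, not part of the module's
-- API): tokenizing a quoted field with the default options, and
-- building a variant of 'defaultParser' for unquoted, tab-separated
-- data. The name @tsvParser@ is an assumption used only in these
-- example comments.
--
-- @
-- tokenizeRow defaultParser (T.pack "a,\"b, c\",d")  -- ["a","b, c","d"]
--
-- tsvParser :: ParserOptions
-- tsvParser = defaultParser { columnSeparator = T.pack "\t"
--                           , quotingMode = NoQuoting }
-- @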
-- | Post-processing applied to a list of tokens split by the
-- separator which should have quoted sections reassembled.
reassembleRFC4180QuotedParts :: Separator -> QuoteChar -> [T.Text] -> [T.Text]
reassembleRFC4180QuotedParts sep quoteChar = go
  where
    go [] = []
    go (part : parts)
      | T.null part = T.empty : go parts
      | prefixQuoted part =
          if suffixQuoted part
            then unescape (T.drop 1 . T.dropEnd 1 $ part) : go parts
            else case break suffixQuoted parts of
              (h, []) -> [unescape (T.intercalate sep (T.drop 1 part : h))]
              (h, t : ts) ->
                unescape
                  (T.intercalate sep (T.drop 1 part : h ++ [T.dropEnd 1 t]))
                  : go ts
      | otherwise = T.strip part : go parts
    prefixQuoted t = T.head t == quoteChar
      -- &&
      -- T.length (T.takeWhile (== quoteChar) t) `rem` 2 == 1
    suffixQuoted t = quoteText `T.isSuffixOf` t
      -- &&
      -- T.length (T.takeWhileEnd (== quoteChar) t) `rem` 2 == 1
    quoteText = T.singleton quoteChar
    unescape :: T.Text -> T.Text
    unescape = T.replace q2 quoteText
      where
        q2 = quoteText <> quoteText

-- tokenizeRow :: Separator -> T.Text -> [T.Text]
-- tokenizeRow sep = map (unquote . T.strip) . T.splitOn sep
--   where unquote txt
--           | quoted txt = case T.dropEnd 1 (T.drop 1 txt) of
--               txt' | T.null txt' -> "Col"
--                    | numish txt' -> txt
--                    | otherwise -> txt'
--           | otherwise = txt
--         numish = T.all (`elem` ("-+.0123456789"::String))
--         quoted txt = case T.uncons txt of
--           Just ('"', rst)
--             | not (T.null rst) -> T.last rst == '"'
--           _ -> False

-- | Infer column types from the rows produced by a CSV producer.
-- Callers typically limit the producer to a prefix of the file
-- (e.g. the first 1000 lines) before running this inference.
prefixInference ::
  (ColumnTypeable a, Semigroup a, Monad m, Show a) =>
  P.Parser [T.Text] m [a]
prefixInference =
  P.draw >>= \case
    Nothing -> return []
    Just row1 ->
      P.foldAll
        (\ts -> zipWith (<>) ts . inferCols)
        (inferCols row1)
        id
  where
    inferCols = map inferType

-- | Extract column names and inferred types from a CSV file.
readColHeaders ::
  (ColumnTypeable a, Semigroup a, Monad m, Show a) =>
  ParserOptions ->
  P.Producer [T.Text] m () ->
  m [(T.Text, a)]
readColHeaders opts = P.evalStateT $ do
  headerRow <- maybe (fromMaybe err <$> P.draw) pure (headerOverride opts)
  colTypes <- prefixInference
  unless
    (length headerRow == length colTypes)
    (error errNumColumns)
  return (zip headerRow colTypes)
  where
    err = error "Empty Producer has no header row"
    errNumColumns =
      unlines
        [ ""
        , "Error parsing CSV: "
        , " Number of columns in header differs from number of columns"
        , " found in the remaining file. This may be due to newlines"
        , " being present within the data itself (not just separating"
        , " rows). If support for embedded newlines is required, "
        , " consider using the Frames-dsv package in conjunction with"
        , " Frames to make use of a different CSV parser."
        ]

-- * Loading CSV Data

-- | Parsing each component of a 'RecF' from a list of text chunks,
-- one chunk per record component.
class ReadRec rs where
  readRec :: [T.Text] -> Rec (Either T.Text :. ElField) rs

instance ReadRec '[] where
  readRec _ = RNil

instance (Parseable t, ReadRec ts, KnownSymbol s) => ReadRec (s :-> t ': ts) where
  readRec [] = Compose (Left mempty) :& readRec []
  readRec (h : t) =
    maybe
      (Compose (Left (T.copy h)))
      (Compose . Right . Field)
      (parse' h)
      :& readRec t
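-- A sketch of how 'ReadRec' is used on one row of tokens. The row type
-- @MyRow@, its column names, and the sample input are assumptions for
-- illustration only; a field that fails to parse is kept as 'Left'
-- raw text.
--
-- @
-- type MyRow = '["name" :-> T.Text, "age" :-> Int]
--
-- parsedRow :: Rec (Either T.Text :. ElField) MyRow
-- parsedRow = readRec (tokenizeRow defaultParser (T.pack "bob,not-a-number"))
-- -- name parses successfully; age is retained as Left "not-a-number"
-- @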
-- | Opens a file (in 'P.MonadSafe') and repeatedly applies the given
-- function to the 'Handle' to obtain lines to yield. Adapted from the
-- moribund pipes-text package.
pipeLines ::
  (P.MonadSafe m) =>
  (Handle -> IO (Either IOException T.Text)) ->
  FilePath ->
  P.Producer T.Text m ()
pipeLines pgetLine fp =
  Safe.withFile fp ReadMode $ \h ->
    let loop = do
          txt <- P.liftIO (pgetLine h)
          case txt of
            Left _e -> return ()
            Right y -> P.yield y >> loop
     in loop

-- | Produce lines of 'T.Text'.
produceTextLines :: (P.MonadSafe m) => FilePath -> P.Producer T.Text m ()
produceTextLines = pipeLines (try . T.hGetLine)

-- | Produce lines of tokens that were separated by the given
-- separator.
produceTokens :: (P.MonadSafe m) => FilePath -> Separator -> P.Producer [T.Text] m ()
produceTokens fp sep = produceTextLines fp >-> P.map tokenize
  where
    tokenize = tokenizeRow popts
    popts = defaultParser{columnSeparator = sep}

-- | Consume lines of 'T.Text', writing them to a file.
consumeTextLines :: (P.MonadSafe m) => FilePath -> P.Consumer T.Text m r
consumeTextLines fp =
  Safe.withFile fp WriteMode $ \h ->
    let loop = P.await >>= P.liftIO . T.hPutStrLn h >> loop
     in loop

-- | Produce the lines of a latin1 (i.e. ISO 8859-1) encoded file as
-- 'T.Text' values.
readFileLatin1Ln :: (P.MonadSafe m) => FilePath -> P.Producer [T.Text] m ()
readFileLatin1Ln fp =
  pipeLines (try . fmap T.decodeLatin1 . B8.hGetLine) fp
    >-> P.map (tokenizeRow defaultParser)

-- | Read a 'RecF' from one line of CSV.
readRow :: (ReadRec rs) => ParserOptions -> T.Text -> Rec (Either T.Text :. ElField) rs
readRow = (readRec .) . tokenizeRow

-- | Produce rows where any given entry can fail to parse.
readTableMaybeOpt ::
  (P.MonadSafe m, ReadRec rs, RMap rs) =>
  ParserOptions ->
  FilePath ->
  P.Producer (Rec (Maybe :. ElField) rs) m ()
readTableMaybeOpt opts csvFile =
  produceTokens csvFile (columnSeparator opts) >-> pipeTableMaybeOpt opts

-- | Stream lines of CSV data into rows of 'Rec' values where any
-- given entry can fail to parse.
pipeTableMaybeOpt ::
  (Monad m, ReadRec rs, RMap rs) =>
  ParserOptions ->
  P.Pipe [T.Text] (Rec (Maybe :. ElField) rs) m ()
pipeTableMaybeOpt opts = do
  when (isNothing (headerOverride opts)) (() <$ P.await)
  P.map
    ( rmap (either (const (Compose Nothing)) (Compose . Just) . getCompose)
        . readRec
    )

-- | Stream lines of CSV data into rows of 'Rec' values where any
-- given entry can fail to parse. In the case of a parse failure, the
-- raw 'T.Text' of that entry is retained.
pipeTableEitherOpt ::
  (Monad m, ReadRec rs) =>
  ParserOptions ->
  P.Pipe T.Text (Rec (Either T.Text :. ElField) rs) m ()
pipeTableEitherOpt opts = do
  when (isNothing (headerOverride opts)) (() <$ P.await)
  P.map (readRow opts)

-- | Produce rows where any given entry can fail to parse.
readTableMaybe ::
  (P.MonadSafe m, ReadRec rs, RMap rs) =>
  FilePath ->
  P.Producer (Rec (Maybe :. ElField) rs) m ()
readTableMaybe = readTableMaybeOpt defaultParser

-- | Stream lines of CSV data into rows of 'Rec' values where any
-- given entry can fail to parse.
pipeTableMaybe ::
  (Monad m, ReadRec rs, RMap rs) =>
  P.Pipe [T.Text] (Rec (Maybe :. ElField) rs) m ()
pipeTableMaybe = pipeTableMaybeOpt defaultParser

-- | Stream lines of CSV data into rows of 'Rec' values where any
-- given entry can fail to parse. In the case of a parse failure, the
-- raw 'T.Text' of that entry is retained.
pipeTableEither ::
  (Monad m, ReadRec rs) =>
  P.Pipe T.Text (Rec (Either T.Text :. ElField) rs) m ()
pipeTableEither = pipeTableEitherOpt defaultParser
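-- A sketch of consuming the streaming readers above, reusing the
-- illustrative @MyRow@ from the earlier sketch; the file name
-- @data/example.csv@ is an assumption.
--
-- @
-- loadRows :: IO [Rec (Maybe :. ElField) MyRow]
-- loadRows = P.runSafeT (P.toListM (readTableMaybe "data/example.csv"))
-- @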
-- -- | Returns a `MonadPlus` producer of rows for which each column was
-- -- successfully parsed. This is typically slower than 'readTableOpt'.
-- readTableOpt' :: forall m rs.
--                  (MonadPlus m, MonadIO m, ReadRec rs)
--               => ParserOptions -> FilePath -> m (Record rs)
-- readTableOpt' opts csvFile =
--   do h <- liftIO $ do
--             h <- openFile csvFile ReadMode
--             when (isNothing $ headerOverride opts) (void $ T.hGetLine h)
--             return h
--      let go = liftIO (hIsEOF h) >>= \case
--                 True -> mzero
--                 False -> let r = recMaybe . readRow opts <$> T.hGetLine h
--                          in liftIO r >>= maybe go (flip mplus go . return)
--      go

-- -- | Returns a `MonadPlus` producer of rows for which each column was
-- -- successfully parsed. This is typically slower than 'readTable'.
-- readTable' :: forall m rs. (P.MonadSafe m, ReadRec rs)
--            => FilePath -> m (Record rs)
-- readTable' = readTableOpt' defaultParser

-- | Returns a producer of rows for which each column was successfully
-- parsed.
readTableOpt ::
  (P.MonadSafe m, ReadRec rs, RMap rs) =>
  ParserOptions ->
  FilePath ->
  P.Producer (Record rs) m ()
readTableOpt opts csvFile = readTableMaybeOpt opts csvFile P.>-> go
  where
    go = P.await >>= maybe go (\x -> P.yield x >> go) . recMaybe

-- | Pipe lines of CSV text into rows for which each column was
-- successfully parsed.
pipeTableOpt ::
  (ReadRec rs, RMap rs, Monad m) =>
  ParserOptions ->
  P.Pipe [T.Text] (Record rs) m ()
pipeTableOpt opts = pipeTableMaybeOpt opts >-> P.map recMaybe >-> P.concat

-- | Returns a producer of rows for which each column was successfully
-- parsed.
readTable ::
  (P.MonadSafe m, ReadRec rs, RMap rs) =>
  FilePath ->
  P.Producer (Record rs) m ()
readTable = readTableOpt defaultParser

-- | Parse one row of tokens, returning the fully parsed 'Record' on
-- the right, or the partially parsed row (retaining the raw 'T.Text'
-- of any field that failed to parse) on the left.
readRecEither ::
  (ReadRec rs, RMap rs) =>
  [T.Text] ->
  Either (Rec (Either T.Text :. ElField) rs) (Record rs)
readRecEither tokens =
  let tmp = readRec tokens
   in case rtraverse getCompose tmp of
        Right r -> Right r
        _ -> Left tmp

-- | Similar to 'readTable' except that rows that fail to parse are
-- printed to @stderr@ with columns that failed to parse printed as
-- @"Left rawtext"@ while those that were successfully parsed are
-- shown as @"Right text"@.
readTableDebug ::
  forall m rs.
  ( P.MonadSafe m
  , ReadRec rs
  , RMap rs
  , RecMapMethod ShowCSV (Either T.Text :. ElField) rs
  , RecordToList rs
  ) =>
  FilePath ->
  P.Producer (Record rs) m ()
readTableDebug csvFile =
  produceTokens csvFile (columnSeparator opts) >-> go >-> debugAll
  where
    opts = defaultParser
    go = do
      when (isNothing (headerOverride opts)) (() <$ P.await)
      P.map readRecEither
    debugAll = do
      P.await >>= either (P.liftIO . hPrint stderr . debugOne) P.yield
      debugAll
    debugOne = recordToList . rmapMethod @ShowCSV (aux . getCompose)
    aux ::
      (ShowCSV (PayloadType ElField a)) =>
      Either T.Text (ElField a) ->
      Const T.Text a
    aux (Right (Field x)) = Const ("Right " <> showCSV x)
    aux (Left txt) = Const ("Left " <> txt)

-- | Pipe lines of CSV text into rows for which each column was
-- successfully parsed.
pipeTable :: (ReadRec rs, RMap rs, Monad m) => P.Pipe [T.Text] (Record rs) m ()
pipeTable = pipeTableOpt defaultParser
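-- A sketch of reading a file that has no header row: supplying
-- 'headerOverride' tells the reader not to skip the first line, so it
-- is parsed as data. @MyRow@, the column names, and the file name are
-- assumptions for illustration.
--
-- @
-- noHeaderRows :: IO [Record MyRow]
-- noHeaderRows = P.runSafeT . P.toListM $
--   readTableOpt
--     (defaultParser { headerOverride = Just [T.pack "name", T.pack "age"] })
--     "data/noheader.csv"
-- @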
-- * Writing CSV Data

-- | Render each field of a 'Record' as CSV text using its 'ShowCSV'
-- instance.
showFieldsCSV ::
  (RecMapMethod ShowCSV ElField ts, RecordToList ts) =>
  Record ts ->
  [T.Text]
showFieldsCSV = recordToList . rmapMethod @ShowCSV aux
  where
    aux :: (ShowCSV (PayloadType ElField a)) => ElField a -> Const T.Text a
    aux (Field x) = Const (showCSV x)

-- | 'P.yield' a header row with column names followed by a line of
-- text for each 'Record' with each field separated by a comma. If
-- your source of 'Record' values is a 'P.Producer', consider using
-- 'pipeToCSV' to keep everything streaming.
produceCSV ::
  forall f ts m.
  ( ColumnHeaders ts
  , Foldable f
  , Monad m
  , RecordToList ts
  , RecMapMethod ShowCSV ElField ts
  ) =>
  f (Record ts) ->
  P.Producer String m ()
produceCSV = produceDSV defaultParser

-- | Like 'produceCSV', but fields are separated by the
-- 'columnSeparator' of the given 'ParserOptions'.
produceDSV ::
  forall f ts m.
  ( ColumnHeaders ts
  , Foldable f
  , Monad m
  , RecordToList ts
  , RecMapMethod ShowCSV ElField ts
  ) =>
  ParserOptions ->
  f (Record ts) ->
  P.Producer String m ()
produceDSV opts recs = do
  P.yield (intercalate (T.unpack separator) (columnHeaders (Proxy :: Proxy (Record ts))))
  F.mapM_ (P.yield . T.unpack . T.intercalate separator . showFieldsCSV) recs
  where
    separator = columnSeparator opts

-- | 'P.yield' a header row with column names followed by a line of
-- text for each 'Record' with each field separated by a comma. This
-- is the same as 'produceCSV', but adapted for cases where you have
-- streaming input that you wish to use to produce streaming output.
pipeToCSV ::
  forall ts m.
  ( Monad m
  , ColumnHeaders ts
  , RecordToList ts
  , RecMapMethod ShowCSV ElField ts
  ) =>
  P.Pipe (Record ts) T.Text m ()
pipeToCSV = P.yield (T.intercalate "," (map T.pack header)) >> go
  where
    header = columnHeaders (Proxy :: Proxy (Record ts))
    go :: P.Pipe (Record ts) T.Text m ()
    go = P.map (T.intercalate "," . showFieldsCSV)

-- | Write a header row with column names followed by a line of text
-- for each 'Record' to the given file.
writeCSV ::
  ( ColumnHeaders ts
  , Foldable f
  , RecordToList ts
  , RecMapMethod ShowCSV ElField ts
  ) =>
  FilePath ->
  f (Record ts) ->
  IO ()
writeCSV = writeDSV defaultParser

-- | Write a header row with column names followed by a line of text
-- for each 'Record' to the given file, using the 'columnSeparator' of
-- the given 'ParserOptions' to separate fields.
writeDSV ::
  ( ColumnHeaders ts
  , Foldable f
  , RecordToList ts
  , RecMapMethod ShowCSV ElField ts
  ) =>
  ParserOptions ->
  FilePath ->
  f (Record ts) ->
  IO ()
writeDSV opts fp recs =
  P.runSafeT . P.runEffect $
    produceDSV opts recs >-> P.map T.pack >-> consumeTextLines fp
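-- A sketch of writing records back out, here as tab-separated values
-- using the illustrative @tsvParser@ and @MyRow@ from the sketches
-- earlier in the module; the output path is an assumption.
--
-- @
-- saveRows :: [Record MyRow] -> IO ()
-- saveRows = writeDSV tsvParser "data/out.tsv"
-- @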