{-# LANGUAGE TypeOperators #-}
-- | CSV parsers for use with the @Frames@ package.
--
-- Most commonly used are 'dsvTableTypes' for generating type
-- definitions at compile time based on a CSV file, and 'readDsvTable'
-- to load the table at run time. These are comparable to @tableTypes@
-- and @readTable@ from the @Frames@ package, but use an alternative
-- CSV parser.
module Frames.Dsv where
import Control.Monad (when)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Data.Maybe (isNothing)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Vector as V
import Data.Vinyl
import Data.Vinyl.Functor ((:.), Compose(..))
import Data.Word (Word8)

import Frames (recMaybe, Record)
import Frames.CSV (defaultParser, pipeTableMaybeOpt)
import Frames.CSV (ReadRec(..), Separator, ParserOptions(..))
import Frames.TH (rowGen, RowGen(..), tableTypes')

import qualified HaskellWorks.Data.Dsv.Lazy.Cursor as SVL

import Pipes (MonadIO, Producer, (>->), yield)
import qualified Pipes as P
import qualified Pipes.Prelude as P

import Language.Haskell.TH

-- * Row Reading

-- | Produce one DSV row at a time.
rowLoop :: Monad m => SVL.DsvCursor -> Producer [BS.ByteString] m ()
rowLoop c =
  if SVL.dsvCursorPosition d > SVL.dsvCursorPosition c && not (SVL.atEnd c)
  then do yield (V.toList (SVL.getRowBetweenStrict c d dEnd))
          rowLoop (SVL.trim d)
  else return ()
  where nr = SVL.nextRow c
        d = SVL.nextPosition nr
        dEnd = SVL.atEnd nr

-- | Produce rows of raw 'LBS.ByteString' values.
dsvRowsByte :: MonadIO m => FilePath -> Word8 -> Producer [BS.ByteString] m ()
dsvRowsByte fp columnSeparator =
  do bs <- P.liftIO (LBS.readFile fp)
     rowLoop (SVL.makeCursor columnSeparator bs)

-- | Produce rows of UTF-8 encoded 'Text' values.
dsvRows' :: MonadIO m => FilePath -> Word8 -> Producer [Text] m ()
dsvRows' fp = (>-> P.map (map T.decodeUtf8)) . dsvRowsByte fp

-- | Produce rows of Latin-1 (aka ISO-8859-1) encoded 'Text' values.
dsvRowsLatin1' :: MonadIO m => FilePath -> Word8 -> Producer [Text] m ()
dsvRowsLatin1' fp = (>-> P.map (map T.decodeLatin1)) . dsvRowsByte fp

-- | Call 'error' indicating the problem with a separator intended for
-- use with the @hw-dsv@ library.
dsvSepErr :: DsvParserOptionsError -> a
dsvSepErr = error . ("DSV separator must be a single character: "++) . show

-- | Produce rows of UTF-8 encoded 'Text' values.
dsvRows :: MonadIO m => FilePath -> Separator -> Producer [Text] m ()
dsvRows fp = either dsvSepErr (dsvRows' fp) . separatorWord8

-- | Produce rows of Latin-1 (aka ISO-8859-1) encoded 'Text' values.
dsvRowsLatin1 :: MonadIO m => FilePath -> Separator -> Producer [Text] m ()
dsvRowsLatin1 fp = either dsvSepErr (dsvRowsLatin1' fp) . separatorWord8

-- | The ways in which an arbitrary 'Text' value may be unsuitable for
-- use as a separator for the @hw-dsv@ package.
data DsvParserOptionsError = SeparatorIsNull
                           | SeparatorCharIsMoreThanOneByte
                           | SeparatorIsMoreThanOneChar
  deriving (Eq, Show)

-- | The @Frames@ library supports column separators that can be
-- arbitrary 'Text' values, but the @hw-dsv@ library requires a single
-- byte be used to demarcate values. If the given 'Text' can be
-- losslessly represented as a single byte, we sue it, otherwise we
-- return an error indicating the problem.
separatorWord8 :: Separator -> Either DsvParserOptionsError Word8
separatorWord8 sep =
  case T.uncons sep of
    Nothing -> Left SeparatorIsNull
    Just (h, t)
      | T.null t -> let i = fromEnum h
                    in if i < 256
                       then Right (fromIntegral i)
                       else Left SeparatorCharIsMoreThanOneByte
      | otherwise -> Left SeparatorIsMoreThanOneChar

-- * Whole Table Reading

-- | Produce rows where any given entry can fail to parse.
readDsvTableMaybeOpt :: (MonadIO m, ReadRec rs, RMap rs)
                     => ParserOptions
                     -> FilePath
                     -> P.Producer (Rec (Maybe :. ElField) rs) m ()
readDsvTableMaybeOpt opts csvFile =
  dsvRows csvFile (columnSeparator opts)  >-> pipeTableMaybeOpt opts
{-# INLINE readDsvTableMaybeOpt #-}

-- | Returns a producer of rows for which each column was successfully
-- parsed.
readDsvTableOpt :: (MonadIO m, ReadRec rs, RMap rs)
                => ParserOptions -> FilePath -> P.Producer (Record rs) m ()
readDsvTableOpt opts csvFile = readDsvTableMaybeOpt opts csvFile P.>-> go
  where go = P.await >>= maybe go (\x -> P.yield x >> go) . recMaybe
{-# INLINE readDsvTableOpt #-}

-- | Returns a producer of rows for which each column was successfully
-- parsed.
readDsvTable :: (MonadIO m, ReadRec rs, RMap rs)
             => FilePath -> P.Producer (Record rs) m ()
readDsvTable = readDsvTableOpt defaultParser
{-# INLINE readDsvTable #-}

-- * Template Haskell

-- | Like 'tableType', but additionally generates a type synonym for
-- each column, and a proxy value of that type. If the CSV file has
-- column names \"foo\", \"bar\", and \"baz\", then this will declare
-- @type Foo = "foo" :-> Int@, for example, @foo = rlens \@Foo@, and
-- @foo' = rlens' \@Foo@.
dsvTableTypes :: String -> FilePath -> DecsQ
dsvTableTypes n fp =
  tableTypes' (rowGen fp) { rowTypeName = n
                          , lineReader = dsvRows fp }