module Siphon.Internal where
import Siphon.Types
import Data.ByteString.Builder (toLazyByteString,byteString)
import qualified Data.ByteString.Char8 as BC8
import Control.Applicative (optional)
import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string)
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.Lazy as AL
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.Vector as V
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as LByteString
import qualified Data.ByteString.Builder as Builder
import qualified Data.Text as T
import Data.Word (Word8)
import Data.Vector (Vector)
import Data.ByteString (ByteString)
import Data.Coerce (coerce)
import Siphon.Types
import Control.Applicative
import Data.Monoid
byteStringChar8 :: Siphon ByteString
byteStringChar8 = Siphon
escape
encodeRow
(A.parse (row comma))
B.null
encodeRow :: Vector (Escaped ByteString) -> ByteString
encodeRow = id
. flip B.append (B.singleton newline)
. B.intercalate (B.singleton comma)
. V.toList
. coerce
escape :: ByteString -> Escaped ByteString
escape t = case B.find (\c -> c == newline || c == cr || c == comma || c == doubleQuote) t of
Nothing -> Escaped t
Just _ -> escapeAlways t
escapeAlways :: ByteString -> Escaped ByteString
escapeAlways t = Escaped $ LByteString.toStrict $ Builder.toLazyByteString $
Builder.word8 doubleQuote
<> B.foldl
(\ acc b -> acc <> if b == doubleQuote
then Builder.byteString
(B.pack [doubleQuote,doubleQuote])
else Builder.word8 b)
mempty
t
<> Builder.word8 doubleQuote
sepByDelim1' :: AL.Parser a
-> Word8
-> AL.Parser [a]
sepByDelim1' p !delim = liftM2' (:) p loop
where
loop = do
mb <- A.peekWord8
case mb of
Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop
_ -> pure []
sepByEndOfLine1' :: AL.Parser a
-> AL.Parser [a]
sepByEndOfLine1' p = liftM2' (:) p loop
where
loop = do
mb <- A.peekWord8
case mb of
Just b | b == cr ->
liftM2' (:) (A.anyWord8 *> A.word8 newline *> p) loop
| b == newline ->
liftM2' (:) (A.anyWord8 *> p) loop
_ -> pure []
row :: Word8
-> AL.Parser (Vector ByteString)
row !delim = rowNoNewline delim <* endOfLine
rowNoNewline :: Word8
-> AL.Parser (Vector ByteString)
rowNoNewline !delim = V.fromList <$!> field delim `sepByDelim1'` delim
removeBlankLines :: [Vector ByteString] -> [Vector ByteString]
removeBlankLines = filter (not . blankLine)
field :: Word8 -> AL.Parser ByteString
field !delim = do
mb <- A.peekWord8
case mb of
Just b | b == doubleQuote -> escapedField
_ -> unescapedField delim
escapedField :: AL.Parser S.ByteString
escapedField = do
_ <- dquote
s <- S.init <$> (A.scan False $ \s c -> if c == doubleQuote
then Just (not s)
else if s then Nothing
else Just False)
if doubleQuote `S.elem` s
then case Z.parse unescape s of
Right r -> return r
Left err -> fail err
else return s
unescapedField :: Word8 -> AL.Parser S.ByteString
unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote &&
c /= newline &&
c /= delim &&
c /= cr)
dquote :: AL.Parser Char
dquote = char '"'
unescape :: Z.Parser S.ByteString
unescape = (LByteString.toStrict . toLazyByteString) <$!> go mempty where
go acc = do
h <- Z.takeWhile (/= doubleQuote)
let rest = do
start <- Z.take 2
if (S.unsafeHead start == doubleQuote &&
S.unsafeIndex start 1 == doubleQuote)
then go (acc `mappend` byteString h `mappend` byteString (BC8.singleton '"'))
else fail "invalid CSV escape sequence"
done <- Z.atEnd
if done
then return (acc `mappend` byteString h)
else rest
(<$!>) :: Monad m => (a -> b) -> m a -> m b
f <$!> m = do
a <- m
return $! f a
infixl 4 <$!>
blankLine :: V.Vector B.ByteString -> Bool
blankLine v = V.length v == 1 && (B.null (V.head v))
liftM2' :: (Monad m) => (a -> b -> c) -> m a -> m b -> m c
liftM2' f a b = do
!x <- a
y <- b
return (f x y)
endOfLine :: A.Parser ()
endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ())
doubleQuote, newline, cr, comma :: Word8
doubleQuote = 34
newline = 10
cr = 13
comma = 44