Text/CSV/ByteString.hs

--------------------------------------------------------------------
-- |
-- Module    : Text.CSV.ByteString
-- Copyright : (c) Don Stewart 2008
-- License   : BSD3
--
-- Maintainer: Don Stewart <dons@galois.com>
-- Stability : provisional
-- Portability:
--
--------------------------------------------------------------------
--
-- Parsing comma separated values format (CSV) efficiently using ByteStrings,
--
-- RFC notes:
--
--  * we don't attempt to process any header line, as that existence or
--  not of a header depends on mime information, which isn't available.
--

module Text.CSV.ByteString where

import qualified Data.ByteString          as S hiding (uncons,last)
import qualified Data.ByteString.Unsafe   as S
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Char8    as S

import qualified Data.DList as D
import Text.CSV.ByteString.Lex

------------------------------------------------------------------------

-- | A CSV file is a series of records. According to the RFC, the
-- records all have to have the same length. As an extension, I allow
-- variable length records. 
type CSV    = [Record]

-- | A record is a series of fields
-- Each record is located on a separate line, delimited by a line break (CRLF).
type Record = [Field]

-- | A field is a strict ByteString.
-- Within the header and each record, there may be one or more
-- ields, separated by commas.  Each line should contain the same number
-- of fields throughout the file.  Spaces are considered part of a field
-- and should not be ignored.  The last field in the record must not be
-- followed by a comma.
type Field = S.ByteString

------------------------------------------------------------------------

-- | Parse a ByteString into a CSV form.
parseCSV :: S.ByteString -> Maybe CSV
parseCSV s
   | S.null s  = Nothing
   | otherwise = Just $! parseRecords toks D.empty
  where
   toks = lexCSV s

   parseRecords :: [CSVToken] -> D.DList Record -> CSV
   parseRecords [] csv = D.toList csv
   parseRecords xs csv = parseRecords (tail rest) (csv `D.snoc` fields)
      where
        (line, rest) = break (== Newline) xs
        fields       = [ unquote s | Item s <- line ]
        -- todo, handle nesting.

        -- If double-quotes are used to enclose fields, then a
        -- double-quote appearing inside a field must be escaped by preceding
        -- it with another double quote.
        unquote s
            | S.null s                    = S.empty
            | c == '"' && S.last s == '"' = S.init cs -- TODO more quoting
            | otherwise                   = s
            where
                (c,cs) = (S.w2c (S.unsafeHead s), S.unsafeTail s)