-------------------------------------------------------------------- -- | -- Module : Text.CSV.ByteString -- Copyright : (c) Don Stewart 2008 -- License : BSD3 -- -- Maintainer: Don Stewart -- Stability : provisional -- Portability: -- -------------------------------------------------------------------- -- -- Parsing comma separated values format (CSV) efficiently using ByteStrings, -- -- RFC notes: -- -- * we don't attempt to process any header line, as that existence or -- not of a header depends on mime information, which isn't available. -- module Text.CSV.ByteString where import qualified Data.ByteString as S hiding (uncons,last) import qualified Data.ByteString.Unsafe as S import qualified Data.ByteString.Internal as S import qualified Data.ByteString.Char8 as S import Text.CSV.ByteString.Lex ------------------------------------------------------------------------ -- | A CSV file is a series of records. According to the RFC, the -- records all have to have the same length. As an extension, I allow -- variable length records. type CSV = [Record] -- | A record is a series of fields -- Each record is located on a separate line, delimited by a line break (CRLF). type Record = [Field] -- | A field is a strict ByteString. -- Within the header and each record, there may be one or more -- ields, separated by commas. Each line should contain the same number -- of fields throughout the file. Spaces are considered part of a field -- and should not be ignored. The last field in the record must not be -- followed by a comma. type Field = S.ByteString ------------------------------------------------------------------------ -- | Parse a ByteString into a CSV form. parseCSV :: S.ByteString -> Maybe CSV parseCSV s | S.null s = Nothing | otherwise = Just $! parseRecords toks [] where toks = lexCSV s parseRecords :: [CSVToken] -> CSV -> CSV parseRecords [] csv = reverse csv parseRecords xs csv = parseRecords (tail rest) (fields : csv) where (line, rest) = break (== Newline) xs fields = [ unquote s | Item s <- line ] -- todo, handle nesting. -- If double-quotes are used to enclose fields, then a -- double-quote appearing inside a field must be escaped by preceding -- it with another double quote. unquote s | S.null s = S.empty | c == '"' && S.last s == '"' = S.init cs -- TODO more quoting | otherwise = s where (c,cs) = (S.w2c (S.unsafeHead s), S.unsafeTail s)