------------------------------------------------------------------------------------------- -- Input/output of delimiter separated strings ------------------------------------------------------------------------------------------- {-| Module : Text.DelimiterSeparated Description : Library for manipulating delimiter separated records. Copyright : (c) Atze Dijkstra, 2015 License : BSD3 Maintainer : atze@uu.nl Stability : experimental Portability : POSIX The library provides parsing/unparsing of 'Records' as well as interpreting those records to a datatype of your choice, via 'toRecords' and 'fromRecords' using class 'DelimSepRecord', where each individual field can be interpreted via class 'DelimSepField'. The following example demonstrates the basic parsing/unparsing: > module Main where > > import Text.DelimiterSeparated > import System.IO > import Control.Monad > > main = do > txt <- readFile "data.csv" > putStrLn txt > case recordsFromDelimiterSeparated csv txt of > Left es -> forM_ es putStrLn > Right recs -> do > putStrLn $ show recs > writeFile "data-out-csv.csv" $ recordsToDelimiterSeparated csv recs > writeFile "data-out-csv.tsv" $ recordsToDelimiterSeparated tsv recs > > txt <- readFile "data.tsv" > -- putStrLn txt > case recordsFromDelimiterSeparated tsv txt of > Left es -> forM_ es putStrLn > Right recs -> do > putStrLn $ show recs > writeFile "data-out-tsv.csv" $ recordsToDelimiterSeparated csv recs -} module Text.DelimiterSeparated ( -- * Types Field(..) , Record , Records , DelimiterStyle(..) , csv, tsv -- * Construction , emptyField -- * Encoding, decoding , recordsToDelimiterSeparated , recordsFromDelimiterSeparated -- * Checks, fixes , Check(..) , checkAll , recordsCheck , Fix(..) , recordsFix -- * Construction , recordsFromHeaderStr , recordsAddHeaderStr -- * Manipulation , recordsPartitionRows , recordsPartitionColsBasedOnHeader , recordsSpan , recordsSplitHeader -- * Conversion , recordsToStrings , recordsFromStrings -- * Overloaded conversion , DelimSepField(..) , DelimSepRecord(..) , toRecords , toRecordsWithHeader , toRecordsWithHeaderStr , fromRecords ) where ------------------------------------------------------------------------------------------- import UU.Parsing import UU.Parsing.CharParser import UHC.Util.ParseUtils import UHC.Util.Utils import Data.List ------------------------------------------------------------------------------------------- ------------------------------------------------------------------------------------------- -- Types ------------------------------------------------------------------------------------------- -- | Field data Field = Field { fldStr :: String -- base case } deriving (Eq) -- | Empty field emptyField :: Field emptyField = Field "" instance Show Field where show = showField tsv -- | Show field, depending on delimiter style showField :: DelimiterStyle -> Field -> String showField (CSV {}) (Field s) = "\"" ++ s ++ "\"" showField (TSV {}) (Field s) = s -- | Record is sequence of fields (representation may change in future) type Record = [Field] -- | Records is sequence of records (representation may change in future) type Records = [Record] -- | Style of delimitation data DelimiterStyle = CSV { styleFieldDelimChars :: [Char] -- field delimiters, first one used when unparsing , styleRecordDelimChars :: [Char] -- record delimiters, first one used when unparsing } | TSV -- | Predefined delimiter style for comma field separated, newline/return record separated csv :: DelimiterStyle csv = CSV "," "\n\r" -- | Predefined delimiter style for tab field separated, newline/return record separated tsv :: DelimiterStyle tsv = TSV ------------------------------------------------------------------------------------------- -- Parsing, encoding, decoding ------------------------------------------------------------------------------------------- type P p = PlainParser Char p -- | Parsing of records, given a delimiterstyle pRecords :: DelimiterStyle -> P Records pRecords style = concat <$> pList1Sep_ng pNL pRecordsNonEmpty -- <* (pNL `opt` '?') where pField = case style of TSV -> Field <$> pList_ng (pExcept (minBound, maxBound, '?') "\t\n\r") CSV {styleFieldDelimChars=fs, styleRecordDelimChars=ls} -> Field <$> ( pDQ *> pList_ng pQChar <* pDQ <|> pList_ng pChar ) where pQChar = pExcept (minBound, maxBound, '?') "\"" <|> pDQ <* pDQ pChar = pExcept (minBound, maxBound, '?') ("\"" ++ fs ++ ls) pRecord = pList1Sep_ng pSep pField pRecordsNonEmpty = chk <$> pRecord where chk [] = [] chk [Field ""] = [] chk fs = [fs] pNL = case style of TSV -> pAnySym "\n\r" CSV {styleRecordDelimChars=ls} -> pAnySym ls pDQ = pSym '"' pSep = case style of TSV -> pAnySym "\t" CSV {styleFieldDelimChars=fs} -> pAnySym fs -- | Encode internal representation in external delimiter separated format recordsToDelimiterSeparated :: DelimiterStyle -> Records -> String recordsToDelimiterSeparated style recs = (mk ls $ map (mk fs . map (showField style)) recs) ++ "\n" where (fs,ls) = case style of TSV -> ('\t', '\n') CSV {styleFieldDelimChars=(fs:_), styleRecordDelimChars=(ls:_)} -> (fs,ls) mk sep = concat . intersperse [sep] -- | Decode internal representation from external delimiter separated format, possible failing with error messages recordsFromDelimiterSeparated :: DelimiterStyle -> String -> Either [String] Records recordsFromDelimiterSeparated style str | null errs = Right res | otherwise = Left $ map show errs where (res,errs) = parseToResMsgs (pRecords style) str ------------------------------------------------------------------------------------------- -- Construction ------------------------------------------------------------------------------------------- -- | Convert a String representation of a header to actual record recordsFromHeaderStr :: String -> Records recordsFromHeaderStr s = recordsFromStrings [words s] -- | Add a header described by string holding whitespaced separated labels recordsAddHeaderStr :: String -> Records -> Records recordsAddHeaderStr s = (recordsFromHeaderStr s ++) ------------------------------------------------------------------------------------------- -- Manipulation ------------------------------------------------------------------------------------------- -- | Lift a predicate to a Record liftRecPred :: ([String] -> Bool) -> (Record -> Bool) liftRecPred pred = pred . map fldStr -- | Do something with records, taking into account header recordsDoHeader :: Bool -- ^ first rec is header? -> (Records -> Records -> res) -- ^ do it, given header (if any) and records -> Records -> res recordsDoHeader fstIsHdr mk recs | fstIsHdr = mk [hd] tl | otherwise = mk [] recs where (hd:tl) = recs -- | Partition record rows. -- Fst of tuple holds the possible header, if indicated it is present. -- Snd of tuple holds records failing the predicate. -- Assumes >0 records recordsPartitionRows :: Bool -> ([String] -> Bool) -> Records -> (Records, Records) recordsPartitionRows fstIsHdr pred recs = recordsDoHeader fstIsHdr mk recs where mk hd rs = (hd++y,n) where (y,n) = partition (liftRecPred pred) rs -- | Partition record columns, fst of tuple holds the obligatory header upon wich partitioning is done -- Assumes header and records all have same nr of fields recordsPartitionColsBasedOnHeader :: (String -> Bool) -> Records -> (Records, Records) recordsPartitionColsBasedOnHeader pred hr@(hdr:_) = let spl = split hdr in unzip $ map spl hr where split (h:t) | pred (fldStr h) = let tspl = split t in \(rh:rt) -> let (r1,r2) = tspl rt in (rh:r1, r2) | otherwise = let tspl = split t in \(rh:rt) -> let (r1,r2) = tspl rt in ( r1, rh:r2) split [] = \[] -> ( [], []) -- | Partition records, fst of tuple holds the possible header, if indicated it is present. -- Assumes >0 records recordsSpan :: Bool -> ([String] -> Bool) -> Records -> (Records, Records) recordsSpan fstIsHdr pred recs = recordsDoHeader fstIsHdr mk recs where mk hd rs = (hd++y,n) where (y,n) = span (liftRecPred pred) rs -- | Split of header, assuming there is one recordsSplitHeader :: Records -> (Record, Records) recordsSplitHeader (h:t) = (h, t) ------------------------------------------------------------------------------------------- -- Conversion ------------------------------------------------------------------------------------------- -- | Get all fields as strings recordsToStrings :: Records -> [[String]] recordsToStrings = map (map fldStr) -- | Lift strings as Records recordsFromStrings :: [[String]] -> Records recordsFromStrings = map (map Field) ------------------------------------------------------------------------------------------- -- Check(s) & fixes ------------------------------------------------------------------------------------------- -- | Which checks are to be done by 'recordsCheck' data Check = Check_DupHdrNms -- ^ check for duplicate header names | Check_EqualSizedRecs -- ^ check for equal sized records (ignoring possible header) | Check_AtLeast1Rec -- ^ check for at least 1 record (ignoring possible header) | Check_EqualSizedHdrRecs -- ^ check for equal sized header and records | Check_NoRecsLargerThanHdr -- ^ check for records not larger than header | Check_NoRecsSmallerThanHdr -- ^ check for records not smaller than header deriving (Eq,Enum,Bounded) -- | All checks checkAll :: [Check] checkAll = [minBound .. maxBound] -- | Check records, possibly yielding errors recordsCheck :: Bool -> [Check] -> Records -> Maybe String recordsCheck fstIsHdr chks recs --- | null recs = Just $ "no records nor header" | (Check_AtLeast1Rec `elem` chks || checksHdrSizeAndRecs) && not (has1Rec recs) = Just $ "not at least 1 record" ++ (if fstIsHdr then " and header" else "") | Check_EqualSizedRecs `elem` chks && length tlSzs > 1 = Just $ "records have varying sizes: " ++ show tlSzs | checksHdrSizeAndRecs && fstIsHdr && not (null cmp_tlSzs) = Just $ "header size=" ++ show hdLen ++ " and records sizes=" ++ show cmp_tlSzs ++ " differ" | otherwise = Nothing where has1Rec (_:_:_) | fstIsHdr = True has1Rec (_:_ ) | fstIsHdr = False | otherwise = True has1Rec _ = False ~(~[rhd],rtl) = recordsDoHeader fstIsHdr (,) recs dupnms = concat $ map head $ filter (\l -> length l > 1) $ groupSortOn id $ map fldStr rhd tlNrAndLen = zipWith (\i r -> (i, length r)) [1::Int ..] rtl hdLen = length rhd tlSzs@(~(hd_tlSzs@(~(hd_tlSz,_)):_)) = [ (l, map fst nl) | nl@((_,l):_) <- groupSortOn snd tlNrAndLen ] checksGT = Check_NoRecsLargerThanHdr `elem` chks checksLT = Check_NoRecsSmallerThanHdr `elem` chks checksHdrSizeAndRecs = checksGT || checksLT cmpSz | checksGT && checksLT = (/=) | checksGT = (>) | checksLT = (<) | otherwise = \_ _ -> False cmp_tlSzs = filter (\(sz,_) -> cmpSz sz hdLen) tlSzs -- | Which fixes are to be done by 'recordsCheck' data Fix = Fix_Pad -- ^ pad | Fix_PadToHdrLen -- ^ in combi with pad, pad to header len deriving (Eq,Enum,Bounded) -- | Fix sizes of records by padding to max size recordsFix :: Bool -> [Fix] -> Records -> Records recordsFix fstIsHdr fxs recs = map fix recs where ~(~[rhd],rtl) = recordsDoHeader fstIsHdr (,) recs maxl | Fix_PadToHdrLen `elem` fxs && fstIsHdr = length rhd | otherwise = maximum $ map length rtl fix | Fix_Pad `elem` fxs = \r -> r ++ take (maxl - length r) p | otherwise = id where p = repeat emptyField ------------------------------------------------------------------------------------------- -- Additional conversion/interpretation of field, i.e. show/read (why not use it like that?) ------------------------------------------------------------------------------------------- -- | Conversion to/from Field, i.e. kinda show/read class DelimSepField x where toDelimSepField :: x -> Field fromDelimSepField :: Field -> x instance DelimSepField Field where toDelimSepField = id fromDelimSepField = id instance {-# OVERLAPPABLE #-} DelimSepField x => DelimSepField [x] where toDelimSepField = toDelimSepField . unwords . map (fromDelimSepField . toDelimSepField) fromDelimSepField = map (fromDelimSepField . toDelimSepField) . words . fromDelimSepField instance DelimSepField String where toDelimSepField = Field fromDelimSepField = fldStr instance DelimSepField Integer where toDelimSepField = Field . show fromDelimSepField (Field x) = read x instance DelimSepField Int where toDelimSepField = Field . show fromDelimSepField (Field x) = read x instance DelimSepField Double where toDelimSepField = Field . show fromDelimSepField (Field x) = read x -- | Conversion to/from Record class DelimSepRecord x where toDelimSepRecord :: x -> Record fromDelimSepRecord :: Record -> x instance {-# OVERLAPPABLE #-} DelimSepRecord Record where toDelimSepRecord = id fromDelimSepRecord = id -- | Convert to records toRecords :: DelimSepRecord x => [x] -> Records toRecords = map toDelimSepRecord -- | Convert to records, with a header described by string holding whitespaced separated labels toRecordsWithHeader :: DelimSepRecord x => [Record] -> [x] -> Records toRecordsWithHeader h = (h++) . toRecords -- | Convert to records, with a header described by string holding whitespaced separated labels toRecordsWithHeaderStr :: DelimSepRecord x => String -> [x] -> Records toRecordsWithHeaderStr s = toRecordsWithHeader (recordsFromHeaderStr s) -- | Convert from records fromRecords :: DelimSepRecord x => Records -> [x] fromRecords = map fromDelimSepRecord