----------------------------------------------------------------------------- -- | -- Module : Database.TxtSushi.FlatFile -- Copyright : (c) Keith Sheppard 2009-2010 -- License : BSD3 -- Maintainer : keithshep@gmail.com -- Stability : experimental -- Portability : portable -- -- Functions for reading/writing flat files -- ----------------------------------------------------------------------------- module Database.TxtSushi.FlatFile ( formatTableWithWidths, maxTableColumnWidths, formatTable, parseTable, Format(Format), csvFormat, tabDelimitedFormat, doubleQuote) where import Data.List import Data.Ord {- | 'Format' allows you to specify different flat-file formats so that you can use 'parseTable' for CSV, tab-delimited etc. -} data Format = Format { quote :: String, fieldDelimiter :: String, rowDelimiters :: [String]} deriving (Show) defaultRowDelimiter :: Format -> String defaultRowDelimiter = head . rowDelimiters csvFormat :: Format csvFormat = Format "\"" "," ["\n", "\r", "\n\r", "\r\n"] tabDelimitedFormat :: Format tabDelimitedFormat = Format "\"" "\t" ["\n", "\r", "\n\r", "\r\n"] {- | get a quote escape sequence for the given 'Format' -} doubleQuote :: Format -> String doubleQuote format = quote format ++ quote format formatTableWithWidths :: String -> [Int] -> [[String]] -> String formatTableWithWidths _ _ [] = [] formatTableWithWidths boundaryString widths (row:tableTail) = let (initCells, [lastCell]) = splitAt (length row - 1) row in concat (zipWith ensureWidth widths initCells) ++ lastCell ++ "\n" ++ formatTableWithWidths boundaryString widths tableTail where ensureWidth width field = let lengthField = length field in if width > lengthField then field ++ replicate (width - lengthField) ' ' ++ boundaryString else field ++ boundaryString {- | for a table, calculate the max width in characters for each column -} maxTableColumnWidths :: [[String]] -> [Int] maxTableColumnWidths [] = [] maxTableColumnWidths table = maxTableColumnWidthsInternal table [] maxTableColumnWidthsInternal :: [[String]] -> [Int] -> [Int] maxTableColumnWidthsInternal [] prevMaxValues = prevMaxValues maxTableColumnWidthsInternal (row:tableTail) prevMaxValues | seqList prevMaxValues = undefined | otherwise = maxTableColumnWidthsInternal tableTail (maxRowFieldWidths row prevMaxValues) -- this filthy little function is for making the list strict... otherwise -- we run out of memory seqList :: [a] -> Bool seqList [] = False seqList (x:xt) | x `seq` False = undefined | otherwise = seqList xt maxRowFieldWidths :: [String] -> [Int] -> [Int] maxRowFieldWidths row prevMaxValues = zipWithD max (map length row) prevMaxValues zipWithD :: (a -> a -> a) -> [a] -> [a] -> [a] zipWithD f (x:xt) (y:yt) = f x y : zipWithD f xt yt zipWithD _ [] ys = ys zipWithD _ xs [] = xs {- | Format the given table (the 2D String array) into a flat-file string using the given 'Format' -} formatTable :: Format -> [[String]] -> String formatTable _ [] = "" formatTable format (headRow:tableTail) = formatRow format headRow ++ defaultRowDelimiter format ++ formatTable format tableTail {- | Format the row into a flat file sub-string using the given 'Format' -} formatRow :: Format -> [String] -> String formatRow _ [] = [] formatRow format (headField:rowTail) = -- we need to escape any quotes let escapedField = encodeField format headField in -- use a field delimiter on all but the last field if null rowTail then escapedField else escapedField ++ fieldDelimiter format ++ formatRow format rowTail {- | encode the given text field if it contains any special formatting characters -} encodeField :: Format -> String -> String encodeField format field | quoteInField = let escapedField = replaceAll field (quote format) (doubleQuote format) in quote format ++ escapedField ++ quote format | delimiterInField = quote format ++ field ++ quote format | otherwise = field where quoteInField = quote format `isInfixOf` field delimiterInField = any (`isInfixOf` field) (rowDelimiters format) || (fieldDelimiter format `isInfixOf` field) {- replace all instances of 'targetSublist' found in 'list' with 'replacementList' -} replaceAll :: (Eq a) => [a] -> [a] -> [a] -> [a] replaceAll [] _ _ = [] replaceAll list@(listHead:listTail) targetSublist replacementList = if targetSublist `isPrefixOf` list then let remainingList = drop (length targetSublist) list in replacementList ++ replaceAll remainingList targetSublist replacementList else listHead : replaceAll listTail targetSublist replacementList {- | Parse the given text using the given flat file 'Format'. The result is a list of list of strings. The strings are fields and the string lists are rows -} parseTable :: Format -> String -> [[String]] parseTable format text = go text where -- sorting the delimiters from shortest to longest allows us to -- guarantee that we don't mistake a multi-char newline as two single -- char newlines. The code in parseUnquotedField works on this -- assumption newFormat = format { rowDelimiters = sortBy (comparing (negate . length)) (rowDelimiters format)} go "" = [] go txt = let (nextLine, remainingText) = parseLine newFormat txt in nextLine : go remainingText -- parse a row giving (rowFields, remainingText) parseLine :: Format -> String -> ([String], String) parseLine _ [] = ([], "") parseLine format text = let (nextField, moreFieldsInRow, textRemainingAfterField) = parseField format text in -- if there are more fields, recursively add them to the row if moreFieldsInRow then let (rowTail, remainingText) = parseLine format textRemainingAfterField in (nextField:rowTail, remainingText) -- if there are no more fields return the current fields as a singleton -- list else ([nextField], textRemainingAfterField) -- parse a field giving (field, moreFieldsInRow, remainingText) parseField :: Format -> String -> (String, Bool, String) parseField _ [] = ("", False, "") parseField format text = -- check if this field is quoted or not if quote format `isPrefixOf` text then let tailOfQuote = drop (length (quote format)) text in parseQuotedField format tailOfQuote else parseUnquotedField format text -- parse a quoted field giving (field, moreFieldsInRow, remainingText) parseQuotedField :: Format -> String -> (String, Bool, String) parseQuotedField _ [] = ("", False, "") parseQuotedField format text@(textHead : textTail) -- a double quote is an escaped quote, so add a quote to the field | doubleQuote format `isPrefixOf` text = let tailOfDoubleQuote = drop (length (doubleQuote format)) text (fieldTail, moreFieldsInRow, remainingText) = parseQuotedField format tailOfDoubleQuote in (quote format ++ fieldTail, moreFieldsInRow, remainingText) -- a single quote is the end of the field, we can use parseUnquotedField to -- chew up any chars between the ending quote and the next delimiter (there -- really shouldn't be any if the text is formatted well, but you never -- know) | quote format `isPrefixOf` text = let tailOfQuote = drop (length (quote format)) text (_, moreFieldsInRow, remainingText) = parseUnquotedField format tailOfQuote in ("", moreFieldsInRow, remainingText) -- just another character... toss it in the field and keep going | otherwise = let (fieldTail, moreFieldsInRow, remainingText) = parseQuotedField format textTail in (textHead : fieldTail, moreFieldsInRow, remainingText) -- parse an unquoted field giving (field, moreFieldsInRow, remainingText) parseUnquotedField :: Format -> String -> (String, Bool, String) parseUnquotedField _ [] = ("", False, "") parseUnquotedField format text@(textHead:textTail) = -- if we hit a field delimiter: return an empty string and let caller know -- there are more fields in this row if fieldDelimiter format `isPrefixOf` text then let tailOfDelimiter = drop (length (fieldDelimiter format)) text in ([], True, tailOfDelimiter) else case findIndex (`isPrefixOf` text) (rowDelimiters format) of Nothing -> -- just another character... toss it in the field and keep going let (fieldTail, moreFieldsInRow, remainingText) = parseUnquotedField format textTail in (textHead:fieldTail, moreFieldsInRow, remainingText) Just delimIndex -> -- if we hit a row delimiter: return an empty string and let caller know there -- are no more fields in this row let tailOfDelimiter = drop (length (rowDelimiters format !! delimIndex)) text in ([], False, tailOfDelimiter)