{- | The 'FlatFile' module is for reading misc. 'FlatFile' formats like CSV or tab delimited -} module Database.TxtSushi.FlatFile ( formatTableWithWidths, maxTableColumnWidths, formatTable, parseTable, Format(Format), csvFormat, tabDelimitedFormat, doubleQuote) where import Data.Function import Data.List {- | 'Format' allows you to specify different flat-file formats so that you can use 'parseTable' for CSV, tab-delimited etc. -} data Format = Format { quote :: String, fieldDelimiter :: String, rowDelimiters :: [String]} deriving (Show) defaultRowDelimiter :: Format -> String defaultRowDelimiter = head . rowDelimiters csvFormat :: Format csvFormat = Format "\"" "," ["\n", "\r", "\n\r", "\r\n"] tabDelimitedFormat :: Format tabDelimitedFormat = Format "\"" "\t" ["\n", "\r", "\n\r", "\r\n"] {- | get a quote escape sequence for the given 'Format' -} doubleQuote :: Format -> String doubleQuote format = (quote format) ++ (quote format) formatTableWithWidths :: String -> [Int] -> [[String]] -> String formatTableWithWidths _ _ [] = [] formatTableWithWidths boundaryString widths (row:tableTail) = let (initCells, [lastCell]) = splitAt (length row - 1) row in (concat $ zipWith ensureWidth widths initCells) ++ lastCell ++ "\n" ++ (formatTableWithWidths boundaryString widths tableTail) where ensureWidth width field = let lengthField = length field in if width > lengthField then field ++ (replicate (width - lengthField) ' ') ++ boundaryString else field ++ boundaryString {- | for a table, calculate the max width in characters for each column -} maxTableColumnWidths :: [[String]] -> [Int] maxTableColumnWidths [] = [] maxTableColumnWidths table = maxTableColumnWidthsInternal table [] maxTableColumnWidthsInternal :: [[String]] -> [Int] -> [Int] maxTableColumnWidthsInternal [] prevMaxValues = prevMaxValues maxTableColumnWidthsInternal (row:tableTail) prevMaxValues | seqList prevMaxValues = undefined | otherwise = maxTableColumnWidthsInternal tableTail (maxRowFieldWidths row prevMaxValues) -- this filthy little function is for making the list strict... otherwise -- we run out of memory seqList :: [a] -> Bool seqList [] = False seqList (x:xt) | x `seq` False = undefined | otherwise = seqList xt maxRowFieldWidths :: [String] -> [Int] -> [Int] maxRowFieldWidths row prevMaxValues = zipWithD max (map length row) prevMaxValues zipWithD :: (a -> a -> a) -> [a] -> [a] -> [a] zipWithD f (x:xt) (y:yt) = (f x y):(zipWithD f xt yt) zipWithD _ [] ys = ys zipWithD _ xs [] = xs {- | Format the given table (the 2D String array) into a flat-file string using the given 'Format' -} formatTable :: Format -> [[String]] -> String formatTable _ [] = "" formatTable format (headRow:tableTail) = (formatRow format headRow) ++ (defaultRowDelimiter format) ++ (formatTable format tableTail) {- | Format the row into a flat file sub-string using the given 'Format' -} formatRow :: Format -> [String] -> String formatRow _ [] = [] formatRow format (headField:rowTail) = -- we need to escape any quotes let escapedField = encodeField format headField in -- use a field delimiter on all but the last field if null rowTail then escapedField else escapedField ++ (fieldDelimiter format) ++ (formatRow format rowTail) {- | encode the given text field if it contains any special formatting characters -} encodeField :: Format -> String -> String encodeField format field = if (quote format) `isInfixOf` field then let escapedField = replaceAll field (quote format) (doubleQuote format) in (quote format) ++ escapedField ++ (quote format) else if any (`isInfixOf` field) (rowDelimiters format) || (fieldDelimiter format) `isInfixOf` field then (quote format) ++ field ++ (quote format) else field {- replace all instances of 'targetSublist' found in 'list' with 'replacementList' -} replaceAll :: (Eq a) => [a] -> [a] -> [a] -> [a] replaceAll [] _ _ = [] replaceAll list@(listHead:listTail) targetSublist replacementList = if targetSublist `isPrefixOf` list then let remainingList = drop (length targetSublist) list in replacementList ++ (replaceAll remainingList targetSublist replacementList) else listHead:(replaceAll listTail targetSublist replacementList) {- | Parse the given text using the given flat file 'Format'. The result is a list of list of strings. The strings are fields and the string lists are rows -} parseTable :: Format -> String -> [[String]] parseTable format text = go text where -- sorting the delimiters from shortest to longest allows us to -- guarantee that we don't mistake a multi-char newline as two single -- char newlines. The code in parseUnquotedField works on this -- assumption newFormat = format { rowDelimiters = sortBy (compare `on` negate . length) (rowDelimiters format)} go "" = [] go txt = let (nextLine, remainingText) = parseLine newFormat txt in nextLine : go remainingText -- parse a row giving (rowFields, remainingText) parseLine :: Format -> String -> ([String], String) parseLine _ [] = ([], "") parseLine format text = let (nextField, moreFieldsInRow, textRemainingAfterField) = parseField format text in -- if there are more fields, recursively add them to the row if moreFieldsInRow then let (rowTail, remainingText) = parseLine format textRemainingAfterField in (nextField:rowTail, remainingText) -- if there are no more fields return the current fields as a singleton -- list else ([nextField], textRemainingAfterField) -- parse a field giving (field, moreFieldsInRow, remainingText) parseField :: Format -> String -> (String, Bool, String) parseField _ [] = ("", False, "") parseField format text = -- check if this field is quoted or not if (quote format) `isPrefixOf` text then let tailOfQuote = drop (length (quote format)) text in parseQuotedField format tailOfQuote else parseUnquotedField format text -- parse a quoted field giving (field, moreFieldsInRow, remainingText) parseQuotedField :: Format -> String -> (String, Bool, String) parseQuotedField _ [] = ("", False, "") parseQuotedField format text@(textHead:textTail) = -- a double quote is an escaped quote, so add a quote to the field if (doubleQuote format) `isPrefixOf` text then let tailOfDoubleQuote = drop (length (doubleQuote format)) text (fieldTail, moreFieldsInRow, remainingText) = parseQuotedField format tailOfDoubleQuote in ((quote format) ++ fieldTail, moreFieldsInRow, remainingText) -- a single quote is the end of the field, we can use parseUnquotedField to -- chew up any chars between the ending quote and the next delimiter (there -- really shouldn't be any if the text is formatted well, but you never -- know) else if (quote format) `isPrefixOf` text then let tailOfQuote = drop (length (quote format)) text (_, moreFieldsInRow, remainingText) = parseUnquotedField format tailOfQuote in ("", moreFieldsInRow, remainingText) -- just another character... toss it in the field and keep going else let (fieldTail, moreFieldsInRow, remainingText) = parseQuotedField format textTail in (textHead:fieldTail, moreFieldsInRow, remainingText) -- parse an unquoted field giving (field, moreFieldsInRow, remainingText) parseUnquotedField :: Format -> String -> (String, Bool, String) parseUnquotedField _ [] = ("", False, "") parseUnquotedField format text@(textHead:textTail) = -- if we hit a field delimiter: return an empty string and let caller know -- there are more fields in this row if (fieldDelimiter format) `isPrefixOf` text then let tailOfDelimiter = drop (length (fieldDelimiter format)) text in ([], True, tailOfDelimiter) else case findIndex (`isPrefixOf` text) (rowDelimiters format) of Nothing -> -- just another character... toss it in the field and keep going let (fieldTail, moreFieldsInRow, remainingText) = parseUnquotedField format textTail in (textHead:fieldTail, moreFieldsInRow, remainingText) Just delimIndex -> -- if we hit a row delimiter: return an empty string and let caller know there -- are no more fields in this row let tailOfDelimiter = drop (length (rowDelimiters format !! delimIndex)) text in ([], False, tailOfDelimiter)