{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {- | Convenience utility to read Xlsx tabular cells. The majority of the @toTableRows*@ functions assume that the table of interest consiste of contiguous rows styled with borders lines surrounding all cells, with possible text above and below the table that is not of interest. Like so: @ Some documentation here.... --------------------------- | Header1 | Header2 | ... | --------------------------- | Value1 | Value2 | ... | --------------------------- | Value1 | Value2 | ... | --------------------------- Maybe some annoying text here, I don't care about. @ The heauristic used for table row selection in these functions is that any table rows will have a bottom border line. If the above heuristic is not valid for your table you can instead provide your own row selection predicate to the `toTableRowsCustom` function. For example, the predicate @\\_ _ -> True@ (or @(const . const) True@) will select all contiguous rows. -} module Codec.Xlsx.Util.Tabular ( -- * Types Tabular , TabularHead , TabularRow -- * Lenses -- ** Tabular , tabularHeads , tabularRows -- ** TabularHead , tabularHeadIx , tabularHeadLabel -- ** TabularRow , tabularRowIx , tabularRowCells -- * Functions , toTableRowsFromFile , toTableRows , toTableRows' -- * Custom row predicates , toTableRowsCustom ) where import Codec.Xlsx.Util.Tabular.Imports import qualified Data.ByteString.Lazy as ByteString type Row = (Int, Cols) type Rows = [(Int, Cols)] -- [Row] type Cols = [(Int, Cell)] type RowValues = [(Int, [(Int, Maybe CellValue)])] -- | A @RowPredicate@ is given the Xlsx "StyleSheet" as well as the -- row itself (consisting of the row's index and the row's cells) and -- should return @True@ if the row is part of the table and false -- otherwise. type RowPredicate = StyleSheet -> Row -> Bool -- |Read tabular rows from the first sheel of an Xlsx file. -- The table is assumed to consist of all contiguous rows -- that have bottom border lines, starting with the header. toTableRowsFromFile :: Int -- ^ Starting row index (header row) -> String -- ^ File name -> IO (Maybe Tabular) toTableRowsFromFile offset fname = flip toTableRows' offset . toXlsx <$> ByteString.readFile fname -- |Decode cells as tabular rows. -- The table is assumed to consist of all contiguous rows -- that have bottom border lines, starting with the header. toTableRows :: Xlsx -- ^ Xlsx Workbook -> Text -- ^ Worksheet name to decode -> Int -- ^ Starting row index (header row) -> Maybe Tabular toTableRows = toTableRowsCustom borderBottomPredicate -- |Decode cells from first sheet as tabular rows. -- The table is assumed to consist of all contiguous rows -- that have bottom border lines, starting with the header. toTableRows' :: Xlsx -- ^ Xlsx Workbook -> Int -- ^ Starting row index (header row) -> Maybe Tabular toTableRows' xlsx = toTableRows xlsx firstSheetName where firstSheetName = fst $ head $ xlsx ^. xlSheets -- ^ TODO: Is this still true with xlsx-0.3 or are sheets now -- in alphabetical order?? -- | Decode cells as tabular rows. -- The table is assumed to consist of all contiguous rows -- that fulfill the given predicate, starting with the header. -- -- The predicate function is given the Xlsx @StyleSheet@ as well -- as a row (consisting of the row's index and the row's cells) -- and should return @True@ if the row is part of the table. -- -- Since 0.1.1 toTableRowsCustom :: (StyleSheet -> (Int, [(Int, Cell)]) -> Bool) -- ^ Predicate for row selection -> Xlsx -- ^ Xlsx Workbook -> Text -- ^ Worksheet name to decode -> Int -- ^ Starting row index (header row) -> Maybe Tabular toTableRowsCustom predicate xlsx sheetName offset = do styles <- parseStyleSheet (xlsx ^. xlStyles) ^? _Right rows <- xlsx ^? ixSheet sheetName . wsCells . to toRows decodeRows (predicate styles) offset rows decodeRows p offset rs = if null rs' then Nothing else Just $ def & tabularHeads .~ header' & tabularRows .~ rows where rs' = getCells p offset rs header = head rs' ^. _2 header' = join $ map toText header toText (i, Just (CellText t)) = [def & tabularHeadIx .~ i & tabularHeadLabel .~ t] toText _ = [] ixs = map (view tabularHeadIx) header' rows = map rowValue (tail rs') rowValue (ix, row) = def & tabularRowIx .~ ix & tabularRowCells .~ insertMissingCells ixs row where -- Insert empty cells when there is a header but no corresponding -- cell. This can happen if cells have no content nor formatting -- defined. insertMissingCells :: [Int] -> [(Int, Maybe CellValue)] -> [Maybe CellValue] insertMissingCells ixs cs = map (join . flip lookup cs) ixs -- |Pickup cells that has value from line getCells :: (Row -> Bool) -- ^ Predicate -> Int -- ^ Start line number -> Rows -- ^ cell rows -> RowValues getCells p i = filter (any (isJust . snd) . snd) . (fmap . fmap) rowValues . takeContiguous i . takeWhile p . startAt i startAt :: Int -> Rows -> Rows startAt i = dropWhile ((< i) . fst) -- |Take contiguous rows that start from i takeContiguous :: Int -> Rows -> Rows --takeContiguous i rs = [r | (x, r@(y, _)) <- zip [i..] rs, x == y] takeContiguous i = map snd . filter (uncurry (==) . fmap fst) . zip [i..] rowValues = map (fmap _cellValue) -- Predicate for at least one cell having a bottom border style. -- |Take rows while all valued cell has bottom border line. -- | * no bottom border line means out of table. borderBottomPredicate :: RowPredicate -- StyleSheet -> Row -> Bool borderBottomPredicate ss = or . rowBordersHas borderBottom ss . snd rowBordersHas v ss = map (cellHasBorder v ss . snd) cellHasBorder v ss cell = fromMaybe False mb where mb = borderStyleHasLine v <$> cellBorder ss cell cellBorder :: StyleSheet -> Cell -> Maybe Border cellBorder ss cell = fmap xf (view cellStyle cell) >>= fmap bd . view cellXfBorderId where xf n = (ss ^. styleSheetCellXfs) !! n bd n = (ss ^. styleSheetBorders) !! n borderStyleHasLine v b = fromMaybe False value where value = view v b >>= fmap (/= LineStyleNone) . view borderStyleLine