module Codec.Xlsx.Parser(
xlsx,
sheet,
cellSource,
sheetRowSource
) where
import Control.Applicative
import Control.Monad (join)
import Control.Monad.IO.Class()
import Data.Function (on)
import qualified Data.IntMap as M
import qualified Data.IntSet as S
import Data.List
import qualified Data.Map as Map
import Data.Maybe
import Data.Ord
import Prelude hiding (sequence)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Read as T
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Lazy.Char8()
import qualified Codec.Archive.Zip as Zip
import Data.Conduit
import qualified Data.Conduit.List as CL
import Data.XML.Types
import System.FilePath
import Text.XML as X
import Text.XML.Cursor
import qualified Text.XML.Stream.Parse as Xml
import Codec.Xlsx
type MapRow = Map.Map Text Text
xlsx :: FilePath -> IO Xlsx
xlsx fname = do
ar <- Zip.toArchive <$> L.readFile fname
ss <- getSharedStrings ar
st <- getStyles ar
ws <- getWorksheetFiles ar
return $ Xlsx ar ss st ws
cellSource :: MonadThrow m => Xlsx -> Int -> [Text] -> Source m [Cell]
cellSource x sheetN cols = getSheetCells x sheetN
$= filterColumns (S.fromList $ map col2int cols)
$= groupRows
$= reverseRows
decimal :: Monad m => Text -> m Int
decimal t = case T.decimal t of
Right (d, _) -> return d
_ -> fail "invalid decimal"
rational :: Monad m => Text -> m Double
rational t = case T.rational t of
Right (r, _) -> return r
_ -> fail "invalid rational"
sheet :: MonadThrow m => Xlsx -> Int -> m Worksheet
sheet Xlsx{xlArchive=ar, xlSharedStrings=ss, xlWorksheetFiles=sheets} sheetN
| sheetN < 0 || sheetN >= length sheets
= fail "parseSheet: Invalid sheet number"
| otherwise
= collect parse
where
filename = wfPath $ sheets !! sheetN
sName = wfName $ sheets !! sheetN
file = fromJust $ Zip.fromEntry <$> Zip.findEntryByPath filename ar
doc = case parseLBS def file of
Left _ -> error "could not read file"
Right d -> d
tc :: Cursor
tc = fromDocument doc
parse = (tc $/ parseColumns, tc $/ parseRows)
parseColumns :: Cursor -> [ColumnsWidth]
parseColumns = element (n"cols") &/ element (n"col") >=> parseColumn
parseColumn :: Cursor -> [ColumnsWidth]
parseColumn c = do
min <- c $| attribute "min" >=> decimal
max <- c $| attribute "max" >=> decimal
width <- c $| attribute "width" >=> rational
return $ ColumnsWidth min max width
parseRows :: Cursor -> [(Int, Maybe Double, [(Int, Int, CellData)])]
parseRows = element (n"sheetData") &/ element (n"row") >=> parseRow
parseRow c = do
r <- c $| attribute "r" >=> decimal
let ht = if attribute "customHeight" c == ["true"]
then listToMaybe $ c $| attribute "ht" >=> rational
else Nothing
return (r, ht, c $/ element (n"c") >=> parseCell)
parseCell :: Cursor -> [(Int, Int, CellData)]
parseCell cell = do
(c, r) <- T.span (>'9') <$> (cell $| attribute "r")
return (col2int c, int r, CellData s d)
where
s = listToMaybe $ cell $| attribute "s" >=> decimal
t = fromMaybe "n" $ listToMaybe $ cell $| attribute "t"
d = listToMaybe $ cell $/ element (n"v") &/ content >=> extractValue
extractValue v = case t of
"n" ->
case T.rational v of
Right (d, _) -> [CellDouble d]
_ -> []
"s" ->
case T.decimal v of
Right (d, _) -> maybeToList $ fmap CellText $ M.lookup d ss
_ -> []
_ -> []
collect (cw, rd) = return $ Worksheet sName minX maxX minY maxY cw rowMap cellMap
where
(rowMap, (minX, maxX, minY, maxY, cellMap)) = foldr collectRow rInit rd
rInit = (Map.empty, (maxBound, minBound, maxBound, minBound, Map.empty))
collectRow (_, Nothing, cells) (rowMap, cellData) =
(rowMap, foldr collectCell cellData cells)
collectRow (n, Just h, cells) (rowMap, cellData) =
(Map.insert n h rowMap, foldr collectCell cellData cells)
collectCell (x, y, cd) (minX, maxX, minY, maxY, cellMap) =
(min minX x, max maxX x, min minY y, max maxY y, Map.insert (x,y) cd cellMap)
sheetRowSource :: MonadThrow m => Xlsx -> Int -> Source m MapRow
sheetRowSource x sheetN
= getSheetCells x sheetN
$= groupRows
$= reverseRows
$= mkMapRows
mkMapRows :: Monad m => Conduit [Cell] m MapRow
mkMapRows = sequence mkMapRowsSink =$= CL.concatMap id
mkMapRowsSink :: Monad m => Sink [Cell] m [MapRow]
mkMapRowsSink = do
header <- fromMaybe [] <$> CL.head
rows <- CL.consume
return $ map (mkMapRow header) rows
where
mkMapRow header row = Map.fromList $ zipCells header row
zipCells :: [Cell] -> [Cell] -> [(Text, Text)]
zipCells [] _ = []
zipCells header [] = map (\h -> (txt h, "")) header
zipCells header@(h:hs) row@(r:rs) =
case comparing (fst . cellIx) h r of
LT -> (txt h , "" ) : zipCells hs row
EQ -> (txt h , txt r) : zipCells hs rs
GT -> ("" , txt r) : zipCells header rs
txt = fromMaybe "" . cv
cv Cell{cellData=CellData{cdValue=Just(CellText t)}} = Just t
cv _ = Nothing
reverseRows :: Monad m => Conduit [a] m [a]
reverseRows = CL.map reverse
groupRows = CL.groupBy ((==) `on` (snd.cellIx))
filterColumns cs = CL.filter ((`S.member` cs) . col2int . fst . cellIx)
getSheetCells
:: MonadThrow m => Xlsx -> Int -> Source m Cell
getSheetCells (Xlsx{xlArchive=ar, xlSharedStrings=ss, xlWorksheetFiles=sheets}) sheetN
| sheetN < 0 || sheetN >= length sheets
= error "parseSheet: Invalid sheet number"
| otherwise
= case xmlSource ar (wfPath $ sheets !! sheetN) of
Nothing -> error "An impossible happened"
Just xml -> xml $= mkXmlCond (getCell ss)
getCell
:: MonadThrow m => M.IntMap Text -> Sink Event m (Maybe Cell)
getCell ss = Xml.tagName (n"c") cAttrs cParser
where
cAttrs = do
cellIx <- Xml.requireAttr "r"
style <- Xml.optionalAttr "s"
typ <- Xml.optionalAttr "t"
Xml.ignoreAttrs
return (cellIx,style,typ)
maybeCellDouble Nothing = Nothing
maybeCellDouble (Just t) = either (const Nothing) (\(d,_) -> Just (CellDouble d)) $ T.rational t
cParser (ix,style,typ) = do
val <- case typ of
Just "inlineStr" -> liftA (fmap CellText) (tagSeq ["is", "t"])
Just "s" -> liftA (fmap CellText) (tagSeq ["v"] >>=
return . join . fmap ((`M.lookup` ss).int))
Just "n" -> liftA maybeCellDouble $ tagSeq ["v"]
_ -> liftA maybeCellDouble $ tagSeq ["v"]
return $ Cell (mkCellIx ix) $ CellData (int <$> style) val
mkCellIx ix = let (c,r) = T.span (>'9') ix
in (c,int r)
n x = Name
{nameLocalName = x
,nameNamespace = Just "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
,namePrefix = Nothing}
odr x = Name
{nameLocalName = x
,nameNamespace = Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
,namePrefix = Nothing}
pr x = Name
{nameLocalName = x
,nameNamespace = Just "http://schemas.openxmlformats.org/package/2006/relationships"
,namePrefix = Nothing}
tagSeq :: MonadThrow m => [Text] -> Sink Event m (Maybe Text)
tagSeq (x:xs)
= Xml.tagNoAttr (n x)
$ foldr (\x -> Xml.force "" . Xml.tagNoAttr (n x)) Xml.content xs
tagSeq _ = error "no tags in tag sequence"
xmlSource
:: MonadThrow m => Zip.Archive -> FilePath -> Maybe (Source m Event)
xmlSource ar fname
= Xml.parseLBS Xml.def
. Zip.fromEntry
<$> Zip.findEntryByPath fname ar
getSharedStrings
:: (MonadThrow m, Functor m)
=> Zip.Archive -> m (M.IntMap Text)
getSharedStrings x
= case xmlSource x "xl/sharedStrings.xml" of
Nothing -> return M.empty
Just xml -> (M.fromAscList . zip [0..]) <$> getText xml
getText xml = xml $= mkXmlCond Xml.contentMaybe $$ CL.consume
getStyles :: (MonadThrow m, Functor m) => Zip.Archive -> m Styles
getStyles ar = case Zip.fromEntry <$> Zip.findEntryByPath "xl/styles.xml" ar of
Nothing -> return (Styles L.empty)
Just xml -> return (Styles xml)
getWorksheetFiles :: (MonadThrow m, Functor m) => Zip.Archive -> m [WorksheetFile]
getWorksheetFiles ar = case xmlSource ar "xl/workbook.xml" of
Nothing ->
error "invalid workbook"
Just xml -> do
sheetData <- xml $= mkXmlCond getSheetData $$ CL.consume
wbRels <- getWbRels ar
return [WorksheetFile n ("xl" </> T.unpack (fromJust $ lookup rId wbRels)) | (n, rId) <- sheetData]
getSheetData = Xml.tagName (n"sheet") attrs return
where
attrs = do
name <- Xml.requireAttr "name"
rId <- Xml.requireAttr (odr "id")
Xml.ignoreAttrs
return (name, rId)
getWbRels :: (MonadThrow m, Functor m) => Zip.Archive -> m [(Text, Text)]
getWbRels ar = case xmlSource ar "xl/_rels/workbook.xml.rels" of
Nothing -> return []
Just xml -> xml $$ parseWbRels
parseWbRels = Xml.force "relationships required" $
Xml.tagNoAttr (pr"Relationships") $
Xml.many $ Xml.tagName (pr"Relationship") attr return
where
attr = do
target <- Xml.requireAttr "Target"
id <- Xml.requireAttr "Id"
Xml.ignoreAttrs
return (id, target)
int :: Text -> Int
int = either error fst . T.decimal
mkXmlCond f = sequenceSink () $ const
$ CL.peek >>= maybe
(return Stop)
(\_ -> f >>= maybe
(CL.drop 1 >> return (Emit () []))
(return . Emit () . (:[])))