module Codec.Xlsx.Parser
( toXlsx
, toXlsxEither
, ParseError(..)
, Parser
) where
import qualified Codec.Archive.Zip as Zip
import Control.Applicative
import Control.Arrow (left)
import Control.Error.Safe (headErr)
import Control.Error.Util (note)
import Control.Lens hiding (element, views, (<.>))
import Control.Monad.Except (catchError, throwError)
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Lazy.Char8 ()
import Data.List
import qualified Data.Map as M
import Data.Maybe
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Read as T
import Data.Traversable
import GHC.Generics (Generic)
import Prelude hiding (sequence)
import System.FilePath.Posix
import Text.XML as X
import Text.XML.Cursor
import Codec.Xlsx.Parser.Internal
import Codec.Xlsx.Parser.Internal.PivotTable
import Codec.Xlsx.Types
import Codec.Xlsx.Types.Internal
import Codec.Xlsx.Types.Internal.CfPair
import Codec.Xlsx.Types.Internal.CommentTable as CommentTable
import Codec.Xlsx.Types.Internal.ContentTypes as ContentTypes
import Codec.Xlsx.Types.Internal.CustomProperties
as CustomProperties
import Codec.Xlsx.Types.Internal.DvPair
import Codec.Xlsx.Types.Internal.Relationships as Relationships
import Codec.Xlsx.Types.Internal.SharedStringTable
import Codec.Xlsx.Types.PivotTable.Internal
toXlsx :: L.ByteString -> Xlsx
toXlsx = either (error . show) id . toXlsxEither
data ParseError = InvalidZipArchive
| MissingFile FilePath
| InvalidFile FilePath
| InvalidRef FilePath RefId
| InconsistentXlsx Text
deriving (Eq, Show, Generic)
type Parser = Either ParseError
toXlsxEither :: L.ByteString -> Parser Xlsx
toXlsxEither bs = do
ar <- left (const InvalidZipArchive) $ Zip.toArchiveOrFail bs
sst <- getSharedStrings ar
contentTypes <- getContentTypes ar
(wfs, names, cacheSources) <- readWorkbook ar
sheets <- forM wfs $ \wf -> do
sheet <- extractSheet ar sst contentTypes cacheSources wf
return (wfName wf, sheet)
CustomProperties customPropMap <- getCustomProperties ar
return $ Xlsx sheets (getStyles ar) names customPropMap
data WorksheetFile = WorksheetFile { wfName :: Text
, wfPath :: FilePath
}
deriving (Show, Generic)
type Caches = [(CacheId, (Text, CellRef, [CacheField]))]
extractSheet :: Zip.Archive
-> SharedStringTable
-> ContentTypes
-> Caches
-> WorksheetFile
-> Parser Worksheet
extractSheet ar sst contentTypes caches wf = do
let filePath = wfPath wf
file <- note (MissingFile filePath) $ Zip.fromEntry <$> Zip.findEntryByPath filePath ar
cur <- fmap fromDocument . left (\_ -> InvalidFile filePath) $
parseLBS def file
sheetRels <- getRels ar filePath
let sheetViewList = cur $/ element (n_ "sheetViews") &/ element (n_ "sheetView") >=> fromCursor
sheetViews = case sheetViewList of
[] -> Nothing
views -> Just views
let commentsType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
commentTarget :: Maybe FilePath
commentTarget = relTarget <$> findRelByType commentsType sheetRels
legacyDrRId = cur $/ element (n_ "legacyDrawing") >=> fromAttribute (odr"id")
legacyDrPath = fmap relTarget . flip Relationships.lookup sheetRels =<< listToMaybe legacyDrRId
commentsMap :: Maybe CommentTable <- maybe (Right Nothing) (getComments ar legacyDrPath) commentTarget
let pageSetup = listToMaybe $ cur $/ element (n_ "pageSetup") >=> fromCursor
cws = cur $/ element (n_ "cols") &/ element (n_ "col") >=> fromCursor
(rowProps, cells0) = collect $ cur $/ element (n_ "sheetData") &/ element (n_ "row") >=> parseRow
parseRow :: Cursor -> [(Int, Maybe RowProperties, [(Int, Int, Cell)])]
parseRow c = do
r <- fromAttribute "r" c
let prop = RowProps
{ rowHeight = do h <- listToMaybe $ fromAttribute "ht" c
case fromAttribute "customHeight" c of
[True] -> return $ CustomHeight h
_ -> return $ AutomaticHeight h
, rowStyle = listToMaybe $ fromAttribute "s" c
, rowHidden =
case fromAttribute "hidden" c of
[] -> False
f:_ -> f
}
return ( r
, if prop == def then Nothing else Just prop
, c $/ element (n_ "c") >=> parseCell
)
parseCell :: Cursor -> [(Int, Int, Cell)]
parseCell cell = do
ref <- fromAttribute "r" cell
let
s = listToMaybe $ cell $| attribute "s" >=> decimal
t = fromMaybe "n" $ listToMaybe $ cell $| attribute "t"
d = listToMaybe $ cell $/ element (n_ "v") &/ content >=> extractCellValue sst t
f = listToMaybe $ cell $/ element (n_ "f") >=> fromCursor
(r, c) = fromSingleCellRefNoting ref
comment = commentsMap >>= lookupComment ref
return (r, c, Cell s d comment f)
collect = foldr collectRow (M.empty, M.empty)
collectRow (_, Nothing, rowCells) (rowMap, cellMap) =
(rowMap, foldr collectCell cellMap rowCells)
collectRow (r, Just h, rowCells) (rowMap, cellMap) =
(M.insert r h rowMap, foldr collectCell cellMap rowCells)
collectCell (x, y, cd) = M.insert (x,y) cd
commentCells =
M.fromList
[ (fromSingleCellRefNoting r, def {_cellComment = Just cmnt})
| (r, cmnt) <- maybe [] CommentTable.toList commentsMap
]
cells = cells0 `M.union` commentCells
mProtection = listToMaybe $ cur $/ element (n_ "sheetProtection") >=> fromCursor
mDrawingId = listToMaybe $ cur $/ element (n_ "drawing") >=> fromAttribute (odr"id")
merges = cur $/ parseMerges
parseMerges :: Cursor -> [Range]
parseMerges = element (n_ "mergeCells") &/ element (n_ "mergeCell") >=> fromAttribute "ref"
condFormtattings = M.fromList . map unCfPair $ cur $/ element (n_ "conditionalFormatting") >=> fromCursor
validations = M.fromList . map unDvPair $
cur $/ element (n_ "dataValidations") &/ element (n_ "dataValidation") >=> fromCursor
tableIds =
cur $/ element (n_ "tableParts") &/ element (n_ "tablePart") >=>
fromAttribute (odr "id")
let mAutoFilter = listToMaybe $ cur $/ element (n_ "autoFilter") >=> fromCursor
mDrawing <- case mDrawingId of
Just dId -> do
rel <- note (InvalidRef filePath dId) $ Relationships.lookup dId sheetRels
Just <$> getDrawing ar contentTypes (relTarget rel)
Nothing ->
return Nothing
let ptType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotTable"
pTables <- forM (allByType ptType sheetRels) $ \rel -> do
let ptPath = relTarget rel
bs <- note (MissingFile ptPath) $ Zip.fromEntry <$> Zip.findEntryByPath ptPath ar
note (InconsistentXlsx $ "Bad pivot table in " <> T.pack ptPath) $
parsePivotTable (flip Prelude.lookup caches) bs
tables <- forM tableIds $ \rId -> do
fp <- lookupRelPath filePath sheetRels rId
getTable ar fp
return $
Worksheet
cws
rowProps
cells
mDrawing
merges
sheetViews
pageSetup
condFormtattings
validations
pTables
mAutoFilter
tables
mProtection
extractCellValue :: SharedStringTable -> Text -> Text -> [CellValue]
extractCellValue sst "s" v =
case T.decimal v of
Right (d, _) ->
case sstItem sst d of
XlsxText txt -> [CellText txt]
XlsxRichText rich -> [CellRich rich]
_ ->
[]
extractCellValue _ "str" str = [CellText str]
extractCellValue _ "n" v =
case T.rational v of
Right (d, _) -> [CellDouble d]
_ -> []
extractCellValue _ "b" "1" = [CellBool True]
extractCellValue _ "b" "0" = [CellBool False]
extractCellValue _ _ _ = []
xmlCursorOptional :: Zip.Archive -> FilePath -> Parser (Maybe Cursor)
xmlCursorOptional ar fname =
(Just <$> xmlCursorRequired ar fname) `catchError` missingToNothing
where
missingToNothing :: ParseError -> Either ParseError (Maybe a)
missingToNothing (MissingFile _) = return Nothing
missingToNothing other = throwError other
xmlCursorRequired :: Zip.Archive -> FilePath -> Parser Cursor
xmlCursorRequired ar fname = do
entry <- note (MissingFile fname) $ Zip.findEntryByPath fname ar
cur <- left (\_ -> InvalidFile fname) $ parseLBS def (Zip.fromEntry entry)
return $ fromDocument cur
getSharedStrings :: Zip.Archive -> Parser SharedStringTable
getSharedStrings x = maybe sstEmpty (head . fromCursor) <$>
xmlCursorOptional x "xl/sharedStrings.xml"
getContentTypes :: Zip.Archive -> Parser ContentTypes
getContentTypes x = head . fromCursor <$> xmlCursorRequired x "[Content_Types].xml"
getStyles :: Zip.Archive -> Styles
getStyles ar = case Zip.fromEntry <$> Zip.findEntryByPath "xl/styles.xml" ar of
Nothing -> Styles L.empty
Just xml -> Styles xml
getComments :: Zip.Archive -> Maybe FilePath -> FilePath -> Parser (Maybe CommentTable)
getComments ar drp fp = do
mCurComments <- xmlCursorOptional ar fp
mCurDr <- maybe (return Nothing) (xmlCursorOptional ar) drp
return (liftA2 hide (hidden <$> mCurDr) . listToMaybe . fromCursor =<< mCurComments)
where
hide refs (CommentTable m) = CommentTable $ foldl' hideComment m refs
hideComment m r = M.adjust (\c->c{_commentVisible = False}) r m
v nm = Name nm (Just "urn:schemas-microsoft-com:vml") Nothing
x nm = Name nm (Just "urn:schemas-microsoft-com:office:excel") Nothing
hidden :: Cursor -> [CellRef]
hidden cur = cur $/ checkElement visibleShape &/
element (x"ClientData") >=> shapeCellRef
visibleShape Element{..} = elementName == (v"shape") &&
maybe False (any ("visibility:hidden"==) . T.split (==';')) (M.lookup "style" elementAttributes)
shapeCellRef :: Cursor -> [CellRef]
shapeCellRef c = do
r0 <- c $/ element (x"Row") &/ content >=> decimal
c0 <- c $/ element (x"Column") &/ content >=> decimal
return $ singleCellRef (r0 + 1, c0 + 1)
getCustomProperties :: Zip.Archive -> Parser CustomProperties
getCustomProperties ar = maybe CustomProperties.empty (head . fromCursor) <$> xmlCursorOptional ar "docProps/custom.xml"
getDrawing :: Zip.Archive -> ContentTypes -> FilePath -> Parser Drawing
getDrawing ar contentTypes fp = do
cur <- xmlCursorRequired ar fp
drawingRels <- getRels ar fp
unresolved <- headErr (InvalidFile fp) (fromCursor cur)
anchors <- forM (unresolved ^. xdrAnchors) $ resolveFileInfo drawingRels
return $ Drawing anchors
where
resolveFileInfo :: Relationships -> Anchor RefId RefId -> Parser (Anchor FileInfo ChartSpace)
resolveFileInfo rels uAnch =
case uAnch ^. anchObject of
Picture {..} -> do
let mRefId = _picBlipFill ^. bfpImageInfo
mFI <- lookupFI rels mRefId
let pic' =
Picture
{ _picMacro = _picMacro
, _picPublished = _picPublished
, _picNonVisual = _picNonVisual
, _picBlipFill = (_picBlipFill & bfpImageInfo .~ mFI)
, _picShapeProperties = _picShapeProperties
}
return uAnch {_anchObject = pic'}
Graphic nv rId tr -> do
chartPath <- lookupRelPath fp rels rId
chart <- readChart ar chartPath
return uAnch {_anchObject = Graphic nv chart tr}
lookupFI _ Nothing = return Nothing
lookupFI rels (Just rId) = do
path <- lookupRelPath fp rels rId
contentType <- note (InvalidFile path) $ ContentTypes.lookup ("/" <> path) contentTypes
contents <- Zip.fromEntry <$> note (MissingFile path) (Zip.findEntryByPath path ar)
return . Just $ FileInfo (stripMediaPrefix path) contentType contents
stripMediaPrefix :: FilePath -> FilePath
stripMediaPrefix p = fromMaybe p $ stripPrefix "xl/media/" p
readChart :: Zip.Archive -> FilePath -> Parser ChartSpace
readChart ar path = head . fromCursor <$> xmlCursorRequired ar path
readWorkbook :: Zip.Archive -> Parser ([WorksheetFile], DefinedNames, Caches)
readWorkbook ar = do
let wbPath = "xl/workbook.xml"
cur <- xmlCursorRequired ar wbPath
wbRels <- getRels ar wbPath
let mkDefinedName :: Cursor -> [(Text, Maybe Text, Text)]
mkDefinedName c =
return
( head $ attribute "name" c
, listToMaybe $ attribute "localSheetId" c
, T.concat $ c $/ content)
names =
cur $/ element (n_ "definedNames") &/ element (n_ "definedName") >=>
mkDefinedName
sheets <-
sequence $
cur $/ element (n_ "sheets") &/ element (n_ "sheet") >=>
liftA2 (worksheetFile wbPath wbRels) <$> attribute "name" <*>
fromAttribute (odr "id")
let cacheRefs =
cur $/ element (n_ "pivotCaches") &/ element (n_ "pivotCache") >=>
liftA2 (,) <$> fromAttribute "cacheId" <*> fromAttribute (odr "id")
caches <-
forM cacheRefs $ \(cacheId, rId) -> do
path <- lookupRelPath wbPath wbRels rId
bs <-
note (MissingFile path) $ Zip.fromEntry <$> Zip.findEntryByPath path ar
sources <-
note (InconsistentXlsx $ "Bad pivot table cache in " <> T.pack path) $
parseCache bs
return (cacheId, sources)
return (sheets, DefinedNames names, caches)
getTable :: Zip.Archive -> FilePath -> Parser Table
getTable ar fp = do
cur <- xmlCursorRequired ar fp
headErr (InvalidFile fp) (fromCursor cur)
worksheetFile :: FilePath -> Relationships -> Text -> RefId -> Parser WorksheetFile
worksheetFile parentPath wbRels name rId =
WorksheetFile name <$> lookupRelPath parentPath wbRels rId
getRels :: Zip.Archive -> FilePath -> Parser Relationships
getRels ar fp = do
let (dir, file) = splitFileName fp
relsPath = dir </> "_rels" </> file <.> "rels"
c <- xmlCursorOptional ar relsPath
return $ maybe Relationships.empty (setTargetsFrom fp . head . fromCursor) c
lookupRelPath :: FilePath
-> Relationships
-> RefId
-> Either ParseError FilePath
lookupRelPath fp rels rId =
relTarget <$> note (InvalidRef fp rId) (Relationships.lookup rId rels)