{-# LANGUAGE NoMonomorphismRestriction #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} -- | This module provides a function for reading .xlsx files module Codec.Xlsx.Parser ( toXlsx ) where import qualified Codec.Archive.Zip as Zip import Control.Applicative import Control.Arrow ((&&&)) import Control.Monad.IO.Class () import qualified Data.ByteString.Lazy as L import Data.ByteString.Lazy.Char8 () import Data.List import qualified Data.Map as M import Data.Maybe import Data.Ord import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Read as T import Prelude hiding (sequence) import Safe import System.FilePath.Posix import Text.XML as X import Text.XML.Cursor import Codec.Xlsx.Parser.Internal import Codec.Xlsx.Types import Codec.Xlsx.Types.Internal import Codec.Xlsx.Types.Internal.Relationships as Relationships import Codec.Xlsx.Types.Internal.SharedStringTable import Codec.Xlsx.Types.Internal.CustomProperties import Codec.Xlsx.Types.Internal.CommentTable import Codec.Xlsx.Types.Internal.CustomProperties as CustomProperties -- | Reads `Xlsx' from raw data (lazy bytestring) toXlsx :: L.ByteString -> Xlsx toXlsx bs = Xlsx sheets styles names customPropMap where ar = Zip.toArchive bs sst = getSharedStrings ar styles = getStyles ar (wfs, names) = readWorkbook ar sheets = M.fromList $ map (wfName &&& extractSheet ar sst) wfs CustomProperties customPropMap = getCustomProperties ar data WorksheetFile = WorksheetFile { wfName :: Text , wfPath :: FilePath } deriving Show extractSheet :: Zip.Archive -> SharedStringTable -> WorksheetFile -> Worksheet extractSheet ar sst wf = Worksheet cws rowProps cells merges sheetViews pageSetup where file = fromJust $ Zip.fromEntry <$> Zip.findEntryByPath (wfPath wf) ar cur = case parseLBS def file of Left _ -> error "could not read file" Right d -> fromDocument d -- The specification says the file should contain either 0 or 1 @sheetViews@ -- (4th edition, section 18.3.1.88, p. 1704 and definition CT_Worksheet, p. 3910) sheetViewList = cur $/ element (n"sheetViews") &/ element (n"sheetView") >=> fromCursor sheetViews = case sheetViewList of [] -> Nothing views -> Just views commentsMap = getComments ar . relTarget =<< findRelByType commentsType sheetRels commentsType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments" sheetRels = getRels ar (wfPath wf) -- Likewise, @pageSetup@ also occurs either 0 or 1 times pageSetup = listToMaybe $ cur $/ element (n"pageSetup") >=> fromCursor cws = cur $/ element (n"cols") &/ element (n"col") >=> fromCursor (rowProps, cells) = collect $ cur $/ element (n"sheetData") &/ element (n"row") >=> parseRow parseRow c = do r <- c $| attribute "r" >=> decimal let ht = if attribute "customHeight" c == ["true"] then listToMaybe $ c $| attribute "ht" >=> rational else Nothing let s = if attribute "s" c /= [] then listToMaybe $ c $| attribute "s" >=> decimal else Nothing let rp = if isNothing s && isNothing ht then Nothing else Just (RowProps ht s) return (r, rp, c $/ element (n"c") >=> parseCell) parseCell :: Cursor -> [(Int, Int, Cell)] parseCell cell = do ref <- cell $| attribute "r" let s = listToMaybe $ cell $| attribute "s" >=> decimal t = fromMaybe "n" $ listToMaybe $ cell $| attribute "t" d = listToMaybe $ cell $/ element (n"v") &/ content >=> extractCellValue sst t (c, r) = T.span (>'9') ref comment = commentsMap >>= lookupComment ref return (int r, col2int c, Cell s d comment) collect = foldr collectRow (M.empty, M.empty) collectRow (_, Nothing, rowCells) (rowMap, cellMap) = (rowMap, foldr collectCell cellMap rowCells) collectRow (r, Just h, rowCells) (rowMap, cellMap) = (M.insert r h rowMap, foldr collectCell cellMap rowCells) collectCell (x, y, cd) = M.insert (x,y) cd merges = cur $/ parseMerges parseMerges :: Cursor -> [Text] parseMerges = element (n"mergeCells") &/ element (n"mergeCell") >=> parseMerge parseMerge c = c $| attribute "ref" extractCellValue :: SharedStringTable -> Text -> Text -> [CellValue] extractCellValue sst "s" v = case T.decimal v of Right (d, _) -> case sstItem sst d of XlsxText txt -> [CellText txt] XlsxRichText rich -> [CellRich rich] _ -> [] extractCellValue _ "str" str = [CellText str] extractCellValue _ "n" v = case T.rational v of Right (d, _) -> [CellDouble d] _ -> [] extractCellValue _ "b" "1" = [CellBool True] extractCellValue _ "b" "0" = [CellBool False] extractCellValue _ _ _ = [] -- | Get xml cursor from the specified file inside the zip archive. xmlCursor :: Zip.Archive -> FilePath -> Maybe Cursor xmlCursor ar fname = parse <$> Zip.findEntryByPath fname ar where parse entry = case parseLBS def (Zip.fromEntry entry) of Left _ -> error "could not read file" Right d -> fromDocument d -- | Get shared string table getSharedStrings :: Zip.Archive -> SharedStringTable getSharedStrings x = case xmlCursor x "xl/sharedStrings.xml" of Nothing -> error "invalid shared strings" Just c -> let [sst] = fromCursor c in sst getStyles :: Zip.Archive -> Styles getStyles ar = case Zip.fromEntry <$> Zip.findEntryByPath "xl/styles.xml" ar of Nothing -> Styles L.empty Just xml -> Styles xml getComments :: Zip.Archive -> FilePath -> Maybe CommentTable getComments ar fp = listToMaybe =<< fromCursor <$> xmlCursor ar fp getCustomProperties :: Zip.Archive -> CustomProperties getCustomProperties ar = case fromCursor <$> xmlCursor ar "docProps/custom.xml" of Just [cp] -> cp _ -> CustomProperties.empty -- | readWorkbook pulls the names of the sheets and the defined names readWorkbook :: Zip.Archive -> ([WorksheetFile], DefinedNames) readWorkbook ar = case xmlCursor ar wbPath of Nothing -> error "invalid workbook" Just c -> let sheets = c $/ element (n"sheets") &/ element (n"sheet") >=> liftA2 (worksheetFile wbRels) <$> attribute "name" <*> (attribute (odr"id") &| RefId) wbRels = getRels ar wbPath names = c $/ element (n"definedNames") &/ element (n"definedName") >=> mkDefinedName in (sheets, DefinedNames names) where wbPath = "xl/workbook.xml" -- Specification says the 'name' is required. mkDefinedName :: Cursor -> [(Text, Maybe Text, Text)] mkDefinedName c = return ( head $ attribute "name" c , listToMaybe $ attribute "localSheetId" c , T.concat $ c $/ content ) worksheetFile :: Relationships -> Text -> RefId -> WorksheetFile worksheetFile wbRels name rId = WorksheetFile name path where path = relTarget . fromJustNote "sheet path" $ Relationships.lookup rId wbRels getRels :: Zip.Archive -> FilePath -> Relationships getRels ar fp = let (dir, file) = splitFileName fp relsPath = dir "_rels" file <.> "rels" in case xmlCursor ar relsPath of Nothing -> Relationships.empty Just c -> let [rels] = fromCursor c in setTargetsFrom fp rels int :: Text -> Int int = either error fst . T.decimal