src/Codec/Xlsx/Parser.hs

{-# LANGUAGE NoMonomorphismRestriction #-}
{-# LANGUAGE OverloadedStrings         #-}
{-# LANGUAGE RecordWildCards           #-}
{-# LANGUAGE ScopedTypeVariables       #-}
{-# LANGUAGE TupleSections             #-}
{-# LANGUAGE DeriveGeneric #-}

-- | This module provides a function for reading .xlsx files
module Codec.Xlsx.Parser
  ( toXlsx
  , toXlsxEither
  , ParseError(..)
  , Parser
  ) where


import qualified Codec.Archive.Zip as Zip
import Control.Applicative
import Control.Arrow (left)
import Control.Error.Safe (headErr)
import Control.Error.Util (note)
import Control.Lens hiding (element, views, (<.>))
import Control.Monad.Except (catchError, throwError)
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Lazy.Char8 ()
import Data.List
import qualified Data.Map as M
import Data.Maybe
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Read as T
import Data.Traversable
import GHC.Generics (Generic)
import Prelude hiding (sequence)
import System.FilePath.Posix
import Text.XML as X
import Text.XML.Cursor

import Codec.Xlsx.Parser.Internal
import Codec.Xlsx.Parser.Internal.PivotTable
import Codec.Xlsx.Types
import Codec.Xlsx.Types.Internal
import Codec.Xlsx.Types.Internal.CfPair
import Codec.Xlsx.Types.Internal.CommentTable as CommentTable
import Codec.Xlsx.Types.Internal.ContentTypes as ContentTypes
import Codec.Xlsx.Types.Internal.CustomProperties
       as CustomProperties
import Codec.Xlsx.Types.Internal.DvPair
import Codec.Xlsx.Types.Internal.Relationships as Relationships
import Codec.Xlsx.Types.Internal.SharedStringTable
import Codec.Xlsx.Types.PivotTable.Internal

-- | Reads `Xlsx' from raw data (lazy bytestring)
toXlsx :: L.ByteString -> Xlsx
toXlsx = either (error . show) id . toXlsxEither

data ParseError = InvalidZipArchive
                | MissingFile FilePath
                | InvalidFile FilePath
                | InvalidRef FilePath RefId
                | InconsistentXlsx Text
                deriving (Eq, Show, Generic)

type Parser = Either ParseError

-- | Reads `Xlsx' from raw data (lazy bytestring), failing with Left on parse error
toXlsxEither :: L.ByteString -> Parser Xlsx
toXlsxEither bs = do
  ar <- left (const InvalidZipArchive) $ Zip.toArchiveOrFail bs
  sst <- getSharedStrings ar
  contentTypes <- getContentTypes ar
  (wfs, names, cacheSources) <- readWorkbook ar
  sheets <- forM wfs $ \wf -> do
      sheet <- extractSheet ar sst contentTypes cacheSources wf
      return (wfName wf, sheet)
  CustomProperties customPropMap <- getCustomProperties ar
  return $ Xlsx sheets (getStyles ar) names customPropMap

data WorksheetFile = WorksheetFile { wfName :: Text
                                   , wfPath :: FilePath
                                   }
                   deriving (Show, Generic)

type Caches = [(CacheId, (Text, CellRef, [CacheField]))]

extractSheet :: Zip.Archive
             -> SharedStringTable
             -> ContentTypes
             -> Caches
             -> WorksheetFile
             -> Parser Worksheet
extractSheet ar sst contentTypes caches wf = do
  let filePath = wfPath wf
  file <- note (MissingFile filePath) $ Zip.fromEntry <$> Zip.findEntryByPath filePath ar
  cur <- fmap fromDocument . left (\_ -> InvalidFile filePath) $
         parseLBS def file
  sheetRels <- getRels ar filePath

  -- The specification says the file should contain either 0 or 1 @sheetViews@
  -- (4th edition, section 18.3.1.88, p. 1704 and definition CT_Worksheet, p. 3910)
  let  sheetViewList = cur $/ element (n_ "sheetViews") &/ element (n_ "sheetView") >=> fromCursor
       sheetViews = case sheetViewList of
         []    -> Nothing
         views -> Just views

  let commentsType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
      commentTarget :: Maybe FilePath
      commentTarget = relTarget <$> findRelByType commentsType sheetRels
      legacyDrRId = cur $/ element (n_ "legacyDrawing") >=> fromAttribute (odr"id")
      legacyDrPath = fmap relTarget . flip Relationships.lookup sheetRels  =<< listToMaybe legacyDrRId

  commentsMap :: Maybe CommentTable <- maybe (Right Nothing) (getComments ar legacyDrPath) commentTarget

  -- Likewise, @pageSetup@ also occurs either 0 or 1 times
  let pageSetup = listToMaybe $ cur $/ element (n_ "pageSetup") >=> fromCursor

      cws = cur $/ element (n_ "cols") &/ element (n_ "col") >=> fromCursor

      (rowProps, cells0) = collect $ cur $/ element (n_ "sheetData") &/ element (n_ "row") >=> parseRow
      parseRow :: Cursor -> [(Int, Maybe RowProperties, [(Int, Int, Cell)])]
      parseRow c = do
        r <- fromAttribute "r" c
        let prop = RowProps
              { rowHeight = do h <- listToMaybe $ fromAttribute "ht" c
                               case fromAttribute "customHeight" c of
                                 [True] -> return $ CustomHeight    h
                                 _      -> return $ AutomaticHeight h
              , rowStyle  = listToMaybe $ fromAttribute "s" c
              , rowHidden =
                  case fromAttribute "hidden" c of
                    []  -> False
                    f:_ -> f
              }
        return ( r
               , if prop == def then Nothing else Just prop
               , c $/ element (n_ "c") >=> parseCell
               )
      parseCell :: Cursor -> [(Int, Int, Cell)]
      parseCell cell = do
        ref <- fromAttribute "r" cell
        let
          s = listToMaybe $ cell $| attribute "s" >=> decimal
          t = fromMaybe "n" $ listToMaybe $ cell $| attribute "t"
          d = listToMaybe $ cell $/ element (n_ "v") &/ content >=> extractCellValue sst t
          f = listToMaybe $ cell $/ element (n_ "f") >=> fromCursor
          (r, c) = fromSingleCellRefNoting ref
          comment = commentsMap >>= lookupComment ref
        return (r, c, Cell s d comment f)
      collect = foldr collectRow (M.empty, M.empty)
      collectRow (_, Nothing, rowCells) (rowMap, cellMap) =
        (rowMap, foldr collectCell cellMap rowCells)
      collectRow (r, Just h, rowCells) (rowMap, cellMap) =
        (M.insert r h rowMap, foldr collectCell cellMap rowCells)
      collectCell (x, y, cd) = M.insert (x,y) cd

      commentCells =
        M.fromList
          [ (fromSingleCellRefNoting r, def {_cellComment = Just cmnt})
          | (r, cmnt) <- maybe [] CommentTable.toList commentsMap
          ]
      cells = cells0 `M.union` commentCells

      mProtection = listToMaybe $ cur $/ element (n_ "sheetProtection") >=> fromCursor

      mDrawingId = listToMaybe $ cur $/ element (n_ "drawing") >=> fromAttribute (odr"id")

      merges = cur $/ parseMerges
      parseMerges :: Cursor -> [Range]
      parseMerges = element (n_ "mergeCells") &/ element (n_ "mergeCell") >=> fromAttribute "ref"

      condFormtattings = M.fromList . map unCfPair  $ cur $/ element (n_ "conditionalFormatting") >=> fromCursor

      validations = M.fromList . map unDvPair $
          cur $/ element (n_ "dataValidations") &/ element (n_ "dataValidation") >=> fromCursor

      tableIds =
        cur $/ element (n_ "tableParts") &/ element (n_ "tablePart") >=>
        fromAttribute (odr "id")

  let mAutoFilter = listToMaybe $ cur $/ element (n_ "autoFilter") >=> fromCursor

  mDrawing <- case mDrawingId of
      Just dId -> do
          rel <- note (InvalidRef filePath dId) $ Relationships.lookup dId sheetRels
          Just <$> getDrawing ar contentTypes (relTarget rel)
      Nothing  ->
          return Nothing

  let ptType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotTable"
  pTables <- forM (allByType ptType sheetRels) $ \rel -> do
    let ptPath = relTarget rel
    bs <- note (MissingFile ptPath) $ Zip.fromEntry <$> Zip.findEntryByPath ptPath ar
    note (InconsistentXlsx $ "Bad pivot table in " <> T.pack ptPath) $
      parsePivotTable (flip Prelude.lookup caches) bs

  tables <- forM tableIds $ \rId -> do
    fp <- lookupRelPath filePath sheetRels rId
    getTable ar fp

  return $
    Worksheet
      cws
      rowProps
      cells
      mDrawing
      merges
      sheetViews
      pageSetup
      condFormtattings
      validations
      pTables
      mAutoFilter
      tables
      mProtection

extractCellValue :: SharedStringTable -> Text -> Text -> [CellValue]
extractCellValue sst "s" v =
    case T.decimal v of
      Right (d, _) ->
        case sstItem sst d of
          XlsxText     txt  -> [CellText txt]
          XlsxRichText rich -> [CellRich rich]
      _ ->
        []
extractCellValue _ "str" str = [CellText str]
extractCellValue _ "n" v =
    case T.rational v of
      Right (d, _) -> [CellDouble d]
      _            -> []
extractCellValue _ "b" "1" = [CellBool True]
extractCellValue _ "b" "0" = [CellBool False]
extractCellValue _ _ _ = []

-- | Get xml cursor from the specified file inside the zip archive.
xmlCursorOptional :: Zip.Archive -> FilePath -> Parser (Maybe Cursor)
xmlCursorOptional ar fname =
    (Just <$> xmlCursorRequired ar fname) `catchError` missingToNothing
  where
    missingToNothing :: ParseError -> Either ParseError (Maybe a)
    missingToNothing (MissingFile _) = return Nothing
    missingToNothing other           = throwError other

-- | Get xml cursor from the given file, failing with MissingFile if not found.
xmlCursorRequired :: Zip.Archive -> FilePath -> Parser Cursor
xmlCursorRequired ar fname = do
    entry <- note (MissingFile fname) $ Zip.findEntryByPath fname ar
    cur <- left (\_ -> InvalidFile fname) $ parseLBS def (Zip.fromEntry entry)
    return $ fromDocument cur

-- | Get shared string table
getSharedStrings  :: Zip.Archive -> Parser SharedStringTable
getSharedStrings x = maybe sstEmpty (head . fromCursor) <$>
                     xmlCursorOptional x "xl/sharedStrings.xml"

getContentTypes :: Zip.Archive -> Parser ContentTypes
getContentTypes x = head . fromCursor <$> xmlCursorRequired x "[Content_Types].xml"

getStyles :: Zip.Archive -> Styles
getStyles ar = case Zip.fromEntry <$> Zip.findEntryByPath "xl/styles.xml" ar of
  Nothing  -> Styles L.empty
  Just xml -> Styles xml

getComments :: Zip.Archive -> Maybe FilePath -> FilePath -> Parser (Maybe CommentTable)
getComments ar drp fp = do
    mCurComments <- xmlCursorOptional ar fp
    mCurDr <- maybe (return Nothing) (xmlCursorOptional ar) drp
    return (liftA2 hide (hidden <$> mCurDr) . listToMaybe . fromCursor =<< mCurComments)
  where
    hide refs (CommentTable m) = CommentTable $ foldl' hideComment m refs
    hideComment m r = M.adjust (\c->c{_commentVisible = False}) r m
    v nm = Name nm (Just "urn:schemas-microsoft-com:vml") Nothing
    x nm = Name nm (Just "urn:schemas-microsoft-com:office:excel") Nothing
    hidden :: Cursor -> [CellRef]
    hidden cur = cur $/ checkElement visibleShape &/
                 element (x"ClientData") >=> shapeCellRef
    visibleShape Element{..} = elementName ==  (v"shape") &&
        maybe False (any ("visibility:hidden"==) . T.split (==';')) (M.lookup "style" elementAttributes)
    shapeCellRef :: Cursor -> [CellRef]
    shapeCellRef c = do
        r0 <- c $/ element (x"Row") &/ content >=> decimal
        c0 <- c $/ element (x"Column") &/ content >=> decimal
        return $ singleCellRef (r0 + 1, c0 + 1)

getCustomProperties :: Zip.Archive -> Parser CustomProperties
getCustomProperties ar = maybe CustomProperties.empty (head . fromCursor) <$> xmlCursorOptional ar "docProps/custom.xml"

getDrawing :: Zip.Archive -> ContentTypes ->  FilePath -> Parser Drawing
getDrawing ar contentTypes fp = do
    cur <- xmlCursorRequired ar fp
    drawingRels <- getRels ar fp
    unresolved <- headErr (InvalidFile fp) (fromCursor cur)
    anchors <- forM (unresolved ^. xdrAnchors) $ resolveFileInfo drawingRels
    return $ Drawing anchors
  where
    resolveFileInfo :: Relationships -> Anchor RefId RefId -> Parser (Anchor FileInfo ChartSpace)
    resolveFileInfo rels uAnch =
      case uAnch ^. anchObject of
        Picture {..} -> do
          let mRefId = _picBlipFill ^. bfpImageInfo
          mFI <- lookupFI rels mRefId
          let pic' =
                Picture
                { _picMacro = _picMacro
                , _picPublished = _picPublished
                , _picNonVisual = _picNonVisual
                , _picBlipFill = (_picBlipFill & bfpImageInfo .~ mFI)
                , _picShapeProperties = _picShapeProperties
                }
          return uAnch {_anchObject = pic'}
        Graphic nv rId tr -> do
          chartPath <- lookupRelPath fp rels rId
          chart <- readChart ar chartPath
          return uAnch {_anchObject = Graphic nv chart tr}
    lookupFI _ Nothing = return Nothing
    lookupFI rels (Just rId) = do
        path <- lookupRelPath fp rels rId
        -- content types use paths starting with /
        contentType <- note (InvalidFile path) $ ContentTypes.lookup ("/" <> path) contentTypes
        contents <- Zip.fromEntry <$> note (MissingFile path) (Zip.findEntryByPath path ar)
        return . Just $ FileInfo (stripMediaPrefix path) contentType contents
    stripMediaPrefix :: FilePath -> FilePath
    stripMediaPrefix p = fromMaybe p $ stripPrefix "xl/media/" p

readChart :: Zip.Archive -> FilePath -> Parser ChartSpace
readChart ar path = head . fromCursor <$> xmlCursorRequired ar path

-- | readWorkbook pulls the names of the sheets and the defined names
readWorkbook :: Zip.Archive -> Parser ([WorksheetFile], DefinedNames, Caches)
readWorkbook ar = do
  let wbPath = "xl/workbook.xml"
  cur <- xmlCursorRequired ar wbPath
  wbRels <- getRels ar wbPath
  -- Specification says the 'name' is required.
  let mkDefinedName :: Cursor -> [(Text, Maybe Text, Text)]
      mkDefinedName c =
        return
          ( head $ attribute "name" c
          , listToMaybe $ attribute "localSheetId" c
          , T.concat $ c $/ content)
      names =
        cur $/ element (n_ "definedNames") &/ element (n_ "definedName") >=>
        mkDefinedName
  sheets <-
    sequence $
    cur $/ element (n_ "sheets") &/ element (n_ "sheet") >=>
    liftA2 (worksheetFile wbPath wbRels) <$> attribute "name" <*>
    fromAttribute (odr "id")
  let cacheRefs =
        cur $/ element (n_ "pivotCaches") &/ element (n_ "pivotCache") >=>
        liftA2 (,) <$> fromAttribute "cacheId" <*> fromAttribute (odr "id")
  caches <-
    forM cacheRefs $ \(cacheId, rId) -> do
      path <- lookupRelPath wbPath wbRels rId
      bs <-
        note (MissingFile path) $ Zip.fromEntry <$> Zip.findEntryByPath path ar
      sources <-
        note (InconsistentXlsx $ "Bad pivot table cache in " <> T.pack path) $
        parseCache bs
      return (cacheId, sources)
  return (sheets, DefinedNames names, caches)

getTable :: Zip.Archive -> FilePath -> Parser Table
getTable ar fp = do
  cur <- xmlCursorRequired ar fp
  headErr (InvalidFile fp) (fromCursor cur)

worksheetFile :: FilePath -> Relationships -> Text -> RefId -> Parser WorksheetFile
worksheetFile parentPath wbRels name rId =
  WorksheetFile name <$> lookupRelPath parentPath wbRels rId

getRels :: Zip.Archive -> FilePath -> Parser Relationships
getRels ar fp = do
    let (dir, file) = splitFileName fp
        relsPath = dir </> "_rels" </> file <.> "rels"
    c <- xmlCursorOptional ar relsPath
    return $ maybe Relationships.empty (setTargetsFrom fp . head . fromCursor) c

lookupRelPath :: FilePath
              -> Relationships
              -> RefId
              -> Either ParseError FilePath
lookupRelPath fp rels rId =
  relTarget <$> note (InvalidRef fp rId) (Relationships.lookup rId rels)