{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE TemplateHaskell #-} -------------------------------------------------------------------------------- -- | -- Module : Data.Rakhana.XRef -- Copyright : (C) 2014 Yorick Laupa -- License : (see the file LICENSE) -- -- Maintainer : Yorick Laupa -- Stability : provisional -- Portability : non-portable -- -------------------------------------------------------------------------------- module Data.Rakhana.XRef ( XRef(..) , XRefException(..) , CObj(..) , FObj(..) , UObj(..) , XRefStream(..) , getXRef , getXRefPos ) where -------------------------------------------------------------------------------- import Prelude hiding (take) import Data.Bits (shiftL) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as B8 import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Internal as L import Data.Foldable (traverse_) import qualified Data.Map.Strict as M import Data.Maybe (fromMaybe) import Data.Word -------------------------------------------------------------------------------- import Codec.Compression.Zlib import Codec.Compression.Zlib.Internal import Control.Lens import Control.Monad.Error.Class import Control.Monad.State.Strict import Data.Attoparsec.ByteString import qualified Data.Attoparsec.ByteString.Lazy as PL import Data.Attoparsec.ByteString.Char8 -------------------------------------------------------------------------------- import Data.Rakhana.Internal.Parsers import Data.Rakhana.Internal.Types import Data.Rakhana.Tape import Data.Rakhana.Util.Drive -------------------------------------------------------------------------------- data Entry = FreeObject Integer Int | UsedObject Integer Int | CompressedObject Integer Int deriving Show -------------------------------------------------------------------------------- data XRef = XRef { xrefFirstNumber :: !Int , xrefObjectCount :: !Int , xrefUTable :: !UTable , xrefFTable :: !FTable , xrefCTable :: !CTable , xrefTrailer :: !Dictionary , xrefStream :: !(Maybe XRefStream) } deriving Show -------------------------------------------------------------------------------- data Predictor = Png_Up | Predictor_Unsupported Integer deriving Show -------------------------------------------------------------------------------- data DecodeParms = DecodeParms { decodeParmsColumns :: !Integer , decodeParmsPredictor :: !Predictor } deriving Show -------------------------------------------------------------------------------- data XRefStream = XRefStream { xrefStreamLength :: !Int , xrefStreamSize :: !Integer , xrefStreamFirstNumber :: !Integer , xrefStreamEntryCount :: !Integer , xrefStreamPrev :: !(Maybe Integer) , xrefStreamW :: !(Integer, Integer, Integer) , xrefStreamEntryWidth :: !Integer , xrefStreamDecodeParms :: !(Maybe DecodeParms) , xrefStreamFilter :: !(Maybe Filter) , xrefStreamPos :: !Integer , xrefStreamDict :: !Dictionary } deriving Show -------------------------------------------------------------------------------- data UObj = UObj { uObjOff :: !Integer , uObjGen :: !Int } deriving Show -------------------------------------------------------------------------------- data CObj = CObj { cObjNum :: !Int , cObjIdx :: !Integer } deriving Show -------------------------------------------------------------------------------- data FObj = FObj { fObjNxtNum :: !Int , fObjGen :: !Int } deriving Show -------------------------------------------------------------------------------- type FTable = M.Map (Int,Int) FObj type UTable = M.Map (Int,Int) UObj type CTable = M.Map (Int,Int) CObj -------------------------------------------------------------------------------- data ObjType = Free | Used | Compressed deriving Show -------------------------------------------------------------------------------- data XRefException = XRefParsingException String | InvalidXRefStream | UnsupportedFilter B.ByteString | UnsupportedPredictor Integer | ZLibException String String deriving Show -------------------------------------------------------------------------------- data ExtractState = ExtractState { _extractType :: !ObjType , _extractOffset :: !Integer , _extractGen :: !Int } -------------------------------------------------------------------------------- data UnpredictState = UnpredictState { _unpredictPrev :: ![Word8] , _unpredictId :: !Int , _unpredictUTable :: !UTable , _unpredictFTable :: !FTable , _unpredictCTable :: !CTable } -------------------------------------------------------------------------------- data NoUnpredictState = NoUnpredictState { _noUnpredictId :: !Int , _noUnpredictUTable :: !UTable , _noUnpredictFTable :: !FTable , _noUnpredictCTable :: !CTable } -------------------------------------------------------------------------------- data EntriesParse = EntriesParse { _entriesParseId :: !Int , _entriesParseFTable :: !FTable , _entriesParseUTable :: !UTable } -------------------------------------------------------------------------------- -- Lenses -------------------------------------------------------------------------------- makeLenses ''ExtractState makeLenses ''UnpredictState makeLenses ''NoUnpredictState makeLenses ''EntriesParse -------------------------------------------------------------------------------- bufferSize :: Int bufferSize = 4096 -------------------------------------------------------------------------------- getXRefPos :: MonadError XRefException m => Drive m Integer getXRefPos = do driveBottom driveBackward skipEOL parseEOF skipEOL p <- parseXRefPosInteger skipEOL parseStartXRef return p -------------------------------------------------------------------------------- getXRef :: MonadError XRefException m => Header -> Integer -> Drive m XRef getXRef h pos = do driveTop driveForward catchError (normalXRef pos) $ \e -> if headerMaj h == 1 && headerMin h < 5 then throwError e else streamXRef pos -------------------------------------------------------------------------------- normalXRef :: MonadError XRefException m => Integer -> Drive m XRef normalXRef pos = do driveSeek pos xref <- parsing let dict = xrefTrailer xref mPrev = dict ^? dictKey "Prev" . _Number . _Natural case mPrev of Nothing -> return xref Just prev -> do xrefPrev <- normalXRef prev let nUTable = M.union (xrefUTable xref) (xrefUTable xrefPrev) newXRef = xref { xrefUTable = nUTable } return newXRef where parsing = driveParse bufferSize parseXRef >>= either (throwError . XRefParsingException) return -------------------------------------------------------------------------------- streamXRef :: MonadError XRefException m => Integer -> Drive m XRef streamXRef offset = loop (offset, Nothing) where loop (off, newerRefM) = do xref <- streamXRefStep off let prevXRefPos = xrefStream xref >>= xrefStreamPrev updateXRef nRef = let nUTable = M.union (xrefUTable nRef) (xrefUTable xref) nCTable = M.union (xrefCTable nRef) (xrefCTable xref) in nRef { xrefUTable = nUTable , xrefCTable = nCTable } case prevXRefPos of Nothing -> return $ maybe xref updateXRef newerRefM Just prev -> loop (prev, Just xref) >>= \xref' -> return $ maybe xref' updateXRef newerRefM -------------------------------------------------------------------------------- streamXRefStep :: MonadError XRefException m => Integer -> Drive m XRef streamXRefStep offset = do stream <- parseXRefStream offset xstream <- validateXRefStream stream let len = xrefStreamLength xstream filt = xrefStreamFilter xstream pos = xrefStreamPos xstream dparms = xrefStreamDecodeParms xstream mPred = fmap decodeParmsPredictor dparms driveSeek pos bs <- driveGetLazy len dbs <- decodeByteString filt bs case mPred of Nothing -> noUnpredict xstream dbs Just prd -> unpredict prd xstream dbs -------------------------------------------------------------------------------- parseXRefStream :: MonadError XRefException m => Integer -> Drive m Stream parseXRefStream offset = do driveSeek offset (_,_,obj) <- parsing maybe (throwError $ XRefParsingException "Expected a XRef Stream") return (obj ^? _Stream) where parsing = driveParseObject 128 >>= either (throwError . XRefParsingException) return -------------------------------------------------------------------------------- getFilter :: Dictionary -> Maybe Filter getFilter dict = dict ^? dictKey "Filter" . _Name . to toFilt where toFilt "FlateDecode" = FlateDecode toFilt x = Filter_Unsupported x -------------------------------------------------------------------------------- decodeParms :: Dictionary -> Maybe DecodeParms decodeParms dict = do parms <- dict ^? dictKey "DecodeParms" . _Dict col <- parms ^? dictKey "Columns" . _Number . _Natural prd <- parms ^? dictKey "Predictor" . _Number . _Natural . to toPred return DecodeParms { decodeParmsColumns = col , decodeParmsPredictor = prd } where toPred 12 = Png_Up toPred x = Predictor_Unsupported x -------------------------------------------------------------------------------- decompressErrorStr :: DecompressError -> String decompressErrorStr TruncatedInput = "TruncatedInput" decompressErrorStr DictionaryRequired = "DictionaryRequired" decompressErrorStr DataError = "DataError" -------------------------------------------------------------------------------- zlibDecompress :: MonadError XRefException m => L.ByteString -> m L.ByteString zlibDecompress bs = foldDecompressStream (liftM . L.Chunk) (return L.Empty) (\code -> throwError . ZLibException (decompressErrorStr code)) (decompressWithErrors zlibFormat defaultDecompressParams bs) -------------------------------------------------------------------------------- decodeByteString :: MonadError XRefException m => Maybe Filter -> L.ByteString -> m L.ByteString decodeByteString (Just filt) bs = case filt of FlateDecode -> zlibDecompress bs Filter_Unsupported x -> throwError $ UnsupportedFilter x decodeByteString _ bs = return bs -------------------------------------------------------------------------------- unpredict :: MonadError XRefException m => Predictor -> XRefStream -> L.ByteString -> m XRef unpredict p xstream input = case p of Png_Up -> unpredictPngUp xstream input Predictor_Unsupported x -> throwError $ UnsupportedPredictor x -------------------------------------------------------------------------------- noUnpredict :: MonadError XRefException m => XRefStream -> L.ByteString -> m XRef noUnpredict xstream input = case PL.parse parser input of PL.Fail _ _ e -> throwError $ XRefParsingException e PL.Done _ bs -> return bs where width = fromIntegral $ xrefStreamEntryWidth xstream firstNumber = fromIntegral $ xrefStreamFirstNumber xstream ecount = fromIntegral $ xrefStreamEntryCount xstream start = NoUnpredictState { _noUnpredictId = firstNumber - 1 , _noUnpredictUTable = M.empty , _noUnpredictFTable = M.empty , _noUnpredictCTable = M.empty } parser = evalStateT aState start aState = do replicateM_ ecount action utable <- use noUnpredictUTable ftable <- use noUnpredictFTable ctable <- use noUnpredictCTable return XRef { xrefFirstNumber = firstNumber , xrefObjectCount = ecount , xrefUTable = utable , xrefFTable = ftable , xrefCTable = ctable , xrefTrailer = M.empty , xrefStream = Just xstream } action = do oid <- noUnpredictId <+= 1 row <- lift step (typ, off, gen) <- either fail return (extractTableEntry xstream row) let ref = (oid,gen) cref = (oid, 0) fobj = FObj (fromIntegral off) gen uobj = UObj off gen cobj = CObj (fromIntegral off) (fromIntegral gen) case typ of Free -> noUnpredictFTable.at ref ?= fobj Used -> noUnpredictUTable.at ref ?= uobj Compressed -> noUnpredictCTable.at cref ?= cobj step = do bs <- take width let row = B.unpack bs return row -------------------------------------------------------------------------------- unpredictPngUp :: MonadError XRefException m => XRefStream -> L.ByteString -> m XRef unpredictPngUp xstream input = case PL.parse parser input of PL.Fail _ _ e -> throwError $ XRefParsingException e PL.Done _ bs -> return bs where width = fromIntegral $ xrefStreamEntryWidth xstream firstNumber = fromIntegral $ xrefStreamFirstNumber xstream ecount = fromIntegral $ xrefStreamEntryCount xstream start = UnpredictState { _unpredictPrev = replicate width 0 , _unpredictId = firstNumber - 1 , _unpredictUTable = M.empty , _unpredictFTable = M.empty , _unpredictCTable = M.empty } parser = evalStateT aState start aState = do replicateM_ ecount action utable <- use unpredictUTable ftable <- use unpredictFTable ctable <- use unpredictCTable return XRef { xrefFirstNumber = firstNumber , xrefObjectCount = ecount , xrefUTable = utable , xrefFTable = ftable , xrefCTable = ctable , xrefTrailer = M.empty , xrefStream = Just xstream } action = do prev <- use unpredictPrev newPrev <- lift $ step prev oid <- unpredictId <+= 1 unpredictPrev .= newPrev (typ, off, gen) <- either fail return (extractTableEntry xstream newPrev) let ref = (oid, gen) cref = (oid, 0) fobj = FObj (fromIntegral off) gen uobj = UObj off gen cobj = CObj (fromIntegral off) (fromIntegral gen) case typ of Free -> unpredictFTable.at ref ?= fobj Used -> unpredictUTable.at ref ?= uobj Compressed -> unpredictCTable.at cref ?= cobj step prev = do _ <- anyWord8 bs <- take width let newPrev = zipWith (+) (B.unpack bs) prev return newPrev -------------------------------------------------------------------------------- extractTableEntry :: XRefStream -> [Word8] -> Either String (ObjType, Integer, Int) extractTableEntry xstream arr = fmap mkEntry $ execStateT action start where action = traverse_ step $ zip [1..width] arr start = ExtractState Free 0 0 mkEntry s = let off = s ^. extractOffset gen = s ^. extractGen typ = s ^. extractType in (typ, off, gen) step (i,w) | i == 1 = case w of 0x00 -> extractType .= Free 0x01 -> extractType .= Used 0x02 -> extractType .= Compressed _ -> lift $ Left $ "Invalid entry type " ++ show w | i <= oLen+1 = extractOffset += (fromIntegral w) `shiftL` (8*(oLen-i+1)) | i <= oLen+gLen+1 = extractGen += (fromIntegral w) `shiftL` (8*(gLen-i+1)) | otherwise = return () (_,c2,c3) = xrefStreamW xstream oLen = fromIntegral c2 gLen = fromIntegral c3 width = fromIntegral $ xrefStreamEntryWidth xstream -------------------------------------------------------------------------------- validateXRefStream :: MonadError XRefException m => Stream -> m XRefStream validateXRefStream s = maybe (throwError InvalidXRefStream) return action where action = do typ <- dict ^? dictKey "Type" . _Name when (typ /= "XRef") Nothing size <- dict ^? dictKey "Size" . _Number . _Natural len <- dict ^? dictKey "Length" . _Number . _Natural w@(c1,c2,c3) <- getW let xstream = XRefStream { xrefStreamLength = fromIntegral len , xrefStreamSize = size , xrefStreamFirstNumber = fromMaybe 0 firstNumber , xrefStreamEntryCount = fromMaybe size entryCount , xrefStreamPrev = getPrev , xrefStreamW = w , xrefStreamDecodeParms = decodeParms dict , xrefStreamFilter = getFilter dict , xrefStreamEntryWidth = c1 + c2 + c3 , xrefStreamPos = s ^. streamPos , xrefStreamDict = dict } return xstream dict = s ^. streamDict firstNumber = do ar <- dict ^? dictKey "Index" . _Array ar ^? nth 0 . _Number . _Natural entryCount = do ar <- dict ^? dictKey "Index" . _Array ar ^? nth 1 . _Number . _Natural getPrev = dict ^? dictKey "Prev" . _Number . _Natural getW = do ar <- dict ^? dictKey "W" . _Array one <- ar ^? nth 0 . _Number . _Natural two <- ar ^? nth 1 . _Number . _Natural three <- ar ^? nth 2 . _Number . _Natural return (one, two, three) -------------------------------------------------------------------------------- skipEOL :: Monad m => Drive m () skipEOL = do bs <- drivePeek 1 case B8.uncons bs of Just (c, _) | isSpace c -> driveDiscard 1 >> skipEOL | otherwise -> return () _ -> return () -------------------------------------------------------------------------------- parseEOF :: MonadError XRefException m => Drive m () parseEOF = do bs <- driveGet 5 case bs of "%%EOF" -> return () _ -> throwError $ XRefParsingException "Expected %%EOF" -------------------------------------------------------------------------------- parseXRefPosInteger :: Monad m => Drive m Integer parseXRefPosInteger = go [] where go cs = do bs <- drivePeek 1 case B8.uncons bs of Just (c,_) | isDigit c -> driveDiscard 1 >> go (c:cs) | otherwise -> return $ read cs _ -> return $ read cs -------------------------------------------------------------------------------- parseStartXRef :: MonadError XRefException m => Drive m () parseStartXRef = do bs <- driveGet 9 case bs of "startxref" -> return () _ -> throwError $ XRefParsingException "Expected startxref" -------------------------------------------------------------------------------- -- Parsers -------------------------------------------------------------------------------- tableXRef :: Parser () tableXRef = do _ <- string "xref" pdfEndOfLine -------------------------------------------------------------------------------- parseXRef :: Parser XRef parseXRef = do skipSpace tableXRef (fnum, ecount) <- parseSubsectionHeader (ftable, utable) <- parseTableEntries fnum ecount trailer <- parseTrailerAfterTable return XRef { xrefFirstNumber = fnum , xrefObjectCount = ecount , xrefUTable = utable , xrefFTable = ftable , xrefCTable = M.empty , xrefTrailer = trailer , xrefStream = Nothing } -------------------------------------------------------------------------------- parseSubsectionHeader :: Parser (Int, Int) parseSubsectionHeader = do start <- decimal skipSpace ecount <- decimal pdfEndOfLine return (start, ecount) -------------------------------------------------------------------------------- parseTrailerAfterTable :: Parser Dictionary parseTrailerAfterTable = do skipSpace _ <- string "trailer" pdfEndOfLine skipSpace Dict d <- parseDict return d -------------------------------------------------------------------------------- parseTableEntries :: Int -> Int -> Parser (FTable, UTable) parseTableEntries firstNumber n = parser where parser = fmap end $ execStateT action start action = replicateM_ n step step = do (off, gen, used) <- lift parseTableEntry eid <- entriesParseId <+= 1 let key = (eid, gen) when (off /= 0) $ if used then entriesParseUTable.at key ?= UObj off gen else let nxt = fromIntegral off in entriesParseFTable.at key ?= FObj nxt gen start = EntriesParse { _entriesParseId = firstNumber - 1 , _entriesParseFTable = M.empty , _entriesParseUTable = M.empty } end s = (s ^. entriesParseFTable, s ^. entriesParseUTable) -------------------------------------------------------------------------------- parseTableEntry :: Parser (Integer, Int, Bool) parseTableEntry = do skipSpace offset <- decimal skipSpace gen <- decimal skipSpace c <- anyChar case c of 'n' -> return (offset, gen, True) 'f' -> return (offset, gen, False) _ -> let msg = "error parsing XRef table entry: unknown char: " ++ [c] in fail msg