{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE TemplateHaskell #-} -------------------------------------------------------------------------------- -- | -- Module : Data.Rakhana.XRef -- Copyright : (C) 2014 Yorick Laupa -- License : (see the file LICENSE) -- -- Maintainer : Yorick Laupa -- Stability : provisional -- Portability : non-portable -- -------------------------------------------------------------------------------- module Data.Rakhana.XRef ( XRef(..) , XRefException(..) , CObj(..) , FObj(..) , UObj(..) , XRefStream(..) , getXRef , getXRefPos ) where -------------------------------------------------------------------------------- import Prelude hiding (take) import Data.Bits (shiftL) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as B8 import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy.Internal as L import Data.Foldable (traverse_) import qualified Data.Map.Strict as M import Data.Maybe (fromMaybe) import Data.Word -------------------------------------------------------------------------------- import Codec.Compression.Zlib import Codec.Compression.Zlib.Internal import Control.Lens import Control.Monad.State.Strict import Data.Attoparsec.ByteString import qualified Data.Attoparsec.ByteString.Lazy as PL import Data.Attoparsec.ByteString.Char8 import Pipes.Safe () -------------------------------------------------------------------------------- import Data.Rakhana.Internal.Parsers import Data.Rakhana.Internal.Types import Data.Rakhana.Tape import Data.Rakhana.Util.Drive -------------------------------------------------------------------------------- data Entry = FreeObject Integer Int | UsedObject Integer Int | CompressedObject Integer Int deriving Show -------------------------------------------------------------------------------- data XRef = XRef { xrefFirstNumber :: !Int , xrefObjectCount :: !Int , xrefUTable :: !UTable , xrefFTable :: !FTable , xrefCTable :: !CTable , xrefTrailer :: !Dictionary , xrefStream :: !(Maybe XRefStream) } deriving Show -------------------------------------------------------------------------------- data Predictor = Png_Up | Predictor_Unsupported Integer deriving Show -------------------------------------------------------------------------------- data DecodeParms = DecodeParms { decodeParmsColumns :: !Integer , decodeParmsPredictor :: !Predictor } deriving Show -------------------------------------------------------------------------------- data XRefStream = XRefStream { xrefStreamLength :: !Int , xrefStreamSize :: !Integer , xrefStreamFirstNumber :: !Integer , xrefStreamEntryCount :: !Integer , xrefStreamPrev :: !(Maybe Integer) , xrefStreamW :: !(Integer, Integer, Integer) , xrefStreamEntryWidth :: !Integer , xrefStreamDecodeParms :: !(Maybe DecodeParms) , xrefStreamFilter :: !(Maybe Filter) , xrefStreamPos :: !Integer , xrefStreamDict :: !Dictionary } deriving Show -------------------------------------------------------------------------------- data UObj = UObj { uObjOff :: !Integer , uObjGen :: !Int } deriving Show -------------------------------------------------------------------------------- data CObj = CObj { cObjNum :: !Int , cObjIdx :: !Integer } deriving Show -------------------------------------------------------------------------------- data FObj = FObj { fObjNxtNum :: !Int , fObjGen :: !Int } deriving Show -------------------------------------------------------------------------------- type FTable = M.Map (Int,Int) FObj type UTable = M.Map (Int,Int) UObj type CTable = M.Map (Int,Int) CObj -------------------------------------------------------------------------------- data ObjType = Free | Used | Compressed deriving Show -------------------------------------------------------------------------------- data XRefException = XRefParsingException String | InvalidXRefStream | UnsupportedFilter B.ByteString | UnsupportedPredictor Integer | ZLibException String String deriving Show -------------------------------------------------------------------------------- data ExtractState = ExtractState { _extractType :: !ObjType , _extractOffset :: !Integer , _extractGen :: !Int } -------------------------------------------------------------------------------- data UnpredictState = UnpredictState { _unpredictPrev :: ![Word8] , _unpredictId :: !Int , _unpredictUTable :: !UTable , _unpredictFTable :: !FTable , _unpredictCTable :: !CTable } -------------------------------------------------------------------------------- data NopredictState = NopredictState { _nopredictId :: !Int , _nopredictUTable :: !UTable , _nopredictFTable :: !FTable , _nopredictCTable :: !CTable } -------------------------------------------------------------------------------- -- Lenses -------------------------------------------------------------------------------- makeLenses ''ExtractState makeLenses ''UnpredictState makeLenses ''NopredictState -------------------------------------------------------------------------------- bufferSize :: Int bufferSize = 4096 -------------------------------------------------------------------------------- getXRefPos :: Monad m => Drive m (Either XRefException Integer) getXRefPos = do driveBottom driveBackward skipEOL mE <- parseEOF case mE of Just e -> return $ Left e Nothing -> do skipEOL p <- parseXRefPosInteger skipEOL mR <- parseStartXRef return $ maybe (Right p) Left mR -------------------------------------------------------------------------------- getXRef :: Monad m => Header -> Integer -> Drive m (Either XRefException XRef) getXRef h pos = do rE <- crossRef pos case rE of Left e | headerMaj h == 1 && headerMin h < 5 -> return $ Left e | otherwise -> crossRefStream pos _ -> return rE -------------------------------------------------------------------------------- crossRef :: Monad m => Integer -> Drive m (Either XRefException XRef) crossRef pos = do driveTop driveForward driveSeek pos eR <- driveParse bufferSize parseXRef return $ either (Left . XRefParsingException) Right eR -------------------------------------------------------------------------------- crossRefStream :: Monad m => Integer -> Drive m (Either XRefException XRef) crossRefStream offset = loop (offset, Nothing) where loop (off, newerRefM) = do xrefE <- crossRefStreamStep off case xrefE of Left e -> return $ Left e Right xref -> let prevM = xrefStream xref >>= xrefStreamPrev upd nRef = let nUTable = M.union (xrefUTable nRef) (xrefUTable xref) nCTable = M.union (xrefCTable nRef) (xrefCTable xref) in nRef { xrefUTable = nUTable , xrefCTable = nCTable } in case prevM of Nothing -> let updRef = maybe xref upd newerRefM in return $ Right updRef Just prev -> do xrefE' <- loop (prev, Just xref) case xrefE' of Left e' -> return $ Left e' Right xref' -> let updRef = maybe xref' upd newerRefM in return $ Right updRef -------------------------------------------------------------------------------- crossRefStreamStep :: Monad m => Integer -> Drive m (Either XRefException XRef) crossRefStreamStep offset = do streamE <- parseXRefStream offset let xstreamE = streamE >>= validateXRefStream case xstreamE of Left e -> return $ Left e Right xstream -> do let len = xrefStreamLength xstream filt = xrefStreamFilter xstream pos = xrefStreamPos xstream driveSeek pos bs <- driveGetLazy len let dbsE = decodeBS filt bs dparms = xrefStreamDecodeParms xstream mPred = fmap decodeParmsPredictor dparms res = case mPred of Nothing -> dbsE >>= noPredict xstream Just prd -> dbsE >>= unpredict prd xstream return res -------------------------------------------------------------------------------- parseXRefStream :: Monad m => Integer -> Drive m (Either XRefException Stream) parseXRefStream offset = do driveTop driveForward driveSeek offset rE <- driveParseObject 128 case rE of Left e -> return $ Left $ XRefParsingException e Right r -> let xstream = r ^. _3 expt = XRefParsingException "Expected a XRef Stream" in return $ maybe (Left $ expt) Right (xstream ^? _Stream) -------------------------------------------------------------------------------- getFilter :: Dictionary -> Maybe Filter getFilter dict = dict ^? dictKey "Filter" . _Name . to toFilt where toFilt "FlateDecode" = FlateDecode toFilt x = Filter_Unsupported x -------------------------------------------------------------------------------- decodeParms :: Dictionary -> Maybe DecodeParms decodeParms dict = do parms <- dict ^? dictKey "DecodeParms" . _Dict col <- parms ^? dictKey "Columns" . _Number . _Natural prd <- parms ^? dictKey "Predictor" . _Number . _Natural . to toPred return DecodeParms { decodeParmsColumns = col , decodeParmsPredictor = prd } where toPred 12 = Png_Up toPred x = Predictor_Unsupported x -------------------------------------------------------------------------------- decompressErrorStr :: DecompressError -> String decompressErrorStr TruncatedInput = "TruncatedInput" decompressErrorStr DictionaryRequired = "DictionaryRequired" decompressErrorStr DataError = "DataError" -------------------------------------------------------------------------------- zlibDecompress :: L.ByteString -> Either XRefException L.ByteString zlibDecompress bs = foldDecompressStream go (Right L.Empty) (\code msg -> Left $ ZLibException (decompressErrorStr code) msg) $ decompressWithErrors zlibFormat defaultDecompressParams bs where go b aE = fmap (\b' -> L.Chunk b b') aE -------------------------------------------------------------------------------- decodeBS :: Maybe Filter -> L.ByteString -> Either XRefException L.ByteString decodeBS (Just filt) bs = case filt of FlateDecode -> zlibDecompress bs Filter_Unsupported x -> Left $ UnsupportedFilter x decodeBS _ bs = Right bs -------------------------------------------------------------------------------- unpredict :: Predictor -> XRefStream -> L.ByteString -> Either XRefException XRef unpredict p xstream input = case p of Png_Up -> unpredictPngUp xstream input Predictor_Unsupported x -> Left $ UnsupportedPredictor x -------------------------------------------------------------------------------- noPredict :: XRefStream -> L.ByteString -> Either XRefException XRef noPredict xstream input = case PL.parse parser input of PL.Fail _ _ e -> Left $ XRefParsingException e PL.Done _ bs -> Right bs where width = fromIntegral $ xrefStreamEntryWidth xstream firstNumber = fromIntegral $ xrefStreamFirstNumber xstream ecount = fromIntegral $ xrefStreamEntryCount xstream start = NopredictState { _nopredictId = firstNumber - 1 , _nopredictUTable = M.empty , _nopredictFTable = M.empty , _nopredictCTable = M.empty } parser = evalStateT aState start aState = do replicateM_ ecount action utable <- use nopredictUTable ftable <- use nopredictFTable ctable <- use nopredictCTable return XRef { xrefFirstNumber = firstNumber , xrefObjectCount = ecount , xrefUTable = utable , xrefFTable = ftable , xrefCTable = ctable , xrefTrailer = M.empty , xrefStream = Just xstream } action = do oid <- nopredictId <+= 1 row <- lift step let (typ, off, gen) = extractTableEntry xstream row ref = (oid,gen) cref = (oid, 0) fobj = FObj (fromIntegral off) gen uobj = UObj off gen cobj = CObj (fromIntegral off) (fromIntegral gen) case typ of Free -> nopredictFTable.at ref ?= fobj Used -> nopredictUTable.at ref ?= uobj Compressed -> nopredictCTable.at cref ?= cobj step = do bs <- take width let row = B.unpack bs return row -------------------------------------------------------------------------------- unpredictPngUp :: XRefStream -> L.ByteString -> Either XRefException XRef unpredictPngUp xstream input = case PL.parse parser input of PL.Fail _ _ e -> Left $ XRefParsingException e PL.Done _ bs -> Right bs where width = fromIntegral $ xrefStreamEntryWidth xstream firstNumber = fromIntegral $ xrefStreamFirstNumber xstream ecount = fromIntegral $ xrefStreamEntryCount xstream start = UnpredictState { _unpredictPrev = replicate width 0 , _unpredictId = firstNumber - 1 , _unpredictUTable = M.empty , _unpredictFTable = M.empty , _unpredictCTable = M.empty } parser = evalStateT aState start aState = do replicateM_ ecount action utable <- use unpredictUTable ftable <- use unpredictFTable ctable <- use unpredictCTable return XRef { xrefFirstNumber = firstNumber , xrefObjectCount = ecount , xrefUTable = utable , xrefFTable = ftable , xrefCTable = ctable , xrefTrailer = M.empty , xrefStream = Just xstream } action = do prev <- use unpredictPrev newPrev <- lift $ step prev oid <- unpredictId <+= 1 unpredictPrev .= newPrev let (typ, off, gen) = extractTableEntry xstream newPrev ref = (oid, gen) cref = (oid, 0) fobj = FObj (fromIntegral off) gen uobj = UObj off gen cobj = CObj (fromIntegral off) (fromIntegral gen) case typ of Free -> unpredictFTable.at ref ?= fobj Used -> unpredictUTable.at ref ?= uobj Compressed -> unpredictCTable.at cref ?= cobj step prev = do _ <- anyWord8 bs <- take width let newPrev = zipWith (+) (B.unpack bs) prev return newPrev -------------------------------------------------------------------------------- extractTableEntry :: XRefStream -> [Word8] -> (ObjType, Integer, Int) extractTableEntry xstream arr = mkEntry $ execState (traverse_ action $ zip [1..width] arr) start where start = ExtractState Free 0 0 mkEntry s = let off = s ^. extractOffset gen = s ^. extractGen typ = s ^. extractType in (typ, off, gen) action (i,w) | i == 1 = case w of 0x00 -> extractType .= Free 0x01 -> extractType .= Used 0x02 -> extractType .= Compressed _ -> error $ "Invalid entry type " ++ show w | i <= oLen+1 = extractOffset += (fromIntegral w) `shiftL` (8*(oLen-i+1)) | i <= oLen+gLen+1 = extractGen += (fromIntegral w) `shiftL` (8*(gLen-i+1)) | otherwise = return () (_,c2,c3) = xrefStreamW xstream oLen = fromIntegral c2 gLen = fromIntegral c3 width = fromIntegral $ xrefStreamEntryWidth xstream -------------------------------------------------------------------------------- validateXRefStream :: Stream -> Either XRefException XRefStream validateXRefStream s = maybe (Left InvalidXRefStream) Right action where action = do typ <- dict ^? dictKey "Type" . _Name when (typ /= "XRef") Nothing size <- dict ^? dictKey "Size" . _Number . _Natural len <- dict ^? dictKey "Length" . _Number . _Natural w@(c1,c2,c3) <- getW let xstream = XRefStream { xrefStreamLength = fromIntegral len , xrefStreamSize = size , xrefStreamFirstNumber = fromMaybe 0 firstNumber , xrefStreamEntryCount = fromMaybe size entryCount , xrefStreamPrev = getPrev , xrefStreamW = w , xrefStreamDecodeParms = decodeParms dict , xrefStreamFilter = getFilter dict , xrefStreamEntryWidth = c1 + c2 + c3 , xrefStreamPos = s ^. streamPos , xrefStreamDict = dict } return xstream dict = s ^. streamDict firstNumber = do ar <- dict ^? dictKey "Index" . _Array ar ^? nth 0 . _Number . _Natural entryCount = do ar <- dict ^? dictKey "Index" . _Array ar ^? nth 1 . _Number . _Natural getPrev = dict ^? dictKey "Prev" . _Number . _Natural getW = do ar <- dict ^? dictKey "W" . _Array one <- ar ^? nth 0 . _Number . _Natural two <- ar ^? nth 1 . _Number . _Natural three <- ar ^? nth 2 . _Number . _Natural return (one, two, three) -------------------------------------------------------------------------------- skipEOL :: Monad m => Drive m () skipEOL = do bs <- drivePeek 1 case B8.uncons bs of Just (c, _) | isSpace c -> driveDiscard 1 >> skipEOL | otherwise -> return () _ -> return () -------------------------------------------------------------------------------- parseEOF :: Monad m => Drive m (Maybe XRefException) parseEOF = do bs <- driveGet 5 case bs of "%%EOF" -> return Nothing _ -> return $ Just $ XRefParsingException "Expected %%EOF" -------------------------------------------------------------------------------- parseXRefPosInteger :: Monad m => Drive m Integer parseXRefPosInteger = go [] where go cs = do bs <- drivePeek 1 case B8.uncons bs of Just (c,_) | isDigit c -> driveDiscard 1 >> go (c:cs) | otherwise -> return $ read cs _ -> return $ read cs -------------------------------------------------------------------------------- parseStartXRef :: Monad m => Drive m (Maybe XRefException) parseStartXRef = do bs <- driveGet 9 case bs of "startxref" -> return Nothing _ -> return $ Just $ XRefParsingException "Expected startxref" -------------------------------------------------------------------------------- -- Parsers -------------------------------------------------------------------------------- tableXRef :: Parser () tableXRef = do _ <- string "xref" pdfEndOfLine -------------------------------------------------------------------------------- parseXRef :: Parser XRef parseXRef = do skipSpace tableXRef (fnum, ecount) <- parseSubsectionHeader (ftable, utable) <- parseTableEntries fnum trailer <- parseTrailerAfterTable return XRef { xrefFirstNumber = fnum , xrefObjectCount = ecount , xrefUTable = utable , xrefFTable = ftable , xrefCTable = M.empty , xrefTrailer = trailer , xrefStream = Nothing } -------------------------------------------------------------------------------- parseSubsectionHeader :: Parser (Int, Int) parseSubsectionHeader = do start <- decimal skipSpace ecount <- decimal pdfEndOfLine return (start, ecount) -------------------------------------------------------------------------------- parseTrailerAfterTable :: Parser Dictionary parseTrailerAfterTable = do skipSpace _ <- string "trailer" pdfEndOfLine skipSpace Dict d <- parseDict return d -------------------------------------------------------------------------------- parseTableEntries :: Int -> Parser (FTable, UTable) parseTableEntries firstNumber = loop start where loop (i, ftable, utable) = do mT <- optional parseTableEntry case mT of Nothing -> return (ftable, utable) Just (off, gen, used) | used -> let key = (i, gen) obj = UObj off gen utable' = M.insert key obj utable in loop (i+1, ftable, utable') | otherwise -> let key = (i, gen) obj = FObj (fromIntegral off) gen ftable' = M.insert key obj ftable in loop (i+1, ftable', utable) start :: (Int, FTable, UTable) start = (firstNumber, M.empty, M.empty) -------------------------------------------------------------------------------- parseTableEntry :: Parser (Integer, Int, Bool) parseTableEntry = do skipSpace offset <- decimal skipSpace gen <- decimal skipSpace c <- anyChar case c of 'n' -> return (offset, gen, True) 'f' -> return (offset, gen, False) _ -> let msg = "error parsing XRef table entry: unknown char: " ++ [c] in fail msg