-- | Parse content stream

module Pdf.Toolbox.Content.Parser
(
  parseContentStream,
  readNextOperator,
  parseContent
)
where

import Data.Int
import qualified Data.ByteString.Char8 as ByteString8
import Data.Attoparsec.ByteString.Char8 (Parser)
import qualified Data.Attoparsec.ByteString.Char8 as Parser
import Data.IORef
import Control.Applicative
import Control.Exception
import System.IO.Streams (InputStream)
import qualified System.IO.Streams as Streams
import qualified System.IO.Streams.Attoparsec as Streams

import Pdf.Toolbox.Core
import Pdf.Toolbox.Core.Parsers.Object

import Pdf.Toolbox.Content.Ops

-- | Parse content streams for a page
--
-- The given streams are read one after another as a single input
-- (see 'combineStreams') and fed through 'parseContent', so the
-- resulting stream yields one 'Expr' per parsed object or operator.
--
-- Note: we need content stream ref to be able to decrypt stream content.
-- We need stream length because it can be an indirect object in
-- stream dictionary
parseContentStream :: MonadIO m
                   => RIS                          -- ^ random input stream to read data from
                   -> [StreamFilter]               -- ^ how to unpack data
                   -> (Ref -> IS -> IO IS)         -- ^ how to decrypt data
                   -> [(Stream Int64, Ref, Int)]   -- ^ content streams (with offset), their refs and length
                   -> PdfE m (InputStream Expr)
parseContentStream ris filters decryptor streams =
  combineStreams ris filters decryptor streams
    >>= liftIO . Streams.parserToInputStream parseContent

-- | Read the next operator if any
--
-- Objects preceding the operator are collected as its arguments and
-- returned in the order they were read. Returns 'Nothing' when the
-- stream is exhausted and no arguments are pending.
--
-- Fails with 'UnexpectedError' when the stream ends while arguments
-- are still pending, or when reading raises a 'Streams.ParseException'.
readNextOperator :: MonadIO m => InputStream Expr -> PdfE m (Maybe Operator)
readNextOperator is =
  annotateError "reading the next operator from content stream" $ collect []
  where
    -- Arguments are accumulated newest-first and reversed on the way
    -- out, both for the result and for the error message, so that
    -- they always appear in stream order.
    collect args = do
      expr <- nextExpr
      case expr of
        Nothing -> case args of
          [] -> return Nothing
          _ -> throwE $ UnexpectedError $
            "Args without op: " ++ show (reverse args)
        Just (Obj o) -> collect (o : args)
        Just (Op o) -> return $ Just (o, reverse args)

    -- Read one expression, turning parse exceptions into 'PdfError's
    nextExpr = do
      res <- tryPdfIO $ (Right <$> Streams.read is)
        `catch` (\e -> return $ Left $ UnexpectedError $
                   show (e :: Streams.ParseException))
      either throwE return res

-- Present a list of content streams as one sequential input stream.
--
-- Only the first stream is opened eagerly; each following stream is
-- decoded when the previous one reports end of input. Chunks are passed
-- through unchanged, nothing is inserted between the data of
-- consecutive streams.
--
-- A failure to open one of the following streams happens inside the
-- read action, so it is reported as an 'IOError' built with 'userError'.
combineStreams :: MonadIO m => RIS -> [StreamFilter] -> (Ref -> IS -> IO IS) -> [(Stream Int64, Ref, Int)] -> PdfE m IS
combineStreams _ _ _ [] = liftIO Streams.nullInput
combineStreams ris filters decryptor (x:xs) = do
  reader <- mkReader x xs
  ref <- liftIO $ newIORef reader
  liftIO $ Streams.makeInputStream (doRead ref)
  where
    -- Open one stream and pair it with the streams still to be read.
    -- Used both in the caller's monad and in IO, so it has to stay
    -- polymorphic in the underlying monad.
    mkReader (s, ref, len) ss = do
      Stream _ is <- decodedStreamContent ris filters (decryptor ref) len s
      return (is, ss)

    -- Read a chunk from the current stream; when it is exhausted,
    -- switch to the next one (if any) and try again
    doRead ref = do
      (is, ss) <- readIORef ref
      chunk <- Streams.read is
      case chunk of
        Just _ -> return chunk
        Nothing -> case ss of
          [] -> return Nothing
          (h:t) -> do
            reader <- runExceptT $ mkReader h t
            case reader of
              Left e -> ioError $ userError $ show e
              Right r -> writeIORef ref r >> doRead ref

-- | Parse a single expression from a content stream
--
-- Leading whitespace and comments are skipped first. Yields 'Nothing'
-- at the end of input, otherwise an object, an operator, or (as a
-- fallback) a single character wrapped as an unknown operator.
parseContent :: Parser (Maybe Expr)
parseContent = skipSpace >> (endOfContent <|> fmap Just expression)
  where
    endOfContent = Parser.endOfInput >> return Nothing
    expression
      = fmap Obj parseObject
      <|> fmap (Op . toOp) (Parser.takeWhile1 isRegularChar)
      -- See Note Inline image
      <|> fmap (Op . UnknownOp . ByteString8.pack . return) Parser.anyChar

{- Note Inline image

There is at least one case that doesn't fit the way we represent
content stream operators: inline images, see Pdf1.7:8.9.7

Inline image looks like "BI" operator, a number of key/value pairs,
"ID" operator, image data and "EI" operator.

I have no idea how to handle that case. There is no data length,
so the "EI" string is the only indicator for end of image data.
What if the data contains "EI"?

For now let's skip all unknown bytes as unknown operator. That seems
to work, but it is not reliable.
-}

-- Treat comments as spaces
--
-- A comment starts with '%' and runs up to (not including) the next
-- '\n' or '\r'. Any number of comments, together with the whitespace
-- around them, is skipped.
skipSpace :: Parser ()
skipSpace = Parser.skipSpace >> many comment >> return ()
  where
    comment = do
      _ <- Parser.char '%'
      Parser.skipWhile $ \c -> c /= '\n' && c /= '\r'
      Parser.skipSpace