{-# LANGUAGE OverloadedStrings #-} {-| Module : PDF.DocumentStructure Description : Function to walk around Document Structure of a PDF file Copyright : (c) Keiichiro Shikano, 2020 License : MIT Maintainer : k16.shikano@gmail.com -} module PDF.DocumentStructure ( parseTrailer , expandObjStm , rootRef , contentsStream , findKids , findPages , findDict , findDictByRef , findDictOfType , findObjThroughDict , findObjThroughDictByRef , findObjsByRef , findObjs , findTrailer ) where import Data.Char (chr) import Data.List (find) import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy.Char8 as BSL import qualified Data.Text as T import Data.Attoparsec.ByteString.Char8 hiding (take) import Data.Attoparsec.Combinator import Control.Applicative import Codec.Compression.Zlib (decompress) import Debug.Trace import PDF.Definition import PDF.Object import PDF.ContentStream (parseStream, parseColorSpace) import PDF.Cmap (parseCMap) spaces = skipSpace oneOf = satisfy . inClass noneOf = satisfy . notInClass -- find objects findObjs :: BS.ByteString -> [PDFBS] findObjs contents = case parseOnly (many1 pdfObj) contents of Left err -> [] Right rlt -> rlt findXref :: BS.ByteString -> String findXref contents = case parseOnly (xref) contents of Left err -> [] Right rlt -> rlt findObjsByRef :: Int -> [PDFObj] -> Maybe [Obj] findObjsByRef x pdfobjs = case find (isRefObj (Just x)) pdfobjs of Just (_,objs) -> Just objs Nothing -> Nothing where isRefObj (Just x) (y, objs) = if x==y then True else False isRefObj _ _ = False findObjThroughDictByRef :: Int -> String -> [PDFObj] -> Maybe Obj findObjThroughDictByRef ref name objs = case findDictByRef ref objs of Just d -> findObjThroughDict d name Nothing -> Nothing findObjThroughDict :: Dict -> String -> Maybe Obj findObjThroughDict d name = case find isName d of Just (_, o) -> Just o otherwise -> Nothing where isName (PdfName n, _) = if name == n then True else False isName _ = False findDictByRef :: Int -> [PDFObj] -> Maybe Dict findDictByRef ref objs = case findObjsByRef ref objs of Just os -> findDict os Nothing -> Nothing findDictOfType :: String -> [Obj] -> Maybe Dict findDictOfType typename objs = case findDict objs of Just d -> if isType d then Just d else Nothing Nothing -> Nothing where isType dict = (PdfName "/Type",PdfName typename) `elem` dict findDict :: [Obj] -> Maybe Dict findDict objs = case find isDict objs of Just (PdfDict d) -> Just d otherwise -> Nothing where isDict :: Obj -> Bool isDict (PdfDict d) = True isDict _ = False findPages :: Dict -> Maybe Int findPages dict = case find isPagesRef dict of Just (_, ObjRef x) -> Just x Nothing -> Nothing where isPagesRef (PdfName "/Pages", ObjRef x) = True isPagesRef (_,_) = False findKids :: Dict -> Maybe [Int] findKids dict = case find isKidsRefs dict of Just (_, PdfArray arr) -> Just (parseRefsArray arr) Nothing -> Nothing where isKidsRefs (PdfName "/Kids", PdfArray x) = True isKidsRefs (_,_) = False contentsStream :: Dict -> PSR -> [PDFObj] -> PDFStream contentsStream dict st objs = case find contents dict of Just (PdfName "/Contents", PdfArray arr) -> parseContentStream dict st objs $ BSL.concat $ map (rawStreamByRef objs) (parseRefsArray arr) Just (PdfName "/Contents", ObjRef r) -> parseContentStream dict st objs $ rawStreamByRef objs r Nothing -> error "No content to be shown" where contents (PdfName "/Contents", _) = True contents _ = False parseContentStream :: Dict -> PSR -> [PDFObj] -> BSL.ByteString -> PDFStream parseContentStream dict st objs s = parseStream (st {fontmaps=fontdict, cmaps=cmap}) s where fontdict = findFontMap dict objs cmap = findCMap dict objs rawStreamByRef :: [PDFObj] -> Int -> BSL.ByteString rawStreamByRef pdfobjs x = case findObjsByRef x pdfobjs of Just objs -> rawStream objs Nothing -> error "No object with stream to be shown" rawStream :: [Obj] -> BSL.ByteString rawStream objs = case find isStream objs of Just (PdfStream strm) -> streamFilter strm Nothing -> error $ (show objs) ++ "\n No stream to be shown" where isStream (PdfStream s) = True isStream _ = False streamFilter = case findDict objs of Just d -> case find withFilter d of Just (PdfName "/Filter", PdfName "/FlateDecode") -> decompress Just _ -> id -- need fix Nothing -> id Nothing -> id withFilter (PdfName "/Filter", _) = True withFilter _ = False contentsColorSpace :: Dict -> PSR -> [PDFObj] -> [T.Text] contentsColorSpace dict st objs = case find contents dict of Just (PdfName "/Contents", PdfArray arr) -> concat $ map (parseColorSpace (st {xcolorspaces=xobjcs}) . rawStreamByRef objs) (parseRefsArray arr) Just (PdfName "/Contents", ObjRef x) -> parseColorSpace (st {xcolorspaces=xobjcs}) $ rawStreamByRef objs x Nothing -> error "No content to be shown" where contents (PdfName "/Contents", _) = True contents _ = False xobjcs = findXObjectColorSpace dict objs -- find XObject findXObjectColorSpace d os = xobjColorSpaceMap (findXObject d os) os xobjColorSpaceMap dict objs = map pairwise dict where pairwise (PdfName n, ObjRef r) = xobjColorSpace r objs pairwise x = "" findXObject dict objs = case findResourcesDict dict objs of Just d -> case findObjThroughDict d "/XObject" of Just (PdfDict d) -> d otherwise -> [] Nothing -> [] xobjColorSpace :: Int -> [PDFObj] -> String xobjColorSpace x objs = case findObjThroughDictByRef x "/ColorSpace" objs of Just (PdfName cs) -> cs otherwise -> "" -- find root ref from Trailer or Cross-Reference Dictionary parseTrailer :: BS.ByteString -> Maybe Dict parseTrailer bs = case parseOnly (try trailer <|> xref) bs of Left err -> (trace (show err) Nothing) Right rlt -> Just (parseCRDict rlt) where trailer :: Parser BS.ByteString trailer = do manyTill anyChar (try $ string "trailer") t <- manyTill anyChar (try $ string "startxref") return $ BS.pack t xref :: Parser BS.ByteString xref = do manyTill anyChar (try $ string "startxref" >> spaces >> lookAhead (oneOf "123456789")) offset <- many1 digit return $ BS.drop (read offset :: Int) bs parseCRDict :: BS.ByteString -> Dict parseCRDict rlt = case parseOnly crdict rlt of Left err -> error $ show (BS.take 100 rlt) Right (PdfDict dict) -> dict Right other -> error "Could not find Cross-Reference dictionary" where crdict :: Parser Obj crdict = do spaces many (many1 digit >> spaces >> digit >> string " obj" >> spaces) d <- pdfdictionary <* spaces return d rootRef :: BS.ByteString -> Maybe Int rootRef bs = case parseTrailer bs of Just dict -> findRefs isRootRef dict Nothing -> rootRefFromCRStream bs rootRefFromCRStream :: BS.ByteString -> Maybe Int rootRefFromCRStream bs = let offset = (read . BS.unpack . head . drop 1 . reverse . BS.lines $ (trace (show bs) bs)) :: Int crstrm = snd . head . findObjs $ BS.drop offset bs crdict = parseCRDict crstrm in findRefs isRootRef $ crdict isRootRef (PdfName "/Root", ObjRef x) = True isRootRef (_,_) = False findRefs :: ((Obj,Obj) -> Bool) -> Dict -> Maybe Int findRefs pred dict = case find pred dict of Just (_, ObjRef x) -> Just x Nothing -> Nothing -- find Info findTrailer bs = do case parseTrailer bs of Just d -> d Nothing -> [] infoRef bs = case parseTrailer bs of Just dict -> findRefs isInfoRef dict Nothing -> error "No ref for info" isInfoRef (PdfName "/Info", ObjRef x) = True isInfoRef (_,_) = False -- expand PDF 1.5 Object Stream expandObjStm :: [PDFObj] -> [PDFObj] expandObjStm os = concat $ map objStm os objStm :: PDFObj -> [PDFObj] objStm (n, obj) = case findDictOfType "/ObjStm" obj of Nothing -> [(n,obj)] Just _ -> pdfObjStm n $ BSL.toStrict $ rawStream obj refOffset :: Parser ([(Int, Int)], String) refOffset = spaces *> ((,) <$> many1 ((\r o -> (read r :: Int, read o :: Int)) <$> (many1 digit <* spaces) <*> (many1 digit <* spaces)) <*> many1 anyChar) pdfObjStm n s = let (location, objstr) = case parseOnly refOffset s of Right val -> val Left err -> error $ "Failed to parse Object Stream: " in map (\(r,o) -> (r, parseDict $ BS.pack $ drop o objstr)) location where parseDict s' = case parseOnly pdfdictionary s' of Right obj -> [obj] Left _ -> case parseOnly pdfarray s' of Right obj -> [obj] Left _ -> case parseOnly pdfletters s' of Right obj -> [obj] Left err -> error $ (show err) ++ ":\n Failed to parse obj around; \n" ++ (show $ BS.take 100 s') -- make fontmap from page's /Resources (see 3.7.2 of PDF Ref.) findFontMap d os = findEncoding (fontObjs d os) os findEncoding :: Dict -> [PDFObj] -> [(String, FontMap)] findEncoding dict objs = map pairwise dict where pairwise (PdfName n, ObjRef r) = (n, fontMap r objs) pairwise x = ("", NullMap) fontObjs :: Dict -> [PDFObj] -> Dict fontObjs dict objs = case findResourcesDict dict objs of Just d -> case findObjThroughDict d "/Font" of Just (PdfDict d) -> d otherwise -> [] Nothing -> [] findResourcesDict :: Dict -> [PDFObj] -> Maybe Dict findResourcesDict dict objs = case find resources dict of Just (_, ObjRef x) -> findDictByRef x objs Just (_, PdfDict d) -> Just d otherwise -> error (show dict) where resources (PdfName "/Resources", _) = True resources _ = False -- Needs rewrite! fontMap :: Int -> [PDFObj] -> FontMap fontMap x objs = case findObjThroughDictByRef x "/Encoding" objs of Just (ObjRef ref) -> case findObjThroughDictByRef ref "/Differences" objs of Just (PdfArray arr) -> charMap arr otherwise -> trace "no /differences" NullMap Just (PdfName "/StandardEncoding") -> NullMap Just (PdfName "/MacRomanEncoding") -> NullMap Just (PdfName "/MacExpertEncoding") -> NullMap Just (PdfName "/WinAnsiEncoding") -> NullMap otherwise -> case findObjThroughDictByRef x "/ToUnicode" objs of Just (ObjRef ref) -> case findObjThroughDictByRef ref "/CharSet" objs of Just (PdfText str) -> WithCharSet str otherwise -> WithCharSet "" otherwise -> case findObjThroughDictByRef x "/DescendantFonts" objs of -- needs CID to Unicode map Just (ObjRef ref) -> case findObjsByRef ref objs of Just [(PdfArray ((ObjRef subref):_))] -> case findObjThroughDictByRef subref "/CIDSystemInfo" objs of Just (ObjRef inforef) -> case findObjThroughDictByRef inforef "/Registry" objs of Just (PdfText "Adobe") -> case findObjThroughDictByRef inforef "/Ordering" objs of Just (PdfText "Japan1") -> case findObjThroughDictByRef inforef "/Supplement" objs of Just (PdfNumber _) -> CIDmap "Adobe-Japan1" _ -> trace (show inforef) defaultCIDMap _ -> trace (show inforef) defaultCIDMap _ -> trace (show inforef) defaultCIDMap _ -> trace (show subref ++ " no /cidsysteminfoy. using Adobe-Japan1...") defaultCIDMap _ -> trace (show ref ++ " no array in /descendantfonts. using Adobe-Japan1...") defaultCIDMap _ -> trace (show x ++ " no /descendantfonts. using Adobe-Japan1...") defaultCIDMap where defaultCIDMap = CIDmap "Adobe-Japan1" charMap :: [Obj] -> FontMap charMap objs = FontMap $ fontmap objs 0 where fontmap (PdfNumber x : PdfName n : xs) i = if i < truncate x then (chr $ truncate x, n) : (fontmap xs $ incr x) else (chr $ i, n) : (fontmap xs $ i+1) fontmap (PdfName n : xs) i = (chr i, n) : (fontmap xs $ i+1) fontmap [] i = [] incr x = (truncate x) + 1 findCMap d os = cMap (fontObjs d os) os cMap :: Dict -> [PDFObj] -> [(String, CMap)] cMap dict objs = map pairwise dict where pairwise (PdfName n, ObjRef r) = (n, toUnicode r objs) pairwise x = ("", []) toUnicode :: Int -> [PDFObj] -> CMap toUnicode x objs = case findObjThroughDictByRef x "/ToUnicode" objs of Just (ObjRef ref) -> parseCMap $ rawStreamByRef objs ref otherwise -> case findObjThroughDictByRef x "/Encoding" objs of Just (PdfName "/Identity-H") -> case findObjThroughDictByRef x "/ToUnicode" objs of Just (ObjRef ref) -> parseCMap $ rawStreamByRef objs ref otherwise -> [] otherwise -> []