{-# LANGUAGE OverloadedStrings #-} import PDF.Definition import PDF.Object import PDF.PDFIO import PDF.Outlines import Data.ByteString.UTF8 (ByteString) import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy.Char8 as BSL import Debug.Trace -- initstate = (0,0,70,660) initstate = PSR { linex=0 , liney=0 , absolutex=0 , absolutey=700 , leftmargin=0.0 , top=700.0 , bottom=0.0 , fontfactor=1.0 , curfont="" , fontmaps=[] , cmaps=[]} ----------------------------------------------- -- Show each PDF Object by its reference number ----------------------------------------------- objectByRef filename ref = getObjectByRef ref (getPDFObjFromFile filename) streamByRef filename ref = do obj <- getObjectByRef ref (getPDFObjFromFile filename) return $ rawStream obj contentByRef filename ref = do objs <- getPDFObjFromFile filename obj <- objectByRef filename ref BSL.putStrLn $ contentOfObject obj objs where contentOfObject obj objs = case findDictOfType "/Page" obj of Just dict -> contentsStream dict initstate objs Nothing -> "" rawContentByRef filename ref = do objs <- getPDFObjFromFile filename obj <- objectByRef filename ref BSL.putStrLn $ rawContentOfObject obj objs where rawContentOfObject obj objs = case findDictOfType "/Page" obj of Just dict -> rawContentsStream dict objs Nothing -> "" showPage filename page = do pagetree <- refByPage filename contentByRef filename $ pagetree !! (page - 1) showRawPage filename page = do pagetree <- refByPage filename rawContentByRef filename $ pagetree !! (page - 1) cmapStreamByRef filename ref = do objs <- getPDFObjFromFile filename return $ toUnicode ref objs --------------------------------------- -- Sort Object References in Page order --------------------------------------- data PageTree = Nop | Page Int | Pages [PageTree] deriving Show refByPage filename = do root <- getRootRef filename objs <- getPDFObjFromFile filename return $ pageTreeToList $ pageorder root objs pageorder :: Int -> [PDFObj] -> PageTree pageorder parent objs = case findObjsByRef parent objs of Just os -> case findDictOfType "/Catalog" os of Just dict -> case pages dict of Just pr -> pageorder pr objs Nothing -> Nop Nothing -> case findDictOfType "/Pages" os of Just dict -> case pagesKids dict of Just kidsrefs -> Pages $ map (\f -> f objs) (map pageorder kidsrefs) Nothing -> Nop Nothing -> case findDictOfType "/Page" os of Just dict -> Page parent Nothing -> Nop Nothing -> Nop pageTreeToList :: PageTree -> [Int] pageTreeToList (Pages ps) = concatMap pageTreeToList ps pageTreeToList (Page n) = [n] pageTreeToList Nop = [] -------------------------- -- Get Whole Text from PDF -------------------------- -- First: grub objects -- Second: parse within each object, deflating its stream -- Third: linearize stream pdfToText filename = do contents <- BS.readFile filename let objs = map parsePDFObj $ getObjs contents let rootref = case rootRef contents of Just r -> r Nothing -> 0 putStrLn $ show $ linearize rootref objs linearize :: Int -> [PDFObj] -> PDFStream linearize parent objs = case findObjsByRef parent objs of Just os -> case findDictOfType "/Catalog" os of Just dict -> case pages dict of Just pr -> linearize pr objs Nothing -> "" Nothing -> case findDictOfType "/Pages" os of Just dict -> case pagesKids dict of Just kidsrefs -> BSL.concat $ map (\f -> f objs) (map linearize kidsrefs) Nothing -> "" Nothing -> case findDictOfType "/Page" os of Just dict -> contentsStream dict initstate objs Nothing -> "" Nothing -> "" ------------------- -- Meta Information -------------------