{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE PackageImports #-} {-# LANGUAGE OverloadedStrings #-} module Main where import PDF.Definition import PDF.Object import PDF.DocumentStructure import PDF.PDFIO import PDF.Outlines import System.Environment (getArgs) import Data.ByteString.UTF8 (ByteString) import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy.Char8 as BSL import Data.Text.Lazy as TL (unpack) import Data.Text.Lazy.Encoding as TL import Data.List (nub, find) import Data.Maybe (fromMaybe) import Options.Applicative import Data.Semigroup ((<>)) import "regex-compat-tdfa" Text.Regex as R import Options.Applicative (strOption) import Control.Monad (when) import PDF.Definition (Obj(PdfStream)) import Debug.Trace initstate = PSR { linex=0 , liney=0 , absolutex=0 , absolutey=0 , leftmargin=0.0 , text_lm=(1,0,0,1,0,0) , text_m=(1,0,0,1,0,0) , text_break=False , top=0.0 , bottom=0.0 , fontfactor=1 , curfont="" , fontmaps=[] , cmaps=[] , colorspace="" , xcolorspaces=[] } main :: IO () main = hpdft =<< execParser opts where opts = info (options <**> helper) (fullDesc <> progDesc "Show text of a PDF file" <> header "hpdft - a PDF parsing tool" ) -- option parser data CmdOpt = CmdOpt { page :: Int, ref :: Int, grep :: String, refs :: Bool, pdftitle :: Bool, pdfinfo :: Bool, pdfoutline :: Bool, trailer :: Bool, file :: FilePath } options :: Parser CmdOpt options = CmdOpt <$> option auto ( long "page" <> short 'p' <> value 0 <> metavar "PAGE" <> help "Page number (nomble)" ) <*> option auto ( long "ref" <> short 'r' <> value 0 <> metavar "REF" <> help "Object reference" ) <*> strOption ( long "grep" <> short 'g' <> value "" <> metavar "RegExp" <> help "grep PDF" ) <*> switch ( long "refs" <> short 'R' <> help "Show object references in page order" ) <*> switch ( long "title" <> short 'T' <> help "Show title (from metadata)" ) <*> switch ( long "info" <> short 'I' <> help "Show PDF metainfo" ) <*> switch ( long "toc" <> short 'O' <> help "Show table of contents (from metadata) " ) <*> switch ( long "trailer" <> help "Show the trailer of PDF" ) <*> strArgument ( help "input pdf file" <> metavar "FILE" <> action "file" ) hpdft :: CmdOpt -> IO () hpdft (CmdOpt 0 0 "" False False False False False fn) = pdfToText fn hpdft (CmdOpt 0 0 "" False True _ _ _ fn) = showTitle fn hpdft (CmdOpt 0 0 "" False _ True _ _ fn) = showInfo fn hpdft (CmdOpt 0 0 "" False _ _ True _ fn) = showOutlines fn hpdft (CmdOpt 0 0 "" False _ _ _ True fn) = print =<< getTrailer fn hpdft (CmdOpt 0 0 "" True _ _ _ _ fn) = print =<< refByPage fn hpdft (CmdOpt n 0 "" False _ _ _ _ fn) = showPage fn n hpdft (CmdOpt 0 r "" False _ _ _ _ fn) = showContent fn r hpdft (CmdOpt 0 0 r False _ _ _ _ fn) = grepPDF fn r hpdft _ = return () -- | Get a whole text from 'filename'. It works as: -- (1) grub objects -- (2) parse within each object, deflating its stream -- (3) linearize stream pdfToText filename = do objs <- getPDFObjFromFile filename rootref <- getRootRef filename BSL.putStrLn $ walkdown initstate rootref objs walkdown :: PSR -> Int -> [PDFObj] -> PDFStream walkdown st parent objs = case findObjsByRef parent objs of Just os -> case findDictOfType "/Catalog" os of Just dict -> case findPages dict of Just pr -> walkdown st pr objs Nothing -> "" Nothing -> case findDictOfType "/Pages" os of Just dict -> case findKids dict of Just kidsrefs -> BSL.concat $ map ((\f -> f objs) . (walkdown st)) kidsrefs Nothing -> "" Nothing -> case findDictOfType "/Page" os of Just dict -> contentsStream dict st objs Nothing -> "" Nothing -> "" data PageTree = Nop | Page Int | Pages [PageTree] deriving Show -- | Sort object references in page order. refByPage filename = do root <- getRootRef filename objs <- getPDFObjFromFile filename return $ pageTreeToList $ pageorder root objs pageorder :: Int -> [PDFObj] -> PageTree pageorder parent objs = case findObjsByRef parent objs of Just os -> case findDictOfType "/Catalog" os of Just dict -> case findPages dict of Just pr -> pageorder pr objs Nothing -> Nop Nothing -> case findDictOfType "/Pages" os of Just dict -> case findKids dict of Just kidsrefs -> Pages $ map (\f -> f objs) (map pageorder kidsrefs) Nothing -> Nop Nothing -> case findDictOfType "/Page" os of Just dict -> Page parent Nothing -> Nop Nothing -> Nop pageTreeToList :: PageTree -> [Int] pageTreeToList (Pages ps) = concatMap pageTreeToList ps pageTreeToList (Page n) = [n] pageTreeToList Nop = [] -- | Show contents of page 'page' in 'filename'. showPage filename page = do pagetree <- refByPage filename case length pagetree >= page of True -> contentByRef filename $ pagetree !! (page - 1) False -> putStrLn $ "hpdft: No Page "++(show page) -- | Show /Content referenced from the 'ref'ed-object in 'filename'. contentByRef filename ref = do objs <- getPDFObjFromFile filename obj <- getObjectByRef ref objs BSL.putStrLn $ contentInObject obj objs where contentInObject obj objs = case findDictOfType "/Page" obj of Just dict -> contentsStream dict initstate objs Nothing -> "" showContent filename ref = do objs <- getPDFObjFromFile filename obj <- getObjectByRef ref objs let d = fromMaybe [] $ findDict obj if hasStream obj then if hasSubtype d -- then it's not content stream then printStreamWithDict d obj else BSL.putStrLn =<< getStream False obj else print =<< getObjectByRef ref objs where hasStream obj = case find isStream obj of Just _ -> True Nothing -> False isStream (PdfStream _) = True isStream _ = False hasSubtype d = case find isSubtype d of Just _ -> True Nothing -> False isSubtype (PdfName "/Subtype", _) = True isSubtype x = False printStreamWithDict :: Dict -> [Obj] -> IO () printStreamWithDict d obj = print (PdfDict d) >> (BSL.putStrLn =<< getStream True obj) -- | Show /Title from meta information in 'filename' showTitle filename = do d <- getInfo filename let title = case findObjFromDict d "/Title" of Just (PdfText s) -> s Just x -> show x Nothing -> "No title anyway" putStrLn title return () -- | Show /Info from meta information in 'filename' showInfo filename = do d <- getInfo filename putStrLn $ toString 0 (PdfDict d) return () -- | Show /Outlines from meta information in 'filename' showOutlines filename = do d <- getOutlines filename putStrLn $ show d return () -- | Find string in each page. grepPDF filename re = do root <- getRootRef filename objs <- getPDFObjFromFile filename mapM (\(strm, pagenm) -> (grepByPage pagenm re . contentInObjs objs) strm) $ zip (pageTreeToList $ pageorder root objs) [1..] return () where contentInObjs objs ref = case findObjsByRef ref objs of Just obj -> case findDictOfType "/Page" obj of Just dict -> contentsStream dict initstate objs Nothing -> "" Nothing -> error $ "No Object with Ref " ++ show ref grepByPage :: Int -> String -> PDFStream -> IO () grepByPage pagenm re txt = do let matched = filter (not . null) $ map (grepByLine re) $ BSL.lines txt when (not $ null matched) (showResult pagenm matched) return () where showResult p m = do putStrLn $ "At page " <> show p <> "..." mapM (putStrLn . (" | " <>)) m return () grepByLine :: String -> PDFStream -> String grepByLine re txt = case matchRegexAll (mkRegex re) $ TL.unpack $ TL.decodeUtf8 txt of Just (b, m, a, _) -> (b <> (highlight m) <> a) Nothing -> "" highlight m = "\ESC[31m" <> m <> "\ESC[0m"