{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE OverloadedStrings #-} {-# OPTIONS -Wall #-} module Text.PDF.Info (-- * Reading PDF info pdfInfo ,PDFInfo(..) ,PDFSize(..) ,PDFEncryptionInfo(..) ,PDFInfoError(..) -- * Internals ,ParsePDFInfo ,runParse ,parse ,parseSize ,parseDate ,parseEncrypted ,readRight) where import Control.Applicative import Control.Arrow import Control.Exception as E import Control.Monad.Error import Data.Char (isSpace) import Data.Text (Text) import qualified Data.Text as T import Data.Time import Prelude import System.Exit import System.Locale import System.Process.Text -- | A type representing the output from the pdfinfo command. data PDFInfo = PDFInfo { pdfInfoTitle :: !(Maybe Text) -- ^ Title , pdfInfoSubject :: !(Maybe Text) -- ^ Subject , pdfInfoAuthor :: !(Maybe Text) -- ^ Author: E.g. Chris Done , pdfInfoCreator :: !(Maybe Text) -- ^ Creator: E.g. Microsoft Office Word 2007 , pdfInfoProducer :: !(Maybe Text) -- ^ Producer: E.g. Microsoft Office Word 2007 , pdfInfoCreationDate :: !(Maybe UTCTime) -- ^ Creation Date , pdfInfoModDate :: !(Maybe UTCTime) -- ^ Modification Date , pdfInfoTagged :: !(Maybe Bool) -- ^ Tagged? , pdfInfoPages :: !(Maybe Integer) -- ^ Pages: E.g. 238 , pdfInfoEncrypted :: !(Maybe PDFEncryptionInfo) -- ^ Encryption information , pdfInfoPageSize :: !(Maybe PDFSize) -- ^ Page: E.g. 595.32 x 841.92 pts (A4) , pdfInfoFileSize :: !(Maybe Integer) -- ^ File: E.g. 4061737 bytes , pdfInfoOptimized :: !(Maybe Bool) -- ^ Optimized? , pdfInfoPDFVersion :: !(Maybe Double) -- ^ PDF: E.g. 1.5 } deriving Show -- | Possible things that can go wrong while reading the info. data PDFInfoError = ParseError !String -- ^ Couldn't parse a property value. | ProcessFailure !Text -- ^ Process exited with this stderr. | ProcessError !IOException -- ^ Error to do with the pdfinfo process. | NoMessage -- ^ No message given. | SomeError String -- ^ Some nonspecific error. deriving Show -- | Size of the PDF in pts. data PDFSize = PDFSize { pdfSizeW :: !Float, pdfSizeH :: !Float } deriving (Eq,Show) -- | Encryption and restricted permissions data PDFEncryptionInfo -- | Not encrypted = PDFNoEncryption -- | Encrypted with possible permission restrictions | PDFEncryption { pdfCanPrint :: !(Maybe Bool) -- ^ Can the file be printed? , pdfCanCopy :: !(Maybe Bool) -- ^ Can the file be copied? , pdfCanChange :: !(Maybe Bool) -- ^ Can the file be changed? , pdfCanAddNotes :: !(Maybe Bool) -- ^ Can notes be added? , pdfEncryptionAlgorithm :: !(Maybe Text) -- ^ Encryption algorithm: e.g. unknown, RC4, AES, AES-256 } deriving (Eq,Show) instance Error PDFInfoError where noMsg = NoMessage; strMsg = SomeError newtype ParsePDFInfo a = ParsePDFInfo { runParse :: Either PDFInfoError a } deriving (Monad,Functor,MonadError PDFInfoError) instance Applicative ParsePDFInfo where (<*>) = ap; pure = return -- | Run pdfinfo on the given file. Handles IO exceptions to do with -- running the process. pdfInfo :: MonadIO m => FilePath -> m (Either PDFInfoError PDFInfo) pdfInfo path = liftIO $ loadInfo `E.catch` ioErrorHandler where loadInfo = do (code,out,err) <- readProcessWithExitCode "pdfinfo" ["-enc","UTF-8",path] "" case code of ExitSuccess -> return (parse out) ExitFailure{} -> return (Left (ProcessFailure err)) ioErrorHandler = return . Left . ProcessError -- | Parse PDFInfo's output. parse :: Text -> Either PDFInfoError PDFInfo parse out = runParse $ PDFInfo <$> string props "Title" <*> string props "Subject" <*> string props "Author" <*> string props "Creator" <*> string props "Producer" <*> date "CreationDate" <*> date "ModDate" <*> bool props "Tagged" <*> integer "Pages" <*> encrypted "Encrypted" <*> size "Page size" <*> integer "File size" <*> bool props "Optimized" <*> floating "PDF version" where date = get parseDate size = get parseSize encrypted = get parseEncrypted floating = readIt integer = readIt readIt :: Read a => Text -> ParsePDFInfo (Maybe a) readIt = get readRight props = map split . T.lines $ out get = withProps props type Props = [(Text,Text)] -- | Look up a name in a finite map of "properties" and apply a (parsing) -- function. withProps :: Props -> (Text -> ParsePDFInfo a) -> Text -> ParsePDFInfo (Maybe a) withProps properties f name = case lookup name properties of Just ok -> catchError (Just <$> (f $ T.strip ok)) (\_ -> return Nothing) Nothing -> return Nothing split :: Text -> (Text, Text) split = second (T.drop 1) . T.span (/=':') string :: Props -> Text -> ParsePDFInfo (Maybe Text) string props = withProps props return bool :: Props -> Text -> ParsePDFInfo (Maybe Bool) bool props = withProps props $ \yes -> return $ yes == "yes" -- | Parse a page size. This is loosely defined. parseSize :: Text -> ParsePDFInfo PDFSize parseSize s = case T.words s of ((readRight -> Right x):"x":(readRight -> Right y):_) -> return $ PDFSize x y _ -> throwError $ ParseError $ "Unable to read size: " ++ show s -- | Parse a date according to pdfinfo's format. parseDate :: Text -> ParsePDFInfo UTCTime parseDate s = case parseTime defaultTimeLocale "%a %b %e %H:%M:%S %Y" (T.unpack s) of Just ok -> return ok Nothing -> throwError $ ParseError $ "Unable to parse date: " ++ show s -- | Parse encryption information according to pdfinfo's format. parseEncrypted :: Text -> ParsePDFInfo PDFEncryptionInfo parseEncrypted s = case T.break isSpace s of ("yes",rest) -> PDFEncryption <$> bool props "print" <*> bool props "copy" <*> bool props "change" <*> bool props "addNotes" <*> string props "algorithm" where props = map split $ T.words $ T.filter (flip notElem ['(',')']) rest ("no",_) -> return PDFNoEncryption _ -> throwError $ ParseError $ "Unable to parse encryption: " ++ show s -- | Read a value, maybe, allow misc trailing data. readRight :: (MonadError PDFInfoError m,Read a) => Text -> m a readRight s = case reads (T.unpack s) of [(v,_)] -> return v _ -> throwError $ ParseError $ "Couldn't read value: " ++ show s