-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A collection of tools for processing PDF files. -- -- Mid level tools for processing PDF files. -- -- Level of abstraction: document, catalog, page @package pdf-toolbox-document @version 0.1.2 -- | Various types module Pdf.Document.Types -- | Utilities for internal use module Pdf.Document.Internal.Util -- | Check that the dictionary has the specified "Type" field ensureType :: Name -> Dict -> IO () -- | Get dictionary type, name at key "Type" dictionaryType :: Dict -> Either String Name decodeTextString :: ByteString -> Either String Text decodeTextStringThrow :: ByteString -> IO Text -- | Internal type declarations module Pdf.Document.Internal.Types data Pdf Pdf :: File -> IORef ObjectCache -> Pdf -- | PDF document -- -- It is a trailer under the hood data Document Document :: Pdf -> Dict -> Document -- | Document catalog data Catalog Catalog :: Pdf -> Ref -> Dict -> Catalog -- | Information dictionary data Info Info :: Pdf -> Ref -> Dict -> Info -- | Page tree node, contains pages or other nodes data PageNode PageNode :: Pdf -> Ref -> Dict -> PageNode -- | Pdf document page data Page Page :: Pdf -> Ref -> Dict -> Page -- | Page tree data PageTree PageTreeNode :: PageNode -> PageTree PageTreeLeaf :: Page -> PageTree -- | Font dictionary data FontDict FontDict :: Pdf -> Dict -> FontDict module Pdf.Document.Pdf data Pdf withPdfFile :: FilePath -> (Pdf -> IO a) -> IO a -- | Make Pdf with interface to pdf file fromFile :: File -> IO Pdf -- | Make Pdf with seekable handle fromHandle :: Handle -> IO Pdf -- | Make Pdf from a ByteString fromBytes :: ByteString -> IO Pdf -- | Get PDF document document :: Pdf -> IO Document -- | Find object by its reference lookupObject :: Pdf -> Ref -> IO Object -- | Get stream content, decoded and decrypted -- -- Note: length of the content may differ from the raw one streamContent :: Pdf -> Ref -> Stream -> IO (InputStream ByteString) -- | 
Get stream content without decoding it rawStreamContent :: Pdf -> Ref -> Stream -> IO (InputStream ByteString) deref :: Pdf -> Object -> IO Object -- | Whether the PDF document is encrypted isEncrypted :: Pdf -> IO Bool -- | Set the password to be used for decryption -- -- Returns False when the password is wrong setUserPassword :: Pdf -> ByteString -> IO Bool -- | The default user password defaultUserPassword :: ByteString -- | File is encrypted data EncryptedError EncryptedError :: Text -> EncryptedError -- | Cache object for future lookups enableCache :: Pdf -> IO () -- | Don't cache object for future lookups disableCache :: Pdf -> IO () instance GHC.Show.Show Pdf.Document.Pdf.EncryptedError instance GHC.Exception.Type.Exception Pdf.Document.Pdf.EncryptedError -- | Page tree node module Pdf.Document.PageNode -- | Page tree node, contains pages or other nodes data PageNode -- | Page tree data PageTree PageTreeNode :: PageNode -> PageTree PageTreeLeaf :: Page -> PageTree -- | Total number of child leaf nodes, including deep children pageNodeNKids :: PageNode -> IO Int -- | Parent page node pageNodeParent :: PageNode -> IO (Maybe PageNode) -- | References to all kids pageNodeKids :: PageNode -> IO [Ref] -- | Load page tree node by reference loadPageNode :: Pdf -> Ref -> IO PageTree -- | Find page by its number -- -- Note: it is not efficient for PDF files with a lot of pages, because -- it performs traversal through the page tree each time. Use -- pageNodeNKids, pageNodeKids and loadPageNode for -- efficient traversal. 
pageNodePageByNum :: PageNode -> Int -> IO Page -- | Document info dictionary module Pdf.Document.Info -- | Information dictionary data Info -- | Document title infoTitle :: Info -> IO (Maybe Text) -- | The name of the person who created the document infoAuthor :: Info -> IO (Maybe Text) -- | The subject of the document infoSubject :: Info -> IO (Maybe Text) -- | Keywords associated with the document infoKeywords :: Info -> IO (Maybe Text) -- | The name of the application that created the original document infoCreator :: Info -> IO (Maybe Text) -- | The name of the application that converted the document to PDF format infoProducer :: Info -> IO (Maybe Text) -- | Font dictionary module Pdf.Document.FontDict -- | Font dictionary data FontDict -- | Font subtypes data FontSubtype FontType0 :: FontSubtype FontType1 :: FontSubtype FontMMType1 :: FontSubtype FontType3 :: FontSubtype FontTrueType :: FontSubtype -- | Get font subtype fontDictSubtype :: FontDict -> IO FontSubtype -- | Load font info for the font fontDictLoadInfo :: FontDict -> IO FontInfo instance GHC.Classes.Eq Pdf.Document.FontDict.FontSubtype instance GHC.Show.Show Pdf.Document.FontDict.FontSubtype -- | PDF document page module Pdf.Document.Page -- | Pdf document page data Page -- | Page's parent node pageParentNode :: Page -> IO PageNode -- | List of references to page's content streams pageContents :: Page -> IO [Ref] -- | Media box, inheritable pageMediaBox :: Page -> IO (Rectangle Double) -- | Font dictionaries for the page pageFontDicts :: Page -> IO [(Name, FontDict)] -- | Extract text from the page -- -- It tries to add spaces between chars if they are not present as actual -- characters in content stream. pageExtractText :: Page -> IO Text pageExtractGlyphs :: Page -> IO [Span] -- | Convert glyphs to text, trying to add spaces and newlines -- -- It takes list of spans. Each span is a list of glyphs that are -- output in one shot. So we don't need to add space inside span, only -- between them. 
glyphsToText :: [Span] -> Text instance GHC.Show.Show Pdf.Document.Page.XObject -- | PDF document module Pdf.Document.Document -- | PDF document -- -- It is a trailer under the hood data Document -- | Get the document catalog documentCatalog :: Document -> IO Catalog -- | Information dictionary for the document documentInfo :: Document -> IO (Maybe Info) -- | Document encryption dictionary documentEncryption :: Document -> IO (Maybe Dict) -- | Document catalog module Pdf.Document.Catalog -- | Document catalog data Catalog -- | Get root node of page tree catalogPageNode :: Catalog -> IO PageNode -- | Mid level utils for processing PDF file -- -- Basic example of how to get number of pages in document -- --
--   import Pdf.Document
--   
--   withPdfFile "input.pdf" $ \pdf -> do
--     doc <- document pdf
--     catalog <- documentCatalog doc
--     rootNode <- catalogPageNode catalog
--     count <- pageNodeNKids rootNode
--     print count
--     page <- pageNodePageByNum rootNode 1
--     text <- pageExtractText page
--     print text
--   
module Pdf.Document