Copyright | (c) 2020 G. Eyaeb |
---|---|
License | BSD-3-Clause |
Maintainer | geyaeb@protonmail.com |
Stability | experimental |
Portability | POSIX |
Safe Haskell | None |
Language | Haskell2010 |
Usage
import qualified Data.Text.IO as T
import Pdftotext

main :: IO ()
main = do
  Just pdf <- openFile "path/to/file.pdf"
  T.putStrLn $ pdftotext Physical pdf
Flags
`xml-conduit` – if set, `metadata` of PDF document properties is parsed as XML; otherwise it remains as text.
Synopsis
- data Document
- data Layout
- data Page
- data Properties = Properties {}
- openByteString :: ByteString -> Maybe Document
- openFile :: FilePath -> IO (Maybe Document)
- page :: Int -> Document -> Maybe Page
- pages :: Document -> [Page]
- pagesTotal :: Document -> Int
- pdftotext :: Layout -> Document -> Text
- properties :: Document -> Properties
- pageNumber :: Page -> Int
- pageOutOf :: Page -> Int
- pageText :: Layout -> Page -> Text
Types
Layout of text extracted from PDF.
Physical | Text emulates the layout of the PDF, including horizontal spaces, and preserves hyphenation; corresponds to calling `pdftotext -layout`. |
Raw | Discards horizontal spaces, preserves hyphenation; corresponds to calling `pdftotext -raw`. |
None | Discards horizontal spaces, removes hyphenation; corresponds to calling `pdftotext` without a layout argument. |
data Properties Source #
Document properties.
If the flag `xml-conduit` is set, `metadata` is of type `Maybe Text.XML.Document`.
Since: 0.0.2.0
Instances
Show Properties Source # | |
Defined in Pdftotext.Internal showsPrec :: Int -> Properties -> ShowS # show :: Properties -> String # showList :: [Properties] -> ShowS # |
Loading PDFs
openByteString :: ByteString -> Maybe Document Source #
Open a PDF represented as a bytestring. If the document cannot be parsed as a valid PDF, `Nothing` is returned.
openFile :: FilePath -> IO (Maybe Document) Source #
Open a PDF from a file. If the file does not exist or cannot be parsed as a valid PDF, `Nothing` is returned.
Document functions
page :: Int -> Document -> Maybe Page Source #
Return page number `no` from the PDF document, if the page exists.
pagesTotal :: Document -> Int Source #
Return number of pages contained in document.
properties :: Document -> Properties Source #
Extract properties from the document.
Since: 0.0.2.0
Page functions
pageNumber :: Page -> Int Source #
Number of this page in original document.