pdftotext-0.0.1.0: Extracts text from PDF using poppler
Copyright: (c) 2020 G. Eyaeb
License: BSD-3-Clause
Maintainer: geyaeb@protonmail.com
Stability: experimental
Portability: POSIX
Safe Haskell: None
Language: Haskell2010

Pdftotext

Description

Usage

import qualified Data.Text.IO as T
import Pdftotext

main :: IO ()
main = do
  Just pdf <- openFile "path/to/file.pdf"
  T.putStrLn $ pdftotext Physical pdf
Synopsis

Types

data Layout Source #

Layout of text extracted from PDF.

Constructors

Physical

Text emulates layout of PDF, including horizontal spaces, and preserves hyphenation; corresponds to calling pdftotext -layout

Raw

Discards horizontal spaces, preserves hyphenation; corresponds to calling pdftotext -raw

None

Discards horizontal spaces, removes hyphenation; corresponds to calling pdftotext without layout argument

Instances

Instances details
Eq Layout Source # 
Instance details

Defined in Pdftotext.Internal

Methods

(==) :: Layout -> Layout -> Bool #

(/=) :: Layout -> Layout -> Bool #

Show Layout Source # 
Instance details

Defined in Pdftotext.Internal

data Page Source #

Instances

Instances details
Show Page Source # 
Instance details

Defined in Pdftotext.Internal

Methods

showsPrec :: Int -> Page -> ShowS #

show :: Page -> String #

showList :: [Page] -> ShowS #

Loading PDFs

openByteString :: ByteString -> Maybe Document Source #

Open a PDF represented as a bytestring. If the document cannot be parsed as a valid PDF, Nothing is returned.

openFile :: FilePath -> IO (Maybe Document) Source #

Open a PDF from a file. If the file does not exist or cannot be parsed as a valid PDF, Nothing is returned.

Document functions

page :: Int -> Document -> Maybe Page Source #

Return the page with the given number (no) from the PDF document, if that page exists.

pages :: Document -> [Page] Source #

Return all pages from document.

pagesTotal :: Document -> Int Source #

Return number of pages contained in document.

pdftotext :: Layout -> Document -> Text Source #

Extract text from PDF document with given Layout.

Page functions

pageNumber :: Page -> Int Source #

Number of this page in original document.

pageOutOf :: Page -> Int Source #

Total number of pages in original document.

pageText :: Layout -> Page -> Text Source #

Extract text from a page with given Layout.