{-# LANGUAGE BlockArguments #-}

{- ORMOLU_DISABLE -}
{-|
Module      : Pdftotext
Description : Extracts text from PDF using poppler
Copyright   : (c) 2020 G. Eyaeb
License     : BSD-3-Clause
Maintainer  : geyaeb@protonmail.com
Stability   : experimental
Portability : POSIX

=== Usage

> import qualified Data.Text.IO as T
> import Pdftotext
>
> main :: IO ()
> main = do
>   Just pdf <- openFile "path/to/file.pdf"
>   T.putStrLn $ pdftotext Physical pdf

-}
{- ORMOLU_ENABLE -}
module Pdftotext
  ( -- * Types
    Document,
    Layout (..),
    Page,

    -- * Loading PDFs
    openByteString,
    openFile,

    -- * Document functions
    page,
    pages,
    pagesTotal,
    pdftotext,

    -- * Page functions
    pageNumber,
    pageOutOf,
    pageText,
  )
where

import Data.ByteString
import Data.Text (Text)
import GHC.IO (unsafePerformIO)
import Pdftotext.Internal

-- NOTE(review): every function below runs its @...IO@ counterpart from
-- "Pdftotext.Internal" through 'unsafePerformIO'. That is only sound if those
-- actions behave like pure functions of their arguments (presumably they only
-- read from a document that is never mutated) -- confirm against the internal
-- module. Each wrapper is marked NOINLINE, as the 'unsafePerformIO'
-- documentation recommends, so the wrapped action cannot be duplicated by
-- inlining.

-- | Open a PDF represented as a bytestring. If the document cannot be parsed
-- as a valid PDF, 'Nothing' is returned.
--
-- Pure wrapper around 'openByteStringIO'.
openByteString :: ByteString -> Maybe Document
openByteString bytes = unsafePerformIO (openByteStringIO bytes)
{-# NOINLINE openByteString #-}

-- | Return page number @no@ from the PDF document, if the page exists.
--
-- Pure wrapper around 'pageIO'.
page :: Int -> Document -> Maybe Page
page no doc = unsafePerformIO (pageIO no doc)
{-# NOINLINE page #-}

-- | Return all pages of the document.
--
-- Pure wrapper around 'pagesIO'.
pages :: Document -> [Page]
pages doc = unsafePerformIO (pagesIO doc)
{-# NOINLINE pages #-}

-- | Return the number of pages contained in the document.
--
-- Pure wrapper around 'pagesTotalIO'.
pagesTotal :: Document -> Int
pagesTotal doc = unsafePerformIO (pagesTotalIO doc)
{-# NOINLINE pagesTotal #-}

-- | Extract text from the PDF document using the given 'Layout'.
--
-- Pure wrapper around 'pdftotextIO'.
pdftotext :: Layout -> Document -> Text
pdftotext lay doc = unsafePerformIO (pdftotextIO lay doc)
{-# NOINLINE pdftotext #-}

-- | Extract text from a single page using the given 'Layout'.
--
-- Pure wrapper around 'pageTextIO'.
pageText :: Layout -> Page -> Text
pageText l p = unsafePerformIO (pageTextIO l p)
{-# NOINLINE pageText #-}