{-# LANGUAGE BlockArguments #-}

{- ORMOLU_DISABLE -}
{-|
Module      : Pdftotext
Description : Extracts text from PDF using poppler
Copyright   : (c) 2020 G. Eyaeb
License     : BSD-3-Clause
Maintainer  : geyaeb@protonmail.com
Stability   : experimental
Portability : POSIX

=== Usage

> import qualified Data.Text.IO as T
> import Pdftotext
>
> main :: IO ()
> main = do
>   Just pdf <- openFile "path/to/file.pdf"
>   T.putStrLn $ pdftotext Physical pdf

-}
{- ORMOLU_ENABLE -}
module Pdftotext
  ( -- * Types
    Document,
    Layout (..),
    Page,

    -- * Loading PDF's
    openByteString,
    openFile,

    -- * Document functions
    page,
    pages,
    pagesTotal,
    pdftotext,

    -- * Page functions
    pageNumber,
    pageOutOf,
    pageText,
  )
where

import Data.ByteString
import Data.Text (Text)
import GHC.IO (unsafePerformIO)
import Pdftotext.Internal

-- | Open PDF represented as bytestring. If document cannot be parsed as valid PDF,
-- `Nothing` is returned.
openByteString :: ByteString -> Maybe Document
openByteString :: ByteString -> Maybe Document
openByteString = IO (Maybe Document) -> Maybe Document
forall a. IO a -> a
unsafePerformIO (IO (Maybe Document) -> Maybe Document)
-> (ByteString -> IO (Maybe Document))
-> ByteString
-> Maybe Document
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> IO (Maybe Document)
openByteStringIO

-- | Return page number 'no' from PDF document, if the page exists.
page :: Int -> Document -> Maybe Page
page :: Int -> Document -> Maybe Page
page no :: Int
no doc :: Document
doc = IO (Maybe Page) -> Maybe Page
forall a. IO a -> a
unsafePerformIO (IO (Maybe Page) -> Maybe Page) -> IO (Maybe Page) -> Maybe Page
forall a b. (a -> b) -> a -> b
$ Int -> Document -> IO (Maybe Page)
pageIO Int
no Document
doc

-- | Return all pages from document.
pages :: Document -> [Page]
pages :: Document -> [Page]
pages = IO [Page] -> [Page]
forall a. IO a -> a
unsafePerformIO (IO [Page] -> [Page])
-> (Document -> IO [Page]) -> Document -> [Page]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Document -> IO [Page]
pagesIO

-- | Return number of pages contained in document.
pagesTotal :: Document -> Int
pagesTotal :: Document -> Int
pagesTotal = IO Int -> Int
forall a. IO a -> a
unsafePerformIO (IO Int -> Int) -> (Document -> IO Int) -> Document -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Document -> IO Int
pagesTotalIO

-- | Extract text from PDF document with given 'Layout'.
pdftotext :: Layout -> Document -> Text
pdftotext :: Layout -> Document -> Text
pdftotext lay :: Layout
lay doc :: Document
doc = IO Text -> Text
forall a. IO a -> a
unsafePerformIO (IO Text -> Text) -> IO Text -> Text
forall a b. (a -> b) -> a -> b
$ Layout -> Document -> IO Text
pdftotextIO Layout
lay Document
doc

-- | Extract text from a page with given 'Layout'.
pageText :: Layout -> Page -> Text
pageText :: Layout -> Page -> Text
pageText l :: Layout
l p :: Page
p = IO Text -> Text
forall a. IO a -> a
unsafePerformIO (IO Text -> Text) -> IO Text -> Text
forall a b. (a -> b) -> a -> b
$ Layout -> Page -> IO Text
pageTextIO Layout
l Page
p