{-# LANGUAGE BlockArguments #-} {- ORMOLU_DISABLE -} {-| Module : Pdftotext.Internal Description : Internal functions Copyright : (c) 2020 G. Eyaeb License : BSD-3-Clause Maintainer : geyaeb@protonmail.com Stability : experimental Portability : POSIX Internal functions. -} {- ORMOLU_ENABLE -} module Pdftotext.Internal ( -- * Types Document (..), Layout (..), Page (..), -- * Loading PDF's openByteStringIO, openFile, -- * Document functions pageIO, pagesIO, pagesTotalIO, pdftotextIO, -- * Page functions pageTextIO, ) where import Control.Monad (forM) import Data.ByteString.Internal import qualified Data.Text as T import Foreign (ForeignPtr, newForeignPtr, nullPtr, withForeignPtr) import Foreign.C (withCString) import Pdftotext.Foreign newtype Document = Document (ForeignPtr Poppler_Document) data Page = Page { -- | Number of this page in original document. pageNumber :: Int, -- | Total number of pages in original document. pageOutOf :: Int, pagePtr :: ForeignPtr Poppler_Page } instance Show Page where show (Page n o _) = "Page " ++ show n ++ "/" ++ show o -- | Layout of text extracted from PDF. data Layout = -- | Text emulates layout of PDF, including horizontal spaces, -- and preserves hyphenation; corresponds to calling @pdftotext -layout@ Physical | -- | Discards horizontal spaces, preserves hyphenation; -- corresponds to calling @pdftotext -raw@ Raw | -- | Discards horizontal spaces, removes hyphenation; -- corresponds to calling @pdftotext@ without layout argument None deriving (Eq, Show) -- | Open PDF from file. If file does not exist or cannot be parsed as valid PDF, -- `Nothing` is returned. openFile :: FilePath -> IO (Maybe Document) openFile file = withCString file \cfile -> do docptr <- ffiOpenPdf cfile if docptr == nullPtr then return Nothing else Just . Document <$> newForeignPtr ffiDocumentDelete docptr -- | Open PDF represented as bytestring. If document cannot be parsed as valid PDF, -- `Nothing` is returned. openByteStringIO :: ByteString -> IO (Maybe Document) openByteStringIO (PS ptr _ len) = withForeignPtr ptr \d -> do docptr <- ffiOpenData d (fromIntegral len) if docptr == nullPtr then return Nothing else Just . Document <$> newForeignPtr ffiDocumentDelete docptr -- | Return all pages from document. pagesIO :: Document -> IO [Page] pagesIO (Document doc) = do withForeignPtr doc \docptr -> do pageno <- ffiDocumentPages docptr forM [0 .. pageno - 1] \pno -> do p <- ffiDocumentOpenPage docptr pno >>= newForeignPtr ffiPageDelete return $ Page (fromIntegral pno + 1) (fromIntegral pageno) p -- | Return page number 'no' from PDF document, if the page exists. pageIO :: Int -> Document -> IO (Maybe Page) pageIO no d@(Document docptr) = withForeignPtr docptr \ptr -> do pno <- pagesTotalIO d if no > 0 && no <= pno then Just . Page no pno <$> (ffiDocumentOpenPage ptr (fromIntegral no - 1) >>= newForeignPtr ffiPageDelete) else return Nothing -- | Return number of pages contained in document. pagesTotalIO :: Document -> IO Int pagesTotalIO (Document doc) = fromIntegral <$> withForeignPtr doc ffiDocumentPages -- | Extract text from a page with given 'Layout'. pageTextIO :: Layout -> Page -> IO T.Text pageTextIO layout (Page _ _ ptr) = withForeignPtr ptr \p -> asText (ffiPageText p l) where l = case layout of Raw -> 0 Physical -> 1 None -> 2 -- | Extract text from PDF document with given 'Layout'. pdftotextIO :: Layout -> Document -> IO T.Text pdftotextIO layout doc = do ps <- pagesIO doc txt <- mapM (pageTextIO layout) ps return $ T.concat txt