{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE DeriveGeneric #-}

{- ORMOLU_DISABLE -}
{-|
Module      : Pdftotext.Internal
Description : Internal functions
Copyright   : (c) 2020 G. Eyaeb
License     : BSD-3-Clause
Maintainer  : geyaeb@protonmail.com
Stability   : experimental
Portability : POSIX

Internal functions.
-}
{- ORMOLU_ENABLE -}
module Pdftotext.Internal
  ( -- * Types
    Document (..),
    Layout (..),
    Page (..),
    Properties (..),

    -- * Loading PDF's
    openByteStringIO,
    openFile,

    -- * Document functions
    pageIO,
    pagesIO,
    pagesTotalIO,
    pdftotextIO,
    propertiesIO,

    -- * Page functions
    pageTextIO,
  )
where

import Control.Monad (forM)
import Data.ByteString.Internal
import qualified Data.Text as T
import Foreign (ForeignPtr, newForeignPtr, nullPtr, withForeignPtr)
import Foreign.C (withCString)
import GHC.Generics
import Pdftotext.Foreign

newtype Document = Document (ForeignPtr Poppler_Document)

-- | Document properties.
--
-- @since 0.0.2.0
data Properties = Properties
  { Properties -> Maybe Text
author :: Maybe T.Text,
    Properties -> Maybe Text
creator :: Maybe T.Text,
    Properties -> Maybe Text
keywords :: Maybe T.Text,
    Properties -> Maybe Text
metadata :: Maybe T.Text,
    Properties -> Maybe Text
producer :: Maybe T.Text,
    Properties -> Maybe Text
subject :: Maybe T.Text,
    Properties -> Maybe Text
title :: Maybe T.Text
  }
  deriving (Int -> Properties -> ShowS
[Properties] -> ShowS
Properties -> String
(Int -> Properties -> ShowS)
-> (Properties -> String)
-> ([Properties] -> ShowS)
-> Show Properties
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Properties] -> ShowS
$cshowList :: [Properties] -> ShowS
show :: Properties -> String
$cshow :: Properties -> String
showsPrec :: Int -> Properties -> ShowS
$cshowsPrec :: Int -> Properties -> ShowS
Show, (forall x. Properties -> Rep Properties x)
-> (forall x. Rep Properties x -> Properties) -> Generic Properties
forall x. Rep Properties x -> Properties
forall x. Properties -> Rep Properties x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Properties x -> Properties
$cfrom :: forall x. Properties -> Rep Properties x
Generic)

data Page = Page
  { -- | Number of this page in original document.
    Page -> Int
pageNumber :: Int,
    -- | Total number of pages in original document.
    Page -> Int
pageOutOf :: Int,
    -- | Pointer to document which created this page.
    Page -> ForeignPtr Poppler_Document
pageDocumentPtr :: ForeignPtr Poppler_Document,
    -- | Pointer to this page.
    Page -> ForeignPtr Poppler_Page
pagePtr :: ForeignPtr Poppler_Page
  }

instance Show Page where
  show :: Page -> String
show (Page n :: Int
n o :: Int
o _ _) = "Page " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
n String -> ShowS
forall a. [a] -> [a] -> [a]
++ "/" String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
o

-- | Layout of text extracted from PDF.
data Layout
  = -- | Text emulates layout of PDF, including horizontal spaces,
    -- and preserves hyphenation; corresponds to calling @pdftotext -layout@
    Physical
  | -- | Discards horizontal spaces, preserves hyphenation;
    -- corresponds to calling @pdftotext -raw@
    Raw
  | -- | Discards horizontal spaces, removes hyphenation;
    -- corresponds to calling @pdftotext@ without layout argument
    None
  deriving (Layout -> Layout -> Bool
(Layout -> Layout -> Bool)
-> (Layout -> Layout -> Bool) -> Eq Layout
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Layout -> Layout -> Bool
$c/= :: Layout -> Layout -> Bool
== :: Layout -> Layout -> Bool
$c== :: Layout -> Layout -> Bool
Eq, Int -> Layout -> ShowS
[Layout] -> ShowS
Layout -> String
(Int -> Layout -> ShowS)
-> (Layout -> String) -> ([Layout] -> ShowS) -> Show Layout
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Layout] -> ShowS
$cshowList :: [Layout] -> ShowS
show :: Layout -> String
$cshow :: Layout -> String
showsPrec :: Int -> Layout -> ShowS
$cshowsPrec :: Int -> Layout -> ShowS
Show)

-- | Open PDF from file. If file does not exist or cannot be parsed as valid PDF,
-- `Nothing` is returned.
openFile :: FilePath -> IO (Maybe Document)
openFile :: String -> IO (Maybe Document)
openFile file :: String
file =
  String -> (CString -> IO (Maybe Document)) -> IO (Maybe Document)
forall a. String -> (CString -> IO a) -> IO a
withCString String
file \cfile :: CString
cfile -> do
    Ptr Poppler_Document
ptr <- CString -> IO (Ptr Poppler_Document)
ffiOpenPdf CString
cfile
    if Ptr Poppler_Document
ptr Ptr Poppler_Document -> Ptr Poppler_Document -> Bool
forall a. Eq a => a -> a -> Bool
== Ptr Poppler_Document
forall a. Ptr a
nullPtr
      then Maybe Document -> IO (Maybe Document)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Document
forall a. Maybe a
Nothing
      else Document -> Maybe Document
forall a. a -> Maybe a
Just (Document -> Maybe Document)
-> (ForeignPtr Poppler_Document -> Document)
-> ForeignPtr Poppler_Document
-> Maybe Document
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ForeignPtr Poppler_Document -> Document
Document (ForeignPtr Poppler_Document -> Maybe Document)
-> IO (ForeignPtr Poppler_Document) -> IO (Maybe Document)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FinalizerPtr Poppler_Document
-> Ptr Poppler_Document -> IO (ForeignPtr Poppler_Document)
forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr FinalizerPtr Poppler_Document
ffiDocumentDelete Ptr Poppler_Document
ptr

-- | Open PDF represented as bytestring. If document cannot be parsed as valid PDF,
-- `Nothing` is returned.
openByteStringIO :: ByteString -> IO (Maybe Document)
openByteStringIO :: ByteString -> IO (Maybe Document)
openByteStringIO (PS bsfptr :: ForeignPtr Word8
bsfptr _ len :: Int
len) =
  ForeignPtr Word8
-> (Ptr Word8 -> IO (Maybe Document)) -> IO (Maybe Document)
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
bsfptr \bsptr :: Ptr Word8
bsptr -> do
    Ptr Poppler_Document
docptr <- Ptr Word8 -> CInt -> IO (Ptr Poppler_Document)
ffiOpenData Ptr Word8
bsptr (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len)
    if Ptr Poppler_Document
docptr Ptr Poppler_Document -> Ptr Poppler_Document -> Bool
forall a. Eq a => a -> a -> Bool
== Ptr Poppler_Document
forall a. Ptr a
nullPtr
      then Maybe Document -> IO (Maybe Document)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Document
forall a. Maybe a
Nothing
      else Document -> Maybe Document
forall a. a -> Maybe a
Just (Document -> Maybe Document)
-> (ForeignPtr Poppler_Document -> Document)
-> ForeignPtr Poppler_Document
-> Maybe Document
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ForeignPtr Poppler_Document -> Document
Document (ForeignPtr Poppler_Document -> Maybe Document)
-> IO (ForeignPtr Poppler_Document) -> IO (Maybe Document)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FinalizerPtr Poppler_Document
-> Ptr Poppler_Document -> IO (ForeignPtr Poppler_Document)
forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr FinalizerPtr Poppler_Document
ffiDocumentDelete Ptr Poppler_Document
docptr

-- | Return all pages from document.
pagesIO :: Document -> IO [Page]
pagesIO :: Document -> IO [Page]
pagesIO (Document fptr :: ForeignPtr Poppler_Document
fptr) = do
  ForeignPtr Poppler_Document
-> (Ptr Poppler_Document -> IO [Page]) -> IO [Page]
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Document
fptr \ptr :: Ptr Poppler_Document
ptr -> do
    CInt
pageno <- Ptr Poppler_Document -> IO CInt
ffiDocumentPages Ptr Poppler_Document
ptr
    [CInt] -> (CInt -> IO Page) -> IO [Page]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [0 .. CInt
pageno CInt -> CInt -> CInt
forall a. Num a => a -> a -> a
- 1] \pno :: CInt
pno -> do
      ForeignPtr Poppler_Page
p <- Ptr Poppler_Document -> CInt -> IO (Ptr Poppler_Page)
ffiDocumentOpenPage Ptr Poppler_Document
ptr CInt
pno IO (Ptr Poppler_Page)
-> (Ptr Poppler_Page -> IO (ForeignPtr Poppler_Page))
-> IO (ForeignPtr Poppler_Page)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= FinalizerPtr Poppler_Page
-> Ptr Poppler_Page -> IO (ForeignPtr Poppler_Page)
forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr FinalizerPtr Poppler_Page
ffiPageDelete
      Page -> IO Page
forall (m :: * -> *) a. Monad m => a -> m a
return (Page -> IO Page) -> Page -> IO Page
forall a b. (a -> b) -> a -> b
$ Int
-> Int
-> ForeignPtr Poppler_Document
-> ForeignPtr Poppler_Page
-> Page
Page (CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CInt
pno Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CInt
pageno) ForeignPtr Poppler_Document
fptr ForeignPtr Poppler_Page
p

-- | Return page number 'no' from PDF document, if the page exists.
pageIO :: Int -> Document -> IO (Maybe Page)
pageIO :: Int -> Document -> IO (Maybe Page)
pageIO no :: Int
no doc :: Document
doc@(Document fptr :: ForeignPtr Poppler_Document
fptr) = ForeignPtr Poppler_Document
-> (Ptr Poppler_Document -> IO (Maybe Page)) -> IO (Maybe Page)
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Document
fptr \ptr :: Ptr Poppler_Document
ptr -> do
  Int
pno <- Document -> IO Int
pagesTotalIO Document
doc
  if Int
no Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0 Bool -> Bool -> Bool
&& Int
no Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
pno
    then Page -> Maybe Page
forall a. a -> Maybe a
Just (Page -> Maybe Page)
-> (ForeignPtr Poppler_Page -> Page)
-> ForeignPtr Poppler_Page
-> Maybe Page
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int
-> Int
-> ForeignPtr Poppler_Document
-> ForeignPtr Poppler_Page
-> Page
Page Int
no Int
pno ForeignPtr Poppler_Document
fptr (ForeignPtr Poppler_Page -> Maybe Page)
-> IO (ForeignPtr Poppler_Page) -> IO (Maybe Page)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Ptr Poppler_Document -> CInt -> IO (Ptr Poppler_Page)
ffiDocumentOpenPage Ptr Poppler_Document
ptr (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
no CInt -> CInt -> CInt
forall a. Num a => a -> a -> a
- 1) IO (Ptr Poppler_Page)
-> (Ptr Poppler_Page -> IO (ForeignPtr Poppler_Page))
-> IO (ForeignPtr Poppler_Page)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= FinalizerPtr Poppler_Page
-> Ptr Poppler_Page -> IO (ForeignPtr Poppler_Page)
forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr FinalizerPtr Poppler_Page
ffiPageDelete)
    else Maybe Page -> IO (Maybe Page)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Page
forall a. Maybe a
Nothing

-- | Return number of pages contained in document.
pagesTotalIO :: Document -> IO Int
pagesTotalIO :: Document -> IO Int
pagesTotalIO (Document fptr :: ForeignPtr Poppler_Document
fptr) =
  CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CInt -> Int) -> IO CInt -> IO Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ForeignPtr Poppler_Document
-> (Ptr Poppler_Document -> IO CInt) -> IO CInt
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Document
fptr Ptr Poppler_Document -> IO CInt
ffiDocumentPages

-- | Extract text from a page with given 'Layout'.
pageTextIO :: Layout -> Page -> IO T.Text
pageTextIO :: Layout -> Page -> IO Text
pageTextIO layout :: Layout
layout (Page _ _ _ fptr :: ForeignPtr Poppler_Page
fptr) = ForeignPtr Poppler_Page -> (Ptr Poppler_Page -> IO Text) -> IO Text
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Page
fptr \ptr :: Ptr Poppler_Page
ptr -> IO StdString -> IO Text
asText (Ptr Poppler_Page -> CBool -> IO StdString
ffiPageText Ptr Poppler_Page
ptr CBool
l)
  where
    l :: CBool
l =
      case Layout
layout of
        Raw -> 0
        Physical -> 1
        None -> 2

-- | Extract properties from the document.
--
-- @since 0.0.2.0
propertiesIO :: Document -> IO Properties
propertiesIO :: Document -> IO Properties
propertiesIO (Document fptr :: ForeignPtr Poppler_Document
fptr) = ForeignPtr Poppler_Document
-> (Ptr Poppler_Document -> IO Properties) -> IO Properties
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Document
fptr \ptr :: Ptr Poppler_Document
ptr -> do
  Text
a <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentAuthor Ptr Poppler_Document
ptr
  Text
c <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentCreator Ptr Poppler_Document
ptr
  Text
k <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentKeywords Ptr Poppler_Document
ptr
  Text
m <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentMetadata Ptr Poppler_Document
ptr
  Text
p <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentProducer Ptr Poppler_Document
ptr
  Text
s <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentSubject Ptr Poppler_Document
ptr
  Text
t <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentTitle Ptr Poppler_Document
ptr
  Properties -> IO Properties
forall (m :: * -> *) a. Monad m => a -> m a
return (Properties -> IO Properties) -> Properties -> IO Properties
forall a b. (a -> b) -> a -> b
$ Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Properties
Properties (Text -> Maybe Text
f Text
a) (Text -> Maybe Text
f Text
c) (Text -> Maybe Text
f Text
k) (Text -> Maybe Text
f Text
m) (Text -> Maybe Text
f Text
p) (Text -> Maybe Text
f Text
s) (Text -> Maybe Text
f Text
t)
  where
    f :: Text -> Maybe Text
f x :: Text
x =
      if Text -> Bool
T.null Text
x
        then Maybe Text
forall a. Maybe a
Nothing
        else Text -> Maybe Text
forall a. a -> Maybe a
Just Text
x

-- | Extract text from PDF document with given 'Layout'.
pdftotextIO :: Layout -> Document -> IO T.Text
pdftotextIO :: Layout -> Document -> IO Text
pdftotextIO layout :: Layout
layout doc :: Document
doc = do
  [Page]
ps <- Document -> IO [Page]
pagesIO Document
doc
  [Text]
txt <- (Page -> IO Text) -> [Page] -> IO [Text]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (Layout -> Page -> IO Text
pageTextIO Layout
layout) [Page]
ps
  Text -> IO Text
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> IO Text) -> Text -> IO Text
forall a b. (a -> b) -> a -> b
$ [Text] -> Text
T.concat [Text]
txt