{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE DeriveGeneric #-}
module Pdftotext.Internal
(
Document (..),
Layout (..),
Page (..),
Properties (..),
openByteStringIO,
openFile,
pageIO,
pagesIO,
pagesTotalIO,
pdftotextIO,
propertiesIO,
pageTextIO,
)
where
import Control.Monad (forM)
import Data.ByteString.Internal
import qualified Data.Text as T
import Foreign (ForeignPtr, newForeignPtr, nullPtr, withForeignPtr)
import Foreign.C (withCString)
import GHC.Generics
import Pdftotext.Foreign
newtype Document = Document (ForeignPtr Poppler_Document)
data Properties = Properties
{ Properties -> Maybe Text
author :: Maybe T.Text,
Properties -> Maybe Text
creator :: Maybe T.Text,
Properties -> Maybe Text
keywords :: Maybe T.Text,
Properties -> Maybe Text
metadata :: Maybe T.Text,
Properties -> Maybe Text
producer :: Maybe T.Text,
Properties -> Maybe Text
subject :: Maybe T.Text,
Properties -> Maybe Text
title :: Maybe T.Text
}
deriving (Int -> Properties -> ShowS
[Properties] -> ShowS
Properties -> String
(Int -> Properties -> ShowS)
-> (Properties -> String)
-> ([Properties] -> ShowS)
-> Show Properties
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Properties] -> ShowS
$cshowList :: [Properties] -> ShowS
show :: Properties -> String
$cshow :: Properties -> String
showsPrec :: Int -> Properties -> ShowS
$cshowsPrec :: Int -> Properties -> ShowS
Show, (forall x. Properties -> Rep Properties x)
-> (forall x. Rep Properties x -> Properties) -> Generic Properties
forall x. Rep Properties x -> Properties
forall x. Properties -> Rep Properties x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Properties x -> Properties
$cfrom :: forall x. Properties -> Rep Properties x
Generic)
data Page = Page
{
Page -> Int
pageNumber :: Int,
Page -> Int
pageOutOf :: Int,
Page -> ForeignPtr Poppler_Document
pageDocumentPtr :: ForeignPtr Poppler_Document,
Page -> ForeignPtr Poppler_Page
pagePtr :: ForeignPtr Poppler_Page
}
instance Show Page where
show :: Page -> String
show (Page n :: Int
n o :: Int
o _ _) = "Page " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
n String -> ShowS
forall a. [a] -> [a] -> [a]
++ "/" String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
o
data Layout
=
Physical
|
Raw
|
None
deriving (Layout -> Layout -> Bool
(Layout -> Layout -> Bool)
-> (Layout -> Layout -> Bool) -> Eq Layout
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Layout -> Layout -> Bool
$c/= :: Layout -> Layout -> Bool
== :: Layout -> Layout -> Bool
$c== :: Layout -> Layout -> Bool
Eq, Int -> Layout -> ShowS
[Layout] -> ShowS
Layout -> String
(Int -> Layout -> ShowS)
-> (Layout -> String) -> ([Layout] -> ShowS) -> Show Layout
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Layout] -> ShowS
$cshowList :: [Layout] -> ShowS
show :: Layout -> String
$cshow :: Layout -> String
showsPrec :: Int -> Layout -> ShowS
$cshowsPrec :: Int -> Layout -> ShowS
Show)
openFile :: FilePath -> IO (Maybe Document)
openFile :: String -> IO (Maybe Document)
openFile file :: String
file =
String -> (CString -> IO (Maybe Document)) -> IO (Maybe Document)
forall a. String -> (CString -> IO a) -> IO a
withCString String
file \cfile :: CString
cfile -> do
Ptr Poppler_Document
ptr <- CString -> IO (Ptr Poppler_Document)
ffiOpenPdf CString
cfile
if Ptr Poppler_Document
ptr Ptr Poppler_Document -> Ptr Poppler_Document -> Bool
forall a. Eq a => a -> a -> Bool
== Ptr Poppler_Document
forall a. Ptr a
nullPtr
then Maybe Document -> IO (Maybe Document)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Document
forall a. Maybe a
Nothing
else Document -> Maybe Document
forall a. a -> Maybe a
Just (Document -> Maybe Document)
-> (ForeignPtr Poppler_Document -> Document)
-> ForeignPtr Poppler_Document
-> Maybe Document
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ForeignPtr Poppler_Document -> Document
Document (ForeignPtr Poppler_Document -> Maybe Document)
-> IO (ForeignPtr Poppler_Document) -> IO (Maybe Document)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FinalizerPtr Poppler_Document
-> Ptr Poppler_Document -> IO (ForeignPtr Poppler_Document)
forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr FinalizerPtr Poppler_Document
ffiDocumentDelete Ptr Poppler_Document
ptr
openByteStringIO :: ByteString -> IO (Maybe Document)
openByteStringIO :: ByteString -> IO (Maybe Document)
openByteStringIO (PS bsfptr :: ForeignPtr Word8
bsfptr _ len :: Int
len) =
ForeignPtr Word8
-> (Ptr Word8 -> IO (Maybe Document)) -> IO (Maybe Document)
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
bsfptr \bsptr :: Ptr Word8
bsptr -> do
Ptr Poppler_Document
docptr <- Ptr Word8 -> CInt -> IO (Ptr Poppler_Document)
ffiOpenData Ptr Word8
bsptr (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len)
if Ptr Poppler_Document
docptr Ptr Poppler_Document -> Ptr Poppler_Document -> Bool
forall a. Eq a => a -> a -> Bool
== Ptr Poppler_Document
forall a. Ptr a
nullPtr
then Maybe Document -> IO (Maybe Document)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Document
forall a. Maybe a
Nothing
else Document -> Maybe Document
forall a. a -> Maybe a
Just (Document -> Maybe Document)
-> (ForeignPtr Poppler_Document -> Document)
-> ForeignPtr Poppler_Document
-> Maybe Document
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ForeignPtr Poppler_Document -> Document
Document (ForeignPtr Poppler_Document -> Maybe Document)
-> IO (ForeignPtr Poppler_Document) -> IO (Maybe Document)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FinalizerPtr Poppler_Document
-> Ptr Poppler_Document -> IO (ForeignPtr Poppler_Document)
forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr FinalizerPtr Poppler_Document
ffiDocumentDelete Ptr Poppler_Document
docptr
pagesIO :: Document -> IO [Page]
pagesIO :: Document -> IO [Page]
pagesIO (Document fptr :: ForeignPtr Poppler_Document
fptr) = do
ForeignPtr Poppler_Document
-> (Ptr Poppler_Document -> IO [Page]) -> IO [Page]
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Document
fptr \ptr :: Ptr Poppler_Document
ptr -> do
CInt
pageno <- Ptr Poppler_Document -> IO CInt
ffiDocumentPages Ptr Poppler_Document
ptr
[CInt] -> (CInt -> IO Page) -> IO [Page]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [0 .. CInt
pageno CInt -> CInt -> CInt
forall a. Num a => a -> a -> a
- 1] \pno :: CInt
pno -> do
ForeignPtr Poppler_Page
p <- Ptr Poppler_Document -> CInt -> IO (Ptr Poppler_Page)
ffiDocumentOpenPage Ptr Poppler_Document
ptr CInt
pno IO (Ptr Poppler_Page)
-> (Ptr Poppler_Page -> IO (ForeignPtr Poppler_Page))
-> IO (ForeignPtr Poppler_Page)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= FinalizerPtr Poppler_Page
-> Ptr Poppler_Page -> IO (ForeignPtr Poppler_Page)
forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr FinalizerPtr Poppler_Page
ffiPageDelete
Page -> IO Page
forall (m :: * -> *) a. Monad m => a -> m a
return (Page -> IO Page) -> Page -> IO Page
forall a b. (a -> b) -> a -> b
$ Int
-> Int
-> ForeignPtr Poppler_Document
-> ForeignPtr Poppler_Page
-> Page
Page (CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CInt
pno Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1) (CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CInt
pageno) ForeignPtr Poppler_Document
fptr ForeignPtr Poppler_Page
p
pageIO :: Int -> Document -> IO (Maybe Page)
pageIO :: Int -> Document -> IO (Maybe Page)
pageIO no :: Int
no doc :: Document
doc@(Document fptr :: ForeignPtr Poppler_Document
fptr) = ForeignPtr Poppler_Document
-> (Ptr Poppler_Document -> IO (Maybe Page)) -> IO (Maybe Page)
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Document
fptr \ptr :: Ptr Poppler_Document
ptr -> do
Int
pno <- Document -> IO Int
pagesTotalIO Document
doc
if Int
no Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0 Bool -> Bool -> Bool
&& Int
no Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
pno
then Page -> Maybe Page
forall a. a -> Maybe a
Just (Page -> Maybe Page)
-> (ForeignPtr Poppler_Page -> Page)
-> ForeignPtr Poppler_Page
-> Maybe Page
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int
-> Int
-> ForeignPtr Poppler_Document
-> ForeignPtr Poppler_Page
-> Page
Page Int
no Int
pno ForeignPtr Poppler_Document
fptr (ForeignPtr Poppler_Page -> Maybe Page)
-> IO (ForeignPtr Poppler_Page) -> IO (Maybe Page)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Ptr Poppler_Document -> CInt -> IO (Ptr Poppler_Page)
ffiDocumentOpenPage Ptr Poppler_Document
ptr (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
no CInt -> CInt -> CInt
forall a. Num a => a -> a -> a
- 1) IO (Ptr Poppler_Page)
-> (Ptr Poppler_Page -> IO (ForeignPtr Poppler_Page))
-> IO (ForeignPtr Poppler_Page)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= FinalizerPtr Poppler_Page
-> Ptr Poppler_Page -> IO (ForeignPtr Poppler_Page)
forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr FinalizerPtr Poppler_Page
ffiPageDelete)
else Maybe Page -> IO (Maybe Page)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe Page
forall a. Maybe a
Nothing
pagesTotalIO :: Document -> IO Int
pagesTotalIO :: Document -> IO Int
pagesTotalIO (Document fptr :: ForeignPtr Poppler_Document
fptr) =
CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CInt -> Int) -> IO CInt -> IO Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ForeignPtr Poppler_Document
-> (Ptr Poppler_Document -> IO CInt) -> IO CInt
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Document
fptr Ptr Poppler_Document -> IO CInt
ffiDocumentPages
pageTextIO :: Layout -> Page -> IO T.Text
pageTextIO :: Layout -> Page -> IO Text
pageTextIO layout :: Layout
layout (Page _ _ _ fptr :: ForeignPtr Poppler_Page
fptr) = ForeignPtr Poppler_Page -> (Ptr Poppler_Page -> IO Text) -> IO Text
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Page
fptr \ptr :: Ptr Poppler_Page
ptr -> IO StdString -> IO Text
asText (Ptr Poppler_Page -> CBool -> IO StdString
ffiPageText Ptr Poppler_Page
ptr CBool
l)
where
l :: CBool
l =
case Layout
layout of
Raw -> 0
Physical -> 1
None -> 2
propertiesIO :: Document -> IO Properties
propertiesIO :: Document -> IO Properties
propertiesIO (Document fptr :: ForeignPtr Poppler_Document
fptr) = ForeignPtr Poppler_Document
-> (Ptr Poppler_Document -> IO Properties) -> IO Properties
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Poppler_Document
fptr \ptr :: Ptr Poppler_Document
ptr -> do
Text
a <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentAuthor Ptr Poppler_Document
ptr
Text
c <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentCreator Ptr Poppler_Document
ptr
Text
k <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentKeywords Ptr Poppler_Document
ptr
Text
m <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentMetadata Ptr Poppler_Document
ptr
Text
p <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentProducer Ptr Poppler_Document
ptr
Text
s <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentSubject Ptr Poppler_Document
ptr
Text
t <- IO StdString -> IO Text
asText (IO StdString -> IO Text) -> IO StdString -> IO Text
forall a b. (a -> b) -> a -> b
$ Ptr Poppler_Document -> IO StdString
ffiDocumentTitle Ptr Poppler_Document
ptr
Properties -> IO Properties
forall (m :: * -> *) a. Monad m => a -> m a
return (Properties -> IO Properties) -> Properties -> IO Properties
forall a b. (a -> b) -> a -> b
$ Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Maybe Text
-> Properties
Properties (Text -> Maybe Text
f Text
a) (Text -> Maybe Text
f Text
c) (Text -> Maybe Text
f Text
k) (Text -> Maybe Text
f Text
m) (Text -> Maybe Text
f Text
p) (Text -> Maybe Text
f Text
s) (Text -> Maybe Text
f Text
t)
where
f :: Text -> Maybe Text
f x :: Text
x =
if Text -> Bool
T.null Text
x
then Maybe Text
forall a. Maybe a
Nothing
else Text -> Maybe Text
forall a. a -> Maybe a
Just Text
x
pdftotextIO :: Layout -> Document -> IO T.Text
pdftotextIO :: Layout -> Document -> IO Text
pdftotextIO layout :: Layout
layout doc :: Document
doc = do
[Page]
ps <- Document -> IO [Page]
pagesIO Document
doc
[Text]
txt <- (Page -> IO Text) -> [Page] -> IO [Text]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (Layout -> Page -> IO Text
pageTextIO Layout
layout) [Page]
ps
Text -> IO Text
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> IO Text) -> Text -> IO Text
forall a b. (a -> b) -> a -> b
$ [Text] -> Text
T.concat [Text]
txt