module Holumbus.Crawler.PdfToText
where
import qualified Control.Exception as CE
import qualified Data.ByteString.Lazy as BS
import Data.String.Unicode ( utf8ToUnicode )
import System.Directory ( getTemporaryDirectory
, removeFile
)
import System.FilePath ( (</>) )
import System.Process ( rawSystem )
import System.Posix.Process ( getProcessID )
import Text.XML.HXT.Core
-- | Extract the text of a PDF document that is held in a 'String'.
--
-- Every character is turned into a single byte through its code point and the
-- packed bytes are handed to 'pdfToTextBS', which performs the conversion.
-- NOTE(review): this presumably expects one byte per character (code points
-- 0-255) -- confirm against the callers.
pdfToText :: String -> IO String
pdfToText str = pdfToTextBS (BS.pack (map charToByte str))
  where
    -- Re-interpret a character's code point as a byte value.
    charToByte = toEnum . fromEnum
-- | Extract the text of a PDF document given as a lazy 'BS.ByteString'.
--
-- The bytes are written to a file in the temporary directory, the external
-- @pdftotext@ program is run on that file with UTF-8 output encoding, and the
-- produced text file is read back and decoded from UTF-8.
--
-- Any exception raised along the way (temporary directory not writable,
-- @pdftotext@ not startable, no output file produced, ...) results in the
-- empty string instead of being propagated.
--
-- Both temporary files are removed on every exit path, including failures.
--
-- NOTE(review): the file names are derived from the process id only, so
-- concurrent calls from several threads of one process would share the same
-- files -- confirm that callers serialise access.
pdfToTextBS :: BS.ByteString -> IO String
pdfToTextBS inp =
    convert `mycatch` (const $ return "")
  where
    convert = do
      td  <- getTemporaryDirectory
      pid <- getProcessID
      let fn1 = fn td pid "pdfToText.pdf"
          fn2 = fn td pid "pdfToText.txt"
      -- 'CE.finally' guarantees the cleanup also runs when the conversion
      -- throws, so failed conversions do not leave files behind.
      runPdfToText fn1 fn2
        `CE.finally` (removeQuietly fn1 >> removeQuietly fn2)

    -- Write the input, run the converter and read the result back.
    runPdfToText pdfFile txtFile = do
      BS.writeFile pdfFile inp
      -- The exit code is ignored: a failed run shows up as a missing output
      -- file, which makes the following read throw.
      _   <- rawSystem "pdftotext" ["-q", "-enc", "UTF-8", pdfFile, txtFile]
      res <- BS.readFile txtFile
      -- The read is lazy: force the complete contents into memory before the
      -- cleanup removes the file.
      _   <- CE.evaluate (BS.length res)
      return ( fst . utf8ToUnicode . map (toEnum . fromEnum) . BS.unpack $ res )

    -- Temporary file name: "<dir>/<pid>-<suffix>".
    fn d p f = d </> (show p ++ "-" ++ f)

    -- Best-effort removal: a file that is missing or cannot be removed must
    -- not hide the result or the original error.
    removeQuietly :: FilePath -> IO ()
    removeQuietly f = removeFile f `CE.catch` ignoreIOError

    ignoreIOError :: CE.IOException -> IO ()
    ignoreIOError _ = return ()

    mycatch :: IO a -> (CE.SomeException -> IO a) -> IO a
    mycatch = CE.catch
-- | Arrow version of 'pdfToText'.
--
-- The input string is passed through 'pdfToText' via 'arrIO'. Before and
-- after the conversion a trace message is issued with 'traceString' at
-- level 2, containing a label followed by the first 128 characters of the
-- 'show'n value.
pdfToTextA :: IOSArrow String String
pdfToTextA =
    traceValue "pdfToTextA input:\n"
    >>>
    arrIO pdfToText
    >>>
    traceValue "pdfToText result:\n"
  where
    -- Trace the labelled, truncated value and pass the value on unchanged.
    traceValue :: String -> IOSArrow String String
    traceValue label =
        perform (traceString 2 (\ s -> label ++ take 128 (show s)))