{-# OPTIONS #-} -- ------------------------------------------------------------ module Holumbus.Crawler.PdfToText where import Control.Concurrent.MVar import qualified Control.Exception as CE import qualified Data.ByteString.Lazy as BS import Data.String.Unicode ( utf8ToUnicode ) import System.Directory ( getTemporaryDirectory , removeFile ) import System.FilePath ( () ) import System.Process ( rawSystem ) import System.Posix.Process ( getProcessID ) import System.IO.Unsafe ( unsafePerformIO ) import Text.XML.HXT.Core -- ------------------------------------------------------------ -- | Conversion of pdf data into plain text. The conversion is done -- by calling an external program @pdftotext@ (contained in linux packages @xpdf@). -- IO is done via the ByteString API. pdfToText :: String -> IO String pdfToText = pdfToTextBS . BS.pack . map (toEnum . fromEnum) pdfToTextBS :: BS.ByteString -> IO String pdfToTextBS inp = ( do fns@(fn1, fn2) <- requestPdf BS.writeFile fn1 inp _ <- rawSystem "pdftotext" ["-q", "-enc", "UTF-8", fn1, fn2] removeFile fn1 res <- BS.readFile fn2 BS.length res `seq` removeFile fn2 releasePdf fns return ( fst . utf8ToUnicode . map (toEnum . fromEnum) . BS.unpack $ res ) ) `mycatch` ( const $ return "" ) where mycatch :: IO a -> (CE.SomeException -> IO a) -> IO a mycatch = CE.catch pdfToTextA :: IOSArrow String String pdfToTextA = perform ( traceString 2 (("pdfToTextA input:\n" ++) . take 128 . show) ) >>> arrIO pdfToText >>> perform ( traceString 2 (( "pdfToText result:\n" ++ ) . take 128 . show) ) -- ------------------------------------------------------------ -- The pdftotext call is not thread save pdfResource :: MVar (FilePath, FilePath) pdfResource = unsafePerformIO $ tmpFiles >>= newMVar where tmpFiles = do td <- getTemporaryDirectory pid <- getProcessID let fn1 = fn td pid "pdfToText.pdf" let fn2 = fn td pid "pdfToText.txt" return (fn1, fn2) fn d p f = d (show p ++ "-" ++ f) {-# NOINLINE pdfResource #-} requestPdf :: IO (FilePath, FilePath) requestPdf = takeMVar pdfResource releasePdf :: (FilePath, FilePath) -> IO () releasePdf = putMVar pdfResource -- ------------------------------------------------------------