-- | Read files with pandoc and reduce them to plain text, discarding
-- everything superfluous (markup, code blocks, tables and a fixed set of
-- punctuation characters).
module Data.Text.WordCount.FileRead
    ( processFile
    , globFile
    ) where

import Control.Monad ((<=<))
import Control.Monad.IO.Class (liftIO)
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import Filesystem.Path.CurrentOS as F
import System.FilePath.Glob
import Text.Pandoc hiding (glob)

-- | Process every file matched by a filename glob and return the combined
-- text.
--
-- Each matching file is run through 'processFile'; pandoc errors are handled
-- by 'runIOorExplode'. A glob that matches nothing yields the empty text.
globFile :: String -> IO T.Text
globFile str = do
    files <- glob str
    -- Join with newlines rather than plain concatenation so that the last
    -- word of one file can never fuse with the first word of the next.
    T.unlines <$> traverse (runIOorExplode . processFile) files

-- | Process a single file given its path. Returns text only, discarding
-- superfluous material.
--
-- The reader is picked from the file extension (compared
-- case-insensitively): @md@, @dbk@, @docx@, @epub@, @html@, @tex@, @xml@
-- (read as OPML), @odt@, @rst@ and @textile@ are parsed by the matching
-- pandoc reader, stripped with 'filterCode' and rendered with 'writePlain'.
-- Any other extension (or none) is read verbatim as text. In every case the
-- characters rejected by 'goodChar' are removed from the result.
processFile :: String -> PandocIO T.Text
processFile filepath = T.filter goodChar <$>
    case T.toLower <$> extension (decodeString filepath) of
        Just "md"      -> fromText (readMarkdown def)
        Just "dbk"     -> fromText (readDocBook def)
        Just "docx"    -> fromBytes (readDocx def)
        Just "epub"    -> fromBytes (readEPUB def)
        Just "html"    -> fromText (readHtml def)
        Just "tex"     -> fromText (readLaTeX def)
        Just "xml"     -> fromText (readOPML def)
        Just "odt"     -> fromBytes (readOdt def)
        Just "rst"     -> fromText (readRST def)
        Just "textile" -> fromText (readTextile def)
        -- Unknown or missing extension: treat the file as plain text.
        _              -> liftIO (TIO.readFile filepath)
  where
    -- Formats whose pandoc reader consumes text.
    fromText :: (T.Text -> PandocIO Pandoc) -> PandocIO T.Text
    fromText parse = render parse (TIO.readFile filepath)

    -- Container formats whose pandoc reader consumes a lazy ByteString.
    fromBytes :: (BSL.ByteString -> PandocIO Pandoc) -> PandocIO T.Text
    fromBytes parse = render parse (BSL.readFile filepath)

    -- Load the input, parse it, drop code and tables, render as plain text.
    render :: (a -> PandocIO Pandoc) -> IO a -> PandocIO T.Text
    render parse load = liftIO load >>= (writePlain def . filterCode <=< parse)

-- | 'True' for every character except the punctuation stripped from the
-- output: @.,?_()![]{}*&$#@.
goodChar :: Char -> Bool
goodChar = (`notElem` (".,?_()![]{}*&$#" :: String))

-- | Filter out code blocks and tables from the document.
--
-- Only the document's top-level block list is filtered; blocks nested inside
-- other blocks are left untouched.
filterCode :: Pandoc -> Pandoc
filterCode (Pandoc meta content) = Pandoc meta (filter rightBlock content)
  where
    rightBlock CodeBlock {} = False
    rightBlock Table {}     = False
    rightBlock _            = True