{-# LANGUAGE OverloadedStrings #-}
-- | Utilities for reading mailman-style email archives.
module NLP.Corpora.Email where

import qualified Data.ByteString as BS
import Data.List (isSuffixOf)
import Data.List.Split (splitWhen)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Text.Encoding as TE

import Data.MBox
import System.Directory (getDirectoryContents)
import System.FilePath ((</>))

import NLP.Tokenize.Text (tokenize)

-- | Path to the directory containing all the PLUG archives.
plugDataPath :: FilePath
plugDataPath = "./data/corpora/PLUG/"

-- | The 'body' of every message in the PLUG archive, one 'Text' per
-- message, in the order produced by 'fullPlugArchive'.
plugArchiveText :: IO [Text]
plugArchiveText = map body <$> fullPlugArchive

-- | The tokenized 'body' of every message in the PLUG archive: one token
-- list per message, in the order produced by 'fullPlugArchive'.
plugArchiveTokens :: IO [[Text]]
plugArchiveTokens = map (tokenize . body) <$> fullPlugArchive

-- | Load every message from the PLUG archive directory.
--
-- Lists 'plugDataPath', keeps the entries whose names end in @.txt@,
-- reads each of them with 'readF', parses each file's contents with
-- 'parseMBox', and concatenates the resulting messages.
fullPlugArchive :: IO [Message]
fullPlugArchive = do
  entries <- getDirectoryContents plugDataPath
  -- The suffix filter also discards the "." and ".." entries that
  -- 'getDirectoryContents' includes in its result.
  let archiveFiles = filter (".txt" `isSuffixOf`) entries
  -- Directory entries are bare names, so join each one onto the archive
  -- directory before reading it.
  contents <- mapM (readF . (plugDataPath </>)) archiveFiles
  return $ concatMap parseMBox contents

-- | Read a file as raw bytes and decode it as Latin-1 text.
--
-- Every byte maps to a code point under Latin-1, so decoding itself
-- cannot fail; any failure comes from reading the file.
readF :: FilePath -> IO Text
readF file = TE.decodeLatin1 <$> BS.readFile file