{-# LANGUAGE OverloadedStrings #-}
-- | Utilities for reading mailman-style email archives.
module NLP.Corpora.Email where

import qualified Data.ByteString as BS
import Data.List (isSuffixOf)
import Data.Text (Text)
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy as LT

import Data.MBox (body, Message, parseMBox)

import System.Directory (getDirectoryContents)
import System.FilePath ((</>))

import NLP.Tokenize.Text (tokenize)

-- | Path to the directory containing all the PLUG archives.
--
-- This is a relative path, so it is resolved against the process's current
-- working directory. 'fullPlugArchive' lists this directory and joins each
-- selected file name onto it with '</>'.
plugDataPath :: FilePath
plugDataPath = "./data/corpora/PLUG/"

-- | Load the whole PLUG archive via 'fullPlugArchive' and return the body of
-- every message as strict 'Text', one element per message, in the order
-- 'fullPlugArchive' produced them.
plugArchiveText :: IO [Text]
plugArchiveText = fmap (map bodyText) fullPlugArchive
  where
    -- 'body' yields lazy text; callers of this function get strict text.
    bodyText = LT.toStrict . body

-- | Load the whole PLUG archive via 'fullPlugArchive' and tokenize the body
-- of every message. The result holds one token list per message, in the
-- order 'fullPlugArchive' produced them.
plugArchiveTokens :: IO [[Text]]
plugArchiveTokens = fmap (map tokenizeBody) fullPlugArchive
  where
    -- Convert the lazy message body to strict text before tokenizing.
    tokenizeBody msg = tokenize (LT.toStrict (body msg))

-- | Read and parse every archive file found directly under 'plugDataPath'.
--
-- Only directory entries whose names end in @.txt@ are considered. Each one
-- is read with 'readF' and parsed with 'parseMBox'; the messages of all files
-- are concatenated in the order the directory listing returned the files.
fullPlugArchive :: IO [Message]
fullPlugArchive = do
  entries <- getDirectoryContents plugDataPath
  let archivePaths =
        [ plugDataPath </> name
        | name <- entries
        , ".txt" `isSuffixOf` name
        ]
  fmap (concatMap parseMBox) (mapM readF archivePaths)

-- | Read the file at the given path as a strict 'BS.ByteString', decode the
-- bytes as Latin-1, and return the result as lazy text.
readF :: FilePath -> IO LT.Text
readF path = fmap decode (BS.readFile path)
  where
    decode = LT.fromStrict . TE.decodeLatin1