--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- |
--Module       : ParseEmail
--Author       : Joe Fredette
--License      : BSD3
--Copyright    : Joe Fredette
--
--Maintainer   : Joe Fredette <jfredett.at.gmail.dot.com>
--Stability    : Unstable
--Portability  : Portable 
--
--------------------------------------------------------------------------------
--Description  : Parses email, exports a datatype for email storage.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
{-# LANGUAGE DeriveDataTypeable #-}
module HackMail.Data.ParseEmail 
        ( Email (..)
        , Header (..)
        , Body (..)
        , HeaderTok (..)
        , parseEmail
        , parseEmailFromFile
        , matchHdr) where
{- Known issues:
 - Nonconformity to RFC2822
 -      Body allows \n, \r, and \r\n (CR, LF, LFCR) instead of the required \n\r
 -       (CRLF) - Section 2.3 of the RFC
 - 
 - Headers don't dump w/ appropriate hyphenation. 
 -      Part of the parsing removes hyphens from the fields, we need some way to
 -       put these back when printing out. (is this really a problem? the test 
 -       outputs look different then the show instance would indicate.) should be
 -       mostly fixed, not sure on all the details.
 -
 -}

-- Have to hack one up ourselves...
import Text.ParserCombinators.Parsec
import Data.Char
import Data.Typeable

-- TODO: rederive show to format better...
data Email = Email Header Body 
        deriving (Eq, Typeable)

instance Show Email where
        show (Email header (Body ss)) = show header
                                     ++ (unlines (filter (/="\r") ss)) 

newtype Body = Body [String]
        deriving (Eq, Typeable)

-- doesn't print quite the way I want...
instance Show Body where
        show (Body []) = "\n"
        show (Body (x:xs)) = trim x ++ "\n" ++ (show xs)
 
-- TODO: rederive show to format better
data Header = HDR HeaderTok String Header
            | STOP
        deriving (Eq, Typeable)

instance Show Header where
        show STOP = "\n"
        show (HDR ht s nextHdr) = show ht ++ ": " ++ s ++ "\n" 
                                ++ show nextHdr

data HeaderTok = TO | DATE | FROM | SENDER | REPLYTO | CC | BCC | MESSAGEID | INREPLYTO | REFERENCES
               | SUBJECT | KEYWORDS | XFIELD String
        deriving (Eq, {-Hack-} Read, Typeable)

instance Show HeaderTok where 
        show TO         = "To"
        show DATE       = "Date"
        show FROM       = "From"
        show SENDER     = "Sender"
        show REPLYTO    = "Reply-To"
        show CC         = "Cc"
        show BCC        = "Bcc"
        show MESSAGEID  = "Message-ID"
        show INREPLYTO  = "In-Reply-To"
        show REFERENCES = "References"
        show SUBJECT    = "Subject"
        show KEYWORDS   = "Keywords"
        show (XFIELD s) = s
--
-- The parser proper.
--
parseEmail e = parse parserEmail "" e
parseEmailFromFile path = parseFromFile parserEmail path

eol =  try (string "\n\r")
   <|> try (string "\r\n") 
   <|> string "\n"
   <|> string "\r"
   <?> "EOL character"

valueChar = anyChar -- see rfc2822, section 2.2.
fieldChar  = oneOf fieldChar' 
fieldChar' = "<>"++['\33'..'\57']++['\59'..'\126'] -- ibid
parserEmail = do
        -- this gives a list of pairs (field, value), we mangle them appropriately below
        header <- many1 parseHeaderline
        eol
        -- just a list of lines. ideally.
        body <- buildBody 
        eof
        -- now we do some mangling
        let headerRet = buildHeader header
        let bodyRet = Body body
        return (Email headerRet bodyRet)

parseHeaderline = do
        -- actual parsing stuff.
        field <- many1 fieldChar 
        choice [try $ string ": \n", try $ string ": ", try $ string ":"]
        value <- manyTill valueChar (try trueEndCond)
        return (field, value)


buildHeader [] = STOP
buildHeader ((f,v):xs)  = HDR (matchHdr f) v 
                        $ buildHeader xs

matchHdr :: String -> HeaderTok 
matchHdr s = match (map toUpper $ filter (isAlpha) s) s

match :: String -> String -> HeaderTok
match "TO" _            = TO
match "DATE" _          = DATE
match "FROM" _          = FROM
match "SENDER"  _       = SENDER
match "REPLYTO" _       = REPLYTO
match "CC"  _           = CC
match "BCC" _           = BCC
match "MESSAGEID" _     = MESSAGEID
match "INREPLYTO" _     = INREPLYTO
match "REFERENCES" _    = REFERENCES
match "SUBJECT" _       = SUBJECT
match "KEYWORDS" _      = KEYWORDS
match _ s'              = XFIELD s'


buildBody = sepBy (many $ noneOf "\n\r") eol

trimLeft s = dropWhile isSpace s
trimRight s = reverse . trimLeft . reverse 
trim = trimRight $ trimLeft

whitespace = oneOf "\t "

-- catches CRLF+Whitespaces that precede longfields.
trueEndCond = do
        eol
        notFollowedBy whitespace