-------------------------------------------------------------------------------- -------------------------------------------------------------------------------- -- | --Module : ParseEmail --Author : Joe Fredette --License : BSD3 --Copyright : Joe Fredette -- --Maintainer : Joe Fredette --Stability : Unstable --Portability : Portable -- -------------------------------------------------------------------------------- --Description : Parses email, exports a datatype for email storage. -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- {-# LANGUAGE DeriveDataTypeable #-} module HackMail.Data.ParseEmail ( Email (..) , Header (..) , Body (..) , HeaderTok (..) , parseEmail , parseEmailFromFile , matchHdr) where {- Known issues: - Nonconformity to RFC2822 - Body allows \n, \r, and \r\n (CR, LF, LFCR) instead of the required \n\r - (CRLF) - Section 2.3 of the RFC - - Headers don't dump w/ appropriate hyphenation. - Part of the parsing removes hyphens from the fields, we need some way to - put these back when printing out. (is this really a problem? the test - outputs look different then the show instance would indicate.) should be - mostly fixed, not sure on all the details. - -} -- Have to hack one up ourselves... import Text.ParserCombinators.Parsec import Data.Char import Data.Typeable -- TODO: rederive show to format better... data Email = Email Header Body deriving (Eq, Typeable) instance Show Email where show (Email header (Body ss)) = show header ++ (unlines (filter (/="\r") ss)) newtype Body = Body [String] deriving (Eq, Typeable) -- doesn't print quite the way I want... instance Show Body where show (Body []) = "\n" show (Body (x:xs)) = trim x ++ "\n" ++ (show xs) -- TODO: rederive show to format better data Header = HDR HeaderTok String Header | STOP deriving (Eq, Typeable) instance Show Header where show STOP = "\n" show (HDR ht s nextHdr) = show ht ++ ": " ++ s ++ "\n" ++ show nextHdr data HeaderTok = TO | DATE | FROM | SENDER | REPLYTO | CC | BCC | MESSAGEID | INREPLYTO | REFERENCES | SUBJECT | KEYWORDS | XFIELD String deriving (Eq, {-Hack-} Read, Typeable) instance Show HeaderTok where show TO = "To" show DATE = "Date" show FROM = "From" show SENDER = "Sender" show REPLYTO = "Reply-To" show CC = "Cc" show BCC = "Bcc" show MESSAGEID = "Message-ID" show INREPLYTO = "In-Reply-To" show REFERENCES = "References" show SUBJECT = "Subject" show KEYWORDS = "Keywords" show (XFIELD s) = s -- -- The parser proper. -- parseEmail e = parse parserEmail "" e parseEmailFromFile path = parseFromFile parserEmail path eol = try (string "\n\r") <|> try (string "\r\n") <|> string "\n" <|> string "\r" "EOL character" valueChar = anyChar -- see rfc2822, section 2.2. fieldChar = oneOf fieldChar' fieldChar' = "<>"++['\33'..'\57']++['\59'..'\126'] -- ibid parserEmail = do -- this gives a list of pairs (field, value), we mangle them appropriately below header <- many1 parseHeaderline eol -- just a list of lines. ideally. body <- buildBody eof -- now we do some mangling let headerRet = buildHeader header let bodyRet = Body body return (Email headerRet bodyRet) parseHeaderline = do -- actual parsing stuff. field <- many1 fieldChar choice [try $ string ": \n", try $ string ": ", try $ string ":"] value <- manyTill valueChar (try trueEndCond) return (field, value) buildHeader [] = STOP buildHeader ((f,v):xs) = HDR (matchHdr f) v $ buildHeader xs matchHdr :: String -> HeaderTok matchHdr s = match (map toUpper $ filter (isAlpha) s) s match :: String -> String -> HeaderTok match "TO" _ = TO match "DATE" _ = DATE match "FROM" _ = FROM match "SENDER" _ = SENDER match "REPLYTO" _ = REPLYTO match "CC" _ = CC match "BCC" _ = BCC match "MESSAGEID" _ = MESSAGEID match "INREPLYTO" _ = INREPLYTO match "REFERENCES" _ = REFERENCES match "SUBJECT" _ = SUBJECT match "KEYWORDS" _ = KEYWORDS match _ s' = XFIELD s' buildBody = sepBy (many $ noneOf "\n\r") eol trimLeft s = dropWhile isSpace s trimRight s = reverse . trimLeft . reverse trim = trimRight $ trimLeft whitespace = oneOf "\t " -- catches CRLF+Whitespaces that precede longfields. trueEndCond = do eol notFollowedBy whitespace