{-# LANGUAGE AllowAmbiguousTypes #-} {-# LANGUAGE DefaultSignatures #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE TypeFamilies #-} {- | Email messages. Deals specifically with RFC 5322, which is stricter than RFC 822 or RFC 2822. If you have to deal with messages that comply with the older specifications but not RFC 5322, preprocess the input and massage it to be RFC 5322 compliant. This parser allows LF line endings in addition to CRLF (RFC 5322 demands CRLF but LF-only is common in on-disk formats). The main parsing function is 'message'. It takes a second function that can inspect the headers to determine how to parse the body. @ 'message' :: ('Headers' -> 'BodyHandler' a) -> Parser (Message ctx a) @ The 'Message' type is parameterised over the body type, and a phantom type that can be used for context. @ data 'Message' ctx a = Message 'Headers' a @ Headers and body can be accessed via the 'headers', 'header' and 'body' optics. @ 'headers' :: Lens' (Message ctx a) Headers 'header' :: CI B.ByteString -> Fold Headers B.ByteString 'body' :: Lens (Message ctx a) (Message ctx' b) a b @ The following example program parses an input, interpreting the body as a raw @ByteString@, and prints the subject (if present), the number of headers and the body length. The message context type is @()@. @ analyse :: B.ByteString -> IO () analyse input = case 'parse' ('message' (const takeByteString)) of Left errMsg -> hPutStrLn stderr errMsg *> exitFailure Right (msg :: Message () B.ByteString) -> do B.putStrLn $ "subject: " <> foldOf ('headers' . 'header' "subject") msg putStrLn $ "num headers: " <> show (length (view 'headers' msg)) putStrLn $ "body length: " <> show (B.length (view 'body' msg)) @ -} module Data.RFC5322 ( -- * Message types Message(..) , message , MessageContext , BodyHandler(..) , body , EqMessage(..) -- ** Headers , Header , HasHeaders(..) , header , headerList , Headers(..) -- ** Addresses , Address(..) , address , addressList , AddrSpec(..) , Domain(..) , Mailbox(..) , mailbox , mailboxList -- * Parsers , parse , parsed , parsePrint , crlf , quotedString -- * Helpers , field , rfc5422DateTimeFormat , rfc5422DateTimeFormatLax -- * Serialisation , buildMessage , renderMessage , RenderMessage(..) , renderRFC5422Date , buildFields , buildField , renderAddressSpec , renderMailbox , renderMailboxes , renderAddress , renderAddresses ) where import Control.Applicative import Data.Foldable (fold) import Data.List (findIndex, intersperse) import Data.List.NonEmpty (intersperse) import Data.Semigroup ((<>)) import Data.Word (Word8) import GHC.Generics (Generic) import Control.DeepSeq (NFData) import Control.Lens import Control.Lens.Cons.Extras (recons) import Data.Attoparsec.ByteString as A hiding (parse, take) import Data.Attoparsec.ByteString.Char8 (char8) import qualified Data.Attoparsec.ByteString.Lazy as AL import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Char8 as Char8 import qualified Data.ByteString.Builder as Builder import qualified Data.ByteString.Builder.Prim as Prim import qualified Data.Text as T import qualified Data.Text.Encoding as T import Data.Time.Clock (UTCTime) import Data.Time.Format (defaultTimeLocale, formatTime) import Data.RFC5322.Internal ( CI, ci, original , (<<>>), foldMany, foldMany1Sep , fromChar, isAtext, isQtext, isVchar, isWsp , optionalCFWS, word, wsp, vchar, optionalFWS, crlf , domainLiteral, dotAtom, localPart, quotedString ) import Data.RFC5322.Address.Types import Data.MIME.Charset import Data.MIME.EncodedWord (encodedWord, decodeEncodedWord, buildEncodedWord) import Data.MIME.TransferEncoding (transferEncode) type Header = (CI B.ByteString, B.ByteString) newtype Headers = Headers [Header] deriving (Eq, Show, Generic, NFData) instance Semigroup Headers where Headers a <> Headers b = Headers (a <> b) instance Monoid Headers where mempty = Headers [] class HasHeaders a where headers :: Lens' a Headers instance HasHeaders Headers where headers = id type instance Index Headers = CI B.ByteString type instance IxValue Headers = B.ByteString instance Ixed Headers where ix = header hdriso :: Iso' Headers [(CI B.ByteString, B.ByteString)] hdriso = iso (\(Headers xs) -> xs) Headers -- | Acts upon the first occurrence of the header only. -- instance At Headers where at k = hdriso . l where l :: Lens' [(CI B.ByteString, B.ByteString)] (Maybe B.ByteString) l f kv = let i = findIndex ((== k) . fst) kv g Nothing = maybe kv (\j -> take j kv <> drop (j + 1) kv) i g (Just v) = maybe ((k,v):kv) (\j -> set (ix j) (k,v) kv) i in g <$> f (lookup k kv) -- | Target all values of the given header header :: HasHeaders a => CI B.ByteString -> Traversal' a B.ByteString header k = headerList . traversed . filtered ((k ==) . fst) . _2 -- | Message type, parameterised over context and body type. The -- context type is not used in this module but is provided for uses -- such as tracking the transfer/charset encoding state in MIME -- messages. -- data Message s a = Message Headers a deriving (Show, Generic, NFData) instance HasHeaders (Message s a) where headers f (Message h b) = fmap (`Message` b) (f h) instance Functor (Message s) where fmap f (Message h a) = Message h (f a) -- | How to compare messages with this body type. -- -- This class arises because we may want to tweak the headers, -- possibly in response to body data, or vice-versa, when -- comparing messages. -- -- The default implementation compares headers and body using (==). -- class EqMessage a where eqMessage :: Message s a -> Message s a -> Bool default eqMessage :: (Eq a) => Message s a -> Message s a -> Bool eqMessage (Message h1 b1) (Message h2 b2) = h1 == h2 && b1 == b2 instance EqMessage a => Eq (Message s a) where (==) = eqMessage -- | Access headers as a list of key/value pairs. headerList :: HasHeaders a => Lens' a [(CI B.ByteString, B.ByteString)] headerList = headers . coerced body :: Lens (Message ctx a) (Message ctx' b) a b body f (Message h b) = fmap (\b' -> Message h b') (f b) {-# ANN body ("HLint: ignore Avoid lambda" :: String) #-} isSpecial :: Word8 -> Bool isSpecial = inClass "()<>[]:;@\\,.\"" special :: Parser Word8 special = satisfy isSpecial -- §3.3 Date and Time Specification -- Sat, 29 Sep 2018 12:51:05 +1000 rfc5422DateTimeFormat :: String rfc5422DateTimeFormat = "%a, %d %b %Y %T %z" rfc5422DateTimeFormatLax :: String rfc5422DateTimeFormatLax = "%a, %-d %b %Y %-H:%-M:%-S %z" renderRFC5422Date :: UTCTime -> B.ByteString renderRFC5422Date = Char8.pack . formatTime defaultTimeLocale rfc5422DateTimeFormat -- §3.4 Address Specification buildMailbox :: Mailbox -> Builder.Builder buildMailbox (Mailbox n a) = maybe a' (\n' -> buildPhrase n' <> " <" <> a' <> ">") n where a' = buildAddressSpec a -- Encode a phrase. -- -- * Empty string is special case; must be in quotes -- * If valid as an atom, use as-is (ideally, but we don't do this yet) -- * If it can be in a quoted-string, do so. -- * Otherwise make it an encoded-word -- buildPhrase :: T.Text -> Builder.Builder buildPhrase "" = "\"\"" buildPhrase s = case enc s of PhraseAtom -> T.encodeUtf8Builder s PhraseQuotedString _ -> "\"" <> T.encodeUtf8BuilderEscaped escPrim s <> "\"" PhraseEncodedWord -> buildEncodedWord . transferEncode . charsetEncode $ s where enc = T.foldr (\c z -> encChar c <> z) mempty encChar c | isAtext c = PhraseAtom | isQtext c = PhraseQuotedString 0 | isVchar c || c == ' ' = PhraseQuotedString 1 | otherwise = PhraseEncodedWord -- FIXME: this probably doesn't handle consecutive SPACE properly -- due to FWS: -- -- quoted-string = [CFWS] -- DQUOTE *([FWS] qcontent) [FWS] DQUOTE -- [CFWS] -- -- Do not be surprised if the roundtrip property fails -- escPrim = Prim.condB (\c -> isQtext c || c == 32) (Prim.liftFixedToBounded Prim.word8) (Prim.liftFixedToBounded $ (fromChar '\\',) Prim.>$< Prim.word8 Prim.>*< Prim.word8) -- | Data type used to compute escaping requirement of a Text 'phrase' -- 'PhraseQuotedString' records the number of additional characters -- needed for escapes (backslash). It does not include the surrounding -- DQUOTE characters. -- data PhraseEscapeRequirement = PhraseAtom | PhraseQuotedString Int | PhraseEncodedWord instance Semigroup PhraseEscapeRequirement where PhraseEncodedWord <> _ = PhraseEncodedWord PhraseAtom <> a = a PhraseQuotedString n <> PhraseQuotedString m = PhraseQuotedString (n + m) a <> PhraseAtom = a _ <> PhraseEncodedWord = PhraseEncodedWord instance Monoid PhraseEscapeRequirement where mempty = PhraseAtom renderMailboxes :: [Mailbox] -> B.ByteString renderMailboxes = L.toStrict . Builder.toLazyByteString . buildMailboxes buildMailboxes :: [Mailbox] -> Builder.Builder buildMailboxes = fold . Data.List.intersperse ", " . fmap buildMailbox renderMailbox :: Mailbox -> B.ByteString renderMailbox = L.toStrict . Builder.toLazyByteString . buildMailbox mailbox :: CharsetLookup -> Parser Mailbox mailbox charsets = Mailbox <$> optional (displayName charsets) <*> angleAddr <|> Mailbox Nothing <$> addressSpec phrase :: CharsetLookup -> Parser T.Text phrase charsets = foldMany1Sep " " $ fmap (decodeEncodedWord charsets) ("=?" *> encodedWord) <|> fmap decodeLenient word displayName :: CharsetLookup -> Parser T.Text displayName = phrase angleAddr :: Parser AddrSpec angleAddr = optionalCFWS *> char8 '<' *> addressSpec <* char8 '>' <* optionalCFWS buildAddressSpec :: AddrSpec -> Builder.Builder buildAddressSpec (AddrSpec lp (DomainDotAtom b)) | " " `B.isInfixOf` lp = "\"" <> buildLP <> "\"" <> rest | otherwise = buildLP <> rest where buildLP = Builder.byteString lp rest = "@" <> foldMap Builder.byteString (Data.List.NonEmpty.intersperse "." b) buildAddressSpec (AddrSpec lp (DomainLiteral b)) = foldMap Builder.byteString [lp, "@", b] renderAddressSpec :: AddrSpec -> B.ByteString renderAddressSpec = L.toStrict . Builder.toLazyByteString . buildAddressSpec addressSpec :: Parser AddrSpec addressSpec = AddrSpec <$> localPart <*> (char8 '@' *> domain) -- | Printable US-ASCII excl "[", "]", or "\" isDtext :: Word8 -> Bool isDtext c = (c >= 33 && c <= 90) || (c >= 94 && c <= 126) domain :: Parser Domain domain = (DomainDotAtom <$> dotAtom) <|> (DomainLiteral <$> domainLiteral) mailboxList :: CharsetLookup -> Parser [Mailbox] mailboxList charsets = mailbox charsets `sepBy` char8 ',' renderAddresses :: [Address] -> B.ByteString renderAddresses xs = B.intercalate ", " $ renderAddress <$> xs renderAddress :: Address -> B.ByteString renderAddress (Single m) = renderMailbox m renderAddress (Group name xs) = T.encodeUtf8 name <> ":" <> renderMailboxes xs <> ";" addressList :: CharsetLookup -> Parser [Address] addressList charsets = address charsets `sepBy` char8 ',' group :: CharsetLookup -> Parser Address group charsets = Group <$> (displayName charsets) <* char8 ':' <*> mailboxList charsets <* char8 ';' <* optionalCFWS address :: CharsetLookup -> Parser Address address charsets = group charsets <|> Single <$> mailbox charsets -- §3.5. Overall Message Syntax -- | Specify how to handle a message body, including the possibility -- of optional bodies and no body (which is distinct from empty body). data BodyHandler a = RequiredBody (Parser a) | OptionalBody (Parser a, a) -- ^ If body is present run parser, otherwise use constant value | NoBody a -- | Parse a message. The function argument receives the headers and -- yields a handler for the message body. -- message :: (Headers -> BodyHandler a) -> Parser (Message (MessageContext a) a) message f = fields >>= \hdrs -> Message hdrs <$> case f hdrs of RequiredBody b -> crlf *> b OptionalBody (b, a) -> optional crlf >>= maybe (pure a) (const b) NoBody b -> pure b type family MessageContext a fields :: Parser Headers fields = Headers <$> many field -- | Define how to render an RFC 5322 message with given payload type. -- class RenderMessage a where -- | Build the body. If there should be no body (as distinct from -- /empty body/) return Nothing buildBody :: Headers -> a -> Maybe Builder.Builder -- | Allows tweaking the headers before rendering. Default -- implementation is a no-op. tweakHeaders :: Headers -> Headers tweakHeaders = id -- | Construct a 'Builder.Builder' for the message. This allows efficient -- streaming to IO handles. -- buildMessage :: forall ctx a. (RenderMessage a) => Message ctx a -> Builder.Builder buildMessage (Message h b) = buildFields (tweakHeaders @a h) <> maybe mempty ("\r\n" <>) (buildBody h b) -- | Render a message to a lazy 'L.ByteString'. (You will probably not -- need a strict @ByteString@ and it is inefficient for most use cases.) -- renderMessage :: (RenderMessage a) => Message ctx a -> L.ByteString renderMessage = Builder.toLazyByteString . buildMessage -- Header serialisation buildFields :: Headers -> Builder.Builder buildFields = foldMapOf (hdriso . traversed) buildField buildField :: (CI B.ByteString, B.ByteString) -> Builder.Builder buildField (k,v) = let key = original k in Builder.byteString key <> ": " <> Builder.byteString (foldUnstructured (B.length key) v) <> "\r\n" -- | Render a field body with proper folding -- -- Algorithm: -- * Break the string on white space -- * Use a counter which indicates a new folding line if it exceeds 77 characters -- * Whenever we create a new line, concatenate all words back with white space and push it into the result -- * The result is a list of byte strings, which is concatenated with \r\n\s -- -- Notes: -- * First take at this, so possibly very inefficient -- * No other delimiters (e.g. commas, full stops, etc) are considered for -- folding other than whitespace -- * Attaches an additional whitespace when joining -- foldUnstructured :: Int -> B.ByteString -> B.ByteString foldUnstructured i b = let xs = chunk (i + 2) (Char8.words b) [] [] in B.intercalate "\r\n " (filter (not . B.null) xs) chunk :: Int -> [B.ByteString] -> [B.ByteString] -> [B.ByteString] -> [B.ByteString] chunk _ [] xs result = result <> [Char8.unwords xs] chunk max' (x:rest) xs result = if (max' + B.length x + 1) >= 77 then result <> [Char8.unwords xs] <> chunk (B.length x + 1) rest [x] [] else result <> chunk (max' + B.length x + 1) rest (xs <> [x]) result -- | Printable ASCII excl. ':' isFtext :: Word8 -> Bool isFtext c = (c >= 33 && c <= 57) || (c >= 59 && c <= 126) field :: Parser (CI B.ByteString, B.ByteString) field = (,) <$> ci (takeWhile1 isFtext) <* char8 ':' <* many wsp <*> unstructured <* crlf unstructured :: Parser B.ByteString unstructured = foldMany (optionalFWS <<>> (B.singleton <$> vchar)) <<>> A.takeWhile isWsp -- | Given a parser, construct a 'Fold' -- -- See 'parse' for discussion of performance. -- parsed :: (Cons s s Word8 Word8) => Parser a -> Fold s a parsed p = to (parse p) . folded {-# INLINE parsed #-} -- | Construct a prism from a parser and a printer parsePrint :: Parser a -> (a -> B.ByteString) -> Prism' B.ByteString a parsePrint fwd rev = prism' rev (AL.maybeResult . AL.parse fwd . view recons) -- | Parse an @a@. -- -- The input is convered to a /lazy/ @ByteString@. -- Build with rewrite rules enabled (@-O@, cabal's default) -- to achieve the following conversion overheads: -- -- * Lazy @ByteString@: no conversion -- * Strict @ByteString@: /O(1)/ conversion -- * @[Word8]@: /O(n)/ conversion -- -- It is __recommended to use strict bytestring__ input. Parsing a -- lazy bytestring will cause numerous parser buffer resizes. The -- lazy chunks in the input can be GC'd but the buffer keeps growing -- so you don't actually keep the memory usage low by using a lazy -- bytestring. -- parse :: (Cons s s Word8 Word8) => Parser a -> s -> Either String a parse p = AL.eitherResult . AL.parse p . view recons {-# INLINE parse #-}