{-# LANGUAGE BangPatterns #-} {-# LANGUAGE OverloadedStrings #-} -- | Header parsers. Most exported parsers (with the exception of 'fws', -- 'cfws', and 'unstructured') are for parsing structured header fields. -- They expect no leading space and will eat an trailing white space. module Network.Email.Header.Parser ( -- * Whitespace fws , cfws -- * Combinators , lexeme , symbol , commaSep -- * Date and time , dateTime -- * Addresses , address , mailbox , mailboxList , recipient , recipientList -- * Message IDs , messageID , messageIDList -- * Text , phrase , phraseList , unstructured -- * MIME , mimeVersion , contentType , contentTransferEncoding ) where import Control.Applicative import Control.Monad import Data.Attoparsec.ByteString (Parser) import qualified Data.Attoparsec.ByteString as A import qualified Data.Attoparsec.ByteString.Char8 as A8 import Data.Attoparsec.Combinator import Data.Bits import qualified Data.ByteString as B import qualified Data.ByteString.Base64 as Base64 import qualified Data.ByteString.Char8 as B8 import Data.ByteString.Internal (w2c) import qualified Data.ByteString.Lazy as L (toStrict) import Data.ByteString.Lazy.Builder import Data.CaseInsensitive (CI) import qualified Data.CaseInsensitive as CI import Data.List import qualified Data.Map.Strict as Map import Data.Maybe import qualified Data.Text as T import Data.Text.Encoding import qualified Data.Text.Lazy as L (Text, fromChunks) import Data.Time import Data.Time.Calendar.WeekDate import Data.Word import Network.Email.Charset import Network.Email.Header.Types hiding (mimeType) infixl 3 <+> -- | Concatenate the results of two parsers. (<+>) :: (Applicative f, Monoid a) => f a -> f a -> f a (<+>) = liftA2 mappend -- | Repeat and concatenate. concatMany :: (Alternative f, Monoid a) => f a -> f a concatMany p = mconcat <$> many p -- | Return a 'Just' value, and 'fail' a 'Nothing' value. parseMaybe :: Monad m => String -> Maybe a -> m a parseMaybe s = maybe (fail s) return -- | Return a 'Right' value, and 'fail' a 'Left' value. parseEither :: Monad m => Either String a -> m a parseEither = either fail return -- | Run a 'Builder' as a strict 'B.ByteString'. toByteString :: Builder -> B.ByteString toByteString = L.toStrict . toLazyByteString -- | Skip folding whitespace. fws :: Parser () fws = A8.skipSpace -- | Parse a comment, including all nested comments. comment :: Parser B.ByteString comment = A8.char '(' *> A.scan (0 :: Int, False) f <* A8.char ')' where f (!n, True ) _ = Just (n, False) f (!n, False) w = case w2c w of '(' -> Just (n + 1, False) ')' -> if n == 0 then Nothing else Just (n - 1, False) '\\' -> Just (n, True) _ -> Just (n, False) -- | Skip any comments or folding whitespace. cfws :: Parser () cfws = () <$ fws `sepBy` comment -- | Parse a value followed by whitespace. lexeme :: Parser a -> Parser a lexeme p = p <* cfws -- | Parse a character followed by whitespace. symbol :: Char -> Parser Char symbol = lexeme . A8.char -- | Quickly (and unsafely) convert a digit to the number it represents. fromDigit :: Integral a => Word8 -> a fromDigit w = fromIntegral (w - 48) -- | Parse a fixed number of digits. digits :: Integral a => Int -> Parser a digits 0 = return 0 digits 1 = fromDigit <$> A.satisfy A8.isDigit_w8 digits n = do s <- A.take n unless (B.all A8.isDigit_w8 s) $ fail $ "expected " ++ show n ++ " digits" return $ B.foldl' (\a w -> 10*a + fromDigit w) 0 s -- | Parse a number lexeme with a fixed number of digits. number :: Integral a => Int -> Parser a number = lexeme . digits -- | Parse a hexadecimal pair. hexPair :: Parser Word8 hexPair = decode <$> hexDigit <*> hexDigit where decode a b = shiftL a 4 .|. b hexDigit = fromHexDigit <$> A.satisfy isHexDigit isHexDigit w = w >= 48 && w <= 57 || w >= 64 && w <= 70 || w >= 97 && w <= 102 fromHexDigit w | w >= 97 = w - 87 | w >= 64 = w - 55 | otherwise = w - 48 -- | Parse an token lexeme consisting of all printable characters, but -- disallowing the specified special characters. tokenWith :: String -> Parser B.ByteString tokenWith specials = lexeme (A.takeWhile1 isAtom) where isAtom w = w <= 126 && w >= 33 && A.notInClass specials w -- | Parse an atom, which contains ASCII letters, digits, and the -- characters @"!#$%&\'*+-/=?^_`{|}~"@. atom :: Parser B.ByteString atom = tokenWith "()<>[]:;@\\\",." -- | Parse a dot-atom, or an atom which may contain periods. dotAtom :: Parser B.ByteString dotAtom = tokenWith "()<>[]:;@\\\"," -- | A MIME token, which contains ASCII letters, digits, and the -- characters @"!#$%\'*+-^_`{|}~."@. token :: Parser B.ByteString token = tokenWith "()<>@,;:\\\"/[]?=" -- | A case-insensitive MIME token. tokenCI :: Parser (CI B.ByteString) tokenCI = CI.mk <$> token -- | Parse a quoted-string. quotedString :: Parser B.ByteString quotedString = lexeme $ toByteString <$ A8.char '"' <*> concatMany quotedChar <* A8.char '"' where quotedChar = mempty <$ A.string "\r\n" <|> word8 <$ A8.char '\\' <*> A.anyWord8 <|> char8 <$> A8.satisfy (/= '"') -- | Parse an encoded word, as per RFC 2047. encodedWord :: Parser T.Text encodedWord = do _ <- A.string "=?" name <- B8.unpack <$> tokenWith "()<>@,;:\"/[]?.=" _ <- A8.char '?' method <- decodeMethod _ <- A8.char '?' enc <- method _ <- A.string "?=" charset <- parseMaybe "charset not found" $ lookupCharset name return $ toUnicode charset enc where decodeMethod = quoted <$ A.satisfy (`B.elem` "Qq") <|> base64String <$ A.satisfy (`B.elem` "Bb") quoted = toByteString <$> concatMany quotedChar quotedChar = char8 ' ' <$ A8.char '_' <|> word8 <$ A8.char '=' <*> hexPair <|> char8 <$> A8.satisfy (not . isBreak) isBreak c = A8.isSpace c || c == '?' base64String = do s <- A8.takeWhile (/= '?') parseEither (Base64.decode s) -- | Return a quoted string as-is. scanString :: Char -> Char -> Parser Builder scanString start end = lexeme $ do s <- byteString <$ A8.char start <*> A8.scan False f <* A8.char end return (char8 start <> s <> char8 end) where f True _ = Just False f False c | c == end = Nothing | c == '\\' = Just True | otherwise = Just False -- | Parse an email address, stripping out whitespace and comments. addrSpec :: Parser B.ByteString addrSpec = toByteString <$> (localPart <+> at <+> domain) where at = char8 <$> symbol '@' dot = char8 <$> symbol '.' dotSep p = p <+> concatMany (dot <+> p) addrAtom = byteString <$> atom addrQuote = scanString '"' '"' domainLiteral = scanString '[' ']' localPart = dotSep (addrAtom <|> addrQuote) domain = dotSep addrAtom <|> domainLiteral -- | Parse an address specification in angle brackets. angleAddrSpec :: Parser B.ByteString angleAddrSpec = symbol '<' *> addrSpec <* symbol '>' -- | Parse two or more occurences of @p@, separated by @sep@. sepBy2 :: Alternative f => f a -> f b -> f [a] sepBy2 p sep = (:) <$> p <*> many1 (sep *> p) -- | Parse a list of elements, with possibly null entries in between -- separators. At least one entry or separator will be parsed. optionalSepBy1 :: Alternative f => f a -> f b -> f [a] optionalSepBy1 p sep = catMaybes <$> sepBy2 (optional p) sep <|> return <$> p -- | Parse a list of elements, separated by commas. commaSep :: Parser a -> Parser [a] commaSep p = optionalSepBy1 p (symbol ',') -- | Parse a date and time. -- TODO: non-numeric timezones (such as \"PDT\") are considered equivalent -- to UTC time. dateTime :: Parser ZonedTime dateTime = do wday <- optional dayOfWeek zoned <- zonedTime let (_, _, expected) = toWeekDate . localDay . zonedTimeToLocalTime $ zoned case wday of Just actual | actual /= expected -> fail "day of week does not match date" _ -> return zoned where dayOfWeek = dayName <* symbol ',' localTime = LocalTime <$> date <*> timeOfDay zonedTime = ZonedTime <$> localTime <*> timeZone date = do d <- lexeme A8.decimal m <- month y <- year parseMaybe "invalid date" $ fromGregorianValid y m d year = number 4 <|> (+ 1900) <$> number 3 <|> adjust <$> number 2 where adjust n | n < 50 = 2000 + n | otherwise = 1900 + n timeOfDay = do h <- number 2 m <- symbol ':' *> number 2 s <- option (0 :: Int) (symbol ':' *> number 2) parseMaybe "invalid time of day" $ makeTimeOfDayValid h m (fromIntegral s) timeZone = minutesToTimeZone <$> timeZoneOffset <|> return utc timeZoneOffset = lexeme . A8.signed $ do hh <- digits 2 mm <- digits 2 if mm >= 60 then fail "invalid time zone" else return $ hh * 60 + mm listIndex = lexeme . choice . map (\(n, s) -> n <$ A.string s) . zip [1..] dayName = listIndex [ "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" ] month = listIndex [ "Jan", "Feb", "Mar", "Apr", "May", "Jun" , "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" ] -- | Parse an email address in angle brackets. angleAddr :: Parser Address angleAddr = Address <$> angleAddrSpec -- | Parse an email address. address :: Parser Address address = Address <$> addrSpec -- | Parse a 'Mailbox'. mailbox :: Parser Mailbox mailbox = Mailbox <$> optional phrase <*> angleAddr <|> Mailbox Nothing <$> address -- | Parse a list of @'Mailbox'es@. mailboxList :: Parser [Mailbox] mailboxList = commaSep mailbox -- | Parse a 'Recipient'. recipient :: Parser Recipient recipient = Group <$> phrase <* symbol ':' <*> mailboxList <* symbol ';' <|> Individual <$> mailbox -- | Parse a list of @'Recipient's@. recipientList :: Parser [Recipient] recipientList = commaSep recipient -- | Parse a message identifier. messageID :: Parser MessageID messageID = MessageID <$> angleAddrSpec -- | Parse a list of message identifiers. messageIDList :: Parser [MessageID] messageIDList = many1 messageID -- | Combine a list of text elements (atoms, quoted strings, encoded words, -- etc.) into a larger phrase. fromElements :: [T.Text] -> L.Text fromElements = L.fromChunks . intersperse (T.singleton ' ') -- | Parse a phrase. Adjacent encoded words are concatenated. White space -- is reduced to a single space, except when quoted or part of an encoded -- word. phrase :: Parser L.Text phrase = fromElements <$> many1 element where element = T.concat <$> many1 (lexeme encodedWord) <|> decodeLatin1 <$> quotedString <|> decodeLatin1 <$> dotAtom -- | Parse a comma-separated list of phrases. phraseList :: Parser [L.Text] phraseList = commaSep phrase -- | Parse unstructured text. Adjacent encoded words are concatenated. -- White space is reduced to a single space, except when part of an encoded -- word. unstructured :: Parser L.Text unstructured = fromElements <$ fws <*> many element <* A.endOfInput where element = T.concat <$> many1 (encodedWord <* fws) <|> decodeLatin1 <$> word <* fws word = A.takeWhile1 (not . A8.isSpace_w8) -- | Parse the MIME version (which should be 1.0). mimeVersion :: Parser (Int, Int) mimeVersion = (,) <$> number 1 <* symbol '.' <*> number 1 -- | Parse the content type. contentType :: Parser (MimeType, Parameters) contentType = (,) <$> mimeType <*> parameters where mimeType = MimeType <$> tokenCI <* symbol '/' <*> tokenCI parameters = Map.fromList <$> many (symbol ';' *> parameter) parameter = (,) <$> tokenCI <* symbol '=' <*> (token <|> quotedString) -- | Parse the content transfer encoding. contentTransferEncoding :: Parser (CI B.ByteString) contentTransferEncoding = tokenCI