-- This file is part of purebred-email -- Copyright (C) 2018-2021 Fraser Tweedale -- -- purebred-email is free software: you can redistribute it and/or modify -- it under the terms of the GNU Affero General Public License as published by -- the Free Software Foundation, either version 3 of the License, or -- (at your option) any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU Affero General Public License for more details. -- -- You should have received a copy of the GNU Affero General Public License -- along with this program. If not, see . {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeFamilies #-} {- | @encoded-word@s for representing non-7bit ASCII data in headers (RFC 2047 and RFC 2231). -} module Data.MIME.EncodedWord ( decodeEncodedWord , decodeEncodedWords , EncodedWord , encodedWord , buildEncodedWord , encodeEncodedWords , chooseEncodedWordEncoding ) where import Control.Applicative ((<|>), liftA2, optional) import Data.Maybe (fromMaybe) import Data.Monoid (Sum(Sum), Any(Any)) import Control.Lens (to, clonePrism, review, view, foldMapOf) import Data.Attoparsec.ByteString import Data.Attoparsec.ByteString.Char8 (char8) import Data.ByteString.Lens (bytes) import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Builder as Builder import qualified Data.CaseInsensitive as CI import qualified Data.Text as T import qualified Data.Text.Encoding as T import Data.MIME.Charset import Data.MIME.Error (EncodingError) import Data.MIME.TransferEncoding import Data.MIME.Base64 import Data.MIME.QuotedPrintable import Data.IMF.Syntax (ci, takeTillString) {-# ANN module ("HLint: ignore Eta reduce" :: String) #-} data EncodedWord = EncodedWord { _encodedWordCharset :: CI.CI B.ByteString , _encodedWordLanguage :: Maybe (CI.CI B.ByteString) , _encodedWordEncoding :: CI.CI B.ByteString , _encodedWordText :: B.ByteString } instance HasTransferEncoding EncodedWord where type TransferDecoded EncodedWord = TransferDecodedEncodedWord transferEncodingName = to _encodedWordEncoding transferEncodedData = to _encodedWordText transferDecoded = to $ \a@(EncodedWord charset lang _ _) -> TransferDecodedEncodedWord charset lang <$> view transferDecodedBytes a transferEncode = transferEncodeEncodedWord data TransferDecodedEncodedWord = TransferDecodedEncodedWord { _transDecWordCharset :: CI.CI B.ByteString , _transDecWordLanguage :: Maybe (CI.CI B.ByteString) , _transDecWordText :: B.ByteString } -- | 'charsetEncode' uses UTF-8, but sets the charset to @us-ascii@ -- if the text only contains ASCII characters. No language is set -- upon encoding. -- instance HasCharset TransferDecodedEncodedWord where type Decoded TransferDecodedEncodedWord = T.Text charsetName = to (pure . _transDecWordCharset) charsetData = to _transDecWordText charsetDecoded charsets = charsetText charsets charsetEncode s = let bs = T.encodeUtf8 s charset = if B.all (< 0x80) bs then "us-ascii" else "utf-8" in TransferDecodedEncodedWord charset Nothing bs -- NOTE: may not be > 75 chars long -- -- NOTE: DOES NOT PARSE THE LEADING "=?" -- encodedWord :: Parser EncodedWord encodedWord = EncodedWord <$> ci token <*> optional (char8 '*' *> ci token) <*> (qmark *> ci token) <*> (qmark *> encodedText <* qmark <* eq) where eq = char8 '=' qmark = char8 '?' token = takeWhile1 (\c -> c >= 33 && c <= 126 && notInClass "()<>@,;:\\\"/[]?.=*" c) -- any printable ascii char other than ? or SPACE encodedText = takeWhile1 (\c -> (c >= 33 && c < 63) || (c > 63 && c <= 127)) serialiseEncodedWord :: EncodedWord -> B.ByteString serialiseEncodedWord = L.toStrict . Builder.toLazyByteString . buildEncodedWord buildEncodedWord :: EncodedWord -> Builder.Builder buildEncodedWord (EncodedWord charset lang enc s) = "=?" <> Builder.byteString (CI.original charset) <> maybe "" (\l -> "*" <> Builder.byteString (CI.original l)) lang <> "?" <> Builder.byteString (CI.original enc) <> "?" <> Builder.byteString s <> "?=" transferEncodeEncodedWord :: TransferDecodedEncodedWord -> EncodedWord transferEncodeEncodedWord (TransferDecodedEncodedWord charset lang s) = let (enc, p) = fromMaybe ("Q", q) (chooseEncodedWordEncoding s) in EncodedWord charset lang enc (review (clonePrism p) s) -- | RFC 2047 and RFC 2231 define the /encoded-words/ mechanism for -- embedding non-ASCII data in headers. This function locates -- encoded-words and decodes them. -- -- @ -- λ> T.putStrLn $ decodeEncodedWords defaultCharsets "hello =?utf-8?B?5LiW55WM?=!" -- hello 世界! -- @ -- -- If parsing fails or the encoding is unrecognised the encoded-word -- is left unchanged in the result. -- -- @ -- λ> T.putStrLn $ decodeEncodedWords defaultCharsets "=?utf-8?B?bogus?=" -- =?utf-8?B?bogus?= -- -- λ> T.putStrLn $ decodeEncodedWords defaultCharsets "=?utf-8?X?unrecognised_encoding?=" -- =?utf-8?X?unrecognised_encoding?= -- @ -- -- Language specification is supported (the datum is discarded). -- -- @ -- λ> T.putStrLn $ decodeEncodedWords defaultCharsets "=?utf-8*es?Q?hola_mundo!?=" -- hola mundo! -- @ -- decodeEncodedWords :: CharsetLookup -> B.ByteString -> T.Text decodeEncodedWords charsets s = either (const (decodeLenient s)) (foldMap conv) (parseOnly tokens s) where tokens :: Parser [Either B.ByteString EncodedWord] tokens = liftA2 (:) (Left <$> takeTillString "=?") more <|> ((:[]) . Left <$> takeByteString) more = liftA2 (:) (Right <$> encodedWord <|> pure (Left "=?")) tokens conv = either decodeLenient (decodeEncodedWord charsets) -- | Decode an 'EncodedWord'. If transfer or charset decoding fails, -- returns the serialised encoded word. decodeEncodedWord :: CharsetLookup -> EncodedWord -> T.Text decodeEncodedWord charsets w = either (decodeLenient . const (serialiseEncodedWord w) :: EncodingError -> T.Text) id (view transferDecoded w >>= view (charsetDecoded charsets)) -- | This function encodes necessary parts of some text -- -- Currently turns the whole text into a single encoded word, if necessary. encodeEncodedWords :: T.Text -> B.ByteString encodeEncodedWords t = maybe utf8 (const ew) (chooseEncodedWordEncoding utf8) where ew = B.intercalate " " . fmap encOrNot . B.split 32 $ utf8 encOrNot s = maybe s (g s) (chooseEncodedWordEncoding s) g s (enc, p) = serialiseEncodedWord $ EncodedWord "utf-8" Nothing enc (review (clonePrism p) s) utf8 = T.encodeUtf8 t chooseEncodedWordEncoding :: B.ByteString -> Maybe (TransferEncodingName, TransferEncoding) chooseEncodedWordEncoding s | not doEnc = Nothing | nQP < nB64 = Just ("Q", q) | otherwise = Just ("B", b) where -- https://tools.ietf.org/html/rfc5322#section-3.5 'text' needEnc c = c > 127 || c == 0 qpBytes c | encodingRequiredNonEOL Q c = 3 | otherwise = 1 (Any doEnc, Sum nQP) = foldMapOf bytes (\c -> (Any (needEnc c), Sum (qpBytes c))) s nB64 = ((B.length s + 2) `div` 3) * 4