-- This file is part of purebred-email -- Copyright (C) 2018-2021 Fraser Tweedale -- -- purebred-email is free software: you can redistribute it and/or modify -- it under the terms of the GNU Affero General Public License as published by -- the Free Software Foundation, either version 3 of the License, or -- (at your option) any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU Affero General Public License for more details. -- -- You should have received a copy of the GNU Affero General Public License -- along with this program. If not, see . {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {- | MIME character sets. Recognised charsets: * @us-ascii@ / @iso646-us@ * @utf-8@ * @iso-8859-1@ See also the plugin, which adds support for many character sets. -} module Data.MIME.Charset ( HasCharset(..) , CharsetName , charsetText , charsetText' , CharsetError(..) , AsCharsetError(..) , charsetPrism , CharsetLookup , defaultCharsets , decodeLenient ) where import Control.Lens import qualified Data.ByteString as B import qualified Data.CaseInsensitive as CI import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.Encoding.Error as T {-# ANN module ("HLint: ignore Use camelCase" :: String) #-} type CharsetName = CI.CI B.ByteString type Charset = B.ByteString -> T.Text -- eventually we might want a prism data CharsetError = CharsetUnspecified | CharsetUnsupported CharsetName | CharsetDecodeError CharsetName deriving (Show) class AsCharsetError s where _CharsetError :: Prism' s CharsetError _CharsetUnspecified :: Prism' s () _CharsetUnsupported :: Prism' s CharsetName _CharsetDecodeError :: Prism' s CharsetName _CharsetUnspecified = _CharsetError . _CharsetUnspecified _CharsetUnsupported = _CharsetError . _CharsetUnsupported _CharsetDecodeError = _CharsetError . _CharsetDecodeError instance AsCharsetError CharsetError where _CharsetError = id _CharsetUnspecified = prism' (const CharsetUnspecified) $ \case CharsetUnspecified -> Just () ; _ -> Nothing _CharsetUnsupported = prism' CharsetUnsupported $ \case CharsetUnsupported k -> Just k ; _ -> Nothing _CharsetDecodeError = prism' CharsetDecodeError $ \case CharsetDecodeError k -> Just k ; _ -> Nothing class HasCharset a where type Decoded a -- | Get the declared (or default) character set name. There is no guarantee -- that it corresponds to a registered or supported charset. charsetName :: Getter a (Maybe CharsetName) -- | Return the encoded data in the structure charsetData :: Getter a B.ByteString -- | Structure with the encoded data replaced with 'Text' charsetDecoded :: AsCharsetError e => CharsetLookup -> ( forall p f. (Profunctor p, Contravariant f) => Optic' p f a (Either e (Decoded a)) ) -- | Structure with the encoded data replaced with 'Text' (monomorphic error type) charsetDecoded' :: CharsetLookup -> ( forall p f. (Profunctor p, Contravariant f) => Optic' p f a (Either CharsetError (Decoded a)) ) charsetDecoded' = charsetDecoded -- | Encode the data charsetEncode :: Decoded a -> a -- | Decode the object according to the declared charset. charsetText :: (HasCharset a, AsCharsetError e, Profunctor p, Contravariant f) => CharsetLookup -> Optic' p f a (Either e T.Text) charsetText lookupCharset = to $ \a -> maybe (Left $ review _CharsetUnspecified ()) Right (view charsetName a) >>= \k -> maybe (Left $ review _CharsetUnsupported k) Right (lookupCharset k) >>= \f -> pure (f (view charsetData a)) -- | Monomorphic in error type charsetText' :: (HasCharset a, Profunctor p, Contravariant f) => CharsetLookup -> Optic' p f a (Either CharsetError T.Text) charsetText' = charsetText -- | Prism for charset decoded/encoded data. -- Information about decoding failures is discarded. charsetPrism :: forall a. (HasCharset a) => CharsetLookup -> Prism' a (Decoded a) charsetPrism m = prism' charsetEncode (either err Just . view (charsetDecoded m)) where err = const Nothing :: CharsetError -> Maybe x charsets :: [(CI.CI B.ByteString, Charset)] charsets = [ ("us-ascii", us_ascii) , ("utf-8", utf_8) , ("iso-8859-1", iso_8859_1) -- us-ascii aliases , ("ISO646-US", us_ascii) , ("ANSI_X3.4-1968", us_ascii) , ("iso-ir-6", us_ascii) , ("ANSI_X3.4-1986", us_ascii) , ("ISO_646.irv:1991", us_ascii) , ("us", us_ascii) , ("IBM367", us_ascii) , ("cp367", us_ascii) , ("csASCII.4-1968", us_ascii) , ("ascii", us_ascii) -- https://github.com/purebred-mua/purebred-email/issues/69 -- iso-8859-1 aliases , ("iso-ir-100", iso_8859_1) , ("ISO_8859-1", iso_8859_1) , ("latin1", iso_8859_1) , ("l1", iso_8859_1) , ("IBM819", iso_8859_1) , ("CP819", iso_8859_1) , ("csISOLatin1", iso_8859_1) -- utf-8 aliases , ("csUTF8", utf_8) -- utf-16 , ("UTF-16BE", utf16be) , ("UTF-16LE", utf16le) , ("UTF-16", utf16) , ("csUTF16BE", utf16be) , ("csUTF16LE", utf16le) , ("csUTF16", utf16) -- utf-32 , ("UTF-32BE", utf32be) , ("UTF-32LE", utf32le) , ("UTF-32", utf32) , ("csUTF32BE", utf32be) , ("csUTF32LE", utf32le) , ("csUTF32", utf32) -- Other charsets I observed in my mail corpus. -- , ("iso-8859-2", ...) -- , ("iso-8859-15", ...) -- , ("iso-2022-jp", ...) (common) -- , ("windows-1252", ...) (common) -- , ("windows-1256", ...) -- , ("cp1252", ...) (alias of windows-1252) -- , ("big5", ...) (common) -- , ("euc-kr", ...) -- , ("cp932", ...) -- , ("gb2312", ...) (Chinese) ] us_ascii, utf_8, iso_8859_1 :: Charset us_ascii = decodeLenient utf_8 = decodeLenient iso_8859_1 = T.decodeLatin1 utf16be, utf16le, utf16 :: Charset utf16be = T.decodeUtf16BEWith T.lenientDecode utf16le = T.decodeUtf16LEWith T.lenientDecode -- https://www.rfc-editor.org/rfc/rfc2781.html#section-4.3 utf16 b = case B.take 2 b of "\xff\xfe" -> utf16le b _ -> utf16be b utf32be, utf32le, utf32 :: Charset utf32be = T.decodeUtf32BEWith T.lenientDecode utf32le = T.decodeUtf32LEWith T.lenientDecode -- Unicode 4.0, Section 3.10, D45 -- https://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G7404 utf32 b = case B.take 4 b of "\xff\xfe\x00\x00" -> utf32le b _ -> utf32be b type CharsetLookup = CI.CI B.ByteString -> Maybe Charset -- | Supports US-ASCII, UTF-8 and ISO-8859-1, UTF-16[BE|LE] -- and UTF-32[BE|LE]. The /purebred-icu/ package provides -- support for more charsets. -- defaultCharsets :: CharsetLookup defaultCharsets k = lookup k charsets -- | Decode as UTF-8, replacing invalid sequences with placeholders. -- decodeLenient :: B.ByteString -> T.Text decodeLenient = T.decodeUtf8With T.lenientDecode