-- This file is part of purebred-email
-- Copyright (C) 2018-2021  Fraser Tweedale
--
-- purebred-email is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU Affero General Public License for more details.
--
-- You should have received a copy of the GNU Affero General Public License
-- along with this program.  If not, see <http://www.gnu.org/licenses/>.

{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}

{- |

MIME character sets.

Recognised charsets:

* @us-ascii@ / @iso646-us@
* @utf-8@
* @iso-8859-1@

See also the <https://github.com/purebred-mua/purebred-icu purebred-icu>
plugin, which adds support for many character sets.

-}
module Data.MIME.Charset
  (
    HasCharset(..)
  , CharsetName
  , charsetText
  , charsetText'
  , CharsetError(..)
  , AsCharsetError(..)
  , charsetPrism

  , CharsetLookup
  , defaultCharsets

  , decodeLenient
  ) where

import Control.Lens
import qualified Data.ByteString as B
import qualified Data.CaseInsensitive as CI
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Encoding.Error as T

{-# ANN module ("HLint: ignore Use camelCase" :: String) #-}

type CharsetName = CI.CI B.ByteString
type Charset = B.ByteString -> T.Text  -- eventually we might want a prism

data CharsetError
  = CharsetUnspecified
  | CharsetUnsupported CharsetName
  | CharsetDecodeError CharsetName
  deriving (Int -> CharsetError -> ShowS
[CharsetError] -> ShowS
CharsetError -> String
(Int -> CharsetError -> ShowS)
-> (CharsetError -> String)
-> ([CharsetError] -> ShowS)
-> Show CharsetError
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [CharsetError] -> ShowS
$cshowList :: [CharsetError] -> ShowS
show :: CharsetError -> String
$cshow :: CharsetError -> String
showsPrec :: Int -> CharsetError -> ShowS
$cshowsPrec :: Int -> CharsetError -> ShowS
Show)

class AsCharsetError s where
  _CharsetError :: Prism' s CharsetError
  _CharsetUnspecified :: Prism' s ()
  _CharsetUnsupported :: Prism' s CharsetName
  _CharsetDecodeError :: Prism' s CharsetName

  _CharsetUnspecified = p CharsetError (f CharsetError) -> p s (f s)
forall s. AsCharsetError s => Prism' s CharsetError
_CharsetError (p CharsetError (f CharsetError) -> p s (f s))
-> (p () (f ()) -> p CharsetError (f CharsetError))
-> p () (f ())
-> p s (f s)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. p () (f ()) -> p CharsetError (f CharsetError)
forall s. AsCharsetError s => Prism' s ()
_CharsetUnspecified
  _CharsetUnsupported = p CharsetError (f CharsetError) -> p s (f s)
forall s. AsCharsetError s => Prism' s CharsetError
_CharsetError (p CharsetError (f CharsetError) -> p s (f s))
-> (p CharsetName (f CharsetName)
    -> p CharsetError (f CharsetError))
-> p CharsetName (f CharsetName)
-> p s (f s)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. p CharsetName (f CharsetName) -> p CharsetError (f CharsetError)
forall s. AsCharsetError s => Prism' s CharsetName
_CharsetUnsupported
  _CharsetDecodeError = p CharsetError (f CharsetError) -> p s (f s)
forall s. AsCharsetError s => Prism' s CharsetError
_CharsetError (p CharsetError (f CharsetError) -> p s (f s))
-> (p CharsetName (f CharsetName)
    -> p CharsetError (f CharsetError))
-> p CharsetName (f CharsetName)
-> p s (f s)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. p CharsetName (f CharsetName) -> p CharsetError (f CharsetError)
forall s. AsCharsetError s => Prism' s CharsetName
_CharsetDecodeError

instance AsCharsetError CharsetError where
  _CharsetError :: p CharsetError (f CharsetError) -> p CharsetError (f CharsetError)
_CharsetError = p CharsetError (f CharsetError) -> p CharsetError (f CharsetError)
forall a. a -> a
id
  _CharsetUnspecified :: p () (f ()) -> p CharsetError (f CharsetError)
_CharsetUnspecified = (() -> CharsetError)
-> (CharsetError -> Maybe ()) -> Prism' CharsetError ()
forall b s a. (b -> s) -> (s -> Maybe a) -> Prism s s a b
prism' (CharsetError -> () -> CharsetError
forall a b. a -> b -> a
const CharsetError
CharsetUnspecified) ((CharsetError -> Maybe ()) -> Prism' CharsetError ())
-> (CharsetError -> Maybe ()) -> Prism' CharsetError ()
forall a b. (a -> b) -> a -> b
$ \case
      CharsetError
CharsetUnspecified -> () -> Maybe ()
forall a. a -> Maybe a
Just () ; CharsetError
_ -> Maybe ()
forall a. Maybe a
Nothing
  _CharsetUnsupported :: p CharsetName (f CharsetName) -> p CharsetError (f CharsetError)
_CharsetUnsupported = (CharsetName -> CharsetError)
-> (CharsetError -> Maybe CharsetName)
-> Prism' CharsetError CharsetName
forall b s a. (b -> s) -> (s -> Maybe a) -> Prism s s a b
prism' CharsetName -> CharsetError
CharsetUnsupported ((CharsetError -> Maybe CharsetName)
 -> Prism' CharsetError CharsetName)
-> (CharsetError -> Maybe CharsetName)
-> Prism' CharsetError CharsetName
forall a b. (a -> b) -> a -> b
$ \case
      CharsetUnsupported CharsetName
k -> CharsetName -> Maybe CharsetName
forall a. a -> Maybe a
Just CharsetName
k ; CharsetError
_ -> Maybe CharsetName
forall a. Maybe a
Nothing
  _CharsetDecodeError :: p CharsetName (f CharsetName) -> p CharsetError (f CharsetError)
_CharsetDecodeError = (CharsetName -> CharsetError)
-> (CharsetError -> Maybe CharsetName)
-> Prism' CharsetError CharsetName
forall b s a. (b -> s) -> (s -> Maybe a) -> Prism s s a b
prism' CharsetName -> CharsetError
CharsetDecodeError ((CharsetError -> Maybe CharsetName)
 -> Prism' CharsetError CharsetName)
-> (CharsetError -> Maybe CharsetName)
-> Prism' CharsetError CharsetName
forall a b. (a -> b) -> a -> b
$ \case
      CharsetDecodeError CharsetName
k -> CharsetName -> Maybe CharsetName
forall a. a -> Maybe a
Just CharsetName
k ; CharsetError
_ -> Maybe CharsetName
forall a. Maybe a
Nothing


class HasCharset a where
  type Decoded a

  -- | Get the declared (or default) character set name.  There is no guarantee
  -- that it corresponds to a registered or supported charset.
  charsetName :: Getter a (Maybe CharsetName)

  -- | Return the encoded data in the structure
  charsetData :: Getter a B.ByteString

  -- | Structure with the encoded data replaced with 'Text'
  charsetDecoded
    :: AsCharsetError e
    => CharsetLookup
    -> ( forall p f. (Profunctor p, Contravariant f)
        => Optic' p f a (Either e (Decoded a)) )

  -- | Structure with the encoded data replaced with 'Text' (monomorphic error type)
  charsetDecoded'
    :: CharsetLookup
    -> ( forall p f. (Profunctor p, Contravariant f)
        => Optic' p f a (Either CharsetError (Decoded a)) )
  charsetDecoded' = CharsetLookup -> Optic' p f a (Either CharsetError (Decoded a))
forall a e.
(HasCharset a, AsCharsetError e) =>
CharsetLookup
-> forall (p :: * -> * -> *) (f :: * -> *).
   (Profunctor p, Contravariant f) =>
   Optic' p f a (Either e (Decoded a))
charsetDecoded

  -- | Encode the data
  charsetEncode :: Decoded a -> a


-- | Decode the object according to the declared charset.
charsetText
  :: (HasCharset a, AsCharsetError e, Profunctor p, Contravariant f)
  => CharsetLookup -> Optic' p f a (Either e T.Text)
charsetText :: CharsetLookup -> Optic' p f a (Either e Text)
charsetText CharsetLookup
lookupCharset = (a -> Either e Text) -> Optic' p f a (Either e Text)
forall (p :: * -> * -> *) (f :: * -> *) s a.
(Profunctor p, Contravariant f) =>
(s -> a) -> Optic' p f s a
to ((a -> Either e Text) -> Optic' p f a (Either e Text))
-> (a -> Either e Text) -> Optic' p f a (Either e Text)
forall a b. (a -> b) -> a -> b
$ \a
a ->
  Either e CharsetName
-> (CharsetName -> Either e CharsetName)
-> Maybe CharsetName
-> Either e CharsetName
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (e -> Either e CharsetName
forall a b. a -> Either a b
Left (e -> Either e CharsetName) -> e -> Either e CharsetName
forall a b. (a -> b) -> a -> b
$ AReview e () -> () -> e
forall b (m :: * -> *) t. MonadReader b m => AReview t b -> m t
review AReview e ()
forall s. AsCharsetError s => Prism' s ()
_CharsetUnspecified ()) CharsetName -> Either e CharsetName
forall a b. b -> Either a b
Right (Getting (Maybe CharsetName) a (Maybe CharsetName)
-> a -> Maybe CharsetName
forall s (m :: * -> *) a. MonadReader s m => Getting a s a -> m a
view Getting (Maybe CharsetName) a (Maybe CharsetName)
forall a. HasCharset a => Getter a (Maybe CharsetName)
charsetName a
a)
  Either e CharsetName
-> (CharsetName -> Either e Text) -> Either e Text
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \CharsetName
k -> Either e Charset
-> (Charset -> Either e Charset)
-> Maybe Charset
-> Either e Charset
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (e -> Either e Charset
forall a b. a -> Either a b
Left (e -> Either e Charset) -> e -> Either e Charset
forall a b. (a -> b) -> a -> b
$ AReview e CharsetName -> CharsetName -> e
forall b (m :: * -> *) t. MonadReader b m => AReview t b -> m t
review AReview e CharsetName
forall s. AsCharsetError s => Prism' s CharsetName
_CharsetUnsupported CharsetName
k) Charset -> Either e Charset
forall a b. b -> Either a b
Right (CharsetLookup
lookupCharset CharsetName
k)
  Either e Charset -> (Charset -> Either e Text) -> Either e Text
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Charset
f -> Text -> Either e Text
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Charset
f (Getting ByteString a ByteString -> a -> ByteString
forall s (m :: * -> *) a. MonadReader s m => Getting a s a -> m a
view Getting ByteString a ByteString
forall a. HasCharset a => Getter a ByteString
charsetData a
a))

-- | Monomorphic in error type
charsetText'
  :: (HasCharset a, Profunctor p, Contravariant f)
  => CharsetLookup
  -> Optic' p f a (Either CharsetError T.Text)
charsetText' :: CharsetLookup -> Optic' p f a (Either CharsetError Text)
charsetText' = CharsetLookup -> Optic' p f a (Either CharsetError Text)
forall a e (p :: * -> * -> *) (f :: * -> *).
(HasCharset a, AsCharsetError e, Profunctor p, Contravariant f) =>
CharsetLookup -> Optic' p f a (Either e Text)
charsetText

-- | Prism for charset decoded/encoded data.
-- Information about decoding failures is discarded.
charsetPrism :: forall a. (HasCharset a) => CharsetLookup -> Prism' a (Decoded a)
charsetPrism :: CharsetLookup -> Prism' a (Decoded a)
charsetPrism CharsetLookup
m = (Decoded a -> a)
-> (a -> Maybe (Decoded a)) -> Prism' a (Decoded a)
forall b s a. (b -> s) -> (s -> Maybe a) -> Prism s s a b
prism' Decoded a -> a
forall a. HasCharset a => Decoded a -> a
charsetEncode ((CharsetError -> Maybe (Decoded a))
-> (Decoded a -> Maybe (Decoded a))
-> Either CharsetError (Decoded a)
-> Maybe (Decoded a)
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either CharsetError -> Maybe (Decoded a)
forall x. CharsetError -> Maybe x
err Decoded a -> Maybe (Decoded a)
forall a. a -> Maybe a
Just (Either CharsetError (Decoded a) -> Maybe (Decoded a))
-> (a -> Either CharsetError (Decoded a)) -> a -> Maybe (Decoded a)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Getting
  (Either CharsetError (Decoded a))
  a
  (Either CharsetError (Decoded a))
-> a -> Either CharsetError (Decoded a)
forall s (m :: * -> *) a. MonadReader s m => Getting a s a -> m a
view (CharsetLookup
-> forall (p :: * -> * -> *) (f :: * -> *).
   (Profunctor p, Contravariant f) =>
   Optic' p f a (Either CharsetError (Decoded a))
forall a e.
(HasCharset a, AsCharsetError e) =>
CharsetLookup
-> forall (p :: * -> * -> *) (f :: * -> *).
   (Profunctor p, Contravariant f) =>
   Optic' p f a (Either e (Decoded a))
charsetDecoded CharsetLookup
m))
  where
  err :: CharsetError -> Maybe x
err = Maybe x -> CharsetError -> Maybe x
forall a b. a -> b -> a
const Maybe x
forall a. Maybe a
Nothing :: CharsetError -> Maybe x

charsets :: [(CI.CI B.ByteString, Charset)]
charsets :: [(CharsetName, Charset)]
charsets =
  [ (CharsetName
"us-ascii", Charset
us_ascii)
  , (CharsetName
"utf-8", Charset
utf_8)
  , (CharsetName
"iso-8859-1", Charset
iso_8859_1)

  -- us-ascii aliases
  , (CharsetName
"ISO646-US", Charset
us_ascii)
  , (CharsetName
"ANSI_X3.4-1968", Charset
us_ascii)
  , (CharsetName
"iso-ir-6", Charset
us_ascii)
  , (CharsetName
"ANSI_X3.4-1986", Charset
us_ascii)
  , (CharsetName
"ISO_646.irv:1991", Charset
us_ascii)
  , (CharsetName
"us", Charset
us_ascii)
  , (CharsetName
"IBM367", Charset
us_ascii)
  , (CharsetName
"cp367", Charset
us_ascii)
  , (CharsetName
"csASCII.4-1968", Charset
us_ascii)

  -- iso-8859-1 aliases
  , (CharsetName
"iso-ir-100", Charset
iso_8859_1)
  , (CharsetName
"ISO_8859-1", Charset
iso_8859_1)
  , (CharsetName
"latin1", Charset
iso_8859_1)
  , (CharsetName
"l1", Charset
iso_8859_1)
  , (CharsetName
"IBM819", Charset
iso_8859_1)
  , (CharsetName
"CP819", Charset
iso_8859_1)
  , (CharsetName
"csISOLatin1", Charset
iso_8859_1)

  -- utf-8 aliases
  , (CharsetName
"csUTF8", Charset
utf_8)

  -- utf-16
  , (CharsetName
"UTF-16BE", Charset
utf16be)
  , (CharsetName
"UTF-16LE", Charset
utf16le)
  , (CharsetName
"UTF-16", Charset
utf16)
  , (CharsetName
"csUTF16BE", Charset
utf16be)
  , (CharsetName
"csUTF16LE", Charset
utf16le)
  , (CharsetName
"csUTF16", Charset
utf16)

  -- utf-32
  , (CharsetName
"UTF-32BE", Charset
utf32be)
  , (CharsetName
"UTF-32LE", Charset
utf32le)
  , (CharsetName
"UTF-32", Charset
utf32)
  , (CharsetName
"csUTF32BE", Charset
utf32be)
  , (CharsetName
"csUTF32LE", Charset
utf32le)
  , (CharsetName
"csUTF32", Charset
utf32)

  -- Other charsets I observed in my mail corpus.
  -- , ("iso-8859-2", ...)
  -- , ("iso-8859-15", ...)
  -- , ("iso-2022-jp", ...)    (common)
  -- , ("windows-1252", ...)   (common)
  -- , ("windows-1256", ...)
  -- , ("cp1252", ...)         (alias of windows-1252)
  -- , ("big5", ...)           (common)
  -- , ("euc-kr", ...)
  -- , ("cp932", ...)
  -- , ("gb2312", ...)         (Chinese)
  ]

us_ascii, utf_8, iso_8859_1 :: Charset
us_ascii :: Charset
us_ascii = Charset
decodeLenient
utf_8 :: Charset
utf_8 = Charset
decodeLenient
iso_8859_1 :: Charset
iso_8859_1 = Charset
T.decodeLatin1

utf16be, utf16le, utf16 :: Charset
utf16be :: Charset
utf16be = OnDecodeError -> Charset
T.decodeUtf16BEWith OnDecodeError
T.lenientDecode
utf16le :: Charset
utf16le = OnDecodeError -> Charset
T.decodeUtf16LEWith OnDecodeError
T.lenientDecode
-- https://www.rfc-editor.org/rfc/rfc2781.html#section-4.3
utf16 :: Charset
utf16 ByteString
b = case Int -> ByteString -> ByteString
B.take Int
2 ByteString
b of
  ByteString
"\xff\xfe"  -> Charset
utf16le ByteString
b
  ByteString
_           -> Charset
utf16be ByteString
b

utf32be, utf32le, utf32 :: Charset
utf32be :: Charset
utf32be = OnDecodeError -> Charset
T.decodeUtf32BEWith OnDecodeError
T.lenientDecode
utf32le :: Charset
utf32le = OnDecodeError -> Charset
T.decodeUtf32LEWith OnDecodeError
T.lenientDecode
-- Unicode 4.0, Section 3.10, D45
-- https://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G7404
utf32 :: Charset
utf32 ByteString
b = case Int -> ByteString -> ByteString
B.take Int
4 ByteString
b of
  ByteString
"\xff\xfe\x00\x00"  -> Charset
utf32le ByteString
b
  ByteString
_                   -> Charset
utf32be ByteString
b


type CharsetLookup = CI.CI B.ByteString -> Maybe Charset

-- | Supports US-ASCII, UTF-8 and ISO-8859-1, UTF-16[BE|LE]
-- and UTF-32[BE|LE].  The /purebred-icu/ package provides
-- support for more charsets.
--
defaultCharsets :: CharsetLookup
defaultCharsets :: CharsetLookup
defaultCharsets CharsetName
k = CharsetName -> [(CharsetName, Charset)] -> Maybe Charset
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup CharsetName
k [(CharsetName, Charset)]
charsets

-- | Decode as UTF-8, replacing invalid sequences with placeholders.
--
decodeLenient :: B.ByteString -> T.Text
decodeLenient :: Charset
decodeLenient = OnDecodeError -> Charset
T.decodeUtf8With OnDecodeError
T.lenientDecode