{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeFamilies #-}

{- |

@encoded-word@s for representing non-7bit ASCII data in headers
(RFC 2047 and RFC 2231).

-}
module Data.MIME.EncodedWord
  (
    decodeEncodedWords
  ) where

import Control.Applicative ((<|>), liftA2, optional)
import Data.Bifunctor (first)
import Data.Semigroup ((<>))

import Control.Lens (to, clonePrism, review, view)
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (char8)
import qualified Data.ByteString as B
import qualified Data.CaseInsensitive as CI
import qualified Data.Text as T
import qualified Data.Text.Encoding as T

import Data.MIME.Charset
import Data.MIME.TransferEncoding
import Data.MIME.QuotedPrintable
import Data.RFC5322.Internal

data EncodedWord = EncodedWord
  { _encodedWordCharset :: CI.CI B.ByteString
  , _encodedWordLanguage :: Maybe (CI.CI B.ByteString)
  , _encodedWordEncoding :: CI.CI B.ByteString
  , _encodedWordText :: B.ByteString
  }

instance HasTransferEncoding EncodedWord where
  type TransferDecoded EncodedWord = TransferDecodedEncodedWord
  transferEncodingName = to _encodedWordEncoding
  transferEncodedData = to _encodedWordText
  transferDecoded = to $ \a@(EncodedWord charset lang _ _) ->
    TransferDecodedEncodedWord charset lang <$> view transferDecodedBytes a


data TransferDecodedEncodedWord = TransferDecodedEncodedWord
  { _transDecWordCharset :: CI.CI B.ByteString
  , _transDecWordLanguage :: Maybe (CI.CI B.ByteString)
  , _transDecWordText :: B.ByteString
  }

-- | 'charsetEncode' uses UTF-8, but sets the charset to @us-ascii@
-- if the text only contains ASCII characters.  No language is set
-- upon encoding.
--
instance HasCharset TransferDecodedEncodedWord where
  type Decoded TransferDecodedEncodedWord = T.Text
  charsetName = to (pure . _transDecWordCharset)
  charsetData = to _transDecWordText
  charsetDecoded = charsetText

  charsetEncode s =
    let
      bs = T.encodeUtf8 s
      charset = if B.all (< 0x80) bs then "us-ascii" else "utf-8"
    in TransferDecodedEncodedWord charset Nothing bs

-- NOTE: may not be > 75 chars long
--
-- NOTE: DOES NOT PARSE THE LEADING "=?"
--
encodedWord :: Parser EncodedWord
encodedWord =
  EncodedWord
  <$> ci token
  <*> optional (char8 '*' *> ci token)
  <*> (qmark *> ci token)
  <*> (qmark *> encodedText <* qmark <* eq)
  where
    eq = char8 '='
    qmark = char8 '?'
    token = takeWhile1 (\c -> c >= 33 && c <= 126 && notInClass "()<>@,;:\\\"/[]?.=*" c)

    -- any printable ascii char other than ? or SPACE
    encodedText = takeWhile1 (\c -> (c >= 33 && c < 63) || (c > 63 && c <= 127))

serialiseEncodedWord :: EncodedWord -> B.ByteString
serialiseEncodedWord (EncodedWord charset lang enc s) =
  "=?" <> CI.original charset
  <> maybe "" (\l -> "*" <> CI.original l) lang
  <> "?" <> CI.original enc <> 
  "?" <> s <> "?="

transferEncodeEncodedWord :: TransferDecodedEncodedWord -> EncodedWord
transferEncodeEncodedWord (TransferDecodedEncodedWord charset lang s) =
  EncodedWord charset lang "Q" (review (clonePrism q) s)

-- | RFC 2047 and RFC 2231 define the /encoded-words/ mechanism for
-- embedding non-ASCII data in headers.  This function locates
-- encoded-words and decodes them.
--
-- @
-- λ> T.putStrLn $ decodeEncodedWords defaultCharsets "hello =?utf-8?B?5LiW55WM?=!"
-- hello 世界!
-- @
--
-- If parsing fails or the encoding is unrecognised the encoded-word
-- is left unchanged in the result.
--
-- @
-- λ> T.putStrLn $ decodeEncodedWords defaultCharsets "=?utf-8?B?bogus?="
-- =?utf-8?B?bogus?=
--
-- λ> T.putStrLn $ decodeEncodedWords defaultCharsets "=?utf-8?X?unrecognised_encoding?="
-- =?utf-8?X?unrecognised_encoding?=
-- @
--
-- Language specification is supported (the datum is discarded).
--
-- @
-- λ> T.putStrLn $ decodeEncodedWords defaultCharsets "=?utf-8*es?Q?hola_mundo!?="
-- hola mundo!
-- @
--
decodeEncodedWords :: CharsetLookup -> B.ByteString -> T.Text
decodeEncodedWords charsets s =
  either (const $ decodeLenient s) merge $ fmap (g . f) <$> parseOnly tokens s
  where
    -- parse the ByteString into a series of tokens
    -- of either raw ASCII text, or EncodedWords
    tokens = liftA2 (:) (Left <$> takeTillString "=?") more
          <|> ((:[]) . Left <$> takeByteString)
    more = liftA2 (:) (Right <$> encodedWord <|> pure (Left "=?")) tokens

    f (Left t) = Left t
    f (Right w) = first
      (const $ serialiseEncodedWord w :: TransferEncodingError -> B.ByteString)
      (view transferDecoded w)
    g (Left t) = Left t
    g (Right w) = first
      (const $ serialiseEncodedWord $ transferEncodeEncodedWord w :: CharsetError -> B.ByteString)
      (view (charsetDecoded charsets) w)

    merge = foldMap (either decodeLenient id)