-- This file is part of purebred-email
-- Copyright (C) 2018-2021 Fraser Tweedale
--
-- purebred-email is free software: you can redistribute it and/or modify
-- it under the terms of the GNU Affero General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU Affero General Public License for more details.
--
-- You should have received a copy of the GNU Affero General Public License
-- along with this program. If not, see .
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeFamilies #-}
{- |
MIME parameters, per RFC 2045 and RFC 2231.
RFC 2231 defines a mechanism for parameter continuations (for long
parameters), encoding of non-ASCII characters, and charset and
language annotation. The most common use of these capabilities is
in the @Content-Disposition@ header, for the @filename@ parameter.
This module provides types and functions for working with parameters.
-}
module Data.MIME.Parameter
(
Parameters(..)
, emptyParameters
, parameterList
, parameter
, rawParameter
, newParameter
, ParameterValue(..)
, EncodedParameterValue
, DecodedParameterValue
, value
, HasParameters(..)
) where
import Control.Applicative ((<|>), optional)
import Data.Foldable (fold)
import Data.Functor (($>))
import Data.Semigroup (Sum(..), Max(..))
import Data.String (IsString(..))
import Data.Word (Word8)
import Data.Void (Void)
import Foreign (withForeignPtr, plusPtr, minusPtr, peek, peekByteOff, poke)
import GHC.Generics (Generic)
import System.IO.Unsafe (unsafeDupablePerformIO)
import Control.DeepSeq (NFData)
import Control.Lens
import Control.Lens.Cons.Extras (recons)
import Data.Attoparsec.ByteString.Char8 hiding (take)
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as B
import qualified Data.ByteString.Char8 as C
import Data.CaseInsensitive (CI, foldedCase, mk, original)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.MIME.Charset
import Data.MIME.Internal
import Data.IMF.Syntax (ci, isQtext, isVchar)
type RawParameters = [(CI B.ByteString, B.ByteString)]
-- | Header parameters. Used for some headers including Content-Type
-- and Content-Disposition. This type handles parameter continuations
-- and optional charset and language information (RFC 2231).
--
newtype Parameters = Parameters [(CI B.ByteString, B.ByteString)]
deriving (Eq, Show, Generic, NFData)
instance Semigroup Parameters where
Parameters a <> Parameters b = Parameters (a <> b)
instance Monoid Parameters where
mempty = Parameters []
type instance Index Parameters = CI B.ByteString
type instance IxValue Parameters = EncodedParameterValue
paramiso :: Iso' Parameters [(CI B.ByteString, B.ByteString)]
paramiso = iso (\(Parameters raw) -> raw) Parameters
-- Traverses 0 or 1 instances of a parameter, which may consist of
-- one or more raw parameters.
instance Ixed Parameters where
ix k = paramiso . l
where
l f kv = case getParameter k kv of
Nothing -> pure kv
Just v -> (\v' -> setParam k v' kv) <$> f v
-- | Same as 'mempty', but useful where the type would otherwise be ambiguous.
emptyParameters :: Parameters
emptyParameters = mempty
-- | Set the parameter (which may need to use the parameter
-- continuation mechanism).
setParam :: CI B.ByteString -> EncodedParameterValue -> RawParameters -> RawParameters
setParam k v = (renderParam k v <>) . deleteParam k
-- | Turn a ParameterValue into a list of raw parameters
--
-- FIXME: currently does not do continutations etc.
-- 'ParameterValue' value is used as-is.
renderParam :: CI B.ByteString -> EncodedParameterValue -> [(CI B.ByteString, B.ByteString)]
renderParam k pv = case pv of
ParameterValue Nothing Nothing v -> case extEncode minBound v of
(Plain, v') -> [(k, v')]
(Quoted, v') -> [(k, "\"" <> v' <> "\"")]
(Extended, v') -> [(k <> "*", "''" <> v')]
ParameterValue cs lang v ->
-- charset or lang has been specified; force extended syntax
[(k <> "*", f cs <> "'" <> f lang <> "'" <> snd (extEncode Extended v))]
where
f = maybe "" original
-- | Delete all raw keys that are "part of" the extended/continued
-- parameter.
deleteParam :: CI B.ByteString -> RawParameters -> RawParameters
deleteParam k = filter (not . test . fst)
where
test x =
x == k
|| (foldedCase k <> "*") `B.isPrefixOf` foldedCase x
instance At Parameters where
at k = paramiso . l
where
l :: Lens' RawParameters (Maybe EncodedParameterValue)
l f kv =
let
g Nothing = deleteParam k kv
g (Just v) = (setParam k v . deleteParam k) kv
in
g <$> f (getParameter k kv)
data Continued = Continued | NotContinued
deriving (Show)
data Encoded = Encoded | NotEncoded
deriving (Show)
-- | Not percent-decoded. 'Encoded' indicates whether
-- percent-decoding is required. 'Continued' indicates whether
-- there are more sections to follow
--
data InitialSection = InitialSection Continued Encoded B.ByteString
deriving (Show)
-- | Not percent-decoded. 'Encoded' indicates whether
-- percent-decoding is required.
--
data OtherSection = OtherSection Encoded B.ByteString
deriving (Show)
initialSection
:: CI B.ByteString
-> RawParameters
-> Maybe InitialSection
initialSection k m =
InitialSection NotContinued NotEncoded <$> lookup k m
<|> InitialSection Continued NotEncoded <$> lookup (k <> "*0") m
<|> InitialSection NotContinued Encoded <$> lookup (k <> "*") m
<|> InitialSection Continued Encoded <$> lookup (k <> "*0*") m
otherSection
:: CI B.ByteString
-> Int
-> RawParameters
-> Maybe OtherSection
otherSection k i m =
OtherSection NotEncoded <$> lookup (k <> "*" <> i') m
<|> OtherSection Encoded <$> lookup (k <> "*" <> i' <> "*") m
where
i' = mk $ C.pack (show i)
data ParameterValue cs a = ParameterValue
(Maybe cs) -- charset
(Maybe (CI B.ByteString)) -- language
a -- value
deriving (Eq, Show, Generic, NFData)
type EncodedParameterValue = ParameterValue CharsetName B.ByteString
type DecodedParameterValue = ParameterValue Void T.Text
-- | Parameter value with no language.
instance IsString DecodedParameterValue where
fromString = ParameterValue Nothing Nothing . T.pack
-- | Parameter value with no language, encoded either in @us-ascii@
-- or @utf-8.
instance IsString EncodedParameterValue where
fromString = charsetEncode . fromString
value :: Lens (ParameterValue cs a) (ParameterValue cs b) a b
value f (ParameterValue a b c) = ParameterValue a b <$> f c
charset :: Lens (ParameterValue cs a) (ParameterValue cs' a) (Maybe cs) (Maybe cs')
charset f (ParameterValue a b c) = (\a' -> ParameterValue a' b c) <$> f a
-- | Convenience function to construct a parameter value.
-- If you need to to specify language, use the 'ParameterValue'
-- constructor directly.
--
newParameter :: Cons s s Char Char => s -> EncodedParameterValue
newParameter = charsetEncode . ParameterValue Nothing Nothing . view recons
-- | The default charset @us-ascii@ is implied by the abstract of
-- RFC 2231 which states: /This memo defines … a means to specify
-- parameter values in character sets other than US-ASCII/.
--
-- When encoding, 'utf-8' is always used, but if the whole string
-- contains only ASCII characters then the charset declaration is
-- omitted (so that it can be encoded as a non-extended parameter).
--
instance HasCharset EncodedParameterValue where
type Decoded EncodedParameterValue = DecodedParameterValue
charsetName = to $ \(ParameterValue name _ _) -> name <|> Just "us-ascii"
charsetData = value
charsetDecoded m = to $ \a -> (\t -> (set charset Nothing . set value t) a) <$> view (charsetText m) a
charsetEncode (ParameterValue _ lang s) =
let
bs = T.encodeUtf8 s
cs = if B.all (< 0x80) bs then Nothing else Just "utf-8"
in ParameterValue cs lang bs
getParameter :: CI B.ByteString -> RawParameters -> Maybe EncodedParameterValue
getParameter k m = do
InitialSection cont enc s <- initialSection k m
(cs, lang, v0) <-
either (const Nothing) Just $ parseOnly (parseInitialValue enc) s
let
sect0 = OtherSection enc v0
otherSects i = maybe [] (: otherSects (i + 1)) (otherSection k i m)
sects = case cont of
NotContinued -> [sect0]
Continued -> sect0 : otherSects 1
ParameterValue cs lang . fold <$> traverse decode sects
where
parseInitialValue NotEncoded =
(Nothing, Nothing, ) <$> takeByteString
parseInitialValue Encoded =
(,,) <$> charsetOrLang <*> charsetOrLang <*> takeByteString
charsetOrLang = optional (ci (takeWhile1 (/= '\''))) <* char8 '\''
decode (OtherSection enc s) = case enc of
NotEncoded -> pure s
Encoded -> decodePercent s
decodePercent :: B.ByteString -> Maybe B.ByteString
decodePercent (B.PS sfp soff slen) = unsafeDupablePerformIO $ do
-- Length of decoded string is not yet known, but it cannot be
-- longer than input, and is likely to be not much shorter.
-- Therefore allocate slen bytes and only use as much as we need.
dfp <- B.mallocByteString slen
result <- withForeignPtr dfp $ \dptr ->
withForeignPtr sfp $ \sptr -> do
let
slimit = sptr `plusPtr` (soff + slen)
fill !dp !sp
| sp >= slimit = pure $ Just (dp `minusPtr` dptr)
| otherwise = do
c <- peek sp
case (c :: Word8) of
37 {- % -}
| sp `plusPtr` 2 >= slimit -> pure Nothing
-- reached end of input during '=' decoding
| otherwise -> do
c1 <- peekByteOff sp 1
c2 <- peekByteOff sp 2
maybe
(pure Nothing) -- invalid hex sequence
(\(hi,lo) -> do
poke dp (hi * 16 + lo)
fill (dp `plusPtr` 1) (sp `plusPtr` 3) )
((,) <$> parseHex c1 <*> parseHex c2)
_ ->
poke dp c *> fill (dp `plusPtr` 1) (sp `plusPtr` 1)
fill dptr (sptr `plusPtr` soff)
pure $ B.PS dfp 0 <$> result
data ParameterEncoding = Plain | Quoted | Extended
deriving (Eq, Ord, Bounded)
-- | Given a requested encoding and a string, return an encoded
-- string along with the actual encoding used.
--
-- The requested encoding will be used when it is capable of
-- encoding the string, otherwise the first capable encoding
-- is used.
--
extEncode :: ParameterEncoding -> B.ByteString -> (ParameterEncoding, B.ByteString)
extEncode encReq s@(B.PS sfp soff slen) = (enc, d)
where
-- regular parameter:
-- value := token / quoted-string (RFC 2045)
-- token := 1*
-- tspecials := "(" / ")" / "<" / ">" / "@" /
-- "," / ";" / ":" / "\" / <">
-- "/" / "[" / "]" / "?" / "="
--
-- extended-parameter:
-- attribute-char :=
-- extended-other-values := *(ext-octet / attribute-char)
-- ext-octet := "%" 2(DIGIT / "A" / "B" / "C" / "D" / "E" / "F")
--
isTspecial = (`B.elem` "()<>@,;:\\\"/[]?=")
isAttrChar c = isVchar c && c `B.notElem` "*'%" && not (isTspecial c)
numEncChars c = if isAttrChar c then 1 else 3 -- conservative estimate of bytes
-- needed to encode char
charEncoding c
| isAttrChar c = Plain
| isVchar c || c == 0x20 || c == 0x09 = Quoted
| otherwise = Extended
charInfo c = (Sum (numEncChars c), Max (charEncoding c))
(Sum dlenMax, encCap) = foldMap charInfo $ B.unpack s
enc
| B.null s = Quoted -- Plain cannot encode empty string
| otherwise = getMax (Max encReq <> encCap)
-- poke the char (possibly encoded) and return updated dest ptr
poke' ptr c = case enc of
Plain -> poke ptr c $> ptr `plusPtr` 1
Quoted
| isQtext c -> poke ptr c $> ptr `plusPtr` 1
| otherwise -> do
poke ptr 0x5c -- backslash
poke (ptr `plusPtr` 1) c
pure (ptr `plusPtr` 2)
Extended
| isAttrChar c -> poke ptr c $> ptr `plusPtr` 1
| otherwise -> do
let (hi, lo) = hexEncode c
poke ptr 37 {- % -}
poke (ptr `plusPtr` 1) hi
poke (ptr `plusPtr` 2) lo
pure (ptr `plusPtr` 3)
d = unsafeDupablePerformIO $ do
dfp <- B.mallocByteString dlenMax
dlen <- withForeignPtr dfp $ \dptr ->
withForeignPtr sfp $ \sptr -> do
let
slimit = sptr `plusPtr` (soff + slen)
fill !sp !dp
| sp >= slimit = pure (dp `minusPtr` dptr)
| otherwise = peek sp >>= poke' dp >>= fill (sp `plusPtr` 1)
fill sptr dptr
pure $ B.PS dfp 0 dlen
-- | Types that have 'Parameters'
class HasParameters a where
parameters :: Lens' a Parameters
instance HasParameters Parameters where
parameters = id
-- Access the 'Parameters' as a @[(CI B.ByteString, B.ByteString)]@
parameterList :: HasParameters a => Lens' a RawParameters
parameterList = parameters . coerced
-- | Access parameter value. Continuations, encoding and charset
-- are processed.
--
parameter
:: HasParameters a
=> CI B.ByteString -> Lens' a (Maybe EncodedParameterValue)
parameter k = parameters . at k
-- | Raw parameter. The key is used as-is. No processing of
-- continuations, encoding or charset is performed.
--
rawParameter :: HasParameters a => CI B.ByteString -> Traversal' a B.ByteString
rawParameter k = parameters . paramiso . traversed . filtered ((k ==) . fst) . _2