module Codec.MIME.String.Parse where import qualified Codec.Binary.Base64.String as Base64 (decode) import Codec.MIME.String.ContentDisposition ( ContentDisposition(ContentDisposition), get_content_disposition, DispositionType(Inline, Attachment), DispositionParameter(..), ) import Codec.MIME.String.Date (get_date) import Codec.MIME.String.Headers ( ContentTransferEncoding(ContentTransferEncoding), get_content_transfer_encoding, ContentType(ContentType), get_content_type, get_content_description, get_boundary, Parameter(Parameter), MIMEVersion(MIMEVersion), get_mime_version, Subject(Subject), get_subject, get_from, get_to, ) import qualified Codec.MIME.String.QuotedPrintable as QuotedPrintable (decode) import Codec.MIME.String.Types ( ParseM, Header(Header, h_raw_header, h_raw_name, h_name, h_body), Headers, Message(Message), MessageInfo(..), Multipart(Multipart), MessageContent(NoContent, Mixed, Alternative, Parallel, Digest, RFC822), mkData, mkBody, digest_content_type, ascii_text_content_type, ) import Codec.MIME.String.Internal.Utils import Codec.Text.IConv import Control.Monad (liftM) import Control.Monad.State (evalState, get, put) import qualified Data.ByteString.Lazy.Char8 as BS import Data.ByteString.Lazy.Char8 (ByteString) import Data.Maybe (fromMaybe) mkMessage :: MessageInfo -> ParseM MessageContent -> ParseM Message mkMessage mi f_mc = do pn <- get put (pn + 1) -- This is done slightly oddly so we get numbering in the natural -- order, which in turn is important when looking for a part number mc <- f_mc return $ Message pn mi mc parse :: String -> Message parse msg = evalState (parse_message msg) 1 parse_message :: String -> ParseM Message parse_message msg = let (headers, m_body) = parse_headers msg in case get_header headers "mime-version:" get_mime_version of -- We only try and be clever if the MIME version is 1.0 Just (MIMEVersion 1 0) -> parse_mime_message ascii_text_content_type msg _ -> do let m_from = get_header headers "from:" get_from m_subject = get_header headers "subject:" (Just . Subject) m_to = get_header headers "to:" get_to m_date = get_header headers "date:" get_date mi = MessageInfo { mi_headers = headers, mi_from = m_from, mi_subject = m_subject, mi_to = m_to, mi_date = m_date, mi_content_description = Nothing } mc = case m_body of Just body -> mkBody ascii_text_content_type "unknown" (convertAsciiToUtf8 body) Nothing -> return $ NoContent ascii_text_content_type mkMessage mi mc convertAsciiToUtf8 :: String -> String convertAsciiToUtf8 xs = BS.unpack $ convertFuzzy Transliterate "US-ASCII" "utf8" $ BS.pack xs parse_mime_message :: ContentType -> String -> ParseM Message parse_mime_message def_content_type msg = let (headers, m_body) = parse_headers msg mi = make_mime_message_info headers mc = case m_body of Nothing -> do let m_ct = get_header headers "content-type:" get_content_type content_type = fromMaybe def_content_type m_ct return $ NoContent content_type Just body -> make_mime_message_content def_content_type headers body in mkMessage mi mc make_mime_message_info :: Headers -> MessageInfo make_mime_message_info headers = let m_content_description = get_header headers "content-description:" get_content_description m_from = get_header headers "from:" get_from m_subject = get_header headers "subject:" get_subject m_to = get_header headers "to:" get_to m_date = get_header headers "date:" get_date in MessageInfo { mi_headers = headers, mi_from = m_from, mi_subject = m_subject, mi_to = m_to, mi_date = m_date, mi_content_description = m_content_description } make_mime_message_content :: ContentType -> Headers -> String -> ParseM MessageContent make_mime_message_content def_content_type headers body = -- We accept illegal combinations of content type and -- encoding, so just decode whatever it says case m_decoded_body of Nothing -> mkData (Just cte) content_type filename body Just decoded_body -> case content_disposition of ContentDisposition Attachment _ -> mkData Nothing content_type filename decoded_body ContentDisposition Inline _ -> case content_type of ContentType "text" _ ps -> let charset = fromMaybe "US-ASCII" $ lookup_param "charset" ps decoded_body' = BS.pack decoded_body in case tryConvertFuzzy Transliterate charset "utf8" decoded_body' of Just xs -> mkBody content_type filename $ BS.unpack xs Nothing -> mkData Nothing content_type filename decoded_body ContentType "multipart" st ps | Just raw_b <- lookup_param "boundary" ps, Just b <- get_boundary raw_b -> do let (preamble, parts, epilogue) = get_parts b decoded_body ct = case st of "digest" -> digest_content_type _ -> ascii_text_content_type constr = case st of "alternative" -> Alternative "parallel" -> Parallel "digest" -> Digest _ -> Mixed ms <- mapM (parse_mime_message ct) parts return $ constr (Multipart preamble ms epilogue) ContentType "message" "rfc822" _ -> liftM (RFC822 decoded_body filename) (parse_message decoded_body) -- Anything else is treated like an -- application/octet-stream _ -> mkData Nothing content_type filename decoded_body where m_cd = get_header headers "content-disposition:" get_content_disposition content_disposition = fromMaybe (ContentDisposition Inline []) m_cd m_ct = get_header headers "content-type:" get_content_type content_type = fromMaybe def_content_type m_ct filename = get_filename content_disposition content_type m_ce = get_header headers "content-transfer-encoding:" get_content_transfer_encoding cte@(ContentTransferEncoding content_transfer_encoding) = fromMaybe (ContentTransferEncoding "7bit") m_ce m_decoded_body = if content_transfer_encoding == "base64" then Just (Base64.decode body) else if content_transfer_encoding == "quoted-printable" then Just (QuotedPrintable.decode $ my_lines body) else if content_transfer_encoding `elem` ["7bit", "8bit", "binary"] then -- Don't worry if 8-bit data is in 7-bit transfer-encoded data Just body else Nothing -- XXX This is rather hacky. We really want convertFuzzy to tell us if -- the conversion is supported itself tryConvertFuzzy :: Fuzzy -> EncodingName -> EncodingName -> ByteString -> Maybe ByteString tryConvertFuzzy fuzzy from_charset to_charset decoded_body = case convertStrictly from_charset to_charset decoded_body of Right (UnsuportedConversion {}) -> Nothing _ -> Just $ convertFuzzy fuzzy from_charset to_charset decoded_body get_filename :: ContentDisposition -> ContentType -> FilePath get_filename (ContentDisposition _ params) (ContentType _ _ params') = loop params where loop [] = -- Look up legacy name parameter in content type case lookup_param "name" params' of Nothing -> default_filename Just f -> sanitise f loop (Filename f:_) = sanitise f loop (_:ps) = loop ps sanitise f = case reverse $ takeWhile ('/' /=) $ reverse f of "" -> default_filename f' -> f' default_filename = "unknown" get_parts :: String -> String -> (String, [String], String) get_parts boundary body = case gps [] body of (pre:parts, epi) -> (pre, parts, epi) _ -> error "Parse.get_parts: Can't happen XXX" where gps acc "" = ([from_acc acc], "") gps acc xs = case fmap (dropWhile isWSP) $ after dd_boundary $ snd $ read_line_ending xs of Just "--" -> ([from_acc acc], "") Just ('-':'-':cs@(c:_)) | is_cr_or_lf c -> ([from_acc acc], cs) Just "" -> ([from_acc acc], "") Just (c1:cs) | is_cr_or_lf c1 -> let cs' = case cs of c2:cs'' | is_cr_or_lf c2 && c1 /= c2 -> cs'' _ -> cs in case gps [] cs' of (ps, ep) -> (from_acc acc:ps, ep) _ -> case read_line_ending xs of (le, xs') -> case break is_cr_or_lf xs' of (ys, zs) -> gps (ys:le:acc) zs from_acc acc = concat $ reverse acc read_line_ending cs = case cs of (c1:cs1) | is_cr_or_lf c1 -> case cs1 of (c2:cs2) | is_cr_or_lf c2 && c1 /= c2 -> ([c1, c2], cs2) _ -> ([c1], cs1) _ -> ("", cs) after "" ys = Just ys after (x:xs) (y:ys) | x == y = after xs ys | otherwise = Nothing after (_:_) "" = Nothing dd_boundary = "--" ++ boundary isWSP ' ' = True isWSP '\t' = True isWSP _ = False -- Returns the value of the first parameter of the name (must be lower cased) lookup_param :: String -> [Parameter] -> Maybe String lookup_param _ [] = Nothing lookup_param name (Parameter n v:ps) | name == n = Just v | otherwise = lookup_param name ps get_header :: Headers -> String -> (String -> Maybe a) -> Maybe a get_header hs name getter = case filter ((name ==) . h_name) hs of [h] -> getter (h_body h) _ -> Nothing -- We skip over leading white space lines, both for resilience and -- because this is allowed for MIME part headers. parse_headers :: String -> (Headers, Maybe String) parse_headers msg = skip_whitespace (Just msg) where skip_whitespace :: Maybe String -> ([Header], Maybe String) skip_whitespace Nothing = ([], Nothing) skip_whitespace (Just xs) = case get_line xs of ([], m_zs) -> ([], m_zs) (ys, m_zs) -> case dropWhile isWhite ys of [] -> skip_whitespace m_zs ys' -> gather [ys'] m_zs gather :: [String] -> Maybe String -> ([Header], Maybe String) gather acc Nothing = ([mk_rev_header acc], Nothing) gather acc (Just xs) = case get_line xs of ([], m_zs) -> ([mk_rev_header acc], m_zs) (ys, m_zs) | starts_with_white ys -> gather (ys:acc) m_zs | otherwise -> let (hs, m_rest) = gather [ys] m_zs in (mk_rev_header acc:hs, m_rest) starts_with_white (c:_) = isWhite c starts_with_white [] = False isWhite ' ' = True isWhite '\t' = True isWhite _ = False -- Takes the lines comprising a header (in reverse order) and constructs -- the corresponding Header mk_rev_header :: [String] -> Header mk_rev_header = mk_header . reverse -- Takes the lines comprising a header and constructs -- the corresponding Header mk_header :: [String] -> Header mk_header xs = let unfolded = concat xs (raw_name, body) = case break ends_header unfolded of (name, ':':val) -> (name ++ ":", val) (name, val) -> (name, val) in Header { h_raw_header = xs, h_raw_name = raw_name, h_name = map asciiToLower raw_name, h_body = body } where ends_header ':' = True ends_header ' ' = True ends_header '\t' = True ends_header _ = False