{-# LANGUAGE ApplicativeDo #-}
{-# LANGUAGE OverloadedStrings #-}

{-|
Description:    An algorithm for guessing character encoding from file contents.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable

In an ideal internet, every server would declare the binary encoding with which
it is transmitting a file (actually, the /true/ ideal would be for it to always
be 'Utf8', but there are still a lot of legacy documents out there).  However,
that's not always the case.

A good fallback would be for every document to declare itself what encoding it
has been saved in.  However, not every one does, and the ones that do may still
get it wrong (take, for instance, the case of a server which /does/ translate
everything it sends to 'Utf8').

And so, the [HTML standard](https://html.spec.whatwg.org/) describes an
algorithm for guessing the proper bytes-to-text translation to use in
'Web.Willow.Common.Encoding.decode'.  While this does therefore assume some
HTML syntax and specific tags, none of the semantics should cause an issue for
other filetypes.
-}
module Web.Willow.Common.Encoding.Sniffer
    ( -- * Types
      Encoding ( .. )
    , Confidence ( .. )
    , ReparseData ( .. )
    , emptyReparseData
      -- * The Algorithm
    , sniff
    , SnifferEnvironment ( .. )
    , emptySnifferEnvironment
    , sniffDecoderState
      -- ** Auxiliary
    , decoderConfidence
    , confidenceEncoding
    , extractEncoding
    ) where


import qualified Control.Applicative as A

import qualified Data.ByteString as BS
import qualified Data.Maybe as Y
import qualified Data.Text.Encoding as T
import qualified Data.Word as W

import Data.Functor ( ($>) )

import Web.Willow.Common.Encoding
import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Encoding.Labels
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Util

import qualified Web.Willow.Common.Encoding.Utf8 as Utf8
import qualified Web.Willow.Common.Encoding.Utf16 as Utf16


-- | A parser specialized for recovering a single potential encoding from a
-- binary stream.
type Sniffer = ParserT BS.ByteString Maybe


-- | Guess what encoding may be in use by the binary stream, and generate a
-- collection of data based on that which results in the behaviour described by
-- the decoding algorithm at the start of the stream.
sniffDecoderState :: SnifferEnvironment -> BS.ByteString -> DecoderState
sniffDecoderState env stream = (initialDecoderState $ confidenceEncoding conf)
    { decoderConfidence_ = conf
    }
  where conf = sniff env stream


-- | __HTML:__
--      @[encoding sniffing algorithm]
--      (https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm)@
-- 
-- Given a stream and related metadata, try to determine what encoding may have
-- been used to write it.
-- 
-- Will resolve and/or wait for the number of bytes requested by 'prescanDepth'
-- to be available in the stream (or, if it comes sooner, the end of the
-- stream), if they have not yet been produced.
sniff :: SnifferEnvironment -> BS.ByteString -> Confidence
sniff opt bs = maybe defaultSniff fst $ runParser (sniff' opt) bs

-- | __HTML:__
--      @[encoding sniffing algorithm]
--      (https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm)@
-- 
-- Dispatcher to fold the various options and parameters given by the
-- environment into a single output 'Encoding' for the stream, which may or may
-- not wind up being correct, but is still the best guess.
sniff' :: SnifferEnvironment -> Sniffer Confidence
sniff' opt = choice
    [ lookAhead bom >>= sniffAlways Certain
    , sniffMaybe Certain $ userOverride opt
    , sniffMaybe Certain $ transportHeader opt
    , prescan (prescanDepth opt) >>= sniffAlways tentative
    , sniffMaybe tentative $ parentEncoding opt
    -- Try any implementation-defined autodetection ('Tentative').
    , sniffMaybe tentative $ cachedInfo opt
    , sniffMaybe tentative $ userDefault opt
    , sniffMaybe tentative $ localeEncoding opt
    ]
  where sniffMaybe conf (Just enc) = pure $ conf enc
        sniffMaybe _ Nothing = A.empty
        sniffAlways conf enc = pure $ conf enc
        tentative = flip Tentative emptyReparseData
        bom = choice
            [ Utf8.byteOrderMark
            , Utf16.byteOrderMarkBigEndian
            , Utf16.byteOrderMarkLittleEndian
            ]

-- | The fallback 'Encoding' to guess when nothing better is available, as
-- determined by the body of pre-existing content.  If nothing else, this is a
-- single-byte encoding with minimal control characters, so can generally do a
-- half-decent job of representing the underlying binary structure.
defaultSniff :: Confidence
defaultSniff = Tentative Windows1252 emptyReparseData


-- | Various datapoints which may indicate a document's binary encoding, to be
-- fed into the 'sniff' algorithm.  Values may be easily instantiated as
-- updates to 'emptySnifferEnvironment'.
data SnifferEnvironment = SnifferEnvironment
    { userOverride :: Maybe Encoding
        -- ^ The encoding the end user has specified should be used.  Note that
        -- even this can still be overridden by the presence of a byte-order
        -- mark at the head of the stream.
    , transportHeader :: Maybe Encoding
        -- ^ The encoding given by the transport layer (e.g. through an HTTP
        -- @Content-Type@ header).
    , prescanDepth :: Word
        -- ^ The number of bytes which should be skimmed for @<meta>@
        -- attributes specifying an encoding.
    , parentEncoding :: Maybe Encoding
        -- ^ The encoding used for the enclosing document (e.g., if this
        -- document is loaded via an @\<iframe\>@).
    , cachedInfo :: Maybe Encoding
        -- ^ The encoding from the last time this page was loaded, other pages
        -- on the site, or other cached data.
    , userDefault :: Maybe Encoding
        -- ^ The encoding the end user has specified as being their preferred
        -- default, if no better encoding can be determined.
    , localeEncoding :: Maybe Encoding
        -- ^ The encoding recommended as a reasonable guess based on the
        -- current language of the user's system.
    }
  deriving ( Eq, Show, Read )
{-# WARNING localeEncoding "The type of this argument will be changed in a future release" #-}

-- | A neutral set of parameters to pass to the 'sniff' algorithm: no accessory
-- data, and a 'prescanDepth' limit of 1024 bytes.
emptySnifferEnvironment :: SnifferEnvironment
emptySnifferEnvironment = SnifferEnvironment
    { userOverride = Nothing
    , transportHeader = Nothing
    , prescanDepth = 1024
    , parentEncoding = Nothing
    , cachedInfo = Nothing
    , userDefault = Nothing
    , localeEncoding = Nothing
    }


-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      with an explicit length.
-- 
-- Guess what encoding a stream may have been written in based on suspected
-- byte sequences in its content; in practice, whether the document has any
-- @<meta>@ tags which specify one.
-- 
-- Fails if no encoding can be guessed and never consumes any input, but will
-- resolve and/or wait for @maxDepth@ bytes to be available in the stream (or,
-- if it comes sooner, the end of the stream), if they have not yet been
-- produced.
prescan :: Word -> Sniffer Encoding
prescan maxDepth = do
    toScan <- lookAhead $ nextChunk maxDepth
    maybe A.empty (return . fst) $ runParser prescan' toScan

-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
-- 
-- The main loop of the @prescan@ algorithm, dispatching the various methods
-- of parsing input to guess its encoding based on content.
-- 
-- Note that this consumes input even if it fails, and will attempt to scan the
-- entire input stream if not fed a smaller section.
prescan' :: Sniffer Encoding
prescan' = do
    _ <- A.many prescanSkip
    enc' <- prescanMeta
    case enc' of
        Just enc -> return enc
        Nothing -> prescan'
  where prescanMarkup = do
            token $ toByte '<'
            next >>= satisfying (`BS.elem` "!/?")
            A.many $ next >>= satisfying (/= toByte '>')
            pure ()
        prescanUnmatched = do
            token $ toByte '<'
            -- Need to check for this as 'prescanMeta' is not run yet.
            avoiding metaName
            pure ()
        prescanSkip = choice
            [ prescanComment
            , prescanTag
            , prescanMarkup
            , prescanUnmatched
            , A.some (next >>= satisfying (/= toByte '<')) $> ()
            ]


-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      step 2, case 1
-- 
-- Consume an HTML comment @<!-- (...) -->@.
-- 
-- Fails if the stream doesn't start with an ASCII-encoded @<!--@.  Doesn't
-- fail at the end of the stream, so be sure the calling scanner does.
prescanComment :: Sniffer ()
prescanComment = do
    chunk "<!--"
    A.many . token $ toByte '-'
    choice
        [ token (toByte '>') $> ()
        , prescanCommentEnd
        ]

-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      step 2, case 1
-- 
-- Consume all bytes until an end @-->@, or the end of the stream.
-- 
-- Never fails, even if at the end of the stream.
prescanCommentEnd :: Sniffer ()
prescanCommentEnd = A.many (next >>= satisfying (/= toByte '-')) *> choice
    [ endToken
    , next *> prescanCommentEnd
    , end
    ]
  where endToken = do
            chunk "--"
            A.many . token $ toByte '-'
            token $ toByte '>'
            pure ()


-- | The unambiguous start of a @<meta>@ tag.
-- 
-- Fails if the stream doesn't start with an ASCII-encoded @<meta@, case
-- insensitive, followed by either a @/@ or whitespace.
metaName :: Sniffer ()
metaName = (nextChunk 4 >>= satisfying ((==) "meta" . BS.map toAsciiLowerB)) *>
    (next >>= satisfying (`elem` toByte '/' : asciiWhitespaceB)) $> ()

-- | Consume any number of HTML-defined ASCII whitespace (does /not/ include
-- 0x11 VT).
-- 
-- Never fails, but may return 'BS.empty'.
metaWhitespace :: Sniffer BS.ByteString
metaWhitespace = fmap (Y.fromMaybe BS.empty) . A.optional . fmap BS.pack . A.some $
    next >>= satisfying (`elem` asciiWhitespaceB)


-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      step 2, case 2
-- 
-- Consume an HTML meta declaration @<meta (...)>@, and return any likely
-- encoding it may name.
-- 
-- Fails if the stream doesn't start with an ASCII-encoded @<meta @ or
-- @<meta/@, case insensitive.  Returns 'Nothing' if the end of the tag is
-- reached without properly providing an encoding.
prescanMeta :: Sniffer (Maybe Encoding)
prescanMeta = do
    token $ toByte '<'
    metaName
    attrs <- A.many prescanAttribute
    avoiding end
    return $ case finalizeEncoding $ foldr gatherAttrs (Nothing, False, Left Nothing) attrs of
        (Nothing, _, _) -> Nothing
        (Just True, False, _) -> Nothing
        (_, _, Nothing) -> Nothing
        (_, _, Just Utf16be) -> pure Utf8
        (_, _, Just Utf16le) -> pure Utf8
        (_, _, Just UserDefined) -> pure Windows1252
        (_, _, Just enc) -> pure enc
  where gatherAttrs ("http-equiv", "content-type") (need, _, charset) =
            (need, True, charset)
        -- Reset the "http-equiv" if an earlier attribute shadows the
        -- "=content-type" (folding right-to-left; nothing else sets this).
        gatherAttrs ("http-equiv", _) (need, _, charset) =
            (need, False, charset)
        -- "content" is a weaker version of "charset" and so gets overwritten
        -- if the latter appears later and ignored if it appears earlier
        gatherAttrs ("content", v) (need, got, Left _) =
            case extractEncoding v of
                Just enc -> (Just $ Y.fromMaybe True need, got, Left $ Just enc)
                Nothing -> (need, got, Left Nothing)
        gatherAttrs ("charset", v) (_, got, _) =
            (Just False, got, Right . lookupEncoding $ T.decodeLatin1 v)
        gatherAttrs _ attrs = attrs
        finalizeEncoding :: (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding)) -> (Maybe Bool, Bool, Maybe Encoding)
        finalizeEncoding (need, got, Left enc) = (need, got, enc)
        finalizeEncoding (need, got, Right enc) = (need, got, enc)


-- | __HTML:__
--      @[prescan a byte stream to determine its encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@
--      step 2, case 3
-- 
-- Consume any HTML tag fitting the pattern @</?[A-Za-z].*>?@ up to (but not
-- including) the closing @>@.
-- 
-- Fails if no such tag is found, or, for compatibility, if the tag name is
-- exactly @meta@.
prescanTag :: Sniffer ()
prescanTag = do
    token $ toByte '<'
    -- Need to check for this as 'prescanMeta' might fail
    avoiding metaName
    metaWhitespace
    A.optional . token $ toByte '/'
    next >>= satisfying (flip elem . map toByte $ ['A'..'Z'] ++ ['a'..'z'])
    A.many $ next >>= satisfying (not . (`elem` toByte '>' : asciiWhitespaceB))
    A.many prescanAttribute
    pure ()


-- | __HTML:__
--      @[get an attribute]
--      (https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing)@
-- 
-- Common a tag attribute, with or without a value, and return its name and
-- either a value or 'BS.empty'.
-- 
-- Fails if the stream begins with an ASCII-encoded @>@.  Intended for use in a
-- 'A.many' loop, where it may return extra @('BS.empty', 'BS.empty')@
-- "attributes"---do not rely on 'length' being accurate.
prescanAttribute :: Sniffer (BS.ByteString, BS.ByteString)
prescanAttribute = metaWhitespace *> choice
    -- A null return is the safest way to loop here.
    [ token (toByte '/') $> (BS.empty, BS.empty)
    , do
        nc <- next >>= satisfying (/= toByte '>')
        ncs <- A.many $ next >>= satisfying isNameChar
        metaWhitespace
        value <- choice
            [ token (toByte '=') *> metaWhitespace *> choice
                [ quotedValue
                , unquotedValue $ toByte '>'
                ]
            , pure BS.empty
            ]
        return (BS.map toAsciiLowerB (BS.cons nc $ BS.pack ncs), BS.map toAsciiLowerB value)
    ]
  where isNameChar c
            | BS.elem c "/=>" = False
            | elem c asciiWhitespaceB = False
            | otherwise = True

-- | __HTML:__
-- 
--      * @[get an attribute]
--        (https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing)@
--        step 10, case 1
--      * @[algorithm for extracting a character encoding from a meta element]
--        (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@
--        step 6, case 1
-- 
-- Retrieve a string contained between matched quotation marks (@"@ or @'@),
-- dropping that punctuation.
-- 
-- Fails if the stream doesn't start with one of the ASCII-encoded quotation
-- marks, or if the same mark does /not/ appear again before the end of the
-- stream.
quotedValue :: Sniffer BS.ByteString
quotedValue = BS.pack <$> choice
    [ parseValue $ toByte '"'
    , parseValue $ toByte '\''
    ]
  where parseValue quote = token quote *> A.many (next >>= satisfying (/= quote)) <* token quote

-- | __HTML:__
-- 
--      * @[get an attribute]
--        (https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing)@
--        step 10, cases 3 and 4; step 11
--      * @[algorithm for extracting a character encoding from a meta element]
--        (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@
--        step 6, case 3 (via the added generality)
-- 
-- Retrieve a string written as a single, unquoted and unbroken sequence (e.g.
-- @utf8@ or @iso-ir-109@ but /not/ @Windows 1252@ or @"SHIFT-JIS"@).  The
-- additional parameter is treated as a breakpoint as well:
-- 
-- >>> parseMaybe (unquotedValue 0x3B) <* takeRest) "content;x"
-- "content"
-- 
-- Only fails if the stream begins with an ASCII-encoded @"@ or @'@, but may
-- return 'BS.empty'.
unquotedValue :: W.Word8 -> Sniffer BS.ByteString
unquotedValue terminal = do
    c <- next >>= satisfying (not . (`elem` terminal : BS.unpack "\"'" ++ asciiWhitespaceB))
    cs <- A.many $ next >>= satisfying (not . (`elem` terminal : asciiWhitespaceB))
    metaWhitespace
    return . BS.pack $ c : cs


-- | The encoding scheme currently in use by the parser, along with how likely
-- that scheme actually represents the binary stream.
decoderConfidence :: DecoderState -> Confidence
-- Very simple indirection to prevent this being used as a record setter.
decoderConfidence = decoderConfidence_
{-# INLINE decoderConfidence #-}


-- | __HTML:__
--      @[algorithm for extracting a character encoding from a meta element]
--      (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@
-- 
-- Find the first occurrence of an ASCII-encoded string @charset@ in the
-- stream, and try to parse its attribute-style value into an 'Encoding'.
-- 
-- Returns 'Nothing' if the stream does not contain @charset@ followed by @=@,
-- or if the value can not be successfully parsed as an encoding label.
extractEncoding :: BS.ByteString -> Maybe Encoding
extractEncoding = fmap fst . runParser extractEncoding'

-- | __HTML:__
--      @[algorithm for extracting a character encoding from a meta element]
--      (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@
-- 
-- Find the first occurrence of an ASCII-encoded string @charset@ in the
-- stream, and try to parse its attribute-style value into an 'Encoding'.
-- 
-- Fails if the stream does not contain @charset@ followed by @=@, or if the
-- value can not be successfully parsed as an encoding label.
extractEncoding' :: Sniffer Encoding
extractEncoding' = do
    findNext $ nextChunk 7 >>= satisfying ((==) "charset" . BS.map toAsciiLowerB)
    metaWhitespace
    choice
        [ do
            token $ toByte '='
            metaWhitespace
            value <- choice
                [ quotedValue
                , unquotedValue $ toByte ';'
                ]
            maybe A.empty return . lookupEncoding $ T.decodeLatin1 value
        , extractEncoding'
        ]