{-# LANGUAGE ApplicativeDo #-} {-# LANGUAGE OverloadedStrings #-} {-| Description: An algorithm for guessing character encoding from file contents. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable In an ideal internet, every server would declare the binary encoding with which it is transmitting a file (actually, the /true/ ideal would be for it to always be 'Utf8', but there are still a lot of legacy documents out there). However, that's not always the case. A good fallback would be for every document to declare itself what encoding it has been saved in. However, not every one does, and the ones that do may still get it wrong (take, for instance, the case of a server which /does/ translate everything it sends to 'Utf8'). And so, the [HTML standard](https://html.spec.whatwg.org/) describes an algorithm for guessing the proper bytes-to-text translation to use in 'Web.Willow.Common.Encoding.decode'. While this does therefore assume some HTML syntax and specific tags, none of the semantics should cause an issue for other filetypes. -} module Web.Willow.Common.Encoding.Sniffer ( -- * Types Encoding ( .. ) , Confidence ( .. ) , ReparseData ( .. ) , emptyReparseData -- * The Algorithm , sniff , SnifferEnvironment ( .. ) , emptySnifferEnvironment , sniffDecoderState -- ** Auxiliary , decoderConfidence , confidenceEncoding , extractEncoding ) where import qualified Control.Applicative as A import qualified Data.ByteString as BS import qualified Data.Maybe as Y import qualified Data.Text.Encoding as T import qualified Data.Word as W import Data.Functor ( ($>) ) import Web.Willow.Common.Encoding import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Encoding.Labels import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Util import qualified Web.Willow.Common.Encoding.Utf8 as Utf8 import qualified Web.Willow.Common.Encoding.Utf16 as Utf16 -- | A parser specialized for recovering a single potential encoding from a -- binary stream. type Sniffer = ParserT BS.ByteString Maybe -- | Guess what encoding may be in use by the binary stream, and generate a -- collection of data based on that which results in the behaviour described by -- the decoding algorithm at the start of the stream. sniffDecoderState :: SnifferEnvironment -> BS.ByteString -> DecoderState sniffDecoderState env stream = (initialDecoderState $ confidenceEncoding conf) { decoderConfidence_ = conf } where conf = sniff env stream -- | __HTML:__ -- @[encoding sniffing algorithm] -- (https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm)@ -- -- Given a stream and related metadata, try to determine what encoding may have -- been used to write it. -- -- Will resolve and/or wait for the number of bytes requested by 'prescanDepth' -- to be available in the stream (or, if it comes sooner, the end of the -- stream), if they have not yet been produced. sniff :: SnifferEnvironment -> BS.ByteString -> Confidence sniff opt bs = maybe defaultSniff fst $ runParser (sniff' opt) bs -- | __HTML:__ -- @[encoding sniffing algorithm] -- (https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm)@ -- -- Dispatcher to fold the various options and parameters given by the -- environment into a single output 'Encoding' for the stream, which may or may -- not wind up being correct, but is still the best guess. sniff' :: SnifferEnvironment -> Sniffer Confidence sniff' opt = choice [ lookAhead bom >>= sniffAlways Certain , sniffMaybe Certain $ userOverride opt , sniffMaybe Certain $ transportHeader opt , prescan (prescanDepth opt) >>= sniffAlways tentative , sniffMaybe tentative $ parentEncoding opt -- Try any implementation-defined autodetection ('Tentative'). , sniffMaybe tentative $ cachedInfo opt , sniffMaybe tentative $ userDefault opt , sniffMaybe tentative $ localeEncoding opt ] where sniffMaybe conf (Just enc) = pure $ conf enc sniffMaybe _ Nothing = A.empty sniffAlways conf enc = pure $ conf enc tentative = flip Tentative emptyReparseData bom = choice [ Utf8.byteOrderMark , Utf16.byteOrderMarkBigEndian , Utf16.byteOrderMarkLittleEndian ] -- | The fallback 'Encoding' to guess when nothing better is available, as -- determined by the body of pre-existing content. If nothing else, this is a -- single-byte encoding with minimal control characters, so can generally do a -- half-decent job of representing the underlying binary structure. defaultSniff :: Confidence defaultSniff = Tentative Windows1252 emptyReparseData -- | Various datapoints which may indicate a document's binary encoding, to be -- fed into the 'sniff' algorithm. Values may be easily instantiated as -- updates to 'emptySnifferEnvironment'. data SnifferEnvironment = SnifferEnvironment { userOverride :: Maybe Encoding -- ^ The encoding the end user has specified should be used. Note that -- even this can still be overridden by the presence of a byte-order -- mark at the head of the stream. , transportHeader :: Maybe Encoding -- ^ The encoding given by the transport layer (e.g. through an HTTP -- @Content-Type@ header). , prescanDepth :: Word -- ^ The number of bytes which should be skimmed for @@ -- attributes specifying an encoding. , parentEncoding :: Maybe Encoding -- ^ The encoding used for the enclosing document (e.g., if this -- document is loaded via an @\@). , cachedInfo :: Maybe Encoding -- ^ The encoding from the last time this page was loaded, other pages -- on the site, or other cached data. , userDefault :: Maybe Encoding -- ^ The encoding the end user has specified as being their preferred -- default, if no better encoding can be determined. , localeEncoding :: Maybe Encoding -- ^ The encoding recommended as a reasonable guess based on the -- current language of the user's system. } deriving ( Eq, Show, Read ) {-# WARNING localeEncoding "The type of this argument will be changed in a future release" #-} -- | A neutral set of parameters to pass to the 'sniff' algorithm: no accessory -- data, and a 'prescanDepth' limit of 1024 bytes. emptySnifferEnvironment :: SnifferEnvironment emptySnifferEnvironment = SnifferEnvironment { userOverride = Nothing , transportHeader = Nothing , prescanDepth = 1024 , parentEncoding = Nothing , cachedInfo = Nothing , userDefault = Nothing , localeEncoding = Nothing } -- | __HTML:__ -- @[prescan a byte stream to determine its encoding] -- (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@ -- with an explicit length. -- -- Guess what encoding a stream may have been written in based on suspected -- byte sequences in its content; in practice, whether the document has any -- @@ tags which specify one. -- -- Fails if no encoding can be guessed and never consumes any input, but will -- resolve and/or wait for @maxDepth@ bytes to be available in the stream (or, -- if it comes sooner, the end of the stream), if they have not yet been -- produced. prescan :: Word -> Sniffer Encoding prescan maxDepth = do toScan <- lookAhead $ nextChunk maxDepth maybe A.empty (return . fst) $ runParser prescan' toScan -- | __HTML:__ -- @[prescan a byte stream to determine its encoding] -- (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@ -- -- The main loop of the @prescan@ algorithm, dispatching the various methods -- of parsing input to guess its encoding based on content. -- -- Note that this consumes input even if it fails, and will attempt to scan the -- entire input stream if not fed a smaller section. prescan' :: Sniffer Encoding prescan' = do _ <- A.many prescanSkip enc' <- prescanMeta case enc' of Just enc -> return enc Nothing -> prescan' where prescanMarkup = do token $ toByte '<' next >>= satisfying (`BS.elem` "!/?") A.many $ next >>= satisfying (/= toByte '>') pure () prescanUnmatched = do token $ toByte '<' -- Need to check for this as 'prescanMeta' is not run yet. avoiding metaName pure () prescanSkip = choice [ prescanComment , prescanTag , prescanMarkup , prescanUnmatched , A.some (next >>= satisfying (/= toByte '<')) $> () ] -- | __HTML:__ -- @[prescan a byte stream to determine its encoding] -- (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@ -- step 2, case 1 -- -- Consume an HTML comment @@. -- -- Fails if the stream doesn't start with an ASCII-encoded @@, or the end of the stream. -- -- Never fails, even if at the end of the stream. prescanCommentEnd :: Sniffer () prescanCommentEnd = A.many (next >>= satisfying (/= toByte '-')) *> choice [ endToken , next *> prescanCommentEnd , end ] where endToken = do chunk "--" A.many . token $ toByte '-' token $ toByte '>' pure () -- | The unambiguous start of a @@ tag. -- -- Fails if the stream doesn't start with an ASCII-encoded @>= satisfying ((==) "meta" . BS.map toAsciiLowerB)) *> (next >>= satisfying (`elem` toByte '/' : asciiWhitespaceB)) $> () -- | Consume any number of HTML-defined ASCII whitespace (does /not/ include -- 0x11 VT). -- -- Never fails, but may return 'BS.empty'. metaWhitespace :: Sniffer BS.ByteString metaWhitespace = fmap (Y.fromMaybe BS.empty) . A.optional . fmap BS.pack . A.some $ next >>= satisfying (`elem` asciiWhitespaceB) -- | __HTML:__ -- @[prescan a byte stream to determine its encoding] -- (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@ -- step 2, case 2 -- -- Consume an HTML meta declaration @@, and return any likely -- encoding it may name. -- -- Fails if the stream doesn't start with an ASCII-encoded @ Nothing (Just True, False, _) -> Nothing (_, _, Nothing) -> Nothing (_, _, Just Utf16be) -> pure Utf8 (_, _, Just Utf16le) -> pure Utf8 (_, _, Just UserDefined) -> pure Windows1252 (_, _, Just enc) -> pure enc where gatherAttrs ("http-equiv", "content-type") (need, _, charset) = (need, True, charset) -- Reset the "http-equiv" if an earlier attribute shadows the -- "=content-type" (folding right-to-left; nothing else sets this). gatherAttrs ("http-equiv", _) (need, _, charset) = (need, False, charset) -- "content" is a weaker version of "charset" and so gets overwritten -- if the latter appears later and ignored if it appears earlier gatherAttrs ("content", v) (need, got, Left _) = case extractEncoding v of Just enc -> (Just $ Y.fromMaybe True need, got, Left $ Just enc) Nothing -> (need, got, Left Nothing) gatherAttrs ("charset", v) (_, got, _) = (Just False, got, Right . lookupEncoding $ T.decodeLatin1 v) gatherAttrs _ attrs = attrs finalizeEncoding :: (Maybe Bool, Bool, Either (Maybe Encoding) (Maybe Encoding)) -> (Maybe Bool, Bool, Maybe Encoding) finalizeEncoding (need, got, Left enc) = (need, got, enc) finalizeEncoding (need, got, Right enc) = (need, got, enc) -- | __HTML:__ -- @[prescan a byte stream to determine its encoding] -- (https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding)@ -- step 2, case 3 -- -- Consume any HTML tag fitting the pattern @?@ up to (but not -- including) the closing @>@. -- -- Fails if no such tag is found, or, for compatibility, if the tag name is -- exactly @meta@. prescanTag :: Sniffer () prescanTag = do token $ toByte '<' -- Need to check for this as 'prescanMeta' might fail avoiding metaName metaWhitespace A.optional . token $ toByte '/' next >>= satisfying (flip elem . map toByte $ ['A'..'Z'] ++ ['a'..'z']) A.many $ next >>= satisfying (not . (`elem` toByte '>' : asciiWhitespaceB)) A.many prescanAttribute pure () -- | __HTML:__ -- @[get an attribute] -- (https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing)@ -- -- Common a tag attribute, with or without a value, and return its name and -- either a value or 'BS.empty'. -- -- Fails if the stream begins with an ASCII-encoded @>@. Intended for use in a -- 'A.many' loop, where it may return extra @('BS.empty', 'BS.empty')@ -- "attributes"---do not rely on 'length' being accurate. prescanAttribute :: Sniffer (BS.ByteString, BS.ByteString) prescanAttribute = metaWhitespace *> choice -- A null return is the safest way to loop here. [ token (toByte '/') $> (BS.empty, BS.empty) , do nc <- next >>= satisfying (/= toByte '>') ncs <- A.many $ next >>= satisfying isNameChar metaWhitespace value <- choice [ token (toByte '=') *> metaWhitespace *> choice [ quotedValue , unquotedValue $ toByte '>' ] , pure BS.empty ] return (BS.map toAsciiLowerB (BS.cons nc $ BS.pack ncs), BS.map toAsciiLowerB value) ] where isNameChar c | BS.elem c "/=>" = False | elem c asciiWhitespaceB = False | otherwise = True -- | __HTML:__ -- -- * @[get an attribute] -- (https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing)@ -- step 10, case 1 -- * @[algorithm for extracting a character encoding from a meta element] -- (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@ -- step 6, case 1 -- -- Retrieve a string contained between matched quotation marks (@"@ or @'@), -- dropping that punctuation. -- -- Fails if the stream doesn't start with one of the ASCII-encoded quotation -- marks, or if the same mark does /not/ appear again before the end of the -- stream. quotedValue :: Sniffer BS.ByteString quotedValue = BS.pack <$> choice [ parseValue $ toByte '"' , parseValue $ toByte '\'' ] where parseValue quote = token quote *> A.many (next >>= satisfying (/= quote)) <* token quote -- | __HTML:__ -- -- * @[get an attribute] -- (https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing)@ -- step 10, cases 3 and 4; step 11 -- * @[algorithm for extracting a character encoding from a meta element] -- (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@ -- step 6, case 3 (via the added generality) -- -- Retrieve a string written as a single, unquoted and unbroken sequence (e.g. -- @utf8@ or @iso-ir-109@ but /not/ @Windows 1252@ or @"SHIFT-JIS"@). The -- additional parameter is treated as a breakpoint as well: -- -- >>> parseMaybe (unquotedValue 0x3B) <* takeRest) "content;x" -- "content" -- -- Only fails if the stream begins with an ASCII-encoded @"@ or @'@, but may -- return 'BS.empty'. unquotedValue :: W.Word8 -> Sniffer BS.ByteString unquotedValue terminal = do c <- next >>= satisfying (not . (`elem` terminal : BS.unpack "\"'" ++ asciiWhitespaceB)) cs <- A.many $ next >>= satisfying (not . (`elem` terminal : asciiWhitespaceB)) metaWhitespace return . BS.pack $ c : cs -- | The encoding scheme currently in use by the parser, along with how likely -- that scheme actually represents the binary stream. decoderConfidence :: DecoderState -> Confidence -- Very simple indirection to prevent this being used as a record setter. decoderConfidence = decoderConfidence_ {-# INLINE decoderConfidence #-} -- | __HTML:__ -- @[algorithm for extracting a character encoding from a meta element] -- (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@ -- -- Find the first occurrence of an ASCII-encoded string @charset@ in the -- stream, and try to parse its attribute-style value into an 'Encoding'. -- -- Returns 'Nothing' if the stream does not contain @charset@ followed by @=@, -- or if the value can not be successfully parsed as an encoding label. extractEncoding :: BS.ByteString -> Maybe Encoding extractEncoding = fmap fst . runParser extractEncoding' -- | __HTML:__ -- @[algorithm for extracting a character encoding from a meta element] -- (https://html.spec.whatwg.org/multipage/parsing.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element)@ -- -- Find the first occurrence of an ASCII-encoded string @charset@ in the -- stream, and try to parse its attribute-style value into an 'Encoding'. -- -- Fails if the stream does not contain @charset@ followed by @=@, or if the -- value can not be successfully parsed as an encoding label. extractEncoding' :: Sniffer Encoding extractEncoding' = do findNext $ nextChunk 7 >>= satisfying ((==) "charset" . BS.map toAsciiLowerB) metaWhitespace choice [ do token $ toByte '=' metaWhitespace value <- choice [ quotedValue , unquotedValue $ toByte ';' ] maybe A.empty return . lookupEncoding $ T.decodeLatin1 value , extractEncoding' ]