{-# LANGUAGE ApplicativeDo #-}
{-# LANGUAGE OverloadedStrings #-}
{-|
Description: An algorithm for guessing character encoding from file contents.
Copyright: (c) 2020 Sam May
License: MPL-2.0
Maintainer: ag.eitilt@gmail.com
Stability: experimental
Portability: portable
In an ideal internet, every server would declare the binary encoding with which
it is transmitting a file (actually, the /true/ ideal would be for it to always
be 'Utf8', but there are still a lot of legacy documents out there). However,
that's not always the case.
A good fallback would be for every document to declare for itself which
encoding it has been saved in. However, not every one does, and the ones that
do may still get it wrong (take, for instance, the case of a server which
/does/ translate everything it sends to 'Utf8').
And so, the [HTML standard](https://html.spec.whatwg.org/) describes an
algorithm for guessing the proper bytes-to-text translation to use in
'Web.Willow.Common.Encoding.decode'. While this does therefore assume some
HTML syntax and specific tags, none of the semantics should cause an issue for
other filetypes.
-}
module Web.Willow.Common.Encoding.Sniffer
( -- * Types
Encoding ( .. )
, Confidence ( .. )
, ReparseData ( .. )
, emptyReparseData
-- * The Algorithm
, sniff
, SnifferEnvironment ( .. )
, emptySnifferEnvironment
, sniffDecoderState
-- ** Auxiliary
, decoderConfidence
, confidenceEncoding
, extractEncoding
) where
import qualified Control.Applicative as A
import qualified Data.ByteString as BS
import qualified Data.Maybe as Y
import qualified Data.Text.Encoding as T
import qualified Data.Word as W
import Data.Functor ( ($>) )
import Web.Willow.Common.Encoding
import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Encoding.Labels
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Util
import qualified Web.Willow.Common.Encoding.Utf8 as Utf8
import qualified Web.Willow.Common.Encoding.Utf16 as Utf16
-- | A parser specialized for recovering a single potential encoding from a
-- binary stream.
--
-- The input is a 'BS.ByteString' and the underlying monad is 'Maybe'; 'sniff'
-- treats a 'Nothing' result from running such a parser as "no guess could be
-- made" and substitutes 'defaultSniff'.
type Sniffer = ParserT BS.ByteString Maybe
-- | Run 'sniff' over the head of a binary stream and package the outcome as
-- the 'DecoderState' a decoder should start from: the initial state for the
-- guessed encoding, annotated with the 'Confidence' of that guess.
sniffDecoderState :: SnifferEnvironment -> BS.ByteString -> DecoderState
sniffDecoderState env stream =
    let guess = sniff env stream
        -- Fresh decoder state for whichever encoding the guess names.
        base = initialDecoderState (confidenceEncoding guess)
    in  base { decoderConfidence_ = guess }
-- | __HTML:__
--      @[encoding sniffing algorithm]
--      (https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm)@
--
-- Determine, from a stream and the metadata accompanying it, which encoding
-- was most likely used to write that stream.  If none of the sniffing steps
-- produces a result, 'defaultSniff' is returned instead.
--
-- If the bytes requested by 'prescanDepth' have not yet been produced, this
-- will resolve and/or wait for them to become available (or for the end of
-- the stream, should that come sooner).
sniff :: SnifferEnvironment -> BS.ByteString -> Confidence
sniff opt bs = case runParser (sniff' opt) bs of
    -- Only the guess matters; whatever input remains is discarded.
    Just (guess, _) -> guess
    Nothing -> defaultSniff
-- | __HTML:__
--      @[encoding sniffing algorithm]
--      (https://html.spec.whatwg.org/multipage/parsing.html#encoding-sniffing-algorithm)@
--
-- Combine every hint carried by the environment with the content of the
-- stream itself to settle on one best-guess 'Encoding'.  The alternatives are
-- listed in priority order: a byte-order mark, the user override, and the
-- transport header are reported as 'Certain'; the prescan result and all the
-- remaining environment hints are only 'Tentative'.  The guess may still turn
-- out to be wrong.
sniff' :: SnifferEnvironment -> Sniffer Confidence
sniff' opt = choice
    [ Certain <$> lookAhead anyBom
    , fromHint Certain (userOverride opt)
    , fromHint Certain (transportHeader opt)
    , asTentative <$> prescan (prescanDepth opt)
    , fromHint asTentative (parentEncoding opt)
    -- Try any implementation-defined autodetection ('Tentative').
    , fromHint asTentative (cachedInfo opt)
    , fromHint asTentative (userDefault opt)
    , fromHint asTentative (localeEncoding opt)
    ]
  where
    -- An unset hint fails this alternative so that the next one is tried.
    fromHint wrap = maybe A.empty (pure . wrap)
    -- A tentative guess with no reparse data collected yet.
    asTentative enc = Tentative enc emptyReparseData
    -- Any of the byte-order marks recognized at the head of the stream.
    anyBom = choice
        [ Utf8.byteOrderMark
        , Utf16.byteOrderMarkBigEndian
        , Utf16.byteOrderMarkLittleEndian
        ]
-- | The guess of last resort, used when no better information is available;
-- the choice is determined by the body of pre-existing content.  Being a
-- single-byte encoding with minimal control characters, it can generally do
-- a half-decent job of representing the underlying binary structure.
defaultSniff :: Confidence
defaultSniff = Tentative fallback emptyReparseData
  where
    fallback = Windows1252
-- | Various datapoints which may indicate a document's binary encoding, to be
-- fed into the 'sniff' algorithm. Values may be easily instantiated as
-- updates to 'emptySnifferEnvironment'.
data SnifferEnvironment = SnifferEnvironment
{ userOverride :: Maybe Encoding
-- ^ The encoding the end user has specified should be used. Note that
-- even this can still be overridden by the presence of a byte-order
-- mark at the head of the stream.
, transportHeader :: Maybe Encoding
-- ^ The encoding given by the transport layer (e.g. through an HTTP
-- @Content-Type@ header).
, prescanDepth :: Word
-- ^ The number of bytes which should be skimmed for @\<meta\>@
-- attributes specifying an encoding.
, parentEncoding :: Maybe Encoding
-- ^ The encoding used for the enclosing document (e.g., if this
-- document is loaded via an @\