{-# LANGUAGE OverloadedStrings #-}
module Text.Html.Encoding.Detection
    ( EncodingName
    , detect
    , detectBom
    , detectMetaCharset
    ) where

import Control.Monad
import Data.Maybe
import Data.Word
import Prelude hiding (drop, take)

-- 'Done', 'Fail', 'Result' and 'parse' are hidden from the strict module
-- because the same names are imported from the Lazy module just below.
import Data.Attoparsec.ByteString hiding (Done, Fail, Result, parse, take)
import Data.Attoparsec.ByteString.Lazy (Result (..), parse)
import Codec.Text.Detect (detectEncodingName)
import qualified Data.ByteString
import Data.ByteString.Lazy

-- | The name of a text encoding (i.e., a @charset@).  E.g., @"UTF-8"@.
type EncodingName = String

-- | Detect the character encoding from a given HTML fragment.  The precedence
-- order for determining the character encoding is:
--
-- 1. A BOM (byte order mark) before any other data in the HTML document itself.
--    (See also 'detectBom' function for details.)
-- 2. A @\<meta\>@ declaration with a @charset@ attribute or an @http-equiv@
--    attribute set to @Content-Type@ and a value set for @charset@.
--    Note that only the first 1024 bytes are examined.
--    (See also 'detectMetaCharset' for details.)
-- 3. [Mozilla's Charset
--    Detectors](https://www-archive.mozilla.org/projects/intl/chardet.html)
--    heuristics.  To be specific, it delegates to 'detectEncodingName' from the
--    [charsetdetect-ae](https://hackage.haskell.org/package/charsetdetect-ae)
--    package, a Haskell implementation of that.
--
-- >>> :set -XOverloadedStrings
-- >>> detect "\xef\xbb\xbf\xe4\xbd\xa0\xe5\xa5\xbd
..." -- Just "UTF-8" -- >>> detect "..." -- Just "latin-1" -- >>> detect "