{-# LANGUAGE OverloadedStrings #-} module Text.Html.Encoding.Detection ( EncodingName , detect , detectBom , detectMetaCharset ) where import Control.Monad import Data.Maybe import Data.Word import Prelude hiding (drop, take) import Data.Attoparsec.ByteString hiding (Done, Fail, Result, parse, take) import Data.Attoparsec.ByteString.Lazy (Result (..), parse) import Codec.Text.Detect (detectEncodingName) import qualified Data.ByteString import Data.ByteString.Lazy -- | Represent a name of text encoding (i.e., @charset@). E.g., @"UTF-8"@. type EncodingName = String -- | Detect the character encoding from a given HTML fragment. The precendence -- order for determining the character encoding is: -- -- 1. A BOM (byte order mark) before any other data in the HTML document itself. -- (See also 'detectBom' function for details.) -- 2. A @@ declaration with a @charset@ attribute or an @http-equiv@ -- attribute set to @Content-Type@ and a value set for @charset@. -- Note that it looks at only first 1024 bytes. -- (See also 'detectMetaCharset' for details.) -- 3. [Mozilla's Charset -- Detectors](https://www-archive.mozilla.org/projects/intl/chardet.html) -- heuristics. To be specific, it delegates to 'detectEncodingName' from the -- [charsetdetect-ae](https://hackage.haskell.org/package/charsetdetect-ae) -- package, a Haskell implementation of that. -- -- >>> :set -XOverloadedStrings -- >>> detect "\xef\xbb\xbf\xe4\xbd\xa0\xe5\xa5\xbd..." -- Just "UTF-8" -- >>> detect "..." -- Just "latin-1" -- >>> detect "\xbe\xee\xbc\xad\xbf\xc0\xbc\xbc\xbf\xe4..." -- Just "EUC-KR" -- -- It may return 'Nothing' if it fails to determine the character encoding, -- although it's less likely. detect :: ByteString -> Maybe EncodingName detect fragment = listToMaybe $ mapMaybe ($ fragment) [ detectBom , detectMetaCharset . take 1024 , detectEncodingName ] -- | Detect the character encoding from a given HTML fragment by looking the -- initial BOM (byte order mark). -- -- >>> :set -XOverloadedStrings -- >>> detectBom "\xef\xbb\xbf\xe4\xbd\xa0\xe5\xa5\xbd" -- Just "UTF-8" -- >>> detectBom "\xfe\xff\x4f\x60\x59\x7d" -- Just "UTF-16BE" -- >>> detectBom "\xff\xfe\x60\x4f\x7d\x59" -- Just "UTF-16LE" -- >>> detectBom "\x00\x00\xfe\xff\x00\x00\x4f\x60\x00\x00\x59\x7d" -- Just "UTF-32BE" -- >>> detectBom "\xff\xfe\x00\x00\x60\x4f\x00\x00\x7d\x59\x00\x00" -- Just "UTF-32LE" -- >>> detectBom "\x84\x31\x95\x33\xc4\xe3\xba\xc3" -- Just "GB-18030" -- -- It returns 'Nothing' if it fails to find no valid BOM sequence. -- -- >>> detectBom "foobar" -- Nothing detectBom :: ByteString -> Maybe EncodingName detectBom fragment = case take 2 fragment of "\xfe\xff" -> Just "UTF-16BE" "\xef\xbb" -> if take 1 (drop 2 fragment) == "\xbf" then Just "UTF-8" else Nothing "\x00\x00" -> if take 2 (drop 2 fragment) == "\xfe\xff" then Just "UTF-32BE" else Nothing "\xff\xfe" -> if take 2 (drop 2 fragment) == "\x00\x00" then Just "UTF-32LE" else Just "UTF-16LE" "\x84\x31" -> if take 2 (drop 2 fragment) == "\x95\x33" then Just "GB-18030" else Nothing _ -> Nothing -- | Detect the character encoding from a given HTML fragment by looking -- a @<meta>@ declaration with a @charset@ attribute or an @http-equiv@ -- attribute set to @Content-Type@ and a value set for @charset@. -- -- >>> :set -XOverloadedStrings -- >>> detectMetaCharset "<html><head><meta charset=utf-8>" -- Just "utf-8" -- >>> detectMetaCharset "<html><head><meta charset='EUC-KR'>" -- Just "EUC-KR" -- >>> detectMetaCharset "<html><head><meta charset=\"latin-1\"/></head></html>" -- Just "latin-1" -- >>> :{ -- detectMetaCharset -- "<meta http-equiv=content-type content='text/html; charset=utf-8'>" -- :} -- Just "utf-8" -- -- Return 'Nothing' if it failed to any appropriate @<meta>@ tag: -- -- >>> detectMetaCharset "<html><body></body></html>" -- Nothing detectMetaCharset :: ByteString -> Maybe EncodingName detectMetaCharset fragment = case parse html fragment of Fail {} -> Nothing Done _ r -> decodeAscii <$> r where html :: Parser (Maybe Data.ByteString.ByteString) html = do metas <- many' $ choice [ do -- [^<]+ void $ takeWhile1 (/= 0x3c) return Nothing , metaCharset , do -- [<] void $ word8 0x3c return Nothing ] return $ case catMaybes metas of [] -> Nothing name : _ -> Just name metaCharset :: Parser (Maybe Data.ByteString.ByteString) metaCharset = do void $ string "<meta" void space candidates <- many' $ choice [ do -- [^>Cc]+ void $ takeWhile1 $ \ c -> c /= 0x3e && c /= 0x43 && c /= 0x63 return Nothing , do -- (?iCHARSET) void $ string' "CHARSET" -- [=] void $ word8 0x3d -- ["']? q <- option 0 $ satisfy $ \ c -> c == 0x22 || c == 0x27 -- [A-Za-z0-9._-]+ charset <- takeWhile1 $ \ c -> 0x41 <= c && c <= 0x5a || 0x61 <= c && c <= 0x7a || 0x30 <= c && c <= 0x39 || c == 0x2e || c == 0x5f || c == 0x2d when (q /= 0) $ void $ word8 q return $ Just charset , do -- [^>] void $ satisfy (/= 0x3e) return Nothing ] return $ case catMaybes candidates of [] -> Nothing name : _ -> Just name isSpace :: Word8 -> Bool isSpace c = -- [ \t\n\xff\r] c == 0x20 || c == 0x09 || c == 0x0a || c == 0xff || c == 0x0d space :: Parser Word8 space = satisfy isSpace string' :: String -> Parser () string' s = sequence_ [ satisfy $ \ i -> toEnum (fromEnum c) == i || toEnum (fromEnum c + 32) == i | c <- s ] decodeAscii :: Data.ByteString.ByteString -> String decodeAscii = fmap (toEnum . fromEnum) . Data.ByteString.unpack