{-# LANGUAGE OverloadedStrings #-}
module Text.Html.Encoding.Detection
    ( EncodingName
    , detect
    , detectBom
    , detectMetaCharset
    ) where

import Control.Monad
import Data.Maybe
import Data.Word
import Prelude hiding (drop, take)

import Data.Attoparsec.ByteString hiding (Done, Fail, Result, parse, take)
import Data.Attoparsec.ByteString.Lazy (Result (..), parse)
import Codec.Text.Detect (detectEncodingName)
import qualified Data.ByteString
import Data.ByteString.Lazy

-- | Represent a name of text encoding (i.e., @charset@).  E.g., @"UTF-8"@.
type EncodingName = String

-- | Detect the character encoding from a given HTML fragment.  The precendence
-- order for determining the character encoding is:
--
-- 1. A BOM (byte order mark) before any other data in the HTML document itself.
--    (See also 'detectBom' function for details.)
-- 2. A @<meta>@ declaration with a @charset@ attribute or an @http-equiv@
--    attribute set to @Content-Type@ and a value set for @charset@.
--    Note that it looks at only first 1024 bytes.
--    (See also 'detectMetaCharset' for details.)
-- 3. [Mozilla's Charset
--    Detectors](https://www-archive.mozilla.org/projects/intl/chardet.html)
--    heuristics.  To be specific, it delegates to 'detectEncodingName' from the
--    [charsetdetect-ae](https://hackage.haskell.org/package/charsetdetect-ae)
--    package, a Haskell implementation of that.
--
-- >>> :set -XOverloadedStrings
-- >>> detect "\xef\xbb\xbf\xe4\xbd\xa0\xe5\xa5\xbd<html><head>..."
-- Just "UTF-8"
-- >>> detect "<html><head><meta charset=latin-1>..."
-- Just "latin-1"
-- >>> detect "<html><head><title>\xbe\xee\xbc\xad\xbf\xc0\xbc\xbc\xbf\xe4..."
-- Just "EUC-KR"
--
-- It may return 'Nothing' if it fails to determine the character encoding,
-- although it's less likely.
detect :: ByteString -> Maybe EncodingName
detect :: ByteString -> Maybe EncodingName
detect ByteString
fragment =
    forall a. [a] -> Maybe a
listToMaybe forall a b. (a -> b) -> a -> b
$ forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe (forall a b. (a -> b) -> a -> b
$ ByteString
fragment)
        [ ByteString -> Maybe EncodingName
detectBom
        , ByteString -> Maybe EncodingName
detectMetaCharset forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int64 -> ByteString -> ByteString
take Int64
1024
        , ByteString -> Maybe EncodingName
detectEncodingName
        ]

-- | Detect the character encoding from a given HTML fragment by looking the
-- initial BOM (byte order mark).
--
-- >>> :set -XOverloadedStrings
-- >>> detectBom "\xef\xbb\xbf\xe4\xbd\xa0\xe5\xa5\xbd"
-- Just "UTF-8"
-- >>> detectBom "\xfe\xff\x4f\x60\x59\x7d"
-- Just "UTF-16BE"
-- >>> detectBom "\xff\xfe\x60\x4f\x7d\x59"
-- Just "UTF-16LE"
-- >>> detectBom "\x00\x00\xfe\xff\x00\x00\x4f\x60\x00\x00\x59\x7d"
-- Just "UTF-32BE"
-- >>> detectBom "\xff\xfe\x00\x00\x60\x4f\x00\x00\x7d\x59\x00\x00"
-- Just "UTF-32LE"
-- >>> detectBom "\x84\x31\x95\x33\xc4\xe3\xba\xc3"
-- Just "GB-18030"
--
-- It returns 'Nothing' if it fails to find no valid BOM sequence.
--
-- >>> detectBom "foobar"
-- Nothing
detectBom :: ByteString -> Maybe EncodingName
detectBom :: ByteString -> Maybe EncodingName
detectBom ByteString
fragment =
    case Int64 -> ByteString -> ByteString
take Int64
2 ByteString
fragment of
        ByteString
"\xfe\xff" -> forall a. a -> Maybe a
Just EncodingName
"UTF-16BE"
        ByteString
"\xef\xbb" -> if Int64 -> ByteString -> ByteString
take Int64
1 (Int64 -> ByteString -> ByteString
drop Int64
2 ByteString
fragment) forall a. Eq a => a -> a -> Bool
== ByteString
"\xbf"
            then forall a. a -> Maybe a
Just EncodingName
"UTF-8"
            else forall a. Maybe a
Nothing
        ByteString
"\x00\x00" -> if Int64 -> ByteString -> ByteString
take Int64
2 (Int64 -> ByteString -> ByteString
drop Int64
2 ByteString
fragment) forall a. Eq a => a -> a -> Bool
== ByteString
"\xfe\xff"
            then forall a. a -> Maybe a
Just EncodingName
"UTF-32BE"
            else forall a. Maybe a
Nothing
        ByteString
"\xff\xfe" -> if Int64 -> ByteString -> ByteString
take Int64
2 (Int64 -> ByteString -> ByteString
drop Int64
2 ByteString
fragment) forall a. Eq a => a -> a -> Bool
== ByteString
"\x00\x00"
            then forall a. a -> Maybe a
Just EncodingName
"UTF-32LE"
            else forall a. a -> Maybe a
Just EncodingName
"UTF-16LE"
        ByteString
"\x84\x31" -> if Int64 -> ByteString -> ByteString
take Int64
2 (Int64 -> ByteString -> ByteString
drop Int64
2 ByteString
fragment) forall a. Eq a => a -> a -> Bool
== ByteString
"\x95\x33"
            then forall a. a -> Maybe a
Just EncodingName
"GB-18030"
            else forall a. Maybe a
Nothing
        ByteString
_ -> forall a. Maybe a
Nothing

-- | Detect the character encoding from a given HTML fragment by looking
-- a @<meta>@ declaration with a @charset@ attribute or an @http-equiv@
-- attribute set to @Content-Type@ and a value set for @charset@.
--
-- >>> :set -XOverloadedStrings
-- >>> detectMetaCharset "<html><head><meta charset=utf-8>"
-- Just "utf-8"
-- >>> detectMetaCharset "<html><head><meta charset='EUC-KR'>"
-- Just "EUC-KR"
-- >>> detectMetaCharset "<html><head><meta charset=\"latin-1\"/></head></html>"
-- Just "latin-1"
-- >>> :{
-- detectMetaCharset
--      "<meta http-equiv=content-type content='text/html; charset=utf-8'>"
-- :}
-- Just "utf-8"
--
-- Return 'Nothing' if it failed to any appropriate @<meta>@ tag:
--
-- >>> detectMetaCharset "<html><body></body></html>"
-- Nothing
detectMetaCharset :: ByteString -> Maybe EncodingName
detectMetaCharset :: ByteString -> Maybe EncodingName
detectMetaCharset ByteString
fragment =
    case forall a. Parser a -> ByteString -> Result a
parse Parser (Maybe ByteString)
html ByteString
fragment of
        Fail {} -> forall a. Maybe a
Nothing
        Done ByteString
_ Maybe ByteString
r -> ByteString -> EncodingName
decodeAscii forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Maybe ByteString
r
  where
    html :: Parser (Maybe Data.ByteString.ByteString)
    html :: Parser (Maybe ByteString)
html = do
        [Maybe ByteString]
metas <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
            [ do
                -- [^<]+
                forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Parser ByteString
takeWhile1 (forall a. Eq a => a -> a -> Bool
/= Word8
0x3c)
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
            , Parser (Maybe ByteString)
metaCharset
            , do
                -- [<]
                forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ Word8 -> Parser Word8
word8 Word8
0x3c
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
            ]
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case forall a. [Maybe a] -> [a]
catMaybes [Maybe ByteString]
metas of
            [] -> forall a. Maybe a
Nothing
            ByteString
name : [ByteString]
_ -> forall a. a -> Maybe a
Just ByteString
name
    metaCharset :: Parser (Maybe Data.ByteString.ByteString)
    metaCharset :: Parser (Maybe ByteString)
metaCharset = do
        forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ ByteString -> Parser ByteString
string ByteString
"<meta"
        forall (f :: * -> *) a. Functor f => f a -> f ()
void Parser Word8
space
        [Maybe ByteString]
candidates <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
            [ do
                -- [^>Cc]+
                forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Parser ByteString
takeWhile1 forall a b. (a -> b) -> a -> b
$ \ Word8
c -> Word8
c forall a. Eq a => a -> a -> Bool
/= Word8
0x3e Bool -> Bool -> Bool
&& Word8
c forall a. Eq a => a -> a -> Bool
/= Word8
0x43 Bool -> Bool -> Bool
&& Word8
c forall a. Eq a => a -> a -> Bool
/= Word8
0x63
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
            , do
                -- (?iCHARSET)
                forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ EncodingName -> Parser ByteString ()
string' EncodingName
"CHARSET"
                -- [=]
                forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ Word8 -> Parser Word8
word8 Word8
0x3d
                -- ["']?
                Word8
q <- forall (f :: * -> *) a. Alternative f => a -> f a -> f a
option Word8
0 forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Parser Word8
satisfy forall a b. (a -> b) -> a -> b
$ \ Word8
c -> Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x22 Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x27
                -- [A-Za-z0-9._-]+
                ByteString
charset <- (Word8 -> Bool) -> Parser ByteString
takeWhile1 forall a b. (a -> b) -> a -> b
$ \ Word8
c ->
                    Word8
0x41 forall a. Ord a => a -> a -> Bool
<= Word8
c Bool -> Bool -> Bool
&& Word8
c forall a. Ord a => a -> a -> Bool
<= Word8
0x5a Bool -> Bool -> Bool
||
                    Word8
0x61 forall a. Ord a => a -> a -> Bool
<= Word8
c Bool -> Bool -> Bool
&& Word8
c forall a. Ord a => a -> a -> Bool
<= Word8
0x7a Bool -> Bool -> Bool
||
                    Word8
0x30 forall a. Ord a => a -> a -> Bool
<= Word8
c Bool -> Bool -> Bool
&& Word8
c forall a. Ord a => a -> a -> Bool
<= Word8
0x39 Bool -> Bool -> Bool
||
                    Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x2e Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x5f Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x2d
                forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Word8
q forall a. Eq a => a -> a -> Bool
/= Word8
0) forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ Word8 -> Parser Word8
word8 Word8
q
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a
Just ByteString
charset
            , do
                -- [^>]
                forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Parser Word8
satisfy (forall a. Eq a => a -> a -> Bool
/= Word8
0x3e)
                forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
            ]
        forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case forall a. [Maybe a] -> [a]
catMaybes [Maybe ByteString]
candidates of
            [] -> forall a. Maybe a
Nothing
            ByteString
name : [ByteString]
_ -> forall a. a -> Maybe a
Just ByteString
name
    isSpace :: Word8 -> Bool
    isSpace :: Word8 -> Bool
isSpace Word8
c =
        -- [ \t\n\xff\r]
        Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x20 Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x09 Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x0a Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0xff Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x0d
    space :: Parser Word8
    space :: Parser Word8
space = (Word8 -> Bool) -> Parser Word8
satisfy Word8 -> Bool
isSpace
    string' :: String -> Parser ()
    string' :: EncodingName -> Parser ByteString ()
string' EncodingName
s = forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, Monad m) =>
t (m a) -> m ()
sequence_
        [ (Word8 -> Bool) -> Parser Word8
satisfy forall a b. (a -> b) -> a -> b
$ \ Word8
i ->
            forall a. Enum a => Int -> a
toEnum (forall a. Enum a => a -> Int
fromEnum Char
c) forall a. Eq a => a -> a -> Bool
== Word8
i Bool -> Bool -> Bool
|| forall a. Enum a => Int -> a
toEnum (forall a. Enum a => a -> Int
fromEnum Char
c forall a. Num a => a -> a -> a
+ Int
32) forall a. Eq a => a -> a -> Bool
== Word8
i
        | Char
c <- EncodingName
s
        ]
    decodeAscii :: Data.ByteString.ByteString -> String
    decodeAscii :: ByteString -> EncodingName
decodeAscii =
        forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall a. Enum a => Int -> a
toEnum forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Enum a => a -> Int
fromEnum) forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [Word8]
Data.ByteString.unpack