{-# LANGUAGE OverloadedStrings #-}
module Text.Html.Encoding.Detection
( EncodingName
, detect
, detectBom
, detectMetaCharset
) where
import Control.Monad
import Data.Maybe
import Data.Word
import Prelude hiding (drop, take)
import Data.Attoparsec.ByteString hiding (Done, Fail, Result, parse, take)
import Data.Attoparsec.ByteString.Lazy (Result (..), parse)
import Codec.Text.Detect (detectEncodingName)
import qualified Data.ByteString
import Data.ByteString.Lazy
type EncodingName = String
detect :: ByteString -> Maybe EncodingName
detect :: ByteString -> Maybe EncodingName
detect ByteString
fragment =
forall a. [a] -> Maybe a
listToMaybe forall a b. (a -> b) -> a -> b
$ forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe (forall a b. (a -> b) -> a -> b
$ ByteString
fragment)
[ ByteString -> Maybe EncodingName
detectBom
, ByteString -> Maybe EncodingName
detectMetaCharset forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int64 -> ByteString -> ByteString
take Int64
1024
, ByteString -> Maybe EncodingName
detectEncodingName
]
detectBom :: ByteString -> Maybe EncodingName
detectBom :: ByteString -> Maybe EncodingName
detectBom ByteString
fragment =
case Int64 -> ByteString -> ByteString
take Int64
2 ByteString
fragment of
ByteString
"\xfe\xff" -> forall a. a -> Maybe a
Just EncodingName
"UTF-16BE"
ByteString
"\xef\xbb" -> if Int64 -> ByteString -> ByteString
take Int64
1 (Int64 -> ByteString -> ByteString
drop Int64
2 ByteString
fragment) forall a. Eq a => a -> a -> Bool
== ByteString
"\xbf"
then forall a. a -> Maybe a
Just EncodingName
"UTF-8"
else forall a. Maybe a
Nothing
ByteString
"\x00\x00" -> if Int64 -> ByteString -> ByteString
take Int64
2 (Int64 -> ByteString -> ByteString
drop Int64
2 ByteString
fragment) forall a. Eq a => a -> a -> Bool
== ByteString
"\xfe\xff"
then forall a. a -> Maybe a
Just EncodingName
"UTF-32BE"
else forall a. Maybe a
Nothing
ByteString
"\xff\xfe" -> if Int64 -> ByteString -> ByteString
take Int64
2 (Int64 -> ByteString -> ByteString
drop Int64
2 ByteString
fragment) forall a. Eq a => a -> a -> Bool
== ByteString
"\x00\x00"
then forall a. a -> Maybe a
Just EncodingName
"UTF-32LE"
else forall a. a -> Maybe a
Just EncodingName
"UTF-16LE"
ByteString
"\x84\x31" -> if Int64 -> ByteString -> ByteString
take Int64
2 (Int64 -> ByteString -> ByteString
drop Int64
2 ByteString
fragment) forall a. Eq a => a -> a -> Bool
== ByteString
"\x95\x33"
then forall a. a -> Maybe a
Just EncodingName
"GB-18030"
else forall a. Maybe a
Nothing
ByteString
_ -> forall a. Maybe a
Nothing
detectMetaCharset :: ByteString -> Maybe EncodingName
detectMetaCharset :: ByteString -> Maybe EncodingName
detectMetaCharset ByteString
fragment =
case forall a. Parser a -> ByteString -> Result a
parse Parser (Maybe ByteString)
html ByteString
fragment of
Fail {} -> forall a. Maybe a
Nothing
Done ByteString
_ Maybe ByteString
r -> ByteString -> EncodingName
decodeAscii forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Maybe ByteString
r
where
html :: Parser (Maybe Data.ByteString.ByteString)
html :: Parser (Maybe ByteString)
html = do
[Maybe ByteString]
metas <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
[ do
forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Parser ByteString
takeWhile1 (forall a. Eq a => a -> a -> Bool
/= Word8
0x3c)
forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
, Parser (Maybe ByteString)
metaCharset
, do
forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ Word8 -> Parser Word8
word8 Word8
0x3c
forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
]
forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case forall a. [Maybe a] -> [a]
catMaybes [Maybe ByteString]
metas of
[] -> forall a. Maybe a
Nothing
ByteString
name : [ByteString]
_ -> forall a. a -> Maybe a
Just ByteString
name
metaCharset :: Parser (Maybe Data.ByteString.ByteString)
metaCharset :: Parser (Maybe ByteString)
metaCharset = do
forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ ByteString -> Parser ByteString
string ByteString
"<meta"
forall (f :: * -> *) a. Functor f => f a -> f ()
void Parser Word8
space
[Maybe ByteString]
candidates <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many' forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
[ do
forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Parser ByteString
takeWhile1 forall a b. (a -> b) -> a -> b
$ \ Word8
c -> Word8
c forall a. Eq a => a -> a -> Bool
/= Word8
0x3e Bool -> Bool -> Bool
&& Word8
c forall a. Eq a => a -> a -> Bool
/= Word8
0x43 Bool -> Bool -> Bool
&& Word8
c forall a. Eq a => a -> a -> Bool
/= Word8
0x63
forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
, do
forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ EncodingName -> Parser ByteString ()
string' EncodingName
"CHARSET"
forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ Word8 -> Parser Word8
word8 Word8
0x3d
Word8
q <- forall (f :: * -> *) a. Alternative f => a -> f a -> f a
option Word8
0 forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Parser Word8
satisfy forall a b. (a -> b) -> a -> b
$ \ Word8
c -> Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x22 Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x27
ByteString
charset <- (Word8 -> Bool) -> Parser ByteString
takeWhile1 forall a b. (a -> b) -> a -> b
$ \ Word8
c ->
Word8
0x41 forall a. Ord a => a -> a -> Bool
<= Word8
c Bool -> Bool -> Bool
&& Word8
c forall a. Ord a => a -> a -> Bool
<= Word8
0x5a Bool -> Bool -> Bool
||
Word8
0x61 forall a. Ord a => a -> a -> Bool
<= Word8
c Bool -> Bool -> Bool
&& Word8
c forall a. Ord a => a -> a -> Bool
<= Word8
0x7a Bool -> Bool -> Bool
||
Word8
0x30 forall a. Ord a => a -> a -> Bool
<= Word8
c Bool -> Bool -> Bool
&& Word8
c forall a. Ord a => a -> a -> Bool
<= Word8
0x39 Bool -> Bool -> Bool
||
Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x2e Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x5f Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x2d
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Word8
q forall a. Eq a => a -> a -> Bool
/= Word8
0) forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ Word8 -> Parser Word8
word8 Word8
q
forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall a. a -> Maybe a
Just ByteString
charset
, do
forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Parser Word8
satisfy (forall a. Eq a => a -> a -> Bool
/= Word8
0x3e)
forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
]
forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ case forall a. [Maybe a] -> [a]
catMaybes [Maybe ByteString]
candidates of
[] -> forall a. Maybe a
Nothing
ByteString
name : [ByteString]
_ -> forall a. a -> Maybe a
Just ByteString
name
isSpace :: Word8 -> Bool
isSpace :: Word8 -> Bool
isSpace Word8
c =
Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x20 Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x09 Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x0a Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0xff Bool -> Bool -> Bool
|| Word8
c forall a. Eq a => a -> a -> Bool
== Word8
0x0d
space :: Parser Word8
space :: Parser Word8
space = (Word8 -> Bool) -> Parser Word8
satisfy Word8 -> Bool
isSpace
string' :: String -> Parser ()
string' :: EncodingName -> Parser ByteString ()
string' EncodingName
s = forall (t :: * -> *) (m :: * -> *) a.
(Foldable t, Monad m) =>
t (m a) -> m ()
sequence_
[ (Word8 -> Bool) -> Parser Word8
satisfy forall a b. (a -> b) -> a -> b
$ \ Word8
i ->
forall a. Enum a => Int -> a
toEnum (forall a. Enum a => a -> Int
fromEnum Char
c) forall a. Eq a => a -> a -> Bool
== Word8
i Bool -> Bool -> Bool
|| forall a. Enum a => Int -> a
toEnum (forall a. Enum a => a -> Int
fromEnum Char
c forall a. Num a => a -> a -> a
+ Int
32) forall a. Eq a => a -> a -> Bool
== Word8
i
| Char
c <- EncodingName
s
]
decodeAscii :: Data.ByteString.ByteString -> String
decodeAscii :: ByteString -> EncodingName
decodeAscii =
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall a. Enum a => Int -> a
toEnum forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Enum a => a -> Int
fromEnum) forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [Word8]
Data.ByteString.unpack