{-# LANGUAGE OverloadedStrings, TupleSections, ViewPatterns #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE CPP #-}
module Text.HTML.TagStream
  ( Token (..)
  , tokenStream
  ) where

import           Control.Applicative
import           Control.Monad (unless)
import           Control.Monad.Trans.Resource (MonadThrow)
import           Data.Char
import qualified Data.Conduit.List as CL

import           Data.Attoparsec.Text
import           Data.Conduit
import qualified Data.Conduit.Attoparsec as CA
import           Data.Maybe (fromMaybe, isJust)
import           Data.Monoid ((<>))
import           Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as B
import qualified Text.XML.Stream.Parse as XML
import           Data.Map (Map)
import qualified Data.Map.Strict as Map
import           Control.Arrow (first)

data Token
  = TagOpen Text (Map Text Text) Bool
  | TagClose Text
  | Text Text
  | Comment Text
  | Special Text Text
  | Incomplete Text
  deriving (Token -> Token -> Bool
(Token -> Token -> Bool) -> (Token -> Token -> Bool) -> Eq Token
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Token -> Token -> Bool
$c/= :: Token -> Token -> Bool
== :: Token -> Token -> Bool
$c== :: Token -> Token -> Bool
Eq, Int -> Token -> ShowS
[Token] -> ShowS
Token -> String
(Int -> Token -> ShowS)
-> (Token -> String) -> ([Token] -> ShowS) -> Show Token
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Token] -> ShowS
$cshowList :: [Token] -> ShowS
show :: Token -> String
$cshow :: Token -> String
showsPrec :: Int -> Token -> ShowS
$cshowsPrec :: Int -> Token -> ShowS
Show)

data TagType
  = TagTypeClose
  | TagTypeSpecial
  | TagTypeNormal

{--
 - match quoted string, can fail.
 -}
quoted :: Char -> Parser Text
quoted :: Char -> Parser Text
quoted Char
q = Text -> Text -> Text
T.append (Text -> Text -> Text) -> Parser Text -> Parser Text (Text -> Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Bool) -> Parser Text
takeTill ((Char, Char) -> Char -> Bool
forall a. Eq a => (a, a) -> a -> Bool
in2 (Char
'\\',Char
q))
                    Parser Text (Text -> Text) -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> ( Char -> Parser Char
char Char
q Parser Char -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Text -> Parser Text
forall (f :: * -> *) a. Applicative f => a -> f a
pure Text
""
                      Parser Text -> Parser Text -> Parser Text
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> Parser Char
char Char
'\\' Parser Char -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Int -> Parser Text -> Parser Text
atLeast Int
1 (Char -> Parser Text
quoted Char
q) )

quotedOr :: Parser Text -> Parser Text
quotedOr :: Parser Text -> Parser Text
quotedOr Parser Text
p = Parser Char -> Parser (Maybe Char)
forall a. Parser a -> Parser (Maybe a)
maybeP ((Char -> Bool) -> Parser Char
satisfy ((Char, Char) -> Char -> Bool
forall a. Eq a => (a, a) -> a -> Bool
in2 (Char
'"',Char
'\''))) Parser (Maybe Char) -> (Maybe Char -> Parser Text) -> Parser Text
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>=
             Parser Text -> (Char -> Parser Text) -> Maybe Char -> Parser Text
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Parser Text
p Char -> Parser Text
quoted

{--
 - attribute value, can't fail.
 -}
attrValue :: Parser Text
attrValue :: Parser Text
attrValue = Parser Text -> Parser Text
quotedOr (Parser Text -> Parser Text) -> Parser Text -> Parser Text
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Parser Text
takeTill ((Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'>') (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
forall (f :: * -> *). Applicative f => f Bool -> f Bool -> f Bool
||. Char -> Bool
isSpace)

{--
 - attribute name, at least one char, can fail when meet tag end.
 - might match self-close tag end "/>" , make sure match `tagEnd' first.
 -}
attrName :: Parser Text
attrName :: Parser Text
attrName = Parser Text -> Parser Text
quotedOr (Parser Text -> Parser Text) -> Parser Text -> Parser Text
forall a b. (a -> b) -> a -> b
$
             Char -> Text -> Text
T.cons (Char -> Text -> Text) -> Parser Char -> Parser Text (Text -> Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Bool) -> Parser Char
satisfy (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'>')
                    Parser Text (Text -> Text) -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> (Char -> Bool) -> Parser Text
takeTill ((Char, Char, Char) -> Char -> Bool
forall a. Eq a => (a, a, a) -> a -> Bool
in3 (Char
'/',Char
'>',Char
'=') (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
forall (f :: * -> *). Applicative f => f Bool -> f Bool -> f Bool
||. Char -> Bool
isSpace)

{--
 - tag end, return self-close or not, can fail.
 -}
tagEnd :: Parser Bool
tagEnd :: Parser Bool
tagEnd = Char -> Parser Char
char Char
'>' Parser Char -> Parser Bool -> Parser Bool
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Bool -> Parser Bool
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
False
     Parser Bool -> Parser Bool -> Parser Bool
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> Parser Text
string Text
"/>" Parser Text -> Parser Bool -> Parser Bool
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Bool -> Parser Bool
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
True

{--
 - attribute pair or tag end, can fail if tag end met.
 -}
attr :: Parser (Text, Text)
attr :: Parser (Text, Text)
attr = (,) (Text -> Text -> (Text, Text))
-> Parser Text -> Parser Text (Text -> (Text, Text))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text
attrName Parser Text (Text -> (Text, Text))
-> Parser Text () -> Parser Text (Text -> (Text, Text))
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser Text ()
skipSpace
           Parser Text (Text -> (Text, Text))
-> Parser Text -> Parser (Text, Text)
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> ( Parser Char -> Parser Bool
forall a. Parser a -> Parser Bool
boolP (Char -> Parser Char
char Char
'=') Parser Bool -> (Bool -> Parser Text) -> Parser Text
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>=
                 Parser Text -> Parser Text -> Bool -> Parser Text
forall a. a -> a -> Bool -> a
cond (Parser Text ()
skipSpace Parser Text () -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser Text
attrValue)
                      (Text -> Parser Text
forall (f :: * -> *) a. Applicative f => a -> f a
pure Text
"")
               )

{--
 - all attributes before tag end. can't fail.
 -}
attrs :: Parser (Map Text Text, Bool)
attrs :: Parser (Map Text Text, Bool)
attrs = Map Text Text -> Parser (Map Text Text, Bool)
loop Map Text Text
forall k a. Map k a
Map.empty
  where
    loop :: Map Text Text -> Parser (Map Text Text, Bool)
loop Map Text Text
acc = Parser Text ()
skipSpace Parser Text ()
-> Parser Text (Either Bool (Text, Text))
-> Parser Text (Either Bool (Text, Text))
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> (Bool -> Either Bool (Text, Text)
forall a b. a -> Either a b
Left (Bool -> Either Bool (Text, Text))
-> Parser Bool -> Parser Text (Either Bool (Text, Text))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Bool
tagEnd Parser Text (Either Bool (Text, Text))
-> Parser Text (Either Bool (Text, Text))
-> Parser Text (Either Bool (Text, Text))
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Text, Text) -> Either Bool (Text, Text)
forall a b. b -> Either a b
Right ((Text, Text) -> Either Bool (Text, Text))
-> Parser (Text, Text) -> Parser Text (Either Bool (Text, Text))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser (Text, Text)
attr) Parser Text (Either Bool (Text, Text))
-> (Either Bool (Text, Text) -> Parser (Map Text Text, Bool))
-> Parser (Map Text Text, Bool)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>=
               (Bool -> Parser (Map Text Text, Bool))
-> ((Text, Text) -> Parser (Map Text Text, Bool))
-> Either Bool (Text, Text)
-> Parser (Map Text Text, Bool)
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either
                 ((Map Text Text, Bool) -> Parser (Map Text Text, Bool)
forall (m :: * -> *) a. Monad m => a -> m a
return ((Map Text Text, Bool) -> Parser (Map Text Text, Bool))
-> (Bool -> (Map Text Text, Bool))
-> Bool
-> Parser (Map Text Text, Bool)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Map Text Text
acc,))
                 (\(Text
key, Text
value) -> Map Text Text -> Parser (Map Text Text, Bool)
loop (Map Text Text -> Parser (Map Text Text, Bool))
-> Map Text Text -> Parser (Map Text Text, Bool)
forall a b. (a -> b) -> a -> b
$ Text -> Text -> Map Text Text -> Map Text Text
forall k a. Ord k => k -> a -> Map k a -> Map k a
Map.insert Text
key Text
value Map Text Text
acc)

{--
 - comment tag without prefix.
 -}
comment :: Parser Token
comment :: Parser Token
comment = Text -> Token
Comment (Text -> Token) -> Parser Text -> Parser Token
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text
comment'
  where comment' :: Parser Text
comment' = Text -> Text -> Text
T.append (Text -> Text -> Text) -> Parser Text -> Parser Text (Text -> Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Bool) -> Parser Text
takeTill (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'-')
                            Parser Text (Text -> Text) -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> ( Text -> Parser Text
string Text
"-->" Parser Text -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Text -> Parser Text
forall (m :: * -> *) a. Monad m => a -> m a
return Text
""
                              Parser Text -> Parser Text -> Parser Text
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Int -> Parser Text -> Parser Text
atLeast Int
1 Parser Text
comment' )

{--
 - tags begine with <! , e.g. <!DOCTYPE ...>
 -}
special :: Parser Token
special :: Parser Token
special = Text -> Text -> Token
Special
          (Text -> Text -> Token)
-> Parser Text -> Parser Text (Text -> Token)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ( Char -> Text -> Text
T.cons (Char -> Text -> Text) -> Parser Char -> Parser Text (Text -> Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Bool) -> Parser Char
satisfy (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'-') (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
forall (f :: * -> *). Applicative f => f Bool -> f Bool -> f Bool
||. Char -> Bool
isSpace))
                       Parser Text (Text -> Text) -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> (Char -> Bool) -> Parser Text
takeTill ((Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'>') (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
forall (f :: * -> *). Applicative f => f Bool -> f Bool -> f Bool
||. Char -> Bool
isSpace)
                       Parser Text -> Parser Text () -> Parser Text
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser Text ()
skipSpace )
          Parser Text (Text -> Token) -> Parser Text -> Parser Token
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> (Char -> Bool) -> Parser Text
takeTill (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'>') Parser Token -> Parser Char -> Parser Token
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Char -> Parser Char
char Char
'>'

{--
 - parse a tag, can fail.
 -}
tag :: Parser Token
tag :: Parser Token
tag = do
    TagType
t <-     Text -> Parser Text
string Text
"/" Parser Text -> Parser Text TagType -> Parser Text TagType
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> TagType -> Parser Text TagType
forall (m :: * -> *) a. Monad m => a -> m a
return TagType
TagTypeClose
         Parser Text TagType -> Parser Text TagType -> Parser Text TagType
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Text -> Parser Text
string Text
"!" Parser Text -> Parser Text TagType -> Parser Text TagType
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> TagType -> Parser Text TagType
forall (m :: * -> *) a. Monad m => a -> m a
return TagType
TagTypeSpecial
         Parser Text TagType -> Parser Text TagType -> Parser Text TagType
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> TagType -> Parser Text TagType
forall (m :: * -> *) a. Monad m => a -> m a
return TagType
TagTypeNormal
    case TagType
t of
        TagType
TagTypeClose ->
            Text -> Token
TagClose (Text -> Token) -> Parser Text -> Parser Token
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Bool) -> Parser Text
takeTill (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'>')
            Parser Token -> Parser Char -> Parser Token
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Char -> Parser Char
char Char
'>'
        TagType
TagTypeSpecial -> Parser Text -> Parser Bool
forall a. Parser a -> Parser Bool
boolP (Text -> Parser Text
string Text
"--") Parser Bool -> (Bool -> Parser Token) -> Parser Token
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>=
                          Parser Token -> Parser Token -> Bool -> Parser Token
forall a. a -> a -> Bool -> a
cond Parser Token
comment Parser Token
special
        TagType
TagTypeNormal -> do
            Text
name <- (Char -> Bool) -> Parser Text
takeTill ((Char, Char, Char) -> Char -> Bool
forall a. Eq a => (a, a, a) -> a -> Bool
in3 (Char
'<',Char
'>',Char
'/') (Char -> Bool) -> (Char -> Bool) -> Char -> Bool
forall (f :: * -> *). Applicative f => f Bool -> f Bool -> f Bool
||. Char -> Bool
isSpace)
            (Map Text Text
as, Bool
close) <- Parser (Map Text Text, Bool)
attrs
            Token -> Parser Token
forall (m :: * -> *) a. Monad m => a -> m a
return (Token -> Parser Token) -> Token -> Parser Token
forall a b. (a -> b) -> a -> b
$ Text -> Map Text Text -> Bool -> Token
TagOpen Text
name ((Text -> Text) -> Map Text Text -> Map Text Text
forall a b k. (a -> b) -> Map k a -> Map k b
Map.map Text -> Text
decodeString Map Text Text
as) Bool
close

{--
 - record incomplete tag for streamline processing.
 -}
incomplete :: Parser Token
incomplete :: Parser Token
incomplete = Text -> Token
Incomplete (Text -> Token) -> (Text -> Text) -> Text -> Token
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Text -> Text
T.cons Char
'<' (Text -> Token) -> Parser Text -> Parser Token
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text
takeText

{--
 - parse text node. consume at least one char, to make sure progress.
 -}
text :: Parser Token
text :: Parser Token
text = Text -> Token
Text (Text -> Token) -> Parser Text -> Parser Token
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int -> Parser Text -> Parser Text
atLeast Int
1 ((Char -> Bool) -> Parser Text
takeTill (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'<'))

decodeEntity :: MonadThrow m => Text -> m Text
decodeEntity :: Text -> m Text
decodeEntity Text
entity =
             ConduitT () Void m Text -> m Text
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit
           (ConduitT () Void m Text -> m Text)
-> ConduitT () Void m Text -> m Text
forall a b. (a -> b) -> a -> b
$ [Text] -> ConduitT () Text m ()
forall (m :: * -> *) a i. Monad m => [a] -> ConduitT i a m ()
CL.sourceList [Text
"&",Text
entity,Text
";"]
#if MIN_VERSION_xml_conduit(1,9,0)
          ConduitT () Text m ()
-> ConduitM Text Void m Text -> ConduitT () Void m Text
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ParseSettings -> ConduitT Text Event m ()
forall (m :: * -> *).
MonadThrow m =>
ParseSettings -> ConduitT Text Event m ()
XML.parseText ParseSettings
forall a. Default a => a
XML.def { psDecodeEntities :: DecodeEntities
XML.psDecodeEntities = DecodeEntities
XML.decodeHtmlEntities }
#else
          .| XML.parseText' XML.def { XML.psDecodeEntities = XML.decodeHtmlEntities }
#endif
          ConduitT Text Event m ()
-> ConduitM Event Void m Text -> ConduitM Text Void m Text
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM Event Void m Text
forall (m :: * -> *) o. MonadThrow m => ConduitT Event o m Text
XML.content

token :: Parser Token
token :: Parser Token
token = Char -> Parser Char
char Char
'<' Parser Char -> Parser Token -> Parser Token
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> (Parser Token
tag Parser Token -> Parser Token -> Parser Token
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Token
incomplete)
    Parser Token -> Parser Token -> Parser Token
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Token
text

{--
 - treat script tag specially, can't fail.
 -}
tillScriptEnd :: Token -> Parser [Token]
tillScriptEnd :: Token -> Parser [Token]
tillScriptEnd Token
open =
    Builder -> Parser [Token]
loop Builder
forall a. Monoid a => a
mempty
  where
    loop :: Builder -> Parser [Token]
loop Builder
acc = do
      Text
chunk <- (Char -> Bool) -> Parser Text
takeTill (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'<')
      let acc' :: Builder
acc' = Builder
acc Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Text -> Builder
B.fromText Text
chunk
          finish :: Parser [Token]
finish = [Token] -> Parser [Token]
forall (f :: * -> *) a. Applicative f => a -> f a
pure [Token
open, Text -> Token
Text (Text -> Token) -> Text -> Token
forall a b. (a -> b) -> a -> b
$ Text -> Text
L.toStrict (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ Builder -> Text
B.toLazyText Builder
acc', Text -> Token
TagClose Text
"script"]
          hasContent :: Parser [Token]
hasContent = (Text -> Parser Text
string Text
"/script>" Parser Text -> Parser [Token] -> Parser [Token]
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser [Token]
finish) Parser [Token] -> Parser [Token] -> Parser [Token]
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Builder -> Parser [Token]
loop (Builder
acc' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
"<")
      (Char -> Parser Char
char Char
'<' Parser Char -> Parser [Token] -> Parser [Token]
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser [Token]
hasContent) Parser [Token] -> Parser [Token] -> Parser [Token]
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser [Token]
finish

tokens :: Parser [Token]
tokens :: Parser [Token]
tokens = do
  Token
t <- Parser Token
token
  case Token
t of
    TagOpen Text
"script" Map Text Text
_ Bool
False -> Token -> Parser [Token]
tillScriptEnd Token
t
    Text Text
text0 -> do
      let parseText :: Parser Text
parseText = do
            Text Text
text <- Parser Token
token
            Text -> Parser Text
forall (f :: * -> *) a. Applicative f => a -> f a
pure Text
text
      [Text]
texts <- Parser Text -> Parser Text [Text]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many Parser Text
parseText
      [Token] -> Parser [Token]
forall (f :: * -> *) a. Applicative f => a -> f a
pure [Text -> Token
Text (Text -> Token) -> Text -> Token
forall a b. (a -> b) -> a -> b
$ Text -> Text
decodeString (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ [Text] -> Text
T.concat ([Text] -> Text) -> [Text] -> Text
forall a b. (a -> b) -> a -> b
$ Text
text0 Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
texts]
    Token
_ -> [Token] -> Parser [Token]
forall (f :: * -> *) a. Applicative f => a -> f a
pure [Token
t]

{--
 - Utils {{{
 -}

atLeast :: Int -> Parser Text -> Parser Text
atLeast :: Int -> Parser Text -> Parser Text
atLeast Int
0 Parser Text
p = Parser Text
p
atLeast Int
n Parser Text
p = Char -> Text -> Text
T.cons (Char -> Text -> Text) -> Parser Char -> Parser Text (Text -> Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Char
anyChar Parser Text (Text -> Text) -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Int -> Parser Text -> Parser Text
atLeast (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1) Parser Text
p

cond :: a -> a -> Bool -> a
cond :: a -> a -> Bool -> a
cond a
a1 a
a2 Bool
b = if Bool
b then a
a1 else a
a2

(||.) :: Applicative f => f Bool -> f Bool -> f Bool
||. :: f Bool -> f Bool -> f Bool
(||.) = (Bool -> Bool -> Bool) -> f Bool -> f Bool -> f Bool
forall (f :: * -> *) a b c.
Applicative f =>
(a -> b -> c) -> f a -> f b -> f c
liftA2 Bool -> Bool -> Bool
(||)

in2 :: Eq a => (a,a) -> a -> Bool
in2 :: (a, a) -> a -> Bool
in2 (a
a1,a
a2) a
a = a
aa -> a -> Bool
forall a. Eq a => a -> a -> Bool
==a
a1 Bool -> Bool -> Bool
|| a
aa -> a -> Bool
forall a. Eq a => a -> a -> Bool
==a
a2

in3 :: Eq a => (a,a,a) -> a -> Bool
in3 :: (a, a, a) -> a -> Bool
in3 (a
a1,a
a2,a
a3) a
a = a
aa -> a -> Bool
forall a. Eq a => a -> a -> Bool
==a
a1 Bool -> Bool -> Bool
|| a
aa -> a -> Bool
forall a. Eq a => a -> a -> Bool
==a
a2 Bool -> Bool -> Bool
|| a
aa -> a -> Bool
forall a. Eq a => a -> a -> Bool
==a
a3

boolP :: Parser a -> Parser Bool
boolP :: Parser a -> Parser Bool
boolP Parser a
p = Parser a
p Parser a -> Parser Bool -> Parser Bool
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Bool -> Parser Bool
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
True Parser Bool -> Parser Bool -> Parser Bool
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Bool -> Parser Bool
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
False

maybeP :: Parser a -> Parser (Maybe a)
maybeP :: Parser a -> Parser (Maybe a)
maybeP Parser a
p = a -> Maybe a
forall a. a -> Maybe a
Just (a -> Maybe a) -> Parser a -> Parser (Maybe a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser a
p Parser (Maybe a) -> Parser (Maybe a) -> Parser (Maybe a)
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Maybe a -> Parser (Maybe a)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe a
forall a. Maybe a
Nothing
-- }}}

-- {{{ Stream
tokenStream :: Monad m
            => ConduitT Text Token m ()
tokenStream :: ConduitT Text Token m ()
tokenStream =
    (Text -> Bool) -> ConduitT Text Text m ()
forall (m :: * -> *) a. Monad m => (a -> Bool) -> ConduitT a a m ()
CL.filter (Bool -> Bool
not (Bool -> Bool) -> (Text -> Bool) -> Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
T.null) ConduitT Text Text m ()
-> ConduitT Text Token m () -> ConduitT Text Token m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| Parser [Token]
-> ConduitT Text (Either ParseError (PositionRange, [Token])) m ()
forall (m :: * -> *) a b.
(Monad m, AttoparsecInput a) =>
Parser a b
-> ConduitT a (Either ParseError (PositionRange, b)) m ()
CA.conduitParserEither Parser [Token]
tokens ConduitT Text (Either ParseError (PositionRange, [Token])) m ()
-> ConduitM (Either ParseError (PositionRange, [Token])) Token m ()
-> ConduitT Text Token m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| (Either ParseError (PositionRange, [Token]) -> [Token])
-> ConduitM (Either ParseError (PositionRange, [Token])) Token m ()
forall (m :: * -> *) a b.
Monad m =>
(a -> [b]) -> ConduitT a b m ()
CL.concatMap Either ParseError (PositionRange, [Token]) -> [Token]
forall a a p. Show a => Either a (a, p) -> p
go
  where
    go :: Either a (a, p) -> p
go (Left a
e) = String -> p
forall a. HasCallStack => String -> a
error (String -> p) -> String -> p
forall a b. (a -> b) -> a -> b
$ String
"html-conduit: parse error that should never happen occurred! " String -> ShowS
forall a. [a] -> [a] -> [a]
++ a -> String
forall a. Show a => a -> String
show a
e
    go (Right (a
_, p
tokens')) = p
tokens'

splitAccum :: [Token] -> (Text, [Token])
splitAccum :: [Token] -> (Text, [Token])
splitAccum [] = (Text
forall a. Monoid a => a
mempty, [])
splitAccum ([Token] -> [Token]
forall a. [a] -> [a]
reverse -> (Incomplete Text
s : [Token]
xs)) = (Text
s, [Token] -> [Token]
forall a. [a] -> [a]
reverse [Token]
xs)
splitAccum [Token]
tokens = (Text
forall a. Monoid a => a
mempty, [Token]
tokens)

-- Entities

-- | A conduit to decode entities from a stream of tokens into a new stream of tokens.
decodeEntities :: Monad m => ConduitT Token Token m ()
decodeEntities :: ConduitT Token Token m ()
decodeEntities =
    ConduitT Token Token m ()
start
  where
    start :: ConduitT Token Token m ()
start = ConduitT Token Token m (Maybe Token)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await ConduitT Token Token m (Maybe Token)
-> (Maybe Token -> ConduitT Token Token m ())
-> ConduitT Token Token m ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= ConduitT Token Token m ()
-> (Token -> ConduitT Token Token m ())
-> Maybe Token
-> ConduitT Token Token m ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (() -> ConduitT Token Token m ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()) (\Token
token' -> Token -> ConduitT Token Token m ()
forall (m :: * -> *). Monad m => Token -> ConduitM Token Token m ()
start' Token
token' ConduitT Token Token m ()
-> ConduitT Token Token m () -> ConduitT Token Token m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT Token Token m ()
start)
    start' :: Token -> ConduitM Token Token m ()
start' (Text Text
t) = (Text -> ConduitT Token Text m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield Text
t ConduitT Token Text m ()
-> ConduitT Token Text m () -> ConduitT Token Text m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT Token Text m ()
forall (m :: * -> *). Monad m => ConduitT Token Text m ()
yieldWhileText) ConduitT Token Text m ()
-> ConduitM Text Token m () -> ConduitM Token Token m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitT Text Text m ()
forall (m :: * -> *). Monad m => ConduitT Text Text m ()
decodeEntities' ConduitT Text Text m ()
-> ConduitM Text Token m () -> ConduitM Text Token m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| (Text -> Maybe Token) -> ConduitM Text Token m ()
forall (m :: * -> *) a b.
Monad m =>
(a -> Maybe b) -> ConduitT a b m ()
CL.mapMaybe Text -> Maybe Token
go
    start' (TagOpen Text
name Map Text Text
attrs' Bool
bool) = Token -> ConduitM Token Token m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Text -> Map Text Text -> Bool -> Token
TagOpen Text
name ((Text -> Text) -> Map Text Text -> Map Text Text
forall a b k. (a -> b) -> Map k a -> Map k b
Map.map Text -> Text
decodeString Map Text Text
attrs') Bool
bool)
    start' Token
token' = Token -> ConduitM Token Token m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield Token
token'

    go :: Text -> Maybe Token
go Text
t
        | Text
t Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
""   = Maybe Token
forall a. Maybe a
Nothing
        | Bool
otherwise = Token -> Maybe Token
forall a. a -> Maybe a
Just (Text -> Token
Text Text
t)

-- | Decode entities in a complete string.
decodeString :: Text -> Text
decodeString :: Text -> Text
decodeString Text
input =
  case Text -> (Text, Text)
makeEntityDecoder Text
input of
    (Text
value', Text
remainder)
      | Text
value' Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
/= Text
forall a. Monoid a => a
mempty -> Text
value' Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text -> Text
decodeString Text
remainder
      | Bool
otherwise -> Text
input

decodeEntities' :: Monad m => ConduitT Text Text m ()
decodeEntities' :: ConduitT Text Text m ()
decodeEntities' =
    (Text -> Text) -> ConduitT Text Text m ()
forall (m :: * -> *).
Monad m =>
(Text -> Text) -> ConduitT Text Text m ()
loop Text -> Text
forall a. a -> a
id
  where
    loop :: (Text -> Text) -> ConduitT Text Text m ()
loop Text -> Text
accum = do
        Maybe Text
mchunk <- ConduitT Text Text m (Maybe Text)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await
        let chunk :: Text
chunk = Text -> Text
accum (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ Text -> Maybe Text -> Text
forall a. a -> Maybe a -> a
fromMaybe Text
forall a. Monoid a => a
mempty Maybe Text
mchunk
            (Text
newStr, Text
remainder) = Text -> (Text, Text)
makeEntityDecoder Text
chunk
        Text -> ConduitT Text Text m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield Text
newStr
        if Maybe Text -> Bool
forall a. Maybe a -> Bool
isJust Maybe Text
mchunk
            then (Text -> Text) -> ConduitT Text Text m ()
loop (Text -> Text -> Text
forall a. Monoid a => a -> a -> a
mappend Text
remainder)
            else Text -> ConduitT Text Text m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield Text
remainder

-- | Yield contiguous text tokens as strings.
yieldWhileText :: Monad m => ConduitT Token Text m ()
yieldWhileText :: ConduitT Token Text m ()
yieldWhileText =
    ConduitT Token Text m ()
loop
  where
    loop :: ConduitT Token Text m ()
loop = ConduitT Token Text m (Maybe Token)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await ConduitT Token Text m (Maybe Token)
-> (Maybe Token -> ConduitT Token Text m ())
-> ConduitT Token Text m ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= ConduitT Token Text m ()
-> (Token -> ConduitT Token Text m ())
-> Maybe Token
-> ConduitT Token Text m ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (() -> ConduitT Token Text m ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()) Token -> ConduitT Token Text m ()
go
    go :: Token -> ConduitT Token Text m ()
go (Text Text
t) = Text -> ConduitT Token Text m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield Text
t ConduitT Token Text m ()
-> ConduitT Token Text m () -> ConduitT Token Text m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT Token Text m ()
loop
    go Token
token' = Token -> ConduitT Token Text m ()
forall i o (m :: * -> *). i -> ConduitT i o m ()
leftover Token
token'

-- | Decode the entities in a string type with a decoder.
makeEntityDecoder :: Text -> (Text, Text)
makeEntityDecoder :: Text -> (Text, Text)
makeEntityDecoder = (Builder -> Text) -> (Builder, Text) -> (Text, Text)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first (Text -> Text
L.toStrict (Text -> Text) -> (Builder -> Text) -> Builder -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> Text
B.toLazyText) ((Builder, Text) -> (Text, Text))
-> (Text -> (Builder, Text)) -> Text -> (Text, Text)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> (Builder, Text)
go
  where
    go :: Text -> (Builder, Text)
go Text
s =
      case (Char -> Bool) -> Text -> (Text, Text)
T.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'&') Text
s of
        (Text
_,Text
"") -> (Text -> Builder
B.fromText Text
s, Text
"")
        (Text
before,restPlusAmp :: Text
restPlusAmp@(Int -> Text -> Text
T.drop Int
1 -> Text
rest)) ->
          case (Char -> Bool) -> Text -> (Text, Text)
T.break (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (\Char
c -> Char -> Bool
isNameChar Char
c Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'#')) Text
rest of
            (Text
_,Text
"") -> (Text -> Builder
B.fromText Text
before, Text
restPlusAmp)
            (Text
entity,Text
after) -> (Builder
before1 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
before2, Text
after')
              where
                before1 :: Builder
before1 = Text -> Builder
B.fromText Text
before
                (Builder
before2, Text
after') =
                  case Maybe Text
mdecoded of
                    Maybe Text
Nothing -> (Builder -> Builder) -> (Builder, Text) -> (Builder, Text)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first ((Builder
"&" Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Text -> Builder
B.fromText Text
entity) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>) (Text -> (Builder, Text)
go Text
after)
                    Just (Text -> Builder
B.fromText -> Builder
decoded) ->
                      case Text -> Maybe (Char, Text)
T.uncons Text
after of
                        Just (Char
';',Text
validAfter) -> (Builder -> Builder) -> (Builder, Text) -> (Builder, Text)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first (Builder
decoded Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>) (Text -> (Builder, Text)
go Text
validAfter)
                        Just (Char
_invalid,Text
_rest) -> (Builder -> Builder) -> (Builder, Text) -> (Builder, Text)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first (Builder
decoded Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>) (Text -> (Builder, Text)
go Text
after)
                        Maybe (Char, Text)
Nothing -> (Builder
forall a. Monoid a => a
mempty, Text
s)
                mdecoded :: Maybe Text
mdecoded =
                  if Text
entity Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
forall a. Monoid a => a
mempty
                     then Maybe Text
forall a. Maybe a
Nothing
                     else Text -> Maybe Text
forall (m :: * -> *). MonadThrow m => Text -> m Text
decodeEntity Text
entity

-- | Is the character a valid Name starter?
isNameStart :: Char -> Bool
isNameStart :: Char -> Bool
isNameStart Char
c =
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
':' Bool -> Bool -> Bool
||
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'_' Bool -> Bool -> Bool
||
  Char -> Bool
isAsciiUpper Char
c Bool -> Bool -> Bool
||
  Char -> Bool
isAsciiLower Char
c Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xC0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xD6') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xD8' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xF6') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xF8' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2FF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x370' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x37D') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x37F' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x1FFF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x200C' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x200D') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x2070' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x218F') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x2C00' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2FEF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x3001' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xD7FF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xF900' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xFDCF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\xFDF0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xFFFD') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x10000' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xEFFFF')

-- | Is the character valid in a Name?
isNameChar :: Char -> Bool
isNameChar :: Char -> Bool
isNameChar Char
c =
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'-' Bool -> Bool -> Bool
||
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'.' Bool -> Bool -> Bool
||
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\xB7' Bool -> Bool -> Bool
||
  Char -> Bool
isDigit Char
c Bool -> Bool -> Bool
||
  Char -> Bool
isNameStart Char
c Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x0300' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x036F') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x203F' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x2040')