module Text.HTML.TagStream.Parser where
import Control.Applicative
import Data.ByteString (ByteString)
import Data.Attoparsec.Char8
import Blaze.ByteString.Builder (toByteString)
import Text.HTML.TagStream.Types
import Text.HTML.TagStream.Utils (cons, append)
quoted :: Char -> Parser ByteString
quoted q = append <$> takeTill (in2 ('\\',q))
<*> ( char q *> pure ""
<|> char '\\' *> atLeast 1 (quoted q) )
quotedOr :: Parser ByteString -> Parser ByteString
quotedOr p = maybeP (satisfy (in2 ('"','\''))) >>=
maybe p quoted
attrValue :: Parser ByteString
attrValue = quotedOr $ takeTill ((=='>') ||. isSpace)
attrName :: Parser ByteString
attrName = quotedOr $
cons <$> satisfy (/='>')
<*> takeTill (in3 ('/','>','=') ||. isSpace)
tagEnd :: Parser Bool
tagEnd = char '>' *> pure False
<|> string "/>" *> pure True
attr :: Parser Attr
attr = (,) <$> attrName <* skipSpace
<*> ( boolP (char '=') >>=
cond (skipSpace *> attrValue)
(pure "")
)
attrs :: Parser ([Attr], Bool)
attrs = loop []
where
loop acc = skipSpace *> (Left <$> tagEnd <|> Right <$> attr) >>=
either
(return . (reverse acc,))
(loop . (:acc))
comment :: Parser Token
comment = Comment <$> comment'
where comment' = append <$> takeTill (=='-')
<*> ( string "-->" *> return ""
<|> atLeast 1 comment' )
special :: Parser Token
special = Special
<$> ( cons <$> satisfy (not . ((=='-') ||. isSpace))
<*> takeTill ((=='>') ||. isSpace)
<* skipSpace )
<*> takeTill (=='>') <* char '>'
tag :: Parser Token
tag = do
t <- string "/" *> return TagTypeClose
<|> string "!" *> return TagTypeSpecial
<|> return TagTypeNormal
case t of
TagTypeClose ->
TagClose <$> takeTill (=='>')
<* char '>'
TagTypeSpecial -> boolP (string "--") >>=
cond comment special
TagTypeNormal -> do
name <- takeTill (in3 ('<','>','/') ||. isSpace)
(as, close) <- attrs
skipSpace
return $ TagOpen name as close
incomplete :: Parser Token
incomplete = Incomplete . cons '<' <$> takeByteString
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))
token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
tillScriptEnd :: Token -> Parser [Token]
tillScriptEnd t = reverse <$> loop [t]
<|> (:[]) . Incomplete . append script <$> takeByteString
where
script = toByteString $ showToken id t
loop acc = (:acc) <$> scriptEnd
<|> (text >>= loop . (:acc))
scriptEnd = string "</script>" *> return (TagClose "script")
html :: Parser [Token]
html = tokens <|> pure []
where
tokens :: Parser [Token]
tokens = do
t <- token
case t of
(TagOpen name _ close)
| not close && name=="script"
-> (++) <$> tillScriptEnd t <*> html
_ -> (t:) <$> html
decode :: ByteString -> Either String [Token]
decode = parseOnly html
atLeast :: Int -> Parser ByteString -> Parser ByteString
atLeast 0 p = p
atLeast n p = cons <$> anyChar <*> atLeast (n1) p
cond :: a -> a -> Bool -> a
cond a1 a2 b = if b then a1 else a2
(||.) :: Applicative f => f Bool -> f Bool -> f Bool
(||.) = liftA2 (||)
in2 :: Eq a => (a,a) -> a -> Bool
in2 (a1,a2) a = a==a1 || a==a2
in3 :: Eq a => (a,a,a) -> a -> Bool
in3 (a1,a2,a3) a = a==a1 || a==a2 || a==a3
boolP :: Parser a -> Parser Bool
boolP p = p *> pure True <|> pure False
maybeP :: Parser a -> Parser (Maybe a)
maybeP p = Just <$> p <|> return Nothing