module HTMLTokenizer.Parsing
(
token,
)
where
import HTMLTokenizer.Prelude hiding (takeWhile)
import HTMLTokenizer.Data
import Data.Attoparsec.Text
import HTMLEntities.Parser (htmlEntity)
import qualified Text.Builder as A
import qualified HTMLTokenizer.MonadPlus as B
import qualified VectorBuilder.MonadPlus as C
import qualified Data.Text as D
token :: Parser Token
token =
labeled "HTML Token" $
DoctypeToken <$> doctype <|>
openingTag OpeningTagToken <|>
ClosingTagToken <$> closingTag <|>
TextToken <$> textBetweenTags <|>
CommentToken <$> comment <|>
fail "Invalid token"
doctype :: Parser Text
doctype =
labeled "Doctype" $ do
string "<!"
skipSpace
asciiCI "doctype"
space
skipSpace
contents <- takeWhile1 (/= '>')
char '>'
return contents
openingTag :: (Name -> Vector Attribute -> Bool -> openingTag) -> Parser openingTag
openingTag openingTag =
labeled "Opening tag" $ do
char '<'
skipSpace
tagName <- name
attributes <- C.many (space *> skipSpace *> attribute)
skipSpace
closed <- (char '/' $> True) <|> pure False
char '>'
return (openingTag tagName attributes closed)
closingTag :: Parser Name
closingTag =
labeled "Closing tag" $
string "</" *> skipSpace *> name <* skipSpace <* char '>'
textBetweenTags :: Parser Text
textBetweenTags =
labeled "Text between tags" $ do
prefixSpace <- (space *> skipSpace $> A.char ' ') <|> pure mempty
text <- loop prefixSpace mempty
if A.null text
then mzero
else return (A.run text)
where
loop !builder !unconsumedSpace =
mplus word end
where
word =
do
parsedWord <- word
space <- takeWhile isSpace
if D.null space
then return (builder <> A.text unconsumedSpace <> parsedWord)
else loop (builder <> A.text unconsumedSpace <> parsedWord) space
where
word =
B.concat1 (normalChunk <|> entity <|> ampersand)
where
normalChunk =
A.text <$> takeWhile1 (\ x -> not (isSpace x) && x /= '<' && x /= '&')
entity =
A.text <$> htmlEntity
ampersand =
A.char <$> char '&'
end =
if D.null unconsumedSpace
then return builder
else return (builder <> A.char ' ')
comment :: Parser Text
comment =
labeled "Comment" $
string "<!--" *> (A.run <$> loop mempty)
where
loop !builder =
do
textWithoutDashes <- A.text <$> takeWhile (/= '-')
mplus
(string "-->" $> builder <> textWithoutDashes)
(mplus
(char '-' *> loop (builder <> textWithoutDashes <> A.char '-'))
(return (builder <> textWithoutDashes)))
where
textWithoutDashes =
A.text <$> takeWhile1 (/= '-')
attribute :: Parser Attribute
attribute =
labeled "Attribute" $ do
attributeName <- name
mplus
(do
skipSpace
char '='
skipSpace
attributeValue <- msum (map quotedContent ['"', '\'', '`']) <|> unquotedContent
return (Attribute attributeName attributeValue))
(return (Attribute attributeName ""))
quotedContent :: Char -> Parser Text
quotedContent quotChar =
char quotChar *> (A.run <$> B.concat escapedContentChunk) <* char quotChar
where
escapedContentChunk =
normal <|> entity <|> escaped <|> failedEscaping
where
normal =
A.text <$> takeWhile1 (\ x -> x /= quotChar && x /= '&' && x /= '\\')
entity =
A.text <$> htmlEntity
escaped =
char '\\' *> (A.char <$> satisfy (\ x -> x == quotChar || x == '\\'))
failedEscaping =
A.char <$> char '\\'
unquotedContent :: Parser Text
unquotedContent =
isolatedTextInsideTag
name :: Parser Name
name =
labeled "Name" $ do
c1 <- isolatedTextInsideTag
(mplus
(do
skipSpace
char ':'
skipSpace
c2 <- isolatedTextInsideTag
return (PrefixedName c1 c2))
(return (UnprefixedName c1)))
isolatedTextInsideTag :: Parser Text
isolatedTextInsideTag =
A.run <$> B.concat1 (normal <|> entity <|> ampersand)
where
normal =
A.text <$> takeWhile1 predicate
where
predicate x =
x /= '>' && x /= '/' && not (isSpace x) && x /= '=' &&
x /= '&' && x /= '<' &&
x /= '"' && x /= '\'' && x /= '`'
entity =
A.text <$> htmlEntity
ampersand =
A.char <$> char '&'
shouldFail :: Parser a -> Parser ()
shouldFail p =
join ((p $> empty) <|> pure (pure ()))
skipSpaceLeaving1 :: Parser ()
skipSpaceLeaving1 =
mplus
(do
space
peekedChar <- peekChar'
if isSpace peekedChar
then skipSpaceLeaving1
else mzero)
(return ())
labeled :: String -> Parser a -> Parser a
labeled label parser =
parser <?> label