module HTMLTokenizer.Parser ( -- * Model Token(..), OpeningTag, Identifier, Attribute, -- * Parsers token, ) where import BasePrelude hiding (takeWhile) import Conversion import Conversion.Text import Conversion.CaseInsensitive import Data.Text (Text) import Data.Text.Lazy.Builder (Builder) import Data.CaseInsensitive (CI) import Data.Attoparsec.Text import qualified Data.Text -- | -- An HTML token. data Token = -- | -- An opening tag. Token_OpeningTag OpeningTag | -- | -- A closing tag. Token_ClosingTag Identifier | -- | -- A text between tags with HTML-entities decoded. Token_Text Text | -- | -- Contents of a comment. Token_Comment Text deriving (Show, Ord, Eq, Generic, Data, Typeable) -- | -- An opening tag name, attributes and whether it is closed. type OpeningTag = (Identifier, [Attribute], Bool) -- | -- A case-insensitive identifier. type Identifier = CI Text -- | -- A tag attribute identifier and a value. type Attribute = (Identifier, Maybe Text) -- | -- A token parser. token :: Parser Token token = Token_Comment <$> comment <|> Token_ClosingTag <$> closingTag <|> Token_OpeningTag <$> openingTag <|> Token_Text <$> text openingTag :: Parser OpeningTag openingTag = do char '<' skipSpace theIdentifier <- identifier attributes <- many $ space *> skipSpace *> attribute skipSpace closed <- convert <$> optional (char '/') char '>' return (theIdentifier, attributes, closed) attribute :: Parser Attribute attribute = do theIdentifier <- identifier value <- optional $ do skipSpace char '=' skipSpace msum (map quotedValue ['"', '\'', '`']) <|> entityQuotedValue <|> unquotedValue return (theIdentifier, value) where quotedValue q = char q *> takeWhile (/= q) <* char q unquotedValue = takeWhile1 $ flip all [not . isSpace, not . flip elem ['=', '<', '>', '/']] . (&) entityQuotedValue = fmap convert $ q *> manyTill' anyChar q where q = asciiCI """ identifier :: Parser Identifier identifier = fmap convert $ takeWhile1 (flip any [isAlphaNum, flip elem ['_', '-', '!', '?']] . flip ($)) comment :: Parser Text comment = (convert :: Builder -> Text) <$> (string "")) (liftA2 mappend (fmap convert (char '-')) (content)))) closingTag :: Parser Identifier closingTag = string " skipSpace *> identifier <* skipSpace <* char '>' text :: Parser Text text = fmap ((convert :: Builder -> Text) . mconcat) $ many1 $ convert <$> nonTagChar where nonTagChar = shouldFail comment *> shouldFail closingTag *> shouldFail openingTag *> anyChar shouldFail :: Parser a -> Parser () shouldFail p = join $ (p $> empty) <|> pure (pure ())