{-| This is a tag soup parser with a custom tag data structure. The parser works only on proper Unicode texts, that is, you must have decoded it before, e.g. using decoding functions from hxt or encoding package. -} module Text.HTML.TagSoup.HT.Parser ( CharType, runSoup, runSoupWithPositions, runSoupWithPositionsName, runTag, runInnerOfTag, ) where import Text.HTML.TagSoup.HT.Parser.Combinator (allowFail, withDefault, char, dropSpaces, getPos, many, many1, many1Satisfy, readUntil, satisfy, string, emit, mfix, ) import qualified Text.HTML.TagSoup.HT.Parser.Combinator as Parser import qualified Text.HTML.TagSoup.HT.Tag as Tag import qualified Text.HTML.TagSoup.HT.PositionTag as PosTag import qualified Text.HTML.TagSoup.HT.Position as Position import qualified Text.HTML.TagSoup.HT.HTMLChar as HTMLChar import qualified Text.HTML.TagSoup.Entity as HTMLEntity import Control.Monad (mplus, msum, when, liftM, ) -- import Control.Monad.Identity (Identity(..), ) import Data.Char (isAlphaNum, isAscii, isDigit, chr, ) import Data.Maybe (fromMaybe, ) -- * run parser in several ways {- | Parse a single tag, throws an error if there is a syntax error. This is useful for parsing a match pattern. -} runTag :: (CharType char, Show char) => String -> Tag.T char runTag str = let tags = fromMaybe (error "runTag: no parse at all") $ Parser.write "string" parsePosTag str makeError = error $ "runTag: parsing results in\n" ++ unlines (map show tags) in case tags of [(_,tag)] -> if Tag.isWarning tag then makeError else tag _ -> makeError {- | Parse the inner of a single tag. That is, @runTag \"\\"@ is the same as @runInnerOfTag \"\\"@. -} runInnerOfTag :: (CharType char, Show char) => String -> Tag.T char runInnerOfTag str = runTag $ "<"++str++">" runSoupWithPositionsName :: CharType char => FilePath -> String -> [PosTag.T char] runSoupWithPositionsName fileName = Parser.runIdentity . Parser.write fileName (many parsePosTag >> return ()) -- | Parse an HTML document to a list of 'Tag.T'. -- Automatically expands out escape characters. runSoupWithPositions :: CharType char => String -> [PosTag.T char] runSoupWithPositions = Parser.runIdentity . Parser.write "input" (many parsePosTag >> return ()) -- | Like 'runSoupWithPositions' but hides source file positions. runSoup :: CharType char => String -> [Tag.T char] runSoup = map snd . runSoupWithPositions -- * parser parts type Parser char a = Parser.Full (PosTag.T char) a type ParserEmit char a = Parser.Emitting (PosTag.T char) a parsePosTag :: CharType char => Parser char () parsePosTag = do pos <- getPos mplus (do char '<' allowFail $ withDefault (msum $ parseSpecialTag pos : parseProcessingTag pos : parseCloseTag pos : parseOpenTag pos : []) (do emitTag pos (Tag.Text [fromChar '<']) emitWarning pos "A '<', that is not part of a tag. Encode it as < please.")) (parseText pos) parseOpenTag :: CharType char => Position.T -> Parser char () parseOpenTag pos = do name <- parseName allowFail $ do dropSpaces mfix (\attrs -> emit (pos, Tag.Open name attrs) >> many parseAttribute) withDefault (do closePos <- getPos string "/>" allowFail $ emitTag closePos (Tag.Close name)) (do junkPos <- getPos readUntilTerm (\ junk -> emitWarningWhen (not $ null junk) junkPos ("Junk in opening tag: \"" ++ junk ++ "\"")) ("Unterminated open tag \"" ++ name ++ "\"") ">") parseCloseTag :: Position.T -> Parser char () parseCloseTag pos = do char '/' name <- parseName allowFail $ do emitTag pos (Tag.Close name) dropSpaces junkPos <- getPos readUntilTerm (\ junk -> emitWarningWhen (not $ null junk) junkPos ("Junk in closing tag: \"" ++ junk ++"\"")) ("Unterminated closing tag \"" ++ name ++"\"") ">" parseSpecialTag :: Position.T -> Parser char () parseSpecialTag pos = do char '!' msum $ (do string "--" allowFail $ readUntilTerm (\ cmt -> emitTag pos (Tag.Comment cmt)) "Unterminated comment" "-->") : (do string "[CDATA[" allowFail $ readUntilTerm (\ cdata -> emitTag pos (Tag.Special "[CDATA[" cdata)) "Unterminated cdata" "]]>") : (do name <- many1Satisfy isAlphaNum allowFail $ do dropSpaces readUntilTerm (\ info -> emitTag pos (Tag.Special name info)) ("Unterminated special tag \"" ++ name ++ "\"") ">") : [] parseProcessingTag :: CharType char => Position.T -> Parser char () parseProcessingTag pos = do char '?' name <- parseName allowFail $ do dropSpaces mfix (\proc -> emit (pos, Tag.Processing name proc) >> if elem name ["xml", "xml-stylesheet"] then do attrs <- many parseAttribute junkPos <- getPos readUntilTerm (\ junk -> emitWarningWhen (not $ null junk) junkPos ("Junk in processing info tag: \"" ++ junk ++ "\"")) ("Unterminated processing info tag \"" ++ name ++ "\"") "?>" return $ Tag.KnownProcessing attrs else readUntilTerm (return . Tag.UnknownProcessing) "Unterminated processing instruction" "?>") return () parseText :: CharType char => Position.T -> Parser char () parseText pos = mfix (\ text -> allowFail (emitTag pos (Tag.Text text)) >> parseString1 ('<'/=)) >> return () parseAttribute :: CharType char => Parser char (Tag.Attribute char) parseAttribute = parseName >>= \name -> allowFail $ do dropSpaces value <- withDefault (string "=" >> allowFail (dropSpaces >> parseValue)) (return []) dropSpaces return (name, value) parseName :: Parser char String parseName = -- we must restrict to ASCII alphanum characters in order to exclude umlauts many1Satisfy (\c -> isAlphaNum c && isAscii c || c `elem` "_-:") parseValue :: CharType char => ParserEmit char [char] parseValue = (msum $ parseQuoted "Unterminated doubly quoted value string" '"' : parseQuoted "Unterminated singly quoted value string" '\'' : []) `withDefault` parseUnquotedValue parseUnquotedValueChar :: ParserEmit Char String parseUnquotedValueChar = let parseValueChar = do pos <- getPos str <- parseChar (not . flip elem " >\"\'") let wrong = filter (not . isValidValueChar) str allowFail $ emitWarningWhen (not (null wrong)) pos $ "Illegal characters in unquoted value: " ++ wrong return str in liftM concat $ many parseValueChar parseUnquotedValueHTMLChar :: ParserEmit HTMLChar.T [HTMLChar.T] parseUnquotedValueHTMLChar = let parseValueChar = do pos <- getPos hc <- parseHTMLChar (not . flip elem " >\"\'") case hc of HTMLChar.Char c -> allowFail $ emitWarningWhen (not (isValidValueChar c)) pos $ "Illegal characters in unquoted value: '" ++ c : "'" _ -> return () return hc in many parseValueChar isValidValueChar :: Char -> Bool isValidValueChar c = isAlphaNum c || c `elem` "_-:." parseQuoted :: CharType char => String -> Char -> Parser char [char] parseQuoted termMsg quote = char quote >> (allowFail $ do str <- parseString (quote/=) withDefault (char quote >> return ()) (do termPos <- getPos emitWarning termPos termMsg) return str) {- Instead of using 'generateTag' we could also wrap the call to 'readUntilTerm' in 'mfix' in order to emit a tag, where some information is read later. -} readUntilTerm :: (String -> ParserEmit char a) -> String -> String -> ParserEmit char a readUntilTerm generateTag termWarning termPat = do ~(termFound,str) <- readUntil termPat result <- generateTag str termPos <- getPos emitWarningWhen (not termFound) termPos termWarning return result class CharType char where fromChar :: Char -> char parseString :: (Char -> Bool) -> ParserEmit char [char] parseString1 :: (Char -> Bool) -> Parser char [char] parseUnquotedValue :: ParserEmit char [char] instance CharType Char where fromChar = id parseString p = liftM concat $ many (parseChar p) parseString1 p = liftM concat $ many1 (parseChar p) parseUnquotedValue = parseUnquotedValueChar instance CharType HTMLChar.T where fromChar = HTMLChar.Char parseString p = many (parseHTMLChar p) parseString1 p = many1 (parseHTMLChar p) parseUnquotedValue = parseUnquotedValueHTMLChar parseChar :: (Char -> Bool) -> Parser char String parseChar p = do pos <- getPos x <- parseHTMLChar p let returnChar c = return $ c:[] allowFail $ case x of HTMLChar.Char c -> returnChar c HTMLChar.NumericRef num -> returnChar (chr num) HTMLChar.NamedRef name -> maybe (let refName = '&':name++";" in emitWarning pos ("Unknown HTML entity " ++ refName) >> return refName) (returnChar . chr) (lookup name HTMLEntity.htmlEntities) parseHTMLChar :: (Char -> Bool) -> Parser char HTMLChar.T parseHTMLChar p = do pos <- getPos c <- satisfy p allowFail $ if c=='&' then withDefault (do ent <- mplus (char '#' >> liftM (HTMLChar.NumericRef . read) (many1Satisfy isDigit)) (liftM HTMLChar.NamedRef $ many1Satisfy isAlphaNum) char ';' return ent) (emitWarning pos "Ill formed entity" >> return (HTMLChar.Char '&')) else return (HTMLChar.Char c) emitWarningWhen :: Bool -> Position.T -> String -> ParserEmit char () emitWarningWhen cond pos msg = when cond $ emitWarning pos msg emitWarning :: Position.T -> String -> ParserEmit char () emitWarning pos msg = emitTag pos (Tag.Warning msg) emitTag :: Position.T -> Tag.T char -> ParserEmit char () emitTag = curry emit