module Text.HTML.TagSoup.HT.Parser (
CharType,
runSoup, runSoupWithPositions, runSoupWithPositionsName,
runTag, runInnerOfTag,
) where
import Text.HTML.TagSoup.HT.Parser.Combinator
(allowFail, withDefault,
char, dropSpaces, getPos,
many, many1, many1Satisfy, readUntil,
satisfy, string,
emit, mfix, )
import qualified Text.HTML.TagSoup.HT.Parser.Combinator as Parser
import qualified Text.HTML.TagSoup.HT.Tag as Tag
import qualified Text.HTML.TagSoup.HT.PositionTag as PosTag
import qualified Text.HTML.TagSoup.HT.Position as Position
import qualified Text.HTML.TagSoup.HT.HTMLChar as HTMLChar
import qualified Text.HTML.TagSoup.Entity as HTMLEntity
import Control.Monad (mplus, msum, when, liftM, )
import Data.Char (isAlphaNum, isAscii, isDigit, chr, )
import Data.Maybe (fromMaybe, )
runTag :: (CharType char, Show char) =>
String -> Tag.T char
runTag str =
let tags =
fromMaybe (error "runTag: no parse at all") $
Parser.write "string" parsePosTag str
makeError = error $
"runTag: parsing results in\n" ++
unlines (map show tags)
in case tags of
[(_,tag)] ->
if Tag.isWarning tag
then makeError
else tag
_ -> makeError
runInnerOfTag :: (CharType char, Show char) =>
String -> Tag.T char
runInnerOfTag str = runTag $ "<"++str++">"
runSoupWithPositionsName :: CharType char =>
FilePath -> String -> [PosTag.T char]
runSoupWithPositionsName fileName =
Parser.runIdentity .
Parser.write fileName (many parsePosTag >> return ())
runSoupWithPositions :: CharType char =>
String -> [PosTag.T char]
runSoupWithPositions =
Parser.runIdentity .
Parser.write "input" (many parsePosTag >> return ())
runSoup :: CharType char => String -> [Tag.T char]
runSoup = map snd . runSoupWithPositions
type Parser char a = Parser.Full (PosTag.T char) a
type ParserEmit char a = Parser.Emitting (PosTag.T char) a
parsePosTag :: CharType char => Parser char ()
parsePosTag = do
pos <- getPos
mplus
(do char '<'
allowFail $ withDefault
(msum $
parseSpecialTag pos :
parseProcessingTag pos :
parseCloseTag pos :
parseOpenTag pos :
[])
(do emitTag pos (Tag.Text [fromChar '<'])
emitWarning pos "A '<', that is not part of a tag. Encode it as < please."))
(parseText pos)
parseOpenTag :: CharType char => Position.T -> Parser char ()
parseOpenTag pos =
do name <- parseName
allowFail $
do dropSpaces
mfix
(\attrs ->
emit (pos, Tag.Open name attrs) >>
many parseAttribute)
withDefault
(do closePos <- getPos
string "/>"
allowFail $ emitTag closePos (Tag.Close name))
(do junkPos <- getPos
readUntilTerm
(\ junk ->
emitWarningWhen
(not $ null junk)
junkPos ("Junk in opening tag: \"" ++ junk ++ "\""))
("Unterminated open tag \"" ++ name ++ "\"") ">")
parseCloseTag :: Position.T -> Parser char ()
parseCloseTag pos =
do char '/'
name <- parseName
allowFail $
do emitTag pos (Tag.Close name)
dropSpaces
junkPos <- getPos
readUntilTerm
(\ junk ->
emitWarningWhen
(not $ null junk)
junkPos ("Junk in closing tag: \"" ++ junk ++"\""))
("Unterminated closing tag \"" ++ name ++"\"") ">"
parseSpecialTag :: Position.T -> Parser char ()
parseSpecialTag pos =
do char '!'
msum $
(do string "--"
allowFail $ readUntilTerm
(\ cmt -> emitTag pos (Tag.Comment cmt))
"Unterminated comment" "-->") :
(do string "[CDATA["
allowFail $ readUntilTerm
(\ cdata -> emitTag pos (Tag.Special "[CDATA[" cdata))
"Unterminated cdata" "]]>") :
(do name <- many1Satisfy isAlphaNum
allowFail $
do dropSpaces
readUntilTerm
(\ info -> emitTag pos (Tag.Special name info))
("Unterminated special tag \"" ++ name ++ "\"") ">") :
[]
parseProcessingTag :: CharType char => Position.T -> Parser char ()
parseProcessingTag pos =
do char '?'
name <- parseName
allowFail $
do dropSpaces
mfix
(\proc ->
emit (pos, Tag.Processing name proc) >>
if elem name ["xml", "xml-stylesheet"]
then
do attrs <- many parseAttribute
junkPos <- getPos
readUntilTerm
(\ junk ->
emitWarningWhen (not $ null junk) junkPos
("Junk in processing info tag: \"" ++ junk ++ "\""))
("Unterminated processing info tag \"" ++ name ++ "\"") "?>"
return $ Tag.KnownProcessing attrs
else readUntilTerm (return . Tag.UnknownProcessing)
"Unterminated processing instruction" "?>")
return ()
parseText :: CharType char => Position.T -> Parser char ()
parseText pos =
mfix
(\ text ->
allowFail (emitTag pos (Tag.Text text)) >>
parseString1 ('<'/=))
>> return ()
parseAttribute :: CharType char => Parser char (Tag.Attribute char)
parseAttribute =
parseName >>= \name -> allowFail $
do dropSpaces
value <-
withDefault
(string "=" >> allowFail (dropSpaces >> parseValue))
(return [])
dropSpaces
return (name, value)
parseName :: Parser char String
parseName =
many1Satisfy (\c -> isAlphaNum c && isAscii c || c `elem` "_-:")
parseValue :: CharType char => ParserEmit char [char]
parseValue =
(msum $
parseQuoted "Unterminated doubly quoted value string" '"' :
parseQuoted "Unterminated singly quoted value string" '\'' :
[])
`withDefault`
parseUnquotedValue
parseUnquotedValueChar :: ParserEmit Char String
parseUnquotedValueChar =
let parseValueChar =
do pos <- getPos
str <- parseChar (not . flip elem " >\"\'")
let wrong = filter (not . isValidValueChar) str
allowFail $
emitWarningWhen (not (null wrong)) pos $
"Illegal characters in unquoted value: " ++ wrong
return str
in liftM concat $ many parseValueChar
parseUnquotedValueHTMLChar :: ParserEmit HTMLChar.T [HTMLChar.T]
parseUnquotedValueHTMLChar =
let parseValueChar =
do pos <- getPos
hc <- parseHTMLChar (not . flip elem " >\"\'")
case hc of
HTMLChar.Char c ->
allowFail $
emitWarningWhen (not (isValidValueChar c)) pos $
"Illegal characters in unquoted value: '" ++ c : "'"
_ -> return ()
return hc
in many parseValueChar
isValidValueChar :: Char -> Bool
isValidValueChar c = isAlphaNum c || c `elem` "_-:."
parseQuoted :: CharType char => String -> Char -> Parser char [char]
parseQuoted termMsg quote =
char quote >>
(allowFail $
do str <- parseString (quote/=)
withDefault
(char quote >> return ())
(do termPos <- getPos
emitWarning termPos termMsg)
return str)
readUntilTerm ::
(String -> ParserEmit char a) -> String -> String -> ParserEmit char a
readUntilTerm generateTag termWarning termPat =
do ~(termFound,str) <- readUntil termPat
result <- generateTag str
termPos <- getPos
emitWarningWhen (not termFound) termPos termWarning
return result
class CharType char where
fromChar :: Char -> char
parseString :: (Char -> Bool) -> ParserEmit char [char]
parseString1 :: (Char -> Bool) -> Parser char [char]
parseUnquotedValue :: ParserEmit char [char]
instance CharType Char where
fromChar = id
parseString p = liftM concat $ many (parseChar p)
parseString1 p = liftM concat $ many1 (parseChar p)
parseUnquotedValue = parseUnquotedValueChar
instance CharType HTMLChar.T where
fromChar = HTMLChar.Char
parseString p = many (parseHTMLChar p)
parseString1 p = many1 (parseHTMLChar p)
parseUnquotedValue = parseUnquotedValueHTMLChar
parseChar :: (Char -> Bool) -> Parser char String
parseChar p =
do pos <- getPos
x <- parseHTMLChar p
let returnChar c = return $ c:[]
allowFail $
case x of
HTMLChar.Char c -> returnChar c
HTMLChar.NumericRef num -> returnChar (chr num)
HTMLChar.NamedRef name ->
maybe
(let refName = '&':name++";"
in emitWarning pos ("Unknown HTML entity " ++ refName) >>
return refName)
(returnChar . chr)
(lookup name HTMLEntity.htmlEntities)
parseHTMLChar :: (Char -> Bool) -> Parser char HTMLChar.T
parseHTMLChar p =
do pos <- getPos
c <- satisfy p
allowFail $
if c=='&'
then
withDefault
(do ent <-
mplus
(char '#' >>
liftM (HTMLChar.NumericRef . read) (many1Satisfy isDigit))
(liftM HTMLChar.NamedRef $ many1Satisfy isAlphaNum)
char ';'
return ent)
(emitWarning pos "Ill formed entity" >>
return (HTMLChar.Char '&'))
else return (HTMLChar.Char c)
emitWarningWhen :: Bool -> Position.T -> String -> ParserEmit char ()
emitWarningWhen cond pos msg =
when cond $ emitWarning pos msg
emitWarning :: Position.T -> String -> ParserEmit char ()
emitWarning pos msg = emitTag pos (Tag.Warning msg)
emitTag :: Position.T -> Tag.T char -> ParserEmit char ()
emitTag = curry emit