module Text.HTML.TagSoup.Parser(
parseTags, parseTagsOptions,
Options(..), options
) where
import Text.HTML.TagSoup.Type
import Text.HTML.TagSoup.Entity
import Control.Monad.State
import Data.Char
import Data.List
import Data.Maybe
infix 9 ?->
(?->) :: Bool -> [x] -> [x]
(?->) b true = if b then true else []
data Options = Options
{optTagPosition :: Bool
,optTagWarning :: Bool
,optLookupEntity :: String -> [Tag]
,optMaxEntityLength :: Maybe Int
}
options :: Options
options = Options False False f (Just 10)
where
f x = case lookupEntity x of
Nothing -> [TagText $ "&" ++ x ++ ";", TagWarning $ "Unknown entity: &" ++ x ++ ";"]
Just x -> [TagText [x]]
parseTags :: String -> [Tag]
parseTags = parseTagsOptions options
tagWarn :: Options -> String -> [Tag]
tagWarn opts x = [TagWarning x | optTagWarning opts]
data Position = Position !Row !Column
updateOnString :: Position -> String -> Position
updateOnString = foldl' updateOnChar
updateOnChar :: Position -> Char -> Position
updateOnChar (Position r c) x = case x of
'\n' -> Position (r+1) c
'\t' -> Position r (c + 8 mod (c1) 8)
_ -> Position r (c+1)
tagPos :: Options -> Position -> [Tag]
tagPos opts (Position r c) = [TagPosition r c | optTagPosition opts]
tagPosWarn :: Options -> Position -> String -> [Tag]
tagPosWarn opts p x = optTagWarning opts ?-> (tagPos opts p ++ [TagWarning x])
tagPosWarnFix :: Options -> Position -> [Tag] -> [Tag]
tagPosWarnFix opts p = addPositions . remWarnings
where
remWarnings = if optTagWarning opts then id else filter (not . isTagWarning)
addPositions = concatMap (\x -> tagPos opts p ++ [x])
parseTagsOptions :: Options -> String -> [Tag]
parseTagsOptions opts x = mergeTexts $ evalState (parse opts) $ Value x (Position 0 0)
mergeTexts :: [Tag] -> [Tag]
mergeTexts (TagText x:xs) = (TagText $ concat $ x:texts) : warns ++ mergeTexts rest
where
(texts,warns,rest) = f xs
f (TagText x:xs) = (x:a,b,c)
where (a,b,c) = f xs
f (TagPosition _ _:TagText x:xs) = (x:a,b,c)
where (a,b,c) = f xs
f (p@TagPosition{}:TagWarning y:xs) = (a,p:TagWarning y:b,c)
where (a,b,c) = f xs
f (TagWarning x:xs) = (a,TagWarning x:b,c)
where (a,b,c) = f xs
f xs = ([],[],xs)
mergeTexts (x:xs) = x : mergeTexts xs
mergeTexts [] = []
data Value = Value String !Position
type Parser a = State Value a
isNameCharFirst x = isAlphaNum x || x `elem` "_:"
isNameChar x = isAlphaNum x || x `elem` "-_:."
consume :: Int -> Parser ()
consume n = do
Value s p <- get
let (a,b) = splitAt n s
put $ Value b (updateOnString p a)
breakOn :: String -> Parser (String,Bool)
breakOn end = do
Value s p <- get
if null s then
return ("",True)
else if end `isPrefixOf` s then
consume (length end) >> return ("",False)
else do
consume 1
~(a,b) <- breakOn end
return (head s:a,b)
breakName :: Parser String
breakName = do
Value s p <- get
if not (null s) && isNameCharFirst (head s) then do
let (a,b) = span isNameChar s
consume (length a)
return a
else
return ""
breakNumber :: Parser (Maybe Int)
breakNumber = do
Value s p <- get
if not (null s) && isDigit (head s) then do
let (a,b) = span isDigit s
consume (length a)
return $ Just $ read a
else
return Nothing
dropSpaces :: Parser ()
dropSpaces = do
Value s p <- get
let n = length $ takeWhile isSpace s
consume n
parse :: Options -> Parser [Tag]
parse opts = do
Value s p <- get
case s of
'<':'!':'-':'-':_ -> consume 4 >> comment opts p
'<':'/':_ -> consume 2 >> close opts p
'<':_ -> consume 1 >> open opts p
[] -> return []
'&':_ -> do
consume 1
s <- entity opts p
rest <- parse opts
return $ s ++ rest
s:ss -> do
consume 1
rest <- parse opts
return $ tagPos opts p ++ [TagText [s]] ++ rest
comment opts p1 = do
~(inner,bad) <- breakOn "-->"
rest <- parse opts
return $ tagPos opts p1 ++ [TagComment inner] ++
(bad ?-> tagPosWarn opts p1 "Unexpected end when looking for \"-->\"") ++
rest
close opts p1 = do
name <- breakName
dropSpaces
~(Value s p) <- get
rest <- f s
return $ tagPos opts p1 ++ [TagClose name] ++
(null name ?-> tagPosWarn opts p1 "Empty name in close tag") ++
rest
where
f ('>':s) = do
consume 1
rest <- parse opts
return rest
f _ = do
~(_,bad) <- breakOn ">"
rest <- parse opts
return $ tagPosWarn opts p1 "Junk in closing tag" ++
bad ?-> tagPosWarn opts p1 "Unexpected end when looking for \">\"" ++
rest
open opts p1 = do
Value s p <- get
prefix <- if take 1 s `elem` ["!","?"] then consume 1 >> return [head s] else return ""
name <- liftM (prefix++) breakName
if null name then do
rest <- parse opts
return $ tagPos opts p1 ++ [TagText ('<':prefix)] ++ tagPosWarn opts p1 "Expected name of tag" ++ rest
else do
~(atts,shut,warns) <- attribs opts p1
rest <- parse opts
return $ tagPos opts p1 ++ [TagOpen name atts] ++
shut ?-> (tagPos opts p1 ++ [TagClose name]) ++
warns ++ rest
attribs :: Options -> Position -> Parser ([Attribute],Bool,[Tag])
attribs opts p1 = do
dropSpaces
Value s p <- get
case s of
'/':'>':_ -> consume 2 >> return ([],True ,[])
'>':_ -> consume 1 >> return ([],False,[])
x:xs | x `elem` "'\"" -> do
~(val,warns1) <- value opts
~(atts,shut,warns2) <- attribs opts p1
return (("",val):atts,shut,warns1++warns2)
[] -> return ([],False,tagPosWarn opts p1 "Unexpected end when looking for \">\"")
_ -> attrib opts p1
attrib :: Options -> Position -> Parser ([Attribute],Bool,[Tag])
attrib opts p1 = do
name <- breakName
if null name then do
consume 1
~(atts,shut,warns) <- attribs opts p1
return (atts,shut,tagPosWarn opts p1 "Junk character in tag" ++ warns)
else do
~(Value s p) <- get
~(val,warns1) <- f s
~(atts,shut,warns2) <- attribs opts p1
return ((name,val):atts,shut,warns1++warns2)
where
f ('=':s) = consume 1 >> value opts
f xs | not $ junk xs = return ([], [])
| otherwise = do
~(Value s p) <- get
dropJunk
return ([], tagPosWarn opts p "Junk character in tag")
junk ('/':'>':_) = False
junk ('>':_) = False
junk (c:cs) | not $ isSpace c = True
junk _ = False
dropJunk = do
~(Value s p) <- get
when (junk s) $ consume 1 >> dropJunk
value :: Options -> Parser (String,[Tag])
value opts = do
Value s p <- get
case s of
'\"':_ -> consume 1 >> f p True "\""
'\'':_ -> consume 1 >> f p True "\'"
_ -> f p False " />"
where
f p1 quote end = do
Value s p <- get
case s of
'&':_ -> do
consume 1
~(cs1,warns1) <- entityString opts p
~(cs2,warns2) <- f p1 quote end
return (cs1++cs2,warns1++warns2)
c:_ | c `elem` end -> do
if quote then consume 1 else return ()
return ([],[])
c:_ -> do
consume 1
~(cs,warns) <- f p1 quote end
return (c:cs,warns)
[] -> return ([],tagPosWarn opts p1 "Unexpected end in attibute value")
entity :: Options -> Position -> Parser [Tag]
entity opts p1 = do
Value s p <- get
case s of
'#':'x':_ -> f "#x" isHexDigit
'#':_ -> f "#" isDigit
_ -> f "" isNameChar
where
f prefix match = do
consume (length prefix)
g match (reverse prefix) (fromMaybe maxBound (optMaxEntityLength opts))
g match buf bound | bound < 0 = return $
tagPos opts p1 ++ [TagText ('&':reverse buf)] ++
tagPosWarn opts p1 "Unexpected '&' not in an entity"
g match buf bound = do
Value s p <- get
case s of
';':_ -> do
consume 1
return $ tagPosWarnFix opts p1 $ optLookupEntity opts (reverse buf)
x:xs | match x -> consume 1 >> g match (x:buf) (bound1)
_ -> g match buf (1)
entityString :: Options -> Position -> Parser (String,[Tag])
entityString opts p = do
tags <- entity opts p
let warnings = tagPosWarnFix opts p $ filter isTagWarning tags
return (innerText tags, warnings)