module Text.XML.HXT.Parser.TagSoup
( parseHtmlTagSoup
)
where
import Data.Char (toLower)
import Data.Maybe
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Entity ( lookupNumericEntity
)
import Text.XML.HXT.DOM.Unicode ( isXmlSpaceChar
)
import Text.XML.HXT.Parser.HtmlParsec ( isEmptyHtmlTag
, isInnerHtmlTagOf
, closesHtmlTag
)
import Text.XML.HXT.Parser.XhtmlEntities
import Text.XML.HXT.DOM.Interface ( XmlTrees
, QName
, NsEnv
, toNsEnv
, newXName
, nullXName
, mkQName'
, mkName
, isWellformedQualifiedName
, c_warn
, a_xml
, a_xmlns
, xmlNamespace
, xmlnsNamespace
)
import Text.XML.HXT.DOM.XmlNode ( isElem
, mkError
, mkCmt
, mkText
, mkElement
, mkAttr
)
type STag = Tag String
type Tags = [STag]
type Context = ([String], NsEnv)
type State = Tags
newtype Parser a = P { parse :: State -> (a, State)}
instance Monad Parser where
return x = P $ \ ts -> (x, ts)
p >>= f = P $ \ ts -> let
(res, ts') = parse p ts
in
parse (f res) ts'
runParser :: Parser a -> Tags -> a
runParser p ts = fst . parse p $ ts
cond :: Parser Bool -> Parser a -> Parser a -> Parser a
cond c t e = do
p <- c
if p then t else e
lookAhead :: (STag -> Bool) -> Parser Bool
lookAhead p = P $ \ s -> (not (null s) && p (head s), s)
isEof :: Parser Bool
isEof = P $ \ s -> (null s, s)
isText :: Parser Bool
isText = lookAhead is
where
is (TagText _) = True
is _ = False
isCmt :: Parser Bool
isCmt = lookAhead is
where
is (TagComment _) = True
is _ = False
isWarn :: Parser Bool
isWarn = lookAhead is
where
is (TagWarning _) = True
is _ = False
isPos :: Parser Bool
isPos = lookAhead is
where
is (TagPosition _ _) = True
is _ = False
isCls :: Parser Bool
isCls = lookAhead is
where
is (TagClose _) = True
is _ = False
isOpn :: Parser Bool
isOpn = lookAhead is
where
is (TagOpen _ _) = True
is _ = False
getTag :: Parser STag
getTag = P $ \ (t1:ts1) -> (t1, ts1)
getSym :: (STag -> a) -> Parser a
getSym f = do
t <- getTag
return (f t)
getText :: Parser String
getText = getSym sym
where
sym (TagText t) = t
sym _ = undefined
getCmt :: Parser String
getCmt = getSym sym
where
sym (TagComment c) = c
sym _ = undefined
getWarn :: Parser String
getWarn = getSym sym
where
sym (TagWarning w) = w
sym _ = undefined
getPos :: Parser (Int, Int)
getPos = getSym sym
where
sym (TagPosition l c) = (l, c)
sym _ = undefined
getCls :: Parser String
getCls = getSym sym
where
sym (TagClose n) = n
sym _ = undefined
getOpn :: Parser (String, [(String,String)])
getOpn = getSym sym
where
sym (TagOpen n al) = (n, al)
sym _ = undefined
pushBack :: STag -> Parser ()
pushBack t = P $ \ ts -> ((), t:ts)
insCls :: String -> Parser ()
insCls n = pushBack (TagClose n)
insOpn :: String -> [(String, String)] -> Parser ()
insOpn n al = pushBack (TagOpen n al)
mkQN :: Bool -> Bool -> NsEnv -> String -> Parser QName
mkQN withNamespaces isAttr env s
| withNamespaces
= return qn1
| otherwise
= return qn0
where
qn1
| isAttr && isSimpleName = s'
| isSimpleName = mkQName' nullXName (newXName s) (nsUri nullXName)
| isWellformedQualifiedName s = mkQName' px' lp' (nsUri px')
| otherwise = s'
qn0 = s'
nsUri x = fromMaybe nullXName . lookup x $ env
isSimpleName = all (/= ':') s
(px, (_ : lp)) = span(/= ':') s
px' = newXName px
lp' = newXName lp
s' = mkName s
extendNsEnv :: Bool -> [(String, String)] -> NsEnv -> NsEnv
extendNsEnv withNamespaces al1 env
| withNamespaces
= toNsEnv (concatMap (uncurry addNs) al1) ++ env
| otherwise
= env
where
addNs n v
| px == a_xmlns
&&
(null lp || (not . null . tail $ lp))
= [(drop 1 lp, v)]
| otherwise
= []
where
(px, lp) = span (/= ':') n
lookupEntity :: Bool -> Bool -> String -> Tags
lookupEntity withWarnings _asHtml e0@('#':e)
= case lookupNumericEntity e of
Just c -> [ TagText [c] ]
Nothing -> ( TagText $ "&" ++ e0 ++ ";") :
if withWarnings
then [TagWarning $ "illegal char reference: &" ++ e ++ ";"]
else []
lookupEntity withWarnings asHtml e
= case (lookup e entities) of
Just x -> [TagText [toEnum x]]
Nothing -> (TagText $ "&" ++ e ++ ";") :
if withWarnings
then [TagWarning $ "Unknown entity reference: &" ++ e ++ ";"]
else []
where
entities
| asHtml = xhtmlEntities
| otherwise = xhtmlEntities
lookupEntityAttr :: Bool -> Bool -> (String, Bool) -> (String, Tags)
lookupEntityAttr withWarnings asHtml (e, b)
| null r = (s, r)
| otherwise = ("&" ++ s ++ [';'|b], r)
where
(TagText s) : r = lookupEntity withWarnings asHtml e
lowerCaseNames :: Tags -> Tags
lowerCaseNames
= map f
where
f (TagOpen name attrs)
= TagOpen (nameToLower name) (map attrToLower attrs)
f (TagClose name)
= TagClose (nameToLower name)
f a = a
nameToLower = map toLower
attrToLower (an, av) = (nameToLower an, av)
parseHtmlTagSoup :: Bool -> Bool -> Bool -> Bool -> Bool -> String -> String -> XmlTrees
parseHtmlTagSoup withNamespaces withWarnings withComment removeWhiteSpace asHtml doc
= ( docRootElem
. runParser (buildCont initContext)
. ( if asHtml
then lowerCaseNames
else id
)
. tagsoupParse
)
where
tagsoupParse :: String -> Tags
tagsoupParse = parseTagsOptions tagsoupOptions
tagsoupOptions :: ParseOptions String
tagsoupOptions = parseOptions' { optTagWarning = withWarnings
, optEntityData = lookupEntity withWarnings asHtml
, optEntityAttrib = lookupEntityAttr withWarnings asHtml
}
where
parseOptions' :: ParseOptions String
parseOptions' = parseOptions
docRootElem
= take 1 . filter isElem
initContext = ( []
, toNsEnv $
[ (a_xml, xmlNamespace)
, (a_xmlns, xmlnsNamespace)
]
)
wrap = (:[])
warn
| withWarnings = wrap . mkError c_warn . show . (doc ++) . (" " ++)
| otherwise = const []
cmt
| withComment = wrap . mkCmt
| otherwise = const []
txt
| removeWhiteSpace
= \ t ->
if all isXmlSpaceChar t
then []
else wrap . mkText $ t
| otherwise = wrap . mkText
isEmptyElem
| asHtml = isEmptyHtmlTag
| otherwise = const False
isInnerElem
| asHtml = isInnerHtmlTagOf
| otherwise = const (const False)
closesElem
| asHtml = \ ns n1 ->
not (null ns)
&&
n1 `closesHtmlTag` (head ns)
| otherwise = const (const False)
buildCont :: Context -> Parser XmlTrees
buildCont ns
= cond isText ( do
t <- getText
rl <- buildCont ns
return (txt t ++ rl)
)
( cond isOpn ( do
(n,al) <- getOpn
openTag ns n al
)
( cond isCls ( do
n <- getCls
closeTag ns n
)
( cond isCmt ( do
c <- getCmt
rl <- buildCont ns
return (cmt c ++ rl)
)
( cond isWarn ( do
w <- getWarn
rl <- buildCont ns
return (warn w ++ rl)
)
( cond isPos ( do
_ <- getPos
buildCont ns
)
( cond isEof ( do
_ <- isEof
closeAll ns
)
( return (warn "parse error in tagsoup tree construction")
)
)
)
)
)
)
)
where
closeTag :: Context -> String -> Parser XmlTrees
closeTag ((n':_), _) n1
| n' == n1 = return []
closeTag ns'@((n':_), _) n1
| n' `isInnerElem` n1
= do
insCls n1
insCls n'
buildCont ns'
closeTag ns' n1
| isEmptyElem n1 = buildCont ns'
closeTag ns'@((n':ns1'), _) n1
| n1 `elem` ns1' = do
insCls n1
insCls n'
rl <- buildCont ns'
return ( warn ("closing tag " ++ show n' ++
" expected, but " ++ show n1 ++ " found")
++ rl
)
closeTag ns' n1
= do
rl <- buildCont ns'
return ( warn ("no opening tag for closing tag " ++ show n1)
++ rl
)
openTag :: Context -> String -> [(String, String)] -> Parser XmlTrees
openTag cx'@(ns',env') n1 al1
| isPiDT n1 = buildCont cx'
| isEmptyElem n1
= do
qn <- mkElemQN nenv n1
al <- mkAttrs al1
rl <- buildCont cx'
return (mkElement qn al [] : rl)
| closesElem ns' n1 = do
insOpn n1 al1
insCls (head ns')
buildCont cx'
| otherwise = do
qn <- mkElemQN nenv n1
al <- mkAttrs al1
cs <- buildCont ((n1 : ns'), nenv)
rl <- buildCont cx'
return (mkElement qn al cs : rl)
where
nenv = extendNsEnv withNamespaces al1 env'
mkElemQN = mkQN withNamespaces False
mkAttrQN = mkQN withNamespaces True
isPiDT ('?':_) = True
isPiDT ('!':_) = True
isPiDT _ = False
mkAttrs = mapM (uncurry mkA)
mkA an av = do
qan <- mkAttrQN nenv an
return (mkAttr qan (wrap . mkText $ av))
closeAll :: ([String], NsEnv) -> Parser XmlTrees
closeAll (ns',_) = return (concatMap wrn ns')
where
wrn = warn . ("insert missing closing tag " ++) . show