module Text.XML.HXT.Parser.TagSoup
( parseHtmlTagSoup
)
where
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Entity
( lookupNumericEntity
)
import Text.XML.HXT.DOM.Unicode
( isXmlSpaceChar
)
import Text.XML.HXT.DOM.NamespacePredicates
( isWellformedQualifiedName
)
import Text.XML.HXT.Parser.XmlEntities
(
)
import Text.XML.HXT.Parser.XhtmlEntities
( xhtmlEntities
)
import Text.XML.HXT.Parser.HtmlParsec
( isEmptyHtmlTag
, isInnerHtmlTagOf
, closesHtmlTag
)
import Text.XML.HXT.DOM.Interface
( XmlTrees
, QName
, NsEnv
, mkQName
, mkName
, mkPrefixLocalPart
, c_warn
, a_xml
, a_xmlns
, xmlNamespace
, xmlnsNamespace
)
import Text.XML.HXT.DOM.XmlNode
( isElem
, mkError
, mkCmt
, mkText
, mkElement
, mkAttr
)
import Data.Maybe
import qualified Data.Map as M
type Tags = [Tag]
type NameTable = M.Map QName QName
type Context = ([String], NsEnv)
data State = S !Tags !NameTable
newtype Parser a = P { parse :: State -> (a, State)}
instance Monad Parser where
return x = P $ \ ts -> (x, ts)
p >>= f = P $ \ ts -> let
(res, ts') = parse p ts
in
parse (f res) ts'
runParser :: Parser a -> Tags -> a
runParser p ts = fst . parse p $ S ts M.empty
cond :: Parser Bool -> Parser a -> Parser a -> Parser a
cond c t e = do
p <- c
if p then t else e
lookAhead :: (Tag -> Bool) -> Parser Bool
lookAhead p = P $ \ s@(S ts _) -> (not (null ts) && p (head ts), s)
isEof :: Parser Bool
isEof = P $ \ s@(S ts _) -> (null ts, s)
isText :: Parser Bool
isText = lookAhead is
where
is (TagText _) = True
is _ = False
isCmt :: Parser Bool
isCmt = lookAhead is
where
is (TagComment _) = True
is _ = False
isWarn :: Parser Bool
isWarn = lookAhead is
where
is (TagWarning _) = True
is _ = False
isPos :: Parser Bool
isPos = lookAhead is
where
is (TagPosition _ _) = True
is _ = False
isCls :: Parser Bool
isCls = lookAhead is
where
is (TagClose _) = True
is _ = False
isOpn :: Parser Bool
isOpn = lookAhead is
where
is (TagOpen _ _) = True
is _ = False
getTag :: Parser Tag
getTag = P $ \ (S (t1:ts1) nt) -> (t1, S ts1 nt)
getSym :: (Tag -> a) -> Parser a
getSym f = do
t <- getTag
return (f t)
getText :: Parser String
getText = getSym sym
where
sym (TagText t) = t
sym _ = undefined
getCmt :: Parser String
getCmt = getSym sym
where
sym (TagComment c) = c
sym _ = undefined
getWarn :: Parser String
getWarn = getSym sym
where
sym (TagWarning w) = w
sym _ = undefined
getPos :: Parser (Int, Int)
getPos = getSym sym
where
sym (TagPosition l c) = (l, c)
sym _ = undefined
getCls :: Parser String
getCls = getSym sym
where
sym (TagClose n) = n
sym _ = undefined
getOpn :: Parser (String, [(String,String)])
getOpn = getSym sym
where
sym (TagOpen n al) = (n, al)
sym _ = undefined
pushBack :: Tag -> Parser ()
pushBack t = P $ \ (S ts nt) -> ((), S (t:ts) nt)
insCls :: String -> Parser ()
insCls n = pushBack (TagClose n)
insOpn :: String -> [(String, String)] -> Parser ()
insOpn n al = pushBack (TagOpen n al)
insertQName :: QName -> Parser QName
insertQName n = P $ \ (S ts nt) -> let
(n', nt') = insert n nt
in
(n', S ts nt')
where
insert s nt
| isJust r = (fromJust r, nt)
| otherwise = (s, M.insert s s nt)
where
r = M.lookup s nt
mkQN :: Bool -> Bool -> NsEnv -> String -> Parser QName
mkQN withNamespaces isAttr env s
| withNamespaces
= insertQName qn1
| otherwise
= insertQName qn0
where
qn1
| isAttr && isSimpleName = mkName s
| isSimpleName = mkQName "" s (nsUri "")
| isWellformedQualifiedName s = mkQName px lp (nsUri px)
| otherwise = mkName s
qn0
| isSimpleName = mkName s
| isWellformedQualifiedName s = mkPrefixLocalPart px lp
| otherwise = mkName s
nsUri x = fromMaybe "" . lookup x $ env
isSimpleName = all (/= ':') s
(px, (_ : lp)) = span(/= ':') s
extendNsEnv :: Bool -> [(String, String)] -> NsEnv -> NsEnv
extendNsEnv withNamespaces al1 env
| withNamespaces
= concatMap (uncurry addNs) al1 ++ env
| otherwise
= env
where
addNs n v
| px == a_xmlns
&&
(null lp || (not . null . tail $ lp))
= [(drop 1 lp, v)]
| otherwise
= []
where
(px, lp) = span (/= ':') n
lookupEntity :: Bool -> Bool -> String -> [Tag]
lookupEntity withWarnings _asHtml e0@('#':e)
= case lookupNumericEntity e of
Just c -> [ TagText [c] ]
Nothing -> ( TagText $ "&" ++ e0 ++ ";") :
if withWarnings
then [TagWarning $ "illegal char reference: &" ++ e ++ ";"]
else []
lookupEntity withWarnings asHtml e
= case (lookup e entities) of
Just x -> [ TagText [toEnum x]]
Nothing -> (TagText $ "&" ++ e ++ ";") :
if withWarnings
then [TagWarning $ "Unknown entity reference: &" ++ e ++ ";"]
else []
where
entities
| asHtml = xhtmlEntities
| otherwise = xhtmlEntities
parseHtmlTagSoup :: Bool -> Bool -> Bool -> Bool -> Bool -> String -> String -> XmlTrees
parseHtmlTagSoup withNamespaces withWarnings withComment removeWhiteSpace asHtml doc
= ( docRootElem
. runParser (buildCont initContext)
. ( if asHtml
then canonicalizeTags
else id
)
. parseTagsOptions (parseOptions { optTagWarning = withWarnings
, optLookupEntity = lookupEntity withWarnings asHtml
}
)
)
where
docRootElem
= take 1 . filter isElem
initContext = ( []
, [ (a_xml, xmlNamespace)
, (a_xmlns, xmlnsNamespace)
]
)
wrap = (:[])
warn
| withWarnings = wrap . mkError c_warn . show . (doc ++) . (" " ++)
| otherwise = const []
cmt
| withComment = wrap . mkCmt
| otherwise = const []
txt
| removeWhiteSpace
= \ t ->
if all isXmlSpaceChar t
then []
else wrap . mkText $ t
| otherwise = wrap . mkText
isEmptyElem
| asHtml = isEmptyHtmlTag
| otherwise = const False
isInnerElem
| asHtml = isInnerHtmlTagOf
| otherwise = const (const False)
closesElem
| asHtml = \ ns n1 ->
not (null ns)
&&
n1 `closesHtmlTag` (head ns)
| otherwise = const (const False)
buildCont :: Context -> Parser XmlTrees
buildCont ns
= cond isText ( do
t <- getText
rl <- buildCont ns
return (txt t ++ rl)
)
( cond isOpn ( do
(n,al) <- getOpn
openTag ns n al
)
( cond isCls ( do
n <- getCls
closeTag ns n
)
( cond isCmt ( do
c <- getCmt
rl <- buildCont ns
return (cmt c ++ rl)
)
( cond isWarn ( do
w <- getWarn
rl <- buildCont ns
return (warn w ++ rl)
)
( cond isPos ( do
getPos
buildCont ns
)
( cond isEof ( do
isEof
closeAll ns
)
( return (warn "parse error in tagsoup tree construction")
)
)
)
)
)
)
)
where
closeTag :: Context -> String -> Parser XmlTrees
closeTag ((n':_), _) n1
| n' == n1 = return []
closeTag ns'@((n':_), _) n1
| n' `isInnerElem` n1
= do
insCls n1
insCls n'
buildCont ns'
closeTag ns' n1
| isEmptyElem n1 = buildCont ns'
closeTag ns'@((n':ns1'), _) n1
| n1 `elem` ns1' = do
insCls n1
insCls n'
rl <- buildCont ns'
return ( warn ("closing tag " ++ show n' ++
" expected, but " ++ show n1 ++ " found")
++ rl
)
closeTag ns' n1
= do
rl <- buildCont ns'
return ( warn ("no opening tag for closing tag " ++ show n1)
++ rl
)
openTag :: Context -> String -> [(String, String)] -> Parser XmlTrees
openTag cx'@(ns',env') n1 al1
| isPiDT n1 = buildCont cx'
| isEmptyElem n1
= do
qn <- mkElemQN nenv n1
al <- mkAttrs al1
rl <- buildCont cx'
return (mkElement qn al [] : rl)
| closesElem ns' n1 = do
insOpn n1 al1
insCls (head ns')
buildCont cx'
| otherwise = do
qn <- mkElemQN nenv n1
al <- mkAttrs al1
cs <- buildCont ((n1 : ns'), nenv)
rl <- buildCont cx'
return (mkElement qn al cs : rl)
where
nenv = extendNsEnv withNamespaces al1 env'
mkElemQN = mkQN withNamespaces False
mkAttrQN = mkQN withNamespaces True
isPiDT ('?':_) = True
isPiDT ('!':_) = True
isPiDT _ = False
mkAttrs = mapM (uncurry mkA)
mkA an av = do
qan <- mkAttrQN nenv an
return (mkAttr qan (wrap . mkText $ av))
closeAll :: ([String], NsEnv) -> Parser XmlTrees
closeAll (ns',_) = return (concatMap wrn ns')
where
wrn = warn . ("insert missing closing tag " ++) . show