module Text.HTML.KURE
(
parseHTML,
element,
text,
attr,
zero,
htmlT, htmlC,
elementT, elementC,
textT, textC,
attrsT, attrsC,
attrT, attrC,
getAttr,
isTag,
getTag,
getAttrs,
getInner,
anyElementHTML,
unconcatHTML,
HTML,
Element,
Text,
Attrs,
Attr,
Syntax,
Context(..),
Node,
Html(..),
injectT',
projectT',
extractT',
promoteT',
extractR',
promoteR'
)where
import Text.XML.HXT.Parser.HtmlParsec
import Text.XML.HXT.DOM.ShowXml
import Text.XML.HXT.DOM.TypeDefs
import Text.XML.HXT.DOM.XmlNode
import Data.Tree.NTree.TypeDefs
import Text.XML.HXT.Parser.XmlParsec hiding (element)
import Text.XML.HXT.Parser.XhtmlEntities
import Control.Arrow
import Control.Applicative
import Data.Char
import Data.Monoid
import Data.Maybe
import Control.Monad
import qualified Language.KURE as KURE
import Language.KURE hiding ()
newtype HTML = HTML XmlTrees
newtype Element = Element XmlTree
newtype Text = Text XmlTrees
newtype Attrs = Attrs XmlTrees
newtype Attr = Attr XmlTree
newtype Syntax = Syntax XmlTree
newtype Context = Context [Element]
data Node
= HTMLNode HTML
| ElementNode Element
| TextNode Text
| AttrsNode Attrs
| AttrNode Attr
| SyntaxNode Syntax
deriving Show
instance Show HTML where
show (HTML html) = xshow html
instance Show Element where
show (Element html) = xshow [html]
instance Show Text where
show (Text html) = xshow html
instance Show Attrs where
show (Attrs html) = xshow html
instance Show Attr where
show (Attr html) = xshow [html]
instance Show Syntax where
show (Syntax syntax) = xshow [syntax]
instance Monoid HTML where
mempty = HTML []
mappend (HTML xs) (HTML ys) = HTML $ xs ++ ys
instance Monoid Context where
mempty = Context []
mappend (Context xs) (Context ys) = Context $ xs ++ ys
instance Injection HTML Node where
inject = HTMLNode
project u = do HTMLNode t <- return u
return t
instance Injection Element Node where
inject = ElementNode
project u = do ElementNode t <- return u
return t
instance Injection Text Node where
inject = TextNode
project u = do TextNode t <- return u
return t
instance Injection Attrs Node where
inject = AttrsNode
project u = do AttrsNode t <- return u
return t
instance Injection Attr Node where
inject = AttrNode
project u = do AttrNode t <- return u
return t
instance Injection Syntax Node where
inject = SyntaxNode
project u = do SyntaxNode t <- return u
return t
instance Walker Context Node where
allR :: forall m . MonadCatch m => Rewrite Context m Node -> Rewrite Context m Node
allR rr = prefixFailMsg "allR failed: " $
rewrite $ \ c -> \ case
HTMLNode o -> liftM HTMLNode $ KURE.apply (htmlT (extractR rr >>> arr html)
(extractR rr >>> arr html)
(extractR rr >>> arr html) $ htmlC) c o
ElementNode o -> liftM ElementNode $ KURE.apply (elementT (extractR rr) (extractR rr) $ elementC) c o
TextNode o -> liftM TextNode $ return o
AttrsNode o -> liftM AttrsNode $ KURE.apply (attrsT (extractR rr) $ attrsC) c o
AttrNode o -> liftM AttrNode $ return o
SyntaxNode o -> liftM SyntaxNode $ return o
class Html a where
html :: a -> HTML
instance Html Element where
html (Element b) = HTML [b]
instance Html Text where
html (Text b) = HTML b
instance Html Syntax where
html (Syntax b) = HTML [b]
htmlT :: (Monad m)
=> Translate Context m Element a
-> Translate Context m Text a
-> Translate Context m Syntax a
-> ([a] -> x)
-> Translate Context m HTML x
htmlT tr1 tr2 tr3 k = translate $ \ c (HTML ts) -> liftM k $ flip mapM ts $ \ case
t@(NTree (XTag {}) _) -> apply tr1 c (Element t)
t@(NTree (XText {}) _) -> apply tr2 c (Text [t])
t@(NTree (XCharRef n) _) -> apply tr2 c (Text [t])
t@(NTree (XPi {}) _) -> apply tr3 c (Syntax t)
t@(NTree (XDTD {}) _) -> apply tr3 c (Syntax t)
t@(NTree (XCmt {}) _) -> apply tr3 c (Syntax t)
t@(NTree (XError {}) _) -> apply tr3 c (Syntax t)
t -> error $ "not XTag or XText: " ++ take 100 (show t)
htmlC :: [HTML] -> HTML
htmlC = mconcat
elementT :: (Monad m)
=> Translate Context m Attrs a
-> Translate Context m HTML b
-> (String -> a -> b -> x)
-> Translate Context m Element x
elementT tr1 tr2 k = translate $ \ (Context cs) (Element t) ->
case t of
NTree (XTag tag attrs) rest
| namePrefix tag == ""
&& namespaceUri tag == "" -> do
let nm = localPart tag
let c = Context (Element t : cs)
attrs' <- apply tr1 c (Attrs attrs)
rest' <- apply tr2 c (HTML rest)
return $ k nm attrs' rest'
_ -> fail "elementT runtime type error"
elementC :: String -> Attrs -> HTML -> Element
elementC nm (Attrs attrs) (HTML rest) = Element (NTree (XTag (mkName nm) attrs) rest)
textT :: (Monad m)
=> (String -> x)
-> Translate Context m Text x
textT k = translate $ \ _ (Text txt) ->
return $ k $ unescapeText $ [ fn t | (NTree t _) <- txt ]
where
fn (XText xs) = Left xs
fn (XCharRef c) = Right c
fn _ = error "found non XText / XCharRef in Text"
textC :: String -> Text
textC "" = Text [ NTree (XText "") [] ]
textC str = Text [ NTree t [] | t <- map (either XText XCharRef) $ escapeText str ]
attrsT :: (Monad m)
=> Translate Context m Attr a
-> ([a] -> x)
-> Translate Context m Attrs x
attrsT tr k = translate $ \ c (Attrs ts) -> liftM k $ flip mapM ts $ \ case
t@(NTree (XAttr {}) _) -> apply tr c (Attr t)
_ -> fail "not XTag or XText"
attrsC :: [Attr] -> Attrs
attrsC xs = Attrs [ x | Attr x <- xs ]
attrT :: (Monad m)
=> (String -> String -> x)
-> Translate Context m Attr x
attrT k = translate $ \ c -> \ case
Attr (NTree (XAttr nm) ts)
| namePrefix nm == ""
&& namespaceUri nm == "" -> apply (textT $ k (localPart nm)) c (Text ts)
_ -> fail "textT runtime error"
attrC :: String -> String -> Attr
attrC nm val = Attr $ mkAttr (mkName nm) ts
where Text ts = textC val
element :: String -> [Attr] -> HTML -> HTML
element nm xs inner = HTML [t]
where Element t = elementC nm (attrsC xs) inner
text txt = HTML ts
where Text ts = textC txt
zero :: HTML
zero = text ""
attr :: String -> String -> Attr
attr = attrC
getAttr :: (MonadCatch m) => String -> Translate Context m Element String
getAttr nm = getAttrs >>> attrsT find catchesM >>> joinT
where
find :: (MonadCatch m) => Translate Context m Attr (m String)
find = attrT $ \ nm' val -> if nm' == nm
then return val
else fail $ "getAttr: not" ++ show nm
isTag :: (Monad m) => String -> Translate Context m Element ()
isTag nm = elementT idR idR (\ nm' _ _ -> nm == nm') >>> guardT
getTag :: (Monad m) => Translate Context m Element String
getTag = elementT idR idR (\ nm _ _ -> nm)
getAttrs :: (Monad m) => Translate Context m Element Attrs
getAttrs = elementT idR idR (\ _ as _ -> as)
getInner :: (Monad m) => Translate Context m Element HTML
getInner = elementT idR idR (\ _ _ h -> h)
injectT' :: (Monad m, Injection a g, g ~ Node) => Translate c m a g
injectT' = injectT
projectT' :: (Monad m, Injection a g, g ~ Node) => Translate c m g a
projectT' = projectT
extractT' :: (Monad m, Injection a g, g ~ Node) => Translate c m g b -> Translate c m a b
extractT' = extractT
promoteT' :: (Monad m, Injection a g, g ~ Node) => Translate c m a b -> Translate c m g b
promoteT' = promoteT
extractR' :: (Monad m, Injection a g, g ~ Node) => Rewrite c m g -> Rewrite c m a
extractR' = extractR
promoteR' :: (Monad m, Injection a g, g ~ Node) => Rewrite c m a -> Rewrite c m g
promoteR' = promoteR
unconcatHTML :: HTML -> [HTML]
unconcatHTML (HTML ts) = map (\ t -> HTML [t]) ts
anyElementHTML :: (MonadCatch m) => Translate Context m Element HTML -> Rewrite Context m HTML
anyElementHTML tr = arr unconcatHTML >>> unwrapAnyR (mapT (wrapAnyR $ extractT' $ oneT $ promoteT' tr)) >>> arr mconcat
parseHTML :: FilePath -> String -> HTML
parseHTML fileName input = HTML $ parseHtmlDocument fileName input
escapeText :: String -> [Either String Int]
escapeText = foldr join [] . map f
where f n | n == '<' = Right (ord n)
| n == '"' = Right (ord n)
| n == '&' = Right (ord n)
| n == '\n' = Left n
| n == '\t' = Left n
| n == '\r' = Left n
| n > '~' = Right (ord n)
| n < ' ' = Right (ord n)
| otherwise = Left n
join (Left x) (Left xs :rest) = Left (x : xs) : rest
join (Left x) rest = Left [x] : rest
join (Right x) rest = Right x : rest
unescapeText :: [Either String Int] -> String
unescapeText = concatMap (either id ((: []) . chr))