module Text.HTML.KURE
(
parseHTML,
element,
text,
attr,
zero,
htmlT, htmlC,
elementT, elementC,
textT, textC,
attrsT, attrsC,
attrT, attrC,
getAttr,
isTag,
getTag,
getAttrs,
getInner,
anyElementHTML,
unconcatHTML,
HTML,
Element,
Text,
Attrs,
Attr,
Syntax,
Context(..),
Node,
Html(..),
injectT',
projectT',
extractT',
promoteT',
extractR',
promoteR'
)where
import Text.XML.HXT.Parser.HtmlParsec
import Text.XML.HXT.DOM.ShowXml
import Text.XML.HXT.DOM.TypeDefs
import Text.XML.HXT.DOM.XmlNode
import Control.Arrow
import Data.Char
import Data.Monoid
import Control.Monad
import qualified Language.KURE as KURE
import Language.KURE hiding ()
newtype HTML = HTML XmlTrees
newtype Element = Element XmlTree
newtype Text = Text XmlTrees
newtype Attrs = Attrs XmlTrees
newtype Attr = Attr XmlTree
newtype Syntax = Syntax XmlTree
newtype Context = Context [Element]
data Node
= HTMLNode HTML
| ElementNode Element
| TextNode Text
| AttrsNode Attrs
| AttrNode Attr
| SyntaxNode Syntax
deriving Show
instance Show HTML where
show (HTML html) = xshow html
instance Show Element where
show (Element html) = xshow [html]
instance Show Text where
show (Text html) = xshow html
instance Show Attrs where
show (Attrs html) = xshow html
instance Show Attr where
show (Attr html) = xshow [html]
instance Show Syntax where
show (Syntax syntax) = xshow [syntax]
instance Monoid HTML where
mempty = HTML []
mappend (HTML xs) (HTML ys) = HTML $ xs ++ ys
instance Monoid Context where
mempty = Context []
mappend (Context xs) (Context ys) = Context $ xs ++ ys
instance Injection HTML Node where
inject = HTMLNode
project u = do HTMLNode t <- return u
return t
instance Injection Element Node where
inject = ElementNode
project u = do ElementNode t <- return u
return t
instance Injection Text Node where
inject = TextNode
project u = do TextNode t <- return u
return t
instance Injection Attrs Node where
inject = AttrsNode
project u = do AttrsNode t <- return u
return t
instance Injection Attr Node where
inject = AttrNode
project u = do AttrNode t <- return u
return t
instance Injection Syntax Node where
inject = SyntaxNode
project u = do SyntaxNode t <- return u
return t
instance Walker Context Node where
allR :: forall m . MonadCatch m => Rewrite Context m Node -> Rewrite Context m Node
allR rr = prefixFailMsg "allR failed: " $
rewrite $ \ c -> \ case
HTMLNode o -> liftM HTMLNode $ KURE.applyT (htmlT (extractR rr >>> arr html)
(extractR rr >>> arr html)
(extractR rr >>> arr html) $ htmlC) c o
ElementNode o -> liftM ElementNode $ KURE.applyT (elementT (extractR rr) (extractR rr) $ elementC) c o
TextNode o -> liftM TextNode $ return o
AttrsNode o -> liftM AttrsNode $ KURE.applyT (attrsT (extractR rr) $ attrsC) c o
AttrNode o -> liftM AttrNode $ return o
SyntaxNode o -> liftM SyntaxNode $ return o
class Html a where
html :: a -> HTML
instance Html Element where
html (Element b) = HTML [b]
instance Html Text where
html (Text b) = HTML b
instance Html Syntax where
html (Syntax b) = HTML [b]
htmlT :: (Monad m)
=> Transform Context m Element a
-> Transform Context m Text a
-> Transform Context m Syntax a
-> ([a] -> x)
-> Transform Context m HTML x
htmlT tr1 tr2 tr3 k = transform $ \ c (HTML ts) -> liftM k $ flip mapM ts $ \ case
t@(NTree (XTag {}) _) -> applyT tr1 c (Element t)
t@(NTree (XText {}) _) -> applyT tr2 c (Text [t])
t@(NTree (XCharRef {}) _) -> applyT tr2 c (Text [t])
t@(NTree (XPi {}) _) -> applyT tr3 c (Syntax t)
t@(NTree (XDTD {}) _) -> applyT tr3 c (Syntax t)
t@(NTree (XCmt {}) _) -> applyT tr3 c (Syntax t)
t@(NTree (XError {}) _) -> applyT tr3 c (Syntax t)
t -> error $ "not XTag or XText: " ++ take 100 (show t)
htmlC :: [HTML] -> HTML
htmlC = mconcat
elementT :: (Monad m)
=> Transform Context m Attrs a
-> Transform Context m HTML b
-> (String -> a -> b -> x)
-> Transform Context m Element x
elementT tr1 tr2 k = transform $ \ (Context cs) (Element t) ->
case t of
NTree (XTag tag attrs) rest
| namePrefix tag == ""
&& namespaceUri tag == "" -> do
let nm = localPart tag
let c = Context (Element t : cs)
attrs' <- applyT tr1 c (Attrs attrs)
rest' <- applyT tr2 c (HTML rest)
return $ k nm attrs' rest'
_ -> fail "elementT runtime type error"
elementC :: String -> Attrs -> HTML -> Element
elementC nm (Attrs attrs) (HTML rest) = Element (NTree (XTag (mkName nm) attrs) rest)
textT :: (Monad m)
=> (String -> x)
-> Transform Context m Text x
textT k = transform $ \ _ (Text txt) ->
return $ k $ unescapeText $ [ fn t | (NTree t _) <- txt ]
where
fn (XText xs) = Left xs
fn (XCharRef c) = Right c
fn _ = error "found non XText / XCharRef in Text"
textC :: String -> Text
textC "" = Text [ NTree (XText "") [] ]
textC str = Text [ NTree t [] | t <- map (either XText XCharRef) $ escapeText str ]
attrsT :: (Monad m)
=> Transform Context m Attr a
-> ([a] -> x)
-> Transform Context m Attrs x
attrsT tr k = transform $ \ c (Attrs ts) -> liftM k $ flip mapM ts $ \ case
t@(NTree (XAttr {}) _) -> applyT tr c (Attr t)
_ -> fail "not XTag or XText"
attrsC :: [Attr] -> Attrs
attrsC xs = Attrs [ x | Attr x <- xs ]
attrT :: (Monad m)
=> (String -> String -> x)
-> Transform Context m Attr x
attrT k = transform $ \ c -> \ case
Attr (NTree (XAttr nm) ts)
| namePrefix nm == ""
&& namespaceUri nm == "" -> applyT (textT $ k (localPart nm)) c (Text ts)
_ -> fail "textT runtime error"
attrC :: String -> String -> Attr
attrC nm val = Attr $ mkAttr (mkName nm) ts
where Text ts = textC val
element :: String -> [Attr] -> HTML -> HTML
element nm xs inner = HTML [t]
where Element t = elementC nm (attrsC xs) inner
text :: String -> HTML
text txt = HTML ts
where Text ts = textC txt
zero :: HTML
zero = text ""
attr :: String -> String -> Attr
attr = attrC
getAttr :: (MonadCatch m) => String -> Transform Context m Element String
getAttr nm = getAttrs >>> attrsT find catchesM >>> joinT
where
find :: (MonadCatch m) => Transform Context m Attr (m String)
find = attrT $ \ nm' val -> if nm' == nm
then return val
else fail $ "getAttr: not" ++ show nm
isTag :: (Monad m) => String -> Transform Context m Element ()
isTag nm = elementT idR idR (\ nm' _ _ -> nm == nm') >>> guardT
getTag :: (Monad m) => Transform Context m Element String
getTag = elementT idR idR (\ nm _ _ -> nm)
getAttrs :: (Monad m) => Transform Context m Element Attrs
getAttrs = elementT idR idR (\ _ as _ -> as)
getInner :: (Monad m) => Transform Context m Element HTML
getInner = elementT idR idR (\ _ _ h -> h)
injectT' :: (Monad m, Injection a Node) => Transform c m a Node
injectT' = injectT
projectT' :: (Monad m, Injection a Node) => Transform c m Node a
projectT' = projectT
extractT' :: (Monad m, Injection a Node) => Transform c m Node b -> Transform c m a b
extractT' = extractT
promoteT' :: (Monad m, Injection a Node) => Transform c m a b -> Transform c m Node b
promoteT' = promoteT
extractR' :: (Monad m, Injection a Node) => Rewrite c m Node -> Rewrite c m a
extractR' = extractR
promoteR' :: (Monad m, Injection a Node) => Rewrite c m a -> Rewrite c m Node
promoteR' = promoteR
unconcatHTML :: HTML -> [HTML]
unconcatHTML (HTML ts) = map (\ t -> HTML [t]) ts
anyElementHTML :: (MonadCatch m) => Transform Context m Element HTML -> Rewrite Context m HTML
anyElementHTML tr = arr unconcatHTML >>> unwrapAnyR (mapT (wrapAnyR $ extractT' $ oneT $ promoteT' tr)) >>> arr mconcat
parseHTML :: FilePath -> String -> HTML
parseHTML fileName input = HTML $ parseHtmlDocument fileName input
escapeText :: String -> [Either String Int]
escapeText = foldr join [] . map f
where f n | n == '<' = Right (ord n)
| n == '"' = Right (ord n)
| n == '&' = Right (ord n)
| n == '\n' = Left n
| n == '\t' = Left n
| n == '\r' = Left n
| n > '~' = Right (ord n)
| n < ' ' = Right (ord n)
| otherwise = Left n
join (Left x) (Left xs :rest) = Left (x : xs) : rest
join (Left x) rest = Left [x] : rest
join (Right x) rest = Right x : rest
unescapeText :: [Either String Int] -> String
unescapeText = concatMap (either id ((: []) . chr))