module Text.XML.Scraping (
GetInner (..)
, GetAttribute (..)
, remove
, removeDepth
, removeTags
, removeQueries
, rmElem
, nodeHaving
, removeQuery
) where
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Text.XML as X
import Text.XML.Cursor
import Data.String
import qualified Text.HTML.DOM as H
import qualified Data.Map as M
import Data.List
import Data.Maybe
import System.Environment (getArgs)
import qualified Text.Blaze.Html as Bl
import Text.Blaze.Html.Renderer.Text
import Data.Text.Lazy (fromStrict,toStrict, unpack)
import Text.XML.Selector
import Text.XML.Selector.Types
class GetInner elem where
innerHtml :: elem -> TL.Text
innerText :: elem -> TL.Text
toHtml :: elem -> TL.Text
instance GetInner Node where
innerHtml = TL.concat . map toHtml . child . fromNode
innerText (NodeElement (Element _ _ cs)) = (TL.concat . map innerText) cs
innerText (NodeContent txt) = fromStrict txt
innerText _ = ""
toHtml = renderHtml . Bl.toHtml
instance GetInner Cursor where
innerHtml = TL.concat . map (toHtml . node) . child
innerText = innerText . node
toHtml = toHtml . node
instance (GetInner a) => GetInner [a] where
innerHtml = TL.concat . map innerHtml
innerText = TL.concat . map innerText
toHtml = TL.concat . map toHtml
class GetAttribute elem where
ename :: elem -> Maybe Text
eid :: elem -> Maybe Text
eclass :: elem -> [Text]
getMeta :: Text -> elem -> [Text]
instance GetAttribute Node where
ename (NodeElement (Element n _ _)) = Just $ nameLocalName n
ename _ = Nothing
eid (NodeElement (Element _ as _)) = M.lookup "id" as
eid _ = Nothing
eclass (NodeElement (Element _ as _)) = maybe [] T.words $ M.lookup "class" as
eclass _ = []
getMeta n = getMeta n . fromNode
instance GetAttribute Cursor where
ename = ename . node
eid = eid . node
eclass = eclass . node
getMeta n cursor = concat $ cursor $// element "meta" &| attributeIs "name" n &.// attribute "content"
remove :: (Node->Bool)->Node->Node
remove f (NodeElement (Element a b cs)) = NodeElement (Element a b (map (remove f) (filter (not . f) cs)))
remove _ n = n
removeDepth :: (Node->Bool)->Int->Node->Node
removeDepth _ (1) n = n
removeDepth f d (NodeElement (Element a b cs)) = NodeElement (Element a b (map (removeDepth f (d1)) (filter (not . f) cs)))
removeDepth _ _ n = n
removeTags :: [String] -> [Node] -> [Node]
removeTags ts ns = map (remove (\n -> ename n `elem` map (Just . T.pack) ts)) ns
removeQueries :: [String] -> [Node] -> [Node]
removeQueries [q] ns = map (remove (queryMatchNode q)) ns
removeQueries qs ns = map (remove f) ns
where
f :: Node -> Bool
f n = any (flip queryMatchNode n) qs
removeQuery :: String -> [Node] -> [Node]
removeQuery q ns = removeQueries [q] ns
nodeHaving :: (Node->Bool)->Node->Bool
nodeHaving f n@(NodeElement (Element _ _ cs)) = f n || any (nodeHaving f) cs
nodeHaving _ _ = False
rmElem :: String -> String -> [String] -> [Node] -> [Node]
rmElem tag id kl ns = map (remove f) ns
where
f :: Node -> Bool
f (NodeElement e) = selectorMatch (JQSelector Descendant (g tag) (g id) kl []) e
f _ = False
g :: String -> Maybe String
g "" = Nothing
g s = Just s