{- | A simple 'Query' format to query the 'HTMLTree'. The Syntax is as follows : @ "nodeName[Class(optional)]{ID(optional)} > nodeName[Class(optional)]{ID(optional)}" @ eg : @"div{id1} > span[class][id_h1] > a"@ -} module HScraper.Query ( parseQuery, (~=~), (|>>), (>=>), getText, getEntireText, getAttribute ) where import qualified Data.Text as T import Data.Monoid import HScraper.Types import HScraper.QueryParser (===) :: Eq a => Maybe a -> Maybe a -> Bool Just x === Just y = x == y _ === Nothing = True Nothing === Just _ = False -- | Compares 'NodeQuery' with a 'NodeType'. (~=~) :: NodeQuery -> NodeType -> Bool NodeQuery{} ~=~ Text _ = False NodeQuery name cls idd ~=~ Element nm xs = (name == nm) && (lookup (T.pack "class") xs === cls) && (lookup (T.pack "id") xs === idd) -- | Returns the list of nodes matching the query -- with root matching the first NodeQuery, and subsequent -- Children satisfying subsequent 'NodeQueries' continously. (>=>) :: HTMLTree -> Query -> [HTMLTree] NullTree >=> _ = [] nt >=> [] = [nt] nt@(NTree a _) >=> [q] | q ~=~ a = [nt] | otherwise = [] NTree a xs >=> (q:qs) | q ~=~ a = foldl g [] xs | otherwise = [] where g acc l = acc `mappend` (l >=> qs) -- | Applies '>=>' considering each node as root and -- combines the result. (|>>) :: HTMLTree -> Query -> [HTMLTree] NullTree |>> _ = [] nt@(NTree _ xs) |>> q = foldl (\x y -> (y |>> q) `mappend` x) (nt >=> q) xs -- | Get Combined Text of immediate children of current node. getText :: HTMLTree -> T.Text getText NullTree = T.empty getText nt@(NTree _ xs ) = foldl f (g nt) xs where f acc x = acc `T.append` g x g (NTree (Text x) _) = x g _ = T.empty -- | Get Entire text contained in the subtree. getEntireText :: HTMLTree -> T.Text getEntireText NullTree = T.empty getEntireText (NTree (Text x) _) = x getEntireText (NTree (Element _ _) xs) = foldl fn T.empty xs where fn acc x = acc `T.append` gn x gn NullTree = T.empty gn (NTree (Text x) _) = x gn ntm@(NTree (Element _ _) _) = getEntireText ntm -- | Get the value of an attribute of a node. getAttribute :: String -> HTMLTree -> Maybe String getAttribute str (NTree (Element _ xs ) _) = fmap T.unpack (lookup (T.pack str) xs) getAttribute _ _ = Nothing