{- |
A simple 'Query' format to query the 'HTMLTree'.
The Syntax is as follows :
@
"nodeName[Class(optional)]{ID(optional)} > nodeName[Class(optional)]{ID(optional)}"
@
eg : @"div{id1} > span[class][id_h1] > a"@
-}
module HScraper.Query (
parseQuery,
(~=~),
(|>>),
(>=>),
getText,
getEntireText,
getAttribute
) where
import qualified Data.Text as T
import Data.Monoid
import HScraper.Types
import HScraper.QueryParser
(===) :: Eq a => Maybe a -> Maybe a -> Bool
Just x === Just y = x == y
_ === Nothing = True
Nothing === Just _ = False
-- | Compares 'NodeQuery' with a 'NodeType'.
(~=~) :: NodeQuery -> NodeType -> Bool
NodeQuery{} ~=~ Text _ = False
NodeQuery name cls idd ~=~ Element nm xs = (name == nm)
&& (lookup (T.pack "class") xs === cls)
&& (lookup (T.pack "id") xs === idd)
-- | Returns the list of nodes matching the query
-- with root matching the first NodeQuery, and subsequent
-- Children satisfying subsequent 'NodeQueries' continously.
(>=>) :: HTMLTree -> Query -> [HTMLTree]
NullTree >=> _ = []
nt >=> [] = [nt]
nt@(NTree a _) >=> [q]
| q ~=~ a = [nt]
| otherwise = []
NTree a xs >=> (q:qs)
| q ~=~ a = foldl g [] xs
| otherwise = []
where g acc l = acc `mappend` (l >=> qs)
-- | Applies '>=>' considering each node as root and
-- combines the result.
(|>>) :: HTMLTree -> Query -> [HTMLTree]
NullTree |>> _ = []
nt@(NTree _ xs) |>> q = foldl (\x y -> (y |>> q) `mappend` x) (nt >=> q) xs
-- | Get Combined Text of immediate children of current node.
getText :: HTMLTree -> T.Text
getText NullTree = T.empty
getText nt@(NTree _ xs ) = foldl f (g nt) xs
where
f acc x = acc `T.append` g x
g (NTree (Text x) _) = x
g _ = T.empty
-- | Get Entire text contained in the subtree.
getEntireText :: HTMLTree -> T.Text
getEntireText NullTree = T.empty
getEntireText (NTree (Text x) _) = x
getEntireText (NTree (Element _ _) xs) = foldl fn T.empty xs
where
fn acc x = acc `T.append` gn x
gn NullTree = T.empty
gn (NTree (Text x) _) = x
gn ntm@(NTree (Element _ _) _) = getEntireText ntm
-- | Get the value of an attribute of a node.
getAttribute :: String -> HTMLTree -> Maybe String
getAttribute str (NTree (Element _ xs ) _) = fmap T.unpack (lookup (T.pack str) xs)
getAttribute _ _ = Nothing