{- | Selecting subtrees from TagSoup 'TagTree's. -} module Text.HTML.TagSoup.Tree.Selection ( selectGroup, select, selectAll, canMatch, matchesWith ) where import Control.Applicative import Control.Category ((>>>)) import Data.Maybe import Text.CSS3.Selectors.Syntax import Text.HTML.TagSoup import Text.HTML.TagSoup.Tree import Text.HTML.TagSoup.Tree.Util import Text.HTML.TagSoup.Tree.Zipper import Text.StringLike import Text.StringLike.Matchable {- | Traverses a tree, collecting all subtree positions for which at least one matches on the given selector group. If at least one subtree matches, a list of the same length as the number of selectors in the given selector group is added to the result list. In order, a 'Nothing' indicates no match and 'Just' a match for the corresponding selector. -} selectGroup :: Matchable str => SelectorGroup -> TagTree str -> [[Maybe (TagTreePos str)]] selectGroup (SelectorGroup sel sels) = map (\(pos, bs) -> map (\b -> if b then Just pos else Nothing) bs) . filter (or . snd) . selectAll (sel : sels) -- | Traverses a tree, collecting all subtree positions which match on the given selector. select :: Matchable str => Selector -> TagTree str -> [TagTreePos str] select sel = map fst . filter (head . snd) . selectAll [sel] {- | Traverses a tree, collecting all traversed positions in order. For each position all selectors are tried to be matched with the node at the current position and the information about whether a match is possible is added to the result list. -} selectAll :: Matchable str => [Selector] -> TagTree str -> [(TagTreePos str, [Bool])] selectAll sels = traverseTree (\pos -> [(pos, map (`canMatch` pos) sels)]) . fromTagTree -- | Checks if a selector can be matched with the node at the given position. canMatch :: Matchable str => Selector -> TagTreePos str -> Bool canMatch sel = (&&) <$> simpleSelSeqPred s <*> matches ss (map combPosList cs) where (s : ss, cs) = chain sel combPosList :: Combinator -> TagTreePos str -> [TagTreePos str] combPosList c = case c of Descendant -> iteratePos parent Child -> take 1 . iteratePos parent AdjacentSibling -> take 1 . iteratePos prevSibling GeneralSibling -> iteratePos prevSibling matches :: Matchable str => [SimpleSelectorSequence] -> [TagTreePos str -> [TagTreePos str]] -> TagTreePos str -> Bool matches ss cfs = foldr (\(s, cf) rest -> any ((&&) <$> simpleSelSeqPred s <*> rest) . cf) (const True) (zip ss cfs) chain :: Selector -> ([SimpleSelectorSequence], [Combinator]) chain sel = let (seqs, combs) = chain' sel in (reverse seqs, reverse combs) where chain' (Selector seq mbComb) = case mbComb of Nothing -> (seq : [], []) Just (comb, seq') -> let (seqs, combs) = chain' seq' in (seq : seqs, comb : combs) simpleSelSeqPred :: Matchable str => SimpleSelectorSequence -> TagTreePos str -> Bool simpleSelSeqPred (SimpleSelectorSequence head tails) = foldr1 (liftA2 (&&)) $ hFilter head : map tFilter tails where hFilter :: Matchable str => HeadSimpleSelector -> TagTreePos str -> Bool hFilter sel = case sel of TypeSelector n -> content >>> hasTagBranchName (fromString n) UniversalSelector -> content >>> isTagBranch tFilter :: Matchable str => TailSimpleSelector -> TagTreePos str -> Bool tFilter sel = case sel of AttributeSelector a Nothing -> content >>> hasTagBranchAttr (fromString a) AttributeSelector a (Just (op, v)) -> content >>> findTagBranchAttr (fromString a) >>> maybe False (matchesWith op $ fromString v) ClassSelector v -> content >>> findTagBranchAttr (fromString "class") >>> maybe False (matchesWith IncludesMatch $ fromString v) IDSelector v -> content >>> findTagBranchAttr (fromString "id") >>> maybe False (matchesWith ExactMatch $ fromString v) PseudoClass p -> pseudo p pseudo :: Matchable str => PseudoClass -> TagTreePos str -> Bool pseudo p = case p of Root -> parents >>> null -- pattern: a*n+b; counting starts at one; a==0 => match b-th; n always >= 0; only a*n+b >= 0 count NthChild p -> error "simpleSelSeqPred: nth-child not implemented yet" NthLastChild p -> error "simpleSelSeqPred: nth-last-child not implemented yet" NthOfType p -> error "simpleSelSeqPred: nth-of-type not implemented yet" NthLastOfType p -> error "simpleSelSeqPred: nth-last-of-type not implemented yet" FirstChild -> (&&) <$> (parents >>> not . null) <*> (before >>> null) LastChild -> (&&) <$> (parents >>> not . null) <*> (after >>> null) FirstOfType -> \pos -> let name = tagBranchName $ content pos in (&&) <$> (parents >>> not . null) <*> (before >>> all (not . hasTagBranchName name)) $ pos LastOfType -> \pos -> let name = tagBranchName $ content pos in (&&) <$> (parents >>> not . null) <*> (after >>> all (not . hasTagBranchName name)) $ pos OnlyChild -> (&&) <$> pseudo FirstChild <*> pseudo LastChild OnlyOfType -> (&&) <$> pseudo FirstOfType <*> pseudo LastOfType Empty -> content >>> children >>> null Not (Right (PseudoClass (Not _))) -> error "simpleSelSeqPred: nested :not pseudo classes" Not (Left sel) -> not . hFilter sel Not (Right sel) -> not . tFilter sel -- | Checks if two string-like values match under the given attribute operator. matchesWith :: Matchable str => AttributeOperator -> str -> str -> Bool matchesWith op = case op of ExactMatch -> matchesExactly IncludesMatch -> matchesWordOf DashMatch -> \v s -> v `matchesExactly` s || (v `append` fromChar '-') `matchesPrefixOf` s PrefixMatch -> matchesPrefixOf SuffixMatch -> matchesSuffixOf InfixMatch -> matchesInfixOf