{- |
This module uses HXT to transverse an HTML document using CSS selectors.
The most important function here is 'findBySelector', it takes a CSS query and
a string containing the HTML to look into,
and it returns a list of the HTML fragments that matched the given query.
Only a subset of the CSS spec is currently supported:
* By tag name: /table td a/
* By class names: /.container .content/
* By Id: /#oneId/
* By attribute: /[hasIt]/, /[exact=match]/, /[contains*=text]/, /[starts^=with]/, /[ends$=with]/
* Union: /a, span, p/
* Immediate children: /div > p/
* Get jiggy with it: /div[data-attr=yeah] > .mon, .foo.bar div, #oneThing/
-}
module Yesod.Test.TransversingCSS (
findBySelector,
Html,
Query,
-- * For HXT hackers
-- | These functions expose some low level details that you can blissfully ignore.
parseQuery,
runQuery,
queryToArrow,
Selector(..),
SelectorGroup(..)
)
where
import Text.XML.HXT.Core
import qualified Data.List as DL
import Text.ParserCombinators.Parsec
import Text.Parsec.Prim (Parsec)
type Html = String
type Query = String
-- | Perform a css 'Query' on 'Html'. Returns Either
--
-- * Left: Query parse error.
--
-- * Right: List of matching Html fragments.
findBySelector :: Html-> Query -> Either ParseError [Html]
findBySelector html query = fmap (runQuery html) (parseQuery query)
-- Run a compiled query on Html, returning a list of matching Html fragments.
runQuery :: Html -> [[SelectorGroup]] -> [Html]
runQuery html query =
runLA (hread >>> (queryToArrow query) >>> xshow this) html
-- | Transform a compiled query into the HXT arrow that finally transverses the Html
queryToArrow :: ArrowXml a => [[SelectorGroup]] -> a XmlTree XmlTree
queryToArrow commaSeparated =
DL.foldl uniteCommaSeparated none commaSeparated
where
uniteCommaSeparated accum selectorGroups =
accum <+> (DL.foldl sequenceSelectorGroups this selectorGroups)
sequenceSelectorGroups accum (DirectChildren sels) =
accum >>> getChildren >>> (DL.foldl applySelectors this $ sels)
sequenceSelectorGroups accum (DeepChildren sels) =
accum >>> getChildren >>> multi (DL.foldl applySelectors this $ sels)
applySelectors accum selector = accum >>> (toArrow selector)
toArrow selector = case selector of
ById v -> hasAttrValue "id" (==v)
ByClass v -> hasAttrValue "class" ((DL.elem v) . words)
ByTagName v -> hasName v
ByAttrExists n -> hasAttr n
ByAttrEquals n v -> hasAttrValue n (==v)
ByAttrContains n v -> hasAttrValue n (DL.isInfixOf v)
ByAttrStarts n v -> hasAttrValue n (DL.isPrefixOf v)
ByAttrEnds n v -> hasAttrValue n (DL.isSuffixOf v)
-- | Parses a query into an intermediate format which is easy to feed to HXT
--
-- * The top-level lists represent the top level comma separated queries.
--
-- * SelectorGroup is a group of qualifiers which are separated
-- with spaces or > like these three: /table.main.odd tr.even > td.big/
--
-- * A SelectorGroup as a list of Selector items, following the above example
-- the selectors in the group are: /table/, /.main/ and /.odd/
parseQuery :: String -> Either ParseError [[SelectorGroup]]
parseQuery = parse cssQuery ""
data SelectorGroup
= DirectChildren [Selector]
| DeepChildren [Selector]
deriving Show
data Selector
= ById String
| ByClass String
| ByTagName String
| ByAttrExists String
| ByAttrEquals String String
| ByAttrContains String String
| ByAttrStarts String String
| ByAttrEnds String String
deriving Show
-- Below this line is the Parsec parser for css queries.
cssQuery :: Parsec String u [[SelectorGroup]]
cssQuery = sepBy rules (char ',' >> (optional (char ' ')))
rules :: Parsec String u [SelectorGroup]
rules = many $ directChildren <|> deepChildren
directChildren :: Parsec String u SelectorGroup
directChildren = do
_ <- char '>'
_ <- char ' '
sels <- selectors
optional $ char ' '
return $ DirectChildren sels
deepChildren :: Parsec String u SelectorGroup
deepChildren = do
sels <- selectors
optional $ char ' '
return $ DeepChildren sels
selectors :: Parsec String u [Selector]
selectors = many1 $ parseId
<|> parseClass
<|> parseTag
<|> parseAttr
parseId :: Parsec String u Selector
parseId = do
_ <- char '#'
x <- many $ noneOf ",#.[ >"
return $ ById x
parseClass :: Parsec String u Selector
parseClass = do
_ <- char '.'
x <- many $ noneOf ",#.[ >"
return $ ByClass x
parseTag :: Parsec String u Selector
parseTag = do
x <- many1 $ noneOf ",#.[ >"
return $ ByTagName x
parseAttr :: Parsec String u Selector
parseAttr = do
_ <- char '['
name <- many $ noneOf ",#.=$^*]"
(parseAttrExists name)
<|> (parseAttrWith "=" ByAttrEquals name)
<|> (parseAttrWith "*=" ByAttrContains name)
<|> (parseAttrWith "^=" ByAttrStarts name)
<|> (parseAttrWith "$=" ByAttrEnds name)
parseAttrExists :: String -> Parsec String u Selector
parseAttrExists attrname = do
_ <- char ']'
return $ ByAttrExists attrname
parseAttrWith :: String -> (String -> String -> Selector) -> String -> Parsec String u Selector
parseAttrWith sign constructor name = do
_ <- string sign
value <- many $ noneOf ",#.]"
_ <- char ']'
return $ constructor name value