{- |
This module uses HXT to transverse an HTML document using CSS selectors.

The most important function here is 'findBySelector', it takes a CSS query and
a string containing the HTML to look into,
and it returns a list of the HTML fragments that matched the given query.

Only a subset of the CSS spec is currently supported:

 * By tag name: /table td a/
 
 * By class names: /.container .content/

 * By Id: /#oneId/

 * By attribute: /[hasIt]/, /[exact=match]/, /[contains*=text]/, /[starts^=with]/, /[ends$=with]/
 
 * Union: /a, span, p/
 
 * Immediate children: /div > p/ 

 * Get jiggy with it: /div[data-attr=yeah] > .mon, .foo.bar div, #oneThing/

-}

module Yesod.Test.TransversingCSS (
  findBySelector,
  Html,
  Query,
  -- * For HXT hackers
  -- | These functions expose some low level details that you can blissfully ignore.
  parseQuery,
  runQuery,
  queryToArrow,
  Selector(..),
  SelectorGroup(..)

  )
where

import Text.XML.HXT.Core
import qualified Data.List as DL
import Text.ParserCombinators.Parsec
import Text.Parsec.Prim (Parsec)

type Html = String
type Query = String
 
-- | Perform a css 'Query' on 'Html'. Returns Either
--
-- * Left: Query parse error.
--
-- * Right: List of matching Html fragments.
findBySelector :: Html-> Query -> Either ParseError [Html]
findBySelector html query = fmap (runQuery html) (parseQuery query)

-- Run a compiled query on Html, returning a list of matching Html fragments.
runQuery :: Html -> [[SelectorGroup]] -> [Html]
runQuery html query =
  runLA (hread >>> (queryToArrow query) >>> xshow this) html

-- | Transform a compiled query into the HXT arrow that finally transverses the Html
queryToArrow :: ArrowXml a => [[SelectorGroup]] -> a XmlTree XmlTree
queryToArrow commaSeparated = 
  DL.foldl uniteCommaSeparated none commaSeparated
 where
  uniteCommaSeparated accum selectorGroups =
    accum <+> (DL.foldl sequenceSelectorGroups this selectorGroups)
  sequenceSelectorGroups accum (DirectChildren sels) =
    accum >>> getChildren >>> (DL.foldl applySelectors this $ sels)
  sequenceSelectorGroups accum (DeepChildren sels) =
    accum >>> getChildren >>> multi (DL.foldl applySelectors this $ sels)
  applySelectors accum selector = accum >>> (toArrow selector)
  toArrow selector = case selector of
    ById v -> hasAttrValue "id" (==v)
    ByClass v -> hasAttrValue "class" ((DL.elem v) . words)
    ByTagName v -> hasName v
    ByAttrExists n -> hasAttr n
    ByAttrEquals n v -> hasAttrValue n (==v)
    ByAttrContains n v -> hasAttrValue n (DL.isInfixOf v)
    ByAttrStarts n v -> hasAttrValue n (DL.isPrefixOf v)
    ByAttrEnds n v -> hasAttrValue n (DL.isSuffixOf v)

-- | Parses a query into an intermediate format which is easy to feed to HXT
--
-- * The top-level lists represent the top level comma separated queries.
--
-- * SelectorGroup is a group of qualifiers which are separated
--   with spaces or > like these three: /table.main.odd tr.even > td.big/
--
-- * A SelectorGroup as a list of Selector items, following the above example
--   the selectors in the group are: /table/, /.main/ and /.odd/ 
parseQuery :: String -> Either ParseError [[SelectorGroup]]
parseQuery = parse cssQuery "" 

data SelectorGroup 
  = DirectChildren [Selector]
  | DeepChildren [Selector]
  deriving Show

data Selector
  = ById String
  | ByClass String
  | ByTagName String
  | ByAttrExists String
  | ByAttrEquals String String
  | ByAttrContains String String
  | ByAttrStarts String String
  | ByAttrEnds String String
  deriving Show

-- Below this line is the Parsec parser for css queries.
cssQuery :: Parsec String u [[SelectorGroup]]
cssQuery = sepBy rules (char ',' >> (optional (char ' ')))

rules :: Parsec String u [SelectorGroup]
rules = many $ directChildren <|> deepChildren

directChildren :: Parsec String u SelectorGroup
directChildren = do
  _ <- char '>'
  _ <- char ' '
  sels <- selectors
  optional $ char ' '
  return $ DirectChildren sels

deepChildren :: Parsec String u SelectorGroup
deepChildren = do 
  sels <- selectors
  optional $ char ' '
  return $ DeepChildren sels
  
selectors :: Parsec String u [Selector]
selectors = many1 $ parseId
  <|> parseClass
  <|> parseTag
  <|> parseAttr

parseId :: Parsec String u Selector
parseId = do
  _ <- char '#'
  x <- many $ noneOf ",#.[ >"
  return $ ById x

parseClass :: Parsec String u Selector
parseClass = do
  _ <- char '.'
  x <- many $ noneOf ",#.[ >"
  return $ ByClass x

parseTag :: Parsec String u Selector
parseTag = do
  x <- many1 $ noneOf ",#.[ >"
  return $ ByTagName x

parseAttr :: Parsec String u Selector
parseAttr = do
  _ <- char '['
  name <- many $ noneOf ",#.=$^*]"
  (parseAttrExists name)
    <|> (parseAttrWith "=" ByAttrEquals name)
    <|> (parseAttrWith "*=" ByAttrContains name)
    <|> (parseAttrWith "^=" ByAttrStarts name)
    <|> (parseAttrWith "$=" ByAttrEnds name)

parseAttrExists :: String -> Parsec String u Selector
parseAttrExists attrname = do
  _ <- char ']'
  return $ ByAttrExists attrname

parseAttrWith :: String -> (String -> String -> Selector) -> String -> Parsec String u Selector
parseAttrWith sign constructor name = do
  _ <- string sign
  value <- many $ noneOf ",#.]"
  _ <- char ']'
  return $ constructor name value