{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Select (
    SelectContext (..)
,   TagSpec
,   TagInfo (..)

,   select
,   tagsToSpec
) where

import Text.HTML.Scalpel.Internal.Select.Types

import Control.Applicative ((<$>), (<|>))
import Data.Maybe (catMaybes, isJust, fromJust, fromMaybe)

import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import qualified Data.Tree as Tree
import qualified Data.Vector as Vector
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup


type Index = Int
type Name = Maybe T.Text
type CloseOffset = Maybe Index

-- | The span of a tag in terms of the index of the opening tag and the index of
-- the closing tag. If there is no closing tag the closing tag is equal to the
-- opening tag.
data Span = Span !Int !Int

-- | A representation of the hierarchal structure of a document. Nodes of the
-- tree are spans which mark the start and end of a tag. The tree is organized
-- such that tags that appear earlier in the parsed string appear earlier in the
-- list of nodes, and that a given node is completely within the span of its
-- parent.
type TagForest = Tree.Forest Span

-- | A tag and associated precomputed meta data that is accessed in tight inner
-- loops during scraping.
data TagInfo str = TagInfo {
                   infoTag    :: !(TagSoup.Tag str)
                 , infoName   :: !Name
                 , infoIndex  :: !Index
                 , infoOffset :: !CloseOffset
                 }

-- | A vector of tags and precomputed meta data. A vector is used because it
-- allows for constant time slicing and sharing memory between the slices.
type TagVector str = Vector.Vector (TagInfo str)

-- | Ephemeral meta-data that each TagSpec is tagged with. This type contains
-- information that is not intrinsic in the sub-tree that corresponds to a given
-- TagSpec.
data SelectContext = SelectContext {
                     -- | `select` generates a list of `TagSpec`s that match a
                     -- selector. This indicates the index in the result list
                     -- that this TagSpec corresponds to.
                     ctxPosition :: !Index
                   }

-- | A structured representation of the parsed tags that provides fast element
-- look up via a vector of tags, and fast traversal via a rose tree of tags.
type TagSpec str = (TagVector str, TagForest, SelectContext)

-- | The 'select' function takes a 'Selectable' value and a list of
-- 'TagSoup.Tag's and returns a list of every subsequence of the given list of
-- Tags that matches the given selector.
select :: (Ord str, TagSoup.StringLike str)
       => Selector -> TagSpec str -> [TagSpec str]
select s tagSpec = newSpecs
    where
        (MkSelector nodes) = s
        newSpecs = zipWith applyPosition [0..] (selectNodes nodes tagSpec [])
        applyPosition p (tags, f, ctx) = (tags, f, SelectContext p)

-- | Creates a TagSpec from a list of tags parsed by TagSoup.
tagsToSpec :: forall str. (Ord str, TagSoup.StringLike str)
           => [TagSoup.Tag str] -> TagSpec str
tagsToSpec tags = (vector, tree, ctx)
    where
        vector = tagsToVector tags
        tree   = vectorToTree vector
        ctx    = SelectContext 0

-- | Annotate each tag with the offset to the corresponding closing tag. This
-- annotating is done in O(n * log(n)).
--
-- The algorithm works on a list of tags annotated with their index. It
-- maintains a map of unclosed open tags keyed by tag name.
--
--      (1) When an open tag is encountered it is pushed onto the list keyed by
--          its name.
--
--      (2) When a closing tag is encountered the corresponding opening tag is
--          popped, the offset between the two are computed, the opening tag is
--          annotated with the offset between the two, and both are added to the
--          result set.
--
--      (3) When any other tag is encountered it is added to the result set
--          immediately.
--
--      (4) After all tags are either in the result set or the state, all
--          unclosed tags from the state are added to the result set without a
--          closing offset.
--
--      (5) The result set is then sorted by their indices.
tagsToVector :: forall str. (Ord str, TagSoup.StringLike str)
             => [TagSoup.Tag str] -> TagVector str
tagsToVector tags = let indexed  = zip tags [0..]
                        total    = length indexed
                        unsorted = go indexed Map.empty
                        emptyVec = Vector.replicate total undefined
                     in emptyVec Vector.// unsorted
    where
        go :: [(TagSoup.Tag str, Index)]
           -> Map.Map T.Text [(TagSoup.Tag str, Index)]
           -> [(Index, TagInfo str)]
        go [] state = map (\(t, i) -> (i, TagInfo t (maybeName t) i Nothing))
                                    $ concat
                                    $ Map.elems state
            where
                maybeName t | TagSoup.isTagOpen t  = Just $ getTagName t
                            | TagSoup.isTagClose t = Just $ getTagName t
                            | otherwise            = Nothing
        go (x@(tag, index) : xs) state
            | TagSoup.isTagClose tag =
                let maybeOpen = head <$> Map.lookup tagName state
                    state'    = Map.alter popTag tagName state
                    info      = TagInfo tag (Just tagName) index Nothing
                    res       = catMaybes [
                                  Just (index, info)
                              ,   calcOffset <$> maybeOpen
                              ]
                 in res ++ go xs state'
            | TagSoup.isTagOpen tag  = go xs (Map.alter appendTag tagName state)
            | otherwise              =
                let info = TagInfo tag Nothing index Nothing
                in (index, info) : go xs state
            where
                tagName = getTagName tag

                appendTag :: Maybe [(TagSoup.Tag str, Index)]
                          -> Maybe [(TagSoup.Tag str, Index)]
                appendTag m = (x :) <$> (m <|> Just [])

                calcOffset :: (TagSoup.Tag str, Int) -> (Index, TagInfo str)
                calcOffset (t, i) =
                    let offset = index - i
                        info   = TagInfo t (Just tagName) i (Just offset)
                     in offset `seq` (i, info)

                popTag :: Maybe [a] -> Maybe [a]
                popTag (Just (_ : y : xs)) = let s = y : xs in s `seq` Just s
                popTag _                   = Nothing

getTagName :: TagSoup.StringLike str => TagSoup.Tag str -> T.Text
getTagName (TagSoup.TagOpen name _) = TagSoup.castString name
getTagName (TagSoup.TagClose name)  = TagSoup.castString name
getTagName _                        = undefined

-- | Builds a forest describing the structure of the tags within a given vector.
-- The nodes of the forest are tag spans which mark the indices within the
-- vector of an open and close pair. The tree is organized such for any node n
-- the parent of node n is the smallest span that completely encapsulates the
-- span of node n.
vectorToTree :: TagSoup.StringLike str => TagVector str -> TagForest
vectorToTree tags = fixup $ forestWithin 0 (Vector.length tags)
    where
        forestWithin :: Int -> Int -> TagForest
        forestWithin !lo !hi
            | hi <= lo   = []
            | not isOpen = forestWithin (lo + 1) hi
            | otherwise  = Tree.Node (Span lo closeIndex) subForest
                         : forestWithin (closeIndex + 1) hi
            where
                info       = tags Vector.! lo
                isOpen     = TagSoup.isTagOpen $ infoTag info
                closeIndex = lo + fromMaybe 0 (infoOffset info)
                subForest  = forestWithin (lo + 1) closeIndex

        -- Lifts nodes whose closing tags lay outside their parent tags up to
        -- within a parent node that encompasses the node's entire span.
        fixup :: TagForest -> TagForest
        fixup [] = []
        fixup (Tree.Node (Span lo hi) subForest : siblings)
            = Tree.Node (Span lo hi) ok : bad
            where
                (ok, bad) = malformed (fixup siblings) $ fixup subForest

                malformed :: TagForest -- Forest to prepend bad trees on.
                          -> TagForest  -- Remaining trees to examine.
                          -> (TagForest, TagForest)
                malformed preBad [] = ([], preBad)
                malformed preBad (n@(Tree.Node (Span _ nHi) _) : ns)
                    | hi < nHi  = (ok, n : bad)
                    | otherwise = (n : ok, bad)
                    where (ok, bad) = malformed preBad ns

-- | Generates a list of 'TagSpec's that match the given list of 'SelectNode's.
-- This is is done in linear time with respect to the number of tags.
--
-- The algorithm is a simple DFS traversal of the tag forest. While traversing
-- the forest if the current SelectNode is satisfied by the current node in the
-- tree the SelectNode is popped and the current node's sub-forest is traversed
-- with the remaining SelectNodes. If there is only a single SelectNode then any
-- node encountered that satisfies the SelectNode is returned as an answer.
selectNodes :: TagSoup.StringLike str
            => [SelectNode] -> TagSpec str -> [TagSpec str] -> [TagSpec str]
selectNodes []  _          acc = acc
selectNodes [_] (_, [], _) acc = acc
-- Now that there is only a single SelectNode to satisfy, search the remaining
-- forests and generates a TagSpec for each node that satisfies the condition.
selectNodes [n] (tags, f : fs, ctx) acc
    | nodeMatches n info = (shrunkSpec :)
                         $ selectNodes [n] (tags, fs, ctx)
                         $ selectNodes [n] (tags, Tree.subForest f, ctx) acc
    | otherwise          = selectNodes [n] (tags, fs, ctx)
                         $ selectNodes [n] (tags, Tree.subForest f, ctx) acc
    where
        Span lo hi = Tree.rootLabel f
        shrunkSpec = (
                       Vector.slice lo (hi - lo + 1) tags
                     , [fmap recenter f]
                     , ctx
                     )
        recenter (Span nLo nHi) = Span (nLo - lo) (nHi - lo)
        info = tags Vector.! lo
-- There are multiple SelectNodes that need to be satisfied. If the current node
-- satisfies the condition, then the current nodes sub-forest is searched for
-- matches of the remaining SelectNodes.
selectNodes (_ : _) (_, [], _) acc = acc
selectNodes (n : ns) (tags, f : fs, ctx) acc
    | nodeMatches n info = selectNodes ns       (tags, Tree.subForest f, ctx)
                         $ selectNodes (n : ns) (tags, fs, ctx) acc
    | otherwise          = selectNodes (n : ns) (tags, Tree.subForest f, ctx)
                         $ selectNodes (n : ns) (tags, fs, ctx) acc
    where
        Span lo _ = Tree.rootLabel f
        info = tags Vector.! lo

-- | Returns True if a tag satisfies a given SelectNode's condition.
nodeMatches :: TagSoup.StringLike str => SelectNode -> TagInfo str -> Bool
nodeMatches (SelectNode node preds) info = checkTag node preds info
nodeMatches (SelectAny preds)       info = checkPreds preds (infoTag info)

-- | Given a tag name and a list of attribute predicates return a function that
-- returns true if a given tag matches the supplied name and predicates.
checkTag :: TagSoup.StringLike str
         => T.Text -> [AttributePredicate] -> TagInfo str -> Bool
checkTag name preds (TagInfo tag tagName _ _)
      =  TagSoup.isTagOpen tag
      && isJust tagName
      && name == fromJust tagName
      && checkPreds preds tag

-- | Returns True if a tag satisfies a list of attribute predicates.
checkPreds :: TagSoup.StringLike str
           => [AttributePredicate] -> TagSoup.Tag str -> Bool
checkPreds preds tag
    =  TagSoup.isTagOpen tag
    && all (flip checkPred attrs) preds
    where (TagSoup.TagOpen _ attrs) = tag