-- | The CorePattern module deconstructs the Pattern tree created by
-- ReadRegex.parseRegex and returns a simpler Q/P tree with
-- annotations at each Q node.  This will be converted by the TNFA
-- module into a QNFA finite automata.
--
-- Of particular note, this Pattern to Q/P conversion creates and
-- assigns all the internal Tags that will be used during the matching
-- process, and associates the captures groups with the tags that
-- represent their starting and ending locations and with their
-- immediate parent group.
--
-- Each Maximize and Minimize tag is held as either a preTag or a
-- postTag by one and only one location in the Q/P tree.  The Orbit
-- tags are each held by one and only one Star node.  Tags that stop a
-- Group are also held in perhaps numerous preReset lists.
--
-- The additional nullQ::nullView field of Q records the potentially
-- complex information about what tests and tags must be used if the
-- pattern unQ::P matches 0 zero characters.  There can be redundancy
-- in nullView, which is eliminated by cleanNullView.
--
-- Uses recursive do notation.
module Text.Regex.TDFA.CorePattern(Q(..),P(..),WhichTest(..),Wanted(..)
                                  ,TestInfo,OP(..),SetTestInfo(..),NullView
                                  ,patternToQ,cleanNullView,cannotAccept,mustAccept) where

import Control.Monad.RWS {- all -}
import Data.Array.IArray(Array,(!),accumArray,listArray)
import Data.List(sort)
import Data.IntMap.EnumMap(EnumMap)
import qualified Data.IntMap.EnumMap as Map(singleton,null,assocs,keysSet)
import Data.Maybe(isNothing)
import Data.IntSet.EnumSet(EnumSet)
import qualified Data.IntSet.EnumSet as Set(singleton,toList,isSubsetOf)
import Text.Regex.TDFA.Common {- all -}
import Text.Regex.TDFA.Pattern(Pattern(..),starTrans)
-- import Debug.Trace

{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}

err :: String -> a
err = common_error "Text.Regex.TDFA.CorePattern"

debug :: (Show a) => a -> b -> b
debug _ = id

-- Core Pattern Language
data P = Empty
       | Or [Q]
       | Seq Q Q
       | Star { getOrbit :: Maybe Tag  -- tag to prioritize the need to keep track of length of each pass though q
              , resetOrbits :: [Tag]   -- child star's orbits to reset (ResetOrbitTask)
              , firstNull :: Bool      -- Usually True meaning the first pass may match 0 characters
              , unStar :: Q}
       | Test TestInfo                 -- Require the test to be true
       | OneChar Pattern               -- Bring the Pattern element that accepts a character
       | NonEmpty Q                    -- Don't let the Q pattern match nothing
         deriving (Show,Eq)

-- The diagnostics about the pattern
data Q = Q {nullQ :: NullView         -- Ordered list of nullable views
           ,takes :: (Position,Maybe Position)  -- Range of number of accepted characters
           ,preReset :: [Tag]         -- Tags to "reset" (ResetGroupStopTask) (Only immediate children)
           ,preTag,postTag :: Maybe Tag -- Tags assigned around this pattern (TagTask)
           ,tagged :: Bool            -- Whether this node should be tagged -- patternToQ use only
           ,childGroups :: Bool       -- Whether unQ has any PGroups -- patternToQ use only
           ,wants :: Wanted           -- What kind of continuation is used by this pattern
           ,unQ :: P} deriving (Eq)

type TestInfo = (WhichTest,DoPa)

-- This is newtype'd to allow control over class instances
-- This is a set of WhichTest where each test has associated pattern location information
newtype SetTestInfo = SetTestInfo {getTests :: EnumMap WhichTest (EnumSet DoPa)} deriving (Eq)

instance Monoid SetTestInfo where
  mempty = SetTestInfo mempty
  SetTestInfo x `mappend` SetTestInfo y = SetTestInfo (x `mappend` y)

instance Show SetTestInfo where
  show (SetTestInfo sti) = "SetTestInfo "++show (mapSnd (Set.toList) $ Map.assocs sti)

-- There may be several distinct ways for a subtree to conditionally
-- (i.e. with a Test) or unconditionally accept 0 characters.  These
-- are in the list in order of preference, with most preferred listed
-- first.
type NullView = [(SetTestInfo,WinTags)]  -- Ordered list of null views, each is a set of tests and tags

-- During the depth first traversal, children are told about tags by the parent
data HandleTag = NoTag             -- No tag at this boundary
               | Advice Tag        -- tag at this boundary, applied at higher level in tree
               | Apply Tag         -- tag at this boundary, may be applied at this node or passed to one child
                 deriving (Show)

-- Nodes in the tree are labeled by the type kind of continuation they
-- prefer to be passed when processing.  This makes it possible to
-- create a smaller number of QNFA states and avoid creating wasteful
-- QNFA states that won't be reachable in the final automata.
data Wanted = WantsQNFA | WantsQT | WantsBoth | WantsEither deriving (Eq,Show)

instance Show Q where
  show = showQ

showQ :: Q -> String
showQ q = "Q { nullQ = "++show (nullQ q)++
        "\n  , takes = "++show (takes q)++
        "\n  , preReset = "++show (preReset q)++
        "\n  , preTag = "++show (preTag q)++
        "\n  , postTag = "++show (postTag q)++
        "\n  , tagged = "++show (tagged q)++
        "\n  , wants = "++show (wants q)++
        "\n  , unQ = "++ indent (unQ q)++" }"
   where indent = unlines . (\(h:t) -> h : (map (spaces ++) t)) . lines . show
         spaces = replicate 10 ' '

-- Smart constructors for NullView
notNull :: NullView
notNull = []

emptyNull :: WinTags -> NullView
emptyNull tags = (mempty, tags) : []

testNull :: TestInfo -> WinTags -> NullView
testNull (w,d) tags = (SetTestInfo (Map.singleton w (Set.singleton d)), tags) : []

-- The NullViews are ordered, and later test sets that contain the
-- tests from any earlier entry will never be chosen.  This function
-- returns a list with these redundant elements removed.  Note that
-- the first unconditional entry in the list will be the last entry of
-- the returned list since the empty set is a subset of any other set.
cleanNullView :: NullView -> NullView
cleanNullView [] = []
cleanNullView (first@(SetTestInfo sti,_):rest) | Map.null sti = first : []  -- optimization
                                               | otherwise =
  first : cleanNullView (filter (not . (setTI `Set.isSubsetOf`) . Map.keysSet . getTests . fst) rest)
  where setTI = Map.keysSet sti

-- Ordered Sequence of two NullViews: all ordered combinations of tests and tags.
-- Order of <- s1 and <- s2 is deliberately chosen to maintain preference priority
mergeNullViews :: NullView -> NullView -> NullView
mergeNullViews s1 s2 = cleanNullView $ do
  (test1,tag1) <- s1
  (test2,tag2) <- s2
  return (mappend test1 test2,mappend tag1 tag2)
-- mergeNullViews = cleanNullView $ liftM2 (mappend *** mappend)

-- Prepend tags to nullView
addTagsToNullView :: WinTags -> NullView -> NullView
addTagsToNullView [] nv = nv
addTagsToNullView tags nv= do
  (test,tags') <- nv
  return (test,tags `mappend` tags')

-- For PGroup, need to prepend reset tasks before others in nullView
addResetsToNullView :: [Tag]-> NullView -> NullView
addResetsToNullView resetTags nv = [ (test, prepend tags) | (test,tags) <- nv ]
  where prepend = foldr (\h t -> (h:).t) id . map (\tag->(tag,PreUpdate ResetGroupStopTask)) $ resetTags

-- Concatenated two ranges of number of accepted characters
seqTake :: (Int, Maybe Int) -> (Int, Maybe Int) -> (Int, Maybe Int)
seqTake (x1,y1) (x2,y2) = (x1+x2,liftM2 (+) y1 y2)

-- Parallel combination of list of ranges of number of accepted characters
orTakes :: [(Int, Maybe Int)] -> (Int,Maybe Int)
orTakes [] = (0,Just 0)
orTakes ts = let (xs,ys) = unzip ts
             in (minimum xs, foldl1 (liftM2 max) ys)

-- Invariant: apply (toAdvice _ ) == mempty
apply :: HandleTag -> Maybe Tag
apply (Apply tag) = Just tag
apply _ = Nothing
toAdvice :: HandleTag -> HandleTag
toAdvice (Apply tag) = Advice tag
toAdvice s = s
noTag :: HandleTag -> Bool
noTag NoTag = True
noTag _ = False
fromHandleTag :: HandleTag -> Tag
fromHandleTag (Apply tag) = tag
fromHandleTag (Advice tag) = tag
fromHandleTag _ = error "fromHandleTag"

-- Shorthand for combining a preTag and a postTag
winTags :: Maybe Tag -> Maybe Tag -> WinTags
winTags (Just a) (Just b) = [(a,PreUpdate TagTask),(b,PreUpdate TagTask)]
winTags (Just a) Nothing  = [(a,PreUpdate TagTask)]
winTags Nothing  (Just b) = [(b,PreUpdate TagTask)]
winTags Nothing  Nothing  = mempty

-- Predicates on the range of number of accepted  characters
varies :: Q -> Bool
varies Q {takes = (_,Nothing)} = True
varies Q {takes = (x,Just y)} = x/=y

mustAccept :: Q -> Bool
mustAccept q = (0/=) . fst . takes $ q

canAccept :: Q -> Bool
canAccept q = maybe True (0/=) $ snd . takes $ q

cannotAccept :: Q -> Bool
cannotAccept q = maybe False (0==) $ snd . takes $ q

-- This converts then input Pattern to an analyzed Q structure with
-- the tags assigned.
--
-- The analysis is filled in by a depth first search and the tags are
-- created top down and passed to children.  Thus information flows up
-- from the dfs of the children and simultaneously down in the form of
-- pre and post HandleTag data.  This bidirectional flow is handled
-- declaratively by using the MonadFix (i.e. mdo) instance of State.
-- 
-- Invariant: A tag should exist in Q in exactly one place.  This is
-- because PGroup needs to know the tags are around precisely the
-- expression that it wants to record.  If the same tag were in other
-- branches then this would no longer be true.
--
-- This invariant is enforced by each node either taking
-- responsibility (apply) for a passed in / created tag or sending it
-- to exactly one child node.  Other child nodes need to receive it
-- via toAdvice.
--
-- There is a final "qwin of Q {postTag=ISet.singleton 1}" and an
-- implied initial index tag of 0.
-- 
-- favoring pushing Apply into the child postTag makes PGroup happier

type PM = RWS (Maybe GroupIndex) [Either Tag GroupInfo] ([OP]->[OP],Tag) 
type HHQ = HandleTag  -- m1 : info about left boundaary / preTag
        -> HandleTag  -- m2 : info about right boundary / postTag
        -> PM Q

-- There is no group 0 here, since it is always the whole match and has no parent of its own
makeGroupArray :: GroupIndex -> [GroupInfo] -> Array GroupIndex [GroupInfo]
makeGroupArray maxGroupIndex groups = accumArray (\earlier later -> later:earlier) [] (1,maxGroupIndex) filler
    where filler = map (\gi -> (thisIndex gi,gi)) groups

fromRight :: [Either Tag GroupInfo] -> [GroupInfo]
fromRight [] = []
fromRight ((Right x):xs) = x:fromRight xs
fromRight ((Left _):xs) = fromRight xs

partitionEither :: [Either Tag GroupInfo] -> ([Tag],[GroupInfo])
partitionEither = helper id id where
  helper :: ([Tag]->[Tag]) -> ([GroupInfo]->[GroupInfo]) -> [Either Tag GroupInfo] -> ([Tag],[GroupInfo])
  helper ls rs [] = (ls [],rs [])
  helper ls rs ((Right x):xs) = helper  ls      (rs.(x:)) xs
  helper ls rs ((Left  x):xs) = helper (ls.(x:)) rs       xs

-- Partial function: assumes starTrans has been run on the Pattern
patternToQ :: CompOption -> (Pattern,(GroupIndex,DoPa)) -> (Q,Array Tag OP,Array GroupIndex [GroupInfo])
patternToQ compOpt (pOrig,(maxGroupIndex,_)) = (tnfa,aTags,aGroups) where
  (tnfa,(tag_dlist,nextTag),groups) = runRWS monad startReader startState
  aTags = listArray (0,pred nextTag) (tag_dlist [])
  aGroups = makeGroupArray maxGroupIndex (fromRight groups)

  -- implicitly inside a PGroup 0 converted into a GroupInfo 0 undefined 0 1
  monad = go (starTrans pOrig) (Advice 0) (Advice 1)
  startReader :: Maybe GroupIndex
  startReader = Just 0                           -- start inside group 0, capturing enabled
  startState :: ([OP]->[OP],Tag)
  startState = ( (Minimize:) . (Maximize:) , 2)  -- Tag 0 is Minimized and Tag 1 is maximized.

  -- Specialize the monad operations and give more meaningful names
  makeOrbit :: PM (Maybe Tag)
  makeOrbit = do Apply x <- uniq Orbit
                 tell [Left x]
                 return (Just x)

  withOrbit :: PM a -> PM (a,[Tag])
  withOrbit = listens childStars
    where childStars x = let (ts,_) = partitionEither x in ts

  getParentIndex :: PM (Maybe GroupIndex)
  getParentIndex = ask

  makeGroup :: GroupInfo -> PM ()
  makeGroup = tell . (:[]) . Right

  nonCapture :: PM  a -> PM a
  nonCapture = local (const Nothing)

  withParent :: GroupIndex -> PM a -> PM (a,[Tag])
  withParent this = local (const (Just this)) . listens childGroupInfo
    where childGroupInfo x =
            let (_,gs) = partitionEither x
                children :: [GroupIndex]
                children = norep . sort . map thisIndex
                           -- filter to get only immediate children (efficiency)
                           . filter ((this==).parentIndex) $ gs
            in concatMap (map stopTag . (aGroups!)) (this:children)

  uniq :: OP -> PM HandleTag
  uniq newOp = do (op,s) <- get                -- generate the next tag with bias newOp
                  let op' = op . (newOp:)
                      s' = succ s
                  put $! debug ("\n"++show (s,newOp)++"\n") (op',s')
                  return (Apply s) -- someone will need to apply it

  -- Partial function: Must not pass in an empty list
  -- Policy choices:
  --  * pass tags to apply to children and have no preTag or postTag here (so none addded to nullQ)
  --  * middle 'mid' tag: give to left/front child as postTag so a Group there might claims as stopTag
  --  * if parent is Group then preReset will become non-empty
  combineConcat :: [Pattern] -> HHQ
  combineConcat | rightAssoc compOpt = (\ps -> foldr combineSeq (go (last ps)) (map go $ init ps))
                | otherwise          = (\ps -> foldl combineSeq (go (head ps)) (map go $ tail ps)) -- libtre default
    where combineSeq :: HHQ -> HHQ -> HHQ
          combineSeq pFront pEnd = (\ m1 m2 -> mdo
            let bothVary = varies qFront && varies qEnd
            a <- if noTag m1 && bothVary then uniq Minimize else return m1
            b <- if noTag m2 && bothVary then uniq Maximize else return m2
            mid <- case (noTag a,canAccept qFront,noTag b,canAccept qEnd) of
                     (False,False,_,_) -> return (toAdvice a)
                     (_,_,False,False) -> return (toAdvice b)
                     _ -> if tagged qFront || tagged qEnd then uniq Maximize else return NoTag
            qFront <- pFront a mid
            qEnd <- pEnd (toAdvice mid) b
            let wanted = if WantsEither == wants qEnd then wants qFront else wants qEnd
            return $ Q (mergeNullViews (nullQ qFront) (nullQ qEnd))
                       (seqTake (takes qFront) (takes qEnd))
                       [] Nothing Nothing
                       bothVary (childGroups qFront || childGroups qEnd) wanted
                       (Seq qFront qEnd)
                                   )
  go :: Pattern -> HHQ
  go pIn m1 m2 =
    let die = error $ "patternToQ cannot handle "++show pIn
        nil = return $ Q {nullQ=emptyNull (winTags (apply m1) (apply m2))
                         ,takes=(0,Just 0)
                         ,preReset=[],preTag=apply m1,postTag=apply m2
                         ,tagged=False,childGroups=False,wants=WantsEither
                         ,unQ=Empty}
        one = return $ Q {nullQ=notNull
                         ,takes=(1,Just 1)
                         ,preReset=[],preTag=apply m1,postTag=apply m2
                         ,tagged=False,childGroups=False,wants=WantsQNFA
                         ,unQ = OneChar pIn}
        test myTest = return $ Q {nullQ=testNull myTest (winTags (apply m1) (apply m2))
                                 ,takes=(0,Just 0)
                                 ,preReset=[],preTag=apply m1,postTag=apply m2
                                 ,tagged=False,childGroups=False,wants=WantsQT
                                 ,unQ=Test myTest }
    in case pIn of
         PEmpty -> nil
         POr [] -> nil
         POr [p] -> go p m1 m2
         POr ps -> mdo
           -- Exasperation: This POr recursive mdo is very easy to make loop and lockup the program
           let canVary = varies ans || childGroups ans -- childGroups detects that "abc|a(b)c" needs tags
           a <- if noTag m1 && canVary then uniq Minimize else return m1
           b <- if noTag m2 && canVary then uniq Maximize else return m2
           let aAdvice = toAdvice a
               bAdvice = toAdvice b
               -- Due to the recursive-do, it seems that I have to put the if canVary into the op'
               op' = if canVary then uniq Maximize else return bAdvice
           -- Preference for last branch is implicit: do not need op' to create uniq tag:
           cs <- fmap (++[bAdvice]) $ replicateM (pred $ length ps) op'
           qs <- mapM (\(p,c) -> go p aAdvice c) (zip ps cs)
           let wqs = map wants qs
               wanted = if any (WantsBoth==) wqs then WantsBoth
                          else case (any (WantsQNFA==) wqs,any (WantsQT==) wqs) of
                                 (True,True) -> WantsBoth
                                 (True,False) -> WantsQNFA
                                 (False,True) -> WantsQT
                                 (False,False) -> WantsEither
               nullView = addTagsToNullView (winTags (apply a) (apply b)) . cleanNullView . concatMap nullQ $ qs
               -- The nullView computed above takes the nullQ of the
               -- branches and combines them.  This assumes that the
               -- pre/post tags of the children are also part of the
               -- nullQ values.  So for consistency, POr must then add
               -- its own pre/post tags to its nullQ value.
           let ans = Q nullView
                       (orTakes . map takes $ qs)
                       [] (apply a) (apply b)
                       canVary (any childGroups qs) wanted
                       (Or qs)
           return ans
         PConcat [] -> nil -- fatal to pass [] to combineConcat
         PConcat ps -> combineConcat ps m1 m2
         PStar mayFirstBeNull p -> mdo
           let accepts    = canAccept q
               needsOrbit = varies q && childGroups q  -- otherwise it cannot matter or be observed which path is taken
               needsTags  = needsOrbit || accepts      -- important that needsOrbit implies needsTags
           a <- if noTag m1 && needsTags then uniq Minimize else return m1
           b <- if noTag m2 && needsTags then uniq Maximize else return m2
           c <- if needsOrbit then makeOrbit else return Nothing -- any Orbit tag is created after the pre and post tags
           (q,resetTags) <- withOrbit (go p NoTag NoTag)
           let nullView = emptyNull (winTags (apply a) (apply b)) -- chosen to represent skipping sub-pattern
           return $ Q nullView
                      (0,if accepts then Nothing else (Just 0))
                      [] (apply a) (apply b)
                      needsTags (childGroups q) WantsQT
                      (Star c resetTags mayFirstBeNull q)
         PCarat dopa -> test (Test_BOL,dopa)
         PDollar dopa -> test (Test_EOL,dopa)
         PChar {} -> one
         PDot {} -> one
         PAny {} -> one
         PAnyNot {} -> one
         PEscape {} -> one

         -- A PGroup node in the Pattern tree does not become a node
         -- in the Q/P tree. A PGroup can share and pass along a
         -- preTag (with Advice) with other branches, but will pass
         -- down an Apply postTag.
         --
         -- If the parent index is Nothing then this is part of a
         -- non-capturing subtree and ignored.
         PGroup Nothing p -> go p m1 m2
         PGroup (Just this) p -> do
           mParent <- getParentIndex
           case mParent of
             Nothing -> go p m1 m2
             Just parent -> do
               a <- if noTag m1 then uniq Minimize else return m1
               b <- if isNothing (apply m2) then uniq Maximize else return m2
               (q,resetTags) <- withParent this (go p a b)
               makeGroup (GroupInfo this parent (fromHandleTag a) (fromHandleTag b))
               return $ q { nullQ = addResetsToNullView resetTags (nullQ q)
                          , tagged = True
                          , childGroups = True
                          , preReset = resetTags `mappend` (preReset q) }

         -- A PNonCapture node in the Pattern tree does not become a
         -- node in the Q/P tree.  It sets the parent to Nothing while
         -- processing the sub-tree.
         PNonCapture p -> nonCapture (go p m1 m2)

         -- PNonEmpty means the child pattern p can be skipped by
         -- bypassing the pattern.  This is only used in the case p
         -- can accept 0 and can accept more than zero characters
         -- (thus the assertions, enforcted by CorePattern.starTrans).  The important thing about this case
         -- is intercept the "accept 0" possibility and replace with
         -- "skip".
         PNonEmpty p -> mdo
           let needsTags = canAccept q
           a <- if noTag m1 && needsTags then uniq Minimize else return m1
           b <- if noTag m2 && needsTags then uniq Maximize else return m2
           q <- go p (toAdvice a) (toAdvice b)
           when (not needsTags) (err $ "PNonEmpty could not accept characters: "++show (p,pOrig))
           when (mustAccept q) (err $ "patternToQ : PNonEmpty provided with a *mustAccept* pattern: "++show (p,pOrig))
           return $ Q (emptyNull (winTags (apply a) (apply b)))   -- The magic of NonEmpty
                      (0,snd (takes q))                           -- like Or
                      [] (apply a) (apply b)                      -- own the closing tag so it will not end a PGroup
                      needsTags (childGroups q) (wants q)         -- the test case is "x" =~ "(.|$){1,3}"
                      (NonEmpty q)

         -- these are here for completeness of the case branches, currently starTrans replaces them all
         PPlus {} -> die
         PQuest {} -> die
         PBound {} -> die