> -- | This module defines the data type of internal regular expression pattern, 
> -- | as well as the partial derivative operations for regular expression patterns.
> module Text.Regex.PDeriv.IntPattern where
> import Data.List
> import Text.Regex.PDeriv.Common (Range, Letter, IsEmpty(..), GFlag(..), IsGreedy(..) )
> import Text.Regex.PDeriv.RE
> import Text.Regex.PDeriv.Dictionary (Key(..), primeL, primeR)
> import Text.Regex.PDeriv.Pretty
> -- | regular expression patterns
> data Pat = PVar Int [Range] Pat       -- ^ variable pattern 
>   | PE RE                             -- ^ pattern without binder
>   | PPair Pat Pat                     -- ^ pair pattern
>   | PChoice Pat Pat GFlag             -- ^ choice pattern 
>   | PStar Pat GFlag                   -- ^ star pattern 
>   | PPlus Pat Pat                     -- ^ plus pattern, it is used internally to indicate that it is unrolled from a PStar
>   | PEmpty Pat                        -- ^ empty pattern, it is used intermally to indicate that mkEmpty function has been applied.
> {-| The Eq instance for Pat data type
>     NOTE: We ignore the 'consumed word' when comparing patterns
>     (ie we only compare the pattern structure).
>     Essential for later comparisons among patterns. -}
> instance Eq Pat where
>   (==) (PVar x1 _ p1) (PVar x2 _ p2) = (x1 == x2) && (p1 == p2) 
>   (==) (PPair p1 p2) (PPair p1' p2') = (p1 == p1') && (p2 == p2')
>   (==) (PChoice p1 p2 g1) (PChoice p1' p2' g2) = (g1 == g2) && (p2 == p2') && (p1 == p1') -- more efficient, because choices are constructed in left-nested
>   (==) (PPlus p1 p2) (PPlus p1' p2') = (p1 == p1') && (p2 == p2')
>   (==) (PStar p1 g1) (PStar p2 g2) =  (g1 == g2) && (p1 == p2)
>   (==) (PE r1) (PE r2) = r1 == r2
>   (==) _ _ = False
> 
> instance Pretty Pat where
>     pretty (PVar x1 _ p1) = "(" ++ show x1 ++ ":" ++ pretty p1 ++ ")"
>     pretty (PPair p1 p2) = "<" ++ pretty p1 ++ "," ++ pretty p2 ++ ">"
>     pretty (PChoice p1 p2 g) = "(" ++ pretty p1 ++ "|" ++ pretty p2 ++ ")" ++ (show g)
>     pretty (PE r) = show r
>     pretty (PPlus p1 p2 ) = "(" ++ pretty p1 ++ "," ++ pretty p2 ++ ")"
>     pretty (PStar p g) = (pretty p) ++ "*" ++ (show g)
>     pretty (PEmpty p) = "[" ++ pretty p ++ "]"
> instance Show Pat where
>     show pat = pretty pat
> instance Key Pat where
>     hash (PVar x1 _ p1) = let y1 = head (hash x1) 
>                               y2 = head (hash p1)
>                           in y1 `seq` y2 `seq` [ 1 + y1 * primeL + y2 * primeR ] 
>     hash (PPair p1 p2) = let x1 = head (hash p1)
>                              x2 = head (hash p2)
>                          in x1 `seq` x2 `seq` [ 2 + x1 * primeL + x2 * primeR ] 
>     hash (PChoice p1 p2 Greedy) = let x1 = head (hash p1)
>                                       x2 = head (hash p2)
>                                   in x1 `seq` x2 `seq`  [ 4 + x1 * primeL + x2 * primeR ] 
>     hash (PChoice p1 p2 NotGreedy) = let x1 = head (hash p1)
>                                          x2 = head (hash p2)
>                                      in x1 `seq` x2 `seq` [ 5 + x1 * primeL + x2 * primeR ]
>     hash (PPlus p1 p2) = let x1 = head (hash p1)
>                              x2 = head (hash p2)
>                          in x1 `seq` x2 `seq` [ 6 + x1 * primeL + x2 * primeR ]
>     hash (PStar p Greedy) = let x = head (hash p)
>                             in x `seq` [ 7 + x * primeL ]
>     hash (PStar p NotGreedy) = let x = head (hash p)
>                             in x `seq` [ 8 + x * primeL ]
>     hash (PE r) = let x = head (hash r)
>                   in x `seq` [ 9 + x * primeL ]
>     hash (PEmpty p) = let x = head (hash p)
>                       in x `seq` [ 3 + x * primeL ]
>
> -- | function 'strip' strips away the bindings from a pattern
> strip :: Pat -> RE 
> strip (PVar _ w p) = strip p
> strip (PE r) = r
> strip (PStar p g) = Star (strip p) g
> strip (PPair p1 p2) = Seq (strip p1) (strip p2)
> strip (PPlus p1 p2) = Seq (strip p1) (strip p2)
> strip (PChoice p1 p2 g) = Choice (strip p1) (strip p2) g
> strip (PEmpty p) = strip p
> -- | function 'mkEmpPat' makes an empty pattern
> mkEmpPat :: Pat -> Pat
> mkEmpPat (PVar x w p) = PVar x w (mkEmpPat p)
> mkEmpPat (PE r) 
>   | isEmpty r = PE Empty
>   | otherwise = PE Phi
> mkEmpPat (PStar p g) = PE Empty -- problematic?! we are losing binding (x,()) from  ( x : a*) ~> PE <>
> mkEmpPat (PPlus p1 p2) = mkEmpPat p1 -- since p2 must be pstar we drop it. If we mkEmpPat p2, we need to deal with pdPat (PPlus (x :<>) (PE <>)) l
> mkEmpPat (PPair p1 p2) = PPair (mkEmpPat p1) (mkEmpPat p2)
> mkEmpPat (PChoice p1 p2 g) = PChoice (mkEmpPat p1) (mkEmpPat p2) g
> {-| function 'pdPat' computes the partial derivatives of a pattern w.r.t. a letter.
>    Integrating non-greedy operator with PStar
>    For p*, we need to unroll it into a special construct
>    say PPlus p' p* where p' \in p/l.
>    When we push another label, say l' to PPlus p' p*, and
>    p' is emptiable, naively, we would do 
>    [ PPlus p'' p* | p'' <- p' / l ] ++ [ PPlus (mkE p') (PPlus p''' p*) | (PPlus p''' p*) <- p*/l ]
>    Now the problem here is the shape of the pdpat are infinite, which 
>    breaks the requirement of getting a compilation scheme.
>    The fix here is to simplify the second component, by combining the binding, of (mkE p') and p'''
>    since they share the same set of variables.
>    [ PPlus p'' p* | p'' <- p' / l ] ++ [ PPlus p4 p* | (PPlus p''' p*) <- p*/l ] 
>    where p4 = combineBinding (mkE p') p'''
>    For pdPat0 approach, we do not need to do this explicitly, we simply drop 
>    (mkE p') even in the PPair case. see the definitely of pdPat0 below
> -}
> pdPat :: Pat -> Letter -> [Pat]
> pdPat (PVar x w p) (l,idx) = 
>          let pds = pdPat p (l,idx)
>          in if null pds then []
>             else case w of
>		  [] -> [ PVar x [ (idx,idx) ] pd | pd <- pds ]
>		  ((b,e):rs)      --  ranges are stored in the reversed manner, the first pair the right most segment
>                     | idx == (e + 1) -> -- it is consecutive
>                         [ PVar x ((b,idx):rs) pd | pd <- pds ]
>                     | otherwise ->      -- it is NOT consecutive
>                         [ PVar x ((idx,idx):(b,e):rs) pd | pd <- pds ]
> pdPat (PE r) (l,idx) = let pds = partDeriv r l 
>                  in if null pds then []
>                     else [ PE $ resToRE pds ]
> {-| The non-greedy operator has impact to a sequence pattern if the 
>     first sub pattern is non-greedy. We simply swap the order of the 
>     'choices' in the resulting pds. -} 
> pdPat (PPair p1 p2) l = 
>   if (isEmpty (strip p1))
>   then  if isGreedy p1
>         then nub ([ PPair p1' p2 | p1' <- pdPat p1 l] ++ 
>                   [ PPair (mkEmpPat p1) p2' | p2' <- pdPat p2 l])
>         else nub ( [ PPair (mkEmpPat p1) p2' | p2' <- pdPat p2 l] ++ [ PPair p1' p2 | p1' <- pdPat p1 l] )
>   else [ PPair p1' p2 | p1' <- pdPat p1 l ]
> {-| Integrating non-greedy operator with pstar requires more cares.
>     We have two questions to consider.
>      1) When we unfold p*, do we need to take the non-greediness in p into consideration?
>         The answer is no. 
>         What happens is that when we unfold p* into p and p*, if we were considering p is non-greedy and apply pdpat to p*
>         again, i.e. mkE(p),p*/l we will run into non-terminating problem. This seems to be bug with python re library. -} 
> pdPat (PStar p g) l = let pds = pdPat p l
>                       in [ PPlus pd (PStar p g) | pd <- pds ]
> {-| 2) After we unfold p* and 'advance' to (PPlus p' p*) say p' \in p / l for some l
>        we are in the position of 'pushing' another label l' into  (PPlus p' p*). 
>     Shall we swap the order of the alternatives when p' is non-greedy?
>     Why not? This seems harmless since we have already made some progress by pushing l into p*. -}
> pdPat (PPlus p1 p2@(PStar _ _)) l -- p2 must be pStar
>     | isEmpty (strip p1) = 
>         if isGreedy p1 
>         then [ PPlus p3 p2 | p3  <- pdPat p1 l ] ++ [ PPlus p3 p2' | (PPlus p1' p2') <- pdPat p2 l, let p3 =  p1' `getBindingsFrom` p1 ]
>         else [ PPlus p3 p2' | (PPlus p1' p2') <- pdPat p2 l, let p3 =  p1' `getBindingsFrom` p1 ] ++ [ PPlus p3 p2 | p3  <- pdPat p1 l ]
>     | otherwise          = [ PPlus p3 p2 | p3  <- pdPat p1 l ]
> pdPat (PChoice p1 p2 g) l = 
>    nub ((pdPat p1 l)  ++ (pdPat p2 l)) -- nub doesn't seem to be essential
> pdPat p l = error ((show p) ++ (show l))
> -- | function 'getBindingsFrom' transfer bindings from p2 to p1
> getBindingsFrom :: Pat  -- ^ the source of the  
>                    -> Pat -> Pat
> getBindingsFrom p1 p2 = let b = toBinder p2
>                         in assign p1 b
>     where assign :: Pat -> Binder -> Pat
>           assign (PVar x w p) b = case lookup x b of
>                                     Nothing -> let p' = assign p b in PVar x w p'
>                                     Just rs -> let p' = assign p b in PVar x (w ++ rs) p'
>           assign (PE r) _ = PE r
>           assign (PPlus p1 p2) b = PPlus (assign p1 b) p2 -- we don't need to care about p2 since it is a p*
>           assign (PPair p1 p2) b = PPair (assign p1 b) (assign p2 b)
>           assign (PChoice p1 p2 g) b = PChoice (assign p1 b) (assign p2 b) g
> -- | Function 'isGreedy' checks whether a pattern is greedy
> instance IsGreedy Pat where
>     isGreedy (PVar _ _ p) = isGreedy p
>     isGreedy (PE r) = isGreedy r
>     isGreedy (PPair p1 p2) = isGreedy p1 || isGreedy p2
>     isGreedy (PChoice p1 p2 Greedy) = True
>     isGreedy (PChoice p1 p2 NotGreedy) = False -- isGreedy p1 || isGreedy p2
>     isGreedy (PEmpty p) = False
>     isGreedy (PStar p Greedy) = True
>     isGreedy (PStar p NotGreedy) = False
>     isGreedy (PPlus p p') = isGreedy p || isGreedy p'
> -- | The 'Binder' type denotes a set of (pattern var * range) pairs
> type Binder = [(Int, [Range])]
> -- | Function 'toBinder' turns a pattern into a binder
> toBinder :: Pat -> Binder
> toBinder  (PVar i rs p) = [(i,rs)] ++ (toBinder p)
> toBinder  (PPair p1 p2) = (toBinder p1) ++ (toBinder p2)
> toBinder  (PPlus p1 p2) = (toBinder p1) 
> toBinder  (PStar p1 g)    = (toBinder p1) 
> toBinder  (PE r)        = []
> toBinder  (PChoice p1 p2 g) = (toBinder p1) ++ (toBinder p2)
> toBinder  (PEmpty p) = toBinder p
> {-| Function 'updateBinderByIndex' updates a binder given an index to a pattern var
>     ASSUMPTION: the var index in the pattern is linear. e.g. no ( 0 :: R1, (1 :: R2, 2 :a: R3))
> -}
> updateBinderByIndex :: Int    -- ^ the indext of the pattern variable
>                        -> Int -- ^ the letter position
>                        -> Binder -> Binder
> updateBinderByIndex i lpos binder =
>     updateBinderByIndexSub lpos i binder 
> 
> updateBinderByIndexSub :: Int -> Int -> Binder -> Binder
> updateBinderByIndexSub pos idx [] = []
> updateBinderByIndexSub pos idx  (x@(idx',(b,e):rs):xs)
>     | pos `seq` idx `seq` idx' `seq` xs `seq` False = undefined
>     | idx == idx' && pos == (e + 1) = (idx', (b, e+ 1):rs):xs
>     | idx == idx' && pos > (e + 1)  = (idx', (pos,pos):(b, e):rs):xs
>     | idx == idx' && pos < (e + 1)  = error "impossible, the current letter position is smaller than the last recorded letter"
>     | otherwise =  x:(updateBinderByIndexSub pos idx xs)
> updateBinderByIndexSub pos idx (x@(idx',[]):xs)
>     | pos `seq` idx `seq` idx' `seq` xs `seq` False = undefined
>     | idx == idx' = ((idx', [(pos, pos)]):xs)
>     | otherwise = x:(updateBinderByIndexSub pos idx xs)
> {-| Function 'pdPat0' is the 'abstracted' form of the 'pdPat' function
>     It computes a set of pairs. Each pair consists a 'shape' of the partial derivative, and
>     an update function which defines the change of the pattern bindings from the 'source' pattern to 
>     the resulting partial derivative. This is used in the compilation of the regular expression pattern -}
> pdPat0 :: Pat  -- ^ the source pattern
>           -> Letter -- ^ the letter to be "consumed"
>           -> [(Pat, Int -> Binder -> Binder)]
> pdPat0 (PVar x w p) (l,idx) 
>     | null (toBinder p) = -- p is not nested
>         let pds = partDeriv (strip p) l
>         in pds `seq` if null pds then []
>                      else [ (PVar x [] (PE (resToRE pds)), (\i -> (updateBinderByIndex x i))) ]
>     | otherwise = 
>         let pfs = pdPat0 p (l,idx)
>         in pfs `seq` [ (PVar x [] pd, (\i -> (updateBinderByIndex x i) . (f i) ) ) | (pd,f) <- pfs ]
> pdPat0 (PE r) (l,idx) = 
>     let pds = partDeriv r l
>     in  pds `seq` if null pds then []
>                   else [ (PE (resToRE pds), ( \_ -> id ) ) ]
> pdPat0 (PStar p g) l = let pfs = pdPat0 p l
>                        in pfs `seq` [ (PPair p' (PStar p g), f) | (p', f) <- pfs ]
> pdPat0 (PPair p1 p2) l = 
>     if (isEmpty (strip p1))
>     then if isGreedy p1
>          then nub2 ([ (PPair p1' p2, f) | (p1' , f) <- pdPat0 p1 l ] ++ (pdPat0 p2 l))
>          else nub2 ((pdPat0 p2 l) ++ [ (PPair p1' p2, f) | (p1' , f) <- pdPat0 p1 l ])
>     else [ (PPair p1' p2, f) | (p1',f) <- pdPat0 p1 l ]
> pdPat0 (PChoice p1 p2 g) l = 
>      nub2 ((pdPat0 p1 l) ++ (pdPat0 p2 l)) -- nub doesn't seem to be essential
> nub2 :: Eq a => [(a,b)] -> [(a,b)]
> nub2 = nubBy (\(p1,f1) (p2, f2) -> p1 == p2)