> -- | A translation schema from the external syntax (ERE) to our interal syntax (xhaskell style pattern)
> module Text.Regex.PDeriv.Translate 
>     ( translate, translatePosix ) where

> import Control.Monad.State -- needed for the translation scheme
> import Data.Char (chr)
> import qualified Data.IntMap as IM

> import Text.Regex.PDeriv.ExtPattern
> import Text.Regex.PDeriv.IntPattern
> import Text.Regex.PDeriv.RE
> import Text.Regex.PDeriv.Common


> -- | A state monad in which we can assign number to groups and non-groups.
> data TState = TState { ngi :: NGI   -- ^ negative group index
>                      , gi :: GI     -- ^ (positive) group index
>                      , anchorStart :: Bool
>                      , anchorEnd :: Bool
>                      , posix :: Bool -- ^ if posix, add binders to non-groups 
>                      , posix_binder :: IM.IntMap () -- ^ keep tracks of posix binder
>                      } -- the state for trasslation
>             deriving Show


> -- variables 0,-1,-2 are reserved for pre, main and post!
> initTState = TState { ngi = -3, gi = 1, anchorStart = False, anchorEnd = False, posix = False, posix_binder = IM.empty } 


> initTStatePosix = TState { ngi = -3, gi = 1, anchorStart = False, anchorEnd = False, posix = True, posix_binder = IM.empty } 


> type NGI = Int -- the non group index

> type GI = Int -- the group index

getters and putters

> getNGI :: State TState NGI
> getNGI = do { st <- get
>             ; return $ ngi st
>             }

> getIncNGI :: State TState NGI -- get then increase
> getIncNGI = do { st <- get
>                ; let i = ngi st
>                ; put st{ngi=(i-1)} 
>                ; return i
>                }

> getGI :: State TState GI
> getGI = do { st <- get
>            ; return $ gi st
>            }

> getIncGI :: State TState GI -- get then increase 
> getIncGI = do { st <- get
>               ; let i = gi st
>               ; put st{gi=(i+1)}
>               ; return i
>               }

> getAnchorStart :: State TState Bool
> getAnchorStart = do { st <- get
>                     ; return (anchorStart st)
>                     }

> setAnchorStart :: State TState ()
> setAnchorStart = do { st <- get
>                     ; put st{anchorStart=True}
>                     }

> getAnchorEnd :: State TState Bool
> getAnchorEnd  = do { st <- get
>                    ; return (anchorEnd st)
>                    }

> setAnchorEnd :: State TState ()
> setAnchorEnd = do { st <- get
>                   ; put st{anchorEnd=True}
>                   }

> isPosix :: State TState Bool
> isPosix = do { st <- get
>              ; return (posix st)
>              }

> addPosixBinder :: Int -> State TState ()
> addPosixBinder i = do { st <- get
>                       ; let bs = posix_binder st
>                             bs' = IM.insert i () bs
>                       ; put st{posix_binder=bs'}
>                       }

> -- | Translating external pattern to internal pattern
> translate :: EPat -> Pat
> translate epat = case runState (trans epat) initTState of
>                  (pat, state) ->
>                    let hasAnchorS = anchorStart state
>                        hasAnchorE = anchorEnd state
>                    in case (hasAnchorS, hasAnchorE) of
>                       (True, True)   -> PVar mainBinder [] pat 
>                       (True, False)  -> PPair (PVar mainBinder [] pat) (PVar subBinder [] (PE (Star Any NotGreedy)))
>                       (False, True)  -> PPair (PVar preBinder [] (PE (Star Any NotGreedy))) (PVar mainBinder [] pat)
>                       -- (False, False) -> PPair (PPair (PVar preBinder [] (PE (Star Any NotGreedy))) (PVar mainBinder [] pat)) (PVar subBinder [] (PE (Star Any NotGreedy)))
>                       -- (False, False) -> PPair (PVar preBinder_ [] (PPair (PVar preBinder [] (PE (Star Any NotGreedy))) (PVar mainBinder [] pat))) (PVar subBinder [] (PE (Star Any NotGreedy)))
>                       (False, False) -> (PPair (PVar preBinder [] (PE (Star Any NotGreedy))) (PVar preBinder_ [] (PPair (PVar mainBinder [] pat) (PVar subBinder [] (PE (Star Any NotGreedy))))))


> -- |  for posix 
> translatePosix :: EPat -> (Pat,IM.IntMap ())
> translatePosix epat = case runState (trans epat) initTStatePosix of
>                  (pat, state) ->
>                    let hasAnchorS = anchorStart state
>                        hasAnchorE = anchorEnd state
>                        posixBnd   = posix_binder state
>                    in case (hasAnchorS, hasAnchorE) of
>                       (True, True)   -> (PVar mainBinder [] pat, posixBnd)
>                       (True, False)  -> (PPair (PVar mainBinder [] pat) (PVar subBinder [] (PE (Star Any NotGreedy))), posixBnd)
>                       (False, True)  -> (PPair (PVar preBinder [] (PE (Star Any NotGreedy))) (PVar mainBinder [] pat), posixBnd)
>                       -- (False, False) -> PPair (PPair (PVar preBinder [] (PE (Star Any NotGreedy))) (PVar mainBinder [] pat)) (PVar subBinder [] (PE (Star Any NotGreedy)))
>                       -- (False, False) -> PPair (PVar preBinder_ [] (PPair (PVar preBinder [] (PE (Star Any NotGreedy))) (PVar mainBinder [] pat))) (PVar subBinder [] (PE (Star Any NotGreedy)))
>                       (False, False) -> ((PPair (PVar preBinder [] (PE (Star Any NotGreedy))) (PVar preBinder_ [] (PPair (PVar mainBinder [] pat) (PVar subBinder [] (PE (Star Any NotGreedy)))))), posixBnd)


> {-| 'trans' The top level translation scheme e ~> p
>     There are two sub rules.
>     e ~>_p p
>     and
>     e ~>_r r
>     which are fired depending on whether e has Group pattern (...) (i.e. pattern variable)
> -}
> trans :: EPat -> State TState Pat
> trans epat = 
>     do { is_posix <- isPosix -- if it is posix, we need to aggresively "tag" every sub expression with a binder
>        ; if is_posix && isStructural epat
>          then do 
>            { gi <- getIncGI
>            ; ipat <- trans' epat
>            ; addPosixBinder gi
>            ; return (PVar gi [] ipat)
>            }
>          else trans' epat
>        }
>     where isStructural :: EPat -> Bool -- ^ indicate whether it is a complex structure which we need to add extra binding for POSIX tracking
>           isStructural (EOr _)     = True
>           isStructural (EConcat _) = True                                                                      
>           isStructural (EOpt _ _)  = True
>           isStructural (EPlus _ _) = True
>           isStructural (EStar _ _) = True
>           isStructural _           = False                                                                      

> trans' :: EPat -> State TState Pat
> trans' epat 
>      | hasGroup epat = p_trans epat
>      | otherwise     = do 
>                 { r <- r_trans epat
>                 ; return (PE r)
>                 }

> {-
> trans :: EPat -> State TState Pat
> trans epat | hasGroup epat = p_trans epat
>            | otherwise     = 
>                do 
>                { is_posix <- isPosix 
>                ; if is_posix 
>                  then do 
>                       { gi <- getIncGI
>                       ; r <- r_trans epat
>                       ; addPosixBinder gi
>                       ; return (PVar gi [] (PE r))
>                       }
>                  else do 
>                    { r <- r_trans epat
>                    ; return (PE r)
>                    }
>                }
> -}


> {-| 'p_trans' implementes the rule 'e ~>_p p'
> convention:
> a,b are non group indices.
> x,y,z are group indices
> -}
> p_trans :: EPat -> State TState Pat
> p_trans epat = 
>     case epat of
>       -- () ~>_p ()
>     { EEmpty ->
>       do { return ( PE Empty )
>          }
>       {-
>         e ~> p
>         -----------------
>         ( e ) ~>_p x :: p
>       -}
>     ; EGroup e ->
>       do { i <- getIncGI
>          ; -- p <- trans e
>          ; p <- trans' e -- no need to go through trans which possible tag p with a posix var
>          ; return ( PVar i [] p)
>          }
>       {-
>         e ~> p
>         -----------------
>         (? e ) ~>_p p 
>        -}
>     ; EGroupNonMarking e -> 
>         trans' e
>     ; EOr es -> 
>         {-
>           e1 ~> p1  e2 ~> p2
>           -------------------
>             e1|e2 ~>_p p1|p2
>          -}
>       do { ps <- mapM trans es
>          ; case ps of
>            { (p':ps') -> 
>               return (foldl (\xs x -> PChoice xs x Greedy) p' ps')
>            ; [] -> error "an empty choice enountered." -- todo : capture the error in the monad state
>            }
>          }
>     ; EConcat es ->
>         {- 
>            e1 ~> p1  e2 ~> p2
>            ---------------------
>              (e1,e2) ~>_p (p1,p2)
>          -} 
>         do { ps <- mapM trans es
>            ; case reverse ps of  -- to make sure it is right assoc
>              { (p':ps') -> 
>                    return (foldl (\xs x -> PPair x xs) p' ps')
>              ; [] -> error "an empty sequence enountered." -- todo : capture the error in the moand state
>              }
>            }
>     ; EOpt e b ->
>       {- 
>          todo : not sure whether this makes sense
>            e ~> p
>          -------------------
>            e? ~>_p p|()
>        -}
>       do { p <- trans e
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>          ; return (PChoice p (PE Empty) g)
>          }
>     ; EPlus e b ->
>       {- 
>            e ~> p
>          -------------------
>            p+ ~>_p (p,p*)
>        -}
>       do { p <- trans e
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>          ; return (PPair p (PStar p g))
>          }
>     ; EStar e b -> 
>       {- 
>            e ~> p
>          -------------------
>            e*~>_p p*
>        -}
>       do { p <- trans e
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>          ; return (PStar p g)
>          }
>     ; EBound e low (Just high) b ->
>         {- we could have relax this rule to e ~> p
>             e ~>_r r  
>             r1 = take l (repeat r)
>             r2 = take (h-l) (repeat r?)
>             r' = (r1,r2)
>           -------------------------------------
>             e{l,h} ~> a :: r' 
>          -}
>       do { r <- r_trans e
>          ; i <- getIncNGI
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>                r1s = take low $ repeat r
>                r1 = case r1s of 
>                     { [] -> Empty
>                     ; (r':rs') -> foldl (\ rs r -> Seq rs r) r' rs'
>                     }
>                r2s = take (high - low) $ repeat (Choice r Empty g)
>                r2 = case r2s of
>                     { [] -> Empty
>                     ; (r':rs') -> foldl (\ rs r -> Seq rs r) r' rs'
>                     }
>                r3 = case (r1,r2) of 
>                       (Empty, Empty) -> Empty
>                       (Empty, _    ) -> r2
>                       (_    , Empty) -> r1
>                       (_    , _    ) -> Seq r1 r2
>                p = PVar i [] (PE r3)
>          ; return p
>          }
>     ; EBound e low Nothing b ->
>         {-
>             e ~>_r r  
>             r1 = take l (repeat r)
>             r' = (r1,r*)
>           -------------------------------------
>             e{l,} ~> a :: r' 
>          -}
>       do { r <- r_trans e
>          ; i <- getIncNGI
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>                r1s = take low $ repeat r
>                r1 = case r1s of 
>                     { [] -> Empty
>                     ; (r':rs') -> foldl (\ rs r -> Seq rs r) r' rs'
>                     }
>                r2 = Seq r1 (Star r g)
>                p = PVar i [] (PE r2)
>          ; return p
>          }
>     ; ECarat -> 
>       -- currently we anchor the entire expression 
>       -- regardless of where ^ appears, we turn the subsequent
>       -- ECarat into literal,
>       do { f <- getAnchorStart
>          ; if f 
>            then do { i <- getIncNGI -- not the first carat
>                    ; let r = L '^'
>                          p = PVar i [] (PE r)
>                    ; return p
>                    }
>            else do { setAnchorStart -- the first carat
>                    ; i <- getIncNGI
>                    ; let r = Empty
>                          p = PVar i [] (PE r)
>                    ; return p
>                    }
>          }
>     ; EDollar -> 
>           -- similar to carat, except that we will not treat
>           -- the subsequent EDollar as literal.
>       do { f <- getAnchorEnd
>          ; if f
>            then return ()
>            else setAnchorEnd
>          ; i <- getIncNGI
>          ; let r = Empty
>                p = PVar i [] (PE r)
>          ; return p
>          }
>     ; EDot -> 
>         --  . ~> a :: \Sigma 
>         -- we might not need this rule
>       do { i <- getIncNGI
>          ; let r = Any
>                p = PVar i [] (PE r)
>          ; return p
>         }
>     ; EAny cs ->
>         -- [ abc ] ~> a :: 'a'|'b'|'c' 
>         -- we might not need this rule
>       do { i <- getIncNGI
>          ; let r = char_list_to_re cs
>                -- r = Any
>                p = PVar i [] (PE r)
>          ; return p
>          }
>     ; ENoneOf cs ->
>         -- [^ abc] ~> a :: \Sigma - 'a'|'b'|'c' 
>         -- we might not need this rule
>       do { i <- getIncNGI
>          ; let -- r = char_list_to_re (filter (\c -> not (c `elem` cs )) sigma)
>                r = Not cs
>                p = PVar i [] (PE r)
>          ; return p
>          }
>     ; EEscape c ->
>         -- \\c ~> a :: L c 
>         -- we might not need this rule
>       do { i <- getIncNGI 
>          ; let p = PVar i [] (PE (L c))
>          ; return p 
>          }
>     ; EChar c ->
>         -- c ~> a :: L c
>         -- we might not need this rule
>       do { i <- getIncNGI
>          ; let p = PVar i [] (PE (L c))
>          ; return p
>          }
>     }


> char_list_to_re (c:cs) = foldl (\ r c' -> Choice r (L c') Greedy) (L c) cs
> char_list_to_re [] = error "char_list_to_re expects non-empty list"

> alphas = char_list_to_re (['a'..'z'] ++ ['A'..'Z'])

> digits = char_list_to_re ['0'..'9']

> sigma = map chr [0 .. 255]

> anychar = char_list_to_re sigma


e ~>_r r


> r_trans :: EPat -> State TState RE
> r_trans e = 
>     case e of 
>     { EEmpty -> 
>       {-
>         () ~>_r ()
>        -}
>       return Empty
>     ; EGroup e ->
>       {- we might not need this rule
>          e ~> r
>         ----------
>         (e) ~> r
>        -}
>       r_trans e
>     ; EGroupNonMarking e ->
>       {- we might not need this rule
>          e ~> r
>         ----------
>         (?e) ~> r
>        -}
>       r_trans e
>     ; EOr es ->
>       {-
>         e1 ~>_r r1 e2 ~>_r r2
>       -------------------
>         e1|e2 ~>_r r1|r2
>        -}
>       do { rs <- mapM r_trans es
>          ; case rs of
>            { [] -> return Phi
>            ; (r:rs) -> return (foldl (\ xs x -> Choice xs x Greedy) r rs)
>            }
>          }
>     ; EConcat es ->
>       {-
>         e1 ~>_r r1  e2 ~>_r r2
>        ----------------------
>         (e1,e2) ~>_r (r1,r2)
>        -}
>       do { rs <- mapM r_trans es
>          ; case rs of
>            { [] -> return Empty
>            ; (r:rs) -> return (foldl (\ xs x -> Seq xs x) r rs)
>            }
>          }
>     ; EOpt e b -> 
>       {-
>           e ~>_r r
>         -----------
>           e? ~>_r r?
>        -}
>       do { r <- r_trans e
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>          ; return (Choice r Empty g)
>          }
>     ; EPlus e b -> 
>       {-
>         e ~>_r r
>         ---------------
>           e+ ~>_r (r,r*)
>        -}
>       do { r <- r_trans e
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>          ; return (Seq r (Star r g))
>          }
>     ; EStar e b -> 
>       {-
>         e ~>_r r
>         ----------------
>           e* ~>_r r*
>        -}
>       do { r <- r_trans e
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>          ; return (Star r g)
>          }
>     ; EBound e low (Just high) b ->
>       {-
>         e ~>_r r
>         r1 = take l (repeat r)
>         r2 = take (h-l) (repeat r?)
>         r' = (r1,r2)         
>         -----------------
>           e{l:h} => r'
>        -}
>       do { r <- r_trans e
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>                r1s = take low $ repeat r
>                r1 = case r1s of
>                     { [] -> Empty
>                     ; (r':rs') -> foldl (\ rs r -> Seq rs r) r' rs'
>                     }
>                r2s = take (high - low) $ repeat (Choice r Empty g)
>                r2 = case r2s of
>                     { [] -> Empty
>                     ; (r':rs') -> foldl (\ rs r -> Seq rs r) r' rs'
>                     }
>                r3 = case (r1,r2) of
>                       (Empty, Empty) -> Empty
>                       (Empty, _    ) -> r2
>                       (_    , Empty) -> r1
>                       (_    , _    ) -> Seq r1 r2
>          ; return r3
>          }
>     ; EBound e low Nothing b -> 
>         {-
>             e ~>_r r  
>             r1 = take l (repeat r)
>             r' = (r1,r*)
>           -------------------------------------
>             e{l,} => r' 
>          -}
>       do { r <- r_trans e
>          ; let g | b = Greedy
>                  | otherwise = NotGreedy
>                r1s = take low $ repeat r
>                r1 = case r1s of 
>                     { [] -> Empty
>                     ; (r':rs') -> foldl (\ rs r -> Seq rs r) r' rs'
>                     }
>                r2 = Seq r1 (Star r g)
>          ; return r2
>          }
>     ; ECarat -> 
>       -- currently we anchor the entire expression 
>       -- regardless of where ^ appears, we turn the subsequent
>       -- ECarat into literal,
>       do { f <- getAnchorStart
>          ; if f 
>            then return (L '^') -- not the first carat
>            else do { setAnchorStart -- the first carat
>                    ; return Empty
>                    }
>          }
>     ; EDollar -> 
>           -- similar to carat, except that we will not treat
>           -- the subsequent EDollar as literal.
>       do { f <- getAnchorEnd
>          ; if f
>            then return ()
>            else setAnchorEnd
>          ; return Empty
>          }
>     ; EDot -> 
>         --  . ~>_r \Sigma
>         -- return anychar
>         return Any
>     ; EAny cs ->
>         --  [ abc ] ~>_r 'a'|'b'|'c'
>         return (char_list_to_re cs)
>     ; ENoneOf cs ->
>         --  [^ abc] ~>_r \Sigma - 'a'|'b'|'c'
>         -- return $ char_list_to_re (filter (\c -> not (c `elem` cs )) sigma)
>         return (Not cs)
>     ; EEscape c ->
>         --  \\c ~>_r c
>         return $ L c
>     ; EChar c ->
>         --  c ~>_r c
>         return $ L c
>     }
>