-----------------------------------------------------------------------------
-- |
-- Module      :  Language.HaLex.RegExp
-- Copyright   :  (c) João Saraiva 2001,2002,2003,2004,2005, 2016
-- License     :  LGPL
-- 
-- Maintainer  :  jas@di.uminho.pt
-- Stability   :  provisional
-- Portability :  portable
--
-- Regular Expressions in Haskell. 
-- 
-- Code Included in the Lecture Notes on 
--          Language Processing (with a functional flavour).   
--
-----------------------------------------------------------------------------

module Language.HaLex.RegExp ( 
              -- * Data type with recursion pattern
                RegExp (..)
              , cataRegExp
              -- * Matching
              , matchesRE
              , matches'
              -- * Size
              , sizeRegExp
              -- * Printing
              , showRE
              -- * Simplification
              , simplifyRegExp
              -- * Normalization
              , extREtoRE
              ) where

-----------------------------------------------------------------------------
-- * Data type with recursion pattern

-- | Type of regular expressions.
data RegExp sy  = Empty                              -- ^ Empty Language
                | Epsilon                            -- ^ Empty String
                | Literal   sy                       -- ^ Literals
                | Or        (RegExp sy) (RegExp sy)  -- ^ Disjuncion
                | Then      (RegExp sy) (RegExp sy)  -- ^ Sequence
                | Star      (RegExp sy)              -- ^ Repetition, possibly zero time
                | OneOrMore (RegExp sy)              -- ^ One or more times (extended RegExp)
                | Optional  (RegExp sy)              -- ^ Optional (extended RegExp)
                | RESet     [sy]                     -- ^ Set (extended RegExp)
   deriving (Read, Eq)
   
-- | Catamorphism induced by the 'RegExp' inductive data type

cataRegExp :: ( re 
              , re
              , re   -> re -> re
              , re   -> re
              , sy   -> re
              , re   -> re -> re
              , re   -> re
              , re   -> re
              , [sy] -> re
              ) -> RegExp sy -> re

cataRegExp (empty,epsilon,or,star,lit,th,one,opt,set) = cata
 where cata Empty          = empty 
       cata Epsilon        = epsilon
       cata (Or er1 er2)   = or (cata er1) (cata er2)
       cata (Star er)      = star (cata er)
       cata (Literal a)    = lit a
       cata (Then er1 er2) = th (cata er1) (cata er2)
       cata (OneOrMore er) = one (cata er)
       cata (Optional er)  = opt (cata er)
       cata (RESet st)     = set st

-----------------------------------------------------------------------------
-- * Matching

-- | Test whether a match can be found for the given regular expression
--   in the given sequence of characters. The regular expression is
--   assumed not to contain 'OneOrMore' or 'Optional'. See also @matches'@.

matchesRE :: Eq sy 
          => RegExp sy         -- ^ (canonical) Regular Expression
          -> [sy]              -- ^ Input Symbols
          -> Bool
matchesRE Empty inp          = False
matchesRE Epsilon inp        = inp == []
matchesRE (Literal l) inp    = ([l] == inp)
matchesRE (Or re1 re2) inp   = matchesRE re1 inp || matchesRE re2 inp 
matchesRE (Then re1 re2) inp = or [ matchesRE re1 s1 && matchesRE re2 s2 
                                  | (s1,s2) <- splits inp]
matchesRE (Star re) inp      = matchesRE Epsilon inp ||
                               or [ matchesRE re s1 && matchesRE (Star re) s2 
                                  | (s1,s2) <- frontSplits inp ]

-- | Test whether a match can be found for the given regular expression
--   in the given sequence of characters. The regular expression is
--   allowed to contain 'OneOrMore' or 'Optional'.

matches' :: Eq sy 
         => RegExp sy          -- ^ Regular Expression 
         -> [sy]               -- ^ Input Symbols
         -> Bool
matches' = matchesRE . extREtoRE

-- | Produce a list of all possible ways of splitting the input list
--   into two parts. For instance, 
-- @ 
--   splits "foo" 
--     = [(\"\","foo"),("f","oo"),("fo","o"),("foo",\"\")] 
-- @

splits :: [a]                -- ^ Input List
       -> [ ([a],[a]) ]      -- ^ Splited List
splits s = [ splitAt n s | n <- [ 0 .. length s ] ]


-- | Produce a list of all possible ways of splitting the input list
--   into two parts where the first part is non-empy. For instance, 
-- @ 
--   splits "foo" 
--     = [("f","oo"),("fo","o"),("foo",\"\")]
-- @

frontSplits :: [a] -> [ ([a],[a]) ]
frontSplits s = [ splitAt n s | n <- [ 1 .. length s ] ]


-----------------------------------------------------------------------------
-- * Size

-- | Compute the size of a regular expression.
--   We define the size of a regular expression as the number of occurrences 
--   of symbols of the alfabeth

sizeRegExp :: RegExp sy      -- ^ Regular Expression
           -> Int            -- ^ Size
sizeRegExp = cataRegExp (0,0,(+),id,\x -> 1,(+),id,id,length)


-----------------------------------------------------------------------------
-- * Printing

-- | Print regular expression to String as a catamorphism.
--   A straightforward (catamorphic) show function.
--
--   (it produces too many brackets, making it difficult to read or 
--    understand the expression)

showRE :: Show sy 
       => RegExp sy        -- ^ Regular Expression
       -> [Char]           -- ^ String-based Regular Expression
showRE = cataRegExp  ("{}"
                     , "@"
                     , \ l r -> "(" ++ l ++ "|" ++ r ++ ")"
                     , \ er  -> "(" ++ er ++ ")*"
                     , show 
                     , \ l r -> "(" ++ l ++ r ++ ")"
                     , \ er  -> "(" ++ er ++ ")+"
                     , \ er  -> "(" ++ er ++ ")?"
                     , \ set -> show set
                     )

-- | Pretty print of regular expressions.

instance Show sy => Show (RegExp sy) where
          showsPrec _ Empty             = showString "{}"
          showsPrec _ Epsilon           = showChar '@'
          showsPrec _ (Literal c)       = showsPrec 0 c
{-                | isSymbol c            = showChar '\''
                                        . showChar c
                                        . showChar '\''
                | otherwise             = showChar c
-}
          showsPrec n (Star e)          = showsPrec 10 e . showChar '*'
          showsPrec n (OneOrMore e)     = showParen (n == 4)
                                        $ showsPrec 10 e
                                        . showChar '+'
          showsPrec _ (Optional e)      = showsPrec 10 e
                                        . showChar '?'
          showsPrec n (e1 `Or` e2)      = showParen (n /= 0 && n /= 4)
                                        $ showsPrec 4 e1
                                        . showChar '|'
                                        . showsPrec 4 e2
          showsPrec n (e1 `Then` e2)    = showParen (n /= 0 && n /= 6)
                                        $ showsPrec 6 e1
                                        . showChar ' '
                                        . showsPrec 6 e2
          showsPrec _ (RESet set)       = showList set
 
 
isSymbol x = x `elem` "|? "

-----------------------------------------------------------------------------
-- * Simplification

-- | Simplify regular expressions according to the algebra of regular expressions.

simplifyRegExp :: Eq sy => RegExp sy -> RegExp sy
simplifyRegExp Empty        = Empty
simplifyRegExp Epsilon      = Epsilon
simplifyRegExp (Literal x)  = Literal x

simplifyRegExp (Star x)     = case x' of                                -- Algebraic Rules:
                               Epsilon      -> Epsilon                  -- @*       = @
                               Empty        -> Epsilon                  -- {}*      = @
                               Or Epsilon a -> Star (simplifyRegExp a)  -- (a | @)* = a*
                               Or a Epsilon -> Star (simplifyRegExp a)  -- (@ | a)* = a*
                               _            -> Star x'
  where x' = simplifyRegExp x 


simplifyRegExp (Then x y)  | x' == Empty    = Empty                     -- {} p = {}
                           | y' == Empty    = Empty                     -- p {} = {}
                           | x' == Epsilon  = y'                        -- @ p  = p
                           | y' == Epsilon  = x'                        -- p @  = p
                           | y' == Star x'  = OneOrMore x'              -- p p* = p+
                           | x' == Star y'  = OneOrMore y'              -- p* p = p+
                           | otherwise      = Then x' y'
  where x' =  simplifyRegExp x
        y' =  simplifyRegExp y


simplifyRegExp a@(Or x y)  
      | x' == y'                       = x'                             -- p  | p  = p
      | x' == Empty                    = y'                             -- {} | p  = p 
      | y' == Empty                    = x'                             -- p  | {} = p
--    | x == (Star a) && y == Epsilon  = simplifyRegExp x       
      | otherwise                      = f x' y'                        -- Or x' y'
  where x' = simplifyRegExp x 
        y' = simplifyRegExp y

        f Epsilon (OneOrMore p) = Star p                                -- (@ | p+) = p*
        f Epsilon re            = Optional re                           -- p | @    = p? 
        f (OneOrMore p) Epsilon = Star p                                -- (p+ | @) = p*
        f re Epsilon            = Optional re                           -- @ | p    = p? 
        f re1 re2               = Or re1 re2


simplifyRegExp (OneOrMore x) = case x' of
                                Empty        -> Empty                   -- {}+      = {}
                                Epsilon      -> Epsilon                 -- @+       = @
                                Or p Epsilon -> Star p                  -- (p | @)+ = p*
                                Or Epsilon p -> Star p                  -- (@ | p)+ = p*
                                _            -> OneOrMore x'
  where  x' = simplifyRegExp x

simplifyRegExp (Optional x) = Optional (simplifyRegExp x)

simplifyRegExp (RESet set) = RESet set



-----------------------------------------------------------------------------
-- * Normalization

-- | Rewrite extended regular expressions to
--   plain regular expression. This means that the 'OneOrMore' 
--   'Optional' and 'RESet' constructors are normalized away.

extREtoRE :: RegExp sy -> RegExp sy
extREtoRE  = cataRegExp ( Empty
                        , Epsilon 
                        , \ l r -> Or l r
                        , \ er  -> Star er
                        , \ a   -> Literal a 
                        , \ l r -> Then l r 
                        , \ er  -> Then er (Star er)
                        , \ er  -> Or Epsilon er
                        , \ set -> foldr1 Or (map Literal set)
                        ) 

-----------------------------------------------------------------------------