-- ------------------------------------------------------------

{- |
   Module     : Text.XML.HXT.DTDValidation.RE
   Copyright  : Copyright (C) 2008 Uwe Schmidt
   License    : MIT

   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
   Stability  : experimental
   Portability: portable

   A module for regular expression matching based on derivatives of regular expressions.

   The code was taken from Joe English (<http://www.flightlab.com/~joe/sgml/validate.html>).
   Tested and extended by Martin Schmidt.

   Further references for the algorithm:

   Janusz A. Brzozowski.

        Derivatives of Regular Expressions. Journal of the ACM, Volume 11, Issue 4, 1964.

   Mark Hopkins.

        Regular Expression Package. Posted to comp.compilers, 1994.
        Available per FTP at <ftp://iecc.com/pub/file/regex.tar.gz>.
-}

-- ------------------------------------------------------------

module Text.XML.HXT.DTDValidation.RE
    ( RE(..)

    , re_unit
    , re_zero
    , re_sym
    , re_rep
    , re_plus
    , re_opt
    , re_seq
    , re_alt
    , re_dot

    , checkRE
    , matches
    , nullable
    , printRE
  )
where

import           Data.List (foldl')

-- |
-- Data type for regular expressions.

data RE a =
        RE_ZERO String          --' L(0)   = {} (empty set)
        | RE_UNIT               --' L(1)   = { [] } (empty sequence)
        | RE_SYM a              --' L(x)   = { [x] }
        | RE_DOT                --' accept any single symbol
        | RE_REP (RE a)         --' L(e*)  = { [] } `union` L(e+)
        | RE_PLUS (RE a)        --' L(e+)  = { x ++ y | x <- L(e), y <- L(e*) }
        | RE_OPT (RE a)         --' L(e?)  = L(e) `union` { [] }
        | RE_SEQ (RE a) (RE a)  --' L(e,f) = { x ++ y | x <- L(e), y <- L(f) }
        | RE_ALT (RE a) (RE a)  --' L(e|f) = L(e) `union` L(f)
        deriving (Show, Eq, Ord)


-- ------------------------------------------------------------
-- Constructor functions to simplify regular expressions when constructing them.

-- |
-- Constructs a regular expression for an empty set.
--
--    * 1.parameter errMsg :  error message
--
--    - returns : regular expression for an empty set

re_zero                 :: String -> RE a
re_zero m               = RE_ZERO m


-- |
-- Constructs a regular expression for an empty sequence.
--
--    - returns : regular expression for an empty sequence

re_unit                 :: RE a
re_unit                 = RE_UNIT


-- |
-- Constructs a regular expression for accepting a symbol
--
--    * 1.parameter sym :  the symbol to be accepted
--
--    - returns : regular expression for accepting a symbol

re_sym                  :: a -> RE a
re_sym x                = RE_SYM x


-- |
-- Constructs a regular expression for accepting any singel symbol
--
--    - returns : regular expression for accepting any singel symbol

re_dot                  :: RE a
re_dot                  = RE_DOT


-- |
-- Constructs an optional repetition (*) of a regular expression
--
--    * 1.parameter re_a :  regular expression to be repeted
--
--    - returns : new regular expression

re_rep                  :: RE a -> RE a
re_rep RE_UNIT          = RE_UNIT
re_rep (RE_ZERO _)      = RE_UNIT
re_rep e@(RE_REP _)     = RE_REP (rem_rep e)            -- remove nested reps
re_rep e@(RE_ALT _ _)   = RE_REP (rem_rep e)            -- remove nested reps in alternatives
re_rep e                = RE_REP e

-- |
-- remove redundant nested *'s in RE
-- theoretically this is unneccessary,
-- but without this simplification the runtime can increase exponentally
-- when computing deltas, e.g. for a** or (a|b*)* which is the same as (a|b)*

rem_rep                     :: RE a -> RE a
rem_rep (RE_ALT RE_UNIT e2) = e2
rem_rep (RE_ALT e1 e2)      = RE_ALT (rem_rep e1) (rem_rep e2)
rem_rep (RE_REP e1)         = rem_rep e1
rem_rep e1                  = e1


-- |
-- Constructs a repetition (+) of a regular expression
--
--    * 1.parameter re_a :  regular expression to be repeted
--
--    - returns : new regular expression

re_plus                 :: RE a -> RE a
re_plus RE_UNIT         = RE_UNIT
re_plus (RE_ZERO m)     = RE_ZERO m
re_plus e
    | nullable e        = re_rep e            -- nullable e => e+ == e*
    | otherwise         = re_seq e (re_rep e)


-- |
-- Constructs an option (?) of a regular expression
--
--    * 1.parameter re_a :  regular expression to be optional
--
--    - returns : new regular expression

re_opt                  :: (Ord a) => RE a -> RE a
re_opt RE_UNIT          = RE_UNIT
re_opt (RE_ZERO _)      = RE_UNIT
re_opt e                = re_alt RE_UNIT e

-- |
-- Constructs a sequence (,) of two regular expressions
--
--    * 1.parameter re_a :  first regular expression in sequence
--
--    - 2.parameter re_b :  second regular expression in sequence
--
--    - returns : new regular expression

re_seq                                 :: RE a -> RE a -> RE a
re_seq e1@(RE_ZERO _)   _              = e1                         -- simplification
re_seq RE_UNIT          e2             = e2                         -- simplification
re_seq _                e2@(RE_ZERO _) = e2                         -- simplification
re_seq e1               RE_UNIT        = e1                         -- simplification
re_seq (RE_SEQ e11 e12) e2             = re_seq e11 (re_seq e12 e2) -- right assoc.
re_seq e1               e2             = RE_SEQ e1 e2


-- |
-- Constructs an alternative (|) of two regular expressions
--
--    * 1.parameter re_a :  first regular expression of alternative
--
--    - 2.parameter re_b :  second regular expression of alternative
--
--    - returns : new regular expression

re_alt                                      :: (Ord a) => RE a -> RE a -> RE a
re_alt (RE_ZERO _)      e2                  = e2
re_alt e1               (RE_ZERO _)         = e1
re_alt (RE_ALT e11 e12) e2                  = re_alt e11 (re_alt e12 e2)  -- is right assoc
re_alt e1               e2@(RE_ALT e21 e22)
    | e1 == e21                             = e2             -- duplicates removed, the effective rule
    | e1 >  e21                             = re_alt e21 (re_alt e1 e22)  -- sort alternatives
    | otherwise                             = RE_ALT e1 e2
re_alt e1               e2
    | e1 == e2                              = e2             -- simplification, the effective rule
    | e1 >  e2                              = re_alt e2 e1   -- sort alts for unique repr.
    | otherwise                             = RE_ALT e1 e2


-- ------------------------------------------------------------


-- |
-- Checks if a regular expression matches the empty sequence.
--
-- nullable e == [] `in` L(e)
--
-- This check indicates if a regular expression fits to a sentence or not.
--
--    * 1.parameter re :  regular expression to be checked
--
--    - returns : true if regular expression matches the empty sequence,
--                otherwise false

nullable                ::  RE a -> Bool
nullable (RE_ZERO _)    = False
nullable RE_UNIT        = True
nullable (RE_SYM _)     = False
nullable (RE_REP _)     = True
nullable (RE_PLUS e)    = nullable e
nullable (RE_OPT _)     = True
nullable (RE_SEQ e f)   = nullable e && nullable f
nullable (RE_ALT e f)   = nullable e || nullable f
nullable RE_DOT         = False


-- |
-- Derives a regular expression with respect to one symbol.
--
-- L(delta e x) = x \ L(e)
--
--    * 1.parameter re :  regular expression to be derived
--
--    - 2.parameter sym :  the symbol on which the regular expression is applied
--
--    - returns : the derived regular expression

delta :: (Ord a, Show a) => RE a -> a -> RE a
delta re x = case re of
        RE_ZERO _               -> re                                   -- re_zero m
        RE_UNIT                 -> re_zero ("Symbol " ++ show x ++ " unexpected.")
        RE_SYM sym
                | x == sym      -> re_unit
                | otherwise     -> re_zero ("Symbol " ++ show sym ++ " expected, but symbol " ++ show x ++ " found.")
        RE_REP  e               -> re_seq (delta e x) re                -- (re_rep e)
        RE_PLUS e               -> re_seq (delta e x) (re_rep e)
        RE_OPT  e               -> delta e x
        RE_SEQ  e f
                | nullable e    -> re_alt (re_seq (delta e x) f) (delta f x)
                | otherwise     -> re_seq (delta e x) f
        RE_ALT  e f             -> re_alt (delta e x) (delta f x)
        RE_DOT                  -> re_unit


-- |
-- Derives a regular expression with respect to a sentence.
--
--    * 1.parameter re :  regular expression
--
--    - 2.parameter s :  sentence to which the regular expression is applied
--
--    - returns : the derived regular expression

matches :: (Ord a, Show a) => RE a -> [a] -> RE a
matches e = foldl' delta e


-- |
-- Checks if an input matched a regular expression. The function should be
-- called after matches.
--
-- Was the sentence used in @matches@ in the language of the regular expression?
-- -> matches e s == s `in` L(e)?
--
--    * 1.parameter re :  the derived regular expression
--
--    - returns : empty String if input matched the regular expression, otherwise
--               an error message is returned

checkRE :: (Eq a, Show a) => RE a -> String
checkRE (RE_UNIT)       = ""
checkRE (RE_ZERO m)     = m
checkRE re
        | nullable re   = ""
        | otherwise     = "Input must match " ++ printRE re


-- ------------------------------------------------------------


-- |
-- Constructs a string representation of a regular expression.
--
--    * 1.parameter re :  a regular expression
--
--    - returns : the string representation of the regular expression

printRE :: (Eq a, Show a) => RE a -> String
printRE re'
    = "( " ++ printRE1 re' ++ " )"
      where
      -- printRE1 :: (Eq a, Show a) => RE a -> String
      printRE1 re = case re of
          RE_ZERO m                             -> "ERROR: " ++ m
          RE_UNIT                               -> ""
          RE_SYM sym                            -> show sym
          RE_DOT                                -> "."
          RE_REP e
              | isSingle e                      -> printRE1 e ++ "*"
              | otherwise                       -> "(" ++ printRE1 e ++ ")*"
          RE_PLUS e
              | isSingle e                      -> printRE1 e ++ "+"
              | otherwise                       -> "(" ++ printRE1 e ++ ")+"
          RE_OPT e
              | isSingle e                      -> printRE1 e ++ "?"
              | otherwise                       -> "(" ++ printRE1 e ++ ")?"
          RE_SEQ e1 (RE_REP e2)
              | e1 == e2                        -> printRE1 (RE_PLUS e1)
          RE_SEQ e1 (RE_SEQ (RE_REP e2) e3)
              | e1 == e2                        -> printRE1 (RE_SEQ (RE_PLUS e1) e3)
          RE_SEQ e f
              | isAlt e  && not (isAlt f)       -> "(" ++ printRE1 e ++ ") , " ++ printRE1 f
              | not (isAlt e) && isAlt f        -> printRE1 e ++ " , (" ++ printRE1 f ++ ")"
              | isAlt e  && isAlt f             -> "(" ++ printRE1 e ++ ") , (" ++ printRE1 f ++ ")"
              | otherwise                       -> printRE1 e ++ " , " ++ printRE1 f
          RE_ALT RE_UNIT f                      -> printRE1 (RE_OPT f)
          RE_ALT e f
              | isSeq e  && not (isSeq f)       -> "(" ++ printRE1 e ++ ") | " ++ printRE1 f
              | not (isSeq e) && isSeq f        -> printRE1 e ++ " | (" ++ printRE1 f ++ ")"
              | isSeq e  && isSeq f             -> "(" ++ printRE1 e ++ ") | (" ++ printRE1 f ++ ")"
              | otherwise                       -> printRE1 e ++ " | " ++ printRE1 f


      isSingle :: RE a -> Bool
      isSingle (RE_ZERO _)    = True
      isSingle RE_UNIT        = True
      isSingle (RE_SYM _)     = True
      isSingle _              = False


      isSeq :: (Eq a) => RE a -> Bool
      isSeq (RE_SEQ e1 (RE_REP e2))
          | e1 == e2          = False  -- is transformed back into RE_PLUS
      isSeq (RE_SEQ _ _)      = True
      isSeq _                 = False


      isAlt :: RE a -> Bool
      isAlt (RE_ALT RE_UNIT _)= False  -- is transformed back into a RE_OPT
      isAlt (RE_ALT _ _)      = True
      isAlt _                 = False