{-# LANGUAGE FlexibleContexts #-}
-- |
-- Module: RegExp
-- Description: Regular expression library
-- Copyright: (c) 2011 National Institute of Aerospace / Galois, Inc.
--
-- A regular expression library.
--
-- For an example, see
-- <https://github.com/Copilot-Language/examplesForACSL/blob/master/example15/main.hs>

module Copilot.Library.RegExp ( copilotRegexp, copilotRegexpB ) where

import Text.ParserCombinators.Parsec
  ( optional, (<|>), string, char, between, GenParser, many, choice, CharParser
  , optionMaybe, chainr1, chainr, many1, digit, letter, eof, parse
  , SourceName )
import Data.Int
import Data.Word
import Data.List
import Data.Char
import Data.Maybe

import Control.Monad.State ( evalState, get, modify )

import qualified Copilot.Language as C

-- | The symbols in a regular expression.
--
-- 'Any' is any value of type @t@ (matches any symbol, the "point" character in
-- a regular expression).
data Sym t = Any | Sym t
             deriving ( Eq, Ord, Show )

-- | A symbol's value can occur multiple times in a regular expression, e.g.
-- "t(tfft)*". A running number "symbolNum" is used to make all symbols in a
-- regular expression unique.
data NumSym t = NumSym { symbolNum :: Maybe NumT
                       , symbol    :: Sym t
                       } deriving ( Eq, Show )
type NumT     = Int

-- | The regular expression data type.
--
-- For our use, regular expressions describing a language with no words are not
-- supported since empty languages would not match anything and just yield a
-- Copilot stream of constant false values.
data RegExp t = REpsilon
              | RSymbol  ( NumSym t )
              | ROr      ( RegExp t ) ( RegExp t )
              | RConcat  ( RegExp t ) ( RegExp t )
              | RStar    ( RegExp t )
                deriving Show

-- | Parsers for single characters.
lquote, rquote, lparen, rparen,
  star, plus, qmark, point, minus,
  nondigit :: CharParser () Char
lquote = char '<'
rquote = char '>'
lparen = char '('
rparen = char ')'
star   = char '*'
plus   = char '+'
qmark  = char '?'
point  = char '.'
minus  = char '-'

nondigit = char '_' <|> letter

-- | A "followedBy" combinator for parsing, parses p, then p' and returns the
-- result of p.
followedBy :: GenParser tok () a
           -> GenParser tok () b
           -> GenParser tok () a
followedBy p p' = p >>= \ r -> p' >> return r

-- | Parsing a string p' with prefix p, returning both in order.
cPrefix, optCPrefix :: GenParser tok () Char
                    -> GenParser tok () String
                    -> GenParser tok () String
cPrefix p p' = p  >>= \ c -> fmap ( c : ) p'

-- | Parsing a string @p'@ with the character @p@ as an optional prefix, return
-- the result with the optional prefix.
optCPrefix p p' = optionMaybe p
                  >>= \ r -> case r of
                               Nothing -> p'
                               Just c  -> fmap ( c : ) p'

-- | "case insensitive". Take one argument of type string, parses for the
-- string in a case insensitive manner and yields the parsed string (preserving
-- its case).
ci :: String -> GenParser Char () String
ci = mapM ( \ c -> ( char . toLower ) c <|> ( char . toUpper ) c )

-- | Parser for regular expressions
regexp  :: ( SymbolParser t ) => GenParser Char () ( RegExp t )
regexp  = chainr1 term opOr

term    :: ( SymbolParser t ) => GenParser Char () ( RegExp t )
term    = chainr factor opConcat REpsilon

factor  :: ( SymbolParser t ) => GenParser Char () ( RegExp t )
factor  = opSuffix factor'

factor' :: ( SymbolParser t ) => GenParser Char () ( RegExp t )
factor' = between lparen rparen regexp
          <|> anySym
          <|> parseSym

-- | Parses the "." - point character used to match any symbol.
anySym  :: ( SymbolParser t ) => GenParser Char () ( RegExp t )
anySym  = point >> ( return . RSymbol ) ( NumSym Nothing Any )

class SymbolParser t where
    parseSym :: GenParser Char () ( RegExp t )

instance SymbolParser Bool where
    parseSym = do { truth <- ( ci "t" >> optional ( ci "rue" )
                               >> return True )
                              <|> ( ci "f" >> optional ( ci "alse" )
                                    >> return False )
                              <|> ( string "1" >> return True )
                              <|> ( string "0" >> return False )
                  ; return $ RSymbol ( NumSym Nothing $ Sym truth )
                  }

parseWordSym :: ( Integral t )
                => GenParser Char () ( RegExp t )
parseWordSym = do { num <- between lquote rquote $ many1 digit
                  ; return . RSymbol . NumSym Nothing . Sym
                    $ fromIntegral ( read num :: Integer )
                  }

parseIntSym :: ( Integral t )
                => GenParser Char () ( RegExp t )
parseIntSym = do { num <- between lquote rquote $
                          optCPrefix minus ( many1 digit )
                 ; return . RSymbol . NumSym Nothing . Sym
                   $ fromIntegral ( read num :: Integer )
                 }

type StreamName = String
newtype P = P { getName :: StreamName }
    deriving Eq

parsePSym :: GenParser Char () ( RegExp P )
parsePSym = do { pStream <- between lquote rquote $
                            cPrefix nondigit ( many $ nondigit <|> digit )
               ; return . RSymbol . NumSym Nothing . Sym
                 $ P pStream
               }

instance SymbolParser Word8 where
    parseSym = parseWordSym

instance SymbolParser Word16 where
    parseSym = parseWordSym

instance SymbolParser Word32 where
    parseSym = parseWordSym

instance SymbolParser Word64 where
    parseSym = parseWordSym

instance SymbolParser Int8 where
    parseSym = parseIntSym

instance SymbolParser Int16 where
    parseSym = parseIntSym

instance SymbolParser Int32 where
    parseSym = parseIntSym

instance SymbolParser Int64 where
    parseSym = parseIntSym

instance SymbolParser P where
    parseSym = parsePSym

opOr       :: GenParser Char () ( RegExp t -> RegExp t -> RegExp t )
opOr       = char '|' >> return ROr

opConcat   :: GenParser Char () ( RegExp t -> RegExp t -> RegExp t )
opConcat   = return RConcat

opSuffix   :: GenParser Char () ( RegExp t )
           -> GenParser Char () ( RegExp t )
opSuffix r = do
  subexp   <- r
  suffixes <- many $ choice [ star, plus, qmark ]
  let transform rexp suffix =
          case suffix of
            '*'   -> RStar   rexp
            '+'   -> RConcat rexp ( RStar rexp )
            '?'   -> ROr     rexp   REpsilon
            other -> C.badUsage ("in Regular Expression library: " ++
                               "unhandled operator: " ++ show other)
  return $ foldl transform subexp suffixes

parser :: ( SymbolParser t )
         => GenParser Char () ( RegExp t )
parser = regexp `followedBy` eof

hasEpsilon                    :: RegExp t -> Bool
hasEpsilon   REpsilon         = True
hasEpsilon ( RSymbol  _     ) = False
hasEpsilon ( ROr      r1 r2 ) = hasEpsilon r1 || hasEpsilon r2
hasEpsilon ( RConcat  r1 r2 ) = hasEpsilon r1 && hasEpsilon r2
hasEpsilon ( RStar    _     ) = True

first                    :: RegExp t -> [ NumSym t ]
first   REpsilon         = []
first ( RSymbol  s     ) = [ s ]
first ( ROr      r1 r2 ) = first r1 ++ first r2
first ( RConcat  r1 r2 ) = first r1 ++ if hasEpsilon r1 then
                                           first r2 else []
first ( RStar    r     ) = first r

reverse'                   :: RegExp t -> RegExp t
reverse' ( ROr     r1 r2 ) = ROr     ( reverse' r1 ) ( reverse' r2 )
reverse' ( RConcat r1 r2 ) = RConcat ( reverse' r2 ) ( reverse' r1 )
reverse' ( RStar   r     ) = RStar   ( reverse' r  )
reverse'   e               = e

last' :: RegExp t -> [ NumSym t ]
last' = first . reverse'

follow                        :: ( Eq t ) =>
                                 RegExp t -> NumSym t -> [ NumSym t ]
follow   REpsilon         _   = []
follow ( RSymbol  _     ) _   = []
follow ( ROr      r1 r2 ) sNr = follow r1 sNr ++ follow r2 sNr
follow ( RConcat  r1 r2 ) sNr = follow r1 sNr ++ follow r2 sNr
                                ++ if sNr `elem` last' r1 then
                                       first r2 else []
follow ( RStar    r     ) sNr = follow r sNr
                                `union` if sNr `elem` last' r then
                                            first r else []

preceding :: ( Eq t ) => RegExp t -> NumSym t -> [ NumSym t ]
preceding = follow . reverse'

hasFinitePath                   :: RegExp t -> Bool
hasFinitePath ( ROr     r1 r2 ) = hasFinitePath r1 || hasFinitePath r2
hasFinitePath ( RConcat _  r2 ) = hasFinitePath r2
hasFinitePath ( RStar   _     ) = False
hasFinitePath   _               = True

getSymbols                   :: RegExp t -> [ NumSym t ]
getSymbols ( RSymbol s     ) = [ s ]
getSymbols ( ROr     r1 r2 ) = getSymbols r1 ++ getSymbols r2
getSymbols ( RConcat r1 r2 ) = getSymbols r1 ++ getSymbols r2
getSymbols ( RStar   r     ) = getSymbols r
getSymbols   _               = []

-- assign each symbol in the regular expression a
-- unique number, counting up from 0
enumSyms   :: RegExp t -> RegExp t
enumSyms rexp = evalState ( enumSyms' rexp ) 0
    where
      enumSyms' ( RSymbol s     ) = do
        num <- get
        modify ( + 1 )
        return $ RSymbol s { symbolNum = Just num }
      enumSyms' ( ROr     r1 r2 ) = do
        r1' <- enumSyms' r1
        r2' <- enumSyms' r2
        return $ ROr r1' r2'
      enumSyms' ( RConcat r1 r2 ) = do
        r1' <- enumSyms' r1
        r2' <- enumSyms' r2
        return $ RConcat r1' r2'
      enumSyms' ( RStar   r     ) = do
        r'  <- enumSyms' r
        return $ RStar   r'
      enumSyms'   other           =
        return other

regexp2CopilotNFA :: ( C.Typed t, Eq t )
                     => C.Stream t -> RegExp t -> C.Stream Bool -> C.Stream Bool
regexp2CopilotNFA inStream rexp reset =
    let symbols                    = getSymbols rexp
        first'                     = first rexp
        start                      = [ True ] C.++ C.false

        preceding'   numSym        = let ps    = preceding rexp numSym
                                         s     = if numSym `elem` first' then
                                                   [ start ] else []
                                     in s ++ [ streams !! i
                                             | i <- map ( fromJust . symbolNum ) ps ]

        matchesInput numSym        = case symbol numSym of
                                       Any   -> C.true
                                       Sym t -> inStream C.== C.constant t

        transitions  numSym ps     = matchesInput numSym
                                     C.&& ( foldl ( C.|| ) C.false ps )

        stream       numSym        = let ps    = preceding' numSym
                                         init_ = C.constant $ numSym `elem` first'
                                     in C.mux reset
                                        ( [ False ] C.++ matchesInput numSym C.&& init_ )
                                        ( [ False ] C.++ transitions  numSym ps )

        streams                    = map stream symbols

        outStream                  = foldl ( C.|| ) start streams

    in outStream

-- | Regular expression matching over an arbitrary stream
copilotRegexp :: ( C.Typed t, SymbolParser t, Eq t )
              => C.Stream t     -- ^ The stream to monitor.
              -> SourceName     -- ^ The regular expression.
              -> C.Stream Bool  -- ^ A stream indicating when to reset the
                                --   monitor.
              -> C.Stream Bool
copilotRegexp inStream rexp reset =
  case parse parser rexp rexp of
    Left  err -> C.badUsage ("parsing regular exp: " ++ show err)
    Right rexp' -> let nrexp = enumSyms rexp' in
        if hasFinitePath nrexp then
            C.badUsage $
            concat [ "The regular expression contains a finite path "
                   , "which is something that will fail to match "
                   , "since we do not have a distinct end-of-input "
                   , "symbol on infinite streams." ]
        else if hasEpsilon nrexp then
                 C.badUsage $
                 concat [ "The regular expression matches a language "
                        , "that contains epsilon. This cannot be handled "
                        , "on infinite streams, since we do not have "
                        , "a distinct end-of-input symbol." ]
             else regexp2CopilotNFA inStream nrexp reset

regexp2CopilotNFAB :: RegExp P -> [ ( StreamName, C.Stream Bool ) ]
                      -> C.Stream Bool -> C.Stream Bool
regexp2CopilotNFAB rexp propositions reset =
    let symbols                    = getSymbols rexp
        first'                     = first rexp
        start                      = [ True ] C.++ C.false

        preceding'   numSym        = let ps    = preceding rexp numSym
                                         s     = if numSym `elem` first' then
                                                   [ start ] else []
                                     in s ++ [ streams !! i
                                             | i <- map ( fromJust . symbolNum ) ps ]

        lookup' a l = case lookup a l of
                        Nothing -> C.badUsage ("boolean stream "
                                             ++ a
                                             ++ " is not defined")
                        Just s  -> s

        matchesInput numSym        = case symbol numSym of
                                       Any   -> C.true
                                       Sym t -> lookup' ( getName t ) propositions

        transitions  numSym ps     = matchesInput numSym
                                     C.&& ( foldl ( C.|| ) C.false ps )

        stream       numSym        = let ps    = preceding' numSym
                                         init_ = C.constant $ numSym `elem` first'
                                     in C.mux reset
                                        ( [ False ] C.++ matchesInput numSym C.&& init_ )
                                        ( [ False ] C.++ transitions  numSym ps )

        streams                    = map stream symbols

        outStream                  = foldl ( C.|| ) start streams

    in outStream

-- | Regular expression matching over a collection of boolean streams.
--
-- Regular expressions can contain symbols, which are expanded to match
-- specific streams.
--
-- For example, the regular expression:
--
-- @
-- "\<s0\>(\<s1\>)+"
-- @
--
-- would match if you provide a map (association list) that assigns, to the
-- symbol @"s0"@, a stream that is true at the first sample, and to @"s1"@, a
-- stream that is true at every sample after the first sample.
copilotRegexpB :: SourceName                         -- ^ Regular expression
               -> [ ( StreamName, C.Stream Bool ) ]  -- ^ A table with the
                                                     -- stream associated to
                                                     -- each symbol.
               -> C.Stream Bool                      -- ^ A stream indicating
                                                     -- when to reset the
                                                     -- monitor.
               -> C.Stream Bool
copilotRegexpB rexp propositions reset =
  case parse parser rexp rexp of
    Left  err -> C.badUsage ("parsing regular exp: " ++ show err)
    Right rexp' -> let nrexp = enumSyms rexp' in
        if hasFinitePath nrexp then
            C.badUsage $
            concat [ "The regular expression contains a finite path "
                   , "which is something that will fail to match "
                   , "since we do not have a distinct end-of-input "
                   , "symbol on infinite streams." ]
        else if hasEpsilon nrexp then
                 C.badUsage $
                 concat [ "The regular expression matches a language "
                        , "that contains epsilon. This cannot be handled "
                        , "on infinite streams, since we do not have "
                        , "a distinct end-of-input symbol." ]
             else regexp2CopilotNFAB nrexp propositions reset