-----------------------------------------------------------------------------
-- |
-- Module      :  Language.HaLex.Dfa
-- Copyright   :  (c) João Saraiva 2001,2002,2003,2004,2005, 2016
-- License     :  LGPL
--
-- Maintainer  :  jas@di.uminho.pt
-- Stability   :  provisional
-- Portability :  portable
--
-- Parsing (through parsing combinators) concrete regular Expressions
--                (in the Unix like notation)
--
-- Code Included in the Lecture Notes on
--      Language Processing (with a functional flavour).
--
-----------------------------------------------------------------------------


module Language.HaLex.RegExpParser ( parseRegExp
                                   ) where

import Prelude hiding ((<$>), (<*>))
import Data.Char
import Language.HaLex.Parser
import Language.HaLex.RegExp


-- | Parser for regular expressions

parseRegExp :: [Char]                -- ^ Input symbols
            -> Maybe (RegExp Char)   -- ^ Regular expression
parseRegExp re = res
  where parsed_re = expr re
        res | parsed_re == [] = Nothing
            | otherwise       = Just (fst (head parsed_re))

expr :: Parser Char (RegExp Char)
expr =  f  <$> termThen <*> symbol '|' <*> expr
    <|> id <$> termThen
    <|> succeed Epsilon
  where f l _ r = Or l r

termThen :: Parser Char (RegExp Char)
termThen =  f  <$> term <*> termThen
        <|> id <$> term
  where f l r   = Then l r

term :: Parser Char (RegExp Char)
term =  f  <$> factor <*> symbol '?'
    <|> g  <$> factor <*> symbol '*'
    <|> h  <$> factor <*> symbol '+'
    <|> id <$> factor
  where
     f e _ = Or   e Epsilon
     g e _ = Star e
     h e _ = Then e (Star e)

factor :: Parser Char (RegExp  Char)
factor =  f <$> letterOrDigit
      <|> g <$> symbol '\'' <*> satisfy (\ x -> True) <*> symbol '\''
      <|> h <$> symbol '(' <*> expr <*> symbol ')'
      <|> k <$> symbol '[' <*> (oneOrMore range) <*> symbol ']'
      <|> l <$> symbol '[' <*> symbol '^' <*> range <*> symbol ']'
  where
     f a         = Literal a
     g _ e _     = Literal e
     h _ e _     = e
     k _ l _     = RESet (concat l)
     l _ _ l _   = RESet [ x | x <- ascii
                         , not (x `elem` l)
			 ] 

range :: Parser Char [Char]
range  =  f   <$> letterOrDigit <*> symbol '-' <*> letterOrDigit
      <|> id  <$> oneOrMore (satisfy (\ x -> x `elem` ascii
                                             && x /= '-' && x /= '^'))
  where f a _ c = [a..c]

      

letterOrDigit :: Parser Char Char
letterOrDigit = satisfy (\x -> isDigit x || isAlpha x)

setRegExp :: Char                -- ^ first elem of the set
          -> Char                -- ^ last elem of the set
          -> RegExp Char         -- ^ Regular Expression for the set
setRegExp a b = foldr1 Or (map Literal [a..b])



-- Ascii characteres (C Language)
ascii =  ['a'..'z']                             -- lower letter
      ++ ['A'..'Z']                             -- capital letters
      ++ [' ','\n','\t']                        -- Tab Or New line Or Space
      ++ "~|#$%^&*)(_+|\\`-={}[]:\";<>?,./"     -- Special Characters



-- Not used yet
spaces :: Parser Char ()
spaces = (\ _ _ -> ()) <$> symbol ' ' <*> spaces
      <|> succeed ()