{-# OPTIONS_GHC -fno-warn-missing-signatures #-}
-- | 'ReadRegex' parses string regular expressions with all the
-- extensions : lazy and possessive modifiers and back-references.
--
-- "man re_format" on OS X 10.4.4 was the guide for most of the format
-- (the new extended format, not the old basic format).  Currently,
-- only word-boundaries are not recognized and handled.
-- 
module Text.Regex.Parsec.ReadRegex
  (PatternIndex
  ,parseRegex
  ,legalCharacterClasses
  ,decodePatternSet
  ,p_regex) where

import Text.Regex.Parsec.Common(StringRegex)
import Text.Regex.Parsec.Pattern(
  Pattern(PStar, PQuest, PPossessive, PPlus, POr, PLazy, PGroup,
    PEscape, PEmpty, PDot, PDollar, PConcat, PChar, PCarat, PBound,
    PBack, PAnyNot, PAny),
  PatternIndex, PatternSetEquivalenceClass(..),
  PatternSetCollatingElement(PatternSetCollatingElement),
  PatternSetCharacterClass(PatternSetCharacterClass), PatternSet(..))
import Text.ParserCombinators.Parsec((<|>), (<?>), updateState,
  unexpected, try, runParser, many, getState, GenParser, ParseError,
  sepBy1, option, notFollowedBy, many1, lookAhead, eof, between,
  string, noneOf, digit, char, anyChar)
import Control.Monad(liftM, when, guard)
import Data.Set(Set)
import qualified Data.Set as Set(fromList, toList, insert, empty, Set)

-- BracketElement is internal only
data BracketElement = BEChar Char | BEChars String | BEColl String | BEEquiv String | BEClass String

-- | This runs the p_regex parser on the input string (with initial
-- state 0), and returns (Left ParseError) on failure or Right
-- (Pattern,PatternIndex) where the PatternIndex is the number of
-- parenthesized subgroups in the Pattern (i.e. the final state of the
-- parser).  If the entire input is not consumed then 'parseRegex'
-- returns a parse error.  If you need to parse a regular expression
-- that is not the entire input string then you will need to utilize
-- p_regex.
parseRegex :: StringRegex -> Either ParseError (Pattern,PatternIndex)
parseRegex x = runParser (do pat <- p_regex
                             eof
                             subs <- getState
                             return (pat,subs)) 0 x x

-- | This is the Parsec parser that performs the interpretation. The
-- state is a PatternIndex which is incremented and read (in that
-- order) to assign a unique 'PatternIndex' to each parenthesized
-- group.  It does not check that the regex is followed by the end of
-- the input.
p_regex :: GenParser Char PatternIndex Pattern
p_regex = liftM POr $ sepBy1 p_branch (char '|')

-- man re_format helps alot, it says one-or-more pieces so this uses
-- many1 instead of many.  Use "()" to indicate an empty piece.
p_branch = liftM PConcat $ many1 p_piece

p_piece = (p_anchor <|> p_atom) >>= p_post_atom

p_anchor = (char '^' >> return PCarat)
       <|> (char '$' >> return PDollar)
       <?> "empty () or anchor ^ or $"

p_atom =  p_empty <|> p_group <|> p_bracket <|> p_char <?> "an atom"

p_empty = try (do string "()" 
                  index <- updateState (+1) >> getState
                  return $ PGroup index PEmpty)

p_group = lookAhead (char '(') >> do
  index <- updateState (+1) >> getState
  liftM (PGroup index) $ between (char '(') (char ')') p_regex

-- p_post_atom takes the previous atom as a parameter
p_post_atom atom = ( ( (char '?' >> return (PQuest atom))
                   <|> (char '+' >> return (PPlus  atom))
                   <|> (char '*' >> return (PStar  atom))
                   <|> p_bound atom) >>= p_post_atom_ng) <|> (return atom)

p_bound atom = try $ between (char '{') (char '}') (p_bound_spec atom)

p_bound_spec atom = do lowS <- many1 digit
                       let lowI = read lowS
                       highMI <- option (Just lowI) $ try $ do 
                                   char ','
                                   highS <- many digit
                                   if null highS then return Nothing -- no upper bound
                                     else do let highI = read highS
                                             guard (lowI <= highI)
                                             return (Just (read highS))
                       return (PBound lowI highMI atom)

-- Look for another '?' to make into a Lazy matcher (non-greedy)
p_post_atom_ng atom = ((char '?' <?> "lazy ? modifier") >> return (PLazy atom))
                  <|> ((char '+' <?> "possessive + modifier") >> return (PPossessive atom))
                  <|> return atom

p_char = p_dot <|> p_left_brace <|> p_escaped <|> p_other_char
    where p_dot = char '.' >> return PDot
          p_left_brace = try $ (char '{' >> notFollowedBy digit >> return (PChar '{'))
          p_escaped = char '\\' >> (liftM (PBack . read) (many1 digit) <|> liftM PEscape anyChar)
          p_other_char = liftM PChar (noneOf specials)
          specials  = "^.[$()|*+?{\\"

-- parse [bar] and [^bar] sets of characters
p_bracket = (char '[') >> ( (char '^' >> p_set True) <|> (p_set False) )

-- p_set does not support [.ch.] or [=y=] or [:foo:]
p_set :: Bool -> GenParser Char st Pattern
p_set invert = do initial <- (option "" ((char ']' >> return "]") <|> (char '-' >> return "-")))
                  values <- many1 p_set_elem
                  char ']'
                  let chars = maybe'set $ initial ++ [c | BEChar c <- values ] ++ concat [s | BEChars s <- values ] 
                      colls = maybe'set [PatternSetCollatingElement coll | BEColl coll <- values ]
                      equivs = maybe'set [PatternSetEquivalenceClass equiv | BEEquiv equiv <- values]
                      class's = maybe'set [PatternSetCharacterClass a'class | BEClass a'class <- values]
                      maybe'set x = if null x then Nothing else Just (Set.fromList x)
                      sets = PatternSet chars class's colls equivs
                  sets `seq` return $ if invert then PAnyNot sets else PAny sets

p_set_elem = p_set_elem_class <|> p_set_elem_equiv <|> p_set_elem_coll
         <|> p_set_elem_range <|> p_set_elem_char <?> "Failed to parse bracketed string"

p_set_elem_class = liftM BEClass $
  try (between (string "[:") (string ":]") (many1 $ noneOf ":]"))

p_set_elem_equiv = liftM BEEquiv $
  try (between (string "[=") (string "=]") (many1 $ noneOf "=]"))

p_set_elem_coll =  liftM BEColl $
  try (between (string "[.") (string ".]") (many1 $ noneOf ".]"))

p_set_elem_range = try $ do 
  start <- noneOf "]-"
  char '-'
  end <- noneOf "]"
  return (BEChars [start..end])

p_set_elem_char = do 
  c <- noneOf "]"
  when (c == '-') $ do
    atEnd <- (lookAhead (char ']') >> return True) <|> (return False)
    when (not atEnd) (unexpected "A dash is in the wrong place in a bracket")
  return (BEChar c)

-- | This takes a 'PatternSet' and returns 'Set' 'Char'.  The
-- character classes in legalCharacterClasses are expanded to their
-- members.
--
-- 'decodePatternSet' ignores collating element and treats equivalence
-- classes as just their defining character(s).
decodePatternSet :: PatternSet -> Set Char
decodePatternSet (PatternSet msc mscc _ msec) =
  let baseMSC = maybe Set.empty id msc
      withMSCC = foldl (flip Set.insert) baseMSC  (maybe [] (concatMap decodeCharacterClass . Set.toList) mscc)
      withMSEC = foldl (flip Set.insert) withMSCC (maybe [] (concatMap unSEC . Set.toList) msec)
  in withMSEC

-- | This is the list of [:foo:] character classes that are recognized
-- by decodePatternSet.  This is the whole list including the
-- "word" extension which is "alnum" and the underscore '_' character.
--
-- The NUL character is included as part of the "cntrl" character class.
legalCharacterClasses :: [String]
legalCharacterClasses = ["alnum","digit","punct","alpha","graph"
  ,"space","blank","lower","upper","cntrl","print","xdigit","word"]

decodeCharacterClass :: PatternSetCharacterClass -> String
decodeCharacterClass (PatternSetCharacterClass s) =
  case s of
    "alnum" -> ['0'..'9']++['a'..'z']++['A'..'Z']
    "digit" -> ['0'..'9']
    "punct" -> ['\33'..'\47']++['\58'..'\64']++['\91'..'\95']++"\96"++['\123'..'\126']
    "alpha" -> ['a'..'z']++['A'..'Z']
    "graph" -> ['\41'..'\126']
    "space" -> "\t\n\v\f\r "
    "blank" -> "\t "
    "lower" -> ['a'..'z']
    "upper" -> ['A'..'Z']
    "cntrl" -> ['\0'..'\31']++"\127" -- with NUL
    "print" -> ['\32'..'\126']
    "xdigit" -> ['0'..'9']++['a'..'f']++['A'..'F']
    "word" -> ['0'..'9']++['a'..'z']++['A'..'Z']++"_"
    _ -> []