module Parsing.Lexer (Parsing.Lexer.lexer, mylexer) where

import GLL.Combinators (Token(..),SubsumesToken(..))

import CCO.Component

import Data.Char (isDigit, isAlpha, isUpper, isLower, isAlphaNum, isSpace)
import Text.Regex.Applicative
import Text.Read (readEither)

lexer :: Component String [Token]
lexer = component (return . mylexer)
--lexer = component (((\x -> trace (show x) (return x))) . mylexer)

mylexer = sSpecs

-- Do not forget that states can be created that do not perform
-- longest match by default. Could help to disambiguate atoms for example.
lState :: String -> Bool -> RE Char t -> (t -> String -> [Token]) -> 
                  (t -> Maybe Token) -> String -> [Token]
lState _ _ _ _ _ [] = []
lState stateName discardLayout myTokens mySelector adder s =
    let re | discardLayout = Just <$> myTokens <|> ws
           | otherwise     = Just <$> myTokens
        ws =      (Nothing <$ some (psym isSpace))
              <|> (Nothing <$ string "//" <* many (psym ((/=) '\n')))
              <|> (Nothing <$ string "#" <* many (psym ((/=) '\n')))
    in case findLongestPrefix re s of
        Just (Just tok, rest)   -> (maybe id (:) (adder tok)) (mySelector tok rest)
        Just (Nothing,rest)     -> lState stateName discardLayout myTokens mySelector adder rest
        Nothing                 -> error ("lexical error at " ++ stateName ++ ": " ++ show (take 10 s))


sSpecs = lState "SPECS" True (lCommentStart <|> lTokens) sel Just
  where sel tok = case tok of  Keyword "/*"           -> sComment
                               Keyword "/*HS-IMPORTS" -> sComment
                               Keyword "Section"      -> sSectionNum
                               Keyword "Subsection"   -> sSectionNum
                               _                      -> sSpecs

sSectionNum = lState "SECTION-NUM" True (optional lSectNum) sel id
  where sel tok = sSection 

sSection = lState "SECTION" False lTitleWords (const sSpecs) Just

sComment = lState "COMMENT" False 
              (lHS_import <|> lTick <|> lCommentEnd <|> lCommentPart) sel Just
  where sel tok = case tok of Token "TICK" _  -> sTickIn 
                              Keyword "*/"    -> sSpecs
                              _               -> sComment
        lHS_import = Keyword <$> string "HS-IMPORTS"

sTickIn = lState "TICK-IN" True (lTick <|> lTokens) sel Just
  where sel tok = case tok of Token "TICK" _  -> sTickIn
                              _               -> sSpecInTick

sTickOut = lState "TICK-OUT" False (lTick <|> lCommentPart) sel Just
  where sel tok = case tok of Token "TICK" _  -> sTickOut
                              _               -> sComment

sSpecInTick = lState "SPEC-IN-TICK" True (lTick <|> lTokens) sel Just
  where sel tok = case tok of Token "TICK" _  -> sTickOut
                              _               -> sSpecInTick


lTokens :: SubsumesToken t => RE Char t
lTokens =
        lCharacters
    <|> lKeywords
    <|> charsToInt  <$> optional (sym '-') <*> some (psym isDigit)
    <|> upcast . IDLit . Just <$> lName 
    <|> upcast . AltIDLit . Just <$> lVar 
--    <|> upcast . CharLit . Just <$> lCharLit
    <|> upcast . StringLit . Just <$> lStringLit
    <|> lMore
    where
            charsToInt Nothing n = upcast (IntLit (Just (read n)))
            charsToInt (Just _) n = upcast (IntLit (Just (-(read n))))

            lChar c = upcast (Char c) <$ sym c
            lCharacters = foldr ((<|>) . lChar) empty cbs_characters

            lKeyword k  = upcast (Keyword k) <$ string k
            lKeywords = foldr ((<|>) . lKeyword) empty cbs_keywords

            lMore = foldr ((<|>) . uncurry lToken) empty myTokens 
            lToken t re = upcast . Token t . Just <$> re

            lStringLit = toString <$ sym '\"' <*> many strChar <* sym '\"'
             where strChar =  sym '\\' *> sym '\"'
                              <|> psym ((/=) '\"')
                   toString inner = case readEither ("\"" ++ inner ++ "\"") of
                      Left _  -> inner
                      Right v -> v 

            lCharLit = id <$ sym '\'' <*> charChar <* sym '\''
              where charChar = sym '\\' *> sym '\''
                                <|> psym ((/=) '\'')

cbs_characters = "~():,*+?!|[]{}.@=<>-_&^"

cbs_keywords =  [ "Alias"
                , "Assert"
                , "Auxiliary"
                , "Built-in"
                , "Datatype"    
                , "Entity"                
                , "Funcon"
                , "Hidden"
                , "Language"
                , "Lexis"
                , "Meta-variables"
                , "Otherwise" 
                , "Rule"
                , "SDF"
                , "Semantics"
                , "Syntax"
                , "Type"
                , "Variables"
                , "...", ">:", "<:", "=>", "|->", "==", "~>" 
                , "=/=", "|-", "--", "->", "::=", "[[", "]]"]

lCommentStart = Keyword "/*" <$ sym '/' <* sym '*'
            <|> Keyword "/*HS-IMPORTS" <$ string "/*HS-IMPORTS"
lCommentEnd   = Keyword "*/" <$ sym '*' <* sym '/'

myTokens =  [("ATOM", lAtom), ("FILE", lFile), ("BAR", lBar)
            ,("ORDINAL", lOrdinal), ("DQUOTE", lDQuote) ]

lDQuote = "\\\"" <$ sym '\\' <* sym '\"'

lBar = "----" <$ sym '-' <* sym '-' <* sym '-' <* sym '-' <* many (sym '-')

lName = (:) <$> psym isLower <*> many (psym (\c -> isAlphaNum c || c == '-'))

lCommentPart :: SubsumesToken t => RE Char t
lCommentPart = 
      upcast (Char '*') <$ sym '*'
  <|> upcast (Char '@') <$ sym '@'
  <|> upcast <$ sym '@' <*> lSectNum
  <|> upcast . Token "ORDINARY" . Just <$> lOrdinary
                

lOrdinary = some $ psym (\c -> c /= '*' && c /= '`' && c /= '@')

--TODO atoms are not being lexed correctly
-- for example '\' | '`' should be two atoms separated by a char '|'
-- this is an interesting example of ambiguity
--
-- ambiguity: '\' '\' , one atom '\' ' followed by \', or two atoms '\'
--
-- Lexis
--    capitalized-characters ::= '\' | '`' 
lAtom = sym '\'' *> ((concat <$> few atom_char) <|> lBackslash <|> escaped) <* sym '\''
  where atom_char = (:[]) <$> psym (not . invC)
-- TODO how to allow escaped single quotes in atoms (see ambiguities above)
--                      <|> escaped 
         where invC c = c == '\'' || c == '\t' || c == '\n' || c == '\r'  
 --                           || c == '|' {- temporary fix -}

        escaped = (\a b -> [a,b]) <$> sym '\\' <*> sym '\''
lVar = (\c xs ys zs -> c:xs++ys++zs) <$> psym isUpper <*> many (psym isAlpha) 
            <*> stem_rest <*> rest
  where stem_rest = (++) <$> (concat <$> many ((:) <$> sym '-' <*> some (psym isAlpha))) 
                      <*> stem_suffix
        stem_suffix = maybe [] id <$> optional ((\a b -> [a,b]) <$> sym '-' <*> psym isDigit)
        rest = (++) <$> suffix <*> (maybe [] id <$> (optional postfix))
         where suffix = (++) <$> many (psym isDigit) <*> many (sym '\'')
               postfix = (:[]) <$> (sym '*' <|> sym '+' <|> sym '?')

lFile = ((++)) . concat <$> many part_sep <*> part
 where part = some (psym isAlphaNum)
       part_sep = (\a b -> a ++ [b]) <$> part <*> (sym '-' <|> sym '_')

lSectNum :: RE Char Token
lSectNum =    Token "SECT-NUM" . Just <$> lSect1Num 
          <|> Token "SECT-NUM" . Just <$> lSect2Num 
          <|> Token "SECT-NUM" . Just <$> lSect3Num 
          <|> Token "SECT-NUM" . Just <$> lSect4Num 

-- ambiguity MISC lexes both as lSect1Num, but should be interpreted as lTitleWords
lSect1Num = lOrdinal
lSect2Num = (\x y z -> x++[y]++z) <$> lOrdinal <*> sym '.' <*> lOrdinal
lSect3Num = (\x1 x2 x3 x4 x5 -> x1++[x2]++x3++[x4]++x5) <$> 
              lOrdinal <*> sym '.' <*> lOrdinal <*> sym '.' <*> lOrdinal 
lSect4Num = (\x1 x2 x3 x4 x5 x6 x7 -> x1++[x2]++x3++[x4]++x5++[x6]++x7) <$> 
              lOrdinal <*> sym '.' <*> lOrdinal <*> sym '.' <*> lOrdinal 
                       <*> sym '.' <*> lOrdinal
lOrdinal = some (psym isDigit) {- CAUSES AMBIGUITY, IS IT USED? <|> (:[]) <$> psym isAlpha-}

lTitleWords :: RE Char Token
lTitleWords = Token "TITLE" . Just <$> many (psym ((/=) '\n')) <* sym '\n' 

lTick :: SubsumesToken t => RE Char t
lTick = upcast (Token "TICK" (Just "`")) <$ sym '`'

-- | Escaped backslash
lBackslash = "\\" <$ sym '\\' <* sym '\\'

concatMany :: RE Char String -> RE Char String
concatMany p = concat <$> many p 

concatSome p = concat <$> some p

{-
notFollowedBy :: RE c s1 -> RE c s2 -> RE c s1
notFollowedBy p q = do
  -- apply the regex p
  (res, matched) <- withMatched p
  case match q of --then see if the regex q matches
    Just _  -> -- if it does we need to insert `matched` back into the input string
               empty
    Nothing -> return res -- otherwise just return the result
-}