-- | Tokenizing of Tempus source code.
module Tempus.Lexer
  ( Token (..),
    showToken,
    lex,
    tokenize,
    Parser (..),
    ParseResult,
    initParser
  ) where

import Prelude hiding (lex)

import Control.Applicative
import Control.Monad.State
import Data.Char
import Data.List (unfoldr)
import Data.Maybe (fromMaybe)

import Tempus.Loc

-- * Tokens

-- | The input tokens for a Tempus parser.
data Token
  = EOF
  | Variable String
  | NatLit Integer
  | Equals
  | ArrowRight
  | Plus
  | Times
  | ParenOpen
  | ParenClose
  | Zero
  | One
  | Mu
  | Nu
  | Dot
  | Comma
  | Lambda
  | SquareOpen
  | SquareClose
  | AngleOpen
  | AngleClose
  | CircledAsterisk
  | CircledDot
  | QuestionMark
  | KWBehavior
  | KWCase
  | KWConst
  | KWEvent
  | KWExpand
  | KWFirst
  | KWFold
  | KWLeft
  | KWNever
  | KWPack
  | KWPositive
  | KWRace
  | KWReflect
  | KWRight
  | KWSecond
  | KWUltraswitch
  | KWType
  | KWUltrajump
  | KWUnfold
  | KWUnpack
  | KWValue
  deriving (Show, Eq)

-- TODO: support unicode output
-- | Return the ASCII symbol or keyword, or a descriptive text for a token.
showToken :: Token -> String
showToken tok = case tok of
  EOF             -> "end of file"
  Variable v      -> v
  NatLit i        -> show i
  Equals          -> "="
  ArrowRight      -> "->"
  Plus            -> "+"
  Times           -> "*"
  ParenOpen       -> "("
  ParenClose      -> ")"
  Zero            -> "0"
  One             -> "1"
  Mu              -> "mu"
  Nu              -> "nu"
  Dot             -> "."
  Comma           -> ","
  Lambda          -> "\\"
  SquareOpen      -> "["
  SquareClose     -> "]"
  AngleOpen       -> "opening angle bracket"
  AngleClose      -> "closing angle bracket"
  CircledAsterisk -> "<*>"
  CircledDot      -> "<.>"
  QuestionMark    -> "?"
  KWBehavior      -> "behavior"
  KWCase          -> "case"
  KWConst         -> "const"
  KWEvent         -> "event"
  KWExpand        -> "expand"
  KWFirst         -> "first"
  KWFold          -> "fold"
  KWLeft          -> "left"
  KWNever         -> "never"
  KWPack          -> "pack"
  KWPositive      -> "positive"
  KWRace          -> "race"
  KWReflect       -> "reflect"
  KWRight         -> "right"
  KWSecond        -> "second"
  KWUltraswitch   -> "ultraswitch"
  KWType          -> "type"
  KWUltrajump     -> "ultrajump"
  KWUnfold        -> "unfold"
  KWUnpack        -> "unpack"
  KWValue         -> "value"

-- | The list of keyword and symbol strings with their corresponding token.
reserved :: [(String, Token)]
reserved =
  [ -- keywords
    ("type", KWType),
    ("value", KWValue),
    ("behavior", KWBehavior),
    ("event", KWEvent),
    ("expand", KWExpand),
    ("positive", KWPositive),
    ("\x2115\x208A", KWPositive),
    ("fold", KWFold),
    ("unfold", KWUnfold),
    ("const", KWConst),
    ("left", KWLeft),
    ("right", KWRight),
    ("case", KWCase),
    ("first", KWFirst),
    ("second", KWSecond),
    ("never", KWNever),
    ("race", KWRace),
    ("reflect", KWReflect),
    ("pack", KWPack),
    ("unpack", KWUnpack),
    ("ultraswitch", KWUltraswitch),
    ("ultrajump", KWUltrajump),
    -- symbols
    ("\x03BC", Mu),
    ("mu", Mu),
    ("\x03BD", Nu),
    ("nu", Nu),
    ("\x2192", ArrowRight),
    ("->", ArrowRight),
    ("+", Plus),
    ("\xD7", Times),
    ("*", Times),
    ("\x03BB", Lambda),
    ("\\", Lambda),
    ("\x229B", CircledAsterisk),
    ("<*>", CircledAsterisk),
    ("\x2299", CircledDot),
    ("<.>", CircledDot),
    ("=", Equals),
    (".", Dot),
    ("?", QuestionMark)
  ]

-- | The list of brackets with a special meaning and their corresponding token.
brackets :: [(Char, Token)]
brackets =
  [ ('(', ParenOpen),
    (')', ParenClose),
    ('[', SquareOpen),
    (']', SquareClose),
    ('\x27E8', AngleOpen),
    ('\x27E9', AngleClose)
  ]

-- * Lexer

-- | The lexer monad.
type Lexer a = State LexerState a

-- | The lexer state.
data LexerState = LexerState
  { input :: String, -- ^ The residual input string.
    loc   :: SrcLoc  -- ^ The current location in the original input string.
  }
  deriving (Show, Eq)

-- ** Primitive lexers

-- | Modify the input string of the current lexer state by the given function.
modifyInput :: (String -> String) -> Lexer ()
modifyInput f = modify (\st -> st { input = f $ input st })

-- | Modify the location of the current lexer state.
modifyLoc :: (Int -> Int, Int -> Int) -> Lexer ()
modifyLoc (fl, fc) = modify (\st -> st { loc = let (l, c) = loc st in (fl l, fc c) })

-- | Discard the first @n@ characters of the current input string.
discard :: Int -> Lexer ()
discard n = modifyInput (drop n) >> modifyLoc (id, (+ n))

-- | Read all subsequent characters satisfying a certain predicate from the input string and
-- return the read characters.
lexWhile :: (Char -> Bool) -> Lexer String
lexWhile p = do
  s <- gets input
  let (s', rest) = span p s
  modifyInput (const rest)
  modifyLoc (id, (+ length s'))
  return s'

-- ** Tempus token lexers

-- | Lexes a single token and discards all whitespace following that token.
lexInput :: Lexer (Maybe Token)
lexInput = do
  s <- gets input
  case s of
    [] -> return $ Just EOF
    _  -> Just <$> lexToken >>= \t -> lexWhitespace >> return t

-- | Lexes all whitespace, including single line comments.
lexWhitespace :: Lexer ()
lexWhitespace = do
  s <- gets input
  case s of
    '-':'-':c:_ | isSpace c -> lexWhile (/= '\n') >> lexWhitespace
    '\n':_                  -> discard 1 >> modifyLoc (succ, const 1) >> lexWhitespace
    c:_ | isSpace c         -> discard 1 >> lexWhitespace
    _                       -> return ()

-- | Returns true iff the symbol is a bracket, i.e. has the general Unicode category
-- @OpenPunctuation@ or @ClosePunctuation@.
isBracket :: Char -> Bool
isBracket c = generalCategory c `elem` [OpenPunctuation, ClosePunctuation]

-- | Converts the string representation of a decimal number to an @Integer@.
stringToNat :: String -> Integer
stringToNat = foldl (\a -> (10*a +) . toInteger . digitToInt) 0

-- | Lexes a single token.
lexToken :: Lexer Token
lexToken = do
  s <- gets input
  case s of
    -- make whitespace around commas optional
    ',':_ -> discard 1 >> return Comma
    -- brackets
    c:_
      | isBracket c -> do
          discard 1
          return $ fromMaybe (Variable [c]) $ lookup c brackets
      -- all other identifiers
      | otherwise -> do
          ids <- lexWhile (\c -> not $ isSpace c || isBracket c || c == ',')
          case ids of
            "0" -> return Zero
            "1" -> return One
            _ | all isDigit ids ->
                  let n = stringToNat ids
                  in  return $ if n == 0 then Variable ids else NatLit n
            _ -> return $ fromMaybe (Variable ids) $ lookup ids reserved
    _ -> fail $ "lexToken: internal error"

{- |
Runs a lexer inside the parser monad. A new parser is constructed that runs the lexer,
applies the read token to the passed continuation and runs the resulting parser of the
continuation with the new input string and location.
-}
lex :: (Loc Token -> Parser a) -> Parser a
lex cont = Parser $ \s loc ->
  case runState lexInput $ LexerState s loc of
    (Nothing, _)                 -> Left $ Loc loc $ "lexer error at " ++ showSrcLoc loc
    (Just t, LexerState s' loc') -> runParser (cont (Loc loc t)) s' loc'

-- | Tokenizes a string to a list of @Token@ values with their corresponding locations. This
-- function can be used for testing purposes.
tokenize :: String -> [Loc Token]
tokenize s = run . snd . runState lexWhitespace $ LexerState s (1,1)
  where
    run st = case runState lexInput st of
      (Nothing, _)    -> []
      (Just EOF, _)   -> []
      (Just tok, st') -> Loc (loc st) tok : run st'
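-- A rough usage sketch for 'tokenize' (illustrative only, not part of the API).
-- It assumes, as the code in this module suggests, that the 'Loc' constructor
-- from "Tempus.Loc" pairs a @SrcLoc@ with a payload and that @SrcLoc@ is a
-- (line, column) pair of 'Int's:
--
-- > ghci> [ showToken t | Loc _ t <- tokenize "type T = 1 + 0" ]
-- > ["type","T","=","1","+","0"]
-- > ghci> [ l | Loc l _ <- tokenize "type T = 1 + 0" ]
-- > [(1,1),(1,6),(1,8),(1,10),(1,12),(1,14)]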
-- * Parser monad

-- | Parser result which is either an error message with the location where the error
-- occurred or a value.
type ParseResult a = Either (Loc String) a

-- | Type for the parser monad.
newtype Parser a = Parser { runParser :: String -> SrcLoc -> ParseResult a }

instance Monad Parser where
  return a = Parser $ \_ _ -> Right a
  p >>= f  = Parser $ \s loc ->
    case runParser p s loc of
      Left e  -> Left e
      Right a -> runParser (f a) s loc
  fail s   = Parser $ \_ loc -> Left $ Loc loc s

-- | Initializes a parser by lexing all whitespace before the first token.
initParser :: Parser a -> String -> ParseResult a
initParser p s =
  let (_, LexerState s' loc') = runState lexWhitespace $ LexerState s (1,1)
  in  runParser p s' loc'
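-- A minimal sketch of how a parser built on top of this module is expected to
-- drive 'initParser' and 'lex' (illustrative only; @parseFirstToken@ is a
-- hypothetical name defined in a client module that imports this one and hides
-- the Prelude 'lex', it is not part of this module):
--
-- > -- A trivial parser that reads exactly one token and returns it.
-- > parseFirstToken :: Parser Token
-- > parseFirstToken = lex $ \(Loc _ t) -> return t
-- >
-- > ghci> initParser parseFirstToken "  -- a comment\n  value x = x"
-- > Right KWValue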