--------------------------------------------------------------------------------
-- Copyright 2001-2012, Daan Leijen, Bastiaan Heeren, Jurriaan Hage. This file 
-- is distributed under the terms of the BSD3 License. For more information, 
-- see the file "LICENSE.txt", which is included in the distribution.
--------------------------------------------------------------------------------
--  $Id: Lexer.hs 291 2012-11-08 11:27:33Z heere112 $

module Lvm.Core.Parsing.Lexer (lexer) where

import Control.Monad
import Data.Char hiding (isSymbol, isLetter)
import Data.List
import Data.Maybe
import Lvm.Core.Parsing.Token

type Lexer  = Pos -> String -> [Token]
type Lexer5 = Pos -> String -> ([Token] -> [Token], Double, Pos, String)

lexer :: Lexer
lexer (ln,_) []                 = [((ln+1,0),LexEOF)]

lexer pos ('-':'-':cs)          = nextinc lexeol pos 2 cs
lexer pos ('{':'-':cs)          = nextinc (lexComment 0) pos 2 cs

lexer pos ('l':'e':'t':'!':cs)      | nonId cs    = (pos,LexLETSTRICT)  : nextinc lexer pos 4 cs
lexer pos ('l':'e':'t':cs)          | nonId cs    = (pos,LexLET)  : nextinc lexer pos 3 cs
lexer pos ('i':'n':cs)              | nonId cs    = (pos,LexIN)   : nextinc lexer pos 2 cs
lexer pos ('d':'o':cs)              | nonId cs    = (pos,LexDO)   : nextinc lexer pos 2 cs
lexer pos ('w':'h':'e':'r':'e':cs)  | nonId cs    = (pos,LexWHERE): nextinc lexer pos 5 cs
lexer pos ('c':'a':'s':'e':cs)      | nonId cs    = (pos,LexCASE) : nextinc lexer pos 4 cs
lexer pos ('o':'f':cs)              | nonId cs    = (pos,LexOF)   : nextinc lexer pos 2 cs
lexer pos ('i':'f':cs)              | nonId cs    = (pos,LexIF)   : nextinc lexer pos 2 cs
lexer pos ('t':'h':'e':'n':cs)      | nonId cs    = (pos,LexTHEN) : nextinc lexer pos 4 cs
lexer pos ('e':'l':'s':'e':cs)      | nonId cs    = (pos,LexELSE) : nextinc lexer pos 4 cs
lexer pos ('d':'a':'t':'a':cs)      | nonId cs    = (pos,LexDATA) : nextinc lexer pos 4 cs
lexer pos ('t':'y':'p':'e':cs)      | nonId cs    = (pos,LexTYPE) : nextinc lexer pos 4 cs
lexer pos ('m':'o':'d':'u':'l':'e':cs)      | nonId cs = (pos,LexMODULE) : nextinc lexer pos 6 cs
lexer pos ('i':'m':'p':'o':'r':'t':cs)      | nonId cs = (pos,LexIMPORT) : nextinc lexer pos 6 cs
-- not standard
lexer pos ('c':'o':'n':cs)                  | nonId cs = (pos,LexCON)    : nextinc lexer pos 3 cs
lexer pos ('w':'i':'t':'h':cs)              | nonId cs = (pos,LexWITH)   : nextinc lexer pos 4 cs
lexer pos ('m':'a':'t':'c':'h':cs)          | nonId cs = (pos,LexMATCH)   : nextinc lexer pos 5 cs
lexer pos ('c':'c':'a':'l':'l':cs)          | nonId cs = (pos,LexCCALL)   : nextinc lexer pos 5 cs
lexer pos ('p':'u':'b':'l':'i':'c':cs)      | nonId cs = (pos,LexPUBLIC)   : nextinc lexer pos 6 cs
lexer pos ('e':'x':'t':'e':'r':'n':cs)      | nonId cs = (pos,LexEXTERN)  : nextinc lexer pos 6 cs
lexer pos ('s':'t':'a':'t':'i':'c':cs)      | nonId cs = (pos,LexSTATIC)  : nextinc lexer pos 6 cs
lexer pos ('c':'u':'s':'t':'o':'m':cs)      | nonId cs = (pos,LexCUSTOM)  : nextinc lexer pos 6 cs
lexer pos ('n':'o':'t':'h':'i':'n':'g':cs)  | nonId cs = (pos,LexNOTHING) : nextinc lexer pos 7 cs
lexer pos ('p':'r':'i':'v':'a':'t':'e':cs)  | nonId cs = (pos,LexPRIVATE) : nextinc lexer pos 7 cs
lexer pos ('d':'e':'f':'a':'u':'l':'t':cs)  | nonId cs = (pos,LexDEFAULT) : nextinc lexer pos 7 cs
lexer pos ('d':'y':'n':'a':'m':'i':'c':cs)  | nonId cs = (pos,LexDYNAMIC) : nextinc lexer pos 7 cs
lexer pos ('r':'u':'n':'t':'i':'m':'e':cs)  | nonId cs = (pos,LexRUNTIME) : nextinc lexer pos 7 cs
lexer pos ('s':'t':'d':'c':'a':'l':'l':cs)  | nonId cs = (pos,LexSTDCALL) : nextinc lexer pos 7 cs
lexer pos ('o':'r':'d':'i':'n':'a':'l':cs)  | nonId cs = (pos,LexORDINAL) : nextinc lexer pos 7 cs
lexer pos ('d':'e':'c':'o':'r':'a':'t':'e':cs) | nonId cs = (pos,LexDECORATE) : nextinc lexer pos 8 cs
lexer pos ('a':'b':'s':'t':'r':'a':'c':'t':cs) | nonId cs = (pos,LexABSTRACT) : nextinc lexer pos 8 cs
lexer pos ('i':'n':'s':'t':'r':'c':'a':'l':'l':cs)  | nonId cs = (pos,LexINSTRCALL) : nextinc lexer pos 9 cs
lexer pos ('i':'n':'s':'t':'r':'u':'c':'t':'i':'o':'n':cs) | nonId cs = (pos,LexINSTR) : nextinc lexer pos 11 cs


lexer pos (':':':':cs)          | nonSym cs = (pos,LexCOLCOL) : nextinc lexer pos 2 cs
lexer pos ('=':'>':cs)          | nonSym cs = (pos,LexARROW)  : nextinc lexer pos 2 cs
lexer pos ('-':'>':cs)          | nonSym cs = (pos,LexRARROW) : nextinc lexer pos 2 cs
lexer pos ('<':'-':cs)          | nonSym cs = (pos,LexLARROW) : nextinc lexer pos 2 cs
lexer pos ('.':'.':cs)          | nonSym cs = (pos,LexDOTDOT) : nextinc lexer pos 2 cs
lexer pos ('\'':'\'':cs)        = nextinc (lexSpecialId pos) pos 2 cs

lexer pos ('.':cs)              | nonSym cs = (pos,LexDOT)    : nextinc lexer pos 1 cs
lexer pos (',':cs)              | nonSym cs = (pos,LexCOMMA)  : nextinc lexer pos 1 cs
lexer pos ('`':cs)              | nonSym cs = (pos,LexQUOTE)  : nextinc lexer pos 1 cs
lexer pos (';':cs)              | nonSym cs = (pos,LexSEMI)   : nextinc lexer pos 1 cs
lexer pos ('|':cs)              | nonSym cs = (pos,LexBAR)    : nextinc lexer pos 1 cs
lexer pos ('~':cs)              | nonSym cs = (pos,LexTILDE)  : nextinc lexer pos 1 cs
lexer pos ('@':cs)              | nonSym cs = (pos,LexAT)     : nextinc lexer pos 1 cs
lexer pos ('=':cs)              | nonSym cs = (pos,LexASG)    : nextinc lexer pos 1 cs
lexer pos ('\\':cs)             | nonSym cs = (pos,LexBSLASH) : nextinc lexer pos 1 cs
lexer pos ('!':cs)              | nonSym cs = (pos,LexEXCL)   : nextinc lexer pos 1 cs
lexer pos (':':cs)              | nonSym cs = (pos,LexCOLON)  : nextinc lexer pos 1 cs
-- lexer pos ('-':cs)              | nonSym cs = (pos,LexDASH)   : nextinc lexer pos 1 cs

lexer pos ('(':cs)              = (pos,LexLPAREN) : nextinc lexer pos 1 cs
lexer pos (')':cs)              = (pos,LexRPAREN) : nextinc lexer pos 1 cs
lexer pos ('[':cs)              = (pos,LexLBRACKET):nextinc lexer pos 1 cs
lexer pos (']':cs)              = (pos,LexRBRACKET):nextinc lexer pos 1 cs
lexer pos ('{':cs)              = (pos,LexLBRACE) : nextinc lexer pos 1 cs
lexer pos ('}':cs)              = (pos,LexRBRACE) : nextinc lexer pos 1 cs

lexer pos ('\'':cs)             = nextinc lexChar pos 1 cs
lexer pos ('"':cs)              = lexString (incpos pos 1) (pos,"") cs

lexer pos ('0':cs)              = lexZero pos cs

lexer pos xs@(':':_)            = lexWhile isSymbol LexConOp pos pos xs
lexer pos ('$':xs@(c:_))        | isLower c || c == '_'  = let np = incpos pos 1 in lexWhile isLetter LexId np np xs
lexer pos xs@(c:cs)             | isLower c || c == '_'  = lexWhile isLetter LexId pos pos xs
                                | isUpper c              = lexConOrQual pos xs
                                | isSpace c              = next lexer pos c cs
                                | isSymbol c             = lexWhile isSymbol LexOp pos pos xs
                                | isDigit c              = lexIntFloat pos xs
                                | otherwise              = (pos,LexUnknown c) : next lexer pos c cs

next :: (Pos -> String -> a) -> Pos -> Char -> String -> a
next f pos c cs = let pos' = newpos pos c  in seq pos' (f pos' cs)

nextinc :: (Pos -> String -> a) -> Pos -> Int -> String -> a
nextinc f pos i cs = let pos' = incpos pos i  in seq pos' (f pos' cs)

lexConOrQual :: Lexer
lexConOrQual pos cs
  = let (ident,rest) = span isLetter cs
        pos'         = foldl' newpos pos ident
    in case rest of
        '.':ds@(d:_)    | isLower d || d == '_'  
                                     -> lexWhile isLetter (LexQualId ident) pos (incpos pos' 1) ds
                        | isUpper d  -> lexWhile isLetter (LexQualCon ident) pos (incpos pos' 1) ds
                        | isSymbol d -> lexWhile isSymbol (LexQualId ident) pos (incpos pos' 1) ds
        '.':'\'':'\'':ds -> case lexSpecialId pos (incpos pos 3) ds of
                               (pos1, LexCon s):xs -> (pos1, LexQualCon ident s):xs
                               (pos1, LexId s):xs  -> (pos1, LexQualId ident s):xs
                               xs                  -> xs
        _ -> (pos,LexCon ident) : seq pos' (lexer pos' rest)

lexWhile :: (Char -> Bool) -> (String -> Lexeme) -> Pos -> Lexer
lexWhile ctype con pos0 pos cs       = let (ident,rest)  = span ctype cs
                                           pos'          = foldl' newpos pos ident
                                       in  (pos0,con ident) : seq pos' (lexer pos' rest)

lexSpecialId :: Pos -> Lexer
lexSpecialId originalPos pos cs -- originalPos points to where '' started. it should
                                -- be used as the position of the identifier because of the layout rule
                                -- y = 4
                                -- ''x'' = 3   -- x and y should be in the same context
  = let (ident,rest) = span (\c -> not (isSpace c) && c /= '\'') cs in
    case rest of
      ('\'':'\'':cs')-> let pos' = foldl' newpos pos (ident ++ "''") in
                        seq pos' $
                        case ident of
                          []        -> (originalPos,LexError "empty special identifier") : lexer pos' cs'
                          -- ":"       -> (originalPos,LexError "empty special con identifier") : lexer pos' cs'
                          ":"       -> (originalPos,LexId ident)  : lexer pos' cs'
                          ':':conid -> (originalPos,LexCon conid) : lexer pos' cs'
                          _         -> (originalPos,LexId ident)  : lexer pos' cs'
      _              -> let pos' = foldl' newpos pos ident in
                        (pos',LexError ("expecting '' after special identifier " ++ show ident)):lexer pos' rest

-----------------------------------------------------------
-- Numbers
-----------------------------------------------------------

lexZero :: Lexer
lexZero pos (c:cs)  | c == 'o' || c == 'O'  = case octal pos' cs of
                                                Just (i,pos'',cs')   -> (pos, LexInt i) : lexer pos'' cs'
                                                Nothing              -> (pos, LexError "illegal octal number")
                                                                                : lexer pos' cs
                    | c == 'x' || c == 'X'  = case hexal pos' cs of
                                                Just (i,pos'',cs')   -> (pos, LexInt i) : lexer pos'' cs'
                                                Nothing              -> (pos, LexError "illegal hexadecimal number")
                                                                                : lexer pos' cs
                    | c == '.'              = lexFloat 0 pos' cs
                    | isDigit c             = lexIntFloat pos (c:cs)
                    | otherwise             = (pos,LexInt 0) : lexer pos' (c:cs)
                    where
                      pos'  = newpos (newpos pos '0') c
lexZero pos cs      = (pos,LexInt 0) : lexer (newpos pos '0') cs

lexIntFloat :: Lexer
lexIntFloat pos cs  = case decimal pos cs of
                        Just (i,pos',cs')   ->
                            case cs' of ('.':cs'')   -> lexFloat i (newpos pos' '.') cs''
                                        _            -> (pos,LexInt i) : lexer pos' cs'
                        _ -> error "lexIntFloat"

lexFloat :: Integer -> Lexer
lexFloat i pos cs   = let (fracterr,fract,pos',cs')   = lexFract pos cs
                          (experr,expon,pos'',cs'')   = lexExponent pos' cs'
                      in  fracterr (experr ( (pos,LexFloat ((fromInteger i + fract) * expon)) : lexer pos'' cs''))

lexFract :: Lexer5
lexFract pos cs     = let (xs,rest) = span isDigit cs
                      in  if null xs
                           then ( ((pos,LexError "invalid fraction") :), 0.0, pos, cs )
                           else ( id, foldr op 0.0 xs, foldl' newpos pos xs, rest )
                    where
                      c `op` f  = (f + fromIntegral (fromEnum c - fromEnum '0'))/10.0

lexExponent :: Lexer5
lexExponent pos (c:cs)  | c == 'e' || c == 'E'   = case cs of ('-':cs') -> lexExp negate (incpos pos 2) cs'
                                                              ('+':cs') -> lexExp id (incpos pos 2) cs'
                                                              _         -> lexExp id (incpos pos 1) cs
lexExponent pos cs      = (id, 1.0, pos, cs)

lexExp :: (Integer -> Integer) -> Lexer5
lexExp f pos cs         = case decimal pos cs of
                            Just (i,pos',cs')   -> (id,power (f i),pos',cs')
                            Nothing             -> (((pos,LexError "invalid exponent"):), 1.0, pos, cs )
                        where
                          power e   | e < 0      = 1.0/power(-e)
                                    | otherwise  = fromInteger (10^e)

hexal, octal, decimal :: Pos -> String -> Maybe (Integer, Pos, String)
hexal   = number 16 isHexal
octal   = number 8 isOctal
decimal = number 10 isDigit

number ::Integer-> (Char -> Bool) -> Pos -> String -> Maybe (Integer,Pos,String)
number base test pos cs = let (xs,rest) = span test cs
                          in  if null xs
                               then Nothing
                               else Just (foldl' op 0 xs, foldl' newpos pos xs, rest)
                        where
                          x `op` y      = base*x + fromIntegral (fromChar y)
                          fromChar c    | isDigit c     = fromEnum c - fromEnum '0'
                                        | otherwise     = fromEnum (toUpper c) - fromEnum 'A'

isOctal, isHexal :: Char -> Bool
isOctal = isOctDigit 
isHexal = isHexDigit 

-----------------------------------------------------------
-- Characters
-----------------------------------------------------------

lexChar :: Lexer
lexChar pos ('\\':cs)           = let (pos',lexeme,xs) = escapeChar pos cs
                                  in  lexEndChar lexeme pos' xs
lexChar pos ('\'':cs)           = (pos,LexError "empty character") : nextinc lexer pos 1 cs

lexChar pos (c:cs)              | isGraphic c || c == '"' || c == ' ' = lexEndChar (pos,LexChar c) (incpos pos 1) cs
                                | otherwise                           = (pos,LexError "invalid character") : next lexer pos c cs

lexChar pos []                  = (pos,LexError "unexpected end of input in character") : lexer pos []

lexEndChar :: Token -> Lexer
lexEndChar lexeme pos ('\'':cs) = lexeme : nextinc lexer pos 1 cs
lexEndChar _ pos cs             = (pos,LexError "expecting termiInting symbol \"'\"") : lexer pos cs


lexString :: Pos -> (Pos, String) -> String -> [Token]
lexString pos (p,s) ('"':cs)    = (p,LexString (reverse s)) : nextinc lexer pos 1 cs

lexString pos (p,s) ('\n':cs)   = (p,LexString (reverse s)) : (pos,LexError "newline in string") : next lexer pos '\n' cs

lexString pos (p,s) ('\\':c:cs) | isSpace c     = gap (incpos pos 1) (p,s) cs
                                | c == '&'      = lexString (incpos pos 2) (p,s) cs
                                | otherwise     = let (pos',(_,lexeme),cs') = escapeChar pos (c:cs)
                                                  in  case lexeme of
                                                        LexChar d   -> lexString pos' (p,d:s) cs'
                                                        _           -> (pos,LexError "illegal escape sequence") :
                                                                         lexString pos' (p,s) cs'

lexString pos (p,s) []          = (p,LexString (reverse s))
                                        : (pos,LexError "unexpected end of input in string")
                                              : lexer pos []
lexString pos (p,s) ['\\']      = lexString (incpos pos 1) (p,s) []

lexString pos (p,s) (c:cs)      | isGraphic c || c == '\'' || c == ' '
                                             = lexString (incpos pos 1) (p,c:s) cs
                                | otherwise  = (pos,LexError ("illegal character (" ++ [c] ++ ") in string"))
                                                    : lexString (newpos pos c) (p,s) cs

gap :: Pos -> (Pos, String) -> String -> [Token]
gap pos (p,s) cs                = let (ws,rest) = span isSpace cs
                                      pos'      = foldl' newpos pos ws
                                  in  case rest of
                                        ('\\':cs')  -> lexString pos' (p,s) cs'
                                        _           -> (pos',LexError "(\\) expected at end of gap")
                                                            : lexString pos' (p,s) rest

-----------------------------------------------------------
-- Escape sequences
-----------------------------------------------------------


escapeChar :: Pos -> String -> (Pos,Token,String)
escapeChar pos [] = (pos,(pos,LexError "Unexpected end of input"),[])
escapeChar pos cs = fromMaybe def (msum [ f pos cs | f <- fs ])
 where
   def = (pos,(pos,LexError "invalid escape sequence"),cs)
   fs  = [ascii3, ascii2, escape, control, charnum]

charnum :: Pos -> String -> Maybe (Pos,Token,String)
charnum pos ('x':cs)    = numToChar pos (hexal (incpos pos 1) cs)
charnum pos ('o':cs)    = numToChar pos (octal (incpos pos 1) cs)
charnum pos ('d':cs)    = numToChar pos (decimal (incpos pos 1) cs)
charnum pos (c:cs)      | isDigit c  = numToChar pos (decimal (incpos pos 1) cs)
charnum _ _             = Nothing

numToChar :: Pos -> Maybe (Integer, Pos, String) -> Maybe (Pos, Token, String)
numToChar pos (Just (x,pos',cs')) = Just (pos',(pos,LexChar (toEnum (fromInteger x))), cs')
numToChar _ _ = Nothing



control :: Pos -> String -> Maybe (Pos,Token,String)
control pos ('^':c:cs)  | isUpper c    = let x = toEnum (fromEnum c - fromEnum 'A')
                                         in  Just (incpos pos 2, (pos,LexChar x), cs)
control _ _ = Nothing



escape :: Pos -> String -> Maybe (Pos,Token,String)
escape pos (c:cs)    = case lookup c escapemap of
                         Just k     -> Just (incpos pos 1, (pos,LexChar k), cs)
                         Nothing    -> Nothing
escape _ _ = Nothing

ascii2 :: Pos -> String -> Maybe (Pos,Token,String)
ascii2 pos (x:y:cs)  = case lookup [x,y] ascii2map of
                         Just k     -> Just (incpos pos 2, (pos,LexChar k), cs)
                         Nothing    -> Nothing
ascii2 _ _ = Nothing

ascii3 :: Pos -> String -> Maybe (Pos,Token,String)
ascii3 pos (x:y:z:cs)= case lookup [x,y,z] ascii3map of
                         Just k     -> Just (incpos pos 3, (pos,LexChar k), cs)
                         Nothing    -> Nothing
ascii3 _ _ = Nothing



escapemap :: [(Char, Char)]
escapemap        = zip "abfnrtv\\\"\'"
                       "\a\b\f\n\r\t\v\\\"\'"

ascii2map :: [(String, Char)]
ascii2map        = zip ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
                        "FS","GS","RS","US","SP"]
                       "\BS\HT\LF\VT\FF\CR\SO\SI\EM\FS\GS\RS\US\SP"

ascii3map :: [(String, Char)]
ascii3map        = zip ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
                        "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
                        "CAN","SUB","ESC","DEL"]
                       "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\BEL\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"


-----------------------------------------------------------
-- Symbols
-----------------------------------------------------------

isSpecial, isSmall, isLarge, isLetter, isSymbol :: Char -> Bool
isSpecial   = (`elem` "(),;[]`{}")
isSmall c   = isLower c || c == '_'
isLarge     = isUpper
isLetter c  = isSmall c || isLarge c || isDigit c || c == '\''
isSymbol    = (`elem` "!#$%&*+./<=>?@\\^|-~:")

isGraphic :: Char -> Bool
isGraphic c = isLetter c || isSymbol c || isSpecial c || (c == ':') || (c == '"')

nonId :: String -> Bool
nonId (c:_) = not (isLetter c)
nonId []    = True

nonSym :: String -> Bool
nonSym (c:_) = not (isSymbol c)
nonSym []    = True

-----------------------------------------------------------
-- Comment
-----------------------------------------------------------

lexeol :: Lexer
lexeol pos ('\n':cs)    = lexer  (newpos pos '\n') cs
lexeol pos (c:cs)       = lexeol (newpos pos c) cs
lexeol pos []           = lexer pos []

lexComment :: Int -> Lexer
lexComment level pos s =
   case s of
      '-':'}':cs | level == 0    -> lexer (incpos pos 2) cs
                 | otherwise     -> lexComment (level - 1) (incpos pos 2) cs
      '{':'-':cs -> lexComment (level+1) (incpos pos 2) cs
      c:cs       -> lexComment level (newpos pos c) cs
      []         -> lexer pos []