DDC/Core/Lexer.hs


-- | Reference lexer for core langauge parser. Slow but Simple.
--
--   The lexers here all use 'String' in place of a real name type.
--   After applying these functions to the program text, we need
--   to use `renameTok` tok convert the strings in `TokNamed` tokens
--   into the name type specific to the langauge fragment to be parsed.
--
module DDC.Core.Lexer
        ( module DDC.Core.Lexer.Tokens
        , module DDC.Core.Lexer.Names

          -- * Lexer
        , lexModuleWithOffside
        , lexExp)
where
import DDC.Core.Lexer.Offside
import DDC.Core.Lexer.Comments
import DDC.Core.Lexer.Names
import DDC.Core.Lexer.Tokens
import DDC.Data.SourcePos
import DDC.Data.Token
import Data.Char
import Data.List


-- Module ---------------------------------------------------------------------
-- | Lex a module and apply the offside rule.
--
--   Automatically drop comments from the token stream along the way.
--
lexModuleWithOffside 
        :: FilePath     -- ^ Path to source file, for error messages.
        -> Int          -- ^ Starting line number.
        -> String       -- ^ String containing program text.
        -> [Token (Tok String)]

lexModuleWithOffside sourceName lineStart str
 = {-# SCC lexWithOffside #-}
        applyOffside [] 
        $ addStarts
        $ dropComments 
        $ lexString sourceName lineStart str


-- Exp ------------------------------------------------------------------------
-- | Lex a string into tokens.
--
--   Automatically drop comments from the token stream along the way.
--
lexExp  :: FilePath     -- ^ Path to source file, for error messages.
        -> Int          -- ^ Starting line number.
        -> String       -- ^ String containing program text.
        -> [Token (Tok String)]

lexExp sourceName lineStart str
 = {-# SCC lexExp #-}
        dropNewLines
        $ dropComments
        $ lexString sourceName lineStart str


-- Generic --------------------------------------------------------------------
lexString :: String -> Int -> String -> [Token (Tok String)]
lexString sourceName lineStart str
        = lexWord lineStart 1 str
 where 
  lexWord :: Int -> Int -> String -> [Token (Tok String)]
  lexWord line column w
   = let  tok t = Token t (SourcePos sourceName line column)
          tokM  = tok . KM
          tokA  = tok . KA
          tokN  = tok . KN

          lexMore n rest
           = lexWord line (column + n) rest

     in case w of
        []               -> []        

        -- Whitespace
        ' '  : w'        -> lexMore 1 w'
        '\t' : w'        -> lexMore 8 w'

        -- Literal values
        -- This needs to come before the rule for '-'
        c : cs
         | isDigit c
         , (body, rest)         <- span isLitBody cs
         -> tokN (KLit (c:body))        : lexMore (length (c:body)) rest

        '-' : c : cs
         | isDigit c
         , (body, rest)         <- span isLitBody cs
         -> tokN (KLit ('-':c:body))    : lexMore (length (c:body)) rest

        -- Meta tokens
        -- ISSUE #302: Don't try to lex body of a block comment.
        '{'  : '-' : w'  
         -> tokM KCommentBlockStart : lexMore 2 w'

        '-'  : '}' : w'  
         -> tokM KCommentBlockEnd   : lexMore 2 w'

        '-'  : '-' : w'  
         -> let  (_junk, w'') = span (/= '\n') w'
            in   tokM KCommentLineStart  : lexMore 2 w''

        '\n' : w'        -> tokM KNewLine           : lexWord (line + 1) 1 w'

        -- The unit data constructor
        '(' : ')' : w'   -> tokA KDaConUnit      : lexMore 2 w'

        -- Compound Parens
        '['  : ':' : w'  -> tokA KSquareColonBra : lexMore 2 w'
        ':'  : ']' : w'  -> tokA KSquareColonKet : lexMore 2 w'
        '<'  : ':' : w'  -> tokA KAngleColonBra  : lexMore 2 w'
        ':'  : '>' : w'  -> tokA KAngleColonKet  : lexMore 2 w'

        -- Function Constructors
        '~'  : '>'  : w' -> tokA KArrowTilde     : lexMore 2 w'
        '-'  : '>'  : w' -> tokA KArrowDash      : lexMore 2 w'
        '<'  : '-'  : w' -> tokA KArrowDashLeft  : lexMore 2 w'
        '='  : '>'  : w' -> tokA KArrowEquals    : lexMore 2 w'

        -- Compound symbols
        ':'  : ':'  : w' -> tokA KColonColon     : lexMore 2 w'
        '/'  : '\\' : w' -> tokA KBigLambda      : lexMore 2 w'

        -- Debruijn indices
        '^'  : cs
         |  (ds, rest)   <- span isDigit cs
         ,  length ds >= 1
         -> tokA (KIndex (read ds))              : lexMore (1 + length ds) rest         

        -- Parens
        '('  : w'       -> tokA KRoundBra        : lexMore 1 w'
        ')'  : w'       -> tokA KRoundKet        : lexMore 1 w'
        '['  : w'       -> tokA KSquareBra       : lexMore 1 w'
        ']'  : w'       -> tokA KSquareKet       : lexMore 1 w'
        '{'  : w'       -> tokA KBraceBra        : lexMore 1 w'
        '}'  : w'       -> tokA KBraceKet        : lexMore 1 w'
        '<'  : w'       -> tokA KAngleBra        : lexMore 1 w'
        '>'  : w'       -> tokA KAngleKet        : lexMore 1 w'            

        -- Punctuation
        '.'  : w'       -> tokA KDot             : lexMore 1 w'
        '|'  : w'       -> tokA KBar             : lexMore 1 w'
        '^'  : w'       -> tokA KHat             : lexMore 1 w'
        '+'  : w'       -> tokA KPlus            : lexMore 1 w'
        ':'  : w'       -> tokA KColon           : lexMore 1 w'
        ','  : w'       -> tokA KComma           : lexMore 1 w'
        '\\' : w'       -> tokA KBackSlash       : lexMore 1 w'
        ';'  : w'       -> tokA KSemiColon       : lexMore 1 w'
        '_'  : w'       -> tokA KUnderscore      : lexMore 1 w'
        '='  : w'       -> tokA KEquals          : lexMore 1 w'
        '&'  : w'       -> tokA KAmpersand       : lexMore 1 w'
        '-'  : w'       -> tokA KDash            : lexMore 1 w'
        
        -- Bottoms
        name
         |  Just w'     <- stripPrefix "Pure"  name 
         -> tokA KBotEffect   : lexMore 2 w'
         
         |  Just w'     <- stripPrefix "Empty" name 
         -> tokA KBotClosure  : lexMore 2 w'

        -- Named Constructors
        c : cs
         | isConStart c
         , (body,  rest)        <- span isConBody cs
         , (body', rest')       <- case rest of
                                        '\'' : rest'    -> (body ++ "'", rest')
                                        '#'  : rest'    -> (body ++ "#", rest')
                                        _               -> (body, rest)
         -> let readNamedCon s
                 | Just socon   <- readSoConBuiltin s
                 = tokA (KSoConBuiltin socon)    : lexMore (length s) rest'

                 | Just kicon   <- readKiConBuiltin s
                 = tokA (KKiConBuiltin kicon)    : lexMore (length s) rest'

                 | Just twcon   <- readTwConBuiltin s
                 = tokA (KTwConBuiltin twcon)    : lexMore (length s) rest'
                 
                 | Just tccon   <- readTcConBuiltin s
                 = tokA (KTcConBuiltin tccon)    : lexMore (length s) rest'
                 
                 | Just con     <- readCon s
                 = tokN (KCon con)               : lexMore (length s) rest'
               
                 | otherwise    
                 = [tok (KJunk [c])]
                 
            in  readNamedCon (c : body')

        -- Keywords, Named Variables and Witness constructors
        c : cs
         | isVarStart c
         , (body,  rest)        <- span isVarBody cs
         , (body', rest')       <- case rest of
                                        '#' : rest'     -> (body ++ "#", rest')
                                        _               -> (body, rest)
         -> let readNamedVar s
                 | Just t  <- lookup s keywords
                 = tok t                   : lexMore (length s) rest'

                 | Just wc <- readWbConBuiltin s
                 = tokA (KWbConBuiltin wc) : lexMore (length s) rest'
         
                 | Just v  <- readVar s
                 = tokN (KVar v)           : lexMore (length s) rest'

                 | otherwise
                 = [tok (KJunk [c])]

            in  readNamedVar (c : body')

        -- Some unrecognised character.
        -- We still need to keep lexing as this may be in a comment.
        c : cs   -> (tok $ KJunk [c]) : lexMore 1 cs