-- | Tokenizing of Tempus source code.
module Tempus.Lexer
  ( Token (..),
    showToken,
    lex,
    tokenize,
    Parser (..),
    ParseResult,
    initParser
  ) where

import Prelude hiding (lex)

import Control.Applicative
import Control.Monad.State
import Data.Char
import Data.List (unfoldr)
import Data.Maybe (fromMaybe)

import Tempus.Loc

-- * Tokens

-- | The input tokens for a Tempus parser.
data Token
  = EOF
  | Variable String
  | NatLit Integer
  | Equals
  | ArrowRight
  | Plus
  | Times
  | ParenOpen
  | ParenClose
  | Zero
  | One
  | Mu
  | Nu
  | Dot
  | Comma
  | Lambda
  | SquareOpen
  | SquareClose
  | AngleOpen
  | AngleClose
  | CircledAsterisk
  | CircledDot
  | QuestionMark
  | KWBehavior
  | KWCase
  | KWConst
  | KWEvent
  | KWExpand
  | KWFirst
  | KWFold
  | KWLeft
  | KWNever
  | KWPack
  | KWPositive
  | KWRace
  | KWReflect
  | KWRight
  | KWSecond
  | KWUltraswitch
  | KWType
  | KWUltrajump
  | KWUnfold
  | KWUnpack
  | KWValue
  deriving (Show, Eq)

-- TODO: support unicode output
-- | Return the ASCII symbol or keyword, or a descriptive text for a token.
showToken :: Token -> String
showToken tok = case tok of
  EOF             -> "end of file"
  Variable v      -> v
  NatLit i        -> show i
  Equals          -> "="
  ArrowRight      -> "->"
  Plus            -> "+"
  Times           -> "*"
  ParenOpen       -> "("
  ParenClose      -> ")"
  Zero            -> "0"
  One             -> "1"
  Mu              -> "mu"
  Nu              -> "nu"
  Dot             -> "."
  Comma           -> ","
  Lambda          -> "\\"
  SquareOpen      -> "["
  SquareClose     -> "]"
  AngleOpen       -> "opening angle bracket"
  AngleClose      -> "closing angle bracket"
  CircledAsterisk -> "<*>"
  CircledDot      -> "<.>"
  QuestionMark    -> "?"
  KWBehavior      -> "behavior"
  KWCase          -> "case"
  KWConst         -> "const"
  KWEvent         -> "event"
  KWExpand        -> "expand"
  KWFirst         -> "first"
  KWFold          -> "fold"
  KWLeft          -> "left"
  KWNever         -> "never"
  KWPack          -> "pack"
  KWPositive      -> "positive"
  KWRace          -> "race"
  KWReflect       -> "reflect"
  KWRight         -> "right"
  KWSecond        -> "second"
  KWUltraswitch   -> "ultraswitch"
  KWType          -> "type"
  KWUltrajump     -> "ultrajump"
  KWUnfold        -> "unfold"
  KWUnpack        -> "unpack"
  KWValue         -> "value"

-- | The list of keyword and symbol strings with their corresponding token.
reserved :: [(String, Token)]
reserved =
  [ -- keywords
    ("type", KWType),
    ("value", KWValue),
    ("behavior", KWBehavior),
    ("event", KWEvent),
    ("expand", KWExpand),
    ("positive", KWPositive),
    ("\x2115\x208A", KWPositive),
    ("fold", KWFold),
    ("unfold", KWUnfold),
    ("const", KWConst),
    ("left", KWLeft),
    ("right", KWRight),
    ("case", KWCase),
    ("first", KWFirst),
    ("second", KWSecond),
    ("never", KWNever),
    ("race", KWRace),
    ("reflect", KWReflect),
    ("pack", KWPack),
    ("unpack", KWUnpack),
    ("ultraswitch", KWUltraswitch),
    ("ultrajump", KWUltrajump),
    -- symbols
    ("\x03BC", Mu),
    ("mu", Mu),
    ("\x03BD", Nu),
    ("nu", Nu),
    ("\x2192", ArrowRight),
    ("->", ArrowRight),
    ("+", Plus),
    ("\xD7", Times),
    ("*", Times),
    ("\x03BB", Lambda),
    ("\\", Lambda),
    ("\x229B", CircledAsterisk),
    ("<*>", CircledAsterisk),
    ("\x2299", CircledDot),
    ("<.>", CircledDot),
    ("=", Equals),
    (".", Dot),
    ("?", QuestionMark)
  ]

-- | The list of brackets with a special meaning and their corresponding token.
brackets :: [(Char, Token)]
brackets =
  [ ('(', ParenOpen),
    (')', ParenClose),
    ('[', SquareOpen),
    (']', SquareClose),
    ('\x27E8', AngleOpen),
    ('\x27E9', AngleClose)
  ]

-- * Lexer

-- | The lexer monad.
type Lexer a = State LexerState a

-- | The lexer state.
data LexerState = LexerState
  { input :: String, -- ^ The residual input string.
    loc   :: SrcLoc  -- ^ The current location in the original input string.
  }
  deriving (Show, Eq)

-- ** Primitive lexers

-- | Modify the input string of the current lexer state by the given function.
modifyInput :: (String -> String) -> Lexer ()
modifyInput f = modify (\st -> st { input = f $ input st })

-- | Modify the location of the current lexer state.
modifyLoc :: (Int -> Int, Int -> Int) -> Lexer ()
modifyLoc (fl, fc) = modify (\st -> st { loc = let (l, c) = loc st in (fl l, fc c) })

-- | Discard the first @n@ characters of the current input string.
discard :: Int -> Lexer ()
discard n = modifyInput (drop n) >> modifyLoc (id, (+ n))

-- | Read all subsequent characters satisfying a certain predicate from the input string and
-- return the read characters.
lexWhile :: (Char -> Bool) -> Lexer String
lexWhile p = do
  s <- gets input
  let (s', rest) = span p s
  modifyInput (const rest)
  modifyLoc (id, (+ length s'))
  return s'

-- ** Tempus token lexers

-- | Lexes a single token and discards all whitespace following that token.
lexInput :: Lexer (Maybe Token)
lexInput = do
  s <- gets input
  case s of
    [] -> return $ Just EOF
    _  -> Just <$> lexToken >>= \t -> lexWhitespace >> return t

-- | Lexes all whitespace, including single line comments.
lexWhitespace :: Lexer ()
lexWhitespace = do
  s <- gets input
  case s of
    '-':'-':c:_ | isSpace c -> lexWhile (/= '\n') >> lexWhitespace
    '\n':_                  -> discard 1 >> modifyLoc (succ, const 1) >> lexWhitespace
    c:_ | isSpace c         -> discard 1 >> lexWhitespace
    _                       -> return ()

-- | Returns true iff the symbol is a bracket, i.e. has the general Unicode category
-- @OpenPunctuation@ or @ClosePunctuation@.
isBracket :: Char -> Bool
isBracket c = generalCategory c `elem` [OpenPunctuation, ClosePunctuation]

-- | Converts the string representation of a decimal number to an @Integer@.
stringToNat :: String -> Integer
stringToNat = foldl (\a -> (10*a +) . toInteger . digitToInt) 0

-- | Lexes a single token.
lexToken :: Lexer Token
lexToken = do
  s <- gets input
  case s of
    -- make whitespace around commas optional
    ',':_ -> discard 1 >> return Comma
    -- brackets
    c:_
      | isBracket c -> do
          discard 1
          return $ fromMaybe (Variable [c]) $ lookup c brackets
      -- all other identifiers
      | otherwise -> do
          ids <- lexWhile (\c -> not $ isSpace c || isBracket c || c == ',')
          case ids of
            "0" -> return Zero
            "1" -> return One
            _ | all isDigit ids ->
                  let n = stringToNat ids
                  in  return $ if n == 0 then Variable ids else NatLit n
            _ -> return $ fromMaybe (Variable ids) $ lookup ids reserved
    _ -> fail $ "lexToken: internal error"

{- |
Runs a lexer inside the parser monad. A new parser is constructed that runs the lexer,
applies the read token to the passed continuation and runs the resulting parser of the
continuation with the new input string and location.
-}
lex :: (Loc Token -> Parser a) -> Parser a
lex cont = Parser $ \s loc ->
  case runState lexInput $ LexerState s loc of
    (Nothing, _)                 -> Left $ Loc loc $ "lexer error at " ++ showSrcLoc loc
    (Just t, LexerState s' loc') -> runParser (cont (Loc loc t)) s' loc'

-- | Tokenizes a string to a list of @Token@ values with their corresponding locations. This
-- function can be used for testing purposes.
tokenize :: String -> [Loc Token]
tokenize s = run . snd . runState lexWhitespace $ LexerState s (1,1)
  where
    run st = case runState lexInput st of
      (Nothing, _)    -> []
      (Just EOF, _)   -> []
      (Just tok, st') -> Loc (loc st) tok : run st'
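-- A rough usage sketch for 'tokenize' (illustrative only, not part of the API).
-- It assumes, as the code in this module suggests, that the 'Loc' constructor
-- from "Tempus.Loc" pairs a @SrcLoc@ with a payload and that @SrcLoc@ is a
-- (line, column) pair of 'Int's:
--
-- > ghci> [ showToken t | Loc _ t <- tokenize "type T = 1 + 0" ]
-- > ["type","T","=","1","+","0"]
-- > ghci> [ l | Loc l _ <- tokenize "type T = 1 + 0" ]
-- > [(1,1),(1,6),(1,8),(1,10),(1,12),(1,14)]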
-- * Parser monad

-- | Parser result which is either an error message with the location where the error
-- occurred or a value.
type ParseResult a = Either (Loc String) a

-- | Type for the parser monad.
newtype Parser a = Parser { runParser :: String -> SrcLoc -> ParseResult a }

instance Monad Parser where
  return a = Parser $ \_ _ -> Right a
  p >>= f  = Parser $ \s loc ->
    case runParser p s loc of
      Left e  -> Left e
      Right a -> runParser (f a) s loc
  fail s   = Parser $ \_ loc -> Left $ Loc loc s

-- | Initializes a parser by lexing all whitespace before the first token.
initParser :: Parser a -> String -> ParseResult a
initParser p s =
  let (_, LexerState s' loc') = runState lexWhitespace $ LexerState s (1,1)
  in  runParser p s' loc'
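-- A minimal sketch of how a parser built on top of this module is expected to
-- drive 'initParser' and 'lex' (illustrative only; @parseFirstToken@ is a
-- hypothetical name defined in a client module that imports this one and hides
-- the Prelude 'lex', it is not part of this module):
--
-- > -- A trivial parser that reads exactly one token and returns it.
-- > parseFirstToken :: Parser Token
-- > parseFirstToken = lex $ \(Loc _ t) -> return t
-- >
-- > ghci> initParser parseFirstToken "  -- a comment\n  value x = x"
-- > Right KWValue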