-- | Finds and tokenizes BASIC keywords, in preparation for parsing. This allows -- keywords to be read even if there are no spaces around them. Even though -- the standard disallows it, many BASIC implementations allowed this to save -- memory or screen real estate. The down side is that longer variable names -- are not practical, since they might contain keywords. module Language.VintageBasic.Tokenizer (Token(..),TokenizedLine,isDataTok,isRemTok,charTokTest,taggedCharToksToString,isStringTok, isBuiltinTok,taggedTokensP,tokenP,printToken) where import Data.Char(toLower,toUpper) import Language.VintageBasic.Builtins(Builtin,builtinToStrAssoc) import Language.VintageBasic.LexCommon import Text.ParserCombinators.Parsec type TokenizedLine = Tagged [Tagged Token] -- | Parses one or more whitespace characters, producing a space token. spaceTokP :: Parser Token spaceTokP = whiteSpaceChar >> whiteSpace >> return SpaceTok data Token = StringTok { getStringTokString :: String } | RemTok { getRemTokString :: String } | DataTok { getDataTokString :: String } | CommaTok | ColonTok | SemiTok | LParenTok | RParenTok | DollarTok | PercentTok | EqTok | NETok | LETok | LTTok | GETok | GTTok | PlusTok | MinusTok | MulTok | DivTok | PowTok | AndTok | OrTok | NotTok | BuiltinTok Builtin | LetTok | DimTok | OnTok | GoTok | SubTok | ReturnTok | IfTok | ThenTok | ForTok | ToTok | StepTok | NextTok | PrintTok | InputTok | RandomizeTok | ReadTok | RestoreTok | DefTok | FnTok | EndTok | StopTok | SpaceTok | DotTok | CharTok { getCharTokChar :: Char } deriving (Eq,Show) caseInsensitiveChar :: Char -> Parser Char caseInsensitiveChar c = char (toLower c) <|> char (toUpper c) keyword :: String -> Parser String keyword s = try (mapM caseInsensitiveChar s) ("keyword " ++ s) stringTokP :: Parser Token -- no special escape chars allowed stringTokP = do _ <- char '"' s <- manyTill anyChar (char '"') whiteSpace return (StringTok s) isStringTok :: Token -> Bool isStringTok (StringTok _) = True isStringTok _ = False remTokP :: Parser Token remTokP = do _ <- keyword "REM" s <- many anyChar return (RemTok s) isRemTok :: Token -> Bool isRemTok (RemTok _) = True isRemTok _ = False dataTokP :: Parser Token dataTokP = do _ <- keyword "DATA" s <- many anyChar return (DataTok s) isDataTok :: Token -> Bool isDataTok (DataTok _) = True isDataTok _ = False charTokP :: Parser Token charTokP = do c <- legalChar; return (CharTok (toUpper c)) charTokTest :: (Char -> Bool) -> Token -> Bool charTokTest f (CharTok c) = f c charTokTest _ _ = False taggedCharToksToString :: [Tagged Token] -> String taggedCharToksToString = map (getCharTokChar . getTaggedVal) strToTokAssoc :: [(String, Token)] strToTokAssoc = [ (",", CommaTok), (":", ColonTok), (";", SemiTok), ("(", LParenTok), (")", RParenTok), ("$", DollarTok), ("%", PercentTok), ("=", EqTok), ("<>", NETok), ("<=", LETok), ("<", LTTok), (">=", GETok), (">", GTTok), ("+", PlusTok), ("-", MinusTok), ("*", MulTok), ("/", DivTok), ("^", PowTok), (".", DotTok), ("AND", AndTok), ("OR", OrTok), ("NOT", NotTok), ("LET", LetTok), ("DIM", DimTok), ("ON", OnTok), ("GO", GoTok), ("SUB", SubTok), ("RETURN", ReturnTok), ("IF", IfTok), ("THEN", ThenTok), ("FOR", ForTok), ("TO", ToTok), ("STEP", StepTok), ("NEXT", NextTok), ("PRINT", PrintTok), ("?", PrintTok), ("INPUT", InputTok), ("RANDOMIZE", RandomizeTok), ("READ", ReadTok), ("RESTORE", RestoreTok), ("DEF", DefTok), ("FN", FnTok), ("END", EndTok), ("STOP", StopTok) ] ++ [(s, BuiltinTok b) | (b,s) <- builtinToStrAssoc] tokToStrAssoc :: [(Token, String)] tokToStrAssoc = [(t,s) | (s,t) <- strToTokAssoc] anyTokP :: Parser Token anyTokP = choice ([spaceTokP, stringTokP, remTokP, dataTokP] ++ [do _ <- keyword s; whiteSpace; return t | (s,t) <- strToTokAssoc] ++ [charTokP]) "LEGAL BASIC CHARACTER" isBuiltinTok :: Token -> Bool isBuiltinTok (BuiltinTok _) = True isBuiltinTok _ = False taggedTokenP :: Parser (Tagged Token) taggedTokenP = do pos <- getPosition tok <- anyTokP return (Tagged pos tok) -- | The main parser used to read a series of tokens from a string. taggedTokensP :: Parser [Tagged Token] taggedTokensP = do toks <- many taggedTokenP eof "" return toks -- | The single-token parser used at the parser level tokenP :: (Token -> Bool) -> GenParser (Tagged Token) () (Tagged Token) tokenP test = token (printToken . getTaggedVal) getPosTag testTaggedToken where testTaggedToken (Tagged pos tok) = if test tok then Just (Tagged pos tok) else Nothing -- | Prettyprint a token for error reporting or debugging. printToken :: Token -> String printToken tok = case (lookup tok tokToStrAssoc) of (Just s) -> s Nothing -> case tok of (CharTok c) -> [c] (DataTok s) -> "DATA" ++ s (RemTok s) -> "REM" ++ s SpaceTok -> " " (StringTok s) -> "\"" ++ s ++ "\"" _ -> error "printToken: unrecognized token."