{------------------------------------------------------------------------------- Copyright: Bernie Pope 2004 Module: Lexer Description: Lexical analysis for baskell's parser. Primary Authors: Bernie Pope -------------------------------------------------------------------------------} {- This file is part of baskell. baskell is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. baskell is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with baskell; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} module Lexer ( Token (..) , Symbol (..) , lexer ) where import Data.Char ( isDigit , isAlpha , isPrint , isLower ) import Text.ParserCombinators.Parsec.Pos ( newPos , setSourceColumn , incSourceColumn , incSourceLine ) import Text.ParserCombinators.Parsec ( SourcePos ) -------------------------------------------------------------------------------- newtype Token = Token (SourcePos, Symbol) instance Show Token where show (Token (pos, symbol)) = show symbol data Symbol = Word String | SingleQuoted String | Num Int | Equals | LeftRoundBracket | RightRoundBracket | LeftSquareBracket | RightSquareBracket | Comma | Colon | DoubleColon | SemiColon | BackSlash | Exclamation | Dash | GreaterThan | Bad String deriving (Eq) -- slighlty pretty printing of Symbols via Show. The -- prettiness is due to the fact that the Parsec parser -- uses show to print Symbols as part of error messages instance Show Symbol where show (Word s) = "word: " ++ s show (SingleQuoted s) = "single quoted string: " ++ s show (Num n) = "number: " ++ show n show Equals = "equals sign: '='" show LeftRoundBracket = "bracket: '('" show RightRoundBracket = "bracket: ')'" show LeftSquareBracket = "bracket: '['" show RightSquareBracket = "bracket: ']'" show Comma = "comma: ','" show Colon = "colon: ':'" show DoubleColon = "double colon: '::'" show SemiColon = "semi-colon: ';'" show BackSlash = "backslash: '\'" show Exclamation = "exclamation sign: '!'" show Dash = "dash '-'" show GreaterThan = "greater than sign: '>'" show (Bad str) = str -- turn a stream of characters into a stream of tokens lexer :: String -> String -> [Token] lexer filename input = lexWork (newPos filename 1 1) input lexWork :: SourcePos -> String -> [Token] lexWork pos [] = [] lexWork pos (x:xs) | x == '=' = simpleToken Equals nextCol | x == '(' = simpleToken LeftRoundBracket nextCol | x == ')' = simpleToken RightRoundBracket nextCol | x == '[' = simpleToken LeftSquareBracket nextCol | x == ']' = simpleToken RightSquareBracket nextCol | x == ',' = simpleToken Comma nextCol | x == '\n' = lexWork (nextLine pos) xs | x == ':' = lexColon pos xs | x == ';' = simpleToken SemiColon nextCol | x == '\\' = simpleToken BackSlash nextCol | x == '!' = simpleToken Exclamation nextCol | x == '-' = simpleToken Dash nextCol | x == '>' = simpleToken GreaterThan nextCol -- source location does not need to be accurate within a comment | x == '#' = lexWork pos (dropWhile (/= '\n') xs) | isWhiteSpace x = let nextPos = incSourceColumn pos 1 in lexWork nextPos xs | isDigit x = let (num, rest) = span isDigit (x:xs) nextPos = incSourceColumn pos (length num) in Token (pos, Num $ read num) : lexWork nextPos rest | isAlpha x = let (restWord, rest) = span isWordChar xs word = x:restWord nextPos = incSourceColumn pos (length word) in Token (pos, Word word) : lexWork nextPos rest -- quoted strings need special care - escaped quotes can appear -- within the string, and the string might not be terminated -- by a quote in the case of a lexical error | x == '\'' = let (thisString, rest) = lexTailQuotedString xs nextPos = incSourceColumn pos (length thisString + 2) in if null rest then [Token (pos, Bad $ "ill-quoted input: " ++ x:thisString)] else Token (pos, SingleQuoted thisString) : lexWork nextPos (tail rest) | otherwise = simpleToken (Bad $ "symbol: " ++ show x) nextCol where simpleToken :: Symbol -> (SourcePos -> SourcePos) -> [Token] simpleToken tok srcPosUpdate = Token (pos, tok) : lexWork (srcPosUpdate pos) xs isWhiteSpace :: Char -> Bool isWhiteSpace c = c `elem` " \t\r\f\v\xA0" isWordChar :: Char -> Bool isWordChar c = isAlpha c || isDigit c nextLine :: SourcePos -> SourcePos nextLine pos = incSourceLine (setSourceColumn pos 1) 1 nextCol :: SourcePos -> SourcePos nextCol pos = incSourceColumn pos 1 lexColon :: SourcePos -> String -> [Token] lexColon pos [] = [Token (pos, Colon)] lexColon pos (':':rest) = Token (pos, DoubleColon) : lexWork (incSourceColumn pos 2) rest lexColon pos (other:rest) = Token (pos, Colon) : lexWork (nextCol pos) (other:rest) -- lex the rest of a string following the first quote mark -- must skip escaped quotes, and escaped backslashes -- include the final quote mark in the result lexTailQuotedString :: String -> (String, String) lexTailQuotedString [] = ([], []) lexTailQuotedString str@('\'':xs) = ([], str) -- a backslash then a quote is escaped lexTailQuotedString ('\\':'\'':xs) = let (string, rest) = lexTailQuotedString xs in ('\'':string, rest) -- a backslash then a backslash is an escaped backslash lexTailQuotedString ('\\':'\\':xs) = let (string, rest) = lexTailQuotedString xs in ('\\':'\\':string, rest) lexTailQuotedString (x:xs) = let (string, rest) = lexTailQuotedString xs in (x:string, rest)