module Text.Jasmine.Token ( identifier , reserved , whiteSpace , dec , hex , autoSemi , autoSemi' , rOp , lChar , lexeme -- Testing , isAlpha ) where -- --------------------------------------------------------------------- import Control.Applicative ( (<|>) ) import Data.Attoparsec.Char8 (hexadecimal, decimal, char, Parser, satisfy, try, many, (), skipMany, skipMany1, isDigit, string, skipWhile, anyChar) import Data.Char ( isAlpha, isSpace ) import qualified Data.Set as Set import qualified Data.ByteString.Char8 as S8 -- --------------------------------------------------------------------- -- Do not use the lexer, it is greedy and consumes subsequent symbols, -- e.g. "!" in a==!b rOp :: String -> Parser () rOp x = lexeme $ do { _ <- string $ S8.pack x; return () } lChar :: Char -> Parser () lChar x = lexeme $ do { _ <- char x; return () } -- --------------------------------------------------------------------- -- Need to deal with the following cases -- 1. Missing semi, because following } => empty -- 2. Additional semi, with following } => empty -- 3. semi with no following } => semi -- 4. no semi, but prior NL => semi autoSemi :: Parser [Char] autoSemi = do{ _ <- rOp ";"; return (";");} <|> return ("") autoSemi' :: Parser [Char] autoSemi' = do{ _ <- rOp ";"; return (";");} -- --------------------------------------------------------------------- --identifier = lexeme $ many1 (letter <|> oneOf "_") identifier :: Parser String identifier = lexeme $ try $ do{ name <- ident ; if (isReservedName name) then fail ("reserved word " ++ show name) else return name } ident :: Parser [Char] ident = do{ c <- identStart ; cs <- many identLetter ; return (c:cs) } "identifier" isReservedName :: String -> Bool isReservedName = flip Set.member reservedNames -- TODO: fix trailing characters test reserved :: String -> Parser () reserved name = lexeme $ do { _ <- string $ S8.pack name; return () } {- reserved name = lexeme $ try $ do{ _ <- string name ; notFollowedBy identLetter ("end of " ++ show name) } -} --whiteSpace = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment "") --whiteSpace = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment <|> do { _ <- char '\n'; setNLFlag} "") whiteSpace :: Parser () whiteSpace = skipMany (do { _ <- string $ S8.pack "\n"; return ()} <|> simpleSpace <|> comment "") -- whiteSpace = try $ many $ (do { equal TokenWhite } <|> do { (equal TokenNL); setNLFlag}) simpleSpace :: Parser () -- Note : Data.Char.isSpace includes a variety of non-breaking space chars, and unicode variants simpleSpace = skipMany1 $ (satisfy isSpace) comment :: Parser () comment = do isMulti <- try $ do _ <- char fslash (char asterisk >> return True) <|> (char fslash >> return False) if isMulti then inComment else skipWhile (/= linefeed) inComment :: Parser () inComment = theEnd <|> (anyChar >> inComment) "end of comment" where theEnd = char asterisk >> (char fslash >> return ()) <|> inComment fslash, asterisk, linefeed :: Char fslash = '/' asterisk = '*' linefeed = '\n' identLetter :: Parser Char identLetter = alphaNum <|> oneOf "_" identStart :: Parser Char identStart = letter <|> oneOf "_$" reservedNames :: Set.Set String reservedNames = Set.fromAscList [ "break", "case", "catch", "const", "continue", "debugger", "default", "delete", "do", "else", "enum", "false", "finally", "for", "function", "if", "in", "instanceof", "new", "null", "return", "switch", "this", "throw", "true", "try", "typeof", "var", "void", "while", "with" ] hex :: Parser Integer hex = lexeme $ do{ _ <- oneOf "xX"; hexadecimal } dec :: Parser Integer dec = lexeme $ decimal -- | @lexeme p@ first applies parser @p@ and than the 'whiteSpace' -- parser, returning the value of @p@. Every lexical -- token (lexeme) is defined using @lexeme@, this way every parse -- starts at a point without white space. Parsers that use @lexeme@ are -- called /lexeme/ parsers in this document. -- -- The only point where the 'whiteSpace' parser should be -- called explicitly is the start of the main parser in order to skip -- any leading white space. -- -- > mainParser = do{ whiteSpace -- > ; ds <- many (lexeme digit) -- > ; eof -- > ; return (sum ds) -- > } lexeme :: Parser b -> Parser b lexeme p = do{ x <- p; whiteSpace; return x } -- --------------------------------------------------------------------- -- Stuff from Parsec needed to make Attoparsec work -- | @oneOf cs@ succeeds if the current character is in the supplied -- list of characters @cs@. Returns the parsed character. See also -- 'satisfy'. -- -- > vowel = oneOf "aeiou" oneOf :: String -> Parser Char oneOf = satisfy . flip elem -- | Parses a letter or digit (a character between \'0\' and \'9\'). -- Returns the parsed character. alphaNum :: Parser Char alphaNum = satisfy isAlphaNum "letter or digit" -- | Parses a letter (an upper case or lower case character). Returns the -- parsed character. letter :: Parser Char letter = satisfy isAlpha "letter" -- | Parses a digit. Returns the parsed character. -- | Parses a hexadecimal digit (a digit or a letter between \'a\' and -- \'f\' or \'A\' and \'F\'). Returns the parsed character. isAlphaNum :: Char -> Bool isAlphaNum c = isAlpha c || isDigit c -- --------------------------------------------------------------------- {- javascriptDef :: LanguageDef st javascriptDef = javaStyle { reservedNames = [ "break", "case", "catch", "const", "continue", "debugger", "default", "delete", "do", "else", "enum", "false", "finally", "for", "function", "if", "in", "instanceof", "new", "null", "return", "switch", "this", "throw", "true", "try", "typeof", "var", "void", "while", "with" ] -- TODO: make the following constants, so the parser defn is simpler. e,g op_COMMA = "," , reservedOpNames= [ ";" , -- "SEMICOLON", "," , -- "COMMA", "?" , -- "HOOK", ":" , -- "COLON", "||" , -- "OR", "&&" , -- "AND", "|" , -- "BITWISE_OR", "^" , -- "BITWISE_XOR", "&" , -- "BITWISE_AND", "===" , -- "STRICT_EQ", "==" , -- "EQ", "=" , -- "ASSIGN", "!==" , -- "STRICT_NE", "!=" , -- "NE", "<<" , -- "LSH", "<=" , -- "LE", "<" , -- "LT", ">>>" , -- "URSH", ">>" , -- "RSH", ">=" , -- "GE", ">" , -- "GT", "++" , -- "INCREMENT", "--" , -- "DECREMENT", "+" , -- "PLUS", "-" , -- "MINUS", "*" , -- "MUL", "/" , -- "DIV", "%" , -- "MOD", "!" , -- "NOT", "~" , -- "BITWISE_NOT", "." , -- "DOT", "[" , -- "LEFT_BRACKET", "]" , -- "RIGHT_BRACKET", "{" , -- "LEFT_CURLY", "}" , -- "RIGHT_CURLY", "(" , -- "LEFT_PAREN", ")" , -- "RIGHT_PAREN", "@*/" -- "CONDCOMMENT_END" ] , opLetter = oneOf "!%&()*+,-./:;<=>?[]^{|}~" , opStart = opLetter javascriptDef , identStart = letter <|> oneOf "_" , identLetter = alphaNum <|> oneOf "_" , caseSensitive = True } -} -- EOF