-- | -- The first step in the parsing process - turns source code into a list of lexemes -- module Language.PureScript.Parser.Lexer ( PositionedToken(..) , Token() , TokenParser() , lex , lexLenient , anyToken , token , match , lparen , rparen , parens , lbrace , rbrace , braces , lsquare , rsquare , squares , indent , indentAt , larrow , rarrow , lfatArrow , rfatArrow , colon , doubleColon , equals , pipe , tick , dot , comma , semi , at , underscore , holeLit , semiSep , semiSep1 , commaSep , commaSep1 , lname , lname' , qualifier , tyname , kiname , dconsname , uname , uname' , mname , reserved , symbol , symbol' , identifier , charLiteral , stringLiteral , number , natural , reservedPsNames , reservedTypeNames , isSymbolChar , isUnquotedKey ) where import Prelude.Compat hiding (lex) import Control.Applicative ((<|>)) import Control.Monad (void, guard) import Control.Monad.Identity (Identity) import Data.Char (isSpace, isAscii, isSymbol, isAlphaNum) import Data.Monoid ((<>)) import Data.String (fromString) import Data.Text (Text) import qualified Data.Text as T import Language.PureScript.Comments import Language.PureScript.Parser.State import Language.PureScript.PSString (PSString) import qualified Text.Parsec as P import qualified Text.Parsec.Token as PT data Token = LParen | RParen | LBrace | RBrace | LSquare | RSquare | Indent Int | LArrow | RArrow | LFatArrow | RFatArrow | Colon | DoubleColon | Equals | Pipe | Tick | Dot | Comma | Semi | At | Underscore | LName Text | UName Text | Qualifier Text | Symbol Text | CharLiteral Char | StringLiteral PSString | Number (Either Integer Double) | HoleLit Text deriving (Show, Eq, Ord) prettyPrintToken :: Token -> Text prettyPrintToken LParen = "(" prettyPrintToken RParen = ")" prettyPrintToken LBrace = "{" prettyPrintToken RBrace = "}" prettyPrintToken LSquare = "[" prettyPrintToken RSquare = "]" prettyPrintToken LArrow = "<-" prettyPrintToken RArrow = "->" prettyPrintToken LFatArrow = "<=" prettyPrintToken RFatArrow = "=>" prettyPrintToken Colon = ":" prettyPrintToken DoubleColon = "::" prettyPrintToken Equals = "=" prettyPrintToken Pipe = "|" prettyPrintToken Tick = "`" prettyPrintToken Dot = "." prettyPrintToken Comma = "," prettyPrintToken Semi = ";" prettyPrintToken At = "@" prettyPrintToken Underscore = "_" prettyPrintToken (Indent n) = "indentation at level " <> T.pack (show n) prettyPrintToken (LName s) = T.pack (show s) prettyPrintToken (UName s) = T.pack (show s) prettyPrintToken (Qualifier _) = "qualifier" prettyPrintToken (Symbol s) = s prettyPrintToken (CharLiteral c) = T.pack (show c) prettyPrintToken (StringLiteral s) = T.pack (show s) prettyPrintToken (Number n) = T.pack (either show show n) prettyPrintToken (HoleLit name) = "?" <> name data PositionedToken = PositionedToken { -- | Start position of this token ptSourcePos :: P.SourcePos -- | End position of this token (not including whitespace) , ptEndPos :: P.SourcePos -- | End position of the previous token , ptPrevEndPos :: Maybe P.SourcePos , ptToken :: Token , ptComments :: [Comment] } deriving (Eq) -- Parsec requires this instance for various token-level combinators instance Show PositionedToken where show = T.unpack . prettyPrintToken . ptToken type Lexer u a = P.Parsec Text u a lex :: FilePath -> Text -> Either P.ParseError [PositionedToken] lex f s = updatePositions <$> P.parse parseTokens f s updatePositions :: [PositionedToken] -> [PositionedToken] updatePositions [] = [] updatePositions (x:xs) = x : zipWith update (x:xs) xs where update PositionedToken { ptEndPos = pos } pt = pt { ptPrevEndPos = Just pos } parseTokens :: Lexer u [PositionedToken] parseTokens = whitespace *> P.many parsePositionedToken <* P.skipMany parseComment <* P.eof -- | Lexes the given file, and on encountering a parse error, returns the -- progress made up to that point, instead of returning an error lexLenient :: FilePath -> Text -> Either P.ParseError [PositionedToken] lexLenient f s = updatePositions <$> P.parse parseTokensLenient f s parseTokensLenient :: Lexer u [PositionedToken] parseTokensLenient = whitespace *> P.many parsePositionedToken <* P.skipMany parseComment whitespace :: Lexer u () whitespace = P.skipMany (P.satisfy isSpace) parseComment :: Lexer u Comment parseComment = (BlockComment <$> blockComment <|> LineComment <$> lineComment) <* whitespace where blockComment :: Lexer u Text blockComment = P.try $ P.string "{-" *> (T.pack <$> P.manyTill P.anyChar (P.try (P.string "-}"))) lineComment :: Lexer u Text lineComment = P.try $ P.string "--" *> (T.pack <$> P.manyTill P.anyChar (P.try (void (P.char '\n') <|> P.eof))) parsePositionedToken :: Lexer u PositionedToken parsePositionedToken = P.try $ do comments <- P.many parseComment pos <- P.getPosition tok <- parseToken pos' <- P.getPosition whitespace return $ PositionedToken pos pos' Nothing tok comments parseToken :: Lexer u Token parseToken = P.choice [ P.try $ P.string "<-" *> P.notFollowedBy symbolChar *> pure LArrow , P.try $ P.string "←" *> P.notFollowedBy symbolChar *> pure LArrow , P.try $ P.string "<=" *> P.notFollowedBy symbolChar *> pure LFatArrow , P.try $ P.string "⇐" *> P.notFollowedBy symbolChar *> pure LFatArrow , P.try $ P.string "->" *> P.notFollowedBy symbolChar *> pure RArrow , P.try $ P.string "→" *> P.notFollowedBy symbolChar *> pure RArrow , P.try $ P.string "=>" *> P.notFollowedBy symbolChar *> pure RFatArrow , P.try $ P.string "⇒" *> P.notFollowedBy symbolChar *> pure RFatArrow , P.try $ P.string "::" *> P.notFollowedBy symbolChar *> pure DoubleColon , P.try $ P.string "∷" *> P.notFollowedBy symbolChar *> pure DoubleColon , P.try $ P.char '(' *> pure LParen , P.try $ P.char ')' *> pure RParen , P.try $ P.char '{' *> pure LBrace , P.try $ P.char '}' *> pure RBrace , P.try $ P.char '[' *> pure LSquare , P.try $ P.char ']' *> pure RSquare , P.try $ P.char '`' *> pure Tick , P.try $ P.char ',' *> pure Comma , P.try $ P.char '=' *> P.notFollowedBy symbolChar *> pure Equals , P.try $ P.char ':' *> P.notFollowedBy symbolChar *> pure Colon , P.try $ P.char '|' *> P.notFollowedBy symbolChar *> pure Pipe , P.try $ P.char '.' *> P.notFollowedBy symbolChar *> pure Dot , P.try $ P.char ';' *> P.notFollowedBy symbolChar *> pure Semi , P.try $ P.char '@' *> P.notFollowedBy symbolChar *> pure At , P.try $ P.char '_' *> P.notFollowedBy identLetter *> pure Underscore , HoleLit <$> P.try (P.char '?' *> (T.pack <$> P.many1 identLetter)) , LName <$> parseLName , parseUName >>= \uName -> guard (validModuleName uName) *> (Qualifier uName <$ P.char '.') <|> pure (UName uName) , Symbol <$> parseSymbol , CharLiteral <$> parseCharLiteral , StringLiteral <$> parseStringLiteral , Number <$> parseNumber ] where parseLName :: Lexer u Text parseLName = T.cons <$> identStart <*> (T.pack <$> P.many identLetter) parseUName :: Lexer u Text parseUName = T.cons <$> P.upper <*> (T.pack <$> P.many identLetter) parseSymbol :: Lexer u Text parseSymbol = T.pack <$> P.many1 symbolChar identStart :: Lexer u Char identStart = P.lower <|> P.oneOf "_" identLetter :: Lexer u Char identLetter = P.alphaNum <|> P.oneOf "_'" symbolChar :: Lexer u Char symbolChar = P.satisfy isSymbolChar parseCharLiteral :: Lexer u Char parseCharLiteral = P.try $ do { c <- PT.charLiteral tokenParser; if fromEnum c > 0xFFFF then P.unexpected "astral code point in character literal; characters must be valid UTF-16 code units" else return c } parseStringLiteral :: Lexer u PSString parseStringLiteral = fromString <$> (blockString <|> PT.stringLiteral tokenParser) where delimiter = P.try (P.string "\"\"\"") blockString = delimiter *> P.manyTill P.anyChar delimiter parseNumber :: Lexer u (Either Integer Double) parseNumber = (consumeLeadingZero *> P.parserZero) <|> (Right <$> P.try (PT.float tokenParser) <|> Left <$> P.try (PT.natural tokenParser)) P. "number" where -- lookAhead doesn't consume any input if its parser succeeds -- if notFollowedBy fails though, the consumed '0' will break the choice chain consumeLeadingZero = P.lookAhead (P.char '0' *> (P.notFollowedBy P.digit P. "no leading zero in number literal")) -- | -- We use Text.Parsec.Token to implement the string and number lexemes -- langDef :: PT.GenLanguageDef Text u Identity langDef = PT.LanguageDef { PT.reservedNames = [] , PT.reservedOpNames = [] , PT.commentStart = "" , PT.commentEnd = "" , PT.commentLine = "" , PT.nestedComments = True , PT.identStart = P.parserFail "Identifiers not supported" , PT.identLetter = P.parserFail "Identifiers not supported" , PT.opStart = P.parserFail "Operators not supported" , PT.opLetter = P.parserFail "Operators not supported" , PT.caseSensitive = True } -- | -- A token parser based on the language definition -- tokenParser :: PT.GenTokenParser Text u Identity tokenParser = PT.makeTokenParser langDef type TokenParser a = P.Parsec [PositionedToken] ParseState a anyToken :: TokenParser PositionedToken anyToken = P.token (T.unpack . prettyPrintToken . ptToken) ptSourcePos Just token :: (Token -> Maybe a) -> TokenParser a token f = P.token (T.unpack . prettyPrintToken . ptToken) ptSourcePos (f . ptToken) match :: Token -> TokenParser () match tok = token (\tok' -> if tok == tok' then Just () else Nothing) P. T.unpack (prettyPrintToken tok) lparen :: TokenParser () lparen = match LParen rparen :: TokenParser () rparen = match RParen parens :: TokenParser a -> TokenParser a parens = P.between lparen rparen lbrace :: TokenParser () lbrace = match LBrace rbrace :: TokenParser () rbrace = match RBrace braces :: TokenParser a -> TokenParser a braces = P.between lbrace rbrace lsquare :: TokenParser () lsquare = match LSquare rsquare :: TokenParser () rsquare = match RSquare squares :: TokenParser a -> TokenParser a squares = P.between lsquare rsquare indent :: TokenParser Int indent = token go P. "indentation" where go (Indent n) = Just n go _ = Nothing indentAt :: P.Column -> TokenParser () indentAt n = token go P. "indentation at level " ++ show n where go (Indent n') | n == n' = Just () go _ = Nothing larrow :: TokenParser () larrow = match LArrow rarrow :: TokenParser () rarrow = match RArrow lfatArrow :: TokenParser () lfatArrow = match LFatArrow rfatArrow :: TokenParser () rfatArrow = match RFatArrow colon :: TokenParser () colon = match Colon doubleColon :: TokenParser () doubleColon = match DoubleColon equals :: TokenParser () equals = match Equals pipe :: TokenParser () pipe = match Pipe tick :: TokenParser () tick = match Tick dot :: TokenParser () dot = match Dot comma :: TokenParser () comma = match Comma semi :: TokenParser () semi = match Semi at :: TokenParser () at = match At underscore :: TokenParser () underscore = match Underscore holeLit :: TokenParser Text holeLit = token go P. "hole literal" where go (HoleLit n) = Just n go _ = Nothing -- | -- Parse zero or more values separated by semicolons -- semiSep :: TokenParser a -> TokenParser [a] semiSep = flip P.sepBy semi -- | -- Parse one or more values separated by semicolons -- semiSep1 :: TokenParser a -> TokenParser [a] semiSep1 = flip P.sepBy1 semi -- | -- Parse zero or more values separated by commas -- commaSep :: TokenParser a -> TokenParser [a] commaSep = flip P.sepBy comma -- | -- Parse one or more values separated by commas -- commaSep1 :: TokenParser a -> TokenParser [a] commaSep1 = flip P.sepBy1 comma lname :: TokenParser Text lname = token go P. "identifier" where go (LName s) = Just s go _ = Nothing lname' :: Text -> TokenParser () lname' s = token go P. show s where go (LName s') | s == s' = Just () go _ = Nothing qualifier :: TokenParser Text qualifier = token go P. "qualifier" where go (Qualifier s) = Just s go _ = Nothing reserved :: Text -> TokenParser () reserved s = token go P. show s where go (LName s') | s == s' = Just () go (Symbol s') | s == s' = Just () go _ = Nothing uname :: TokenParser Text uname = token go P. "proper name" where go (UName s) | validUName s = Just s go _ = Nothing uname' :: Text -> TokenParser () uname' s = token go P. "proper name" where go (UName s') | s == s' = Just () go _ = Nothing tyname :: TokenParser Text tyname = token go P. "type name" where go (UName s) = Just s go _ = Nothing kiname :: TokenParser Text kiname = token go P. "kind name" where go (UName s) = Just s go _ = Nothing dconsname :: TokenParser Text dconsname = token go P. "data constructor name" where go (UName s) = Just s go _ = Nothing mname :: TokenParser Text mname = token go P. "module name" where go (UName s) | validModuleName s = Just s go _ = Nothing symbol :: TokenParser Text symbol = token go P. "symbol" where go (Symbol s) = Just s go Colon = Just ":" go LFatArrow = Just "<=" go At = Just "@" go _ = Nothing symbol' :: Text -> TokenParser () symbol' s = token go P. show s where go (Symbol s') | s == s' = Just () go Colon | s == ":" = Just () go LFatArrow | s == "<=" = Just () go _ = Nothing charLiteral :: TokenParser Char charLiteral = token go P. "char literal" where go (CharLiteral c) = Just c go _ = Nothing stringLiteral :: TokenParser PSString stringLiteral = token go P. "string literal" where go (StringLiteral s) = Just s go _ = Nothing number :: TokenParser (Either Integer Double) number = token go P. "number" where go (Number n) = Just n go _ = Nothing natural :: TokenParser Integer natural = token go P. "natural" where go (Number (Left n)) = Just n go _ = Nothing identifier :: TokenParser Text identifier = token go P. "identifier" where go (LName s) | s `notElem` reservedPsNames = Just s go _ = Nothing validModuleName :: Text -> Bool validModuleName s = '_' `notElemT` s validUName :: Text -> Bool validUName s = '\'' `notElemT` s notElemT :: Char -> Text -> Bool notElemT c = not . T.any (== c) -- | -- A list of purescript reserved identifiers -- reservedPsNames :: [Text] reservedPsNames = [ "data" , "newtype" , "type" , "foreign" , "import" , "infixl" , "infixr" , "infix" , "class" , "instance" , "derive" , "module" , "case" , "of" , "if" , "then" , "else" , "do" , "let" , "true" , "false" , "in" , "where" ] reservedTypeNames :: [Text] reservedTypeNames = [ "forall", "where" ] -- | -- The characters allowed for use in operators -- isSymbolChar :: Char -> Bool isSymbolChar c = (c `elem` (":!#$%&*+./<=>?@\\^|-~" :: [Char])) || (not (isAscii c) && isSymbol c) -- | -- The characters allowed in the head of an unquoted record key -- isUnquotedKeyHeadChar :: Char -> Bool isUnquotedKeyHeadChar c = (c == '_') || isAlphaNum c -- | -- The characters allowed in the tail of an unquoted record key -- isUnquotedKeyTailChar :: Char -> Bool isUnquotedKeyTailChar c = (c `elem` ("_'" :: [Char])) || isAlphaNum c -- | -- Strings allowed to be left unquoted in a record key -- isUnquotedKey :: Text -> Bool isUnquotedKey t = case T.uncons t of Nothing -> False Just (hd, tl) -> isUnquotedKeyHeadChar hd && T.all isUnquotedKeyTailChar tl