{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeFamilies #-} -- | -- Module : Swarm.Language.Parse -- Copyright : Brent Yorgey -- Maintainer : byorgey@gmail.com -- -- SPDX-License-Identifier: BSD-3-Clause -- -- Parser for the Swarm language. Note, you probably don't want to -- use this directly, unless there is a good reason to parse a term -- without also type checking it; use -- 'Swarm.Language.Pipeline.processTerm' instead, which parses, -- typechecks, elaborates, and capability checks a term all at once. module Swarm.Language.Parse ( -- * Reserved words reservedWords, -- * Parsers Parser, parsePolytype, parseType, parseTerm, binOps, unOps, -- * Utility functions runParser, runParserTH, readTerm, readTerm', showShortError, showErrorPos, getLocRange, ) where import Control.Monad.Combinators.Expr import Control.Monad.Reader import Data.Bifunctor import Data.Foldable (asum) import Data.List (nub) import Data.List.NonEmpty qualified (head) import Data.Map.Strict qualified as Map import Data.Maybe (fromMaybe, mapMaybe) import Data.Set qualified as S import Data.Set.Lens (setOf) import Data.Text (Text, index, toLower) import Data.Text qualified as T import Data.Void import Swarm.Language.Syntax import Swarm.Language.Types import Text.Megaparsec hiding (runParser) import Text.Megaparsec.Char import Text.Megaparsec.Char.Lexer qualified as L import Text.Megaparsec.Pos qualified as Pos import Witch -- Imports for doctests (cabal-docspec needs this) -- $setup -- >>> import qualified Data.Map.Strict as Map -- | When parsing a term using a quasiquoter (i.e. something in the -- Swarm source code that will be parsed at compile time), we want -- to allow antiquoting, i.e. writing something like $x to refer to -- an existing Haskell variable. But when parsing a term entered by -- the user at the REPL, we do not want to allow this syntax. data Antiquoting = AllowAntiquoting | DisallowAntiquoting deriving (Eq, Ord, Show) type Parser = ReaderT Antiquoting (Parsec Void Text) type ParserError = ParseErrorBundle Text Void -------------------------------------------------- -- Lexer -- | List of reserved words that cannot be used as variable names. reservedWords :: [Text] reservedWords = map (syntax . constInfo) (filter isUserFunc allConst) ++ map (dirSyntax . dirInfo) allDirs ++ [ "void" , "unit" , "int" , "text" , "dir" , "bool" , "robot" , "cmd" , "delay" , "let" , "def" , "end" , "in" , "true" , "false" , "forall" , "require" ] -- | Skip spaces and comments. sc :: Parser () sc = L.space space1 (L.skipLineComment "//") (L.skipBlockComment "/*" "*/") -- | In general, we follow the convention that every token parser -- assumes no leading whitespace and consumes all trailing -- whitespace. Concretely, we achieve this by wrapping every token -- parser using 'lexeme'. lexeme :: Parser a -> Parser a lexeme = L.lexeme sc -- | A lexeme consisting of a literal string. symbol :: Text -> Parser Text symbol = L.symbol sc -- | Parse a case-insensitive reserved word, making sure it is not a -- prefix of a longer variable name, and allowing the parser to -- backtrack if it fails. reserved :: Text -> Parser () reserved w = (lexeme . try) $ string' w *> notFollowedBy (alphaNumChar <|> char '_') -- | Parse an identifier, i.e. any non-reserved string containing -- alphanumeric characters and underscores and not starting with a -- number. identifier :: Parser Text identifier = (lexeme . try) (p >>= check) "variable name" where p = (:) <$> (letterChar <|> char '_') <*> many (alphaNumChar <|> char '_' <|> char '\'') check s | toLower t `elem` reservedWords = fail $ "reserved word '" ++ s ++ "' cannot be used as variable name" | otherwise = return t where t = into @Text s -- | Parse a text literal (including escape sequences) in double quotes. textLiteral :: Parser Text textLiteral = into <$> lexeme (char '"' >> manyTill L.charLiteral (char '"')) -- | Parse a positive integer literal token, in decimal, binary, -- octal, or hexadecimal notation. Note that negation is handled as -- a separate operator. integer :: Parser Integer integer = label "integer literal" $ lexeme $ do n <- string "0b" *> L.binary <|> string "0o" *> L.octal <|> string "0x" *> L.hexadecimal <|> L.decimal notFollowedBy alphaNumChar return n braces :: Parser a -> Parser a braces = between (symbol "{") (symbol "}") parens :: Parser a -> Parser a parens = between (symbol "(") (symbol ")") -------------------------------------------------- -- Parser -- | Parse a Swarm language polytype, which starts with an optional -- quanitifation (@forall@ followed by one or more variables and a -- period) followed by a type. Note that anything accepted by -- 'parseType' is also accepted by 'parsePolytype'. parsePolytype :: Parser Polytype parsePolytype = join $ quantify <$> (fromMaybe [] <$> optional (reserved "forall" *> some identifier <* symbol ".")) <*> parseType where quantify :: [Var] -> Type -> Parser Polytype quantify xs ty -- Iplicitly quantify over free type variables if the user didn't write a forall | null xs = return $ Forall (S.toList free) ty -- Otherwise, require all variables to be explicitly quantified | S.null free = return $ Forall xs ty | otherwise = fail $ unlines [ " Type contains free variable(s): " ++ unwords (map from (S.toList free)) , " Try adding them to the 'forall'." ] where free = tyVars ty `S.difference` S.fromList xs -- | Parse a Swarm language (mono)type. parseType :: Parser Type parseType = makeExprParser parseTypeAtom table where table = [ [InfixR ((:*:) <$ symbol "*")] , [InfixR ((:+:) <$ symbol "+")] , [InfixR ((:->:) <$ symbol "->")] ] parseTypeAtom :: Parser Type parseTypeAtom = TyVoid <$ reserved "void" <|> TyUnit <$ reserved "unit" <|> TyVar <$> identifier <|> TyInt <$ reserved "int" <|> TyText <$ reserved "text" <|> TyDir <$ reserved "dir" <|> TyBool <$ reserved "bool" <|> TyRobot <$ reserved "robot" <|> TyCmd <$> (reserved "cmd" *> parseTypeAtom) <|> TyDelay <$> braces parseType <|> parens parseType parseDirection :: Parser Direction parseDirection = asum (map alternative allDirs) "direction constant" where alternative d = d <$ (reserved . dirSyntax . dirInfo) d -- | Parse Const as reserved words (e.g. @Fail <$ reserved "fail"@) parseConst :: Parser Const parseConst = asum (map alternative consts) "built-in user function" where consts = filter isUserFunc allConst alternative c = c <$ reserved (syntax $ constInfo c) -- | Add 'Location' to a parser parseLocG :: Parser a -> Parser (Location, a) parseLocG pa = do start <- getOffset a <- pa end <- getOffset pure (Location start end, a) -- | Add 'Location' to a 'Term' parser parseLoc :: Parser Term -> Parser Syntax parseLoc pterm = uncurry Syntax <$> parseLocG pterm parseTermAtom :: Parser Syntax parseTermAtom = parseLoc ( TUnit <$ symbol "()" <|> TConst <$> parseConst <|> TVar <$> identifier <|> TDir <$> parseDirection <|> TInt <$> integer <|> TText <$> textLiteral <|> TBool <$> ((True <$ reserved "true") <|> (False <$ reserved "false")) <|> reserved "require" *> ( ( TRequireDevice <$> (textLiteral "device name in double quotes") ) <|> ( TRequire <$> (fromIntegral <$> integer) <*> (textLiteral "entity name in double quotes") ) ) <|> SLam <$> (symbol "\\" *> identifier) <*> optional (symbol ":" *> parseType) <*> (symbol "." *> parseTerm) <|> sLet <$> (reserved "let" *> identifier) <*> optional (symbol ":" *> parsePolytype) <*> (symbol "=" *> parseTerm) <*> (reserved "in" *> parseTerm) <|> sDef <$> (reserved "def" *> identifier) <*> optional (symbol ":" *> parsePolytype) <*> (symbol "=" *> parseTerm <* reserved "end") <|> parens (mkTuple <$> (parseTerm `sepBy` symbol ",")) ) -- Potential syntax for explicitly requesting memoized delay. -- Perhaps we will not need this in the end; see the discussion at -- https://github.com/swarm-game/swarm/issues/150 . -- <|> parseLoc (TDelay SimpleDelay (TConst Noop) <$ try (symbol "{{" *> symbol "}}")) -- <|> parseLoc (SDelay MemoizedDelay <$> dbraces parseTerm) <|> parseLoc (TDelay SimpleDelay (TConst Noop) <$ try (symbol "{" *> symbol "}")) <|> parseLoc (SDelay SimpleDelay <$> braces parseTerm) <|> parseLoc (ask >>= (guard . (== AllowAntiquoting)) >> parseAntiquotation) mkTuple :: [Syntax] -> Term mkTuple [] = TUnit mkTuple [STerm x] = x mkTuple (x : xs) = SPair x (STerm (mkTuple xs)) -- | Construct an 'SLet', automatically filling in the Boolean field -- indicating whether it is recursive. sLet :: Var -> Maybe Polytype -> Syntax -> Syntax -> Term sLet x ty t1 = SLet (x `S.member` setOf fv (sTerm t1)) x ty t1 -- | Construct an 'SDef', automatically filling in the Boolean field -- indicating whether it is recursive. sDef :: Var -> Maybe Polytype -> Syntax -> Term sDef x ty t = SDef (x `S.member` setOf fv (sTerm t)) x ty t parseAntiquotation :: Parser Term parseAntiquotation = TAntiText <$> (lexeme . try) (symbol "$str:" *> identifier) <|> TAntiInt <$> (lexeme . try) (symbol "$int:" *> identifier) -- | Parse a Swarm language term. parseTerm :: Parser Syntax parseTerm = sepEndBy1 parseStmt (symbol ";") >>= mkBindChain mkBindChain :: [Stmt] -> Parser Syntax mkBindChain stmts = case last stmts of Binder x _ -> return $ foldr mkBind (STerm (TApp (TConst Return) (TVar x))) stmts BareTerm t -> return $ foldr mkBind t (init stmts) where mkBind (BareTerm t1) t2 = loc t1 t2 $ SBind Nothing t1 t2 mkBind (Binder x t1) t2 = loc t1 t2 $ SBind (Just x) t1 t2 loc a b = Syntax $ sLoc a <> sLoc b data Stmt = BareTerm Syntax | Binder Text Syntax deriving (Show) parseStmt :: Parser Stmt parseStmt = mkStmt <$> optional (try (identifier <* symbol "<-")) <*> parseExpr mkStmt :: Maybe Text -> Syntax -> Stmt mkStmt Nothing = BareTerm mkStmt (Just x) = Binder x -- | When semicolons are missing between definitions, for example: -- def a = 1 end def b = 2 end def c = 3 end -- The makeExprParser produces: -- App (App (TDef a) (TDef b)) (TDef x) -- This function fix that by converting the Apps into Binds, so that it results in: -- Bind a (Bind b (Bind c)) fixDefMissingSemis :: Syntax -> Syntax fixDefMissingSemis term = case nestedDefs term [] of [] -> term defs -> foldr1 mkBind defs where mkBind t1 t2 = Syntax (sLoc t1 <> sLoc t2) $ SBind Nothing t1 t2 nestedDefs term' acc = case term' of def@(Syntax _ SDef {}) -> def : acc (Syntax _ (SApp nestedTerm def@(Syntax _ SDef {}))) -> nestedDefs nestedTerm (def : acc) -- Otherwise returns an empty list to keep the term unchanged _ -> [] parseExpr :: Parser Syntax parseExpr = fixDefMissingSemis <$> makeExprParser parseTermAtom table where table = snd <$> Map.toDescList tableMap tableMap = Map.unionsWith (++) [ Map.singleton 9 [InfixL (exprLoc2 $ SApp <$ string "")] , binOps , unOps ] -- add location for ExprParser by combining all exprLoc2 :: Parser (Syntax -> Syntax -> Term) -> Parser (Syntax -> Syntax -> Syntax) exprLoc2 p = do (l, f) <- parseLocG p pure $ \s1 s2 -> Syntax (l <> sLoc s1 <> sLoc s2) $ f s1 s2 -- | Precedences and parsers of binary operators. -- -- >>> Map.map length binOps -- fromList [(0,1),(2,1),(3,1),(4,6),(6,3),(7,2),(8,1)] binOps :: Map.Map Int [Operator Parser Syntax] binOps = Map.unionsWith (++) $ mapMaybe binOpToTuple allConst where binOpToTuple c = do let ci = constInfo c ConstMBinOp assoc <- pure (constMeta ci) let assI = case assoc of L -> InfixL N -> InfixN R -> InfixR pure $ Map.singleton (fixity ci) [assI (mkOp c <$ operatorString (syntax ci))] -- | Precedences and parsers of unary operators (currently only 'Neg'). -- -- >>> Map.map length unOps -- fromList [(7,1)] unOps :: Map.Map Int [Operator Parser Syntax] unOps = Map.unionsWith (++) $ mapMaybe unOpToTuple allConst where unOpToTuple c = do let ci = constInfo c ConstMUnOp assoc <- pure (constMeta ci) let assI = case assoc of P -> Prefix S -> Postfix pure $ Map.singleton (fixity ci) [assI (exprLoc1 $ SApp (noLoc $ TConst c) <$ operatorString (syntax ci))] -- combine location for ExprParser exprLoc1 :: Parser (Syntax -> Term) -> Parser (Syntax -> Syntax) exprLoc1 p = do (l, f) <- parseLocG p pure $ \s -> Syntax (l <> sLoc s) $ f s operatorString :: Text -> Parser Text operatorString n = (lexeme . try) (string n <* notFollowedBy operatorSymbol) operatorSymbol :: Parser Text operatorSymbol = T.singleton <$> oneOf opChars where isOp = \case { ConstMFunc {} -> False; _ -> True } . constMeta opChars = nub . concatMap (from . syntax) . filter isOp $ map constInfo allConst -------------------------------------------------- -- Utilities -- | Run a parser on some input text, returning either the result or a -- pretty-printed parse error message. runParser :: Parser a -> Text -> Either Text a runParser p t = first (from . errorBundlePretty) (parse (runReaderT p DisallowAntiquoting) "" t) -- | A utility for running a parser in an arbitrary 'MonadFail' (which -- is going to be the TemplateHaskell 'Q' monad --- see -- "Swarm.Language.Parse.QQ"), with a specified source position. runParserTH :: (Monad m, MonadFail m) => (String, Int, Int) -> Parser a -> String -> m a runParserTH (file, line, col) p s = case snd (runParser' (runReaderT (fully p) AllowAntiquoting) initState) of Left err -> fail $ errorBundlePretty err Right e -> return e where -- This is annoying --- megaparsec does not export its function to -- construct an initial parser state, so we can't just use that -- and then change the one field we need to be different (the -- pstateSourcePos). We have to copy-paste the whole thing. initState :: State Text Void initState = State { stateInput = from s , stateOffset = 0 , statePosState = PosState { pstateInput = from s , pstateOffset = 0 , pstateSourcePos = SourcePos file (mkPos line) (mkPos col) , pstateTabWidth = defaultTabWidth , pstateLinePrefix = "" } , stateParseErrors = [] } -- | Run a parser "fully", consuming leading whitespace and ensuring -- that the parser extends all the way to eof. fully :: Parser a -> Parser a fully p = sc *> p <* eof -- | Run a parser "fully", consuming leading whitespace (including the -- possibility that the input is nothing but whitespace) and -- ensuring that the parser extends all the way to eof. fullyMaybe :: Parser a -> Parser (Maybe a) fullyMaybe = fully . optional -- | Parse some input 'Text' completely as a 'Term', consuming leading -- whitespace and ensuring the parsing extends all the way to the -- end of the input 'Text'. Returns either the resulting 'Term' (or -- @Nothing@ if the input was only whitespace) or a pretty-printed -- parse error message. readTerm :: Text -> Either Text (Maybe Syntax) readTerm = runParser (fullyMaybe parseTerm) -- | A lower-level `readTerm` which returns the megaparsec bundle error -- for precise error reporting. readTerm' :: Text -> Either ParserError (Maybe Syntax) readTerm' = parse (runReaderT (fullyMaybe parseTerm) DisallowAntiquoting) "" -- | A utility for converting a ParserError into a one line message: -- : showShortError :: ParserError -> String showShortError pe = show (line + 1) <> ": " <> from msg where ((line, _), _, msg) = showErrorPos pe -- | A utility for converting a ParseError into a range and error message. showErrorPos :: ParserError -> ((Int, Int), (Int, Int), Text) showErrorPos (ParseErrorBundle errs sourcePS) = (minusOne start, minusOne end, from msg) where -- convert megaparsec source pos to starts at 0 minusOne (x, y) = (x - 1, y - 1) -- get the first error position (ps) and line content (str) err = Data.List.NonEmpty.head errs offset = case err of TrivialError x _ _ -> x FancyError x _ -> x (str, ps) = reachOffset offset sourcePS msg = parseErrorTextPretty err -- extract the error starting position start@(line, col) = getLineCol ps -- compute the ending position based on the word at starting position wordlength = case break (== ' ') . drop col <$> str of Just (word, _) -> length word + 1 _ -> 0 end = (line, col + wordlength) getLineCol :: PosState a -> (Int, Int) getLineCol ps = (line, col) where line = unPos $ sourceLine $ pstateSourcePos ps col = unPos $ sourceColumn $ pstateSourcePos ps -- | A utility for converting a Location into a range getLocRange :: Text -> (Int, Int) -> ((Int, Int), (Int, Int)) getLocRange code (locStart, locEnd) = (start, end) where start = getLocPos locStart end = getLocPos (dropWhiteSpace locEnd) -- remove trailing whitespace that got included by the lexer dropWhiteSpace offset | isWhiteSpace offset = dropWhiteSpace (offset - 1) | otherwise = offset isWhiteSpace offset = -- Megaparsec offset needs to be (-1) to start at 0 Data.Text.index code (offset - 1) `elem` [' ', '\n', '\r', '\t'] -- using megaparsec offset facility, compute the line/col getLocPos offset = let sourcePS = PosState { pstateInput = code , pstateOffset = 0 , pstateSourcePos = Pos.initialPos "" , pstateTabWidth = Pos.defaultTabWidth , pstateLinePrefix = "" } (_, ps) = reachOffset offset sourcePS in getLineCol ps