{-# LANGUAGE TemplateHaskell #-} ----------------------------------------------------------------------------- -- | -- Module : Disco.Parser -- Copyright : disco team and contributors -- Maintainer : byorgey@gmail.com -- -- SPDX-License-Identifier: BSD-3-Clause -- -- Parser to convert concrete Disco syntax into an (untyped, surface -- language) AST. -- ----------------------------------------------------------------------------- module Disco.Parser ( -- * Parser type and utilities DiscoParseError(..), Parser, runParser, withExts, indented, thenIndented -- * Lexer -- ** Basic lexemes , sc, lexeme, symbol, reservedOp , natural, reserved, reservedWords, ident -- ** Punctuation , parens, braces, angles, brackets , semi, comma, colon, dot, pipe , lambda -- * Disco parser -- ** Modules , wholeModule, parseModule, parseExtName, parseTopLevel, parseDecl , parseImport, parseModuleName -- ** Terms , term, parseTerm, parseTerm', parseExpr, parseAtom , parseContainer, parseEllipsis, parseContainerComp, parseQual , parseLet, parseTypeOp -- ** Case and patterns , parseCase, parseBranch, parseGuards, parseGuard , parsePattern, parseAtomicPattern -- ** Types , parseType, parseAtomicType , parsePolyTy ) where import Unbound.Generics.LocallyNameless (Name, bind, embed, fvAny, string2Name) import Unbound.Generics.LocallyNameless.Unsafe (unsafeUnbind) import Control.Monad.Combinators.Expr import Text.Megaparsec hiding (runParser) import qualified Text.Megaparsec as MP import Text.Megaparsec.Char import qualified Text.Megaparsec.Char.Lexer as L import Control.Lens (makeLenses, toListOf, use, (%=), (%~), (&), (.=)) import Control.Monad.State import Data.Char (isAlpha, isDigit) import Data.Foldable (asum) import Data.List (find, intercalate) import qualified Data.Map as M import Data.Maybe (fromMaybe) import Data.Ratio import Data.Set (Set) import qualified Data.Set as S import Disco.AST.Surface import Disco.Extensions import Disco.Module import Disco.Pretty (prettyStr) import Disco.Syntax.Operators import Disco.Syntax.Prims import Disco.Types import Polysemy (run) ------------------------------------------------------------ -- Lexer -- Some of the basic setup code for the parser taken from -- https://markkarpov.com/megaparsec/parsing-simple-imperative-language.html -- | Currently required indent level. data IndentMode where NoIndent :: IndentMode -- ^ Don't require indent. ThenIndent :: IndentMode -- ^ Parse one token without -- indent, then switch to @Indent@. Indent :: IndentMode -- ^ Require everything to be indented at -- least one space. -- | Extra custom state for the parser. data ParserState = ParserState { _indentMode :: IndentMode -- ^ Currently required level of indentation. , _enabledExts :: Set Ext -- ^ Set of enabled language extensions -- (some of which may affect parsing). } makeLenses ''ParserState initParserState :: ParserState initParserState = ParserState NoIndent S.empty -- OpaqueTerm is a wrapper around Term just to make ShowErrorComponent -- happy, which requires Eq and Ord instances; but we can't make Term -- an instance of either. newtype OpaqueTerm = OT Term instance Show OpaqueTerm where show (OT t) = show t instance Eq OpaqueTerm where _ == _ = True instance Ord OpaqueTerm where compare _ _ = EQ data DiscoParseError = ReservedVarName String | InvalidPattern OpaqueTerm deriving (Show, Eq, Ord) instance ShowErrorComponent DiscoParseError where showErrorComponent (ReservedVarName x) = "keyword \"" ++ x ++ "\" cannot be used as a variable name" showErrorComponent (InvalidPattern (OT t)) = "Invalid pattern: " ++ run (prettyStr t) errorComponentLen (ReservedVarName x) = length x errorComponentLen (InvalidPattern _) = 1 -- | A parser is a megaparsec parser of strings, with an extra layer -- of state to keep track of the current indentation level and -- language extensions, and some custom error messages. type Parser = StateT ParserState (MP.Parsec DiscoParseError String) -- | Run a parser from the initial state. runParser :: Parser a -> FilePath -> String -> Either (ParseErrorBundle String DiscoParseError) a runParser = MP.runParser . flip evalStateT initParserState -- | Run a parser under a specified 'IndentMode'. withIndentMode :: IndentMode -> Parser a -> Parser a withIndentMode m p = do indentMode .= m res <- p indentMode .= NoIndent return res -- | @indented p@ is just like @p@, except that every token must not -- start in the first column. indented :: Parser a -> Parser a indented = withIndentMode Indent -- | @indented p@ is just like @p@, except that every token after the -- first must not start in the first column. thenIndented :: Parser a -> Parser a thenIndented = withIndentMode ThenIndent -- | @requireIndent p@ possibly requires @p@ to be indented, depending -- on the current '_indentMode'. Used in the definition of -- 'lexeme' and 'symbol'. requireIndent :: Parser a -> Parser a requireIndent p = do l <- use indentMode case l of ThenIndent -> do a <- p indentMode .= Indent return a Indent -> L.indentGuard sc GT pos1 >> p NoIndent -> p -- | Locally set the enabled extensions within a subparser. withExts :: Set Ext -> Parser a -> Parser a withExts exts p = do oldExts <- use enabledExts enabledExts .= exts a <- p enabledExts .= oldExts return a -- | Locally enable some additional extensions within a subparser. withAdditionalExts :: Set Ext -> Parser a -> Parser a withAdditionalExts exts p = do oldExts <- use enabledExts enabledExts %= S.union exts a <- p enabledExts .= oldExts return a -- | Ensure that a specific extension is enabled, fail if not. ensureEnabled :: Ext -> Parser () ensureEnabled e = do exts <- use enabledExts guard $ e `S.member` exts -- | Generically consume whitespace, including comments. sc :: Parser () sc = L.space space1 lineComment empty {- no block comments in disco -} where lineComment = L.skipLineComment "--" -- | Parse a lexeme, that is, a parser followed by consuming -- whitespace. lexeme :: Parser a -> Parser a lexeme p = requireIndent $ L.lexeme sc p -- | Parse a given string as a lexeme. symbol :: String -> Parser String symbol s = requireIndent $ L.symbol sc s -- | Parse a reserved operator. reservedOp :: String -> Parser () reservedOp s = (lexeme . try) (string s *> notFollowedBy (oneOf opChar)) -- | Characters that can occur in an operator symbol. opChar :: [Char] opChar = "~!@#$%^&*-+=|<>?/\\." parens, braces, angles, brackets, bagdelims, fbrack, cbrack :: Parser a -> Parser a parens = between (symbol "(") (symbol ")") braces = between (symbol "{") (symbol "}") angles = between (symbol "<") (symbol ">") brackets = between (symbol "[") (symbol "]") bagdelims = between (symbol "⟅") (symbol "⟆") fbrack = between (symbol "⌊") (symbol "⌋") cbrack = between (symbol "⌈") (symbol "⌉") semi, comma, colon, dot, pipe, hash :: Parser String semi = symbol ";" comma = symbol "," colon = symbol ":" dot = symbol "." pipe = symbol "|" hash = symbol "#" -- | A literal ellipsis of two or more dots, @..@ ellipsis :: Parser String ellipsis = label "ellipsis (..)" $ concat <$> ((:) <$> dot <*> some dot) -- | The symbol that starts an anonymous function (either a backslash -- or a Greek λ). lambda :: Parser String lambda = symbol "\\" <|> symbol "λ" forall :: Parser () forall = () <$ symbol "∀" <|> reserved "forall" exists :: Parser () exists = () <$ symbol "∃" <|> reserved "exists" -- | Parse a natural number. natural :: Parser Integer natural = lexeme L.decimal "natural number" -- | Parse a nonnegative decimal of the form @xxx.yyyy[zzz]@, where -- the @y@s and bracketed @z@s are both optional as long as the -- other is present. (In other words, there must be something after -- the period.) For example, this parser accepts all of the -- following: -- -- > 2.0 -- > 2.333 -- > 2.33[45] -- > 2.[45] -- -- The idea is that brackets surround an infinitely repeating -- sequence of digits. -- -- We used to accept @2.@ with no trailing digits, but no longer do. -- See https://github.com/disco-lang/disco/issues/245 and Note -- [Trailing period]. decimal :: Parser Rational decimal = lexeme (readDecimal <$> some digit <* char '.' <*> fractionalPart ) where digit = satisfy isDigit fractionalPart = -- either some digits optionally followed by bracketed digits... (,) <$> some digit <*> optional (brackets (some digit)) -- ...or just bracketed digits. <|> ([],) <$> (Just <$> brackets (some digit)) readDecimal a (b, mrep) = read a % 1 -- integer part -- next part is just b/10^n + (if null b then 0 else read b) % (10^length b) -- repeating part + readRep (length b) mrep readRep _ Nothing = 0 readRep offset (Just rep) = read rep % (10^offset * (10^length rep - 1)) -- If s = 0.[rep] then 10^(length rep) * s = rep.[rep], so -- 10^(length rep) * s - s = rep, so -- -- s = rep/(10^(length rep) - 1). -- -- We also have to divide by 10^(length b) to shift it over -- past any non-repeating prefix. -- ~~~~ Note [Trailing period] -- -- We used to accept numbers with nothing after the trailing period, -- such as @2.@. However, this caused some problems with parsing: -- -- - First, https://github.com/disco-lang/disco/issues/99 which we -- solved by making sure there was not another period after the -- trailing period. -- - Next, https://github.com/disco-lang/disco/issues/245. -- -- I first tried solving #245 by disallowing *any* operator character -- after the trailing period, but then some tests in the test suite -- started failing, where we had written things like @1./(10^5)@. The -- problem is that when a period is followed by another operator -- symbol, sometimes we might want them to be parsed as an operator -- (as in @2.-4@, #245), and sometimes we might not (as in -- @1./(10^5)@). So in the end it seems simpler and cleaner to -- require at least a 0 digit after the period --- just like pretty -- much every other programming language and just like standard -- mathematical practice. -- | Parse a reserved word. reserved :: String -> Parser () reserved w = (lexeme . try) $ string w *> notFollowedBy alphaNumChar -- | The list of all reserved words. reservedWords :: [String] reservedWords = [ "unit", "true", "false", "True", "False", "let", "in", "is" , "if", "when" , "otherwise", "and", "or", "mod", "choose", "implies", "iff" , "min", "max" , "union", "∪", "intersect", "∩", "subset", "⊆", "elem", "∈" , "enumerate", "count", "divides" , "Void", "Unit", "Bool", "Boolean", "Proposition", "Prop", "Char" , "Nat", "Natural", "Int", "Integer", "Frac", "Fractional", "Rational", "Fin" , "List", "Bag", "Set", "Graph", "Map" , "N", "Z", "F", "Q", "ℕ", "ℤ", "𝔽", "ℚ" , "∀", "forall", "∃", "exists", "type" , "import", "using" ] -- | Parse an identifier, i.e. any non-reserved string beginning with -- a given type of character and continuing with alphanumerics, -- underscores, and apostrophes. identifier :: Parser Char -> Parser String identifier begin = (lexeme . try) (p >>= check) "variable name" where p = (:) <$> begin <*> many identChar identChar = alphaNumChar <|> oneOf "_'" check x | x `elem` reservedWords = do -- back up to beginning of bad token to report correct position updateParserState (\s -> s { stateOffset = stateOffset s - length x }) customFailure $ ReservedVarName x | otherwise = return x -- | Parse an 'identifier' and turn it into a 'Name'. ident :: Parser (Name Term) ident = string2Name <$> identifier letterChar ------------------------------------------------------------ -- Parser -- | Results from parsing a block of top-level things. data TLResults = TLResults { _tlDecls :: [Decl] , _tlDocs :: [(Name Term, [DocThing])] , _tlTerms :: [Term] } emptyTLResults :: TLResults emptyTLResults = TLResults [] [] [] makeLenses ''TLResults -- | Parse the entire input as a module (with leading whitespace and -- no leftovers). wholeModule :: LoadingMode -> Parser Module wholeModule = between sc eof . parseModule -- | Parse an entire module (a list of declarations ended by -- semicolons). The 'LoadingMode' parameter tells us whether to -- include or replace any language extensions enabled at the top -- level. We include them when parsing a module entered at the -- REPL, and replace them when parsing a standalone module. parseModule :: LoadingMode -> Parser Module parseModule mode = do exts <- S.fromList <$> many parseExtension let extFun = case mode of Standalone -> withExts REPL -> withAdditionalExts extFun exts $ do imports <- many parseImport topLevel <- many parseTopLevel let theMod = mkModule exts imports topLevel return theMod where groupTLs :: [DocThing] -> [TopLevel] -> TLResults groupTLs _ [] = emptyTLResults groupTLs revDocs (TLDoc doc : rest) = groupTLs (doc : revDocs) rest groupTLs revDocs (TLDecl decl@(DType (TypeDecl x _)) : rest) = groupTLs [] rest & tlDecls %~ (decl :) & tlDocs %~ ((x, reverse revDocs) :) groupTLs revDocs (TLDecl decl@(DTyDef (TypeDefn x _ _)) : rest) = groupTLs [] rest & tlDecls %~ (decl :) & tlDocs %~ ((string2Name x, reverse revDocs) :) groupTLs _ (TLDecl defn : rest) = groupTLs [] rest & tlDecls %~ (defn :) groupTLs _ (TLExpr t : rest) = groupTLs [] rest & tlTerms %~ (t:) defnGroups :: [Decl] -> [Decl] defnGroups [] = [] defnGroups (d@DType{} : ds) = d : defnGroups ds defnGroups (d@DTyDef{} : ds) = d : defnGroups ds defnGroups (DDefn (TermDefn x bs) : ds) = DDefn (TermDefn x (bs ++ concatMap (\(TermDefn _ cs) -> cs) grp)) : defnGroups rest where (grp, rest) = matchDefn ds matchDefn :: [Decl] -> ([TermDefn], [Decl]) matchDefn (DDefn t@(TermDefn x' _) : ds2) | x == x' = (t:ts, ds2') where (ts, ds2') = matchDefn ds2 matchDefn ds2 = ([], ds2) mkModule exts imps tls = Module exts imps (defnGroups decls) docs terms where TLResults decls docs terms = groupTLs [] tls -- | Parse an extension. parseExtension :: Parser Ext parseExtension = L.nonIndented sc $ reserved "using" *> parseExtName -- | Parse the name of a language extension (case-insensitive). parseExtName :: Parser Ext parseExtName = choice (map parseOneExt allExtsList) "language extension name" where parseOneExt ext = ext <$ lexeme (string' (show ext) :: Parser String) -- | Parse an import, of the form @import @. parseImport :: Parser String parseImport = L.nonIndented sc $ reserved "import" *> parseModuleName -- | Parse the name of a module. parseModuleName :: Parser String parseModuleName = lexeme $ intercalate "/" <$> (some (alphaNumChar <|> oneOf "_-") `sepBy` char '/') <* optional (string ".disco") -- | Parse a top level item (either documentation or a declaration), -- which must start at the left margin. parseTopLevel :: Parser TopLevel parseTopLevel = L.nonIndented sc $ TLDoc <$> parseDocThing <|> TLDecl <$> parseDecl -- See Note [Parsing definitions and top-level expressions] <|> TLExpr <$> thenIndented parseTerm -- ~~~~ Note [Parsing definitions and top-level expressions] -- -- The beginning of a definition might look the same as an -- expression. e.g. is f(x,y) the start of a definition of f, or an -- expression with a function call? We used to therefore wrap -- 'parseDecl' in 'try'. The problem is that if a definition has a -- syntax error on the RHS, it would fail, backtrack, then try -- parsing a top-level expression and fail when it got to the = -- sign, giving an uninformative parse error message. -- See https://github.com/disco-lang/disco/issues/346. -- -- The solution is that we now do more careful backtracking within -- parseDecl itself: when parsing a definition, we only backtrack if -- we don't get a complete LHS + '=' sign; once we start parsing the -- RHS of a definition we no longer backtrack, since it can't -- possibly be a valid top-level expression. -- | Parse a documentation item: either a group of lines beginning -- with @|||@ (text documentation), or a group beginning with @!!!@ -- (checked examples/properties). parseDocThing :: Parser DocThing parseDocThing = DocString <$> some parseDocString <|> DocProperty <$> parseProperty -- | Parse one line of documentation beginning with @|||@. parseDocString :: Parser String parseDocString = label "documentation" $ L.nonIndented sc $ string "|||" *> takeWhileP Nothing (`elem` " \t") *> takeWhileP Nothing (`notElem` "\r\n") <* sc -- Note we use string "|||" rather than symbol "|||" because we -- don't want it to consume whitespace afterwards (in particular a -- line with ||| by itself would cause symbol "|||" to consume the -- newline). -- | Parse a top-level property/unit test, of the form -- -- @!!! forall x1 : ty1, ..., xn : tyn. term@. -- -- The forall is optional. parseProperty :: Parser Term parseProperty = label "property" $ L.nonIndented sc $ do _ <- symbol "!!!" indented parseTerm -- | Parse a single top-level declaration (either a type declaration -- or single definition clause). parseDecl :: Parser Decl parseDecl = try (DType <$> parseTyDecl) <|> DDefn <$> parseDefn <|> DTyDef <$> parseTyDefn -- | Parse a top-level type declaration of the form @x : ty@. parseTyDecl :: Parser TypeDecl parseTyDecl = label "type declaration" $ TypeDecl <$> ident <*> indented (colon *> parsePolyTy) -- | Parse a definition of the form @x pat1 .. patn = t@. parseDefn :: Parser TermDefn parseDefn = label "definition" $ (\(x, ps) body -> TermDefn x [bind ps body]) -- Only backtrack if we don't get a complete 'LHS ='. Once we see -- an = sign, commit to parsing a definition, because it can't be a -- valid standalone expression anymore. If the RHS fails, we don't -- want to backtrack, we just want to display the parse error. <$> try ((,) <$> ident <*> indented (many parseAtomicPattern) <* reservedOp "=") <*> indented parseTerm -- | Parse the definition of a user-defined algebraic data type. parseTyDefn :: Parser TypeDefn parseTyDefn = label "type defintion" $ do reserved "type" indented $ do name <- parseTyDef args <- fromMaybe [] <$> optional (parens $ parseTyVarName `sepBy1` comma) _ <- reservedOp "=" TypeDefn name args <$> parseType -- | Parse the entire input as a term (with leading whitespace and -- no leftovers). term :: Parser Term term = between sc eof parseTerm -- | Parse a term, consisting of a @parseTerm'@ optionally -- followed by an ascription. parseTerm :: Parser Term parseTerm = -- trace "parseTerm" $ ascribe <$> parseTerm' <*> optional (label "type annotation" $ colon *> parsePolyTy) where ascribe t Nothing = t ascribe t (Just ty) = TAscr t ty -- | Parse a non-atomic, non-ascribed term. parseTerm' :: Parser Term parseTerm' = label "expression" $ parseQuantified <|> parseLet <|> parseExpr <|> parseAtom -- | Parse an atomic term. parseAtom :: Parser Term parseAtom = label "expression" $ parseUnit <|> TBool True <$ (reserved "true" <|> reserved "True") <|> TBool False <$ (reserved "false" <|> reserved "False") <|> TChar <$> lexeme (between (char '\'') (char '\'') L.charLiteral) <|> TString <$> lexeme (char '"' >> manyTill L.charLiteral (char '"')) <|> TWild <$ try parseWild <|> TPrim <$> try parseStandaloneOp -- Note primitives are NOT reserved words, so they are just parsed -- as identifiers. This means that it is possible to shadow a -- primitive in a local context, as it should be. Vars are turned -- into prims at scope-checking time: if a var is not in scope but -- there is a prim of that name then it becomes a TPrim. See the -- 'typecheck Infer (TVar x)' case in Disco.Typecheck. <|> TVar <$> ident <|> TPrim <$> (ensureEnabled Primitives *> parsePrim) <|> TRat <$> try decimal <|> TNat <$> natural <|> parseTypeOp <|> TApp (TPrim PrimFloor) . TParens <$> fbrack parseTerm <|> TApp (TPrim PrimCeil) . TParens <$> cbrack parseTerm <|> parseCase <|> try parseAbs <|> bagdelims (parseContainer BagContainer) <|> braces (parseContainer SetContainer) <|> brackets (parseContainer ListContainer) <|> tuple <$> parens (parseTerm `sepBy1` comma) parseAbs :: Parser Term parseAbs = TApp (TPrim PrimAbs) <$> (pipe *> parseTerm <* pipe) parseUnit :: Parser Term parseUnit = TUnit <$ (reserved "unit" <|> void (symbol "■")) -- | Parse a wildcard, which is an underscore that isn't the start of -- an identifier. parseWild :: Parser () parseWild = (lexeme . try . void) $ string "_" <* notFollowedBy (alphaNumChar <|> oneOf "_'") -- | Parse a standalone operator name with tildes indicating argument -- slots, e.g. ~+~ for the addition operator. parseStandaloneOp :: Parser Prim parseStandaloneOp = asum $ concatMap mkStandaloneOpParsers (concat opTable) where mkStandaloneOpParsers :: OpInfo -> [Parser Prim] mkStandaloneOpParsers (OpInfo (UOpF Pre uop) syns _) = map (\syn -> PrimUOp uop <$ try (lexeme (string syn >> char '~'))) syns mkStandaloneOpParsers (OpInfo (UOpF Post uop) syns _) = map (\syn -> PrimUOp uop <$ try (lexeme (char '~' >> string syn))) syns mkStandaloneOpParsers (OpInfo (BOpF _ bop) syns _) = map (\syn -> PrimBOp bop <$ try (lexeme (char '~' >> string syn >> char '~'))) syns -- XXX TODO: improve the above so it first tries to parse a ~, -- then parses any postfix or infix thing; or else it looks for -- a prefix thing followed by a ~. This will get rid of the -- need for 'try' and also potentially improve error messages. -- The below may come in useful. -- flatOpTable = concat opTable -- prefixOps = [ (uop, syns) | (OpInfo (UOpF Pre uop) syns _) <- flatOpTable ] -- postfixOps = [ (uop, syns) | (OpInfo (UOpF Post uop) syns _) <- flatOpTable ] -- infixOps = [ (bop, syns) | (OpInfo (BOpF _ bop) syns _) <- flatOpTable ] -- | Parse a primitive name starting with a $. parsePrim :: Parser Prim parsePrim = do void (char '$') x <- identifier letterChar case find ((==x) . primSyntax) primTable of Just (PrimInfo p _ _) -> return p Nothing -> fail ("Unrecognized primitive $" ++ x) -- | Parse a container, like a literal list, set, bag, or a -- comprehension (not including the square or curly brackets). -- -- @ -- -- ::= '[' ']' -- | '{' '}' -- -- -- ::= empty | -- -- -- ::= [ ] -- | -- -- -- ::= '|' -- | ',' [ (',' )* ] [ ] -- -- ::= [ ',' ]* -- -- -- ::= 'in' -- | -- -- ::= '..' [ ] -- @ parseContainer :: Container -> Parser Term parseContainer c = nonEmptyContainer <|> return (TContainer c [] Nothing) -- Careful to do this without backtracking, since backtracking can -- lead to bad performance in certain pathological cases (for -- example, a very deeply nested list). where -- Any non-empty container starts with a term, followed by some -- remainder (which could either be the rest of a literal -- container, or a container comprehension). If there is no -- remainder just return a singleton container, optionally with an -- ellipsis. nonEmptyContainer = do t <- parseRepTerm containerRemainder t <|> singletonContainer t parseRepTerm = do t <- parseTerm n <- optional $ do guard (c == BagContainer) void hash parseTerm return (t, n) singletonContainer t = TContainer c [t] <$> optional parseEllipsis -- The remainder of a container after the first term starts with -- either a pipe (for a comprehension) or a comma (for a literal -- container). containerRemainder :: (Term, Maybe Term) -> Parser Term containerRemainder (t,n) = do s <- pipe <|> comma case (s, n) of ("|", Nothing) -> parseContainerComp c t ("|", Just _) -> fail "no comprehension with bag repetition syntax" (",", _) -> do -- Parse the rest of the terms in a literal container after -- the first, then an optional ellipsis, and return -- everything together. ts <- parseRepTerm `sepBy` comma e <- optional parseEllipsis return $ TContainer c ((t,n):ts) e _ -> error "Impossible, got a symbol other than '|' or ',' in containerRemainder" -- | Parse an ellipsis at the end of a literal list, of the form -- @.. t@. Any number > 1 of dots may be used, just for fun. parseEllipsis :: Parser (Ellipsis Term) parseEllipsis = do _ <- ellipsis Until <$> parseTerm -- | Parse the part of a list comprehension after the | (without -- square brackets), i.e. a list of qualifiers. -- -- @q [,q]*@ parseContainerComp :: Container -> Term -> Parser Term parseContainerComp c t = do qs <- toTelescope <$> (parseQual `sepBy` comma) return (TContainerComp c $ bind qs t) -- | Parse a qualifier in a comprehension: either a binder @x in t@ or -- a guard @t@. parseQual :: Parser Qual parseQual = try parseSelection <|> parseQualGuard where parseSelection = label "membership expression (x in ...)" $ QBind <$> ident <*> (selector *> (embed <$> parseTerm)) selector = reservedOp "<-" <|> reserved "in" parseQualGuard = label "boolean expression" $ QGuard . embed <$> parseTerm -- | Turn a parenthesized list of zero or more terms into the -- appropriate syntax node: one term @(t)@ is just the term itself -- (but we record the fact that it was parenthesized, in order to -- correctly turn juxtaposition into multiplication); two or more -- terms @(t1,t2,...)@ are a tuple. tuple :: [Term] -> Term tuple [x] = TParens x tuple t = TTup t -- | Parse a quantified abstraction (λ, ∀, ∃). parseQuantified :: Parser Term parseQuantified = TAbs <$> parseQuantifier <*> (bind <$> parsePattern `sepBy` comma <*> (dot *> parseTerm)) -- | Parse a quantifier symbol (lambda, forall, or exists). parseQuantifier :: Parser Quantifier parseQuantifier = Lam <$ lambda <|> All <$ forall <|> Ex <$ exists -- | Parse a let expression (@let x1 = t1, x2 = t2, ... in t@). parseLet :: Parser Term parseLet = TLet <$> (reserved "let" *> (bind <$> (toTelescope <$> (parseBinding `sepBy` comma)) <*> (reserved "in" *> parseTerm))) -- | Parse a single binding (@x [ : ty ] = t@). parseBinding :: Parser Binding parseBinding = do x <- ident mty <- optional (colon *> parsePolyTy) t <- symbol "=" *> (embed <$> parseTerm) return $ Binding (embed <$> mty) x t -- | Parse a case expression. parseCase :: Parser Term parseCase = between (symbol "{?") (symbol "?}") $ TCase <$> parseBranch `sepBy` comma -- | Parse one branch of a case expression. parseBranch :: Parser Branch parseBranch = flip bind <$> parseTerm <*> parseGuards -- | Parse the list of guards in a branch. @otherwise@ can be used -- interchangeably with an empty list of guards. parseGuards :: Parser (Telescope Guard) parseGuards = (TelEmpty <$ reserved "otherwise") <|> (toTelescope <$> many parseGuard) -- | Parse a single guard (@if@, @if ... is@, or @let@) parseGuard :: Parser Guard parseGuard = try parseGPat <|> parseGBool <|> parseGLet where guardWord = reserved "if" <|> reserved "when" parseGBool = GBool <$> (embed <$> (guardWord *> parseTerm)) parseGPat = GPat <$> (embed <$> (guardWord *> parseTerm)) <*> (reserved "is" *> parsePattern) parseGLet = GLet <$> (reserved "let" *> parseBinding) -- | Parse an atomic pattern, by parsing a term and then attempting to -- convert it to a pattern. parseAtomicPattern :: Parser Pattern parseAtomicPattern = label "pattern" $ do t <- parseAtom case termToPattern t of Nothing -> customFailure $ InvalidPattern (OT t) Just p -> return p -- | Parse a pattern, by parsing a term and then attempting to convert -- it to a pattern. parsePattern :: Parser Pattern parsePattern = label "pattern" $ do t <- parseTerm case termToPattern t of Nothing -> customFailure $ InvalidPattern (OT t) Just p -> return p -- | Attempt converting a term to a pattern. termToPattern :: Term -> Maybe Pattern termToPattern TWild = Just PWild termToPattern (TVar x) = Just $ PVar x termToPattern (TParens t) = termToPattern t termToPattern TUnit = Just PUnit termToPattern (TBool b) = Just $ PBool b termToPattern (TNat n) = Just $ PNat n termToPattern (TChar c) = Just $ PChar c termToPattern (TString s) = Just $ PString s termToPattern (TTup ts) = PTup <$> mapM termToPattern ts termToPattern (TApp (TVar i) t) | i == string2Name "left" = PInj L <$> termToPattern t | i == string2Name "right" = PInj R <$> termToPattern t -- termToPattern (TInj s t) = PInj s <$> termToPattern t termToPattern (TAscr t s) = case s of Forall (unsafeUnbind -> ([], s')) -> PAscr <$> termToPattern t <*> pure s' _ -> Nothing termToPattern (TBin Cons t1 t2) = PCons <$> termToPattern t1 <*> termToPattern t2 termToPattern (TBin Add t1 t2) = case (termToPattern t1, termToPattern t2) of (Just p, _) | length (toListOf fvAny p) == 1 && null (toListOf fvAny t2) -> Just $ PAdd L p t2 (_, Just p) | length (toListOf fvAny p) == 1 && null (toListOf fvAny t1) -> Just $ PAdd R p t1 _ -> Nothing -- If t1 is a pattern binding one variable, and t2 has no fvs, -- this can be a PAdd L. Also vice versa for PAdd R. termToPattern (TBin Mul t1 t2) = case (termToPattern t1, termToPattern t2) of (Just p, _) | length (toListOf fvAny p) == 1 && null (toListOf fvAny t2) -> Just $ PMul L p t2 (_, Just p) | length (toListOf fvAny p) == 1 && null (toListOf fvAny t1) -> Just $ PMul R p t1 _ -> Nothing -- If t1 is a pattern binding one variable, and t2 has no fvs, -- this can be a PMul L. Also vice versa for PMul R. termToPattern (TBin Sub t1 t2) = case termToPattern t1 of Just p | length (toListOf fvAny p) == 1 && null (toListOf fvAny t2) -> Just $ PSub p t2 _ -> Nothing -- If t1 is a pattern binding one variable, and t2 has no fvs, -- this can be a PSub. -- For now we don't handle the case of t - p, since it seems -- less useful (and desugaring it would require extra code since -- subtraction is not commutative). termToPattern (TBin Div t1 t2) = PFrac <$> termToPattern t1 <*> termToPattern t2 termToPattern (TUn Neg t) = PNeg <$> termToPattern t termToPattern (TContainer ListContainer ts Nothing) = PList <$> mapM (termToPattern . fst) ts termToPattern _ = Nothing -- | Parse an expression built out of unary and binary operators. parseExpr :: Parser Term parseExpr = fixJuxtMul . fixChains <$> (makeExprParser parseAtom table "expression") where table -- Special case for function application, with highest -- precedence. Note that we parse all juxtaposition as -- function application first; we later go through and turn -- some into multiplication (fixing up the precedence -- appropriately) based on a syntactic analysis. = [ InfixL (TApp <$ string "") ] -- get all other operators from the opTable : (map . concatMap) mkOpParser opTable mkOpParser :: OpInfo -> [Operator Parser Term] mkOpParser (OpInfo op syns _) = concatMap (withOpFixity op) syns -- Only parse unary operators consisting of operator symbols. -- Alphabetic unary operators (i.e. 'not') will be parsed as -- applications of variable names, since if they are parsed here -- they will incorrectly parse even when they are a prefix of a -- variable name. withOpFixity (UOpF fx op) syn | any isAlpha syn = [] | otherwise = [ufxParser fx ((reservedOp syn "operator") >> return (TUn op))] withOpFixity (BOpF fx op) syn = [bfxParser fx ((reservedOp syn "operator") >> return (TBin op))] ufxParser Pre = Prefix ufxParser Post = Postfix bfxParser InL = InfixL bfxParser InR = InfixR bfxParser In = InfixN isChainable op = op `elem` [Eq, Neq, Lt, Gt, Leq, Geq, Divides] -- Comparison chains like 3 < x < 5 first get parsed as 3 < (x < -- 5), which does not make sense. This function looks for such -- nested comparison operators and turns them into a TChain. fixChains (TUn op t) = TUn op (fixChains t) fixChains (TBin op t1 (TBin op' t21 t22)) | isChainable op && isChainable op' = TChain t1 (TLink op t21 : getLinks op' t22) fixChains (TBin op t1 t2) = TBin op (fixChains t1) (fixChains t2) fixChains (TApp t1 t2) = TApp (fixChains t1) (fixChains t2) -- Only recurse as long as we see TUn, TBin, or TApp which could -- have been generated by the expression parser. If we see -- anything else we can stop. fixChains e = e getLinks op (TBin op' t1 t2) | isChainable op' = TLink op t1 : getLinks op' t2 getLinks op e = [TLink op (fixChains e)] -- Find juxtapositions (parsed as function application) which -- syntactically have either a literal Nat or a parenthesized -- expression containing an operator as the LHS, and turn them -- into multiplications. Then fix up the parse tree by rotating -- newly created multiplications up until their precedence is -- higher than the thing above them. fixJuxtMul :: Term -> Term -- Just recurse through TUn or TBin and fix precedence on the way back up. fixJuxtMul (TUn op t) = fixPrec $ TUn op (fixJuxtMul t) fixJuxtMul (TBin op t1 t2) = fixPrec $ TBin op (fixJuxtMul t1) (fixJuxtMul t2) -- Possibly turn a TApp into a multiplication, if the LHS looks -- like a multiplicative term. However, we must be sure to -- *first* recursively fix the subterms (particularly the -- left-hand one) *before* doing this analysis. See -- . fixJuxtMul (TApp t1 t2) | isMultiplicativeTerm t1' = fixPrec $ TBin Mul t1' t2' | otherwise = fixPrec $ TApp t1' t2' where t1' = fixJuxtMul t1 t2' = fixJuxtMul t2 -- Otherwise we can stop recursing, since anything other than TUn, -- TBin, or TApp could not have been produced by the expression -- parser. fixJuxtMul t = t -- A multiplicative term is one that looks like either a natural -- number literal, or a unary or binary operation (optionally -- parenthesized). For example, 3, (-2), and (x + 5) are all -- multiplicative terms, so 3x, (-2)x, and (x + 5)x all get parsed -- as multiplication. On the other hand, (x y) is always parsed -- as function application, even if x and y both turn out to have -- numeric types; a variable like x does not count as a -- multiplicative term. Likewise, (x y) z is parsed as function -- application, since (x y) is not a multiplicative term: it is -- parenthezised, but contains a TApp rather than a TBin or TUn. isMultiplicativeTerm :: Term -> Bool isMultiplicativeTerm (TNat _) = True isMultiplicativeTerm TUn{} = True isMultiplicativeTerm TBin{} = True isMultiplicativeTerm (TParens t) = isMultiplicativeTerm t isMultiplicativeTerm _ = False -- Fix precedence by bubbling up any new TBin terms whose -- precedence is less than that of the operator above them. We -- don't worry at all about fixing associativity, just precedence. fixPrec :: Term -> Term -- e.g. 2y! --> (2@y)! --> fixup --> 2 * (y!) fixPrec (TUn uop (TBin bop t1 t2)) | bPrec bop < uPrec uop = case uopMap M.! uop of OpInfo (UOpF Pre _) _ _ -> TBin bop (TUn uop t1) t2 OpInfo (UOpF Post _) _ _ -> TBin bop t1 (TUn uop t2) _ -> error "Impossible! In fixPrec, uopMap contained OpInfo (BOpF ...)" fixPrec (TBin bop1 (TBin bop2 t1 t2) t3) | bPrec bop2 < bPrec bop1 = TBin bop2 t1 (fixPrec $ TBin bop1 t2 t3) -- e.g. x^2y --> x^(2@y) --> x^(2*y) --> (x^2) * y fixPrec (TBin bop1 t1 (TBin bop2 t2 t3)) | bPrec bop2 < bPrec bop1 = TBin bop2 (fixPrec $ TBin bop1 t1 t2) t3 fixPrec t = t -- | Parse an atomic type. parseAtomicType :: Parser Type parseAtomicType = label "type" $ TyVoid <$ reserved "Void" <|> TyUnit <$ reserved "Unit" <|> TyBool <$ (reserved "Boolean" <|> reserved "Bool") <|> TyProp <$ (reserved "Proposition" <|> reserved "Prop") <|> TyC <$ reserved "Char" -- <|> try parseTyFin <|> TyN <$ (reserved "Natural" <|> reserved "Nat" <|> reserved "N" <|> reserved "ℕ") <|> TyZ <$ (reserved "Integer" <|> reserved "Int" <|> reserved "Z" <|> reserved "ℤ") <|> TyF <$ (reserved "Fractional" <|> reserved "Frac" <|> reserved "F" <|> reserved "𝔽") <|> TyQ <$ (reserved "Rational" <|> reserved "Q" <|> reserved "ℚ") <|> TyCon <$> parseCon <*> (fromMaybe [] <$> optional (parens (parseType `sepBy1` comma))) <|> TyVar <$> parseTyVar <|> parens parseType -- parseTyFin :: Parser Type -- parseTyFin = TyFin <$> (reserved "Fin" *> natural) -- <|> TyFin <$> (lexeme (string "Z" <|> string "ℤ") *> natural) parseCon :: Parser Con parseCon = CList <$ reserved "List" <|> CBag <$ reserved "Bag" <|> CSet <$ reserved "Set" <|> CGraph <$ reserved "Graph" <|> CMap <$ reserved "Map" <|> CUser <$> parseTyDef parseTyDef :: Parser String parseTyDef = identifier upperChar parseTyVarName :: Parser String parseTyVarName = identifier lowerChar parseTyVar :: Parser (Name Type) parseTyVar = string2Name <$> parseTyVarName parsePolyTy :: Parser PolyType parsePolyTy = closeType <$> parseType -- | Parse a type expression built out of binary operators. parseType :: Parser Type parseType = makeExprParser parseAtomicType table where table = [ [ infixR "*" (:*:) , infixR "×" (:*:) ] , [ infixR "+" (:+:) , infixR "⊎" (:+:) ] , [ infixR "->" (:->:) , infixR "→" (:->:) ] ] infixR name fun = InfixR (reservedOp name >> return fun) parseTyOp :: Parser TyOp parseTyOp = Enumerate <$ reserved "enumerate" <|> Count <$ reserved "count" parseTypeOp :: Parser Term parseTypeOp = TTyOp <$> parseTyOp <*> parseAtomicType