Copyright 2009 Jake Wheat
The main file for parsing sql, uses parsec. Not sure if parsec is the
right choice, but it seems to do the job pretty well at the moment.
For syntax reference see
http://savage.net.au/SQL/sql20032.bnf.html
and
http://savage.net.au/SQL/sql92.bnf.html
for some online sql grammar guides
and
http://www.postgresql.org/docs/8.4/interactive/sqlsyntax.html
for some notes on postgresql syntax (the rest of that manual is also helpful)
Some further reference/reading:
parsec tutorial:
http://legacy.cs.uu.nl/daan/download/parsec/parsec.html
parsec reference:
http://hackage.haskell.org/package/parsec3.0.0
pdf about parsing, uses haskell and parser combinators for examples
and exercises:
http://www.cs.uu.nl/docs/vakken/gont/diktaat.pdf
The parsers are written top down as you go through the file, so the
top level sql text parsers appear first, then the statements, then the
fragments, then the utility parsers and other utilities at the bottom.
>
> module Database.HsSqlPpp.Parsing.Parser (
>
> parseSql
> ,parseSqlFile
>
> ,parseExpression
> ,parsePlpgsql
> )
> where
> import Text.Parsec hiding(many, optional, (<|>), string)
> import Text.Parsec.Expr
> import Text.Parsec.String
> import Control.Applicative
> import Control.Monad.Identity
> import Data.Maybe
> import Data.Char
> import Database.HsSqlPpp.Parsing.Lexer
> import Database.HsSqlPpp.Parsing.ParseErrors
> import Database.HsSqlPpp.TypeChecking.Ast
> import Database.HsSqlPpp.TypeChecking.TypeChecker as A
The parse state is used to keep track of source positions inside
function bodies, these bodies are parsed separately to the rest of the
code which is why we need to do this.
> type MySourcePos = (String,Int,Int)
> type ParseState = [MySourcePos]
> startState :: ParseState
> startState = []
> toMySp :: SourcePos -> MySourcePos
> toMySp sp = (sourceName sp, sourceLine sp, sourceColumn sp)
================================================================================
= Top level parsing functions
> parseSql :: String
> -> Either ExtendedError StatementList
> parseSql s = statementsOnly $ parseIt (lexSqlText s) sqlStatements "" s startState
> parseSqlFile :: FilePath
> -> IO (Either ExtendedError StatementList)
> parseSqlFile fn = do
> sc <- readFile fn
> x <- lexSqlFile fn
> return $ statementsOnly $ parseIt x sqlStatements fn sc startState
>
> parseExpression :: String
>
> -> Either ExtendedError Expression
> parseExpression s = parseIt (lexSqlText s) (expr <* eof) "" s startState
>
>
>
>
> parsePlpgsql :: String
> -> Either ExtendedError StatementList
> parsePlpgsql s = parseIt (lexSqlText s) (many plPgsqlStatement <* eof) "" s startState
utility function to do error handling in one place
> parseIt lexed parser fn src ss =
> case lexed of
> Left er -> Left er
> Right toks -> convertToExtendedError
> (runParser parser ss fn toks) fn src
> statementsOnly :: Either ExtendedError StatementList
> -> Either ExtendedError StatementList
> statementsOnly s = case s of
> Left er -> Left er
> Right st -> Right st
================================================================================
= Parsing top level statements
> sqlStatements :: ParsecT [Token] ParseState Identity [Statement]
> sqlStatements = many (sqlStatement True) <* eof
parse a statement
> sqlStatement :: Bool -> ParsecT [Token] ParseState Identity Statement
> sqlStatement reqSemi =
> (choice [
> selectStatement
> ,insert
> ,update
> ,delete
> ,truncateSt
> ,copy
> ,keyword "create" *>
> choice [
> createTable
> ,createType
> ,createFunction
> ,createView
> ,createDomain]
> ,keyword "drop" *>
> choice [
> dropSomething
> ,dropFunction]
> ]
> <* (if reqSemi
> then symbol ";" >> return ()
> else optional (symbol ";") >> return ()))
> <|> copyData
> getAdjustedPosition :: ParsecT [Token] ParseState Identity MySourcePos
> getAdjustedPosition = do
> p <- toMySp <$> getPosition
> s <- getState
> case s of
> [] -> return p
> x:_ -> return $ adjustPosition x p
> pos :: ParsecT [Token] ParseState Identity Annotation
> pos = do
> p <- toSp <$> getPosition
> s <- getState
> case s of
> [] -> return [p]
> x:_ -> return [adjustPos x p]
> where
> toSp sp = A.SourcePos (sourceName sp) (sourceLine sp) (sourceColumn sp)
> adjustPos (fn,pl,_) (A.SourcePos _ l c) = A.SourcePos fn (pl+l1) c
> adjustPos _ x = error $ "internal error - tried to adjust as sourcepos: " ++ show x
> adjustPosition :: MySourcePos -> MySourcePos -> MySourcePos
> adjustPosition (fn,pl,_) (_,l,c) = (fn,pl+l1,c)
================================================================================
statement flavour parsers
top level/sql statements first
= select
select parser, parses things starting with the keyword 'select'
supports plpgsql 'select into' only for the variants which look like
'select into ([targets]) [columnNames] from ...
or
'select [columnNames] into ([targets]) from ...
This should be changed so it can only parse an into clause when
expecting a plpgsql statement.
recurses to support parsing excepts, unions, etc
> selectStatement :: ParsecT [Token] ParseState Identity Statement
> selectStatement = SelectStatement <$> pos <*> selectExpression
> selectExpression :: ParsecT [Token] ParseState Identity SelectExpression
> selectExpression =
> choice [selectE, values]
> where
> selectE = do
> keyword "select"
> s1 <- selQuerySpec
> choice [
>
>
> CombineSelect [] Except s1 <$> (keyword "except" *> selectExpression)
> ,CombineSelect [] Intersect s1 <$> (keyword "intersect" *> selectExpression)
> ,CombineSelect [] UnionAll s1 <$> (try (keyword "union"
> *> keyword "all") *> selectExpression)
> ,CombineSelect [] Union s1 <$> (keyword "union" *> selectExpression)
> ,return s1]
> where
> selQuerySpec = Select []
> <$> option Dupes (Distinct <$ keyword "distinct")
> <*> selectList
> <*> tryOptionMaybe from
> <*> tryOptionMaybe whereClause
> <*> option [] groupBy
> <*> tryOptionMaybe having
> <*> option [] orderBy
> <*> option Asc (choice [
> Asc <$ keyword "asc"
> ,Desc <$ keyword "desc"])
> <*> tryOptionMaybe limit
> <*> tryOptionMaybe offset
> from = keyword "from" *> tref
> groupBy = keyword "group" *> keyword "by"
> *> commaSep1 expr
> having = keyword "having" *> expr
> orderBy = keyword "order" *> keyword "by"
> *> commaSep1 expr
> limit = keyword "limit" *> expr
> offset = keyword "offset" *> expr
>
>
>
>
>
>
>
> tref = threadOptionalSuffix getFirstTref joinPart
> getFirstTref = choice [
> SubTref []
> <$> parens selectExpression
> <*> (optional (keyword "as") *> nkwid)
> ,optionalSuffix
> (TrefFun []) (try $ identifier >>= functionCallSuffix)
> (TrefFunAlias []) () (optional (keyword "as") *> nkwid)
> ,optionalSuffix
> (Tref []) nkwid
> (TrefAlias []) () (optional (keyword "as") *> nkwid)]
>
>
>
> --joins
> joinPart tr1 = threadOptionalSuffix (readOneJoinPart tr1) joinPart
> readOneJoinPart tr1 = JoinedTref [] tr1
>
> <$> option Unnatural (Natural <$ keyword "natural")
> <*> choice [
> Inner <$ keyword "inner"
> ,LeftOuter <$ try (keyword "left" *> keyword "outer")
> ,RightOuter <$ try (keyword "right" *> keyword "outer")
> ,FullOuter <$ try (keyword "full" >> keyword "outer")
> ,Cross <$ keyword "cross"]
>
> <*> (keyword "join" *> tref)
>
> <*> choice [
> Just <$> (JoinOn <$> (keyword "on" *> expr))
> ,Just <$> (JoinUsing <$> (keyword "using" *> columnNameList))
> ,return Nothing]
> nkwid = try $ do
> x <- idString
>
>
>
>
>
>
> --[]
> if map toLower x `elem` ["as"
> ,"where"
> ,"except"
> ,"union"
> ,"intersect"
> ,"loop"
> ,"inner"
> ,"on"
> ,"left"
> ,"right"
> ,"full"
> ,"cross"
> ,"natural"
> ,"order"
> ,"group"
> ,"limit"
> ,"using"
> ,"from"]
> then fail "not keyword"
> else return x
> values = keyword "values" >>
> Values [] <$> commaSep1 (parens $ commaSep1 expr)
= insert, update and delete
insert statement: supports option column name list,
multiple rows to insert and insert from select statements
> insert :: ParsecT [Token] ParseState Identity Statement
> insert = keyword "insert" >> keyword "into" >>
> Insert
> <$> pos
> <*> idString
> <*> option [] (try columnNameList)
> <*> selectExpression
> <*> tryOptionMaybe returning
> update :: ParsecT [Token] ParseState Identity Statement
> update = keyword "update" >>
> Update
> <$> pos
> <*> idString
> <*> (keyword "set" *> commaSep1 setClause)
> <*> tryOptionMaybe whereClause
> <*> tryOptionMaybe returning
> where
> setClause = choice
> [RowSetClause <$> parens (commaSep1 idString)
> <*> (symbol "=" *> parens (commaSep1 expr))
> ,SetClause <$> idString
> <*> (symbol "=" *> expr)]
> delete :: ParsecT [Token] ParseState Identity Statement
> delete = keyword "delete" >> keyword "from" >>
> Delete
> <$> pos
> <*> idString
> <*> tryOptionMaybe whereClause
> <*> tryOptionMaybe returning
> truncateSt :: ParsecT [Token] ParseState Identity Statement
> truncateSt = keyword "truncate" >> optional (keyword "table") >>
> Truncate
> <$> pos
> <*> commaSep1 idString
> <*> option ContinueIdentity (choice [
> ContinueIdentity <$ (keyword "continue"
> <* keyword "identity")
> ,RestartIdentity <$ (keyword "restart"
> <* keyword "identity")])
> <*> cascade
> copy :: ParsecT [Token] ParseState Identity Statement
> copy = do
> p <- pos
> keyword "copy"
> tableName <- idString
> cols <- option [] (parens $ commaSep1 idString)
> keyword "from"
> src <- choice [
> CopyFilename <$> extrStr <$> stringLit
> ,Stdin <$ keyword "stdin"]
> return $ Copy p tableName cols src
> copyData :: ParsecT [Token] ParseState Identity Statement
> copyData = CopyData <$> pos <*> mytoken (\tok ->
> case tok of
> CopyPayloadTok n -> Just n
> _ -> Nothing)
= ddl
> createTable :: ParsecT [Token] ParseState Identity Statement
> createTable = do
> p <- pos
> keyword "table"
> tname <- idString
> choice [
> CreateTableAs p tname <$> (keyword "as" *> selectExpression)
> ,uncurry (CreateTable p tname) <$> readAttsAndCons]
> where
>
>
>
>
> readAttsAndCons = parens (swap <$> multiPerm
> (try tableConstr)
> tableAtt
> (symbol ","))
> where swap (a,b) = (b,a)
> tableAtt = AttributeDef
> <$> idString
> <*> typeName
> <*> tryOptionMaybe (keyword "default" *> expr)
> <*> many rowConstraint
> tableConstr = choice [
> UniqueConstraint
> <$> try (keyword "unique" *> columnNameList)
> ,PrimaryKeyConstraint
> <$> try (keyword "primary" *> keyword "key"
> *> choice [
> (:[]) <$> idString
> ,parens (commaSep1 idString)])
> ,CheckConstraint
> <$> try (keyword "check" *> parens expr)
> ,ReferenceConstraint
> <$> try (keyword "foreign" *> keyword "key"
> *> parens (commaSep1 idString))
> <*> (keyword "references" *> idString)
> <*> option [] (parens $ commaSep1 idString)
> <*> onDelete
> <*> onUpdate]
> rowConstraint =
> choice [
> RowUniqueConstraint <$ keyword "unique"
> ,RowPrimaryKeyConstraint <$ keyword "primary" <* keyword "key"
> ,RowCheckConstraint <$> (keyword "check" *> parens expr)
> ,NullConstraint <$ keyword "null"
> ,NotNullConstraint <$ (keyword "not" *> keyword "null")
> ,RowReferenceConstraint
> <$> (keyword "references" *> idString)
> <*> option Nothing (try $ parens $ Just <$> idString)
> <*> onDelete
> <*> onUpdate
> ]
> onDelete = onSomething "delete"
> onUpdate = onSomething "update"
> onSomething k = option Restrict $ try $ keyword "on"
> *> keyword k *> cascade
> createType :: ParsecT [Token] ParseState Identity Statement
> createType = keyword "type" >>
> CreateType
> <$> pos
> <*> idString
> <*> (keyword "as" *> parens (commaSep1 typeAtt))
> where
> typeAtt = TypeAttDef <$> idString <*> typeName
create function, support sql functions and plpgsql functions. Parses
the body in both cases and provides a statement list for the body
rather than just a string.
> createFunction :: ParsecT [Token] ParseState Identity Statement
> createFunction = do
> p <- pos
> keyword "function"
> fnName <- idString
> params <- parens $ commaSep param
> retType <- keyword "returns" *> typeName
> keyword "as"
> bodypos <- getAdjustedPosition
> body <- stringLit
> lang <- readLang
> let (q, b) = parseBody lang body fnName bodypos
> CreateFunction p lang fnName params retType q b <$> pVol
> where
> pVol = matchAKeyword [("volatile", Volatile)
> ,("stable", Stable)
> ,("immutable", Immutable)]
> readLang = keyword "language" *> matchAKeyword [("plpgsql", Plpgsql)
> ,("sql",Sql)]
> parseBody lang body fnName bodypos =
> case (parseIt
> (lexSqlText (extrStr body))
> (functionBody lang)
> ("function " ++ fnName)
> (extrStr body)
> [bodypos]) of
> Left er@(ExtendedError e _) ->
>
>
>
>
>
>
> let ep = toMySp $ errorPos e
> (fn,fp,fc) = bodypos
> fnBit = "in function " ++ fnName ++ "\n"
> ++ fn ++ ":" ++ show fp ++ ":" ++ show fc ++ ":\n"
> (_,lp,lc) = adjustPosition bodypos ep
> lineBit = "on line\n"
> ++ fn ++ ":" ++ show lp ++ ":" ++ show lc ++ ":\n"
> in error $ show er ++ "\n" ++ fnBit ++ lineBit
> Right body' -> (quoteOfString body, body')
sql function is just a list of statements, the last one has the
trailing semicolon optional
> functionBody Sql = do
> a <- many (try $ sqlStatement True)
>
>
> SqlFnBody <$> option a ((\b -> (a++[b])) <$> sqlStatement False)
plpgsql function has an optional declare section, plus the statements
are enclosed in begin ... end; (semi colon after end is optional(
> functionBody Plpgsql =
> PlpgsqlFnBody
> <$> option [] declarePart
> <*> statementPart
> where
> statementPart = keyword "begin"
> *> many plPgsqlStatement
> <* keyword "end" <* optional (symbol ";") <* eof
> declarePart = keyword "declare"
> *> manyTill (try varDef) (lookAhead $ keyword "begin")
params to a function
> param :: ParsecT [Token] ParseState Identity ParamDef
> param = choice [
> try (ParamDef <$> idString <*> typeName)
> ,ParamDefTp <$> typeName]
variable declarations in a plpgsql function
> varDef :: ParsecT [Token] ParseState Identity VarDef
> varDef = VarDef
> <$> idString
> <*> typeName
> <*> tryOptionMaybe ((symbol ":=" <|> symbol "=")*> expr) <* symbol ";"
> createView :: ParsecT [Token] ParseState Identity Statement
> createView = keyword "view" >>
> CreateView
> <$> pos
> <*> idString
> <*> (keyword "as" *> selectExpression)
> createDomain :: ParsecT [Token] ParseState Identity Statement
> createDomain = keyword "domain" >>
> CreateDomain
> <$> pos
> <*> idString
> <*> (tryOptionMaybe (keyword "as") *> typeName)
> <*> tryOptionMaybe (keyword "check" *> parens expr)
> dropSomething :: ParsecT [Token] ParseState Identity Statement
> dropSomething = do
> p <- pos
> x <- try (choice [
> Domain <$ keyword "domain"
> ,Type <$ keyword "type"
> ,Table <$ keyword "table"
> ,View <$ keyword "view"
> ])
> (i,e,r) <- parseDrop idString
> return $ DropSomething p x i e r
> dropFunction :: ParsecT [Token] ParseState Identity Statement
> dropFunction = do
> p <- pos
> keyword "function"
> (i,e,r) <- parseDrop pFun
> return $ DropFunction p i e r
> where
> pFun = (,) <$> idString
> <*> parens (many idString)
> parseDrop :: ParsecT [Token] ParseState Identity a
> -> ParsecT [Token] ParseState Identity (IfExists, [a], Cascade)
> parseDrop p = (,,)
> <$> ifExists
> <*> commaSep1 p
> <*> cascade
> where
> ifExists = option Require
> (try $ IfExists <$ (keyword "if"
> *> keyword "exists"))
================================================================================
= component parsers for sql statements
> whereClause :: ParsecT [Token] ParseState Identity Expression
> whereClause = keyword "where" *> expr
selectlist and selectitem: the bit between select and from
check for into either before the whole list of select columns
or after the whole list
> selectList :: ParsecT [Token] ParseState Identity SelectList
> selectList =
> choice [
> flip SelectList <$> readInto <*> itemList
> ,SelectList <$> itemList <*> option [] readInto]
> where
> readInto = keyword "into" *> commaSep1 idString
> itemList = commaSep1 selectItem
> selectItem = optionalSuffix
> SelExp expr
> SelectItem () (keyword "as" *> idString)
> returning :: ParsecT [Token] ParseState Identity SelectList
> returning = keyword "returning" *> selectList
> columnNameList :: ParsecT [Token] ParseState Identity [String]
> columnNameList = parens $ commaSep1 idString
> typeName :: ParsecT [Token] ParseState Identity TypeName
> typeName = choice [
> SetOfTypeName <$> (keyword "setof" *> typeName)
> ,do
> s <- map toLower <$> idString
> choice [
> PrecTypeName s <$> parens integer
> ,ArrayTypeName (SimpleTypeName s) <$ symbol "[" <* symbol "]"
> ,return $ SimpleTypeName s]]
> cascade :: ParsecT [Token] ParseState Identity Cascade
> cascade = option Restrict (choice [
> Restrict <$ keyword "restrict"
> ,Cascade <$ keyword "cascade"])
================================================================================
= plpgsql statements
> plPgsqlStatement :: ParsecT [Token] ParseState Identity Statement
> plPgsqlStatement =
> sqlStatement True
> <|> (choice [
> continue
> ,execute
> ,caseStatement
> ,assignment
> ,ifStatement
> ,returnSt
> ,raise
> ,forStatement
> ,whileStatement
> ,perform
> ,nullStatement]
> <* symbol ";")
> nullStatement :: ParsecT [Token] ParseState Identity Statement
> nullStatement = NullStatement <$> (pos <* keyword "null")
> continue :: ParsecT [Token] ParseState Identity Statement
> continue = ContinueStatement <$> (pos <* keyword "continue")
> perform :: ParsecT [Token] ParseState Identity Statement
> perform = keyword "perform" >>
> Perform <$> pos <*> expr
> execute :: ParsecT [Token] ParseState Identity Statement
> execute = pos >>= \p -> keyword "execute" >>
> optionalSuffix
> (Execute p) expr
> (ExecuteInto p) () readInto
> where
> readInto = keyword "into" *> commaSep1 idString
> assignment :: ParsecT [Token] ParseState Identity Statement
> assignment = Assignment
> <$> pos
>
>
>
> <*> try (idString <* (symbol ":=" <|> symbol "="))
> <*> expr
> returnSt :: ParsecT [Token] ParseState Identity Statement
> returnSt = pos >>= \p -> keyword "return" >>
> choice [
> ReturnNext p <$> (keyword "next" *> expr)
> ,ReturnQuery p <$> (keyword "query" *> selectExpression)
> ,Return p <$> tryOptionMaybe expr]
> raise :: ParsecT [Token] ParseState Identity Statement
> raise = pos >>= \p -> keyword "raise" >>
> Raise p
> <$> raiseType
> <*> (extrStr <$> stringLit)
> <*> option [] (symbol "," *> commaSep1 expr)
> where
> raiseType = matchAKeyword [("notice", RNotice)
> ,("exception", RException)
> ,("error", RError)]
> forStatement :: ParsecT [Token] ParseState Identity Statement
> forStatement = do
> p <- pos
> keyword "for"
> start <- idString
> keyword "in"
> choice [(ForSelectStatement p start <$> try selectExpression <*> theRest)
> ,(ForIntegerStatement p start
> <$> expr
> <*> (symbol ".." *> expr)
> <*> theRest)]
> where
> theRest = keyword "loop" *> many plPgsqlStatement
> <* keyword "end" <* keyword "loop"
> whileStatement :: ParsecT [Token] ParseState Identity Statement
> whileStatement = keyword "while" >>
> WhileStatement
> <$> pos
> <*> (expr <* keyword "loop")
> <*> many plPgsqlStatement <* keyword "end" <* keyword "loop"
> ifStatement :: ParsecT [Token] ParseState Identity Statement
> ifStatement = keyword "if" >>
> If
> <$> pos
> <*> (ifPart <:> elseifParts)
> <*> (elsePart <* endIf)
> where
> ifPart = expr <.> (thn *> many plPgsqlStatement)
> elseifParts = many ((elseif *> expr) <.> (thn *> many plPgsqlStatement))
> elsePart = option [] (keyword "else" *> many plPgsqlStatement)
> endIf = keyword "end" <* keyword "if"
> thn = keyword "then"
> elseif = keyword "elseif"
>
>
> (<.>) a b = (,) <$> a <*> b
> caseStatement :: ParsecT [Token] ParseState Identity Statement
> caseStatement = keyword "case" >>
> CaseStatement <$> pos
> <*> expr
> <*> many whenSt
> <*> option [] (keyword "else" *> many plPgsqlStatement)
> <* keyword "end" <* keyword "case"
> where
> whenSt = keyword "when" >>
> (,) <$> commaSep1 expr
> <*> (keyword "then" *> many plPgsqlStatement)
================================================================================
= expressions
This is the bit that makes it the most obvious that I don't really
know haskell, parsing theory or parsec ... robbed a parsing example
from haskellcafe and mainly just kept changing it until it seemed to
work
> expr :: ParsecT [Token] ParseState Identity Expression
> expr = buildExpressionParser table factor
> <?> "expression"
> factor :: ParsecT [Token] ParseState Identity Expression
> factor =
First job is to take care of forms which start as a regular
expression, and then add a suffix on
> threadOptionalSuffixes fct [castSuffix
> ,betweenSuffix
> ,arraySubSuffix]
> where
> fct = choice [
order these so the ones which can be valid prefixes of others appear
further down the list, probably want to refactor this to use the
optionalsuffix parsers to improve speed.
One little speed optimisation, to help with pretty printed code which
can contain a lot of parens check for nested ((
This little addition speeds up ./ParseFile.lhs sqltestfiles/system.sql
on my system from ~4 minutes to ~4 seconds (most of the 4s is probably
compilation overhead).
> try (lookAhead (symbol "(" >> symbol "(")) >> parens expr
start with the factors which start with parens eliminate scalar
subqueries since they're easy to distinguish from the others then do in
predicate before row constructor, since an in predicate can start with
a row constructor, then finally vanilla parens
> ,scalarSubQuery
> ,try $ threadOptionalSuffix rowCtor inPredicateSuffix
> ,parens expr
we have two things which can start with a $,
do the position arg first, then we can unconditionally
try the dollar quoted string next
> ,positionalArg
string using quotes don't start like anything else and we've
already tried the other thing which starts with a $, so can
parse without a try
> ,stringLit
> ,floatLit
> ,integerLit
put the factors which start with keywords before the ones which start
with a function
> ,caseParse
> ,exists
> ,booleanLit
> ,nullLit
> ,arrayLit
> ,castKeyword
> ,substring
now do identifiers, functions, and window functions (each is a prefix
to the next one)
> ,threadOptionalSuffixes
> identifier
> [inPredicateSuffix
> ,\l -> threadOptionalSuffix (functionCallSuffix l) windowFnSuffix]]
== operator table
proper hacky, but sort of does the job
the 'missing' notes refer to pg operators which aren't yet supported
pg's operator table is on this page:
http://www.postgresql.org/docs/8.4/interactive/sqlsyntaxlexical.html#SQLSYNTAXOPERATORS
will probably need something more custom to handle full range of sql
syntactical novelty, in particular the precedence rules mix these
operators up with irregular syntax operators, you can create new
operators during parsing, and some operators are prefix/postfix or
binary depending on the types of their operands (how do you parse
something like this?)/
The full list of operators from DefaultScope.hs should be used here.
> table :: [[Operator [Token] ParseState Identity Expression]]
> table = [
>
> [prefix "-" (FunCall [] "u-")]
> ,[binary "^" AssocLeft]
> ,[binary "*" AssocLeft
> ,idHackBinary "*" AssocLeft
> ,binary "/" AssocLeft
> ,binary "%" AssocLeft]
> ,[binary "+" AssocLeft
> ,binary "-" AssocLeft]
>
> ,[postfixks ["is", "not", "null"] (FunCall [] "!isNotNull")
> ,postfixks ["is", "null"] (FunCall [] "!isNull")]
>
> ,[binary "<->" AssocNone
> ,binary "<=" AssocRight
> ,binary ">=" AssocRight
> ,binary "||" AssocLeft
> ,prefix "@" (FunCall [] "@")
> ]
>
> --between
> --overlaps
> ,[binaryk "like" "!like" AssocNone
> ,binarycust "!=" "<>" AssocNone]
>
> ,[lt "<" AssocNone
> ,binary ">" AssocNone]
> ,[binary "=" AssocRight
> ,binary "<>" AssocNone]
> ,[prefixk "not" (FunCall [] "!not")]
> ,[binaryk "and" "!and" AssocLeft
> ,binaryk "or" "!or" AssocLeft]]
> where
>
>
> binary s
> = Infix (try (symbol s >> return (binaryF s)))
> binarycust s t
> = Infix (try (symbol s >> return (binaryF t)))
>
>
> idHackBinary s
> = Infix (try (keyword s >> return (binaryF s)))
> prefix s f
> = Prefix (try (symbol s >> return (unaryF f)))
> binaryk s o
> = Infix (try (keyword s >> return (binaryF o)))
> prefixk s f
> = Prefix (try (keyword s >> return (unaryF f)))
>
>
> postfixks ss f
> = Postfix (try (mapM_ keyword ss >> return (unaryF f)))
> unaryF f l = f [l]
> binaryF s l m = FunCall [] s [l,m]
some custom parsers
fix problem parsing <> don't parse as "<" if it is immediately
followed by ">"
don't know if this is needed anymore.
> lt _ = Infix (dontFollowWith "<" ">" >>
> return (\l -> (\m -> FunCall [] "<" [l,m])))
> dontFollowWith c1 c2 =
> try $ symbol c1 *> ((do
> lookAhead $ symbol c2
> fail "dont follow")
> <|> return ())
== factor parsers
> scalarSubQuery :: ParsecT [Token] ParseState Identity Expression
> scalarSubQuery = try (symbol "(" *> lookAhead (keyword "select")) >>
> ScalarSubQuery []
> <$> selectExpression <* symbol ")"
in predicate an identifier or row constructor followed by 'in'
then a list of expressions or a subselect
> inPredicateSuffix :: Expression -> ParsecT [Token] ParseState Identity Expression
> inPredicateSuffix e =
> InPredicate [] e
> <$> option True (False <$ keyword "not")
> <*> (keyword "in" *> parens ((InSelect <$> selectExpression)
> <|>
> (InList <$> commaSep1 expr)))
row ctor: one of
row ()
row (expr)
row (expr, expr1, ...)
(expr, expr2,...) [implicit (no row keyword) version, at least two elements
must be present]
(expr) parses to just expr rather than row(expr)
and () is a syntax error.
> rowCtor :: ParsecT [Token] ParseState Identity Expression
> rowCtor = FunCall [] "!rowCtor" <$> choice [
> keyword "row" *> parens (commaSep expr)
> ,parens $ commaSep2 expr]
> floatLit :: ParsecT [Token] ParseState Identity Expression
> floatLit = FloatLit [] <$> float
> integerLit :: ParsecT [Token] ParseState Identity Expression
> integerLit = IntegerLit [] <$> integer
case only supports 'case when condition' flavour and not 'case
expression when value' currently
> caseParse :: ParsecT [Token] ParseState Identity Expression
> caseParse = keyword "case" >>
> choice [
> try $ CaseSimple [] <$> expr
> <*> many whenParse
> <*> tryOptionMaybe (keyword "else" *> expr)
> <* keyword "end"
> ,Case [] <$> many whenParse
> <*> tryOptionMaybe (keyword "else" *> expr)
> <* keyword "end"]
> where
> whenParse = (,) <$> (keyword "when" *> commaSep1 expr)
> <*> (keyword "then" *> expr)
> exists :: ParsecT [Token] ParseState Identity Expression
> exists = keyword "exists" >>
> Exists [] <$> parens selectExpression
> booleanLit :: ParsecT [Token] ParseState Identity Expression
> booleanLit = BooleanLit [] <$> (True <$ keyword "true"
> <|> False <$ keyword "false")
> nullLit :: ParsecT [Token] ParseState Identity Expression
> nullLit = NullLit [] <$ keyword "null"
> arrayLit :: ParsecT [Token] ParseState Identity Expression
> arrayLit = keyword "array" >>
> FunCall [] "!arrayCtor" <$> squares (commaSep expr)
> arraySubSuffix :: Expression -> ParsecT [Token] ParseState Identity Expression
> arraySubSuffix e = if e == Identifier [] "array"
> then fail "can't use array as identifier name"
> else FunCall [] "!arraySub" <$> ((e:) <$> squares (commaSep1 expr))
supports basic window functions of the form
fn() over ([partition bit]? [order bit]?)
frame clauses todo:
range unbounded preceding
range between unbounded preceding and current row
range between unbounded preceding and unbounded following
rows unbounded preceding
rows between unbounded preceding and current row
rows between unbounded preceding and unbounded following
> windowFnSuffix :: Expression -> ParsecT [Token] ParseState Identity Expression
> windowFnSuffix e = WindowFn [] e
> <$> (keyword "over" *> (symbol "(" *> option [] partitionBy))
> <*> option [] orderBy1
> <*> option Asc (try $ choice [
> Asc <$ keyword "asc"
> ,Desc <$ keyword "desc"])
> <* symbol ")"
> where
> orderBy1 = keyword "order" *> keyword "by" *> commaSep1 expr
> partitionBy = keyword "partition" *> keyword "by" *> commaSep1 expr
> betweenSuffix :: Expression -> ParsecT [Token] ParseState Identity Expression
> betweenSuffix a = do
> keyword "between"
> b <- dodgyParseElement
> keyword "and"
> c <- dodgyParseElement
> return $ FunCall [] "!between" [a,b,c]
>
>
>
From postgresql src/backend/parser/gram.y
* We have two expression types: a_expr is the unrestricted kind, and
* b_expr is a subset that must be used in some places to avoid shift/reduce
* conflicts. For example, we can't do BETWEEN as "BETWEEN a_expr AND a_expr"
* because that use of AND conflicts with AND as a boolean operator. So,
* b_expr is used in BETWEEN and we remove boolean keywords from b_expr.
*
* Note that '(' a_expr ')' is a b_expr, so an unrestricted expression can
* always be used by surrounding it with parens.
Thanks to Sam Mason for the heads up on this.
TODO: copy this approach here.
> where
> dodgyParseElement = factor
> functionCallSuffix :: Expression -> ParsecT [Token] ParseState Identity Expression
> functionCallSuffix (Identifier _ fnName) = FunCall [] fnName <$> parens (commaSep expr)
> functionCallSuffix s = error $ "internal error: cannot make functioncall from " ++ show s
> castKeyword :: ParsecT [Token] ParseState Identity Expression
> castKeyword = keyword "cast" *> symbol "(" >>
> Cast [] <$> expr
> <*> (keyword "as" *> typeName <* symbol ")")
> castSuffix :: Expression -> ParsecT [Token] ParseState Identity Expression
> castSuffix ex = Cast [] ex <$> (symbol "::" *> typeName)
> substring :: ParsecT [Token] ParseState Identity Expression
> substring = do
> keyword "substring"
> symbol "("
> a <- expr
> keyword "from"
> b <- expr
> keyword "for"
> c <- expr
> symbol ")"
> return $ FunCall [] "!substring" [a,b,c]
> identifier :: ParsecT [Token] ParseState Identity Expression
> identifier = Identifier [] <$> idString
================================================================================
= Utility parsers
== tokeny things
keyword has to not be immediately followed by letters or numbers
(symbols and whitespace are ok) so we know that we aren't reading an
identifier which happens to start with a complete keyword
> keyword :: String -> ParsecT [Token] ParseState Identity String
> keyword k = mytoken (\tok ->
> case tok of
> IdStringTok i | lcase k == lcase i -> Just k
> _ -> Nothing)
> where
> lcase = map toLower
> idString :: MyParser String
> idString = mytoken (\tok -> case tok of
> IdStringTok i -> Just i
> _ -> Nothing)
> symbol :: String -> ParsecT [Token] ParseState Identity String
> symbol c = mytoken (\tok -> case tok of
> SymbolTok s | c==s -> Just c
> _ -> Nothing)
> integer :: MyParser Integer
> integer = mytoken (\tok -> case tok of
> IntegerTok n -> Just n
> _ -> Nothing)
> positionalArg :: ParsecT [Token] ParseState Identity Expression
> positionalArg = PositionalArg [] <$> mytoken (\tok -> case tok of
> PositionalArgTok n -> Just n
> _ -> Nothing)
> float :: MyParser Double
> float = mytoken (\tok -> case tok of
> FloatTok n -> Just n
> _ -> Nothing)
> stringLit :: MyParser Expression
> stringLit = mytoken (\tok ->
> case tok of
> StringTok d s -> Just $ StringLit [] d s
> _ -> Nothing)
couple of helper functions which extract the actual string
from a StringLD or StringL, and the delimiters which were used
(either ' or a dollar tag)
> extrStr :: Expression -> String
> extrStr (StringLit _ _ s) = s
> extrStr x = error $ "internal error: extrStr not supported for this type " ++ show x
> quoteOfString :: Expression -> String
> quoteOfString (StringLit _ tag _) = tag
> quoteOfString x = error $ "internal error: quoteType not supported for this type " ++ show x
== combinatory things
> parens :: ParsecT [Token] ParseState Identity a
> -> ParsecT [Token] ParseState Identity a
> parens = between (symbol "(") (symbol ")")
> squares :: ParsecT [Token] ParseState Identity a
> -> ParsecT [Token] ParseState Identity a
> squares = between (symbol "[") (symbol "]")
> tryOptionMaybe :: (Stream s m t) =>
> ParsecT s u m a -> ParsecT s u m (Maybe a)
> tryOptionMaybe p = try (optionMaybe p) <|> return Nothing
> commaSep2 :: ParsecT [Token] ParseState Identity a
> -> ParsecT [Token] ParseState Identity [a]
> commaSep2 p = sepBy2 p (symbol ",")
> sepBy2 :: (Stream s m t) =>
> ParsecT s u m a -> ParsecT s u m b -> ParsecT s u m [a]
> sepBy2 p sep = (p <* sep) <:> sepBy1 p sep
> commaSep :: ParsecT [Token] ParseState Identity a
> -> ParsecT [Token] ParseState Identity [a]
> commaSep p = sepBy p (symbol ",")
> commaSep1 :: ParsecT [Token] ParseState Identity a
> -> ParsecT [Token] ParseState Identity [a]
> commaSep1 p = sepBy1 p (symbol ",")
doesn't seem too gratuitous, comes up a few times
> (<:>) :: (Applicative f) =>
> f a -> f [a] -> f [a]
> (<:>) a b = (:) <$> a <*> b
pass a list of pairs of strings and values
try each pair k,v in turn,
if keyword k matches then return v
doesn't really add a lot of value
> matchAKeyword :: [(String, a)] -> ParsecT [Token] ParseState Identity a
> matchAKeyword [] = fail "no matches"
> matchAKeyword ((k,v):kvs) = v <$ keyword k <|> matchAKeyword kvs
parseOptionalSuffix
parse the start of something -> parseResultA,
then parse an optional suffix -> parseResultB
if this second parser succeeds, return fn2 parseResultA parseResultB
else return fn1 parseResultA
e.g.
parsing an identifier in a select list can be
fieldName
or
fieldName as alias
so we can pass
* IdentifierCtor
* identifier (returns aval)
* AliasedIdentifierCtor
* () looks like a place holder, probably a crap idea
* parser for (as b) (returns bval)
as the args, which I like to ident like:
parseOptionalSuffix
IdentifierCtor identifierParser
AliasedIdentifierCtor () asAliasParser
and we get either
* IdentifierCtor identifierValue
or
* AliasedIdentifierCtor identifierValue aliasValue
as the result depending on whether the asAliasParser
succeeds or not.
probably this concept already exists under a better name in parsing
theory
> optionalSuffix :: (Stream s m t2) =>
> (t1 -> b)
> -> ParsecT s u m t1
> -> (t1 -> a -> b)
> -> ()
> -> ParsecT s u m a
> -> ParsecT s u m b
> optionalSuffix c1 p1 c2 _ p2 = do
> x <- p1
> option (c1 x) (c2 x <$> try p2)
threadOptionalSuffix
parse the start of something -> parseResultA,
then parse an optional suffix, passing parseResultA
to this parser -> parseResultB
return parseResultB is it succeeds, else return parseResultA
sort of like a suffix operator parser where the suffixisable part
is parsed, then if the suffix is there it wraps the suffixisable
part in an enclosing tree node.
parser1 -> tree1
(parser2 tree1) -> maybe tree2
tree2 isnothing ? tree1 : tree2
> threadOptionalSuffix :: ParsecT [tok] st Identity a
> -> (a -> GenParser tok st a)
> -> ParsecT [tok] st Identity a
> threadOptionalSuffix p1 p2 = do
> x <- p1
> option x (try $ p2 x)
I'm pretty sure this is some standard monad operation but I don't know
what. It's a bit like the maybe monad but when you get nothing it
returns the previous result instead of nothing
if you take the parsing specific stuff out you get:
p1 :: (Monad m) =>
m b -> (b -> m (Maybe b)) -> m b
p1 = do
x <- p1
y <- p2 x
case y of
Nothing -> return x
Just z -> return z
=====
like thread optional suffix, but we pass a list of suffixes in, not
much of a shorthand
> threadOptionalSuffixes :: ParsecT [tok] st Identity a
> -> [a -> GenParser tok st a]
> -> ParsecT [tok] st Identity a
> threadOptionalSuffixes p1 p2s = do
> x <- p1
> option x (try $ choice (map (\l -> l x) p2s))
couldn't work how to to perms so just did this hack instead
e.g.
a1,a2,b1,b2,a2,b3,b4 parses to ([a1,a2,a3],[b1,b2,b3,b4])
> multiPerm :: (Stream s m t) =>
> ParsecT s u m a1
> -> ParsecT s u m a
> -> ParsecT s u m sep
> -> ParsecT s u m ([a1], [a])
> multiPerm p1 p2 sep = do
> (r1, r2) <- unzip <$> sepBy1 parseAorB sep
> return (catMaybes r1, catMaybes r2)
> where
> parseAorB = choice [
> (\x -> (Just x,Nothing)) <$> p1
> ,(\y -> (Nothing, Just y)) <$> p2]
== lexer stuff
> type MyParser = GenParser Token ParseState
> mytoken :: (Tok -> Maybe a) -> MyParser a
> mytoken test
> = token showToken posToken testToken
> where
> showToken (_,tok) = show tok
> posToken (posi,_) = posi
> testToken (_,tok) = test tok