Copyright 2009 Jake Wheat This file contains the lexer for sql source text. Lexicon: string identifier or keyword symbols - operators and ;,()[] positional arg int float copy payload (used to lex copy from stdin data) > module Database.HsSqlPpp.Lexer ( > Token > ,Tok(..) > ,lexSqlFile > ,lexSqlText > ) where > import Text.Parsec hiding(many, optional, (<|>)) > import qualified Text.Parsec.Token as P > import Text.Parsec.Language > import Text.Parsec.String > import Control.Applicative > import Control.Monad.Identity > import Data.Char > import Database.HsSqlPpp.ParseErrors ================================================================================ = data types > type Token = (SourcePos, Tok) > data Tok = StringTok String String --delim, value (delim will one of > --', $$, $[stuff]$ > | IdStringTok String --includes . and x.y.* type stuff > | SymbolTok String -- operators, and ()[],;: > -- * is currently always lexed as an id > -- rather than an operator > -- this gets fixed in the parsing stage > | PositionalArgTok Integer -- $1, etc. > | FloatTok Double > | IntegerTok Integer > | CopyPayloadTok String -- support copy from stdin; with inline data > deriving (Eq,Show) > type ParseState = [Tok] > lexSqlFile :: FilePath -> IO (Either ExtendedError [Token]) > lexSqlFile f = do > te <- readFile f > let x = runParser sqlTokens [] f te --parseFromFile sqlTokens f > return $ convertToExtendedError x f te > lexSqlText :: String -> Either ExtendedError [Token] > lexSqlText s = convertToExtendedError (runParser sqlTokens [] "" s) "" s ================================================================================ = lexers lexer for tokens, contains a hack for copy from stdin with inline table data. > sqlTokens :: ParsecT String ParseState Identity [Token] > sqlTokens = > setState [] >> > whiteSpace >> > many sqlToken <* eof Lexer for an individual token. What we could do is lex lazily and when the lexer reads a copy from stdin statement, it switches lexers to lex the inline table data, then switches back. Don't know how to do this in parsec, or even if it is possible, so as a work around, we use the state to trap if we've just seen 'from stdin;', if so, we read the copy payload as one big token, otherwise we read a normal token. > sqlToken :: ParsecT String ParseState Identity Token > sqlToken = do > sp <- getPosition > sta <- getState > t <- if sta == [ft,st,mt] > then copyPayload > else try sqlString > <|> try idString > <|> try positionalArg > <|> try sqlSymbol > <|> try sqlFloat > <|> try sqlInteger > updateState $ \stt -> > case () of > _ | stt == [] && t == ft -> [ft] > | stt == [ft] && t == st -> [ft,st] > | stt == [ft,st] && t == mt -> [ft,st,mt] > | otherwise -> [] > return (sp,t) > where > ft = IdStringTok "from" > st = IdStringTok "stdin" > mt = SymbolTok ";" == specialized token parsers > sqlString :: ParsecT String ParseState Identity Tok > sqlString = stringQuotes <|> stringLD > where > --parse a string delimited by single quotes > stringQuotes = StringTok "\'" <$> stringPar > stringPar = optional (char 'E') *> char '\'' > *> readQuoteEscape <* whiteSpace > --(readquoteescape reads the trailing ') have to read two consecutive single quotes as a quote character instead of the end of the string, probably an easier way to do this other escapes (e.g. \n \t) are left unprocessed > readQuoteEscape = do > x <- anyChar > if x == '\'' > then try ((x:) <$> (char '\'' *> readQuoteEscape)) > <|> return "" > else (x:) <$> readQuoteEscape parse a dollar quoted string > stringLD = do > -- cope with $$ as well as $[identifier]$ > tag <- try (char '$' *> ((char '$' *> return "") > <|> (identifierString <* char '$'))) > s <- lexeme $ manyTill anyChar > (try $ char '$' <* string tag <* char '$') > return $ StringTok ("$" ++ tag ++ "$") s > idString :: ParsecT String ParseState Identity Tok > idString = IdStringTok <$> identifierString > positionalArg :: ParsecT String ParseState Identity Tok > positionalArg = char '$' >> PositionalArgTok <$> integer Lexing symbols: approach 1: try to keep multi symbol operators as single lexical items (e.g. "==", "~==" approach 2: make each character a separate element e.g. == lexes to ['=', '='] then the parser sorts this out Sort of using approach 1 at the moment, see below == notes on symbols in pg operators pg symbols can be made from: =_*/<>=~!@#%^&|`? no --, /* in symbols can't end in + or - unless contains ~!@#%^&|? Most of this isn't relevant for the current lexer. == sql symbols for this lexer: sql symbol is one of ()[],; - single character +-*/<>=~!@#%^&|`? string - one or more of these, parsed until hit char which isn't one of these (including whitespace). This will parse some standard sql expressions wrongly at the moment, work around is to add whitespace e.g. i think 3*-4 is valid sql, should lex as '3' '*' '-' '4', but will currently lex as '3' '*-' '4'. This is planned to be fixed in the parser. .. := :: : - other special cases > sqlSymbol :: ParsecT String ParseState Identity Tok > sqlSymbol = > SymbolTok <$> lexeme (choice [ > replicate 1 <$> oneOf "()[],;" > ,string ".." > ,try $ string "::" > ,try $ string ":=" > ,string ":" > ,many1 (oneOf "+-*/<>=~!@#%^&|`?") > ]) > sqlFloat :: ParsecT String ParseState Identity Tok > sqlFloat = FloatTok <$> float > sqlInteger :: ParsecT String ParseState Identity Tok > sqlInteger = IntegerTok <$> integer ================================================================================ additional parser bits and pieces include dots, * in all identifier strings during lexing. This parser is also used for keywords, so identifiers and keywords aren't distinguished until during proper parsing, and * and qualifiers aren't really examined until type checking > identifierString :: ParsecT String ParseState Identity String > identifierString = lexeme $ choice [ > "*" <$ symbol "*" > ,do > a <- nonStarPart > b <- tryMaybeP ((++) <$> symbol "." <*> identifierString) > case b of Nothing -> return a > Just c -> return $ a ++ c] > where > nonStarPart = (letter <|> char '_') <:> secondOnwards > secondOnwards = many (alphaNum <|> char '_') parse the block of inline data for a copy from stdin, ends with \. on its own on a line > copyPayload :: ParsecT String ParseState Identity Tok > copyPayload = CopyPayloadTok <$> lexeme (getLinesTillMatches "\\.\n") > where > getLinesTillMatches s = do > x <- getALine > if x == s > then return "" > else (x++) <$> getLinesTillMatches s > getALine = (++"\n") <$> manyTill anyChar (try newline) doesn't seem too gratuitous, comes up a few times > (<:>) :: (Applicative f) => > f a -> f [a] -> f [a] > (<:>) a b = (:) <$> a <*> b > tryMaybeP :: GenParser tok st a > -> ParsecT [tok] st Identity (Maybe a) > tryMaybeP p = try (optionMaybe p) <|> return Nothing ================================================================================ = parsec pass throughs > symbol :: String -> ParsecT String ParseState Identity String > symbol = P.symbol lexer > integer :: ParsecT String ParseState Identity Integer > integer = lexeme $ P.integer lexer > float :: ParsecT String ParseState Identity Double > float = lexeme $ P.float lexer > whiteSpace :: ParsecT String ParseState Identity () > whiteSpace= P.whiteSpace lexer > lexeme :: ParsecT String ParseState Identity a > -> ParsecT String ParseState Identity a > lexeme = P.lexeme lexer this lexer isn't really used as much as it could be, probably some of the fields are not used at all (like identifier and operator stuff) > lexer :: P.GenTokenParser String ParseState Identity > lexer = P.makeTokenParser (emptyDef { > P.commentStart = "/*" > ,P.commentEnd = "*/" > ,P.commentLine = "--" > ,P.nestedComments = False > ,P.identStart = letter <|> char '_' > ,P.identLetter = alphaNum <|> oneOf "_" > ,P.opStart = P.opLetter emptyDef > ,P.opLetter = oneOf opLetters > ,P.reservedOpNames= [] > ,P.reservedNames = [] > ,P.caseSensitive = False > }) > opLetters :: String > opLetters = ".:^*/%+-<>=|!"