-- Copyright (c) 2017 Uber Technologies, Inc. -- -- Permission is hereby granted, free of charge, to any person obtaining a copy -- of this software and associated documentation files (the "Software"), to deal -- in the Software without restriction, including without limitation the rights -- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -- copies of the Software, and to permit persons to whom the Software is -- furnished to do so, subject to the following conditions: -- -- The above copyright notice and this permission notice shall be included in -- all copies or substantial portions of the Software. -- -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -- THE SOFTWARE. 
module Database.Sql.Hive.Parser.Token where

import Database.Sql.Hive.Token
import Database.Sql.Hive.Parser.Internal
import Database.Sql.Position

import qualified Text.Parsec as P
import qualified Text.Parsec.Pos as P

import Data.Char (isDigit)
import Data.String
import Data.Text.Lazy hiding (foldl1, map, head, last, all, null, init)
import qualified Data.Text.Lazy.Encoding as TL
import Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as BL
import Data.Semigroup ((<>))

-- | Render a positioned token for Parsec error messages; only the token
-- itself is shown, the two positions are ignored.
showTok :: (Token, Position, Position) -> String
showTok (tok, _start, _end) = show tok

-- | Position callback for 'P.tokenPrim'. The result is built solely from the
-- start position of the token just examined: the previous 'P.SourcePos' and
-- the remaining input are ignored, and the source name is always @"-"@.
posFromTok :: P.SourcePos -> (Token, Position, Position) -> [(Token, Position, Position)] -> P.SourcePos
posFromTok _ (_, pos, _) _ = P.newPos "-" line column
  where
    line = fromEnum $ positionLine pos
    column = fromEnum $ positionColumn pos

-- | Accept exactly the given token and return the 'Range' it spans.
tokEqualsP :: Token -> Parser Range
tokEqualsP tok = P.tokenPrim showTok posFromTok matchTok
  where
    matchTok (tok', start, end)
        | tok == tok' = Just (Range start end)
        | otherwise = Nothing

-- | Accept any single token other than the given one and return its 'Range'.
tokNotEqualsP :: Token -> Parser Range
tokNotEqualsP tok = P.tokenPrim showTok posFromTok matchTok
  where
    matchTok (tok', start, end)
        | tok /= tok' = Just (Range start end)
        | otherwise = Nothing

-- | Token test shared by the simple name parsers: any 'TokWord', whatever
-- its flag, yields its text and range; every other token is rejected.
testNameTok :: (Token, Position, Position) -> Maybe (Text, Range)
testNameTok (TokWord _ name, start, end) = Just (name, Range start end)
testNameTok _ = Nothing

-- | Accept a 'TokVariable' token and return only its 'Range'.
variableSubstitutionP :: Parser Range
variableSubstitutionP = P.tokenPrim showTok posFromTok matchVariable
  where
    matchVariable (TokVariable _ _, start, end) = Just (Range start end)
    matchVariable _ = Nothing

-- The next four parsers are identical: each accepts any word token via
-- 'testNameTok'. They differ only in the name callers use for them.

typeNameP :: Parser (Text, Range)
typeNameP = P.tokenPrim showTok posFromTok testNameTok

nodeNameP :: Parser (Text, Range)
nodeNameP = P.tokenPrim showTok posFromTok testNameTok

structFieldNameP :: Parser (Text, Range)
structFieldNameP = P.tokenPrim showTok posFromTok testNameTok

windowNameP :: Parser (Text, Range)
windowNameP = P.tokenPrim showTok posFromTok testNameTok

-- | Accept a word token whose lower-cased text is one of the recognised date
-- part names or abbreviations. The lower-cased text is what gets returned.
datePartP :: Parser (Text, Range)
datePartP = P.tokenPrim showTok posFromTok matchPart
  where
    matchPart (TokWord _ name, start, end)
        | lowered `elem` knownParts = Just (lowered, Range start end)
      where
        lowered = toLower name
    matchPart _ = Nothing

    knownParts :: [Text]
    knownParts =
        [ "year", "yy", "yyyy"
        , "quarter", "qq", "q"
        , "month", "mm", "m"
        , "day", "dd", "d", "dy", "dayofyear", "y"
        , "week", "wk", "ww"
        , "hour", "hh"
        , "minute", "mi", "n"
        , "second", "ss", "s"
        , "millisecond", "ms"
        , "microsecond", "mcs", "us"
        ]

-- The identifier parsers below share one shape. A 'TokWord' whose flag is
-- True is accepted unconditionally (presumably a quoted identifier -- confirm
-- against the 'Token' definition); a 'TokWord' whose flag is False is
-- accepted only when the relevant @wordCanBe...@ predicate holds for its
-- 'wordInfo'. Anything else is rejected.

schemaNameP :: Parser (Text, Range)
schemaNameP = P.tokenPrim showTok posFromTok accept
  where
    accept (TokWord exempt name, start, end)
        | exempt || wordCanBeSchemaName (wordInfo name) = Just (name, Range start end)
    accept _ = Nothing

tableNameP :: Parser (Text, Range)
tableNameP = P.tokenPrim showTok posFromTok accept
  where
    accept (TokWord exempt name, start, end)
        | exempt || wordCanBeTableName (wordInfo name) = Just (name, Range start end)
    accept _ = Nothing

-- | Projection names are vetted with the same predicate as table names
-- ('wordCanBeTableName').
projectionNameP :: Parser (Text, Range)
projectionNameP = P.tokenPrim showTok posFromTok accept
  where
    accept (TokWord exempt name, start, end)
        | exempt || wordCanBeTableName (wordInfo name) = Just (name, Range start end)
    accept _ = Nothing

columnNameP :: Parser (Text, Range)
columnNameP = P.tokenPrim showTok posFromTok accept
  where
    accept (TokWord exempt name, start, end)
        | exempt || wordCanBeColumnName (wordInfo name) = Just (name, Range start end)
    accept _ = Nothing

functionNameP :: Parser (Text, Range)
functionNameP = P.tokenPrim showTok posFromTok accept
  where
    accept (TokWord exempt name, start, end)
        | exempt || wordCanBeFunctionName (wordInfo name) = Just (name, Range start end)
    accept _ = Nothing

-- | Text of a property value: everything 'textUntilP' collects before a
-- @;@ token.
propertyValuePartP :: Parser (Text, Range)
propertyValuePartP = textUntilP [";"]

-- Hive supports property names and values containing
-- equal signs and spaces. Here we stop on the first equal sign.
-- | Text of a property name: everything 'textUntilP' collects before the
-- first @=@ or @;@ token.
propertyNameP :: Parser (Text, Range)
propertyNameP = textUntilP ["=", ";"]

-- | Consume consecutive symbol, word and number tokens until one whose text
-- is a member of the stop list is reached. The stop token is not consumed,
-- and any other kind of token also ends the run.
--
-- Returns the consumed token texts concatenated with no separator, together
-- with a range running from the first consumed token to the last.
--
-- The parser succeeds even when nothing is consumed, in which case the text
-- is empty. No range exists then, so the range component is an 'error' thunk
-- that only fails if a caller actually forces it.
textUntilP :: [Text] -> Parser (Text, Range)
textUntilP x = do
    pieces <- P.many anyTokenExceptX
    let name = Data.Text.Lazy.concat $ map fst pieces
        range = case map snd pieces of
            [] -> error "textUntilP: no tokens were consumed, so there is no range to report"
            ranges@(firstRange : _) -> firstRange <> last ranges
    pure (name, range)
  where
    anyTokenExceptX :: Parser (Text, Range)
    anyTokenExceptX = P.tokenPrim showTok posFromTok $ \(tok, s, e) ->
        let keep t
                | t `notElem` x = Just (t, Range s e)
                | otherwise = Nothing
         in case tok of
                TokSymbol t -> keep t
                TokWord _ t -> keep t
                TokNumber t -> keep t
                _ -> Nothing

-- | Accept a 'TokWord' whose flag is False and whose text equals the given
-- keyword exactly, returning the token's 'Range'. The comparison is a plain
-- equality test; presumably the tokenizer has already normalised the case of
-- such words -- confirm against the tokenizer.
keywordP :: Text -> Parser Range
keywordP keyword = P.tokenPrim showTok posFromTok matchKeyword
  where
    matchKeyword (TokWord False name, s, e)
        | name == keyword = Just (Range s e)
    matchKeyword _ = Nothing

-- | Accept a field name given either as a word token or as a string token.
-- String tokens are decoded as UTF-8 before being returned.
fieldTypeP :: Parser (Text, Range)
fieldTypeP = P.tokenPrim showTok posFromTok matchField
  where
    -- Polymorphic because it is applied to both 'Text' and 'ByteString'.
    isField :: (Eq s, IsString s) => s -> Bool
    isField candidate = candidate `elem`
        [ "century", "day", "decade", "doq", "dow", "doy", "epoch"
        , "hour", "isodow", "isoweek", "isoyear", "microseconds"
        , "millennium", "milliseconds", "minute", "month", "quarter"
        , "second", "time zone", "timezone_hour", "timezone_minute"
        , "week", "year"
        ]

    matchField (TokWord _ field, s, e)
        | isField field = Just (field, Range s e)
    matchField (TokString field, s, e)
        | isField field = Just (TL.decodeUtf8 field, Range s e)
    matchField _ = Nothing

-- | Accept a period name given either as a word token or as a string token.
-- String tokens are decoded as UTF-8 before being returned.
periodP :: Parser (Text, Range)
periodP = P.tokenPrim showTok posFromTok matchPeriod
  where
    -- Polymorphic because it is applied to both 'Text' and 'ByteString'.
    isPeriod :: (Eq s, IsString s) => s -> Bool
    isPeriod candidate = candidate `elem`
        [ "day", "hour", "minute", "month", "second", "year" ]

    matchPeriod (TokWord _ period, s, e)
        | isPeriod period = Just (period, Range s e)
    matchPeriod (TokString period, s, e)
        | isPeriod period = Just (TL.decodeUtf8 period, Range s e)
    matchPeriod _ = Nothing

-- | Accept a byte-length literal such as @100M@: one or more decimal digits
-- followed by a single unit letter @b@, @k@, @m@ or @g@ in either case. The
-- token text is returned unchanged (not lower-cased).
byteAmountP :: Parser (Text, Range)
byteAmountP = P.tokenPrim showTok posFromTok matchAmount
  where
    isByteAmount :: Text -> Bool
    isByteAmount text = case unpack (toLower text) of
        [] -> False
        str ->
            let digits = init str
             in -- At least one digit is required: a bare unit letter such as
                -- "m" is not a byte amount.
                not (null digits)
                    && all isDigit digits
                    && last str `elem` ['b', 'k', 'm', 'g']

    matchAmount (TokWord False byteAmount, s, e)
        | isByteAmount byteAmount = Just (byteAmount, Range s e)
    matchAmount _ = Nothing

-- | Accept one or more consecutive string tokens and return their contents
-- appended in order, with the ranges of all the tokens combined.
stringP :: Parser (ByteString, Range)
stringP = do
    tokens <- P.many1 singleStringTokenP
    -- 'P.many1' guarantees a non-empty list, so 'foldl1' is safe here.
    let contents = BL.concat (map fst tokens)
        range = foldl1 (<>) (map snd tokens)
    pure (contents, range)
  where
    singleStringTokenP :: Parser (ByteString, Range)
    singleStringTokenP = P.tokenPrim showTok posFromTok matchString
      where
        matchString (TokString string, s, e) = Just (string, Range s e)
        matchString _ = Nothing

-- | Accept a number token and return its text and range.
numberP :: Parser (Text, Range)
numberP = P.tokenPrim showTok posFromTok matchNumber
  where
    matchNumber (TokNumber number, s, e) = Just (number, Range s e)
    matchNumber _ = Nothing

-- Symbol parsers: each is 'symbolP' applied to one literal symbol.

dotP :: Parser Range
dotP = symbolP "."

equalP :: Parser Range
equalP = symbolP "="

colonP :: Parser Range
colonP = symbolP ":"

-- | Accept the 'TokSymbol' with exactly the given text.
symbolP :: Text -> Parser Range
symbolP = tokEqualsP . TokSymbol

starP :: Parser Range
starP = symbolP "*"

openP :: Parser Range
openP = symbolP "("

closeP :: Parser Range
closeP = symbolP ")"

openBracketP :: Parser Range
openBracketP = symbolP "["

closeBracketP :: Parser Range
closeBracketP = symbolP "]"

openAngleP :: Parser Range
openAngleP = symbolP "<"

closeAngleP :: Parser Range
closeAngleP = symbolP ">"

castP :: Parser Range
castP = keywordP "cast"

castOpP :: Parser Range
castOpP = symbolP "::"

minusP :: Parser Range
minusP = symbolP "-"

-- Keyword parsers, "accessrank" through "database". Unless a symbol is
-- involved, each one is 'keywordP' applied to a single literal word.

accessRankP :: Parser Range
accessRankP = keywordP "accessrank"

addP :: Parser Range
addP = keywordP "add"

afterP :: Parser Range
afterP = keywordP "after"

allP :: Parser Range
allP = keywordP "all"

alterP :: Parser Range
alterP = keywordP "alter"

analyzeP :: Parser Range
analyzeP = keywordP "analyze"

andP :: Parser Range
andP = keywordP "and"

arrayP :: Parser Range
arrayP = keywordP "array"

asP :: Parser Range
asP = keywordP "as"

ascP :: Parser Range
ascP = keywordP "asc"

atP :: Parser Range
atP = keywordP "at"

autoP :: Parser Range
autoP = keywordP "auto"

avroP :: Parser Range
avroP = keywordP "avro"

bestP :: Parser Range
bestP = keywordP "best"

betweenP :: Parser Range
betweenP = keywordP "between"

bucketP :: Parser Range
bucketP = keywordP "bucket"

bucketsP :: Parser Range
bucketsP = keywordP "buckets"

byP :: Parser Range
byP = keywordP "by"

cacheP :: Parser Range
cacheP = keywordP "cache"

cascadeP :: Parser Range
cascadeP = keywordP "cascade"

caseP :: Parser Range
caseP = keywordP "case"

changeP :: Parser Range
changeP = keywordP "change"

clusterP :: Parser Range
clusterP = keywordP "cluster"

clusteredP :: Parser Range
clusteredP = keywordP "clustered"

collectionP :: Parser Range
collectionP = keywordP "collection"

columnP :: Parser Range
columnP = keywordP "column"

columnsP :: Parser Range
columnsP = keywordP "columns"

commaP :: Parser Range
commaP = symbolP ","

commentP :: Parser Range
commentP = keywordP "comment"

commitP :: Parser Range
commitP = keywordP "commit"

computeP :: Parser Range
computeP = keywordP "compute"

createP :: Parser Range
createP = keywordP "create"

crossP :: Parser Range
crossP = keywordP "cross"

cubeP :: Parser Range
cubeP = keywordP "cube"

currentP :: Parser Range
currentP = keywordP "current"

-- The @current_*@ parsers return the keyword text alongside its range.

currentDatabaseP :: Parser (Text, Range)
currentDatabaseP = (,) name <$> keywordP name
  where
    name = "current_database"

currentDateP :: Parser (Text, Range)
currentDateP = (,) name <$> keywordP name
  where
    name = "current_date"

currentSchemaP :: Parser (Text, Range)
currentSchemaP = (,) name <$> keywordP name
  where
    name = "current_schema"

currentTimeP :: Parser (Text, Range)
currentTimeP = (,) name <$> keywordP name
  where
    name = "current_time"

currentTimestampP :: Parser (Text, Range)
currentTimestampP = (,) name <$> keywordP name
  where
    name = "current_timestamp"

currentUserP :: Parser (Text, Range)
currentUserP = (,) name <$> keywordP name
  where
    name = "current_user"

dataP :: Parser Range
dataP = keywordP "data"

databaseP :: Parser Range
databaseP = keywordP "database"
-- Keyword parsers, "datediff" through "if". Unless noted otherwise, each one
-- is 'keywordP' applied to a single literal word and returns the 'Range' of
-- the matched token.

dateDiffP :: Parser Range
dateDiffP = keywordP "datediff"

dbPropertiesP :: Parser Range
dbPropertiesP = keywordP "dbproperties"

defaultP :: Parser Range
defaultP = keywordP "default"

definedP :: Parser Range
definedP = keywordP "defined"

deleteP :: Parser Range
deleteP = keywordP "delete"

delimitedP :: Parser Range
delimitedP = keywordP "delimited"

descP :: Parser Range
descP = keywordP "desc"

-- | Accepts either spelling: @describe@ is tried first, then @desc@.
describeP :: Parser Range
describeP = keywordP "describe" P.<|> descP

directoryP :: Parser Range
directoryP = keywordP "directory"

distinctP :: Parser Range
distinctP = keywordP "distinct"

distributeP :: Parser Range
distributeP = keywordP "distribute"

dropP :: Parser Range
dropP = keywordP "drop"

elseP :: Parser Range
elseP = keywordP "else"

encodingP :: Parser Range
encodingP = keywordP "encoding"

endP :: Parser Range
endP = keywordP "end"

escapeP :: Parser Range
escapeP = keywordP "escape"

escapedP :: Parser Range
escapedP = keywordP "escaped"

excludingP :: Parser Range
excludingP = keywordP "excluding"

existsP :: Parser Range
existsP = keywordP "exists"

explainP :: Parser Range
explainP = keywordP "explain"

externalP :: Parser Range
externalP = keywordP "external"

extractP :: Parser Range
extractP = keywordP "extract"

falseP :: Parser Range
falseP = keywordP "false"

fieldsP :: Parser Range
fieldsP = keywordP "fields"

firstP :: Parser Range
firstP = keywordP "first"

followingP :: Parser Range
followingP = keywordP "following"

forP :: Parser Range
forP = keywordP "for"

formatP :: Parser Range
formatP = keywordP "format"

fromP :: Parser Range
fromP = keywordP "from"

functionP :: Parser Range
functionP = keywordP "function"

fullP :: Parser Range
fullP = keywordP "full"

globalP :: Parser Range
globalP = keywordP "global"

grantP :: Parser Range
grantP = keywordP "grant"

groupP :: Parser Range
groupP = keywordP "group"

groupingP :: Parser Range
groupingP = keywordP "grouping"

havingP :: Parser Range
havingP = keywordP "having"

ifP :: Parser Range
ifP = keywordP "if"
-- Keyword parsers, "ignore" through "not". Unless noted otherwise, each one
-- is 'keywordP' applied to a single literal word and returns the 'Range' of
-- the matched token.

ignoreP :: Parser Range
ignoreP = keywordP "ignore"

inP :: Parser Range
inP = keywordP "in"

includingP :: Parser Range
includingP = keywordP "including"

inPathP :: Parser Range
inPathP = keywordP "inpath"

innerP :: Parser Range
innerP = keywordP "inner"

inputFormatP :: Parser Range
inputFormatP = keywordP "inputformat"

insertP :: Parser Range
insertP = keywordP "insert"

intervalP :: Parser Range
intervalP = keywordP "interval"

intoP :: Parser Range
intoP = keywordP "into"

isP :: Parser Range
isP = keywordP "is"

itemsP :: Parser Range
itemsP = keywordP "items"

joinP :: Parser Range
joinP = keywordP "join"

keysP :: Parser Range
keysP = keywordP "keys"

ksafeP :: Parser Range
ksafeP = keywordP "ksafe"

lastP :: Parser Range
lastP = keywordP "last"

lateralP :: Parser Range
lateralP = keywordP "lateral"

leftP :: Parser Range
leftP = keywordP "left"

likeP :: Parser Range
likeP = keywordP "like"

limitP :: Parser Range
limitP = keywordP "limit"

linesP :: Parser Range
linesP = keywordP "lines"

loadP :: Parser Range
loadP = keywordP "load"

localP :: Parser Range
localP = keywordP "local"

-- | Returns the keyword text alongside its range.
localTimeP :: Parser (Text, Range)
localTimeP = (,) name <$> keywordP name
  where
    name = "localtime"

-- | Returns the keyword text alongside its range.
localTimestampP :: Parser (Text, Range)
localTimestampP = (,) name <$> keywordP name
  where
    name = "localtimestamp"

locationP :: Parser Range
locationP = keywordP "location"

mapP :: Parser Range
mapP = keywordP "map"

metadataP :: Parser Range
metadataP = keywordP "metadata"

noP :: Parser Range
noP = keywordP "no"

nodeP :: Parser Range
nodeP = keywordP "node"

nodesP :: Parser Range
nodesP = keywordP "nodes"

noScanP :: Parser Range
noScanP = keywordP "noscan"

notP :: Parser Range
notP = keywordP "not"

-- | Logical negation in either spelling: the keyword @not@ is tried first,
-- then the symbol @!@.
notOperatorP :: Parser Range
notOperatorP = notP P.<|> symbolP "!"
-- Keyword parsers, "null" through "with". Unless noted otherwise, each one is
-- 'keywordP' applied to a single literal word and returns the 'Range' of the
-- matched token.

nullP :: Parser Range
nullP = keywordP "null"

nullsP :: Parser Range
nullsP = keywordP "nulls"

nullsequalP :: Parser Range
nullsequalP = keywordP "nullsequal"

ofP :: Parser Range
ofP = keywordP "of"

offsetP :: Parser Range
offsetP = keywordP "offset"

onP :: Parser Range
onP = keywordP "on"

orP :: Parser Range
orP = keywordP "or"

orcP :: Parser Range
orcP = keywordP "orc"

orderP :: Parser Range
orderP = keywordP "order"

overlapsP :: Parser Range
overlapsP = keywordP "overlaps"

overwriteP :: Parser Range
overwriteP = keywordP "overwrite"

outP :: Parser Range
outP = keywordP "out"

outerP :: Parser Range
outerP = keywordP "outer"

outputFormatP :: Parser Range
outputFormatP = keywordP "outputformat"

overP :: Parser Range
overP = keywordP "over"

parametersP :: Parser Range
parametersP = keywordP "parameters"

parquetP :: Parser Range
parquetP = keywordP "parquet"

partitionP :: Parser Range
partitionP = keywordP "partition"

partitionedP :: Parser Range
partitionedP = keywordP "partitioned"

percentP :: Parser Range
percentP = keywordP "percent"

precedingP :: Parser Range
precedingP = keywordP "preceding"

preserveP :: Parser Range
preserveP = keywordP "preserve"

projectionP :: Parser Range
projectionP = keywordP "projection"

projectionsP :: Parser Range
projectionsP = keywordP "projections"

protectionP :: Parser Range
protectionP = keywordP "protection"

purgeP :: Parser Range
purgeP = keywordP "purge"

randP :: Parser Range
randP = keywordP "rand"

rangeP :: Parser Range
rangeP = keywordP "range"

rcFileP :: Parser Range
rcFileP = keywordP "rcfile"

regexpP :: Parser Range
regexpP = keywordP "regexp"

reloadP :: Parser Range
reloadP = keywordP "reload"

renameP :: Parser Range
renameP = keywordP "rename"

restrictP :: Parser Range
restrictP = keywordP "restrict"

revokeP :: Parser Range
revokeP = keywordP "revoke"

rlikeP :: Parser Range
rlikeP = keywordP "rlike"

rightP :: Parser Range
rightP = keywordP "right"

rollbackP :: Parser Range
rollbackP = keywordP "rollback"

rollupP :: Parser Range
rollupP = keywordP "rollup"

rowP :: Parser Range
rowP = keywordP "row"

rowsP :: Parser Range
rowsP = keywordP "rows"

schemaP :: Parser Range
schemaP = keywordP "schema"

segmentedP :: Parser Range
segmentedP = keywordP "segmented"

selectP :: Parser Range
selectP = keywordP "select"

semicolonP :: Parser Range
semicolonP = symbolP ";"

-- | Accepts any single token that is not the @;@ symbol.
notSemicolonP :: Parser Range
notSemicolonP = tokNotEqualsP (TokSymbol ";")

semiP :: Parser Range
semiP = keywordP "semi"

-- | Returns the keyword text alongside its range.
sessionUserP :: Parser (Text, Range)
sessionUserP = (,) name <$> keywordP name
  where
    name = "session_user"

sequenceFileP :: Parser Range
sequenceFileP = keywordP "sequencefile"

serdeP :: Parser Range
serdeP = keywordP "serde"

serdePropertiesP :: Parser Range
serdePropertiesP = keywordP "serdeproperties"

setP :: Parser Range
setP = keywordP "set"

setsP :: Parser Range
setsP = keywordP "sets"

showP :: Parser Range
showP = keywordP "show"

sortP :: Parser Range
sortP = keywordP "sort"

sortedP :: Parser Range
sortedP = keywordP "sorted"

statisticsP :: Parser Range
statisticsP = keywordP "statistics"

storedP :: Parser Range
storedP = keywordP "stored"

structP :: Parser Range
structP = keywordP "struct"

-- | Returns the keyword text alongside its range.
sysDateP :: Parser (Text, Range)
sysDateP = (,) name <$> keywordP name
  where
    name = "sysdate"

tableP :: Parser Range
tableP = keywordP "table"

tableSampleP :: Parser Range
tableSampleP = keywordP "tablesample"

tblPropertiesP :: Parser Range
tblPropertiesP = keywordP "tblproperties"

-- | Accepts either spelling: @temporary@ is tried first, then @temp@.
temporaryP :: Parser Range
temporaryP = keywordP "temporary" P.<|> keywordP "temp"

terminatedP :: Parser Range
terminatedP = keywordP "terminated"

textFileP :: Parser Range
textFileP = keywordP "textfile"

thenP :: Parser Range
thenP = keywordP "then"

timeseriesP :: Parser Range
timeseriesP = keywordP "timeseries"

timestampP :: Parser Range
timestampP = keywordP "timestamp"

toP :: Parser Range
toP = keywordP "to"

trueP :: Parser Range
trueP = keywordP "true"

truncateP :: Parser Range
truncateP = keywordP "truncate"

unboundedP :: Parser Range
unboundedP = keywordP "unbounded"

unionP :: Parser Range
unionP = keywordP "union"

uniontypeP :: Parser Range
uniontypeP = keywordP "uniontype"

unknownP :: Parser Range
unknownP = keywordP "unknown"

unsegmentedP :: Parser Range
unsegmentedP = keywordP "unsegmented"

useP :: Parser Range
useP = keywordP "use"

-- | Returns the keyword text alongside its range.
userP :: Parser (Text, Range)
userP = (,) name <$> keywordP name
  where
    name = "user"

valuesP :: Parser Range
valuesP = keywordP "values"

viewP :: Parser Range
viewP = keywordP "view"

whenP :: Parser Range
whenP = keywordP "when"

whereP :: Parser Range
whereP = keywordP "where"

windowP :: Parser Range
windowP = keywordP "window"

withP :: Parser Range
withP = keywordP "with"

-- | Accept one of the ordering comparison symbols and return its text and
-- range.
inequalityOpP :: Parser (Text, Range)
inequalityOpP = P.tokenPrim showTok posFromTok matchOp
  where
    operators :: [Text]
    operators = ["<", ">", "<=", ">="]

    matchOp (tok, start, end) = case tok of
        TokSymbol op | op `elem` operators -> Just (op, Range start end)
        _ -> Nothing

-- | Accept one of the equality / inequality comparison symbols and return
-- its text and range.
equalityOpP :: Parser (Text, Range)
equalityOpP = P.tokenPrim showTok posFromTok matchOp
  where
    operators :: [Text]
    operators = ["=", "==", "<=>", "<>", "!="]

    matchOp (tok, start, end) = case tok of
        TokSymbol op | op `elem` operators -> Just (op, Range start end)
        _ -> Nothing