module Language.Pig.Parser.Parser ( parseString , parseFile , module Language.Pig.Parser.AST ) where import System.IO import Text.ParserCombinators.Parsec import Text.ParserCombinators.Parsec.Expr import Text.ParserCombinators.Parsec.Language import qualified Text.ParserCombinators.Parsec.Token as Token import Data.List (intercalate) import Control.Applicative ((<$>), (<*>), (*>), (<*)) import Language.Pig.Parser.AST -- Lexer specialChar = oneOf "_" pigLanguageDef :: LanguageDef st pigLanguageDef = emptyDef { Token.commentStart = "/*" , Token.commentEnd = "*/" , Token.commentLine = "--" , Token.nestedComments = True , Token.identStart = letter , Token.identLetter = alphaNum <|> specialChar , Token.reservedNames = ["LOAD", "USING", "AS", -- todo case insensitivity of these keywords "FOREACH", "GENERATE", "FLATTEN", "JOIN", "BY", "GROUP", "DESCRIBE", "SHIP", "DEFINE", "STREAM", "THROUGH", "STORE", "INTO", "USING", "int", "long", "float", "double", "chararray", "bytearray", "*"] , Token.reservedOpNames = ["=", "+", "-", "*", "/", "%", "?", ":"] , Token.caseSensitive = False } lexer = Token.makeTokenParser pigLanguageDef identifier = Token.identifier lexer reserved = Token.reserved lexer reservedOp = Token.reservedOp lexer integer = Token.integer lexer naturalOrFloat = Token.naturalOrFloat lexer semi = Token.semi lexer comma = Token.comma lexer whiteSpace = Token.whiteSpace lexer parens = Token.parens lexer lexeme = Token.lexeme lexer -- parser to handle double colon. pigIdentifier = try(detailedIdentifier) <|> identifier detailedIdentifier :: Parser String detailedIdentifier = lexeme $ (intercalate "::") <$> sepBy1 identifierPart (string "::") identifierPart = (:) <$> letter <*> many1 (alphaNum <|> specialChar) -- Parser: top-down parseString :: [Char] -> Root parseString input = case parsePig input of Left msg -> error (show msg) Right p -> p parseFile :: FilePath -> IO PigFile parseFile filename = ((PigFile filename) . parseString) <$> readFile (filename) parsePig :: String -> Either ParseError Root parsePig input = parse pigParser "pigParser error" input pigParser :: Parser Root pigParser = whiteSpace >> statements statements :: Parser Root statements = Seq <$> endBy statement semi statement :: Parser Statement statement = query <|> describe <|> define <|> store query :: Parser Statement query = Assignment <$> (Identifier <$> identifier) <*> (reservedOp "=" *> opClause) describe :: Parser Statement describe = Describe <$> (reserved "DESCRIBE" *> pigVar) define :: Parser Statement define = DefineUDF <$> (reserved "define" *> pigVar) <*> executable <*> shipClause -- could be input, output, ship, cache, stderr in full pig grammar store :: Parser Statement store = Store <$> (reserved "STORE" *> pigVar) <*> (reserved "INTO" *> pigQuotedString Directory) <*> (reserved "USING" *> pigFunc) opClause :: Parser OpClause opClause = loadClause <|> foreachClause <|> innerJoinClause <|> groupClause <|> streamClause loadClause :: Parser OpClause loadClause = LoadClause <$> (reserved "LOAD" *> pigQuotedString Filename) <*> (reserved "USING" *> pigFunc) <*> (reserved "AS" *> pigTupleDef) -- foreach: only the block (outer bag) version foreachClause :: Parser OpClause foreachClause = ForeachClause <$> (reserved "FOREACH" *> pigVar) <*> (reserved "GENERATE" *> (GenBlock <$> sepBy transform comma)) innerJoinClause :: Parser OpClause innerJoinClause = InnerJoinClause <$> (reserved "JOIN" *> sepBy joinTable comma) groupClause :: Parser OpClause groupClause = GroupClause <$> (reserved "GROUP" *> pigVar) <*> (reserved "BY" *> (MultipleColumn <$> tuple <|> SingleColumn <$> name)) streamClause :: Parser OpClause streamClause = StreamClause <$> (reserved "STREAM" *> pigVar) <*> (reserved "THROUGH" *> pigVar) <*> (reserved "AS" *> pigTupleDef) joinTable :: Parser Join joinTable = Join <$> pigIdentifier <*> (reserved "BY" *> pigIdentifier) shipClause :: Parser DefineSpec shipClause = Ship . Filename <$> (reserved "SHIP" *> parens quotedString) pigVar :: Parser Alias pigVar = Identifier <$> pigIdentifier pigQuotedString :: (String -> a) -> Parser a pigQuotedString constructor = constructor <$> quotedString pigFunc :: Parser Function pigFunc = Function <$> identifier <*> parens arguments arguments :: Parser [Argument] arguments = sepBy argument comma argument :: Parser Argument argument = (StringArgument . String <$> quotedString) <|> (AliasArgument <$> pigVar) quotedString :: Parser String quotedString = (char '\'' *> (many $ noneOf "\'")) <* char '\'' <* whiteSpace -- doesn't take into account escaped quotes executable :: Parser Command executable = Exec <$> (char '`' *> (many $ noneOf "`") <* char '`' <* whiteSpace) pigTupleDef :: Parser TupleDef pigTupleDef = TupleDef <$> parens tupleDef tupleDef :: Parser [Field] tupleDef = sepBy field comma field :: Parser Field field = Field <$> pigVar <* char ':' <*> pigType pigType :: Parser SimpleType pigType = pigSimpleType "int" Int <|> pigSimpleType "long" Long <|> pigSimpleType "float" Float <|> pigSimpleType "double" Double <|> pigSimpleType "chararray" CharArray <|> pigSimpleType "bytearray" ByteArray pigSimpleType :: String -> SimpleType -> Parser SimpleType pigSimpleType typeString constructor = reserved typeString >> return constructor transform :: Parser Transform transform = try(aliasTransform) <|> flattenTransform <|> tupleFieldGlob <|> expressionTransform <|> functionTransform <|> envTransform flattenTransform :: Parser Transform flattenTransform = Flatten <$> (reserved "FLATTEN" *> parens pigIdentifier) <*> (reserved "AS" *> tuple) expressionTransform :: Parser Transform expressionTransform = ExpressionTransform <$> generalExpression <*> (reserved "AS" *> (Identifier <$> identifier)) functionTransform :: Parser Transform functionTransform = FunctionTransform <$> pigFunc <*> (reserved "AS" *> (Identifier <$> identifier)) aliasTransform :: Parser Transform aliasTransform = AliasTransform <$> (Identifier <$> pigIdentifier) <*> (reserved "AS" *> (Identifier <$> identifier)) envTransform :: Parser Transform envTransform = EnvTransform <$> pigQuotedString String <*> (reserved "AS" *> (Identifier <$> identifier)) -- general expression: -- fieldExpression or literal or function or binary operation (+-*/%) or bincond (?:) -- bincond: boolean expression (==, !=, >, <, >=, <=) (and, or, not) generalExpression :: Parser Expression generalExpression = parens calculation -- conditional is ternary operator, so lookahead to try and parse it first. calculation :: Parser Expression calculation = try(conditional) <|> buildExpressionParser pigOperators pigTerm pigOperators = [[Prefix (reservedOp "-" >> return (Unary Neg))] ,[Infix (reservedOp "*" >> return (Binary Multiply)) AssocLeft] ,[Infix (reservedOp "/" >> return (Binary Divide)) AssocLeft] ,[Infix (reservedOp "%" >> return (Binary Modulo)) AssocLeft] ,[Infix (reservedOp "+" >> return (Binary Add)) AssocLeft] ,[Infix (reservedOp "-" >> return (Binary Subtract)) AssocLeft]] pigTerm :: Parser Expression pigTerm = (ScalarTerm . String <$> quotedString) <|> (ScalarTerm <$> number) <|> generalExpression <|> (AliasTerm <$> name) number = Number <$> naturalOrFloat -- for now - could be naturalOrFloat for inclusion conditional :: Parser Expression conditional = BinCond <$> booleanExpression <*> (reserved "?" *> calculation) <*> (reserved ":" *> calculation) booleanExpression = buildExpressionParser booleanOperators booleanTerm booleanTerm = parens booleanExpression <|> comparisonExpression booleanOperators = [ [Prefix (reservedOp "not" >> return (BooleanUnary Not))] , [Infix (reservedOp "and" >> return (BooleanBinary And)) AssocLeft] , [Infix (reservedOp "or" >> return (BooleanBinary Or)) AssocLeft]] comparisonExpression :: Parser BooleanExpression comparisonExpression = flippedBooleanExpression <$> pigTerm <*> relation <*> pigTerm where flippedBooleanExpression expr1 op expr2 = BooleanExpression op expr1 expr2 -- bincond: boolean expression (==, !=, >, <, >=, <=) (and, or, not) relation = (reservedOp ">" >> return Greater) <|> (reservedOp "<" >> return Less) <|> (reservedOp "<=" >> return LessEqual) <|> (reservedOp ">=" >> return GreaterEqual) <|> (reservedOp "==" >> return Equal) <|> (reservedOp "!=" >> return NotEqual) tupleFieldGlob :: Parser Transform tupleFieldGlob = reserved "*" >> return TupleFieldGlob tuple :: Parser Tuple tuple = Tuple <$> parens (sepBy name comma) name :: Parser Alias name = Identifier <$> pigIdentifier