{-# OPTIONS -fno-warn-name-shadowing #-} module Atomo.Parser.Base where import Control.Monad (liftM) import Data.Char import Data.List (nub, sort) import Text.Parsec import qualified Text.Parsec.Token as P import Atomo.Types (Expr(..), ParserState, VM) type Parser = ParsecT String ParserState VM isOpLetter :: Char -> Bool isOpLetter c = c `elem` "!@#%&*-./\\?" || isSymbol c isOperator :: String -> Bool isOperator "" = False isOperator cs = head cs `notElem` "@$~" && all isOpLetter cs def :: P.GenLanguageDef String ParserState VM def = P.LanguageDef { P.commentStart = "{-" , P.commentEnd = "-}" , P.commentLine = "--" , P.nestedComments = True , P.identStart = letter <|> P.opStart def <|> oneOf "_" , P.identLetter = alphaNum <|> P.opLetter def , P.opStart = satisfy (\c -> c `notElem` "@$~" && isOpLetter c) , P.opLetter = satisfy isOpLetter , P.reservedOpNames = ["=", ":=", ",", "|"] , P.reservedNames = ["operator", "macro", "for-macro", "this", "True", "False"] , P.caseSensitive = True } tp :: P.GenTokenParser String ParserState VM tp = makeTokenParser def eol :: Parser () eol = newline >> return () lexeme :: Parser a -> Parser a lexeme = P.lexeme tp capIdent :: Parser String capIdent = do c <- satisfy isUpper cs <- many (P.identLetter def) return (c:cs) lowIdent :: Parser String lowIdent = do c <- satisfy isLower cs <- many (P.identLetter def) return (c:cs) capIdentifier :: Parser String capIdentifier = lexeme capIdent lowIdentifier :: Parser String lowIdentifier = lexeme lowIdent anyIdent :: Parser String anyIdent = try $ do c <- P.identStart def cs <- many (P.identLetter def) if isOperator (c:cs) then unexpected "operator" else return (c:cs) anyIdentifier :: Parser String anyIdentifier = lexeme anyIdent parens :: Parser a -> Parser a parens = P.parens tp brackets :: Parser a -> Parser a brackets = P.brackets tp braces :: Parser a -> Parser a braces = P.braces tp comma :: Parser String comma = P.comma tp commaSep :: Parser a -> Parser [a] commaSep = P.commaSep tp commaSep1 :: Parser a -> Parser [a] commaSep1 = P.commaSep1 tp dot :: Parser String dot = P.dot tp identifier :: Parser String identifier = lexeme $ P.identifier tp ident :: Parser String ident = P.identifier tp operator :: Parser String operator = try $ do c <- P.opStart def cs <- many (P.opLetter def) if (c:cs) `elem` P.reservedOpNames def then unexpected ("reserved operator " ++ show (c:cs)) else return (c:cs) reserved :: String -> Parser () reserved = P.reserved tp reservedOp :: String -> Parser () reservedOp = P.reservedOp tp integer :: Parser Integer integer = do f <- sign n <- natural return (f n) where sign = choice [ char '-' >> return negate , char '+' >> return id , return id ] float :: Parser Double float = do f <- sign n <- P.float tp return (f n) where sign = choice [ char '-' >> return negate , char '+' >> return id , return id ] natural :: Parser Integer natural = P.natural tp symbol :: String -> Parser String symbol = P.symbol tp delimit :: String -> Parser String delimit n = whiteSpace >> symbol n stringLiteral :: Parser String stringLiteral = P.stringLiteral tp charLiteral :: Parser Char charLiteral = P.charLiteral tp colon :: Parser () colon = char ':' >> return () wsBlock :: Parser a -> Parser [a] wsBlock = wsDelim ";" wsDelim :: String -> Parser a -> Parser [a] wsDelim d = indentAware (\n o -> sourceColumn n == sourceColumn o) (delimit d >> return True) False wsMany1 :: Parser a -> Parser [a] wsMany1 p = do ps <- indentAware chainContinue (return False) True p if null ps then fail "needed more than one" else return ps wsMany :: Parser a -> Parser [a] wsMany = indentAware chainContinue (return False) True wsManyStart :: Show a => Parser a -> Parser a -> Parser [a] wsManyStart s p = do ps <- indentAwareStart chainContinue (return False) True s p if null ps then fail "needed more than one" else return ps chainContinue :: SourcePos -> SourcePos -> Bool chainContinue n o = sourceLine o == sourceLine n || sourceColumn n > sourceColumn o indentAware :: (SourcePos -> SourcePos -> Bool) -> Parser Bool -> Bool -> Parser a -> Parser [a] indentAware cmp delim allowSeq p = indentAwareStart cmp delim allowSeq p p indentAwareStart :: (SourcePos -> SourcePos -> Bool) -> Parser Bool -> Bool -> Parser a -> Parser a -> Parser [a] indentAwareStart cmp delim allowSeq s p = do start <- getPosition wsmany start [] where wsmany o es = choice [ do x <- if null es then s else try p new <- lookAhead (whiteSpace >> getPosition) sequential <- liftM (== new) $ lookAhead (spacing >> getPosition) delimited <- option False $ try delim if delimited || cmp new o || (allowSeq && sequential) then whiteSpace >> wsmany o (es ++ [x]) else return (es ++ [x]) , return es ] keyword :: Parser a -> Parser (String, a) keyword p = do name <- keywordName target <- p return (name, target) keywordName :: Parser String keywordName = do n <- operator <|> (ident >>= \name -> char ':' >> return name) whiteSpace1 return n keywords :: Show a => ([String] -> [a] -> b) -> a -> Parser a -> Parser b keywords c d p = do r <- choice [try (lookAhead keywordName) >> return d, p] (ns, rs) <- liftM unzip $ wsMany1 (keyword p) return (c ns (r:rs)) tagged :: Parser Expr -> Parser Expr tagged p = do pos <- getPosition r <- p return r { eLocation = Just pos } makeTokenParser :: P.GenLanguageDef String ParserState VM -> P.GenTokenParser String ParserState VM makeTokenParser languageDef = P.TokenParser{ P.identifier = identifier , P.reserved = reserved , P.operator = operator , P.reservedOp = reservedOp , P.charLiteral = charLiteral , P.stringLiteral = stringLiteral , P.natural = natural , P.integer = integer , P.float = float , P.naturalOrFloat = naturalOrFloat , P.decimal = decimal , P.hexadecimal = hexadecimal , P.octal = octal , P.symbol = symbol , P.lexeme = lexeme , P.whiteSpace = whiteSpace , P.parens = parens , P.braces = braces , P.angles = angles , P.brackets = brackets , P.squares = brackets , P.semi = semi , P.comma = comma , P.colon = colon , P.dot = dot , P.semiSep = semiSep , P.semiSep1 = semiSep1 , P.commaSep = commaSep , P.commaSep1 = commaSep1 } where ----------------------------------------------------------- -- Bracketing ----------------------------------------------------------- parens = between (open "(") (close ")") braces = between (open "{") (close "}") angles = between (open "<") (close ">") brackets = between (open "[") (close "]") semi = delimit ";" comma = delimit "," dot = delimit "." colon = delimit ":" commaSep p = sepBy p comma semiSep p = sepBy p semi commaSep1 p = sepBy1 p comma semiSep1 p = sepBy1 p semi ----------------------------------------------------------- -- Chars & Strings ----------------------------------------------------------- charLiteral = lexeme (char '$' >> characterChar) "character" characterChar = charLetter <|> charEscape "literal character" charEscape = do{ char '\\'; escapeCode } charLetter = satisfy (\c -> (c /= '\\') && (c > '\026')) stringLiteral = lexeme ( do{ str <- between (char '"') (char '"' "end of string") (many stringChar) ; return (foldr (maybe id (:)) "" str) } "literal string") stringChar = do{ c <- stringLetter; return (Just c) } <|> stringEscape "string character" stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026')) stringEscape = do{ char '\\' ; do{ escapeGap ; return Nothing } <|> do{ escapeEmpty; return Nothing } <|> do{ esc <- escapeCode; return (Just esc) } } escapeEmpty = char '&' escapeGap = do{ many1 space ; char '\\' "end of string gap" } -- escape codes escapeCode = charEsc <|> charNum <|> charAscii <|> charControl "escape code" charControl = do{ char '^' ; code <- upper ; return (toEnum (fromEnum code - fromEnum 'A')) } charNum = do{ code <- decimal <|> do{ char 'o'; number 8 octDigit } <|> do{ char 'x'; number 16 hexDigit } ; return (toEnum (fromInteger code)) } charEsc = choice (map parseEsc escMap) where parseEsc (c,code) = do{ char c; return code } charAscii = choice (map parseAscii asciiMap) where parseAscii (asc,code) = try (do{ string asc; return code }) -- escape code tables escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'" asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2) ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM", "FS","GS","RS","US","SP"] ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL", "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB", "CAN","SUB","ESC","DEL"] ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US " ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL" ----------------------------------------------------------- -- Numbers ----------------------------------------------------------- naturalOrFloat = lexeme natFloat "number" float = lexeme floating "float" integer = lexeme int "integer" natural = lexeme nat "natural" -- floats floating = do{ n <- decimal ; fractExponent n } natFloat = do{ char '0' ; zeroNumFloat } <|> decimalFloat zeroNumFloat = do{ n <- hexadecimal <|> octal ; return (Left n) } <|> decimalFloat <|> fractFloat 0 <|> return (Left 0) decimalFloat = do{ n <- decimal ; option (Left n) (fractFloat n) } fractFloat n = do{ f <- fractExponent n ; return (Right f) } fractExponent n = do{ fract <- fraction ; expo <- option 1.0 exponent' ; return ((fromInteger n + fract)*expo) } <|> do{ expo <- exponent' ; return (fromInteger n*expo) } fraction = do{ char '.' ; digits <- many1 digit "fraction" ; return (foldr op 0.0 digits) } "fraction" where op d f = (f + fromIntegral (digitToInt d))/10.0 exponent' = do{ oneOf "eE" ; f <- sign ; e <- decimal "exponent" ; return (power (f e)) } "exponent" where power e | e < 0 = 1.0/power(-e) | otherwise = fromInteger (10^e) -- integers and naturals int = do{ f <- sign ; n <- nat ; return (f n) } sign = (char '-' >> return negate) <|> (char '+' >> return id) <|> return id nat = zeroNumber <|> decimal zeroNumber = do{ char '0' ; hexadecimal <|> octal <|> decimal <|> return 0 } "zeroNumber" decimal = number 10 digit hexadecimal = do{ oneOf "xX"; number 16 hexDigit } octal = do{ oneOf "oO"; number 8 octDigit } number base baseDigit = do{ digits <- many1 baseDigit ; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits ; seq n (return n) } ----------------------------------------------------------- -- Operators & reserved ops ----------------------------------------------------------- reservedOp name = lexeme $ try $ do{ string name ; notFollowedBy (P.opLetter languageDef) ("end of " ++ show name) } operator = lexeme $ try $ do{ name <- oper ; if isReservedOp name then unexpected ("reserved operator " ++ show name) else return name } oper = try (do{ c <- (P.opStart languageDef) ; cs <- many (P.opLetter languageDef) ; return (c:cs) }) "operator" isReservedOp = isReserved (sort (P.reservedOpNames languageDef)) ----------------------------------------------------------- -- Identifiers & Reserved words ----------------------------------------------------------- reserved name = lexeme $ try $ do{ caseString name ; notFollowedBy (P.identLetter languageDef) ("end of " ++ show name) } caseString name | P.caseSensitive languageDef = string name | otherwise = do{ walk name; return name } where walk [] = return () walk (c:cs) = do{ caseChar c msg; walk cs } caseChar c | isAlpha c = char (toLower c) <|> char (toUpper c) | otherwise = char c msg = show name identifier = try $ do{ name <- ident ; if isReservedName name then unexpected ("reserved word " ++ show name) else return name } ident = try (do c <- P.identStart def cs <- many (P.identLetter def) if isOperator (c:cs) then unexpected "operator" else return (c:cs)) "identifier" isReservedName name = isReserved theReservedNames caseName where caseName | P.caseSensitive languageDef = name | otherwise = map toLower name isReserved names name = scan names where scan [] = False scan (r:rs) = case compare r name of LT -> scan rs EQ -> True GT -> False theReservedNames | P.caseSensitive languageDef = sortedNames | otherwise = map (map toLower) sortedNames where sortedNames = sort (P.reservedNames languageDef) ----------------------------------------------------------- -- White space & symbols ----------------------------------------------------------- delimit name = try $ do{ whiteSpace; symbol name } open = symbol close name = do{ whiteSpace; s <- string name; spacing; return s } symbol name = do{ s <- string name; whiteSpace; return s } lexeme p = do{ x <- p; spacing; return x } --whiteSpace whiteSpace = do spacing skipMany (try $ spacing >> newline) spacing whiteSpace :: Parser () whiteSpace = P.whiteSpace tp whiteSpace1 :: Parser () whiteSpace1 = (space <|> newline) >> whiteSpace simpleSpace :: Parser () simpleSpace = skipMany1 $ satisfy (`elem` " \t\f\v\xa0") spacing :: Parser () spacing = skipMany spacing1 spacing1 :: Parser () spacing1 | noLine && noMulti = simpleSpace "whitespace" | noLine = simpleSpace <|> multiLineComment "whitespace or multiline comment" | noMulti = simpleSpace <|> oneLineComment "whitespace or line comment" | otherwise = simpleSpace <|> oneLineComment <|> multiLineComment "whitespace or commend" where noLine = null (P.commentLine def) noMulti = null (P.commentStart def) oneLineComment :: Parser () oneLineComment = try (string (P.commentLine def)) >> skipMany (satisfy (/= '\n')) multiLineComment :: Parser () multiLineComment = try (string (P.commentStart def)) >> inComment inComment :: Parser () inComment | P.nestedComments def = inCommentMulti | otherwise = inCommentSingle inCommentMulti :: Parser () inCommentMulti = (try (string (P.commentEnd def)) >> return ()) <|> (multiLineComment >> inCommentMulti) <|> (skipMany1 (noneOf startEnd) >> inCommentMulti) <|> (oneOf startEnd >> inCommentMulti) "end of comment" where startEnd = nub (P.commentEnd def ++ P.commentStart def) inCommentSingle :: Parser () inCommentSingle = (try (string (P.commentEnd def)) >> return ()) <|> (skipMany1 (noneOf startEnd) >> inCommentSingle) <|> (oneOf startEnd >> inCommentSingle) "end of comment" where startEnd = nub (P.commentEnd def ++ P.commentStart def)