{-# OPTIONS_GHC -fno-warn-name-shadowing #-} {- This is a modified version of the Parsec module whose copyright is below, which supports figuring out where a token has ended *before* ensuing whitespace. In particular, it defines a type class for functionally updating the state with a SourcePos, and then lexeme always stashes the position there before discarding whitespace. -} ----------------------------------------------------------------------------- -- | -- Module : Text.ParserCombinators.Parsec.Token -- Copyright : (c) Daan Leijen 1999-2001 -- License : BSD-style (see the file libraries/parsec/LICENSE) -- -- Maintainer : daan@cs.uu.nl -- Stability : provisional -- Portability : non-portable (uses existentially quantified data constructors) -- -- A helper module to parse lexical elements (tokens). -- ----------------------------------------------------------------------------- module Alt.Token ( TokenEnd (..) , LanguageDef (..) , TokenParser (..) , makeTokenParser ) where import Data.Char (isAlpha,toLower,toUpper,isSpace,digitToInt) import Data.List (nub,sort) import Text.ParserCombinators.Parsec class TokenEnd st where saveTokenEnd :: CharParser st () instance TokenEnd () where saveTokenEnd = return () ----------------------------------------------------------- -- Language Definition ----------------------------------------------------------- data LanguageDef st = LanguageDef { commentStart :: String , commentEnd :: String , commentLine :: String , nestedComments :: Bool , identStart :: CharParser st Char , identLetter :: CharParser st Char , opStart :: CharParser st Char , opLetter :: CharParser st Char , reservedNames :: [String] , reservedOpNames:: [String] , caseSensitive :: Bool } ----------------------------------------------------------- -- A first class module: TokenParser ----------------------------------------------------------- data TokenParser st = TokenParser{ identifier :: CharParser st String , reserved :: String -> CharParser st () , operator :: CharParser st String , reservedOp :: String -> CharParser st () , charLiteral :: CharParser st Char , stringLiteral :: CharParser st String , natural :: CharParser st Integer , integer :: CharParser st Integer , float :: CharParser st Double , naturalOrFloat :: CharParser st (Either Integer Double) , decimal :: CharParser st Integer , hexadecimal :: CharParser st Integer , octal :: CharParser st Integer , symbol :: String -> CharParser st String , lexeme :: forall a. CharParser st a -> CharParser st a , whiteSpace :: CharParser st () , parens :: forall a. CharParser st a -> CharParser st a , braces :: forall a. CharParser st a -> CharParser st a , angles :: forall a. CharParser st a -> CharParser st a , brackets :: forall a. CharParser st a -> CharParser st a -- "squares" is deprecated , squares :: forall a. CharParser st a -> CharParser st a , semi :: CharParser st String , comma :: CharParser st String , colon :: CharParser st String , dot :: CharParser st String , semiSep :: forall a . CharParser st a -> CharParser st [a] , semiSep1 :: forall a . CharParser st a -> CharParser st [a] , commaSep :: forall a . CharParser st a -> CharParser st [a] , commaSep1 :: forall a . CharParser st a -> CharParser st [a] } ----------------------------------------------------------- -- Given a LanguageDef, create a token parser. ----------------------------------------------------------- makeTokenParser :: TokenEnd st => LanguageDef st -> TokenParser st makeTokenParser languageDef = TokenParser{ identifier = identifier , reserved = reserved , operator = operator , reservedOp = reservedOp , charLiteral = charLiteral , stringLiteral = stringLiteral , natural = natural , integer = integer , float = float , naturalOrFloat = naturalOrFloat , decimal = decimal , hexadecimal = hexadecimal , octal = octal , symbol = symbol , lexeme = lexeme , whiteSpace = whiteSpace , parens = parens , braces = braces , angles = angles , brackets = brackets , squares = brackets , semi = semi , comma = comma , colon = colon , dot = dot , semiSep = semiSep , semiSep1 = semiSep1 , commaSep = commaSep , commaSep1 = commaSep1 } where ----------------------------------------------------------- -- Bracketing ----------------------------------------------------------- parens p = between (symbol "(") (symbol ")") p braces p = between (symbol "{") (symbol "}") p angles p = between (symbol "<") (symbol ">") p brackets p = between (symbol "[") (symbol "]") p semi = symbol ";" comma = symbol "," dot = symbol "." colon = symbol ":" commaSep p = sepBy p comma semiSep p = sepBy p semi commaSep1 p = sepBy1 p comma semiSep1 p = sepBy1 p semi ----------------------------------------------------------- -- Chars & Strings ----------------------------------------------------------- -- charLiteral :: CharParser st Char charLiteral = lexeme (between (char '\'') (char '\'' "end of character") characterChar ) "character" characterChar = charLetter <|> charEscape "literal character" charEscape = do{ char '\\'; escapeCode } charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026')) -- stringLiteral :: CharParser st String stringLiteral = lexeme ( do{ str <- between (char '"') (char '"' "end of string") (many stringChar) ; return (foldr (maybe id (:)) "" str) } "literal string") -- stringChar :: CharParser st (Maybe Char) stringChar = do{ c <- stringLetter; return (Just c) } <|> stringEscape "string character" stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026')) stringEscape = do{ char '\\' ; do{ escapeGap ; return Nothing } <|> do{ escapeEmpty; return Nothing } <|> do{ esc <- escapeCode; return (Just esc) } } escapeEmpty = char '&' escapeGap = do{ many1 space ; char '\\' "end of string gap" } -- escape codes escapeCode = charEsc <|> charNum <|> charAscii <|> charControl "escape code" -- charControl :: CharParser st Char charControl = do{ char '^' ; code <- upper ; return (toEnum (fromEnum code - fromEnum 'A')) } -- charNum :: CharParser st Char charNum = do{ code <- decimal <|> do{ char 'o'; number 8 octDigit } <|> do{ char 'x'; number 16 hexDigit } ; return (toEnum (fromInteger code)) } charEsc = choice (map parseEsc escMap) where parseEsc (c,code) = do{ char c; return code } charAscii = choice (map parseAscii asciiMap) where parseAscii (asc,code) = try (do{ string asc; return code }) -- escape code tables escMap = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'") asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2) ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM", "FS","GS","RS","US","SP"] ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL", "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB", "CAN","SUB","ESC","DEL"] ascii2 = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI', '\EM','\FS','\GS','\RS','\US','\SP'] ascii3 = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK', '\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK', '\SYN','\ETB','\CAN','\SUB','\ESC','\DEL'] ----------------------------------------------------------- -- Numbers ----------------------------------------------------------- -- naturalOrFloat :: CharParser st (Either Integer Double) naturalOrFloat = lexeme (natFloat) "number" float = lexeme floating "float" integer = lexeme int "integer" natural = lexeme nat "natural" -- floats floating = do{ n <- decimal ; fractExponent n } natFloat = do{ char '0' ; zeroNumFloat } <|> decimalFloat zeroNumFloat = do{ n <- hexadecimal <|> octal ; return (Left n) } <|> decimalFloat <|> fractFloat 0 <|> return (Left 0) decimalFloat = do{ n <- decimal ; option (Left n) (fractFloat n) } fractFloat n = do{ f <- fractExponent n ; return (Right f) } fractExponent n = do{ fract <- fraction ; expo <- option 1.0 exponent' ; return ((fromInteger n + fract)*expo) } <|> do{ expo <- exponent' ; return ((fromInteger n)*expo) } fraction = do{ char '.' ; digits <- many1 digit "fraction" ; return (foldr op 0.0 digits) } "fraction" where op d f = (f + fromIntegral (digitToInt d))/10.0 exponent' = do{ oneOf "eE" ; f <- sign ; e <- decimal "exponent" ; return (power (f e)) } "exponent" where power e | e < 0 = 1.0/power(-e) | otherwise = fromInteger (10^e) -- integers and naturals int = do{ f <- lexeme sign ; n <- nat ; return (f n) } -- sign :: CharParser st (Integer -> Integer) sign = (char '-' >> return negate) <|> (char '+' >> return id) <|> return id nat = zeroNumber <|> decimal zeroNumber = do{ char '0' ; hexadecimal <|> octal <|> decimal <|> return 0 } "" decimal = number 10 digit hexadecimal = do{ oneOf "xX"; number 16 hexDigit } octal = do{ oneOf "oO"; number 8 octDigit } -- number :: Integer -> CharParser st Char -> CharParser st Integer number base baseDigit = do{ digits <- many1 baseDigit ; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits ; seq n (return n) } ----------------------------------------------------------- -- Operators & reserved ops ----------------------------------------------------------- reservedOp name = lexeme $ try $ do{ string name ; notFollowedBy (opLetter languageDef) ("end of " ++ show name) } operator = lexeme $ try $ do{ name <- oper ; if (isReservedOp name) then unexpected ("reserved operator " ++ show name) else return name } oper = do{ c <- (opStart languageDef) ; cs <- many (opLetter languageDef) ; return (c:cs) } "operator" isReservedOp name = isReserved (sort (reservedOpNames languageDef)) name ----------------------------------------------------------- -- Identifiers & Reserved words ----------------------------------------------------------- reserved name = lexeme $ try $ do{ caseString name ; notFollowedBy (identLetter languageDef) ("end of " ++ show name) } caseString name | caseSensitive languageDef = string name | otherwise = do{ walk name; return name } where walk [] = return () walk (c:cs) = do{ caseChar c msg; walk cs } caseChar c | isAlpha c = char (toLower c) <|> char (toUpper c) | otherwise = char c msg = show name identifier = lexeme $ try $ do{ name <- ident ; if (isReservedName name) then unexpected ("reserved word " ++ show name) else return name } ident = do{ c <- identStart languageDef ; cs <- many (identLetter languageDef) ; return (c:cs) } "identifier" isReservedName name = isReserved theReservedNames caseName where caseName | caseSensitive languageDef = name | otherwise = map toLower name isReserved names name = scan names where scan [] = False scan (r:rs) = case (compare r name) of LT -> scan rs EQ -> True GT -> False theReservedNames | caseSensitive languageDef = sortedNames | otherwise = map (map toLower) sortedNames where sortedNames = sort (reservedNames languageDef) ----------------------------------------------------------- -- White space & symbols ----------------------------------------------------------- symbol name = lexeme (string name) lexeme p = do x <- p saveTokenEnd whiteSpace return x --whiteSpace whiteSpace | noLine && noMulti = skipMany (simpleSpace "") | noLine = skipMany (simpleSpace <|> multiLineComment "") | noMulti = skipMany (simpleSpace <|> oneLineComment "") | otherwise = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment "") where noLine = null (commentLine languageDef) noMulti = null (commentStart languageDef) simpleSpace = skipMany1 (satisfy isSpace) oneLineComment = do{ try (string (commentLine languageDef)) ; skipMany (satisfy (/= '\n')) ; return () } multiLineComment = do { try (string (commentStart languageDef)) ; inComment } inComment | nestedComments languageDef = inCommentMulti | otherwise = inCommentSingle inCommentMulti = do{ try (string (commentEnd languageDef)) ; return () } <|> do{ multiLineComment ; inCommentMulti } <|> do{ skipMany1 (noneOf startEnd) ; inCommentMulti } <|> do{ oneOf startEnd ; inCommentMulti } "end of comment" where startEnd = nub (commentEnd languageDef ++ commentStart languageDef) inCommentSingle = do{ try (string (commentEnd languageDef)); return () } <|> do{ skipMany1 (noneOf startEnd) ; inCommentSingle } <|> do{ oneOf startEnd ; inCommentSingle } "end of comment" where startEnd = nub (commentEnd languageDef ++ commentStart languageDef)