{-# LANGUAGE FlexibleContexts #-} {-| Utility library that provides parsers for commonly-occuring programming constructs such as identifiers, numbers and characters. -} module Language.Parse ( -- * Combinators -- ** Composable string , lookAhead , manyTill , manyThru , (<|>) , choice -- ** Extra , many2 , between2 , isEof , spaces1 , charICase , stringICase -- * Identifiers , blacklistChar -- * Numbers -- ** Prepackaged Parsers , anyNumber -- ** Number Parts , signLiteral , baseLiteral , naturalLiteral , mantissaLiteral , exponentLiteral , denominatorLiteral , xDigit -- ** Convert Strings to Numbers , stringToInteger , stringToMantissa -- * Characters , literalChar , maybeLiteralChar ) where import Control.Monad import Control.Applicative ((<$>), (<*>), (*>), (<*)) import Data.Maybe import Data.Ratio import Data.Char import Text.Parsec ( ParsecT , satisfy, char, oneOf, eof , try, (), parserZero) import qualified Text.Parsec as P --FIXME put this in Parsec.Combinators.Composable ------ Composable Combinators ------ {-| Parse a string, but don't consume input on failure. -} string :: (Monad m, P.Stream s m Char) => String -> ParsecT s u m String string = try . P.string {-| Lookahead without consuming any input. -} lookAhead :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m a lookAhead = try . P.lookAhead {-| Use @manyTill p e@ to apply parser @p@ many times, stopping as soon as @e@ is next to parse. Note that @e@ is not consumed. -} manyTill :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m b -> ParsecT s u m [a] manyTill p e = P.manyTill p (lookAhead e) {-| Use @manyThru p e@ to apply parser @p@ many times, stopping as soon as @e@ is consumed. Unlike Parsec's @manyTill@, if @e@ fails, it does not consume input. -} manyThru :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m b -> ParsecT s u m [a] manyThru p e = P.manyTill p (try e) {-| Use @a <|> b@ to parse @a@ or @b@. If @a@ fails, no input is consumed. -} (<|>) :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m a -> ParsecT s u m a a <|> b = try a P.<|> b {-| Parse the first of the passed combinators that succeeds. If any parser fails, it does not consume input. -} choice :: (Monad m, P.Stream s m t) => [ParsecT s u m a] -> ParsecT s u m a choice = P.choice . map try --TODO sepBy &co ------ Useful Combinators ------ {-| Use @many2 a b@ to parse an @a@ followed by zero or more @b@s. -} many2 :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m a -> ParsecT s u m [a] many2 p ps = do car <- p cdr <- P.many ps return (car:cdr) {-| Use @between2 a p@ to parse an @a@, then a @p@, then an @a@. Return the results of the @p@ parser. -} between2 :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m b -> ParsecT s u m b between2 e p = P.between e e p {-| Detect end of file as a boolean. -} isEof :: (Show t, Monad m, P.Stream s m t) => ParsecT s u m Bool isEof = (eof >> return True) P.<|> return False {-| One or more spaces. -} spaces1 :: (Monad m, P.Stream s m Char) => ParsecT s u m () spaces1 = void $ P.many1 P.space {-| Parse one character, case-insensitive. -} charICase :: (Monad m, P.Stream s m Char) => Char -> ParsecT s u m Char charICase c = satisfy $ (== toLower c) . toLower {-| Parse a string, case-insensitive. If this parser fails, it consumes no input. -} stringICase :: (Monad m, P.Stream s m Char) => String -> ParsecT s u m String stringICase str = try $ mapM charICase str ------ Parsing Identifiers ------ {-| Parses a wide variety of characters, excepting those which meet the passed predicate. Specifically, we accept all of Unicode except: * Space * LineSeparator * ParagraphSeparator * Control * Format * Surrogate * PrivateUse * NotAssigned -} blacklistChar :: (Monad m, P.Stream s m Char) => (Char -> Bool) -> ParsecT s u m Char blacklistChar p = satisfy $ \c -> not (p c) && case generalCategory c of Space -> False LineSeparator -> False ParagraphSeparator -> False Control -> False Format -> False Surrogate -> False PrivateUse -> False NotAssigned -> False _ -> True --Letter, Mark, Number, Punctuation/Quote, Symbol --TODO maybe normal c-like identifiers, maybe identifiers that could be word-based vs. symbol-based ------ Parsing Numbers ------ --TODO common combinations of the number part parsers {-| Optional sign, then an integer number in scientific notation or ratio, in base 2, 8, 10 or 16. If in scientific notation, the exponent may be in base 10 or 16 -} anyNumber :: (Monad m, P.Stream s m Char) => ParsecT s u m Rational anyNumber = ( "number") $ try $ do sign <- P.option 1 signLiteral base <- baseLiteral whole <- naturalLiteral base n <- choice [ scientificNotation whole base , fractionNotation whole base , return (whole % 1) ] return $ fromIntegral sign * n where scientificNotation whole base = do mantissa <- mantissaLiteral base (expbase, exponent) <- P.option (1,0) (decimalExp <|> hexExp) return $ ((whole % 1) + mantissa) * (fromIntegral expbase ^^ exponent) fractionNotation whole base = (whole %) . denominator <$> denominatorLiteral base decimalExp = (,) 10 <$> exponentLiteral 10 hexExp = (,) 16 <$> exponentLiteral 16 {-| Parse a minus or plus sign and return the appropriate multiplier. -} signLiteral :: (Monad m, P.Stream s m Char) => ParsecT s u m Integer signLiteral = ( "sign") $ (char '-' >> return (-1)) P.<|> (char '+' >> return 1) {-| Parse \"0x\", \"0o\", or \"0b\" case-insensitive and return the appropriate base. If none of these parse, return base 10. -} baseLiteral :: (Monad m, P.Stream s m Char) => ParsecT s u m Int baseLiteral = choice [ (stringICase "0x") >> return 16 , (stringICase "0o") >> return 8 , (stringICase "0b") >> return 2 , return 10 ] {-| Parse many digits in the passed base and return the corresponding integer. -} naturalLiteral :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Integer naturalLiteral base = ( "natural number") $ stringToInteger base <$> P.many1 (xDigit base) {-| Parse a dot followed by many digits in the passed base and return the corresponding ratio. -} mantissaLiteral :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Rational mantissaLiteral base = ( "mantissa") $ do char '.' stringToMantissa base <$> P.many1 (xDigit base) {-| In base 10, parse an 'e' and a decimal integer. In base 16, parse an 'h' and a hexadecimal integer. -} exponentLiteral :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Integer exponentLiteral base = ( "exponent") (go base) where body = (*) <$> P.option 1 signLiteral <*> naturalLiteral base go 10 = charICase 'e' >> body go 16 = charICase 'h' >> body go _ = error "unrecognized base in Language.Parser.exponentLiteral (accepts only 10 or 16)" {-| Parse a '/' and a natural in the passed base. Return the reciprocal of that number. -} denominatorLiteral :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Rational denominatorLiteral base = ( "denominator") $ do denom <- char '/' >> naturalLiteral base if denom == 0 then parserZero else return (1%denom) {-| Parse a digit in the passed base: 2, 8, 10 or 16. -} xDigit :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Char xDigit base = case base of 2 -> oneOf "01" 8 -> P.octDigit 10 -> P.digit 16 -> P.hexDigit _ -> error "unrecognized base in Language.Parser.xDigit (accepts only 2, 8, 10, or 16)" {-| Interpret a string as an integer in the passed base. -} stringToInteger :: Int -> String -> Integer stringToInteger base = foldl impl 0 where impl acc x = acc * fromIntegral base + (fromIntegral . digitToInt) x {-| Interpret a string as a mantissa in the passed base. -} stringToMantissa :: Int -> String -> Ratio Integer stringToMantissa base = (/ (fromIntegral base%1)) . foldr impl (0 % 1) where impl x acc = acc / (fromIntegral base%1) + (((%1) . fromIntegral . digitToInt) x) ------ Parsing Character Literals ------ {-| Parse a single character as if in a string literal. This should be applicable to both character and string literals. Here's the list of what characters are accepted: * Any single unicode character that is not an ASCII control character, backslash, or double-quote. * Line continuation: backslash, then advance over whitespace (including newlines and comments) through the next backslash. * Octal or hexadecimal ASCII escapes: a sequence in @\/\\\\(x[0-9a-fA-F]{2}|o[0-7]{3})\/@. * Unicode escapes: a sequence in @\/\\\\(u|U0[0-9a-fA-F]|U10)[0-9a-fA-F]{4}\/@. * Special escape: a sequence in @\/\\\\[0abefnrtv\'\"]\/@. For reference, the meanings of special escapes are: @ \\0: nul (ASCII 0, 0x00) \\a: bell (ASCII 7, 0x07) \\b: backspace (ASCII 8, 0x08) \\e: escape (ASCII 27, 0x1B) \\f: form feed (ASCII 12, 0x0C) \\n: line feed (ASCII 10, 0x0A) \\r: carriage return (ASCII 13, 0x0D) \\t: horizontal tab (ASCII 9, 0x09) \\v: vertical tab (ASCII 11, 0x0B) \\\': single quote (ASCII 39, 0x27) \\\": double quote (ASCII 34, 0x22) @ -} literalChar :: (Monad m, P.Stream s m Char) => ParsecT s u m Char literalChar = (satisfy isNormalChar "printing character") P.<|> (escape "escape sequence") where isNormalChar c = c >= ' ' && c `notElem` "\DEL\'\"\\" --FIXME limit this slightly more escape = char '\\' >> P.choice [specialEscape, numericalEscape] specialEscape = fromJust . flip lookup table <$> oneOf (map fst table) where table = [ ('0' , '\0') , ('a' , '\a') , ('b' , '\b') , ('e' , '\27') , ('f' , '\f') , ('n' , '\n') , ('r' , '\r') , ('t' , '\t') , ('\'', '\'') , ('\"', '\"') , ('\\', '\\') ] numericalEscape = chr . fromInteger <$> P.choice [ascii16, uni4, ascii8, uni6] ascii8 = stringToInteger 8 <$> (oneOf "oO" >> P.count 3 P.octDigit) ascii16 = stringToInteger 16 <$> (oneOf "xX" >> P.count 2 P.hexDigit) uni4 = stringToInteger 16 <$> (char 'u' >> P.count 4 P.hexDigit) uni6 = char 'U' >> (high P.<|> low) where low = stringToInteger 16 <$> (char '0' >> P.count 5 P.hexDigit) high = (+ 0x100000) . stringToInteger 16 <$> (string "10" >> P.count 4 P.hexDigit) {-| Parse any character accepted by 'literalChar', but also accept two empty characters: * @\\&@ The eplicit empty character. * Backslash-whitespace-backslash. -} maybeLiteralChar :: (Monad m, P.Stream s m Char) => ParsecT s u m (Maybe Char) maybeLiteralChar = (Just <$> literalChar) P.<|> (const Nothing <$> (string "\\&" P.<|> lineContinue)) where lineContinue = between2 (char '\\') (P.many $ oneOf " \t\n\r") --FIXME more types of whitespace could be allowed