Safe Haskell | Safe-Inferred |
---|
Parsec parsers for common tokens. This module is written with bottom-up programming in mind
so that is stays flexible. For example, although we export several numerical token parsers,
we also export the sub-token parsers they were built out of: numSign
, numNatural
,
numExponent
, &c.
- char :: Stream s m Char => Char -> ParsecT s u m Char
- string :: Stream s m Char => String -> ParsecT s u m String
- charI :: Stream s m Char => Char -> ParsecT s u m Char
- stringI :: Stream s m Char => String -> ParsecT s u m String
- upAlpha :: Stream s m Char => ParsecT s u m Char
- loAlpha :: Stream s m Char => ParsecT s u m Char
- alpha :: Stream s m Char => ParsecT s u m Char
- digit :: Stream s m Char => ParsecT s u m Char
- hexDigit :: Stream s m Char => ParsecT s u m Char
- octDigit :: Stream s m Char => ParsecT s u m Char
- binDigit :: Stream s m Char => ParsecT s u m Char
- ctl :: Stream s m Char => ParsecT s u m Char
- asciiText :: Stream s m Char => ParsecT s u m Char
- uniText :: Stream s m Char => ParsecT s u m Char
- cr :: Stream s m Char => ParsecT s u m ()
- lf :: Stream s m Char => ParsecT s u m ()
- sp :: Stream s m Char => ParsecT s u m ()
- ht :: Stream s m Char => ParsecT s u m ()
- sq :: Stream s m Char => ParsecT s u m ()
- dq :: Stream s m Char => ParsecT s u m ()
- colon :: Stream s m Char => ParsecT s u m ()
- semicolon :: Stream s m Char => ParsecT s u m ()
- dot :: Stream s m Char => ParsecT s u m ()
- comma :: Stream s m Char => ParsecT s u m ()
- bsEsc :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char
- yes :: Stream s m Char => ParsecT s u m Bool
- no :: Stream s m Char => ParsecT s u m Bool
- yesno :: Stream s m Char => ParsecT s u m Bool
- lws :: Stream s m Char => ParsecT s u m String
- newline :: Stream s m Char => ParsecT s u m ()
- lineBreak :: Stream s m Char => ParsecT s u m ()
- crlf :: Stream s m Char => ParsecT s u m ()
- bsnl :: Stream s m Char => ParsecT s u m ()
- bsnlwsbs :: Stream s m Char => ParsecT s u m ()
- data IndentPolicy
- dentation :: Stream s m Char => IndentPolicy -> ParsecT s u m Int
- many1Not :: Stream s m Char => (Char -> Bool) -> (Char -> Bool) -> ParsecT s u m String
- sigilized :: Stream s m Char => [(Char, sigil)] -> ParsecT s u m a -> ParsecT s u m (sigil, a)
- inParens :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a
- inBrackets :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a
- inBraces :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a
- inAngles :: Stream s m Char => ParsecT s u m a -> ParsecT s u m a
- numSign :: Stream s m Char => ParsecT s u m Integer
- numBase :: Stream s m Char => ParsecT s u m Int
- numNatural :: Stream s m Char => Int -> ParsecT s u m Integer
- numAfterPoint :: Stream s m Char => Int -> ParsecT s u m Rational
- numDenominator :: Stream s m Char => Int -> ParsecT s u m Rational
- numOptSign :: Stream s m Char => ParsecT s u m Integer
- numInteger :: Stream s m Char => Int -> ParsecT s u m Integer
- xDigit :: Stream s m Char => Int -> ParsecT s u m Char
- stringToInteger :: Integral n => Int -> String -> n
- stringToMantissa :: Int -> String -> Ratio Integer
- integer :: Stream s m Char => ParsecT s u m Integer
- rational :: Stream s m Char => ParsecT s u m Rational
- scientific :: Stream s m Char => ParsecT s u m Rational
- letterEsc :: Stream s m Char => [(Char, Char)] -> ParsecT s u m Char
- decimalEsc :: Stream s m Char => ParsecT s u m Char
- asciiEsc :: Stream s m Char => ParsecT s u m Char
- loUniEsc :: Stream s m Char => ParsecT s u m Char
- hiUniEsc :: Stream s m Char => ParsecT s u m Char
- uniEsc :: Stream s m Char => ParsecT s u m Char
- cEscapes :: [(Char, Char)]
- sqString :: Stream s m Char => ParsecT s u m String
- dqString :: Stream s m Char => [(Char, Char)] -> ParsecT s u m String
- lineComment :: Stream s m Char => String -> ParsecT s u m String
- blockComment :: Stream s m Char => String -> String -> ParsecT s u m String
- nestingComment :: Stream s m Char => String -> String -> ParsecT s u m String
- charClass :: String -> Char -> Bool
- uniPrint :: Char -> Bool
- uniPrintMinus :: (Char -> Bool) -> Char -> Bool
- uniIdClass :: Char -> Bool
- uniIdClassMinus :: (Char -> Bool) -> Char -> Bool
- aChar :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char
- anyChar :: Stream s m Char => ParsecT s u m Char
- oneOf :: Stream s m Char => [Char] -> ParsecT s u m Char
- noneOf :: Stream s m Char => [Char] -> ParsecT s u m Char
Basic Characters and Strings
char :: Stream s m Char => Char -> ParsecT s u m Char
char c
parses a single character c
. Returns the parsed
character (i.e. c
).
semiColon = char ';'
string :: Stream s m Char => String -> ParsecT s u m StringSource
string s
parses a sequence of characters given by s
. Returns
the parsed string (i.e. s
). Unlike the Parsec version, this
combinator never consumes input on failure.
adrenalineWord = string "fight" <|> string "flight"
charI :: Stream s m Char => Char -> ParsecT s u m CharSource
Parse a single character, case-insensitive. Normalized to lowercase.
stringI :: Stream s m Char => String -> ParsecT s u m StringSource
Parse a string, case-insensitive. If this parser fails, it consumes no input. Normalized to lowercase.
Common Classes
hexDigit :: Stream s m Char => ParsecT s u m Char
Parses a hexadecimal digit (a digit or a letter between 'a' and 'f' or 'A' and 'F'). Returns the parsed character.
octDigit :: Stream s m Char => ParsecT s u m Char
Parses an octal digit (a character between '0' and '7'). Returns the parsed character.
asciiText :: Stream s m Char => ParsecT s u m CharSource
A single printable ASCII character, as the TEXT
rule from RFC2616 §2.2
uniText :: Stream s m Char => ParsecT s u m CharSource
A single printable unicode character, a generalization of text
. See uniPrint
.
Common Special Literals
bsEsc :: Stream s m Char => (Char -> Bool) -> ParsecT s u m CharSource
A backslash-escape: backslash followed by a single character satisfying the predicate.
yes :: Stream s m Char => ParsecT s u m BoolSource
Parse "yes"
, "y"
, or "1"
, case-insensitive. Return True.
no :: Stream s m Char => ParsecT s u m BoolSource
Parse "no"
, "n"
, or "0"
, case-insensitive. Return False.
Programming Idioms
Whitespace
lws :: Stream s m Char => ParsecT s u m StringSource
Short for "linear whitespace": one or more spaces or tabs.
Similar to rule LWS
from RFC2616 §2.2, but without line folding.
newline :: Stream s m Char => ParsecT s u m ()Source
Parse a single line feed or carriage return. Does not succeed at end of file.
lineBreak :: Stream s m Char => ParsecT s u m ()Source
Recognize when the parser is at a line break (LF, CR, or end of input) If the break is due to a CR or LF, consume it.
data IndentPolicy Source
Determine how the depth of indentation is calculated.
dentation :: Stream s m Char => IndentPolicy -> ParsecT s u m IntSource
Parse a lineBreak
followed by whitespace characters.
Return the depth of indentation.
The acceptable whitespace characters and the method by whch to
caclulate depth is determined by an IntentPolicy
.
WARNING: Do not use this combinator with the Lex
module, it
will break because Parsec is sissy.
Identifiers
:: Stream s m Char | |
=> (Char -> Bool) | Whether a character is allowed in the identifier |
-> (Char -> Bool) | Whether the character is disallowed as the first character of the identifier |
-> ParsecT s u m String |
Parse one or more characters that satisfy a predicate, but with additional restrictions on the first character parsed.
This is useful for identifiers (such as [a-zA-Z_][a-zA-Z_0-9]*
), certain kinds of
numbers (such as [1-9][0-9]*
), and possibly other things.
identifier =charClass
"a-zA-Z0-9_" `many1Not`charClass
"0-9" naturalLiteral =stringToInteger
10 <$>charClass
"0-9" `many1Not` (=='0')
Punctuation
inParens :: Stream s m Char => ParsecT s u m a -> ParsecT s u m aSource
Parse between open and close parenthesis.
inBrackets :: Stream s m Char => ParsecT s u m a -> ParsecT s u m aSource
Parse between open and close square brackets ([...]
).
inBraces :: Stream s m Char => ParsecT s u m a -> ParsecT s u m aSource
Parse between open and close curly braces ({...}
).
inAngles :: Stream s m Char => ParsecT s u m a -> ParsecT s u m aSource
Parse between open and close angles (<...>
).
Number Parts
numSign :: Stream s m Char => ParsecT s u m IntegerSource
Parse a minus or plus sign and return the appropriate multiplier.
numBase :: Stream s m Char => ParsecT s u m IntSource
Parse "0x", "0o", or "0b" case-insensitive and return the appropriate base. If none of these parse, return base 10.
numNatural :: Stream s m Char => Int -> ParsecT s u m IntegerSource
Parse many digits in the passed base and return the corresponding integer.
numAfterPoint :: Stream s m Char => Int -> ParsecT s u m RationalSource
Parse many digits in the passed base and return the appropriate ratio.
numDenominator :: Stream s m Char => Int -> ParsecT s u m RationalSource
Parse a natural in the passed base and return its reciprocal.
numOptSign :: Stream s m Char => ParsecT s u m IntegerSource
Optional sign as numSign
, defaults to positive.
numInteger :: Stream s m Char => Int -> ParsecT s u m IntegerSource
Parse an optional sign (as numOptSign
), then a natural number
of the specified base (as in numNatural
).
xDigit :: Stream s m Char => Int -> ParsecT s u m CharSource
Parse a digit in the passed base: 2, 8, 10 or 16.
stringToInteger :: Integral n => Int -> String -> nSource
Interpret a string as an integer in the passed base.
stringToMantissa :: Int -> String -> Ratio IntegerSource
Interpret a string as a mantissa in the passed base.
Numbers
integer :: Stream s m Char => ParsecT s u m IntegerSource
Parse an integer: optional sign, then a number of digits. Bases 10, 16, 8 and 2 are supported with appropriate prefixes before the digits.
No prefix is base 10.
0x
case-insensitive is base 16.
0o
case-insensitive is base 8.
0n
case-insensitive is base 2.
rational :: Stream s m Char => ParsecT s u m RationalSource
Parse a rational number: an optional sign, then two sequences
of digits separated by a slash. Return the ratio of the appropriate
sign between the two numbers. Bases 10, 16, 8 and 2 are supported as
in integer
.
scientific :: Stream s m Char => ParsecT s u m RationalSource
Parse a number in scientific notation: an optional sign, then
a radix mark, two sequences of digits separated by a dot
, and
finally an optional exponent, which is an exponent letter, an
optional sign and finally one or more digits in the same base.
Only bases 10 and 16 are supported, and the base of the exponent
is the same as the base of the significand. In base ten, the
exponent letter is e
, and in base 16, it is h
.
Note that digits are required on both sides of the (hexa)decimal
point, so neither 0.
nor .14
are recognized.
Character Escapes
letterEsc :: Stream s m Char => [(Char, Char)] -> ParsecT s u m CharSource
Parse a backslash and another character; use the passed table to determine the returned character.
decimalEsc :: Stream s m Char => ParsecT s u m CharSource
Escape sequences for any unicode code point. Represented by a backslash + one or more decimal digits. Fails when the code point is not representable in unicode.
asciiEsc :: Stream s m Char => ParsecT s u m CharSource
Escape sequences for bytes (including ASCII). Represented by a backslash + two hexdigits.
loUniEsc :: Stream s m Char => ParsecT s u m CharSource
Escape sequences in the Unicode Basic Multilingual Plane (BMP). C.f. hiUniEsc
.
Represented by a backslash+lowercase u
followed by four hexdigits.
hiUniEsc :: Stream s m Char => ParsecT s u m CharSource
Escape sequences outside the Unicode BMP. C.f. loUniEsc
.
Represented by a backslash+uppercase U
followed by five or six
hexdigits totalling at most 0x10FFFF
cEscapes :: [(Char, Char)]Source
Common characetr escapes in programming language string literals:
\0 -> ASCII 00 (nul) \a -> ASCII 07 (alarm/bell) \b -> ASCII 08 (backspace) \e -> ASCII 1B (escape) \f -> ASCII 0C (form feed) \n -> ASCII 0A (line feed) \r -> ASCII 0D (carriage return) \t -> ASCII 09 (horizontal tab) \v -> ASCII 0B (vertical tab) \' -> single-quote \" -> double-quote \\ -> backslash
String Literals
sqString :: Stream s m Char => ParsecT s u m StringSource
Parse a single-quoted string with no escape sequences, except that a single-quote in the string is encoded as two single-quote characters.
This is as single-quoted strings in SQL and Rc (the shell in Plan9). It is an excellent encoding for strings because they are so easy to validate from untrusted input and escape for rendering, whether it is done by human or machine.
dqString :: Stream s m Char => [(Char, Char)] -> ParsecT s u m StringSource
Parse a double-quoted string with common backslash escape sequences.
- We use
letterEsc
with the passed table of contents. - Also,
decimalEsc
,asciiEsc
anduniEsc
are allowed. - Further, the
\&
stands for no character: it literally adds nothing to the string in which it appears, but it can be useful as in\127\&0
. - Finally, lines can be folded with a backslash-newline-backslash, ignoring
any
lws
between the newline and the second backslash. This is preferred over a simple backslash-newline, as it reduces any need to remember how leading whitespace is treated after a line-fold.
The escapes parsed by letterEsc
are preferred to the other escape
sequences. Note that there are no escape sequences for backslash or
double-quote by default (aside from a numerical escape), so you'll
need to include them in the table for letterEsc
.
Comments
lineComment :: Stream s m Char => String -> ParsecT s u m StringSource
Parse a comment beginning with the passed string and ending at
(but not including) a lineBreak
.
:: Stream s m Char | |
=> String | Start the block comment |
-> String | End the block comment |
-> ParsecT s u m String |
Parse a non-nesting comment beginning at the first passed string and ending with (and including) the second passed string.
C.f nestingComment
.
Parse a nesting block comment.
C.f. blockComment
.
Character Classes
charClass :: String -> Char -> BoolSource
Match any character in a set.
vowel = charClass "aeiou"
Range notation is supported.
halfAlphabet = charClass "a-nA-N"
To add a literal '-'
to a set, place it at the beginning or end
of the string.
You may also invert the set by placing a caret at the beginning of the string.
nonVowel = "^aeiou"
To add a literal '^'
to a set, place it somewhere other than at
the beginning of the string.
uniPrint :: Char -> BoolSource
The class of printable unicode characters, including linear whitesapce.
- Accepts: Letter, Number, Symbol, Space, Punctuation/Quote, Mark, Format, PrivateUse
- Does not Accept: LineSeparator, ParagraphSeparator, Control, Surrogate, NotAssigned
uniPrintMinus :: (Char -> Bool) -> Char -> BoolSource
Accepts characters from uniPrint
, except those which
satisfy the passed predicate.
uniIdClass :: Char -> BoolSource
Accepts a wide variety of unicode characters. This is the largest class of characters which might be used in a programming language that allows unicode identifiers, and probably include a little too much.
- Accepts: Letter, Mark, Number, Punctuation/Quote, Symbol
- Does not Accept: Space, LineSeparator, ParagraphSeparator, Control, Format, Surrogate, PrivateUse, NotAssigned
uniIdClassMinus :: (Char -> Bool) -> Char -> BoolSource
Accepts characters from uniIdClass
, except those which
satisfy the passed predicate.
Re-exports
aChar :: Stream s m Char => (Char -> Bool) -> ParsecT s u m CharSource
Parse a single char when it satisfies the predicate. Fails when the next input character does not satisfy the predicate.
anyChar :: Stream s m Char => ParsecT s u m Char
This parser succeeds for any character. Returns the parsed character.