{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE Safe #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}

-- |
-- Module      :  Text.Megaparsec.Char
-- Copyright   :  © 2015–present Megaparsec contributors
--                © 2007 Paolo Martini
--                © 1999–2001 Daan Leijen
-- License     :  FreeBSD
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  non-portable
--
-- Commonly used character parsers.
module Text.Megaparsec.Char
  ( -- * Simple parsers
    newline,
    crlf,
    eol,
    tab,
    space,
    hspace,
    space1,
    hspace1,

    -- * Categories of characters
    controlChar,
    spaceChar,
    upperChar,
    lowerChar,
    letterChar,
    alphaNumChar,
    printChar,
    digitChar,
    binDigitChar,
    octDigitChar,
    hexDigitChar,
    markChar,
    numberChar,
    punctuationChar,
    symbolChar,
    separatorChar,
    asciiChar,
    latin1Char,
    charCategory,
    categoryName,

    -- * Single character
    char,
    char',

    -- * Sequence of characters
    string,
    string',
  )
where

import Control.Applicative
import Data.Char
import Data.Functor (void)
import Data.Proxy
import Text.Megaparsec
import Text.Megaparsec.Common

----------------------------------------------------------------------------
-- Simple parsers

-- | Parse a newline character.
newline :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
newline :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
newline = forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
'\n'
{-# INLINE newline #-}

-- | Parse a carriage return character followed by a newline character.
-- Return the sequence of characters parsed.
crlf :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
crlf :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Tokens s)
crlf = forall e s (m :: * -> *).
MonadParsec e s m =>
Tokens s -> m (Tokens s)
string (forall s. Stream s => Proxy s -> [Token s] -> Tokens s
tokensToChunk (forall {k} (t :: k). Proxy t
Proxy :: Proxy s) String
"\r\n")
{-# INLINE crlf #-}

-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the
-- sequence of characters parsed.
eol :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
eol :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Tokens s)
eol =
  (forall s. Stream s => Proxy s -> Token s -> Tokens s
tokenToChunk (forall {k} (t :: k). Proxy t
Proxy :: Proxy s) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
newline)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Tokens s)
crlf
    forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"end of line"
{-# INLINE eol #-}

-- | Parse a tab character.
tab :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
tab :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab = forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
'\t'
{-# INLINE tab #-}

-- | Skip /zero/ or more white space characters.
--
-- See also: 'skipMany' and 'spaceChar'.
space :: (MonadParsec e s m, Token s ~ Char) => m ()
space :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m ()
space = forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhileP (forall a. a -> Maybe a
Just String
"white space") Char -> Bool
isSpace
{-# INLINE space #-}

-- | Like 'space', but does not accept newlines and carriage returns.
--
-- @since 9.0.0
hspace :: (MonadParsec e s m, Token s ~ Char) => m ()
hspace :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m ()
hspace = forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhileP (forall a. a -> Maybe a
Just String
"white space") Char -> Bool
isHSpace
{-# INLINE hspace #-}

-- | Skip /one/ or more white space characters.
--
-- See also: 'skipSome' and 'spaceChar'.
--
-- @since 6.0.0
space1 :: (MonadParsec e s m, Token s ~ Char) => m ()
space1 :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m ()
space1 = forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P (forall a. a -> Maybe a
Just String
"white space") Char -> Bool
isSpace
{-# INLINE space1 #-}

-- | Like 'space1', but does not accept newlines and carriage returns.
--
-- @since 9.0.0
hspace1 :: (MonadParsec e s m, Token s ~ Char) => m ()
hspace1 :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m ()
hspace1 = forall (f :: * -> *) a. Functor f => f a -> f ()
void forall a b. (a -> b) -> a -> b
$ forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P (forall a. a -> Maybe a
Just String
"white space") Char -> Bool
isHSpace
{-# INLINE hspace1 #-}

----------------------------------------------------------------------------
-- Categories of characters

-- | Parse a control character (a non-printing character of the Latin-1
-- subset of Unicode).
controlChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
controlChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
controlChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isControl forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"control character"
{-# INLINE controlChar #-}

-- | Parse a Unicode space character, and the control characters: tab,
-- newline, carriage return, form feed, and vertical tab.
spaceChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
spaceChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
spaceChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isSpace forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"white space"
{-# INLINE spaceChar #-}

-- | Parse an upper-case or title-case alphabetic Unicode character. Title
-- case is used by a small number of letter ligatures like the
-- single-character form of Lj.
upperChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
upperChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
upperChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isUpper forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"uppercase letter"
{-# INLINE upperChar #-}

-- | Parse a lower-case alphabetic Unicode character.
lowerChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
lowerChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
lowerChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isLower forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"lowercase letter"
{-# INLINE lowerChar #-}

-- | Parse an alphabetic Unicode character: lower-case, upper-case, or
-- title-case letter, or a letter of case-less scripts\/modifier letter.
letterChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
letterChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
letterChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isLetter forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"letter"
{-# INLINE letterChar #-}

-- | Parse an alphabetic or numeric digit Unicode characters.
--
-- Note that the numeric digits outside the ASCII range are parsed by this
-- parser but not by 'digitChar'. Such digits may be part of identifiers but
-- are not used by the printer and reader to represent numbers.
alphaNumChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
alphaNumChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
alphaNumChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isAlphaNum forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"alphanumeric character"
{-# INLINE alphaNumChar #-}

-- | Parse a printable Unicode character: letter, number, mark, punctuation,
-- symbol or space.
printChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
printChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
printChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isPrint forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"printable character"
{-# INLINE printChar #-}

-- | Parse an ASCII digit, i.e between “0” and “9”.
digitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
digitChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
digitChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isDigit forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"digit"
{-# INLINE digitChar #-}

-- | Parse a binary digit, i.e. "0" or "1".
--
-- @since 7.0.0
binDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
binDigitChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
binDigitChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isBinDigit forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"binary digit"
  where
    isBinDigit :: Char -> Bool
isBinDigit Char
x = Char
x forall a. Eq a => a -> a -> Bool
== Char
'0' Bool -> Bool -> Bool
|| Char
x forall a. Eq a => a -> a -> Bool
== Char
'1'
{-# INLINE binDigitChar #-}

-- | Parse an octal digit, i.e. between “0” and “7”.
octDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
octDigitChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
octDigitChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isOctDigit forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"octal digit"
{-# INLINE octDigitChar #-}

-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or
-- “A” and “F”.
hexDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
hexDigitChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
hexDigitChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isHexDigit forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"hexadecimal digit"
{-# INLINE hexDigitChar #-}

-- | Parse a Unicode mark character (accents and the like), which combines
-- with preceding characters.
markChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
markChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
markChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isMark forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"mark character"
{-# INLINE markChar #-}

-- | Parse a Unicode numeric character, including digits from various
-- scripts, Roman numerals, etc.
numberChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
numberChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
numberChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isNumber forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"numeric character"
{-# INLINE numberChar #-}

-- | Parse a Unicode punctuation character, including various kinds of
-- connectors, brackets and quotes.
punctuationChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
punctuationChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
punctuationChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isPunctuation forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"punctuation"
{-# INLINE punctuationChar #-}

-- | Parse a Unicode symbol characters, including mathematical and currency
-- symbols.
symbolChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
symbolChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
symbolChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isSymbol forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"symbol"
{-# INLINE symbolChar #-}

-- | Parse a Unicode space and separator characters.
separatorChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
separatorChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
separatorChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isSeparator forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"separator"
{-# INLINE separatorChar #-}

-- | Parse a character from the first 128 characters of the Unicode
-- character set, corresponding to the ASCII character set.
asciiChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
asciiChar :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
asciiChar = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isAscii forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"ASCII character"
{-# INLINE asciiChar #-}

-- | Parse a character from the first 256 characters of the Unicode
-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.
latin1Char :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
latin1Char :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
latin1Char = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy Char -> Bool
isLatin1 forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> String
"Latin-1 character"
{-# INLINE latin1Char #-}

-- | @'charCategory' cat@ parses character in Unicode General Category
-- @cat@, see 'Data.Char.GeneralCategory'.
charCategory ::
  (MonadParsec e s m, Token s ~ Char) =>
  GeneralCategory ->
  m (Token s)
charCategory :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
GeneralCategory -> m (Token s)
charCategory GeneralCategory
cat = forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy ((forall a. Eq a => a -> a -> Bool
== GeneralCategory
cat) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> GeneralCategory
generalCategory) forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
<?> GeneralCategory -> String
categoryName GeneralCategory
cat
{-# INLINE charCategory #-}

-- | Return the human-readable name of Unicode General Category.
categoryName :: GeneralCategory -> String
categoryName :: GeneralCategory -> String
categoryName = \case
  GeneralCategory
UppercaseLetter -> String
"uppercase letter"
  GeneralCategory
LowercaseLetter -> String
"lowercase letter"
  GeneralCategory
TitlecaseLetter -> String
"titlecase letter"
  GeneralCategory
ModifierLetter -> String
"modifier letter"
  GeneralCategory
OtherLetter -> String
"other letter"
  GeneralCategory
NonSpacingMark -> String
"non-spacing mark"
  GeneralCategory
SpacingCombiningMark -> String
"spacing combining mark"
  GeneralCategory
EnclosingMark -> String
"enclosing mark"
  GeneralCategory
DecimalNumber -> String
"decimal number character"
  GeneralCategory
LetterNumber -> String
"letter number character"
  GeneralCategory
OtherNumber -> String
"other number character"
  GeneralCategory
ConnectorPunctuation -> String
"connector punctuation"
  GeneralCategory
DashPunctuation -> String
"dash punctuation"
  GeneralCategory
OpenPunctuation -> String
"open punctuation"
  GeneralCategory
ClosePunctuation -> String
"close punctuation"
  GeneralCategory
InitialQuote -> String
"initial quote"
  GeneralCategory
FinalQuote -> String
"final quote"
  GeneralCategory
OtherPunctuation -> String
"other punctuation"
  GeneralCategory
MathSymbol -> String
"math symbol"
  GeneralCategory
CurrencySymbol -> String
"currency symbol"
  GeneralCategory
ModifierSymbol -> String
"modifier symbol"
  GeneralCategory
OtherSymbol -> String
"other symbol"
  GeneralCategory
Space -> String
"white space"
  GeneralCategory
LineSeparator -> String
"line separator"
  GeneralCategory
ParagraphSeparator -> String
"paragraph separator"
  GeneralCategory
Control -> String
"control character"
  GeneralCategory
Format -> String
"format character"
  GeneralCategory
Surrogate -> String
"surrogate character"
  GeneralCategory
PrivateUse -> String
"private-use Unicode character"
  GeneralCategory
NotAssigned -> String
"non-assigned Unicode character"

----------------------------------------------------------------------------
-- Single character

-- | A type-constrained version of 'single'.
--
-- > semicolon = char ';'
char :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
char :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char = forall e s (m :: * -> *).
MonadParsec e s m =>
Token s -> m (Token s)
single
{-# INLINE char #-}

-- | The same as 'char' but case-insensitive. This parser returns the
-- actually parsed character preserving its case.
--
-- >>> parseTest (char' 'e') "E"
-- 'E'
-- >>> parseTest (char' 'e') "G"
-- 1:1:
-- unexpected 'G'
-- expecting 'E' or 'e'
char' :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
char' :: forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char' Token s
c =
  forall (f :: * -> *) (m :: * -> *) a.
(Foldable f, Alternative m) =>
f (m a) -> m a
choice
    [ forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char (Char -> Char
toLower Token s
c),
      forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char (Char -> Char
toUpper Token s
c),
      forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char (Char -> Char
toTitle Token s
c)
    ]
{-# INLINE char' #-}

----------------------------------------------------------------------------
-- Helpers

-- | Is it a horizontal space character?
isHSpace :: Char -> Bool
isHSpace :: Char -> Bool
isHSpace Char
x = Char -> Bool
isSpace Char
x Bool -> Bool -> Bool
&& Char
x forall a. Eq a => a -> a -> Bool
/= Char
'\n' Bool -> Bool -> Bool
&& Char
x forall a. Eq a => a -> a -> Bool
/= Char
'\r'