-- | -- Module : Text.Megaparsec.Byte -- Copyright : © 2015–2018 Megaparsec contributors -- License : FreeBSD -- -- Maintainer : Mark Karpov -- Stability : experimental -- Portability : portable -- -- Commonly used binary parsers. -- -- @since 6.0.0 {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} module Text.Megaparsec.Byte ( -- * Simple parsers newline , crlf , eol , tab , space , space1 -- * Categories of characters , controlChar , spaceChar , upperChar , lowerChar , letterChar , alphaNumChar , printChar , digitChar , binDigitChar , octDigitChar , hexDigitChar , asciiChar -- * Single byte , char , char' -- * Sequence of bytes , string , string' ) where import Control.Applicative import Data.Char hiding (toLower, toUpper) import Data.Functor (void) import Data.Proxy import Data.Word (Word8) import Text.Megaparsec import Text.Megaparsec.Common ---------------------------------------------------------------------------- -- Simple parsers -- | Parse a newline byte. newline :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) newline = char 10 {-# INLINE newline #-} -- | Parse a carriage return character followed by a newline character. -- Return the sequence of characters parsed. crlf :: forall e s m. (MonadParsec e s m, Token s ~ Word8) => m (Tokens s) crlf = string (tokensToChunk (Proxy :: Proxy s) [13,10]) {-# INLINE crlf #-} -- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the -- sequence of characters parsed. eol :: forall e s m. (MonadParsec e s m, Token s ~ Word8) => m (Tokens s) eol = (tokenToChunk (Proxy :: Proxy s) <$> newline) <|> crlf "end of line" {-# INLINE eol #-} -- | Parse a tab character. tab :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) tab = char 9 {-# INLINE tab #-} -- | Skip /zero/ or more white space characters. -- -- See also: 'skipMany' and 'spaceChar'. space :: (MonadParsec e s m, Token s ~ Word8) => m () space = void $ takeWhileP (Just "white space") isSpace' {-# INLINE space #-} -- | Skip /one/ or more white space characters. -- -- See also: 'skipSome' and 'spaceChar'. space1 :: (MonadParsec e s m, Token s ~ Word8) => m () space1 = void $ takeWhile1P (Just "white space") isSpace' {-# INLINE space1 #-} ---------------------------------------------------------------------------- -- Categories of characters -- | Parse a control character. controlChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) controlChar = satisfy (isControl . toChar) "control character" {-# INLINE controlChar #-} -- | Parse a space character, and the control characters: tab, newline, -- carriage return, form feed, and vertical tab. spaceChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) spaceChar = satisfy isSpace' "white space" {-# INLINE spaceChar #-} -- | Parse an upper-case character. upperChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) upperChar = satisfy (isUpper . toChar) "uppercase letter" {-# INLINE upperChar #-} -- | Parse a lower-case alphabetic character. lowerChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) lowerChar = satisfy (isLower . toChar) "lowercase letter" {-# INLINE lowerChar #-} -- | Parse an alphabetic character: lower-case or upper-case. letterChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) letterChar = satisfy (isLetter . toChar) "letter" {-# INLINE letterChar #-} -- | Parse an alphabetic or digit characters. alphaNumChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) alphaNumChar = satisfy (isAlphaNum . toChar) "alphanumeric character" {-# INLINE alphaNumChar #-} -- | Parse a printable character: letter, number, mark, punctuation, symbol -- or space. printChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) printChar = satisfy (isPrint . toChar) "printable character" {-# INLINE printChar #-} -- | Parse an ASCII digit, i.e between “0” and “9”. digitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) digitChar = satisfy isDigit' "digit" where isDigit' x = x >= 48 && x <= 57 {-# INLINE digitChar #-} -- | Parse a binary digit, i.e. “0” or “1”. -- -- @since 7.0.0 binDigitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) binDigitChar = satisfy isBinDigit "binary digit" where isBinDigit x = x == 48 || x == 49 {-# INLINE binDigitChar #-} -- | Parse an octal digit, i.e. between “0” and “7”. octDigitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) octDigitChar = satisfy isOctDigit' "octal digit" where isOctDigit' x = x >= 48 && x <= 55 {-# INLINE octDigitChar #-} -- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or -- “A” and “F”. hexDigitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) hexDigitChar = satisfy (isHexDigit . toChar) "hexadecimal digit" {-# INLINE hexDigitChar #-} -- | Parse a character from the first 128 characters of the Unicode -- character set, corresponding to the ASCII character set. asciiChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s) asciiChar = satisfy (< 128) "ASCII character" {-# INLINE asciiChar #-} ---------------------------------------------------------------------------- -- Single byte -- | A type-constrained version of 'single'. -- -- > newline = char 10 char :: (MonadParsec e s m, Token s ~ Word8) => Token s -> m (Token s) char = single {-# INLINE char #-} -- | The same as 'char' but case-insensitive. This parser returns the -- actually parsed character preserving its case. -- -- >>> parseTest (char' 101) "E" -- 69 -- 'E' -- >>> parseTest (char' 101) "G" -- 1:1: -- unexpected 'G' -- expecting 'E' or 'e' char' :: (MonadParsec e s m, Token s ~ Word8) => Token s -> m (Token s) char' c = choice [ char (toLower c) , char (toUpper c) ] {-# INLINE char' #-} ---------------------------------------------------------------------------- -- Helpers -- | 'Word8'-specialized version of 'isSpace'. isSpace' :: Word8 -> Bool isSpace' x | x >= 9 && x <= 13 = True | x == 32 = True | x == 160 = True | otherwise = False {-# INLINE isSpace' #-} -- | Convert a byte to char. toChar :: Word8 -> Char toChar = chr . fromIntegral {-# INLINE toChar #-} -- | Convert a byte to its upper-case version. toUpper :: Word8 -> Word8 toUpper x | x >= 97 && x <= 122 = x - 32 | x == 247 = x -- division sign | x == 255 = x -- latin small letter y with diaeresis | x >= 224 = x - 32 | otherwise = x {-# INLINE toUpper #-} -- | Convert a byte to its lower-case version. toLower :: Word8 -> Word8 toLower x | x >= 65 && x <= 90 = x + 32 | x == 215 = x -- multiplication sign | x >= 192 && x <= 222 = x + 32 | otherwise = x {-# INLINE toLower #-}