{-# LANGUAGE BangPatterns, FlexibleInstances, TypeFamilies, TypeSynonymInstances, GADTs #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -- | -- Module : Data.Attoparsec.ByteString.Char8 -- Copyright : Bryan O'Sullivan 2007-2011 -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : unknown -- -- Simple, efficient, character-oriented combinator parsing for -- 'B.ByteString' strings, loosely based on the Parsec library. module Data.Attoparsec.ByteString.Char8 ( -- * Character encodings -- $encodings -- * Parser types Parser , A.Result , A.IResult(..) , I.compareResults -- * Running parsers , A.parse , A.feed , A.parseOnly , A.parseTest , A.parseWith -- ** Result conversion , A.maybeResult , A.eitherResult -- * Combinators , (I.) , I.try , module Data.Attoparsec.Combinator -- * Parsing individual characters , char , char8 , anyChar , notChar , peekChar , satisfy -- ** Special character parsers , digit , letter_iso8859_15 , letter_ascii , space -- ** Fast predicates , isDigit , isDigit_w8 , isAlpha_iso8859_15 , isAlpha_ascii , isSpace , isSpace_w8 -- *** Character classes , inClass , notInClass -- * Efficient string handling , I.string , stringCI , skipSpace , skipWhile , I.take , scan , takeWhile , takeWhile1 , takeTill -- ** String combinators -- $specalt , (.*>) , (<*.) -- ** Consume all remaining input , I.takeByteString , I.takeLazyByteString -- * Text parsing , I.endOfLine , isEndOfLine , isHorizontalSpace -- * Numeric parsers , decimal , hexadecimal , signed , double , Number(..) , number , rational -- * State observation and manipulation functions , I.endOfInput , I.atEnd ) where import Control.Applicative ((*>), (<*), (<$>), (<|>)) import Data.Attoparsec.ByteString.FastSet (charClass, memberChar) import Data.Attoparsec.ByteString.Internal (Parser, ()) import Data.Attoparsec.Combinator import Data.Attoparsec.Number (Number(..)) import Data.Bits (Bits, (.|.), shiftL) import Data.ByteString.Internal (c2w, w2c) import Data.Int (Int8, Int16, Int32, Int64) import Data.Ratio ((%)) import Data.String (IsString(..)) import Data.Word (Word8, Word16, Word32, Word64, Word) import Prelude hiding (takeWhile) import qualified Data.Attoparsec.ByteString as A import qualified Data.Attoparsec.ByteString.Internal as I import qualified Data.Attoparsec.Internal as I import qualified Data.ByteString as B8 import qualified Data.ByteString.Char8 as B instance (a ~ B.ByteString) => IsString (Parser a) where fromString = I.string . B.pack -- $encodings -- -- This module is intended for parsing text that is -- represented using an 8-bit character set, e.g. ASCII or -- ISO-8859-15. It /does not/ make any attempt to deal with character -- encodings, multibyte characters, or wide characters. In -- particular, all attempts to use characters above code point U+00FF -- will give wrong answers. -- -- Code points below U+0100 are simply translated to and from their -- numeric values, so e.g. the code point U+00A4 becomes the byte -- @0xA4@ (which is the Euro symbol in ISO-8859-15, but the generic -- currency sign in ISO-8859-1). Haskell 'Char' values above U+00FF -- are truncated, so e.g. U+1D6B7 is truncated to the byte @0xB7@. -- ASCII-specific but fast, oh yes. toLower :: Word8 -> Word8 toLower w | w >= 65 && w <= 90 = w + 32 | otherwise = w -- | Satisfy a literal string, ignoring case. stringCI :: B.ByteString -> Parser B.ByteString stringCI = I.stringTransform (B8.map toLower) {-# INLINE stringCI #-} -- | Consume input as long as the predicate returns 'True', and return -- the consumed input. -- -- This parser requires the predicate to succeed on at least one byte -- of input: it will fail if the predicate never returns 'True' or if -- there is no input left. takeWhile1 :: (Char -> Bool) -> Parser B.ByteString takeWhile1 p = I.takeWhile1 (p . w2c) {-# INLINE takeWhile1 #-} -- | The parser @satisfy p@ succeeds for any byte for which the -- predicate @p@ returns 'True'. Returns the byte that is actually -- parsed. -- -- >digit = satisfy isDigit -- > where isDigit c = c >= '0' && c <= '9' satisfy :: (Char -> Bool) -> Parser Char satisfy = I.satisfyWith w2c {-# INLINE satisfy #-} -- | Match a letter, in the ISO-8859-15 encoding. letter_iso8859_15 :: Parser Char letter_iso8859_15 = satisfy isAlpha_iso8859_15 "letter_iso8859_15" {-# INLINE letter_iso8859_15 #-} -- | Match a letter, in the ASCII encoding. letter_ascii :: Parser Char letter_ascii = satisfy isAlpha_ascii "letter_ascii" {-# INLINE letter_ascii #-} -- | A fast alphabetic predicate for the ISO-8859-15 encoding -- -- /Note/: For all character encodings other than ISO-8859-15, and -- almost all Unicode code points above U+00A3, this predicate gives -- /wrong answers/. isAlpha_iso8859_15 :: Char -> Bool isAlpha_iso8859_15 c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '\166' && moby c) where moby = notInClass "\167\169\171-\179\182\183\185\187\191\215\247" {-# NOINLINE moby #-} {-# INLINE isAlpha_iso8859_15 #-} -- | A fast alphabetic predicate for the ASCII encoding -- -- /Note/: For all character encodings other than ASCII, and -- almost all Unicode code points above U+007F, this predicate gives -- /wrong answers/. isAlpha_ascii :: Char -> Bool isAlpha_ascii c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') {-# INLINE isAlpha_ascii #-} -- | Parse a single digit. digit :: Parser Char digit = satisfy isDigit "digit" {-# INLINE digit #-} -- | A fast digit predicate. isDigit :: Char -> Bool isDigit c = c >= '0' && c <= '9' {-# INLINE isDigit #-} -- | A fast digit predicate. isDigit_w8 :: Word8 -> Bool isDigit_w8 w = w >= 48 && w <= 57 {-# INLINE isDigit_w8 #-} -- | Match any character. anyChar :: Parser Char anyChar = satisfy $ const True {-# INLINE anyChar #-} -- | Match any character. Returns 'Nothing' if end of input has been -- reached. Does not consume any input. -- -- /Note/: Because this parser does not fail, do not use it with -- combinators such as 'many', because such parsers loop until a -- failure occurs. Careless use will thus result in an infinite loop. peekChar :: Parser (Maybe Char) peekChar = (fmap w2c) `fmap` I.peekWord8 {-# INLINE peekChar #-} -- | Fast predicate for matching ASCII space characters. -- -- /Note/: This predicate only gives correct answers for the ASCII -- encoding. For instance, it does not recognise U+00A0 (non-breaking -- space) as a space character, even though it is a valid ISO-8859-15 -- byte. For a Unicode-aware and only slightly slower predicate, -- use 'Data.Char.isSpace' isSpace :: Char -> Bool isSpace c = (c == ' ') || ('\t' <= c && c <= '\r') {-# INLINE isSpace #-} -- | Fast 'Word8' predicate for matching ASCII space characters. isSpace_w8 :: Word8 -> Bool isSpace_w8 w = (w == 32) || (9 <= w && w <= 13) {-# INLINE isSpace_w8 #-} -- | Parse a space character. -- -- /Note/: This parser only gives correct answers for the ASCII -- encoding. For instance, it does not recognise U+00A0 (non-breaking -- space) as a space character, even though it is a valid ISO-8859-15 -- byte. space :: Parser Char space = satisfy isSpace "space" {-# INLINE space #-} -- | Match a specific character. char :: Char -> Parser Char char c = satisfy (== c) [c] {-# INLINE char #-} -- | Match a specific character, but return its 'Word8' value. char8 :: Char -> Parser Word8 char8 c = I.satisfy (== c2w c) [c] {-# INLINE char8 #-} -- | Match any character except the given one. notChar :: Char -> Parser Char notChar c = satisfy (/= c) "not " ++ [c] {-# INLINE notChar #-} -- | Match any character in a set. -- -- >vowel = inClass "aeiou" -- -- Range notation is supported. -- -- >halfAlphabet = inClass "a-nA-N" -- -- To add a literal \'-\' to a set, place it at the beginning or end -- of the string. inClass :: String -> Char -> Bool inClass s = (`memberChar` mySet) where mySet = charClass s {-# INLINE inClass #-} -- | Match any character not in a set. notInClass :: String -> Char -> Bool notInClass s = not . inClass s {-# INLINE notInClass #-} -- | Consume input as long as the predicate returns 'True', and return -- the consumed input. -- -- This parser does not fail. It will return an empty string if the -- predicate returns 'False' on the first byte of input. -- -- /Note/: Because this parser does not fail, do not use it with -- combinators such as 'many', because such parsers loop until a -- failure occurs. Careless use will thus result in an infinite loop. takeWhile :: (Char -> Bool) -> Parser B.ByteString takeWhile p = I.takeWhile (p . w2c) {-# INLINE takeWhile #-} -- | A stateful scanner. The predicate consumes and transforms a -- state argument, and each transformed state is passed to successive -- invocations of the predicate on each byte of the input until one -- returns 'Nothing' or the input ends. -- -- This parser does not fail. It will return an empty string if the -- predicate returns 'Nothing' on the first byte of input. -- -- /Note/: Because this parser does not fail, do not use it with -- combinators such as 'many', because such parsers loop until a -- failure occurs. Careless use will thus result in an infinite loop. scan :: s -> (s -> Char -> Maybe s) -> Parser B.ByteString scan s0 p = I.scan s0 (\s -> p s . w2c) {-# INLINE scan #-} -- | Consume input as long as the predicate returns 'False' -- (i.e. until it returns 'True'), and return the consumed input. -- -- This parser does not fail. It will return an empty string if the -- predicate returns 'True' on the first byte of input. -- -- /Note/: Because this parser does not fail, do not use it with -- combinators such as 'many', because such parsers loop until a -- failure occurs. Careless use will thus result in an infinite loop. takeTill :: (Char -> Bool) -> Parser B.ByteString takeTill p = I.takeTill (p . w2c) {-# INLINE takeTill #-} -- | Skip past input for as long as the predicate returns 'True'. skipWhile :: (Char -> Bool) -> Parser () skipWhile p = I.skipWhile (p . w2c) {-# INLINE skipWhile #-} -- | Skip over white space. skipSpace :: Parser () skipSpace = I.skipWhile isSpace_w8 {-# INLINE skipSpace #-} -- $specalt -- -- The '.*>' and '<*.' combinators are intended for use with the -- @OverloadedStrings@ language extension. They simplify the common -- task of matching a statically known string, then immediately -- parsing something else. -- -- An example makes this easier to understand: -- -- @{-\# LANGUAGE OverloadedStrings #-} -- -- shoeSize = \"Shoe size: \" '.*>' 'decimal' -- @ -- -- If we were to try to use '*>' above instead, the type checker would -- not be able to tell which 'IsString' instance to use for the text -- in quotes. We would have to be explicit, using either a type -- signature or the 'I.string' parser. -- | Type-specialized version of '*>' for 'B.ByteString'. (.*>) :: B.ByteString -> Parser a -> Parser a s .*> f = I.string s *> f -- | Type-specialized version of '<*' for 'B.ByteString'. (<*.) :: Parser a -> B.ByteString -> Parser a f <*. s = f <* I.string s -- | A predicate that matches either a carriage return @\'\\r\'@ or -- newline @\'\\n\'@ character. isEndOfLine :: Word8 -> Bool isEndOfLine w = w == 13 || w == 10 {-# INLINE isEndOfLine #-} -- | A predicate that matches either a space @\' \'@ or horizontal tab -- @\'\\t\'@ character. isHorizontalSpace :: Word8 -> Bool isHorizontalSpace w = w == 32 || w == 9 {-# INLINE isHorizontalSpace #-} -- | Parse and decode an unsigned hexadecimal number. The hex digits -- @\'a\'@ through @\'f\'@ may be upper or lower case. -- -- This parser does not accept a leading @\"0x\"@ string. hexadecimal :: (Integral a, Bits a) => Parser a hexadecimal = B8.foldl' step 0 `fmap` I.takeWhile1 isHexDigit where isHexDigit w = (w >= 48 && w <= 57) || (w >= 97 && w <= 102) || (w >= 65 && w <= 70) step a w | w >= 48 && w <= 57 = (a `shiftL` 4) .|. fromIntegral (w - 48) | w >= 97 = (a `shiftL` 4) .|. fromIntegral (w - 87) | otherwise = (a `shiftL` 4) .|. fromIntegral (w - 55) {-# SPECIALISE hexadecimal :: Parser Int #-} {-# SPECIALISE hexadecimal :: Parser Int8 #-} {-# SPECIALISE hexadecimal :: Parser Int16 #-} {-# SPECIALISE hexadecimal :: Parser Int32 #-} {-# SPECIALISE hexadecimal :: Parser Int64 #-} {-# SPECIALISE hexadecimal :: Parser Integer #-} {-# SPECIALISE hexadecimal :: Parser Word #-} {-# SPECIALISE hexadecimal :: Parser Word8 #-} {-# SPECIALISE hexadecimal :: Parser Word16 #-} {-# SPECIALISE hexadecimal :: Parser Word32 #-} {-# SPECIALISE hexadecimal :: Parser Word64 #-} -- | Parse and decode an unsigned decimal number. decimal :: Integral a => Parser a decimal = B8.foldl' step 0 `fmap` I.takeWhile1 isDig where isDig w = w >= 48 && w <= 57 step a w = a * 10 + fromIntegral (w - 48) {-# SPECIALISE decimal :: Parser Int #-} {-# SPECIALISE decimal :: Parser Int8 #-} {-# SPECIALISE decimal :: Parser Int16 #-} {-# SPECIALISE decimal :: Parser Int32 #-} {-# SPECIALISE decimal :: Parser Int64 #-} {-# SPECIALISE decimal :: Parser Integer #-} {-# SPECIALISE decimal :: Parser Word #-} {-# SPECIALISE decimal :: Parser Word8 #-} {-# SPECIALISE decimal :: Parser Word16 #-} {-# SPECIALISE decimal :: Parser Word32 #-} {-# SPECIALISE decimal :: Parser Word64 #-} -- | Parse a number with an optional leading @\'+\'@ or @\'-\'@ sign -- character. signed :: Num a => Parser a -> Parser a {-# SPECIALISE signed :: Parser Int -> Parser Int #-} {-# SPECIALISE signed :: Parser Int8 -> Parser Int8 #-} {-# SPECIALISE signed :: Parser Int16 -> Parser Int16 #-} {-# SPECIALISE signed :: Parser Int32 -> Parser Int32 #-} {-# SPECIALISE signed :: Parser Int64 -> Parser Int64 #-} {-# SPECIALISE signed :: Parser Integer -> Parser Integer #-} signed p = (negate <$> (char8 '-' *> p)) <|> (char8 '+' *> p) <|> p -- | Parse a rational number. -- -- This parser accepts an optional leading sign character, followed by -- at least one decimal digit. The syntax similar to that accepted by -- the 'read' function, with the exception that a trailing @\'.\'@ or -- @\'e\'@ /not/ followed by a number is not consumed. -- -- Examples with behaviour identical to 'read', if you feed an empty -- continuation to the first result: -- -- >rational "3" == Done 3.0 "" -- >rational "3.1" == Done 3.1 "" -- >rational "3e4" == Done 30000.0 "" -- >rational "3.1e4" == Done 31000.0, "" -- -- Examples with behaviour identical to 'read': -- -- >rational ".3" == Fail "input does not start with a digit" -- >rational "e3" == Fail "input does not start with a digit" -- -- Examples of differences from 'read': -- -- >rational "3.foo" == Done 3.0 ".foo" -- >rational "3e" == Done 3.0 "e" -- -- This function does not accept string representations of \"NaN\" or -- \"Infinity\". rational :: Fractional a => Parser a {-# SPECIALIZE rational :: Parser Double #-} {-# SPECIALIZE rational :: Parser Float #-} {-# SPECIALIZE rational :: Parser Rational #-} rational = floaty $ \real frac fracDenom -> fromRational $ real % 1 + frac % fracDenom -- | Parse a rational number. -- -- The syntax accepted by this parser is the same as for 'rational'. -- -- /Note/: This function is almost ten times faster than 'rational', -- but is slightly less accurate. -- -- The 'Double' type supports about 16 decimal places of accuracy. -- For 94.2% of numbers, this function and 'rational' give identical -- results, but for the remaining 5.8%, this function loses precision -- around the 15th decimal place. For 0.001% of numbers, this -- function will lose precision at the 13th or 14th decimal place. -- -- This function does not accept string representations of \"NaN\" or -- \"Infinity\". double :: Parser Double double = floaty asDouble asDouble :: Integer -> Integer -> Integer -> Double asDouble real frac fracDenom = fromIntegral real + fromIntegral frac / fromIntegral fracDenom {-# INLINE asDouble #-} -- | Parse a number, attempting to preserve both speed and precision. -- -- The syntax accepted by this parser is the same as for 'rational'. -- -- /Note/: This function is almost ten times faster than 'rational'. -- On integral inputs, it gives perfectly accurate answers, and on -- floating point inputs, it is slightly less accurate than -- 'rational'. -- -- This function does not accept string representations of \"NaN\" or -- \"Infinity\". number :: Parser Number number = floaty $ \real frac fracDenom -> if frac == 0 && fracDenom == 0 then I real else D (asDouble real frac fracDenom) {-# INLINE number #-} data T = T !Integer !Int floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Parser a {-# INLINE floaty #-} floaty f = do let minus = 45 plus = 43 !positive <- ((== plus) <$> I.satisfy (\c -> c == minus || c == plus)) <|> return True real <- decimal let tryFraction = do let dot = 46 _ <- I.satisfy (==dot) ds <- I.takeWhile isDigit_w8 case I.parseOnly decimal ds of Right n -> return $ T n (B.length ds) _ -> fail "no digits after decimal" T fraction fracDigits <- tryFraction <|> return (T 0 0) let littleE = 101 bigE = 69 e w = w == littleE || w == bigE power <- (I.satisfy e *> signed decimal) <|> return (0::Int) let n = if fracDigits == 0 then if power == 0 then fromIntegral real else fromIntegral real * (10 ^^ power) else if power == 0 then f real fraction (10 ^ fracDigits) else f real fraction (10 ^ fracDigits) * (10 ^^ power) return $ if positive then n else -n