{-# LANGUAGE Rank2Types, RecordWildCards, FlexibleInstances #-} -- | -- Module : Data.Attoparsec.Text.Internal -- Copyright : Felipe Lessa 2010-2011, Bryan O'Sullivan 2007-2010 -- License : BSD3 -- -- Maintainer : felipe.lessa@gmail.com -- Stability : experimental -- Portability : unknown -- -- Simple, efficient parser combinators for 'T.Text' strings, -- loosely based on the Parsec library, heavily based on attoparsec. module Data.Attoparsec.Text.Internal ( -- * Parser types Parser , Result(..) , S(input) -- * Running parsers , parse -- * Combinators , () , try , module Data.Attoparsec.Combinator -- * Parsing individual characters , satisfy , satisfyWith , anyChar , skip , char , notChar -- ** Character classes , inClass , notInClass -- ** Special character parsers , digit , letter , space -- * Efficient string handling , skipWhile , skipSpace , string , stringTransform , take , takeWhile , takeWhile1 , takeTill -- * Numeric parsers , decimal , hexadecimal , signed , double , rational -- * State observation and manipulation functions , endOfInput , ensure -- * Utilities , endOfLine ) where import Control.Applicative (Alternative(..), Applicative(..), (<$>)) import Control.Monad (MonadPlus(..), when) import Data.Attoparsec.Combinator import Data.Attoparsec.Text.FastSet (charClass, member) import Data.Char import Data.Monoid (Monoid(..)) import Data.Ratio ((%)) import Data.String (IsString(..)) import Prelude hiding (getChar, take, takeWhile) import qualified Data.Text as T data Result r = Fail S [String] String | Partial (T.Text -> Result r) | Done S r -- | The 'Parser' type is a monad. newtype Parser a = Parser { runParser :: forall r. S -> Failure r -> Success a r -> Result r } type Failure r = S -> [String] -> String -> Result r type Success a r = S -> a -> Result r instance IsString (Parser T.Text) where fromString = string . T.pack -- | Have we read all available input? data More = Complete | Incomplete deriving (Eq, Show) plusMore :: More -> More -> More plusMore Complete _ = Complete plusMore _ Complete = Complete plusMore _ _ = Incomplete {-# INLINE plusMore #-} instance Monoid More where mempty = Incomplete mappend = plusMore data S = S { input :: !T.Text , _added :: !T.Text , more :: !More } deriving (Show) instance Show r => Show (Result r) where show (Fail _ stack msg) = "Fail " ++ show stack ++ " " ++ show msg show (Partial _) = "Partial _" show (Done bs r) = "Done " ++ show bs ++ " " ++ show r addS :: S -> S -> S addS (S s0 a0 c0) (S _s1 a1 c1) = S (s0 +++ a1) (a0 +++ a1) (mappend c0 c1) {-# INLINE addS #-} instance Monoid S where mempty = S T.empty T.empty Incomplete mappend = addS bindP :: Parser a -> (a -> Parser b) -> Parser b bindP m g = Parser (\st0 kf ks -> runParser m st0 kf (\s a -> runParser (g a) s kf ks)) {-# INLINE bindP #-} returnP :: a -> Parser a returnP a = Parser (\st0 _kf ks -> ks st0 a) {-# INLINE returnP #-} instance Monad Parser where return = returnP (>>=) = bindP fail = failDesc noAdds :: S -> S noAdds (S s0 _a0 c0) = S s0 T.empty c0 {-# INLINE noAdds #-} plus :: Parser a -> Parser a -> Parser a plus a b = Parser $ \st0 kf ks -> let kf' st1 _ _ = runParser b (mappend st0 st1) kf ks !st2 = noAdds st0 in runParser a st2 kf' ks {-# INLINE plus #-} instance MonadPlus Parser where mzero = failDesc "mzero" mplus = plus fmapP :: (a -> b) -> Parser a -> Parser b fmapP p m = Parser (\st0 f k -> runParser m st0 f (\s a -> k s (p a))) {-# INLINE fmapP #-} instance Functor Parser where fmap = fmapP apP :: Parser (a -> b) -> Parser a -> Parser b apP d e = do b <- d a <- e return (b a) {-# INLINE apP #-} instance Applicative Parser where pure = returnP (<*>) = apP -- These definitions are equal to the defaults, but this -- way the optimizer doesn't have to work so hard to figure -- that out. (*>) = (>>) x <* y = x >>= \a -> y >> return a instance Alternative Parser where empty = failDesc "empty" (<|>) = plus failDesc :: String -> Parser a failDesc err = Parser (\st0 kf _ks -> kf st0 [] msg) where msg = "Failed reading: " ++ err {-# INLINE failDesc #-} -- | Succeed only if at least @n@ characters of input are available. ensure :: Int -> Parser () ensure n = Parser $ \st0@(S s0 _a0 _c0) kf ks -> if T.length s0 >= n then ks st0 () else runParser (demandInput >> ensure n) st0 kf ks -- | Ask for input. If we receive any, pass it to a success -- continuation, otherwise to a failure continuation. prompt :: S -> (S -> Result r) -> (S -> Result r) -> Result r prompt (S s0 a0 _c0) kf ks = Partial $ \s -> if T.null s then kf $! S s0 a0 Complete else ks $! S (s0 +++ s) (a0 +++ s) Incomplete -- | Immediately demand more input via a 'Partial' continuation -- result. demandInput :: Parser () demandInput = Parser $ \st0 kf ks -> if more st0 == Complete then kf st0 ["demandInput"] "not enough characters" else prompt st0 (\st -> kf st ["demandInput"] "not enough characters") (`ks` ()) -- | This parser always succeeds. It returns 'True' if any input is -- available either immediately or on demand, and 'False' if the end -- of all input has been reached. wantInput :: Parser Bool wantInput = Parser $ \st0@(S s0 _a0 c0) _kf ks -> case () of _ | not (T.null s0) -> ks st0 True | c0 == Complete -> ks st0 False | otherwise -> prompt st0 (`ks` False) (`ks` True) get :: Parser T.Text get = Parser (\st0 _kf ks -> ks st0 (input st0)) put :: T.Text -> Parser () put s = Parser (\(S _s0 a0 c0) _kf ks -> ks (S s a0 c0) ()) (+++) :: T.Text -> T.Text -> T.Text (+++) = T.append {-# INLINE (+++) #-} -- | Attempt a parse, and if it fails, rewind the input so that no -- input appears to have been consumed. -- -- This combinator is useful in cases where a parser might consume -- some input before failing, i.e. the parser needs arbitrary -- lookahead. The downside to using this combinator is that it can -- retain input for longer than is desirable. try :: Parser a -> Parser a try p = Parser $ \st0 kf ks -> runParser p (noAdds st0) (kf . mappend st0) ks -- | The parser @satisfy p@ succeeds for any character for which -- the predicate @p@ returns 'True'. Returns the character that -- is actually parsed. -- -- >import Data.Char (isDigit) -- >digit = satisfy isDigit satisfy :: (Char -> Bool) -> Parser Char satisfy p = do ensure 1 s <- get case T.uncons s of Just (h,t) | p h -> put t >> return h | otherwise -> fail "satisfy" Nothing -> error "Data.Attoparsec.Text.Internal.satisfy: never here" -- | The parser @skip p@ succeeds for any character for which the -- predicate @p@ returns 'True'. -- -- >import Data.Char (isDigit) -- >digit = satisfy isDigit skip :: (Char -> Bool) -> Parser () skip p = do ensure 1 s <- get case T.uncons s of Just (h,t) | p h -> put t | otherwise -> fail "skip" Nothing -> error "Data.Attoparsec.Text.Internal.skip: never here" -- | The parser @satisfyWith f p@ transforms a character, and -- succeeds if the predicate @p@ returns 'True' on the -- transformed value. The parser returns the transformed -- character that was parsed. satisfyWith :: (Char -> a) -> (a -> Bool) -> Parser a satisfyWith f p = do ensure 1 s <- get let Just (h,t) = T.uncons s c = f h if p c then put t >> return c else fail "satisfyWith" -- | Parse a single digit. digit :: Parser Char digit = satisfy isDigit "digit" {-# INLINE digit #-} -- | Parse a single letter. letter :: Parser Char letter = satisfy isLetter "letter" {-# INLINE letter #-} -- | Parse a space character. space :: Parser Char space = satisfy isSpace "space" {-# INLINE space #-} -- | Consume @n@ characters of input, but succeed only if the -- predicate returns 'True'. takeWith :: Int -> (T.Text -> Bool) -> Parser T.Text takeWith n p = do ensure n s <- get let (h,t) = T.splitAt n s if p h then put t >> return h else failDesc "takeWith" -- | Consume exactly @n@ characters of input. take :: Int -> Parser T.Text take n = takeWith n (const True) {-# INLINE take #-} -- | @string s@ parses a sequence of characters that identically -- match @s@. Returns the parsed string (i.e. @s@). This parser -- consumes no input if it fails (even if a partial match). -- -- /Note/: The behaviour of this parser is different to that of the -- similarly-named parser in Parsec, as this one is all-or-nothing. -- To illustrate the difference, the following parser will fail under -- Parsec given an input of @"for"@: -- -- >string "foo" <|> string "for" -- -- The reason for its failure is that that the first branch is a -- partial match, and will consume the letters @\'f\'@ and -- @\'o\'@ before failing. In Attoparsec, both the original on -- bytestrings and this one on texts, the above parser will -- /succeed/ on that input, because the failed first branch will -- consume nothing. string :: T.Text -> Parser T.Text string s = takeWith (T.length s) (==s) {-# INLINE string #-} stringTransform :: (T.Text -> T.Text) -> T.Text -> Parser T.Text stringTransform f s = takeWith (T.length s) ((==f s) . f) {-# INLINE stringTransform #-} -- | Skip past input for as long as the predicate returns 'True'. skipWhile :: (Char -> Bool) -> Parser () skipWhile p = go where go = do input <- wantInput when input $ do t <- T.dropWhile p <$> get put t when (T.null t) go -- | Skip over white space. skipSpace :: Parser () skipSpace = skipWhile isSpace >> return () {-# INLINE skipSpace #-} -- | Consume input as long as the predicate returns 'False' -- (i.e. until it returns 'True'), and return the consumed input. -- -- This parser does not fail. It will return an empty string if the -- predicate returns 'True' on the first character of input. -- -- /Note/: Because this parser does not fail, do not use it with -- combinators such as 'many', because such parsers loop until a -- failure occurs. Careless use will thus result in an infinite loop. takeTill :: (Char -> Bool) -> Parser T.Text takeTill p = takeWhile (not . p) {-# INLINE takeTill #-} -- | Consume input as long as the predicate returns 'True', and return -- the consumed input. -- -- This parser does not fail. It will return an empty string if the -- predicate returns 'False' on the first character of input. -- -- /Note/: Because this parser does not fail, do not use it with -- combinators such as 'many', because such parsers loop until a -- failure occurs. Careless use will thus result in an infinite loop. takeWhile :: (Char -> Bool) -> Parser T.Text takeWhile p = go [] where go acc = do input <- wantInput if input then do #if MIN_VERSION_text(0,11,0) (h,t) <- T.span p <$> get #else (h,t) <- T.spanBy p <$> get #endif put t if T.null t then go (h:acc) else return $ if null acc then h else T.concat $ reverse (h:acc) else return $ case acc of [] -> T.empty [x] -> x _ -> T.concat $ reverse acc -- | Consume input as long as the predicate returns 'True', and return -- the consumed input. -- -- This parser requires the predicate to succeed on at least one -- character of input: it will fail if the predicate never -- returns 'True' or if there is no input left. takeWhile1 :: (Char -> Bool) -> Parser T.Text takeWhile1 p = do (`when` demandInput) =<< T.null <$> get #if MIN_VERSION_text(0,11,0) (h,t) <- T.span p <$> get #else (h,t) <- T.spanBy p <$> get #endif when (T.null h) $ failDesc "takeWhile1" put t if T.null t then (h+++) `fmapP` takeWhile p else return h -- | Match any character in a set. -- -- >vowel = inClass "aeiou" -- -- Range notation is supported. -- -- >halfAlphabet = inClass "a-nA-N" -- -- To add a literal @\'-\'@ to a set, place it at the beginning or end -- of the string. inClass :: String -> Char -> Bool inClass s = (`member` mySet) where mySet = charClass s {-# INLINE inClass #-} -- | Match any character not in a set. notInClass :: String -> Char -> Bool notInClass s = not . inClass s {-# INLINE notInClass #-} -- | Match any character. anyChar :: Parser Char anyChar = satisfy $ const True {-# INLINE anyChar #-} -- | Match a specific character. char :: Char -> Parser Char char c = satisfy (== c) show c {-# INLINE char #-} -- | Match any character except the given one. notChar :: Char -> Parser Char notChar c = satisfy (/= c) "not " ++ show c {-# INLINE notChar #-} -- | Parse and decode an unsigned decimal number. decimal :: Integral a => Parser a {-# SPECIALISE decimal :: Parser Int #-} {-# SPECIALISE decimal :: Parser Integer #-} decimal = T.foldl' step 0 `fmap` takeWhile1 asciiIsDigit where step a w = a * 10 + fromIntegral (fromEnum w - 48) asciiIsDigit :: Char -> Bool asciiIsDigit c = c >= '0' && c <= '9' {-# INLINE asciiIsDigit #-} -- | Parse and decode an unsigned hexadecimal number. The hex digits -- @\'a\'@ through @\'f\'@ may be upper or lower case. -- -- This parser does not accept a leading @\"0x\"@ string. hexadecimal :: Integral a => Parser a {-# SPECIALISE hexadecimal :: Parser Int #-} hexadecimal = T.foldl' step 0 `fmap` takeWhile1 asciiIsHexDigit where step a c | c >= '0' && c <= '9' = a * 16 + fromIntegral (fromEnum c - 48) | otherwise = a * 16 + fromIntegral (asciiToLower c - 87) asciiIsHexDigit :: Char -> Bool asciiIsHexDigit c = asciiIsDigit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {-# INLINE asciiIsHexDigit #-} asciiToLower :: Char -> Int asciiToLower c | c >= 'A' && c <= 'Z' = fromEnum c + 32 | otherwise = fromEnum c -- | Parse a number with an optional leading @\'+\'@ or @\'-\'@ sign -- character. signed :: Num a => Parser a -> Parser a {-# SPECIALISE signed :: Parser Int -> Parser Int #-} signed p = (negate <$> (char '-' *> p)) <|> (char '+' *> p) <|> p -- | Parse a rational number. -- -- This parser accepts an optional leading sign character, followed by -- at least one decimal digit. The syntax similar to that accepted by -- the 'read' function, with the exception that a trailing @\'.\'@ or -- @\'e\'@ /not/ followed by a number is not consumed. -- -- Examples with behaviour identical to 'read', if you feed an empty -- continuation to the first result: -- -- >rational "3" == Done 3.0 "" -- >rational "3.1" == Done 3.1 "" -- >rational "3e4" == Done 30000.0 "" -- >rational "3.1e4" == Done 31000.0 "" -- -- Examples with behaviour identical to 'read': -- -- >rational ".3" == Fail "input does not start with a digit" -- >rational "e3" == Fail "input does not start with a digit" -- -- Examples of differences from 'read': -- -- >rational "3.foo" == Done 3.0 ".foo" -- >rational "3e" == Done 3.0 "e" rational :: RealFloat a => Parser a {-# SPECIALIZE rational :: Parser Double #-} rational = floaty $ \real frac fracDenom -> fromRational $ real % 1 + frac % fracDenom -- | Parse a rational number. -- -- The syntax accepted by this parser is the same as for 'rational'. -- -- /Note/: This function is almost ten times faster than 'rational', -- but is slightly less accurate. -- -- The 'Double' type supports about 16 decimal places of accuracy. -- For 94.2% of numbers, this function and 'rational' give identical -- results, but for the remaining 5.8%, this function loses precision -- around the 15th decimal place. For 0.001% of numbers, this -- function will lose precision at the 13th or 14th decimal place. double :: Parser Double double = floaty $ \real frac fracDenom -> fromIntegral real + fromIntegral frac / fromIntegral fracDenom data T = T !Integer !Int floaty :: RealFloat a => (Integer -> Integer -> Integer -> a) -> Parser a {-# INLINE floaty #-} floaty f = do sign <- satisfy (\c -> c == '-' || c == '+') <|> return '+' real <- decimal let tryFraction = do _ <- satisfy (== '.') ds <- takeWhile asciiIsDigit case (case parse decimal ds of Partial k -> k T.empty r -> r) of Done _ n -> return $ T n (T.length ds) _ -> fail "no digits after decimal" T fraction fracDigits <- tryFraction <|> return (T 0 0) let e c = c == 'e' || c == 'E' power <- (satisfy e *> signed decimal) <|> return (0 :: Int) let n = if fracDigits == 0 then if power == 0 then fromIntegral real else fromIntegral real * (10 ^^ power) else if power == 0 then f real fraction (10 ^ fracDigits) else f real fraction (10 ^ fracDigits) * (10 ^^ power) return $! if sign == '+' then n else -n -- | Match only if all input has been consumed. endOfInput :: Parser () endOfInput = Parser $ \st0@S{..} kf ks -> if T.null input then if more == Complete then ks st0 () else let kf' st1 _ _ = ks (mappend st0 st1) () ks' st1 _ = kf (mappend st0 st1) [] "endOfInput" in runParser demandInput st0 kf' ks' else kf st0 [] "endOfInput" -- | Match either a single newline character @\'\\n\'@, or a carriage -- return followed by a newline character @\"\\r\\n\"@. endOfLine :: Parser () endOfLine = (char '\n' >> return ()) <|> (string (T.pack "\r\n") >> return ()) --- | Name the parser, in case failure occurs. () :: Parser a -> String -- ^ the name to use if parsing fails -> Parser a p msg = Parser $ \s kf ks -> runParser p s (\s' strs m -> kf s' (msg:strs) m) ks {-# INLINE () #-} infix 0 -- | Terminal failure continuation. failK :: Failure a failK st0 stack msg = Fail st0 stack msg -- | Terminal success continuation. successK :: Success a a successK state a = Done state a -- | Run a parser. parse :: Parser a -> T.Text -> Result a parse m s = runParser m (S s T.empty Incomplete) failK successK {-# INLINE parse #-}