{-# LANGUAGE BangPatterns, Rank2Types, RecordWildCards, FlexibleInstances #-} -- | -- Module : Data.Attoparsec.Text.Internal -- Copyright : Felipe Lessa 2010-2011, Bryan O'Sullivan 2007-2010 -- License : BSD3 -- -- Maintainer : felipe.lessa@gmail.com -- Stability : experimental -- Portability : unknown -- -- Simple, efficient parser combinators for 'T.Text' strings, -- loosely based on the Parsec library, heavily based on attoparsec. module Data.Attoparsec.Text.Internal ( -- * Parser types Parser , Result(..) -- * Running parsers , parse , parseOnly -- * Combinators , () , try , module Data.Attoparsec.Combinator -- * Parsing individual characters , satisfy , satisfyWith , anyChar , skip , char , notChar -- ** Character classes , inClass , notInClass -- ** Special character parsers , digit , letter , space -- * Efficient string handling , skipWhile , skipSpace , string , stringTransform , take , scan , takeWhile , takeWhile1 , takeTill -- ** Consume all remaining input , takeText , takeLazyText -- * Numeric parsers , decimal , hexadecimal , signed , double , rational -- * State observation and manipulation functions , endOfInput , atEnd , ensure -- * Utilities , endOfLine ) where import Control.Applicative (Alternative(..), Applicative(..), (<$>)) import Control.Monad (MonadPlus(..), when) import Data.Attoparsec.Combinator import Data.Attoparsec.Text.FastSet (charClass, member) import Data.Char import Data.Monoid (Monoid(..)) import Data.Ratio ((%)) import Data.String (IsString(..)) import Prelude hiding (getChar, take, takeWhile) import qualified Data.Text as T import qualified Data.Text.Lazy as TL -- | The result of a parse. data Result r = Fail T.Text [String] String -- ^ The parse failed. The 'T.Text' is the input -- that had not yet been consumed when the failure -- occurred. The @[@'String'@]@ is a list of contexts -- in which the error occurred. The 'String' is the -- message describing the error, if any. | Partial (T.Text -> Result r) -- ^ Supply this continuation with more input so that -- the parser can resume. To indicate that no more -- input is available, use an 'T.empty' string. | Done T.Text r -- ^ The parse succeeded. The 'T.Text' is the -- input that had not yet been consumed (if any) when -- the parse succeeded. instance Show r => Show (Result r) where show (Fail bs stk msg) = "Fail " ++ show bs ++ " " ++ show stk ++ " " ++ show msg show (Partial _) = "Partial _" show (Done bs r) = "Done " ++ show bs ++ " " ++ show r fmapR :: (a -> b) -> Result a -> Result b fmapR _ (Fail st stk msg) = Fail st stk msg fmapR f (Partial k) = Partial (fmapR f . k) fmapR f (Done bs r) = Done bs (f r) instance Functor Result where fmap = fmapR newtype Input = I {unI :: T.Text} newtype Added = A {unA :: T.Text} -- | The 'Parser' type is a monad. newtype Parser a = Parser { runParser :: forall r. Input -> Added -> More -> Failure r -> Success a r -> Result r } type Failure r = Input -> Added -> More -> [String] -> String -> Result r type Success a r = Input -> Added -> More -> a -> Result r instance IsString (Parser T.Text) where fromString = string . T.pack -- | Have we read all available input? data More = Complete | Incomplete deriving (Eq, Show) addS :: Input -> Added -> More -> Input -> Added -> More -> (Input -> Added -> More -> r) -> r addS i0 a0 m0 _i1 a1 m1 f = let !i = I (unI i0 +++ unA a1) a = A (unA a0 +++ unA a1) !m = m0 <> m1 in f i a m where Complete <> _ = Complete _ <> Complete = Complete _ <> _ = Incomplete {-# INLINE addS #-} bindP :: Parser a -> (a -> Parser b) -> Parser b bindP m g = Parser $ \i0 a0 m0 kf ks -> runParser m i0 a0 m0 kf $ \i1 a1 m1 a -> runParser (g a) i1 a1 m1 kf ks {-# INLINE bindP #-} returnP :: a -> Parser a returnP a = Parser (\i0 a0 m0 _kf ks -> ks i0 a0 m0 a) {-# INLINE returnP #-} instance Monad Parser where return = returnP (>>=) = bindP fail = failDesc noAdds :: Input -> Added -> More -> (Input -> Added -> More -> r) -> r noAdds i0 _a0 m0 f = f i0 (A T.empty) m0 {-# INLINE noAdds #-} plus :: Parser a -> Parser a -> Parser a plus a b = Parser $ \i0 a0 m0 kf ks -> let kf' i1 a1 m1 _ _ = addS i0 a0 m0 i1 a1 m1 $ \ i2 a2 m2 -> runParser b i2 a2 m2 kf ks in noAdds i0 a0 m0 $ \i2 a2 m2 -> runParser a i2 a2 m2 kf' ks {-# INLINE plus #-} instance MonadPlus Parser where mzero = failDesc "mzero" {-# INLINE mzero #-} mplus = plus fmapP :: (a -> b) -> Parser a -> Parser b fmapP p m = Parser $ \i0 a0 m0 f k -> runParser m i0 a0 m0 f $ \i1 a1 s1 a -> k i1 a1 s1 (p a) {-# INLINE fmapP #-} instance Functor Parser where fmap = fmapP apP :: Parser (a -> b) -> Parser a -> Parser b apP d e = do b <- d a <- e return (b a) {-# INLINE apP #-} instance Applicative Parser where pure = returnP (<*>) = apP -- These definitions are equal to the defaults, but this -- way the optimizer doesn't have to work so hard to figure -- that out. (*>) = (>>) x <* y = x >>= \a -> y >> return a instance Monoid (Parser a) where mempty = failDesc "mempty" {-# INLINE mempty #-} mappend = plus instance Alternative Parser where empty = failDesc "empty" {-# INLINE empty #-} (<|>) = plus failDesc :: String -> Parser a failDesc err = Parser (\i0 a0 m0 kf _ks -> kf i0 a0 m0 [] msg) where msg = "Failed reading: " ++ err {-# INLINE failDesc #-} -- | If at least @n@ bytes of input are available, return the current -- input, otherwise fail. ensure :: Int -> Parser T.Text ensure !n = Parser $ \i0 a0 m0 kf ks -> if T.length (unI i0) >= n then ks i0 a0 m0 (unI i0) else runParser (demandInput >> ensure n) i0 a0 m0 kf ks -- | Ask for input. If we receive any, pass it to a success -- continuation, otherwise to a failure continuation. prompt :: Input -> Added -> More -> (Input -> Added -> More -> Result r) -> (Input -> Added -> More -> Result r) -> Result r prompt i0 a0 _m0 kf ks = Partial $ \s -> if T.null s then kf i0 a0 Complete else ks (I (unI i0 +++ s)) (A (unA a0 +++ s)) Incomplete -- | Immediately demand more input via a 'Partial' continuation -- result. demandInput :: Parser () demandInput = Parser $ \i0 a0 m0 kf ks -> if m0 == Complete then kf i0 a0 m0 ["demandInput"] "not enough characters" else let kf' i a m = kf i a m ["demandInput"] "not enough characters" ks' i a m = ks i a m () in prompt i0 a0 m0 kf' ks' -- | This parser always succeeds. It returns 'True' if any input is -- available either immediately or on demand, and 'False' if the end -- of all input has been reached. wantInput :: Parser Bool wantInput = Parser $ \i0 a0 m0 _kf ks -> case () of _ | not (T.null (unI i0)) -> ks i0 a0 m0 True | m0 == Complete -> ks i0 a0 m0 False | otherwise -> let kf' i a m = ks i a m False ks' i a m = ks i a m True in prompt i0 a0 m0 kf' ks' get :: Parser T.Text get = Parser $ \i0 a0 m0 _kf ks -> ks i0 a0 m0 (unI i0) put :: T.Text -> Parser () put s = Parser $ \_i0 a0 m0 _kf ks -> ks (I s) a0 m0 () (+++) :: T.Text -> T.Text -> T.Text (+++) = T.append {-# INLINE (+++) #-} -- | Attempt a parse, and if it fails, rewind the input so that no -- input appears to have been consumed. -- -- This combinator is useful in cases where a parser might consume -- some input before failing, i.e. the parser needs arbitrary -- lookahead. The downside to using this combinator is that it can -- retain input for longer than is desirable. try :: Parser a -> Parser a try p = Parser $ \i0 a0 m0 kf ks -> noAdds i0 a0 m0 $ \i1 a1 m1 -> let kf' i2 a2 m2 = addS i0 a0 m0 i2 a2 m2 kf in runParser p i1 a1 m1 kf' ks -- | The parser @satisfy p@ succeeds for any character for which -- the predicate @p@ returns 'True'. Returns the character that -- is actually parsed. -- -- >import Data.Char (isDigit) -- >digit = satisfy isDigit satisfy :: (Char -> Bool) -> Parser Char satisfy p = do s <- ensure 1 case T.uncons s of Just (h,t) | p h -> put t >> return h | otherwise -> fail "satisfy" Nothing -> error "Data.Attoparsec.Text.Internal.satisfy: never here" -- | The parser @skip p@ succeeds for any character for which the -- predicate @p@ returns 'True'. -- -- >import Data.Char (isDigit) -- >digit = satisfy isDigit skip :: (Char -> Bool) -> Parser () skip p = do s <- ensure 1 case T.uncons s of Just (h,t) | p h -> put t | otherwise -> fail "skip" Nothing -> error "Data.Attoparsec.Text.Internal.skip: never here" -- | The parser @satisfyWith f p@ transforms a character, and -- succeeds if the predicate @p@ returns 'True' on the -- transformed value. The parser returns the transformed -- character that was parsed. satisfyWith :: (Char -> a) -> (a -> Bool) -> Parser a satisfyWith f p = do s <- ensure 1 let Just (h,t) = T.uncons s c = f h if p c then put t >> return c else fail "satisfyWith" -- | Parse a single digit. digit :: Parser Char digit = satisfy isDigit "digit" {-# INLINE digit #-} -- | Parse a single letter. letter :: Parser Char letter = satisfy isLetter "letter" {-# INLINE letter #-} -- | Parse a space character. space :: Parser Char space = satisfy isSpace "space" {-# INLINE space #-} -- | Consume @n@ characters of input, but succeed only if the -- predicate returns 'True'. takeWith :: Int -> (T.Text -> Bool) -> Parser T.Text takeWith n p = do s <- ensure n let (h,t) = T.splitAt n s if p h then put t >> return h else failDesc "takeWith" -- | Consume exactly @n@ characters of input. take :: Int -> Parser T.Text take n = takeWith n (const True) {-# INLINE take #-} -- | @string s@ parses a sequence of characters that identically -- match @s@. Returns the parsed string (i.e. @s@). This parser -- consumes no input if it fails (even if a partial match). -- -- /Note/: The behaviour of this parser is different to that of the -- similarly-named parser in Parsec, as this one is all-or-nothing. -- To illustrate the difference, the following parser will fail under -- Parsec given an input of @"for"@: -- -- >string "foo" <|> string "for" -- -- The reason for its failure is that that the first branch is a -- partial match, and will consume the letters @\'f\'@ and -- @\'o\'@ before failing. In Attoparsec, both the original on -- bytestrings and this one on texts, the above parser will -- /succeed/ on that input, because the failed first branch will -- consume nothing. string :: T.Text -> Parser T.Text string s = takeWith (T.length s) (==s) {-# INLINE string #-} stringTransform :: (T.Text -> T.Text) -> T.Text -> Parser T.Text stringTransform f s = takeWith (T.length s) ((==f s) . f) {-# INLINE stringTransform #-} -- | Skip past input for as long as the predicate returns 'True'. skipWhile :: (Char -> Bool) -> Parser () skipWhile p = go where go = do t <- T.dropWhile p <$> get put t when (T.null t) $ do input <- wantInput when input go {-# INLINE skipWhile #-} -- | Skip over white space. skipSpace :: Parser () skipSpace = skipWhile isSpace >> return () {-# INLINE skipSpace #-} -- | Consume input as long as the predicate returns 'False' -- (i.e. until it returns 'True'), and return the consumed input. -- -- This parser does not fail. It will return an empty string if the -- predicate returns 'True' on the first character of input. -- -- /Note/: Because this parser does not fail, do not use it with -- combinators such as 'many', because such parsers loop until a -- failure occurs. Careless use will thus result in an infinite loop. takeTill :: (Char -> Bool) -> Parser T.Text takeTill p = takeWhile (not . p) {-# INLINE takeTill #-} -- | Consume input as long as the predicate returns 'True', and return -- the consumed input. -- -- This parser does not fail. It will return an empty string if the -- predicate returns 'False' on the first character of input. -- -- /Note/: Because this parser does not fail, do not use it with -- combinators such as 'many', because such parsers loop until a -- failure occurs. Careless use will thus result in an infinite loop. takeWhile :: (Char -> Bool) -> Parser T.Text takeWhile p = (T.concat . reverse) `fmap` go [] where go acc = do #if MIN_VERSION_text(0,11,0) (h,t) <- T.span p <$> get #else (h,t) <- T.spanBy p <$> get #endif put t if T.null t then do input <- wantInput if input then go (h:acc) else return (h:acc) else return (h:acc) takeRest :: Parser [T.Text] takeRest = go [] where go acc = do input <- wantInput if input then do s <- get put T.empty go (s:acc) else return (reverse acc) -- | Consume all remaining input and return it as a single string. takeText :: Parser T.Text takeText = T.concat `fmap` takeRest -- | Consume all remaining input and return it as a single string. takeLazyText :: Parser TL.Text takeLazyText = TL.fromChunks `fmap` takeRest -- | A stateful scanner. The predicate consumes and transforms a -- state argument, and each transformed state is passed to successive -- invocations of the predicate on each character of the input until one -- returns 'Nothing' or the input ends. -- -- This parser does not fail. It will return an empty string if the -- predicate returns 'Nothing' on the first character of input. -- -- /Note/: Because this parser does not fail, do not use it with -- combinators such as 'many', because such parsers loop until a -- failure occurs. Careless use will thus result in an infinite loop. scan :: s -> (s -> Char -> Maybe s) -> Parser T.Text scan s0 p = do chunks <- go [] s0 case chunks of [x] -> return x xs -> return . T.concat . reverse $ xs where scanner s !n t = case T.uncons t of Nothing -> Continue s Just (c,t') -> case p s c of Just s' -> scanner s' (n+1) t' Nothing -> Finished n t go acc s = do input <- get case scanner s 0 input of Continue s' -> do put T.empty more <- wantInput if more then go (input : acc) s' else return (input : acc) Finished n t -> put t >> return (T.take n input : acc) {-# INLINE scan #-} data ScannnerResult s = Continue s | Finished !Int !T.Text -- | Consume input as long as the predicate returns 'True', and return -- the consumed input. -- -- This parser requires the predicate to succeed on at least one -- character of input: it will fail if the predicate never -- returns 'True' or if there is no input left. takeWhile1 :: (Char -> Bool) -> Parser T.Text takeWhile1 p = do (`when` demandInput) =<< T.null <$> get #if MIN_VERSION_text(0,11,0) (h,t) <- T.span p <$> get #else (h,t) <- T.spanBy p <$> get #endif when (T.null h) $ failDesc "takeWhile1" put t if T.null t then (h+++) `fmapP` takeWhile p else return h -- | Match any character in a set. -- -- >vowel = inClass "aeiou" -- -- Range notation is supported. -- -- >halfAlphabet = inClass "a-nA-N" -- -- To add a literal @\'-\'@ to a set, place it at the beginning or end -- of the string. inClass :: String -> Char -> Bool inClass s = (`member` mySet) where mySet = charClass s {-# INLINE inClass #-} -- | Match any character not in a set. notInClass :: String -> Char -> Bool notInClass s = not . inClass s {-# INLINE notInClass #-} -- | Match any character. anyChar :: Parser Char anyChar = satisfy $ const True {-# INLINE anyChar #-} -- | Match a specific character. char :: Char -> Parser Char char c = satisfy (== c) show c {-# INLINE char #-} -- | Match any character except the given one. notChar :: Char -> Parser Char notChar c = satisfy (/= c) "not " ++ show c {-# INLINE notChar #-} -- | Parse and decode an unsigned decimal number. decimal :: Integral a => Parser a {-# SPECIALISE decimal :: Parser Int #-} {-# SPECIALISE decimal :: Parser Integer #-} decimal = T.foldl' step 0 `fmap` takeWhile1 asciiIsDigit where step a w = a * 10 + fromIntegral (fromEnum w - 48) asciiIsDigit :: Char -> Bool asciiIsDigit c = c >= '0' && c <= '9' {-# INLINE asciiIsDigit #-} -- | Parse and decode an unsigned hexadecimal number. The hex digits -- @\'a\'@ through @\'f\'@ may be upper or lower case. -- -- This parser does not accept a leading @\"0x\"@ string. hexadecimal :: Integral a => Parser a {-# SPECIALISE hexadecimal :: Parser Int #-} hexadecimal = T.foldl' step 0 `fmap` takeWhile1 asciiIsHexDigit where step a c | c >= '0' && c <= '9' = a * 16 + fromIntegral (fromEnum c - 48) | otherwise = a * 16 + fromIntegral (asciiToLower c - 87) asciiIsHexDigit :: Char -> Bool asciiIsHexDigit c = asciiIsDigit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {-# INLINE asciiIsHexDigit #-} asciiToLower :: Char -> Int asciiToLower c | c >= 'A' && c <= 'Z' = fromEnum c + 32 | otherwise = fromEnum c -- | Parse a number with an optional leading @\'+\'@ or @\'-\'@ sign -- character. signed :: Num a => Parser a -> Parser a {-# SPECIALISE signed :: Parser Int -> Parser Int #-} signed p = (negate <$> (char '-' *> p)) <|> (char '+' *> p) <|> p -- | Parse a rational number. -- -- This parser accepts an optional leading sign character, followed by -- at least one decimal digit. The syntax similar to that accepted by -- the 'read' function, with the exception that a trailing @\'.\'@ or -- @\'e\'@ /not/ followed by a number is not consumed. -- -- Examples with behaviour identical to 'read', if you feed an empty -- continuation to the first result: -- -- >rational "3" == Done 3.0 "" -- >rational "3.1" == Done 3.1 "" -- >rational "3e4" == Done 30000.0 "" -- >rational "3.1e4" == Done 31000.0 "" -- -- Examples with behaviour identical to 'read': -- -- >rational ".3" == Fail "input does not start with a digit" -- >rational "e3" == Fail "input does not start with a digit" -- -- Examples of differences from 'read': -- -- >rational "3.foo" == Done 3.0 ".foo" -- >rational "3e" == Done 3.0 "e" rational :: RealFloat a => Parser a {-# SPECIALIZE rational :: Parser Double #-} rational = floaty $ \real frac fracDenom -> fromRational $ real % 1 + frac % fracDenom -- | Parse a rational number. -- -- The syntax accepted by this parser is the same as for 'rational'. -- -- /Note/: This function is almost ten times faster than 'rational', -- but is slightly less accurate. -- -- The 'Double' type supports about 16 decimal places of accuracy. -- For 94.2% of numbers, this function and 'rational' give identical -- results, but for the remaining 5.8%, this function loses precision -- around the 15th decimal place. For 0.001% of numbers, this -- function will lose precision at the 13th or 14th decimal place. double :: Parser Double double = floaty $ \real frac fracDenom -> fromIntegral real + fromIntegral frac / fromIntegral fracDenom data T = T !Integer !Int floaty :: RealFloat a => (Integer -> Integer -> Integer -> a) -> Parser a {-# INLINE floaty #-} floaty f = do sign <- satisfy (\c -> c == '-' || c == '+') <|> return '+' real <- decimal let tryFraction = do _ <- satisfy (== '.') ds <- takeWhile asciiIsDigit case (case parse decimal ds of Partial k -> k T.empty r -> r) of Done _ n -> return $ T n (T.length ds) _ -> fail "no digits after decimal" T fraction fracDigits <- tryFraction <|> return (T 0 0) let e c = c == 'e' || c == 'E' power <- (satisfy e *> signed decimal) <|> return (0 :: Int) let n = if fracDigits == 0 then if power == 0 then fromIntegral real else fromIntegral real * (10 ^^ power) else if power == 0 then f real fraction (10 ^ fracDigits) else f real fraction (10 ^ fracDigits) * (10 ^^ power) return $! if sign == '+' then n else -n -- | Match only if all input has been consumed. endOfInput :: Parser () endOfInput = Parser $ \i0 a0 m0 kf ks -> if T.null (unI i0) then if m0 == Complete then ks i0 a0 m0 () else let kf' i1 a1 m1 _ _ = addS i0 a0 m0 i1 a1 m1 $ \ i2 a2 m2 -> ks i2 a2 m2 () ks' i1 a1 m1 _ = addS i0 a0 m0 i1 a1 m1 $ \ i2 a2 m2 -> kf i2 a2 m2 [] "endOfInput" in runParser demandInput i0 a0 m0 kf' ks' else kf i0 a0 m0 [] "endOfInput" -- | Return an indication of whether the end of input has been -- reached. atEnd :: Parser Bool atEnd = not <$> wantInput {-# INLINE atEnd #-} -- | Match either a single newline character @\'\\n\'@, or a carriage -- return followed by a newline character @\"\\r\\n\"@. endOfLine :: Parser () endOfLine = (char '\n' >> return ()) <|> (string (T.pack "\r\n") >> return ()) --- | Name the parser, in case failure occurs. () :: Parser a -> String -- ^ the name to use if parsing fails -> Parser a p msg0 = Parser $ \i0 a0 m0 kf ks -> let kf' i a m strs msg = kf i a m (msg0:strs) msg in runParser p i0 a0 m0 kf' ks {-# INLINE () #-} infix 0 -- | Terminal failure continuation. failK :: Failure a failK i0 _a0 _m0 stack msg = Fail (unI i0) stack msg {-# INLINE failK #-} -- | Terminal success continuation. successK :: Success a a successK i0 _a0 _m0 a = Done (unI i0) a {-# INLINE successK #-} -- | Run a parser. parse :: Parser a -> T.Text -> Result a parse m s = runParser m (I s) (A T.empty) Incomplete failK successK {-# INLINE parse #-} -- | Run a parser that cannot be resupplied via a 'Partial' result. parseOnly :: Parser a -> T.Text -> Either String a parseOnly m s = case runParser m (I s) (A T.empty) Complete failK successK of Fail _ _ err -> Left err Done _ a -> Right a _ -> error "parseOnly: impossible error!" {-# INLINE parseOnly #-}