{-# LANGUAGE CPP, BangPatterns, OverloadedStrings #-} -- | -- Module: Data.Aeson.Parser.Internal -- Copyright: (c) 2011, 2012 Bryan O'Sullivan -- (c) 2011 MailRank, Inc. -- License: Apache -- Maintainer: Bryan O'Sullivan -- Stability: experimental -- Portability: portable -- -- Efficiently and correctly parse a JSON string. The string must be -- encoded as UTF-8. module Data.Aeson.Parser.Internal ( -- * Lazy parsers json, jsonEOF , value , jstring -- * Strict parsers , json', jsonEOF' , value' -- * Helpers , decodeWith , decodeStrictWith , eitherDecodeWith , eitherDecodeStrictWith ) where #if defined(USE_BLAZE_BUILDER) import Blaze.ByteString.Builder (Builder, fromByteString, toByteString) import Blaze.ByteString.Builder.Char.Utf8 (fromChar) import Blaze.ByteString.Builder.Word (fromWord8) #else #if MIN_VERSION_bytestring(0,10,2) import Data.ByteString.Builder #else import Data.ByteString.Lazy.Builder #endif (Builder, byteString, toLazyByteString, charUtf8, word8) #endif import Control.Applicative ((*>), (<$>), (<*), liftA2, pure) import Data.Aeson.Types (Result(..), Value(..)) import Data.Attoparsec.Char8 (Parser, char, endOfInput, rational, skipSpace, string) import Data.Bits ((.|.), shiftL) import Data.ByteString (ByteString) import Data.Char (chr) import Data.Monoid (mappend, mempty) import Data.Text (Text) import Data.Text.Encoding (decodeUtf8') import Data.Vector as Vector (Vector, fromList) import Data.Word (Word8) import qualified Data.Attoparsec as A import qualified Data.Attoparsec.Lazy as L import qualified Data.Attoparsec.Zepto as Z import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Unsafe as B import qualified Data.HashMap.Strict as H #define BACKSLASH 92 #define CLOSE_CURLY 125 #define CLOSE_SQUARE 93 #define COMMA 44 #define DOUBLE_QUOTE 34 #define OPEN_CURLY 123 #define OPEN_SQUARE 91 #define C_0 48 #define C_9 57 #define C_A 65 #define C_F 70 #define C_a 97 #define C_f 102 #define C_n 
110
#define C_t 116

-- | Parse a top-level JSON value.  This must be either an object or
-- an array, per RFC 4627.
--
-- Conversion of the parsed value to a Haskell value is deferred
-- until that Haskell value is demanded.  This can pay off when only
-- part of the result is ever inspected, at the price of allocating
-- thunks.
json :: Parser Value
json = json_ object_ array_

-- | Parse a top-level JSON value.  This must be either an object or
-- an array, per RFC 4627.
--
-- Strict counterpart of 'json': conversions happen immediately
-- instead of being left behind as thunks.  Prefer it when most of
-- the JSON data will be accessed.
json' :: Parser Value
json' = json_ object_' array_'

-- Shared driver for 'json' and json': skip leading whitespace,
-- consume the opening byte (which must be a curly or square
-- bracket) and hand over to the matching body parser.
json_ :: Parser Value -> Parser Value -> Parser Value
json_ obj ary = skipSpace *> A.satisfy isOpener >>= dispatch
  where
    isOpener b = b == OPEN_CURLY || b == OPEN_SQUARE
    dispatch b
      | b == OPEN_CURLY = obj
      | otherwise       = ary
{-# INLINE json_ #-}

-- Lazy object body parser; the opening curly bracket has already
-- been consumed by the caller.
object_ :: Parser Value
object_ = {-# SCC "object_" #-} fmap Object (objectValues jstring value)

-- Strict object body parser: every key and the resulting map are
-- forced before the 'Object' is returned.
object_' :: Parser Value
object_' = {-# SCC "object_'" #-} do
  members <- objectValues strictKey value'
  members `seq` return (Object members)
  where
    -- Same as 'jstring', but the key is forced before it is returned.
    strictKey = jstring >>= \key -> key `seq` return key

-- Parse the key/value pairs of an object (up to and including the
-- closing curly bracket) using the supplied key and value parsers.
objectValues :: Parser Text -> Parser Value -> Parser (H.HashMap Text Value)
objectValues str val = skipSpace >> fmap H.fromList (commaSeparated member CLOSE_CURLY)
  where
    -- key, optional whitespace, colon, optional whitespace, value
    member = liftA2 (,) (str <* skipSpace) (char ':' *> skipSpace *> val)
{-# INLINE objectValues #-}

-- Lazy array body parser; the opening square bracket has already
-- been consumed by the caller.
array_ :: Parser Value
array_ = {-# SCC "array_" #-} fmap Array (arrayValues value)

-- Strict array body parser: the vector is forced before the 'Array'
-- is returned.
array_' :: Parser Value
array_' = {-# SCC "array_'" #-} do
  elems <- arrayValues value'
  elems `seq` return (Array elems)

-- Parse zero or more items separated by commas and terminated by
-- @endByte@.  The terminator is consumed.  Whitespace is skipped
-- after each item and after each comma.
commaSeparated :: Parser a -> Word8 -> Parser [a]
commaSeparated item endByte = peekWord8' >>= start
  where
    -- An immediate terminator means an empty sequence.
    start b
      | b == endByte = A.anyWord8 >> return []
      | otherwise    = elements
    elements = do
      x   <- item <* skipSpace
      sep <- A.satisfy (\b -> b == COMMA || b == endByte)
      if sep == COMMA
        then skipSpace >> fmap (x :) elements
        else return [x]
{-# INLINE commaSeparated #-}
-- Parse the elements of an array (up to and including the closing
-- square bracket) using the supplied element parser.
arrayValues :: Parser Value -> Parser (Vector Value)
arrayValues val = do
  skipSpace
  Vector.fromList <$> commaSeparated val CLOSE_SQUARE
{-# INLINE arrayValues #-}

-- | Parse any JSON value.  You should usually use 'json' in
-- preference to this function, as this function relaxes the
-- object-or-array requirement of RFC 4627.
--
-- In particular, be careful in using this function if you think your
-- code might interoperate with Javascript.  A naïve Javascript
-- library that parses JSON data using @eval@ is vulnerable to attack
-- unless the encoded data represents an object or an array.  JSON
-- implementations in other languages conform to that same restriction
-- to preserve interoperability and security.
value :: Parser Value
value = do
  -- Dispatch on the first byte without consuming it; each branch
  -- consumes exactly what it needs.
  w <- peekWord8'
  case w of
    DOUBLE_QUOTE -> A.anyWord8 *> (String <$> jstring_)
    OPEN_CURLY   -> A.anyWord8 *> object_
    OPEN_SQUARE  -> A.anyWord8 *> array_
    C_f          -> string "false" *> pure (Bool False)
    C_t          -> string "true" *> pure (Bool True)
    C_n          -> string "null" *> pure Null
    -- A number starts with an ASCII digit or a minus sign (byte 45).
    _ | w >= C_0 && w <= C_9 || w == 45 -> Number <$> rational
      | otherwise -> fail "not a valid json value"

-- | Strict version of 'value'. See also 'json''.
value' :: Parser Value
value' = do
  -- Same dispatch as 'value', but strings and numbers are forced
  -- before being wrapped, and the strict container parsers are used.
  w <- peekWord8'
  case w of
    DOUBLE_QUOTE -> do
      !s <- A.anyWord8 *> jstring_
      return (String s)
    OPEN_CURLY   -> A.anyWord8 *> object_'
    OPEN_SQUARE  -> A.anyWord8 *> array_'
    C_f          -> string "false" *> pure (Bool False)
    C_t          -> string "true" *> pure (Bool True)
    C_n          -> string "null" *> pure Null
    -- A number starts with an ASCII digit or a minus sign (byte 45).
    _ | w >= C_0 && w <= C_9 || w == 45 -> do
          !n <- rational
          return (Number n)
      | otherwise -> fail "not a valid json value"

-- | Parse a quoted JSON string.
jstring :: Parser Text
jstring = A.word8 DOUBLE_QUOTE *> jstring_

-- | Parse a string without a leading quote.
jstring_ :: Parser Text
jstring_ = {-# SCC "jstring_" #-} do
  -- Grab everything up to (not including) the first unescaped quote.
  raw <- A.scan False step
  _ <- A.word8 DOUBLE_QUOTE
  -- Only run the unescaping pass when there is something to unescape.
  bytes <- if B.elem BACKSLASH raw
             then either fail return (Z.parse unescape raw)
             else return raw
  either (fail . show) return (decodeUtf8' bytes)
  where
    -- The scanner state records whether the previous byte was an
    -- unconsumed backslash, in which case the current byte is taken
    -- unconditionally.
    step escaped b
      | escaped           = Just False
      | b == DOUBLE_QUOTE = Nothing
      | otherwise         = Just (b == BACKSLASH)
{-# INLINE jstring_ #-}

-- Replace the escape sequences of a raw JSON string body with the
-- bytes they denote (UTF-8 encoded for unicode escapes).
unescape :: Z.Parser ByteString
unescape = toByteString <$> go mempty
  where
    go acc = do
      -- Copy the run of ordinary bytes preceding the next backslash.
      chunk <- Z.takeWhile (/= BACKSLASH)
      let flushed = acc `mappend` byteString chunk
          -- Append one decoded piece and carry on with the remainder.
          emit piece = go (flushed `mappend` piece)
          {-# INLINE emit #-}
          escapeSeq = do
            pair <- Z.take 2
            let !lead = B.unsafeHead pair
                !code = B.unsafeIndex pair 1
                -- Position of the escape letter in the table of known
                -- escapes; 255 marks an unknown one.
                slot  = maybe 255 id (B.findIndex (== code) "\"\\/ntbrfu")
            if lead /= BACKSLASH || slot == 255
              then fail "invalid JSON escape sequence"
              else if code /= 117 -- 'u'
                then emit (word8 (B.unsafeIndex mapping slot))
                else unicodeEscape
          unicodeEscape = do
            hi <- hexQuad
            if hi < 0xd800 || hi > 0xdfff
              then emit (charUtf8 (chr hi))
              else do
                -- Surrogate range: a second unicode escape must follow
                -- and together they must form a high/low pair.
                lo <- Z.string "\\u" *> hexQuad
                if hi <= 0xdbff && lo >= 0xdc00 && lo <= 0xdfff
                  then let !cp = ((hi - 0xd800) `shiftL` 10) + (lo - 0xdc00) + 0x10000
                       in emit (charUtf8 (chr cp))
                  else fail "invalid UTF-16 surrogates"
      finished <- Z.atEnd
      if finished then return flushed else escapeSeq
    -- Replacement bytes, in the same order as the escape letters above
    -- (minus the trailing 'u', which is handled separately).
    mapping = "\"\\/\n\t\b\r\f"

-- Read exactly four hexadecimal digits (either case) as an 'Int'.
hexQuad :: Z.Parser Int
hexQuad = do
  quad <- Z.take 4
  let nibble i
        | w >= C_0 && w <= C_9 = w - C_0
        | w >= C_a && w <= C_f = w - 87
        | w >= C_A && w <= C_F = w - 55
        | otherwise            = 255
        where w = fromIntegral (B.unsafeIndex quad i)
      n3 = nibble 0
      n2 = nibble 1
      n1 = nibble 2
      n0 = nibble 3
  -- Valid digits are at most 15, so the combined value can only be
  -- 255 when at least one digit was rejected.
  if (n3 .|. n2 .|. n1 .|. n0) == 255
    then fail "invalid hex escape"
    else return $! n0 .|. (n1 `shiftL` 4) .|. (n2 `shiftL` 8) .|. (n3 `shiftL` 12)

-- Run a parser over lazy input and convert the parsed 'Value';
-- any parse or conversion failure yields 'Nothing'.
decodeWith :: Parser Value -> (Value -> Result a) -> L.ByteString -> Maybe a
decodeWith p to s
  | L.Done _ v <- L.parse p s
  , Success a <- to v = Just a
  | otherwise         = Nothing
{-# INLINE decodeWith #-}

-- Strict-input counterpart of 'decodeWith'.
decodeStrictWith :: Parser Value -> (Value -> Result a) -> B.ByteString
                 -> Maybe a
decodeStrictWith p to s =
  case A.parseOnly p s of
    Right v | Success a <- to v -> Just a
    _                           -> Nothing
{-# INLINE decodeStrictWith #-}

-- Like 'decodeWith', but reports the parse or conversion error
-- message instead of discarding it.
eitherDecodeWith :: Parser Value -> (Value -> Result a) -> L.ByteString
                 -> Either String a
eitherDecodeWith p to s =
  case L.parse p s of
    L.Fail _ _ msg -> Left msg
    L.Done _ v     -> convert (to v)
  where
    convert (Success a) = Right a
    convert (Error msg) = Left msg
{-# INLINE eitherDecodeWith #-}

-- Strict-input counterpart of 'eitherDecodeWith'.
eitherDecodeStrictWith :: Parser Value -> (Value -> Result a) -> B.ByteString
                       -> Either String a
eitherDecodeStrictWith p to s = either Left (convert . to) (A.parseOnly p s)
  where
    convert (Success a) = Right a
    convert (Error msg) = Left msg
{-# INLINE eitherDecodeStrictWith #-}

-- $lazy
--
-- The 'json' and 'value' parsers decouple identification from
-- conversion.  Identification occurs immediately (so that an invalid
-- JSON document can be rejected as early as possible), but conversion
-- to a Haskell value is deferred until that value is needed.
--
-- This decoupling can be time-efficient if only a smallish subset of
-- elements in a JSON value need to be inspected, since the cost of
-- conversion is zero for uninspected elements.  The trade off is an
-- increase in memory usage, due to allocation of thunks for values
-- that have not yet been converted.

-- $strict
--
-- The 'json'' and 'value'' parsers combine identification with
-- conversion.  They consume more CPU cycles up front, but have a
-- smaller memory footprint.

-- | Parse a top-level JSON value followed by optional whitespace and
-- end-of-input.  See also: 'json'.
jsonEOF :: Parser Value
jsonEOF = do
  v <- json
  skipSpace
  endOfInput
  return v

-- | Parse a top-level JSON value followed by optional whitespace and
-- end-of-input.  See also: 'json''.
jsonEOF' :: Parser Value
jsonEOF' = do
  v <- json'
  skipSpace
  endOfInput
  return v

#if defined(USE_BLAZE_BUILDER)
-- With blaze-builder, provide the bytestring-builder names used by
-- the rest of this module.
byteString :: ByteString -> Builder
byteString = fromByteString
{-# INLINE byteString #-}

charUtf8 :: Char -> Builder
charUtf8 = fromChar
{-# INLINE charUtf8 #-}

word8 :: Word8 -> Builder
word8 = fromWord8
{-# INLINE word8 #-}
#else
-- With the bytestring builder, provide the blaze-builder name used
-- by the rest of this module.
toByteString :: Builder -> ByteString
toByteString = L.toStrict . toLazyByteString
{-# INLINE toByteString #-}
#endif

-- Look at the next input byte without consuming it, failing when no
-- byte is available.
peekWord8' :: A.Parser Word8
#if MIN_VERSION_attoparsec(0,11,1)
peekWord8' = A.peekWord8'
#else
peekWord8' = A.peekWord8 >>= maybe (fail "not enough bytes") return
#endif