-- Copyright 2016 Google Inc. All Rights Reserved. -- -- Use of this source code is governed by a BSD-style -- license that can be found in the LICENSE file or at -- https://developers.google.com/open-source/licenses/bsd -- | Helper utilities to parse the human-readable text format into a -- proto-agnostic syntax tree. module Data.ProtoLens.TextFormat.Parser ( Message , Field(..) , Key(..) , Value(..) , parser ) where import Data.ByteString (ByteString, pack) import Data.Char (ord) import Data.Functor.Identity (Identity) import Data.List (intercalate) import Data.Maybe (catMaybes) import Data.Text.Lazy (Text) import Data.Word (Word8) import Numeric (readOct, readHex) import Text.Parsec.Char (alphaNum, char, hexDigit, letter, octDigit, oneOf, satisfy) import Text.Parsec.Text.Lazy (Parser) import Text.Parsec.Combinator (choice, eof, many1, optionMaybe, sepBy1) import Text.Parsec.Token hiding (octal) import Control.Applicative ((<|>), many) import Control.Monad (liftM, liftM2, mzero) -- | A 'TokenParser' for the protobuf text format. ptp :: GenTokenParser Text () Identity ptp = makeTokenParser protobufLangDef protobufLangDef :: GenLanguageDef Text () Identity -- We need to fill in the fields manually, since the LanguageDefs provided -- by Parsec are restricted to parsers of Strings. protobufLangDef = LanguageDef { commentStart = "" , commentEnd = "" , commentLine = "#" , nestedComments = False , identStart = letter <|> char '_' , identLetter = alphaNum <|> oneOf "_'" , opStart = mzero , opLetter = mzero , reservedNames = [] , reservedOpNames = [] , caseSensitive = True } type Message = [Field] data Field = Field Key Value deriving (Show,Ord,Eq) data Key = Key String -- ^ A standard key that is just a string. | UnknownKey Integer -- ^ A key that has been written out as a number | ExtensionKey [String] -- ^ An extension, with namespaces and extension. | UnknownExtensionKey Integer -- ^ An extension that has been written out -- as a number. deriving (Ord,Eq) data Value = IntValue Integer -- ^ An integer | DoubleValue Double -- ^ Any floating point number | ByteStringValue ByteString -- ^ A string or bytes literal | MessageValue Message -- ^ A sub message | EnumValue String -- ^ Any undelimited string (including false & true) deriving (Show,Ord,Eq) instance Show Key where show (Key name) = name show (UnknownKey k) = show k show (ExtensionKey name) = "[" ++ intercalate "." name ++ "]" show (UnknownExtensionKey k) = "[" ++ show k ++ "]" parser :: Parser Message parser = whiteSpace ptp *> parseMessage <* eof where parseMessage = many parseField parseField = liftM2 Field parseKey parseValue parseKey = liftM Key (identifier ptp) <|> liftM UnknownKey (natural ptp) <|> liftM ExtensionKey (brackets ptp (identifier ptp `sepBy1` dot ptp)) <|> liftM UnknownExtensionKey (brackets ptp (natural ptp)) parseValue = colon ptp *> choice [parseNumber, parseString, parseEnumValue, parseMessageValue] <|> parseMessageValue parseNumber = do negative <- (symbol ptp "-" >> return True) <|> return False value <- naturalOrFloat ptp return $ makeNumberValue negative value parseString = liftM (ByteStringValue . mconcat) $ many1 $ lexeme ptp $ protoStringLiteral parseEnumValue = liftM EnumValue (identifier ptp) parseMessageValue = liftM MessageValue (braces ptp parseMessage <|> angles ptp parseMessage) makeNumberValue :: Bool -> Either Integer Double -> Value makeNumberValue True (Left intValue) = IntValue (negate intValue) makeNumberValue False (Left intValue) = IntValue intValue makeNumberValue True (Right doubleValue) = DoubleValue (negate doubleValue) makeNumberValue False (Right doubleValue) = DoubleValue doubleValue -- | Reads a literal string the way the Protocol Buffer distribution's -- tokenizer.cc does. This differs from Haskell string literals in treating, -- e.g. "\11" as octal instead of decimal, so reading as 9 instead of 11. Also, -- like tokenizer.cc we assume octal and hex escapes can have at most three and -- two digits, respectively. -- -- TODO: implement reading of Unicode escapes. protoStringLiteral :: Parser ByteString protoStringLiteral = do initialQuoteChar <- char '\'' <|> char '\"' word8s <- many stringChar _ <- char initialQuoteChar return $ pack word8s where stringChar :: Parser Word8 stringChar = nonEscape <|> stringEscape nonEscape = fmap (fromIntegral . ord) $ satisfy (\c -> c `notElem` "\\\'\"" && ord c < 256) stringEscape = char '\\' >> (octal <|> hex <|> unicode <|> simple) octal = do d0 <- octDigit d1 <- optionMaybe octDigit d2 <- optionMaybe octDigit readMaybeDigits readOct [Just d0, d1, d2] readMaybeDigits :: ReadS Word8 -> [Maybe Char] -> Parser Word8 readMaybeDigits reader = return . (\str -> let [(v, "")] = reader str in v) . catMaybes hex = do _ <- oneOf "xX" d0 <- hexDigit d1 <- optionMaybe hexDigit readMaybeDigits readHex [Just d0, d1] unicode = oneOf "uU" >> fail "Unicode in string literals not yet supported" simple = choice $ map charRet [ ('a', '\a') , ('b', '\b') , ('f', '\f') , ('n', '\n') , ('r', '\r') , ('t', '\t') , ('v', '\v') , ('\\', '\\') , ('\'', '\'') , ('\"', '\"') ] where charRet :: (Char, Char) -> Parser Word8 charRet (escapeCh, ch) = do _ <- char escapeCh return $ fromIntegral $ ord ch