Text/ProtocolBuffers/Basic.hs

{-# LANGUAGE DeriveDataTypeable,GeneralizedNewtypeDeriving,CPP #-}
-- | "Text.ProtocolBuffers.Basic" defines or re-exports most of the
-- basic field types; 'Maybe','Bool', 'Double', and 'Float' come from
-- the Prelude instead. This module also defines the 'Mergeable' and
-- 'Default' classes. The 'Wire' class is not defined here to avoid orphans.
module Text.ProtocolBuffers.Basic
  ( -- * Basic types for protocol buffer fields in Haskell
    Double,Float,Bool,Maybe,Seq,Utf8(Utf8),ByteString,Int32,Int64,Word32,Word64
    -- * Haskell types that act in the place of DescritorProto values
  , WireTag(..),FieldId(..),WireType(..),FieldType(..),EnumCode(..),WireSize
    -- * Some of the type classes implemented messages and fields
  , Mergeable(..),Default(..) -- ,Wire(..)
  , isValidUTF8, toUtf8, utf8, uToString, uFromString
  ) where

import Data.Bits(Bits)
import Data.ByteString.Lazy(ByteString)
import Data.Foldable as F(Foldable(foldl))
import Data.Generics(Data(..))
import Data.Int(Int32,Int64)
import Data.Ix(Ix)
#if __GLASGOW_HASKELL__ < 710
import Data.Monoid(Monoid(..))
#endif
import Data.Sequence(Seq,(><))
import Data.Typeable(Typeable)
import Data.Word(Word8,Word32,Word64)

import qualified Data.ByteString.Lazy as L(unpack)
import Data.ByteString.Lazy.UTF8 as U (toString,fromString)

-- Num instances are derived below for the purpose of getting fromInteger for case matching

-- | 'Utf8' is used to mark 'ByteString' values that (should) contain
-- valid utf8 encoded strings.  This type is used to represent
-- 'TYPE_STRING' values.
newtype Utf8 = Utf8 ByteString deriving (Data,Typeable,Eq,Ord)

utf8 :: Utf8 -> ByteString
utf8 (Utf8 bs) = bs

instance Read Utf8 where
  readsPrec d xs = let r :: Int -> ReadS String
                       r = readsPrec
                       f :: (String,String) -> (Utf8,String)
                       f (a,b) = (Utf8 (U.fromString a),b)
                   in map f . r d $ xs

instance Show Utf8 where
  showsPrec d (Utf8 bs) = let s :: Int -> String -> ShowS
                              s = showsPrec
                          in s d (U.toString bs)

instance Monoid Utf8 where
  mempty = Utf8 mempty
  mappend (Utf8 x) (Utf8 y) = Utf8 (mappend x y)

-- | 'WireTag' is the 32 bit value with the upper 29 bits being the
-- 'FieldId' and the lower 3 bits being the 'WireType'
newtype WireTag = WireTag { getWireTag :: Word32 } -- bit concatenation of FieldId and WireType
  deriving (Eq,Ord,Enum,Read,Show,Num,Bits,Bounded,Data,Typeable)

-- | 'FieldId' is the field number which can be in the range 1 to
-- 2^29-1 but the value from 19000 to 19999 are forbidden (so sayeth
-- Google).
newtype FieldId = FieldId { getFieldId :: Int32 } -- really 29 bits
  deriving (Eq,Ord,Enum,Read,Show,Num,Data,Typeable,Ix)

-- Note that values 19000-19999 are forbidden for FieldId
instance Bounded FieldId where
  minBound = 1
  maxBound = 536870911 -- 2^29-1

-- | 'WireType' is the 3 bit wire encoding value, and is currently in
-- the range 0 to 5, leaving 6 and 7 currently invalid.
--
-- * 0 /Varint/ : int32, int64, uint32, uint64, sint32, sint64, bool, enum
--
-- * 1 /64-bit/ : fixed64, sfixed64, double
--
-- * 2 /Length-delimited/ : string, bytes, embedded messages
--
-- * 3 /Start group/ : groups (deprecated)
--
-- * 4 /End group/ : groups (deprecated)
--
-- * 5 /32-bit/ : fixed32, sfixed32, float
--
newtype WireType = WireType { getWireType :: Word32 }    -- really 3 bits
  deriving (Eq,Ord,Enum,Read,Show,Num,Data,Typeable)

instance Bounded WireType where
  minBound = 0
  maxBound = 5

{- | 'FieldType' is the integer associated with the
  FieldDescriptorProto's Type.  The allowed range is currently 1 to
  18, as shown below (excerpt from descritor.proto)

>    // 0 is reserved for errors.
>    // Order is weird for historical reasons.
>    TYPE_DOUBLE         = 1;
>    TYPE_FLOAT          = 2;
>    TYPE_INT64          = 3;   // Not ZigZag encoded.  Negative numbers
>                               // take 10 bytes.  Use TYPE_SINT64 if negative
>                               // values are likely.
>    TYPE_UINT64         = 4;
>    TYPE_INT32          = 5;   // Not ZigZag encoded.  Negative numbers
>                               // take 10 bytes.  Use TYPE_SINT32 if negative
>                               // values are likely.
>    TYPE_FIXED64        = 6;
>    TYPE_FIXED32        = 7;
>    TYPE_BOOL           = 8;
>    TYPE_STRING         = 9;
>    TYPE_GROUP          = 10;  // Tag-delimited aggregate.
>    TYPE_MESSAGE        = 11;  // Length-delimited aggregate.
>
>    // New in version 2.
>    TYPE_BYTES          = 12;
>    TYPE_UINT32         = 13;
>    TYPE_ENUM           = 14;
>    TYPE_SFIXED32       = 15;
>    TYPE_SFIXED64       = 16;
>    TYPE_SINT32         = 17;  // Uses ZigZag encoding.
>    TYPE_SINT64         = 18;  // Uses ZigZag encoding.

-}

newtype FieldType = FieldType { getFieldType :: Int } -- really [1..18] as fromEnum of Type from Type.hs
  deriving (Eq,Ord,Enum,Read,Show,Num,Data,Typeable)

instance Bounded FieldType where
  minBound = 1
  maxBound = 18

-- | 'EnumCode' is the Int32 assoicated with a
-- EnumValueDescriptorProto and is in the range 0 to 2^31-1.
newtype EnumCode = EnumCode { getEnumCode :: Int32 }  -- really [0..maxBound::Int32] of some .proto defined enumeration
  deriving (Eq,Ord,Read,Show,Num,Data,Typeable)

instance Bounded EnumCode where
  minBound = 0
  maxBound = 2147483647 -- 2^-31 -1

-- | 'WireSize' is the Int64 size type associated with the lazy
-- bytestrings used in the 'Put' and 'Get' monads.
type WireSize = Int64

-- | The 'Mergeable' class is not a 'Monoid', 'mergeEmpty' is not a
-- left or right unit like 'mempty'.  The default 'mergeAppend' is to
-- take the second parameter and discard the first one.  The
-- 'mergeConcat' defaults to @foldl@ associativity.
--
-- NOTE: 'mergeEmpty' has been removed in protocol buffers version 2.
-- Use 'defaultValue' instead.  New strict fields would mean that required
-- fields in messages will be automatic errors with 'mergeEmpty'.
class Default a => Mergeable a where
{-
  -- | The 'mergeEmpty' value of a basic type or a message with
  -- required fields will be undefined and a runtime error to
  -- evaluate.  These are only handy for reading the wire encoding and
  -- users should employ 'defaultValue' instead.
  mergeEmpty :: a
  mergeEmpty = error "You did not define Mergeable.mergeEmpty!"
-}
  -- | 'mergeAppend' is the right-biased merge of two values.  A
  -- message (or group) is merged recursively.  Required field are
  -- always taken from the second message. Optional field values are
  -- taken from the most defined message or the second message if
  -- both are set.  Repeated fields have the sequences concatenated.
  -- Note that strings and bytes are NOT concatenated.
  mergeAppend :: a -> a -> a
  mergeAppend _a b = b

  -- | 'mergeConcat' is @ F.foldl mergeAppend defaultValue @ and this
  -- default definition is not overridden in any of the code except
  -- for the (Seq a) instance.
  mergeConcat :: F.Foldable t => t a -> a
  mergeConcat = F.foldl mergeAppend defaultValue

-- | The Default class has the default-default values of types.  See
-- <http://code.google.com/apis/protocolbuffers/docs/proto.html#optional>
-- and also note that 'Enum' types have a 'defaultValue' that is the
-- first one in the @.proto@ file (there is always at least one
-- value).  Instances of this for messages hold any default value
-- defined in the @.proto@ file.  'defaultValue' is where the
-- 'MessageAPI' function 'getVal' looks when an optional field is not
-- set.
class Default a where
  -- | The 'defaultValue' is never undefined or an error to evalute.
  -- This makes it much more useful compared to 'mergeEmpty'. In a
  -- default message all Optional field values are set to 'Nothing'
  -- and Repeated field values are empty.
  defaultValue :: a

-- Returns Nothing if valid, and the position of the error if invalid
isValidUTF8 :: ByteString -> Maybe Int
isValidUTF8 bs = go 0 (L.unpack bs) 0 where
  go :: Int -> [Word8] -> Int -> Maybe Int
  go 0 [] _ = Nothing
  go 0 (x:xs) n | x <= 127 = go 0 xs $! succ n -- binary 01111111
                | x <= 193 = Just n            -- binary 11000001, decodes to <=127, should not be here
                | x <= 223 = go 1 xs $! succ n -- binary 11011111
                | x <= 239 = go 2 xs $! succ n -- binary 11101111
                | x <= 243 = go 3 xs $! succ n -- binary 11110011
                | x == 244 = high xs $! succ n -- binary 11110100
                | otherwise = Just n
  go i (x:xs) n | 128 <= x && x <= 191 = go (pred i) xs $! succ n
  go _ _ n = Just n
  -- leading 3 bits are 100, so next 6 are at most 001111, i.e. 10001111
  high (x:xs) n | 128 <= x && x <= 143 = go 2 xs $! succ n
                | otherwise = Just n
  high [] n = Just n

toUtf8 :: ByteString -> Either Int Utf8
toUtf8 bs = maybe (Right (Utf8 bs)) Left (isValidUTF8 bs)

uToString :: Utf8 -> String
uToString (Utf8 bs) = U.toString bs

uFromString :: String -> Utf8
uFromString s = Utf8 (U.fromString s)


-- Base types are not very mergeable, but their Maybe and Seq versions are:
instance Mergeable a => Mergeable (Maybe a) where
--    mergeEmpty = Nothing
    mergeAppend = mayMerge

{-# INLINE mayMerge #-}
mayMerge :: (Mergeable b) => Maybe b -> Maybe b -> Maybe b
mayMerge Nothing  y        = y
mayMerge x        Nothing  = x
mayMerge (Just x) (Just y) = Just (mergeAppend x y)

instance Mergeable (Seq a) where
--    mergeEmpty = empty
    mergeAppend = (><)

-- These all have errors as mergeEmpty and use the second paramater for mergeAppend
instance Mergeable Bool
instance Mergeable Utf8
instance Mergeable ByteString
instance Mergeable Double
instance Mergeable Float
instance Mergeable Int32
instance Mergeable Int64
instance Mergeable Word32
instance Mergeable Word64

instance Default Word64 where defaultValue = 0
instance Default Word32 where defaultValue = 0
instance Default Int64 where defaultValue = 0
instance Default Int32 where defaultValue = 0
instance Default Float where defaultValue = 0
instance Default Double where defaultValue = 0
instance Default Bool where defaultValue = False
instance Default (Maybe a) where defaultValue = Nothing
instance Default (Seq a) where defaultValue = mempty
instance Default ByteString where defaultValue = mempty
instance Default Utf8 where defaultValue = mempty