{
{-# OPTIONS_GHC -Wwarn #-}
module Text.ProtocolBuffers.ProtoCompile.Lexer (Lexed(..), alexScanTokens,getLinePos)  where

import Control.Monad.Error()
import Codec.Binary.UTF8.String(encode)
import qualified Data.ByteString.Lazy as DBL
import Data.Char(ord,isHexDigit,isOctDigit,toLower)
import Data.Word(Word8)
import Numeric(readHex,readOct,readDec,readSigned,readFloat)
import qualified Data.ByteString.Lazy.Char8 as DBL8

}

%wrapper "posn-bytestring"

@inComment = ([^\*] | $white)+ | ([\*]+ [^\/])
@comment = [\/] [\*] (@inComment)* [\*]+ [\/] | "//".* | "#".*

$d = [0-9]
@decInt = [\-]?[1-9]$d*
@hexInt = [\-]?0[xX]([A-Fa-f0-9])+
@octInt = [\-]?0[0-7]*
@doubleLit = [\-]?$d+(\.$d+)?([Ee][\+\-]?$d+)?

@ident1 = [A-Za-z_][A-Za-z0-9_]*
@ident = [\.]?@ident1([\.]@ident1)*
@notChar = [^A-Za-z0-9_]

@hexEscape = \\[Xx][A-Fa-f0-9]{1,2}
@octEscape = \\0?[0-7]{1,3}
@charEscape = \\[abfnrtv\\\?'\"]
@inStr = @hexEscape | @octEscape | @charEscape | [^'\"\n] | [\0]
@strLit = ['] (@inStr | [\"])* ['] | [\"] (@inStr | ['])* [\"]

$special    = [=\(\)\,\;\[\]\{\}\.\:]

@ninf = [\-][i][n][f]

:-

  $white+  ;
  @comment ;
  @decInt / @notChar    { parseDec }
  @octInt / @notChar    { parseOct }
  @hexInt / @notChar    { parseHex }
  @doubleLit / @notChar { parseDouble }
  @decInt               { dieAt "decimal followed by invalid character" }
  @octInt               { dieAt "octal followed by invalid character" }
  @hexInt               { dieAt "hex followed by invalid character" }
  @doubleLit            { dieAt "floating followed by invalid character" }
  @strLit               { parseStr }
  @ident                { parseName }
  @ninf                 { parseNinf }
  $special              { parseChar }
  .                     { wtfAt }

{
line :: AlexPosn -> Int
line (AlexPn _byte lineNum _col) = lineNum
{-# INLINE line #-}

data Lexed = L_Integer !Int !Integer
           | L_Double !Int !Double
           | L_Name !Int !DBL.ByteString
           | L_String !Int !DBL.ByteString !DBL.ByteString
           | L !Int !Char
           | L_Error !Int !String
  deriving (Show,Eq)

getLinePos :: Lexed -> Int
getLinePos x = case x of
                 L_Integer i _ -> i
                 L_Double  i _ -> i
                 L_Name    i _ -> i
                 L_String  i _ _ -> i
                 L         i _ -> i
                 L_Error   i _ -> i

-- 'errAt' is the only access to L_Error, so I can see where it is created with pos
errAt :: AlexPosn -> [Char] -> Lexed
errAt pos msg =  L_Error (line pos) $ "Lexical error (in Text.ProtocolBuffers.Lexer): "++ msg ++ ", at "++see pos where
  see (AlexPn char lineNum col) = "character "++show char++" line "++show lineNum++" column "++show col++"."

dieAt :: [Char] -> AlexPosn -> t -> Lexed
dieAt msg pos _s = errAt pos msg

wtfAt :: AlexPosn -> DBL8.ByteString -> Lexed
wtfAt pos s = errAt pos $ "unknown character "++show c++" (decimal "++show (ord c)++")"
  where (c:_) = DBL8.unpack s

{-# INLINE mayRead #-}
mayRead :: ReadS a -> String -> Maybe a
mayRead f s = case f s of [(a,"")] -> Just a; _ -> Nothing

-- Given the regexps above, the "parse* failed" messages should be impossible.
parseDec,parseOct,parseHex,parseDouble,parseStr,parseNinf,parseName,parseChar :: AlexPosn -> DBL8.ByteString -> Lexed
parseDec pos s = maybe (errAt pos "Impossible? parseDec failed")
                       (L_Integer (line pos)) $ mayRead (readSigned readDec) (DBL8.unpack s)
parseOct pos s = maybe (errAt pos "Impossible? parseOct failed")
                       (L_Integer (line pos)) $ mayRead (readSigned readOct) (DBL8.unpack s)
parseHex pos s = maybe (errAt pos "Impossible? parseHex failed")
                       (L_Integer (line pos)) $ mayRead (readSigned (readHex . drop 2)) (DBL8.unpack s)
parseDouble pos s = maybe (errAt pos "Impossible? parseDouble failed")
                          (L_Double (line pos)) $ mayRead (readSigned readFloat) (DBL8.unpack s)
-- The sDecode of the string contents may fail
parseStr pos s = let middle = DBL8.init . DBL8.tail $ s
                 in either (errAt pos) (L_String (line pos) middle . DBL.pack)
                    . sDecode . DBL8.unpack $ middle

-- Here s is always "-inf" matched by @ninf
parseNinf pos s = L_Name (line pos) s

parseName pos s = L_Name (line pos) s
parseChar pos s = L (line pos) (DBL8.head s)

-- Generalization of concat . unfoldr to monadic-Either form:
op :: ( [Char] -> Either String (Maybe ([Word8],[Char]))) -> [Char] -> Either String [Word8]
op one = go id where
  go f cs = case one cs of
              Left msg -> Left msg
              Right Nothing -> Right (f [])
              Right (Just (ws,cs')) -> go (f . (ws++)) cs'

-- Put this mess in the lexer, so the rest of the code can assume
-- everything is saner.  The input is checked to really be "Char8"
-- values in the range [0..255] and to be c-escaped (in order to
-- render binary information printable).  This decodes the c-escaping
-- and returns the binary data as Word8.
-- 
-- A decoding error causes (Left msg) to be returned.
sDecode :: [Char] -> Either String [Word8]
sDecode = op one where
  one :: [Char] -> Either String (Maybe ([Word8],[Char]))
  one ('\\':xs) = unescape xs
  one (x:xs) = do x' <- checkChar8 x
                  return $ Just (x',xs)  -- main case of unescaped value
  one [] = return Nothing
  unescape [] = Left "cannot understand a string that ends with a backslash"
  unescape ys | 1 <= len =
      case mayRead readOct oct of
        Just w -> do w' <- checkByte w
                     return $ Just (w',rest)
        Nothing -> Left $ "failed to decode octal sequence "++ys
    where oct = takeWhile isOctDigit (take 3 ys)
          len = length oct
          rest = drop len ys
  unescape (x:ys) | 'x' == toLower x && 1 <= len =
      case mayRead readHex hex of
        Just w -> do w' <- checkByte w
                     return $ Just (w',rest)
        Nothing -> Left $ "failed to decode hex sequence "++ys
    where hex = takeWhile isHexDigit (take 2 ys)
          len = length hex
          rest = drop len ys          
  unescape ('u':ys) | ok =
      case mayRead readHex hex of
        Just w -> do w' <- checkUnicode w
                     return $ Just (w',rest)
        Nothing -> Left $ "failed to decode 4 char unicode sequence "++ys
    where ok = all isHexDigit hex && 4 == length hex
          (hex,rest) = splitAt 4 ys
  unescape ('U':ys) | ok =
      case mayRead readHex hex of
        Just w -> do w' <- checkUnicode w
                     return $ Just (w',rest)
        Nothing -> Left $ "failed to decode 8 char unicode sequence "++ys
    where ok = all isHexDigit hex && 8 == length hex
          (hex,rest) = splitAt 8 ys
  unescape (x:xs) = do x' <- decode x
                       return $ Just ([x'],xs)
  decode :: Char -> Either String Word8
  decode 'a' = return 7
  decode 'b' = return 8
  decode 't' = return 9
  decode 'n' = return 10
  decode 'v' = return 11
  decode 'f' = return 12
  decode 'r' = return 13
  decode '\"' = return 34
  decode '\'' = return 39
  decode '?' = return 63    -- C99 rule : "\?" is '?'
  decode '\\' = return 92
  decode x | toLower x == 'x' = Left "cannot understand your 'xX' hexadecimal escaped value"
  decode x | toLower x == 'u' = Left "cannot understand your 'uU' unicode UTF-8 hexadecimal escaped value"
  decode _ = Left "cannot understand your backslash-escaped value"
  checkChar8 :: Char -> Either String [Word8]
  checkChar8 c | (0 <= i) && (i <= 255) = Right [toEnum i]
               | otherwise = Left $ "found Char out of range 0..255, value="++show (ord c)
    where i = fromEnum c
  checkByte :: Integer -> Either String [Word8]
  checkByte i | (0 <= i) && (i <= 255) = Right [fromInteger i]
              | otherwise = Left $ "found Oct/Hex Int out of range 0..255, value="++show i
  checkUnicode :: Integer -> Either String [Word8]
  checkUnicode i | (0 <= i) && (i <= 127) = Right [fromInteger i]
                 | i <= maxChar = Right $ encode [ toEnum . fromInteger $ i ]
                 | otherwise = Left $ "found Unicode Char out of range 0..0x10FFFF, value="++show i
    where maxChar = toInteger (fromEnum (maxBound ::Char)) -- 0x10FFFF

}