{-# LANGUAGE CPP #-}
-- |
-- Module      : Streamly.Internal.Unicode.Parser
-- Copyright   : (c) 2021 Composewell Technologies
-- License     : BSD-3-Clause
-- Maintainer  : streamly@composewell.com
-- Stability   : experimental
-- Portability : GHC
--
-- To parse a text input, use the decode routines from
-- "Streamly.Unicode.Stream" module to convert an input byte stream to a
-- Unicode Char stream and then use these parsers on the Char stream.

module Streamly.Internal.Unicode.Parser
    (
    -- * Setup
    -- | To execute the code examples provided in this module in ghci, please
    -- run the following commands first.
    --
    -- $setup

    -- * Generic
      char
    , charIgnoreCase

    -- * Sequences
    , string
    , stringIgnoreCase
    , dropSpace
    , dropSpace1

    -- * Classes
    , alpha
    , alphaNum
    , letter
    , ascii
    , asciiLower
    , asciiUpper
    , latin1
    , lower
    , upper
    , mark
    , printable
    , punctuation
    , separator
    , space
    , symbol

    -- digits
    , digit
    , octDigit
    , hexDigit
    , numeric

    -- * Numeric
    , signed
    , number
    , doubleParser
    , double
    , decimal
    , hexadecimal

    -- * Utilities
    , mkDouble
    )
where

import Control.Applicative (Alternative(..))
import Data.Bits (Bits, (.|.), shiftL, (.&.))
import Data.Char (ord)
import Data.Ratio ((%))
import Fusion.Plugin.Types (Fuse(..))
import Streamly.Internal.Data.Parser (Parser(..), Initial(..),  Step(..))

import qualified Data.Char as Char
import qualified Streamly.Data.Fold as Fold
import qualified Streamly.Internal.Data.Parser as Parser
    (
      lmap
    , satisfy
    , listEq
    , takeWhile1
    , dropWhile
    )

#include "DocTestUnicodeParser.hs"

--------------------------------------------------------------------------------
-- Character classification
--------------------------------------------------------------------------------

-- XXX It may be possible to implement faster predicates for ASCII byte stream.
-- We can measure if there is a signficant difference and if so we can add such
-- predicates to Streamly.Unicode.Parser.Latin1.
--
#define CHAR_PARSER_SIG(NAME)         NAME :: Monad m => Parser Char m Char
-- XXX Need to use the predicates from Unicode.Char module/unicode-data package
#define CHAR_PARSER(NAME, PREDICATE)  NAME = Parser.satisfy Char.PREDICATE
#define CHAR_PARSER_DOC(PREDICATE) -- | Match any character that satisfies 'Char.PREDICATE'
#define CHAR_PARSER_INLINE(NAME)      {-# INLINE NAME #-}

CHAR_PARSER_DOC(isSpace)
CHAR_PARSER_INLINE(space)
CHAR_PARSER_SIG(space)
CHAR_PARSER(space,isSpace)

CHAR_PARSER_DOC(isLower)
CHAR_PARSER_INLINE(lower)
CHAR_PARSER_SIG(lower)
CHAR_PARSER(lower,isLower)

CHAR_PARSER_DOC(isUpper)
CHAR_PARSER_INLINE(upper)
CHAR_PARSER_SIG(upper)
CHAR_PARSER(upper,isUpper)

CHAR_PARSER_DOC(isAlpha)
CHAR_PARSER_INLINE(alpha)
CHAR_PARSER_SIG(alpha)
CHAR_PARSER(alpha,isAlpha)

CHAR_PARSER_DOC(isAlphaNum)
CHAR_PARSER_INLINE(alphaNum)
CHAR_PARSER_SIG(alphaNum)
CHAR_PARSER(alphaNum,isAlphaNum)

CHAR_PARSER_DOC(isPrint)
CHAR_PARSER_INLINE(printable)
CHAR_PARSER_SIG(printable)
CHAR_PARSER(printable,isPrint)

CHAR_PARSER_DOC(isDigit)
CHAR_PARSER_INLINE(digit)
CHAR_PARSER_SIG(digit)
CHAR_PARSER(digit,isDigit)

CHAR_PARSER_DOC(isOctDigit)
CHAR_PARSER_INLINE(octDigit)
CHAR_PARSER_SIG(octDigit)
CHAR_PARSER(octDigit,isOctDigit)

CHAR_PARSER_DOC(isHexDigit)
CHAR_PARSER_INLINE(hexDigit)
CHAR_PARSER_SIG(hexDigit)
CHAR_PARSER(hexDigit,isHexDigit)

CHAR_PARSER_DOC(isLetter)
CHAR_PARSER_INLINE(letter)
CHAR_PARSER_SIG(letter)
CHAR_PARSER(letter,isLetter)

CHAR_PARSER_DOC(isMark)
CHAR_PARSER_INLINE(mark)
CHAR_PARSER_SIG(mark)
CHAR_PARSER(mark,isMark)

CHAR_PARSER_DOC(isNumber)
CHAR_PARSER_INLINE(numeric)
CHAR_PARSER_SIG(numeric)
CHAR_PARSER(numeric,isNumber)

CHAR_PARSER_DOC(isPunctuation)
CHAR_PARSER_INLINE(punctuation)
CHAR_PARSER_SIG(punctuation)
CHAR_PARSER(punctuation,isPunctuation)

CHAR_PARSER_DOC(isSymbol)
CHAR_PARSER_INLINE(symbol)
CHAR_PARSER_SIG(symbol)
CHAR_PARSER(symbol,isSymbol)

CHAR_PARSER_DOC(isSeparator)
CHAR_PARSER_INLINE(separator)
CHAR_PARSER_SIG(separator)
CHAR_PARSER(separator,isSeparator)

CHAR_PARSER_DOC(isAscii)
CHAR_PARSER_INLINE(ascii)
CHAR_PARSER_SIG(ascii)
CHAR_PARSER(ascii,isAscii)

CHAR_PARSER_DOC(isLatin1)
CHAR_PARSER_INLINE(latin1)
CHAR_PARSER_SIG(latin1)
CHAR_PARSER(latin1,isLatin1)

CHAR_PARSER_DOC(isAsciiUpper)
CHAR_PARSER_INLINE(asciiUpper)
CHAR_PARSER_SIG(asciiUpper)
CHAR_PARSER(asciiUpper,isAsciiUpper)

CHAR_PARSER_DOC(isAsciiLower)
CHAR_PARSER_INLINE(asciiLower)
CHAR_PARSER_SIG(asciiLower)
CHAR_PARSER(asciiLower,isAsciiLower)

--------------------------------------------------------------------------------
-- Character parsers
--------------------------------------------------------------------------------

-- | Match a specific character.
{-# INLINE char #-}
char :: Monad m => Char -> Parser Char m Char
char c = Parser.satisfy (== c)

-- XXX Case conversion may lead to change in number of chars
-- | Match a specific character ignoring case.
{-# INLINE charIgnoreCase #-}
charIgnoreCase :: Monad m => Char -> Parser Char m Char
charIgnoreCase c = Parser.lmap Char.toLower (Parser.satisfy (== Char.toLower c))

--------------------------------------------------------------------------------
-- Character sequences
--------------------------------------------------------------------------------

-- | Match the input with the supplied string and return it if successful.
string :: Monad m => String -> Parser Char m String
string = Parser.listEq

-- XXX Not accurate unicode case conversion
-- | Match the input with the supplied string and return it if successful.
stringIgnoreCase :: Monad m => String -> Parser Char m String
stringIgnoreCase s =
    Parser.lmap Char.toLower (Parser.listEq (map Char.toLower s))

-- | Drop /zero/ or more white space characters.
dropSpace :: Monad m => Parser Char m ()
dropSpace = Parser.dropWhile Char.isSpace

-- | Drop /one/ or more white space characters.
dropSpace1 :: Monad m => Parser Char m ()
dropSpace1 = Parser.takeWhile1 Char.isSpace Fold.drain

--------------------------------------------------------------------------------
-- Numeric parsers
--------------------------------------------------------------------------------

-- XXX It should fail if the number is larger than the size of the type.
--
-- | Parse and decode an unsigned integral decimal number.
{-# INLINE decimal #-}
decimal :: (Monad m, Integral a) => Parser Char m a
decimal = Parser.takeWhile1 Char.isDigit (Fold.foldl' step 0)

    where

    step a c = a * 10 + fromIntegral (ord c - 48)

-- | Parse and decode an unsigned integral hexadecimal number.  The hex digits
-- @\'a\'@ through @\'f\'@ may be upper or lower case.
--
-- Note: This parser does not accept a leading @\"0x\"@ string.
{-# INLINE hexadecimal #-}
hexadecimal :: (Monad m, Integral a, Bits a) => Parser Char m a
hexadecimal = Parser.takeWhile1 isHexDigit (Fold.foldl' step 0)

    where

    isHexDigit c =
           (c >= '0' && c <= '9')
        || (c >= 'a' && c <= 'f')
        || (c >= 'A' && c <= 'F')

    step a c
        | w >= 48 && w <= 57 =
            (a `shiftL` 4) .|. fromIntegral (w - 48)
        | w >= 97 =
            (a `shiftL` 4) .|. fromIntegral (w - 87)
        | otherwise =
            (a `shiftL` 4) .|. fromIntegral (w - 55)

        where

        w = ord c

-- | Allow an optional leading @\'+\'@ or @\'-\'@ sign character before any
-- parser.
{-# INLINE signed #-}
signed :: (Num a, Monad m) => Parser Char m a -> Parser Char m a
signed p = (negate <$> (char '-' *> p)) <|> (char '+' *> p) <|> p

-- XXX Change Multiplier to Sign
type Multiplier = Int

-- XXX We can use Int instead of Integer to make it twice as fast. But then we
-- will have to truncate the significant digits before overflow occurs.
type Number = Integer
type DecimalPlaces = Int
type PowerMultiplier = Int
type Power = Int

{-# ANN type ScientificParseState Fuse #-}
data ScientificParseState
  = SPInitial
  | SPSign !Multiplier
  | SPAfterSign !Multiplier !Number
  | SPDot !Multiplier !Number
  | SPAfterDot !Multiplier !Number !DecimalPlaces
  | SPExponent !Multiplier !Number !DecimalPlaces
  | SPExponentWithSign !Multiplier !Number !DecimalPlaces !PowerMultiplier
  | SPAfterExponent !Multiplier !Number !DecimalPlaces !PowerMultiplier !Power

-- XXX See https://hackage.haskell.org/package/integer-conversion for large
-- integers.

-- | A generic parser for scientific notation of numbers. Returns (mantissa,
-- exponent) tuple. The result can be mapped to 'Double' or any other number
-- representation e.g. @Scientific@.
--
-- For example, using the @scientific@ package:
-- >> parserScientific = uncurry Data.Scientific.scientific <$> 'number'
{-# INLINE number #-}
number :: Monad m => Parser Char m (Integer, Int)
number =  Parser (\s a -> return $ step s a) initial (return . extract)

    where

    intToInteger :: Int -> Integer
    intToInteger = fromIntegral

    combineNum buf num = buf * 10 + num

    {-# INLINE initial #-}
    initial = pure $ IPartial SPInitial

    exitSPInitial msg =
        "number: expecting sign or decimal digit, got " ++ msg
    exitSPSign msg =
        "number: expecting decimal digit, got " ++ msg
    exitSPAfterSign multiplier num = (intToInteger multiplier * num, 0)
    exitSPAfterDot multiplier num decimalPlaces =
        ( intToInteger multiplier * num
        , -decimalPlaces
        )
    exitSPAfterExponent mult num decimalPlaces powerMult powerNum =
        let e = powerMult * powerNum - decimalPlaces
         in (intToInteger mult * num, e)

    {-# INLINE step #-}
    step SPInitial val =
        case val of
          '+' -> Continue 0 (SPSign 1)
          '-' -> Continue 0 $ (SPSign (-1))
          _ -> do
              let num = ord val - 48
              if num >= 0 && num <= 9
              then Partial 0 $ SPAfterSign 1 (intToInteger num)
              else Error $ exitSPInitial $ show val
    step (SPSign multiplier) val =
        let num = ord val - 48
         in if num >= 0 && num <= 9
            then Partial 0 $ SPAfterSign multiplier (intToInteger num)
            else Error $ exitSPSign $ show val
    step (SPAfterSign multiplier buf) val =
        case val of
            '.' -> Continue 0 $ SPDot multiplier buf
            'e' -> Continue 0 $ SPExponent multiplier buf 0
            'E' -> Continue 0 $ SPExponent multiplier buf 0
            _ ->
                let num = ord val - 48
                 in if num >= 0 && num <= 9
                    then
                        Partial 0
                            $ SPAfterSign multiplier (combineNum buf (intToInteger num))
                    else Done 1 $ exitSPAfterSign multiplier buf
    step (SPDot multiplier buf) val =
        let num = ord val - 48
         in if num >= 0 && num <= 9
            then Partial 0 $ SPAfterDot multiplier (combineNum buf (intToInteger num)) 1
            else Done 2 $ exitSPAfterSign multiplier buf
    step (SPAfterDot multiplier buf decimalPlaces) val =
        case val of
            'e' -> Continue 0 $ SPExponent multiplier buf decimalPlaces
            'E' -> Continue 0 $ SPExponent multiplier buf decimalPlaces
            _ ->
                let num = ord val - 48
                 in if num >= 0 && num <= 9
                    then
                        Partial 0
                            $ SPAfterDot
                                  multiplier
                                  (combineNum buf (intToInteger num))
                                  (decimalPlaces + 1)
                    else Done 1 $ exitSPAfterDot multiplier buf decimalPlaces
    step (SPExponent multiplier buf decimalPlaces) val =
        case val of
          '+' -> Continue 0 (SPExponentWithSign multiplier buf decimalPlaces 1)
          '-' -> Continue 0 (SPExponentWithSign multiplier buf decimalPlaces (-1))
          _ -> do
              let num = ord val - 48
              if num >= 0 && num <= 9
              then Partial 0 $ SPAfterExponent multiplier buf decimalPlaces 1 num
              else Done 2 $ exitSPAfterDot multiplier buf decimalPlaces
    step (SPExponentWithSign mult buf decimalPlaces powerMult) val =
        let num = ord val - 48
         in if num >= 0 && num <= 9
            then Partial 0 $ SPAfterExponent mult buf decimalPlaces powerMult num
            else Done 3 $ exitSPAfterDot mult buf decimalPlaces
    step (SPAfterExponent mult num decimalPlaces powerMult buf) val =
        let n = ord val - 48
         in if n >= 0 && n <= 9
            then
                Partial 0
                    $ SPAfterExponent
                          mult num decimalPlaces powerMult (combineNum buf n)
            else
                Done 1
                    $ exitSPAfterExponent mult num decimalPlaces powerMult buf

    {-# INLINE extract #-}
    extract SPInitial = Error $ exitSPInitial "end of input"
    extract (SPSign _) = Error $ exitSPSign "end of input"
    extract (SPAfterSign mult num) = Done 0 $ exitSPAfterSign mult num
    extract (SPDot mult num) = Done 1 $ exitSPAfterSign mult num
    extract (SPAfterDot mult num decimalPlaces) =
        Done 0 $ exitSPAfterDot mult num decimalPlaces
    extract (SPExponent mult num decimalPlaces) =
        Done 1 $ exitSPAfterDot mult num decimalPlaces
    extract (SPExponentWithSign mult num decimalPlaces _) =
        Done 2 $ exitSPAfterDot mult num decimalPlaces
    extract (SPAfterExponent mult num decimalPlaces powerMult powerNum) =
        Done 0 $ exitSPAfterExponent mult num decimalPlaces powerMult powerNum

type MantissaInt = Int
type OverflowPower = Int

{-# ANN type DoubleParseState Fuse #-}
data DoubleParseState
  = DPInitial
  | DPSign !Multiplier
  | DPAfterSign !Multiplier !MantissaInt !OverflowPower
  | DPDot !Multiplier !MantissaInt !OverflowPower
  | DPAfterDot !Multiplier !MantissaInt !OverflowPower
  | DPExponent !Multiplier !MantissaInt !OverflowPower
  | DPExponentWithSign !Multiplier !MantissaInt !OverflowPower !PowerMultiplier
  | DPAfterExponent !Multiplier !MantissaInt !OverflowPower !PowerMultiplier !Power

-- | A fast, custom parser for double precision flaoting point numbers. Returns
-- (mantissa, exponent) tuple. This is much faster than 'number' because it
-- assumes the number will fit in a 'Double' type and uses 'Int' representation
-- to store mantissa.
--
-- Number larger than 'Double' may overflow. Int overflow is not checked in the
-- exponent.
--
{-# INLINE doubleParser #-}
doubleParser :: Monad m => Parser Char m (Int, Int)
doubleParser =  Parser (\s a -> return $ step s a) initial (return . extract)

    where

    -- XXX Assuming Int = Int64

    -- Up to 58 bits Int won't overflow
    -- ghci> (2^59-1)*10+9 :: Int
    -- 5764607523034234879
    mask :: Word
    mask = 0x7c00000000000000 -- 58 bits, ignore the sign bit

    {-# INLINE combineNum #-}
    combineNum :: Int -> Int -> Int -> (Int, Int)
    combineNum mantissa power num =
         if fromIntegral mantissa .&. mask == 0
         then (mantissa * 10 + num, power)
         else (mantissa, power + 1)

    {-# INLINE initial #-}
    initial = pure $ IPartial DPInitial

    exitDPInitial msg =
        "number: expecting sign or decimal digit, got " ++ msg
    exitDPSign msg =
        "number: expecting decimal digit, got " ++ msg
    exitDPAfterSign multiplier num opower = (fromIntegral multiplier * num, opower)
    exitDPAfterDot multiplier num opow =
        (fromIntegral multiplier * num , opow)
    exitDPAfterExponent mult num opow powerMult powerNum =
        (fromIntegral mult * num, opow + powerMult * powerNum)

    {-# INLINE step #-}
    step DPInitial val =
        case val of
          '+' -> Continue 0 (DPSign 1)
          '-' -> Continue 0 $ (DPSign (-1))
          _ -> do
              let num = ord val - 48
              if num >= 0 && num <= 9
              then Partial 0 $ DPAfterSign 1 num 0
              else Error $ exitDPInitial $ show val
    step (DPSign multiplier) val =
        let num = ord val - 48
         in if num >= 0 && num <= 9
            then Partial 0 $ DPAfterSign multiplier num 0
            else Error $ exitDPSign $ show val
    step (DPAfterSign multiplier buf opower) val =
        case val of
            '.' -> Continue 0 $ DPDot multiplier buf opower
            'e' -> Continue 0 $ DPExponent multiplier buf opower
            'E' -> Continue 0 $ DPExponent multiplier buf opower
            _ ->
                let num = ord val - 48
                 in if num >= 0 && num <= 9
                    then
                        let (buf1, power1) = combineNum buf opower num
                         in Partial 0
                            $ DPAfterSign multiplier buf1 power1
                    else Done 1 $ exitDPAfterSign multiplier buf opower
    step (DPDot multiplier buf opower) val =
        let num = ord val - 48
         in if num >= 0 && num <= 9
            then
                let (buf1, power1) = combineNum buf opower num
                 in Partial 0 $ DPAfterDot multiplier buf1 (power1 - 1)
            else Done 2 $ exitDPAfterSign multiplier buf opower
    step (DPAfterDot multiplier buf opower) val =
        case val of
            'e' -> Continue 0 $ DPExponent multiplier buf opower
            'E' -> Continue 0 $ DPExponent multiplier buf opower
            _ ->
                let num = ord val - 48
                 in if num >= 0 && num <= 9
                    then
                        let (buf1, power1) = combineNum buf opower num
                         in Partial 0 $ DPAfterDot multiplier buf1 (power1 - 1)
                    else Done 1 $ exitDPAfterDot multiplier buf opower
    step (DPExponent multiplier buf opower) val =
        case val of
          '+' -> Continue 0 (DPExponentWithSign multiplier buf opower 1)
          '-' -> Continue 0 (DPExponentWithSign multiplier buf opower (-1))
          _ -> do
              let num = ord val - 48
              if num >= 0 && num <= 9
              then Partial 0 $ DPAfterExponent multiplier buf opower 1 num
              else Done 2 $ exitDPAfterDot multiplier buf opower
    step (DPExponentWithSign mult buf opower powerMult) val =
        let num = ord val - 48
         in if num >= 0 && num <= 9
            then Partial 0 $ DPAfterExponent mult buf opower powerMult num
            else Done 3 $ exitDPAfterDot mult buf opower
    step (DPAfterExponent mult num opower powerMult buf) val =
        let n = ord val - 48
         in if n >= 0 && n <= 9
            then
                Partial 0
                    $ DPAfterExponent mult num opower powerMult (buf * 10 + n)
            else Done 1 $ exitDPAfterExponent mult num opower powerMult buf

    {-# INLINE extract #-}
    extract DPInitial = Error $ exitDPInitial "end of input"
    extract (DPSign _) = Error $ exitDPSign "end of input"
    extract (DPAfterSign mult num opow) = Done 0 $ exitDPAfterSign mult num opow
    extract (DPDot mult num opow) = Done 1 $ exitDPAfterSign mult num opow
    extract (DPAfterDot mult num opow) =
        Done 0 $ exitDPAfterDot mult num opow
    extract (DPExponent mult num opow) =
        Done 1 $ exitDPAfterDot mult num opow
    extract (DPExponentWithSign mult num opow _) =
        Done 2 $ exitDPAfterDot mult num opow
    extract (DPAfterExponent mult num opow powerMult powerNum) =
        Done 0 $ exitDPAfterExponent mult num opow powerMult powerNum

-- XXX We can have a `realFloat` parser instead to parse any RealFloat value.
-- And a integral parser to read any integral value.

-- XXX This is very expensive, takes much more time than the rest of the
-- parsing. Need to look into fromRational.

-- | @mkDouble mantissa exponent@ converts a mantissa and exponent to a
-- 'Double' value equivalent to @mantissa * 10^exponent@. It does not check for
-- overflow, powers more than 308 will overflow.
{-# INLINE mkDouble #-}
mkDouble :: Integer -> Int -> Double
mkDouble mantissa power =
    if power > 0
    then fromRational ((mantissa * 10 ^ power) % 1)
    else fromRational (mantissa % 10 ^ (-power))

-- | Parse a decimal 'Double' value. This parser accepts an optional sign (+ or
-- -) followed by at least one decimal digit. Decimal digits are optionally
-- followed by a decimal point and at least one decimal digit after the point.
-- This parser accepts the maximal valid input as long as it gives a valid
-- number. Specifcally a trailing decimal point is allowed but not consumed.
-- This function does not accept \"NaN\" or \"Infinity\" string representations
-- of double values.
--
-- Definition:
--
-- >>> double = uncurry Unicode.mkDouble <$> Unicode.number
--
-- Examples:
--
-- >>> p = Stream.parse Unicode.double . Stream.fromList
--
-- >>> p "-1.23e-123"
-- Right (-1.23e-123)
--
-- Trailing input examples:
--
-- >>> p "1."
-- Right 1.0
--
-- >>> p "1.2.3"
-- Right 1.2
--
-- >>> p "1e"
-- Right 1.0
--
-- >>> p "1e2.3"
-- Right 100.0
--
-- >>> p "1+2"
-- Right 1.0
--
-- Error cases:
--
-- >>> p ""
-- Left (ParseError "number: expecting sign or decimal digit, got end of input")
--
-- >>> p ".1"
-- Left (ParseError "number: expecting sign or decimal digit, got '.'")
--
-- >>> p "+"
-- Left (ParseError "number: expecting decimal digit, got end of input")
--
{-# INLINE double #-}
double :: Monad m => Parser Char m Double
double = fmap (\(m,e) -> mkDouble (fromIntegral m) e) doubleParser