{-# LANGUAGE BangPatterns #-}
{-|
Module      : TOML.LexerUtils
Description : /Internal:/ Lexer support operations for TOML
Copyright   : (c) Eric Mertens, 2017
License     : ISC
Maintainer  : emertens@gmail.com

This module is kept separate from the Lexer.x input to Alex so
that the automatically generated code is segregated from the
hand-written code. The generated code produces many warnings
that would otherwise mask the interesting ones.
-}
module TOML.LexerUtils
  (
  -- * Alex required definitions
    AlexInput
  , alexGetByte

  -- * Lexer modes
  , LexerMode(..)
  , lexerModeInt

  -- * Lexer actions
  , Action
  , token
  , token_
  , errorAction
  , eofAction

  -- * Token parsers
  , integer
  , double
  , bareKeyToken

  -- * String literal actions
  , startString
  , emitChar
  , emitChar'
  , emitUnicodeChar
  , endString

  -- * Date/time token parsers
  , localtime
  , zonedtime
  , day
  , timeofday
  ) where

import           Data.Char (isSpace, isControl, isAscii, ord, chr)
import           Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Read as Text
import           Data.Time (ParseTime, parseTimeOrError, defaultTimeLocale, iso8601DateFormat)
import           Data.Word (Word8)

import           TOML.Tokens
import           TOML.Located

------------------------------------------------------------------------
-- Custom Alex wrapper - these functions are used by generated code
------------------------------------------------------------------------

-- | The generated code expects the lexer input type to be named 'AlexInput'
type AlexInput = Located Text

-- | Get the next byte of the input source for Alex, along with the
-- advanced input. Characters are mapped to bytes by 'byteForChar'.
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (Located p cs)
  = do (c,cs') <- Text.uncons cs
       let !b   = byteForChar c
           !inp = Located (move p c) cs'
       return (b, inp)

-- | The TOML format doesn't distinguish between individual non-ASCII
-- characters. This function maps ASCII characters to their own byte
-- values, collapses every other printable or whitespace character onto
-- a single byte, and reserves byte 0 for the non-whitespace control
-- characters so the lexer can single them out.
byteForChar :: Char -> Word8
byteForChar c
  | isControl c && not (isSpace c) = 0 -- non-printable characters
  | isAscii c = fromIntegral (ord c)
  | otherwise = 1                      -- all other Unicode collapses to one byte
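
-- A rough GHCi sketch of the byte classification and of stepping the
-- input by one character. The printed forms assume plain derived 'Show'
-- instances for 'Position' and 'Located', so treat them as illustrative:
--
-- >>> byteForChar 'k'
-- 107
-- >>> byteForChar '\DEL'   -- non-whitespace control character
-- 0
-- >>> alexGetByte (Located (Position 0 1 1) (Text.pack "k = 1"))
-- Just (107,Located (Position 1 1 2) " = 1")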

------------------------------------------------------------------------

-- | Advance the position according to the kind of character lexed.
move :: Position -> Char -> Position
move (Position ix line column) c =
  case c of
    '\t' -> Position (ix + 1) line (((column + 7) `div` 8) * 8 + 1)
    '\n' -> Position (ix + 1) (line + 1) 1
    _    -> Position (ix + 1) line (column + 1)
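
-- Columns are 1-based, and a tab advances the column to the next column
-- past a multiple of 8 (9, 17, 25, ...). A rough GHCi sketch, assuming
-- the 'Position' fields are character index, line, and column (as used
-- above) and a plain derived 'Show' instance:
--
-- >>> move (Position 0 1 3) '\t'
-- Position 1 1 9
-- >>> move (Position 0 1 3) '\n'
-- Position 1 2 1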

------------------------------------------------------------------------
-- Lexer Modes
------------------------------------------------------------------------

-- | The lexer can be in a normal mode or can be lexing a string literal.
data LexerMode
  = InNormal
  | InString !Int !Position String
    -- ^ alex-mode, starting-position, reversed accumulated characters
  deriving Show


-- | Compute the Alex start code corresponding to a particular 'LexerMode'
lexerModeInt :: LexerMode -> Int
lexerModeInt InNormal{}           = 0
lexerModeInt (InString mode _ _)  = mode
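
-- The 'Int' returned here is used as the Alex start code. A hand-written
-- driver in the Lexer module would typically pass it to the generated
-- 'alexScan'. A rough sketch of such a loop, not the actual driver
-- ('go' is a hypothetical name; 'alexScan' and the 'AlexReturn'
-- constructors are the ones Alex generates):
--
-- > go input mode =
-- >   case alexScan input (lexerModeInt mode) of
-- >     AlexEOF                  -> eofAction (locPosition input) mode
-- >     AlexError input'         -> errorAction input'
-- >     AlexSkip  input' _       -> go input' mode
-- >     AlexToken input' len act -> {- run 'act' on the matched lexeme,
-- >                                    emit its tokens, continue with the
-- >                                    new mode and input' -}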


------------------------------------------------------------------------
-- Lexer actions
------------------------------------------------------------------------

-- | Type of actions used by the lexer upon matching a rule
type Action =
  Located Text                 {- ^ located lexeme                     -} ->
  LexerMode                    {- ^ lexer mode                         -} ->
  (LexerMode, [Located Token]) {- ^ updated lexer mode, emitted tokens -}


-- | Helper function for building an 'Action' using the lexeme
token :: (Text -> Token) {- ^ lexeme -> token -} -> Action
token f match st = (st, [fmap f match])

-- | Helper function for building an 'Action' where the lexeme is unused.
token_ :: Token -> Action
token_ = token . const
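
-- In Lexer.x these helpers appear as the actions attached to individual
-- rules. A hypothetical pair of rules, shown only to illustrate the
-- shape (the macro and token-constructor names here are illustrative,
-- not necessarily those used in Lexer.x):
--
-- > "["        { token_ LeftBracketToken }
-- > @bare_key  { token  bareKeyToken     }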

-- | Action to perform upon end of file. Produce errors if EOF was unexpected.
eofAction :: Position -> LexerMode -> [Located Token]
eofAction eofPosn st =
  case st of
    InString _ posn _ -> [Located posn (ErrorToken UntermString)]
    InNormal          -> [Located eofPosn EofToken]

-- | Action to perform when the lexer gets stuck. Emits an error token
-- carrying the offending character.
errorAction :: AlexInput -> [Located Token]
errorAction inp = [fmap (ErrorToken . NoMatch . Text.head) inp]
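
-- For example, reaching end of file while still inside a string literal
-- reports the error at the position where that literal started. A rough
-- GHCi sketch, assuming plain derived 'Show' instances and using 1 as a
-- stand-in string start code:
--
-- >>> eofAction (Position 20 3 1) (InString 1 (Position 10 2 5) "cba")
-- [Located (Position 10 2 5) (ErrorToken UntermString)]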

------------------------------------------------------------------------
-- String literal mode actions
------------------------------------------------------------------------

-- | Enter the string-literal lexer, recording the position at which the
-- literal starts.
startString :: Int -> Action
startString mode lexeme _ = (InString mode (locPosition lexeme) [], [])


-- | Add the current lexeme to the string literal being accumulated.
emitChar :: Action
emitChar _ InNormal = error "PANIC: emitChar used in normal mode"
emitChar lexeme (InString mode pos acc) = (InString mode pos acc', [])
  where
    acc' = reverse (Text.unpack (locThing lexeme)) ++ acc


-- | Add a literal character to the current string literal.
emitChar' :: Char -> Action
emitChar' c _ (InString mode pos acc) = (InString mode pos (c : acc), [])
emitChar' _ _ _ = error "PANIC: emitChar' used in normal mode"


-- | Interpret the current lexeme as a Unicode escape sequence and add
-- the resulting character to the current string literal.
emitUnicodeChar :: Action
emitUnicodeChar lexeme mode =
  case Text.hexadecimal (Text.drop 2 (locThing lexeme)) of
    Right (n, _)
      | n < 0x110000 -> emitChar' (chr n) lexeme mode
      | otherwise    -> (InNormal, [Located (locPosition lexeme) (ErrorToken BadEscape)])
    _ -> error "PANIC: bad unicode unescape implementation"


-- | Successfully terminate the current mode and emit tokens as needed
endString :: Action
endString _ mode =
  case mode of
    InNormal -> error "PANIC: error in string literal lexer"
    InString _ p input ->
      let !str = Text.pack (reverse input)
      in (InNormal, [Located p (StringToken str)])
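
-- Putting the string-mode actions together: lexing the literal @"ab"@
-- proceeds roughly as below. This is only a sketch; the real sequencing
-- is driven by the rules in Lexer.x, 1 stands in for whatever start code
-- that file assigns to strings, and plain derived 'Show' instances are
-- assumed for the printed output:
--
-- >>> let open    = Located (Position 0 1 1) (Text.pack "\"")
-- >>> let (m1, _) = startString 1 open InNormal
-- >>> let (m2, _) = emitChar' 'a' open m1
-- >>> let (m3, _) = emitChar' 'b' open m2
-- >>> snd (endString open m3)
-- [Located (Position 0 1 1) (StringToken "ab")]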

------------------------------------------------------------------------
-- Token builders
------------------------------------------------------------------------

-- | Construct an 'IntegerToken' from a lexeme. Underscore separators
-- are stripped before the digits are read.
integer :: Text {- ^ lexeme -} -> Token
integer str = IntegerToken n
  where
  Right (n,_) = Text.signed Text.decimal (Text.filter (/= '_') str)


-- | Construct a 'DoubleToken' from a lexeme. Underscore separators
-- are stripped before the number is read.
double :: Text {- ^ lexeme -} -> Token
double str = DoubleToken n
  where
  Right (n,_) = Text.signed Text.double (Text.filter (/= '_') str)
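
-- A quick GHCi sketch, assuming a derived 'Show' instance for 'Token';
-- note that the underscore separators are removed before reading:
--
-- >>> integer (Text.pack "-1_000")
-- IntegerToken (-1000)
-- >>> double (Text.pack "6.02e23")
-- DoubleToken 6.02e23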


-- | Construct a 'BareKeyToken' for the given lexeme. This operation
-- copies the lexeme into a fresh 'Text' value to ensure that no slice
-- of the original source file is retained, allowing the full source
-- text to be garbage collected.
bareKeyToken :: Text {- ^ lexeme -} -> Token
bareKeyToken txt = BareKeyToken $! Text.copy txt

------------------------------------------------------------------------
-- Date and time token parsers
------------------------------------------------------------------------

-- | Parse a date\/time lexeme to produce a 'Token'. As long as the
-- regular expressions in the "Lexer" module are correct this parse
-- should never fail; if it ever does, an error is thrown.
timeParser ::
  ParseTime t =>
  (t -> Token) {- ^ token function   -} ->
  String       {- ^ time format      -} ->
  Text         {- ^ lexeme           -} ->
  Token        {- ^ date\/time token -}
timeParser con fmt txt =
  con (parseTimeOrError False defaultTimeLocale fmt (Text.unpack txt))


-- | Format string for parsing time of day: @hours:minutes:seconds.fractional@
timeFormat :: String
timeFormat = "%T%Q"


-- | Date and time lexeme parsers
zonedtime, localtime, day, timeofday :: Text -> Token
zonedtime = timeParser ZonedTimeToken (iso8601DateFormat (Just timeFormat) ++ "%Z")
localtime = timeParser LocalTimeToken (iso8601DateFormat (Just timeFormat))
day       = timeParser DayToken       (iso8601DateFormat Nothing)
timeofday = timeParser TimeOfDayToken timeFormat
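
-- A rough GHCi sketch of the resulting tokens. The exact rendering
-- depends on the 'Show' instances in "Data.Time" and "TOML.Tokens", so
-- treat the output lines as illustrative:
--
-- >>> day (Text.pack "1979-05-27")
-- DayToken 1979-05-27
-- >>> timeofday (Text.pack "07:32:00")
-- TimeOfDayToken 07:32:00
-- >>> localtime (Text.pack "1979-05-27T07:32:00")
-- LocalTimeToken 1979-05-27 07:32:00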