{-# LANGUAGE BangPatterns #-}
-- | This module is separate from the Lexer.x input to Alex
-- to segregate the automatically generated code from the
-- hand written code. The automatically generated code
-- causes lots of warnings which mask the interesting warnings.
module Config.LexerUtils
  (
  -- * Alex wrapper
    AlexInput
  , alexGetByte

  -- * Lexer modes
  , LexerMode(..)
  , startString
  , nestMode
  , endMode

  -- * Token builders
  , token
  , token_
  , section
  , number

  -- * Final actions
  , untermString
  , eofAction
  , errorAction
  ) where

import           Data.Char (GeneralCategory(..), generalCategory, isAscii, isSpace, ord)
import           Data.Text (Text)
import           Data.Word (Word8)
import qualified Data.Text as Text

import           Config.Tokens
import qualified Config.NumberParser

------------------------------------------------------------------------
-- Custom Alex wrapper - these functions are used by generated code
------------------------------------------------------------------------

-- | The generated code expects the lexer input type to be named 'AlexInput'
type AlexInput = Located Text

-- | Get the next characteristic byte from the input source.
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (Located p cs)
  = do (c,cs') <- Text.uncons cs
       let !b = byteForChar c
           !inp = Located (move p c) cs'
       return (b, inp)

------------------------------------------------------------------------

-- | Advance the position according to the kind of character lexed.
move :: Position -> Char -> Position
move (Position ix line column) c =
  case c of
    '\t' -> Position (ix + 1) line (((column + 7) `div` 8) * 8 + 1)
    '\n' -> Position (ix + 1) (line + 1) 1
    _    -> Position (ix + 1) line (column + 1)

-- | Action to perform upon end of file. Produce errors if EOF was unexpected.
eofAction :: Position -> LexerMode -> [Located Token]
eofAction eofPosn st =
  case st of
    InComment       posn _     -> [Located posn (Error UntermComment)]
    InCommentString posn _     -> [Located posn (Error UntermComment)]
    InString        posn _     -> [Located posn (Error UntermString)]
    InNormal                   -> [Located (park eofPosn) EOF]

-- | Terminate the line if needed and move the cursor to column 0 to ensure
-- that it terminates any top-level block.
park :: Position -> Position
park pos
  | posColumn pos == 1 = pos { posColumn = 0 }
  | otherwise          = pos { posColumn = 0, posLine = posLine pos + 1 }

-- | Action to perform when lexer gets stuck. Emits an error.
errorAction :: AlexInput -> [Located Token]
errorAction inp = [fmap (Error . NoMatch . Text.head) inp]

------------------------------------------------------------------------
-- Lexer Modes
------------------------------------------------------------------------

-- | The lexer can be in any of four modes which determine which rules
-- are active.
data LexerMode
  = InNormal
  | InComment       !Position !LexerMode -- ^ Start of comment and return mode
  | InCommentString !Position !LexerMode -- ^ Start of string and return mode
  | InString        !Position !Text      -- ^ Start of string and input text

-- | Type of actions used by lexer upon matching a rule
type Action =
  Int                          {- ^ match length                       -} ->
  Located Text                 {- ^ current input                      -} ->
  LexerMode                    {- ^ lexer mode                         -} ->
  (LexerMode, [Located Token]) {- ^ updated lexer mode, emitted tokens -}

-- | Helper function for building an 'Action' using the lexeme
token :: (Text -> Token) -> Action
token f len match st = (st, [fmap (f . Text.take len) match])

-- | Helper function for building an 'Action' where the lexeme is unused.
token_ :: Token -> Action
token_ = token . const

------------------------------------------------------------------------
-- Alternative modes
------------------------------------------------------------------------

-- | Used to enter one of the nested modes
nestMode :: (Position -> LexerMode -> LexerMode) -> Action
nestMode f _ match st = (f (locPosition match) st, [])

-- | Enter the string literal lexer
startString :: Action
startString _ (Located posn text) _ = (InString posn text, [])

-- | Successfully terminate the current mode and emit tokens as needed
endMode :: Action
endMode len (Located endPosn _) mode =
  case mode of
    InNormal                 -> (InNormal, [])
    InCommentString _ st     -> (st, [])
    InComment       _ st     -> (st, [])
    InString startPosn input ->
      let n = posIndex endPosn - posIndex startPosn + len
          badEscape = BadEscape (Text.pack "out of range")
      in case reads (Text.unpack (Text.take n input)) of
           [(s,"")] -> (InNormal, [Located startPosn (String (Text.pack s))])
           _        -> (InNormal, [Located startPosn (Error badEscape)])

-- | Action for unterminated string constant
untermString :: Action
untermString _ _ = \(InString posn _) ->
  (InNormal, [Located posn (Error UntermString)])

------------------------------------------------------------------------
-- Token builders
------------------------------------------------------------------------

-- | Construct a 'Number' token from a token using a
-- given base. This function expect the token to be
-- legal for the given base. This is checked by Alex.
number ::
  Text {- ^ sign-prefix-digits -} ->
  Token
number = Number . Config.NumberParser.number
       . Text.unpack . Text.toUpper . Text.filter ('_' /=)

-- | Process a section heading token
section :: Text -> Token
section = Section . Text.dropWhileEnd isSpace . Text.init

------------------------------------------------------------------------
-- Embed all of unicode, kind of, in a single byte!
------------------------------------------------------------------------

-- | Alex is driven by looking up elements in a 128 element array.
-- This function maps each ASCII character to its ASCII encoding
-- and it maps non-ASCII code-points to a character class (0-6)
byteForChar :: Char -> Word8
byteForChar c
  | c <= '\6' = non_graphic
  | isAscii c = fromIntegral (ord c)
  | otherwise = case generalCategory c of
                  LowercaseLetter       -> lower
                  OtherLetter           -> lower
                  UppercaseLetter       -> upper
                  TitlecaseLetter       -> upper
                  DecimalNumber         -> digit
                  OtherNumber           -> digit
                  ConnectorPunctuation  -> symbol
                  DashPunctuation       -> symbol
                  OtherPunctuation      -> symbol
                  MathSymbol            -> symbol
                  CurrencySymbol        -> symbol
                  ModifierSymbol        -> symbol
                  OtherSymbol           -> symbol
                  Space                 -> space
                  ModifierLetter        -> other
                  NonSpacingMark        -> other
                  SpacingCombiningMark  -> other
                  EnclosingMark         -> other
                  LetterNumber          -> other
                  OpenPunctuation       -> other
                  ClosePunctuation      -> other
                  InitialQuote          -> other
                  FinalQuote            -> other
                  _                     -> non_graphic
  where
  non_graphic     = 0
  upper           = 1
  lower           = 2
  digit           = 3
  symbol          = 4
  space           = 5
  other           = 6