{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
-- | This module is separate from the Lexer.x input to Alex
-- to segregate the automatically generated code from the
-- hand written code. The automatically generated code
-- causes lots of warnings which mask the interesting warnings.
module Config.LexerUtils where

import Data.Char            (GeneralCategory(..), generalCategory, digitToInt,
                             isAscii, isSpace, readLitChar, ord)
import Data.Monoid          ((<>))
import Data.Text            (Text)
import Data.Text.Lazy.Builder (Builder)
import Data.Word            (Word8)
import Numeric              (readInt)
import qualified Data.Text                  as Text
import qualified Data.Text.Lazy             as LText
import qualified Data.Text.Lazy.Builder     as Builder

#if !MIN_VERSION_base(4,8,0)
import Data.Monoid (mempty)
#endif

import Config.Tokens

------------------------------------------------------------------------
-- Custom Alex wrapper
------------------------------------------------------------------------

type AlexInput = Located Text

alexStartPos :: Position
alexStartPos = Position { posIndex = 0, posLine = 1, posColumn = 1 }

alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (Located p cs)
  = do (c,!cs') <- Text.uncons cs
       let !p' = alexMove p c
           !b = byteForChar c
       return (b, Located p' cs')

alexMove :: Position -> Char -> Position
alexMove (Position ix line column) c =
  case c of
    '\t' -> Position (ix + 1) line (((column + 7) `div` 8) * 8 + 1)
    '\n' -> Position (ix + 1) (line + 1) 1
    _    -> Position (ix + 1) line (column + 1)

------------------------------------------------------------------------
-- Lexer Modes
------------------------------------------------------------------------

data LexerMode
  = InNormal
  | InComment       !Position !LexerMode -- ^ Start of comment and return mode
  | InCommentString !Position !LexerMode -- ^ Start of string and return mode
  | InString        !Position !Builder   -- ^ Start of string and accumulated text

--  token starting position -> token bytes -> lexer state -> (new state, token)
type Action = Located Text -> LexerMode -> (LexerMode, Maybe (Located Token))

-- Helper function for building an Action given a token constructor
-- function, a position, and the matched token.
token :: (Text -> Token) -> Action
token f match st = (st, Just (fmap f match))

modeChange :: (Located Text -> LexerMode -> LexerMode) -> Action
modeChange f match st = (f match st, Nothing)

------------------------------------------------------------------------
-- Comment state
------------------------------------------------------------------------

startComment :: Action
startComment = modeChange (InComment . locPosition)

endComment :: Action
endComment = modeChange $ \_ (InComment _ st) -> st

------------------------------------------------------------------------
-- Comment string state
------------------------------------------------------------------------

startCommentString :: Action
startCommentString = modeChange (InCommentString . locPosition)

endCommentString :: Action
endCommentString = modeChange $ \_ (InCommentString _ st) -> st

------------------------------------------------------------------------
-- String state
------------------------------------------------------------------------

-- | Enter the string literal lexer
startString :: Action
startString = modeChange $ \match _ -> InString (locPosition match) mempty

-- | Emit completed string literal, exit string literal lexer and return to
-- Normal mode.
endString :: Action
endString _ = \(InString posn builder) ->
   let !t = getStringLit builder
   in (InNormal, Just (Located posn (String t)))

getStringLit :: Builder -> Text
getStringLit = LText.toStrict . Builder.toLazyText

-- | Add region of text to current string literal state. Escapes are handled
-- separately.
addString :: Action
addString = modeChange $ \match (InString posn builder) ->
  InString posn (builder <> Builder.fromText (locThing match))

-- | Handle character escapes in string literal mode
addCharLit :: Action
addCharLit = modeChange $ \match (InString posn builder) ->
  case readLitChar (Text.unpack (locThing match)) of
    [(c,"")] -> InString posn (builder <> Builder.singleton c)
    _        -> error "addCharLit: Lexer failure"

-- | Action for an invalid escape sequence
badEscape :: Action
badEscape = token $ \str -> ErrorEscape str

-- | Action for unterminated string constant
untermString :: Action
untermString _ = \(InString posn builder) ->
  (InNormal, Just (Located posn (ErrorUntermString (getStringLit builder))))

------------------------------------------------------------------------
-- Token builders
------------------------------------------------------------------------

-- | Construct a 'Number' token from a token using a
-- given base. This function expect the token to be
-- legal for the given base. This is checked by Alex.
number ::
  Int  {- ^ prefix length      -} ->
  Int  {- ^ base               -} ->
  Text {- ^ sign-prefix-digits -} ->
  Token
number prefixLen base str =
  case readInt (fromIntegral base) (const True) digitToInt str2 of
    [(n,"")] -> Number base (s*n)
    _        -> error "number: Lexer failure"
  where
  str2     = drop prefixLen str1
  (s,str1) = case Text.unpack str of
               '-':rest -> (-1, rest)
               rest     -> ( 1, rest)

-- | Process a section heading token
section :: Text -> Token
section = Section . Text.dropWhileEnd isSpace . Text.init

------------------------------------------------------------------------
-- Embed all of unicode, kind of, in a single byte!
------------------------------------------------------------------------

byteForChar :: Char -> Word8
byteForChar c
  | c <= '\6' = non_graphic
  | isAscii c = fromIntegral (ord c)
  | otherwise = case generalCategory c of
                  LowercaseLetter       -> lower
                  OtherLetter           -> lower
                  UppercaseLetter       -> upper
                  TitlecaseLetter       -> upper
                  DecimalNumber         -> digit
                  OtherNumber           -> digit
                  ConnectorPunctuation  -> symbol
                  DashPunctuation       -> symbol
                  OtherPunctuation      -> symbol
                  MathSymbol            -> symbol
                  CurrencySymbol        -> symbol
                  ModifierSymbol        -> symbol
                  OtherSymbol           -> symbol
                  Space                 -> space
                  ModifierLetter        -> other
                  NonSpacingMark        -> other
                  SpacingCombiningMark  -> other
                  EnclosingMark         -> other
                  LetterNumber          -> other
                  OpenPunctuation       -> other
                  ClosePunctuation      -> other
                  InitialQuote          -> other
                  FinalQuote            -> other
                  _                     -> non_graphic
  where
  non_graphic     = 0
  upper           = 1
  lower           = 2
  digit           = 3
  symbol          = 4
  space           = 5
  other           = 6