{-|
Module      : Toml.Syntax.LexerUtils
Description : Wrapper and actions for generated lexer
Copyright   : (c) Eric Mertens, 2023
License     : ISC
Maintainer  : emertens@gmail.com

This module provides a custom engine for the Alex generated
lexer. This lexer drive provides nested states, unicode support,
and file location tracking.

The various states of this module are needed to deal with the varying
lexing rules while lexing values, keys, and string-literals.

-}
module Toml.Syntax.LexerUtils (

    -- * Types
    Action,
    Context(..),
    Outcome(..),

    -- * Input processing
    locatedUncons,

    -- * Actions
    token,
    token_,
    textToken,

    timeValue,
    eofToken,

    failure,

    -- * String literals
    strFrag,
    startMlBstr,
    startBstr,
    startMlLstr,
    startLstr,
    endStr,
    unicodeEscape,
    recommendEscape,

    mkError,
    ) where

import Data.Char (ord, chr, isAscii, isControl)
import Data.Foldable (asum)
import Data.Text (Text)
import Data.Text qualified as Text
import Data.Time.Format (parseTimeM, defaultTimeLocale, ParseTime)
import Numeric (readHex)
import Text.Printf (printf)
import Toml.Syntax.Token (Token(..))
import Toml.Syntax.Position (move, Located(..), Position)

-- | Type of actions associated with lexer patterns
type Action = Located Text -> Context -> Outcome

data Outcome
  = Resume Context
  | LexerError (Located String)
  | EmitToken (Located Token)

-- | Representation of the current lexer state.
data Context
  = TopContext -- ^ top-level where @[[@ and @]]@ have special meaning
  | TableContext -- ^ inline table - lex key names
  | ValueContext -- ^ value lexer - lex number literals
  | MlBstrContext Position [Text] -- ^ multiline basic string: position of opening delimiter and list of fragments
  | BstrContext   Position [Text] -- ^ basic string: position of opening delimiter and list of fragments
  | MlLstrContext Position [Text] -- ^ multiline literal string: position of opening delimiter and list of fragments
  | LstrContext   Position [Text] -- ^ literal string: position of opening delimiter and list of fragments
  deriving Int -> Context -> ShowS
[Context] -> ShowS
Context -> String
(Int -> Context -> ShowS)
-> (Context -> String) -> ([Context] -> ShowS) -> Show Context
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Context -> ShowS
showsPrec :: Int -> Context -> ShowS
$cshow :: Context -> String
show :: Context -> String
$cshowList :: [Context] -> ShowS
showList :: [Context] -> ShowS
Show

-- | Add a literal fragment of a string to the current string state.
strFrag :: Action
strFrag :: Action
strFrag (Located Position
_ Text
s) = \case
  BstrContext   Position
p [Text]
acc -> Context -> Outcome
Resume (Position -> [Text] -> Context
BstrContext   Position
p (Text
s Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc))
  MlBstrContext Position
p [Text]
acc -> Context -> Outcome
Resume (Position -> [Text] -> Context
MlBstrContext Position
p (Text
s Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc))
  LstrContext   Position
p [Text]
acc -> Context -> Outcome
Resume (Position -> [Text] -> Context
LstrContext   Position
p (Text
s Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc))
  MlLstrContext Position
p [Text]
acc -> Context -> Outcome
Resume (Position -> [Text] -> Context
MlLstrContext Position
p (Text
s Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc))
  Context
_                   -> String -> Outcome
forall a. HasCallStack => String -> a
error String
"strFrag: panic"

-- | End the current string state and emit the string literal token.
endStr :: Action
endStr :: Action
endStr (Located Position
_ Text
x) = \case
    BstrContext   Position
p [Text]
acc -> Located Token -> Outcome
EmitToken (Position -> Token -> Located Token
forall a. Position -> a -> Located a
Located Position
p (Text -> Token
TokString   ([Text] -> Text
Text.concat ([Text] -> [Text]
forall a. [a] -> [a]
reverse (Text
x Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc)))))
    MlBstrContext Position
p [Text]
acc -> Located Token -> Outcome
EmitToken (Position -> Token -> Located Token
forall a. Position -> a -> Located a
Located Position
p (Text -> Token
TokMlString ([Text] -> Text
Text.concat ([Text] -> [Text]
forall a. [a] -> [a]
reverse (Text
x Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc)))))
    LstrContext   Position
p [Text]
acc -> Located Token -> Outcome
EmitToken (Position -> Token -> Located Token
forall a. Position -> a -> Located a
Located Position
p (Text -> Token
TokString   ([Text] -> Text
Text.concat ([Text] -> [Text]
forall a. [a] -> [a]
reverse (Text
x Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc)))))
    MlLstrContext Position
p [Text]
acc -> Located Token -> Outcome
EmitToken (Position -> Token -> Located Token
forall a. Position -> a -> Located a
Located Position
p (Text -> Token
TokMlString ([Text] -> Text
Text.concat ([Text] -> [Text]
forall a. [a] -> [a]
reverse (Text
x Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: [Text]
acc)))))
    Context
_                  -> String -> Outcome
forall a. HasCallStack => String -> a
error String
"endStr: panic"

-- | Start a basic string literal
startBstr :: Action
startBstr :: Action
startBstr (Located Position
p Text
_) Context
_ = Context -> Outcome
Resume (Position -> [Text] -> Context
BstrContext Position
p [])

-- | Start a literal string literal
startLstr :: Action
startLstr :: Action
startLstr (Located Position
p Text
_) Context
_ = Context -> Outcome
Resume (Position -> [Text] -> Context
LstrContext Position
p [])

-- | Start a multi-line basic string literal
startMlBstr :: Action
startMlBstr :: Action
startMlBstr (Located Position
p Text
_) Context
_ = Context -> Outcome
Resume (Position -> [Text] -> Context
MlBstrContext Position
p [])

-- | Start a multi-line literal string literal
startMlLstr :: Action
startMlLstr :: Action
startMlLstr (Located Position
p Text
_) Context
_ = Context -> Outcome
Resume (Position -> [Text] -> Context
MlLstrContext Position
p [])

-- | Resolve a unicode escape sequence and add it to the current string literal
unicodeEscape :: Action
unicodeEscape :: Action
unicodeEscape (Located Position
p Text
lexeme) Context
ctx =
  case ReadS Int
forall a. (Eq a, Num a) => ReadS a
readHex (Int -> ShowS
forall a. Int -> [a] -> [a]
drop Int
2 (Text -> String
Text.unpack Text
lexeme)) of
    [(Int
n,String
_)] | Int
0xd800 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
n, Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0xe000 -> Located String -> Outcome
LexerError (Position -> String -> Located String
forall a. Position -> a -> Located a
Located Position
p String
"non-scalar unicode escape")
      | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x110000                 -> Located String -> Outcome
LexerError (Position -> String -> Located String
forall a. Position -> a -> Located a
Located Position
p String
"unicode escape too large")
      | Bool
otherwise                     -> Action
strFrag (Position -> Text -> Located Text
forall a. Position -> a -> Located a
Located Position
p (Char -> Text
Text.singleton (Int -> Char
chr Int
n))) Context
ctx
    [(Int, String)]
_                                 -> String -> Outcome
forall a. HasCallStack => String -> a
error String
"unicodeEscape: panic"

recommendEscape :: Action
recommendEscape :: Action
recommendEscape (Located Position
p Text
x) Context
_ =
  Located String -> Outcome
LexerError (Position -> String -> Located String
forall a. Position -> a -> Located a
Located Position
p (String -> Int -> String
forall r. PrintfType r => String -> r
printf String
"control characters must be escaped, use: \\u%04X" (Char -> Int
ord (HasCallStack => Text -> Char
Text -> Char
Text.head Text
x))))

-- | Emit a token ignoring the current lexeme
token_ :: Token -> Action
token_ :: Token -> Action
token_ Token
t Located Text
x Context
_ = Located Token -> Outcome
EmitToken (Token
t Token -> Located Text -> Located Token
forall a b. a -> Located b -> Located a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Located Text
x)

-- | Emit a token using the current lexeme
token :: (String -> Token) -> Action
token :: (String -> Token) -> Action
token String -> Token
f Located Text
x Context
_ = Located Token -> Outcome
EmitToken (String -> Token
f (String -> Token) -> (Text -> String) -> Text -> Token
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> String
Text.unpack (Text -> Token) -> Located Text -> Located Token
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Located Text
x)

-- | Emit a token using the current lexeme
textToken :: (Text -> Token) -> Action
textToken :: (Text -> Token) -> Action
textToken Text -> Token
f Located Text
x Context
_ = Located Token -> Outcome
EmitToken (Text -> Token
f (Text -> Token) -> Located Text -> Located Token
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Located Text
x)

-- | Attempt to parse the current lexeme as a date-time token.
timeValue ::
  ParseTime a =>
  String       {- ^ description for error messages -} ->
  [String]     {- ^ possible valid patterns        -} ->
  (a -> Token) {- ^ token constructor              -} ->
  Action
timeValue :: forall a.
ParseTime a =>
String -> [String] -> (a -> Token) -> Action
timeValue String
description [String]
patterns a -> Token
constructor (Located Position
p Text
str) Context
_ =
  case [Maybe a] -> Maybe a
forall (t :: * -> *) (f :: * -> *) a.
(Foldable t, Alternative f) =>
t (f a) -> f a
asum [Bool -> TimeLocale -> String -> String -> Maybe a
forall (m :: * -> *) t.
(MonadFail m, ParseTime t) =>
Bool -> TimeLocale -> String -> String -> m t
parseTimeM Bool
False TimeLocale
defaultTimeLocale String
pat (Text -> String
Text.unpack Text
str) | String
pat <- [String]
patterns] of
    Maybe a
Nothing -> Located String -> Outcome
LexerError (Position -> String -> Located String
forall a. Position -> a -> Located a
Located Position
p (String
"malformed " String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
description))
    Just a
t  -> Located Token -> Outcome
EmitToken (Position -> Token -> Located Token
forall a. Position -> a -> Located a
Located Position
p (a -> Token
constructor a
t))

-- | Pop the first character off a located string if it's not empty.
-- The resulting 'Int' will either be the ASCII value of the character
-- or @1@ for non-ASCII Unicode values. To avoid a clash, @\x1@ is
-- remapped to @0@.
locatedUncons :: Located Text -> Maybe (Int, Located Text)
locatedUncons :: Located Text -> Maybe (Int, Located Text)
locatedUncons Located { locPosition :: forall a. Located a -> Position
locPosition = Position
p, locThing :: forall a. Located a -> a
locThing = Text
str } =
  case Text -> Maybe (Char, Text)
Text.uncons Text
str of
    Maybe (Char, Text)
Nothing -> Maybe (Int, Located Text)
forall a. Maybe a
Nothing
    Just (Char
x, Text
xs)
      | Located Text
rest Located Text -> Bool -> Bool
forall a b. a -> b -> b
`seq` Bool
False -> Maybe (Int, Located Text)
forall a. HasCallStack => a
undefined
      | Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\1' -> (Int, Located Text) -> Maybe (Int, Located Text)
forall a. a -> Maybe a
Just (Int
0,     Located Text
rest)
      | Char -> Bool
isAscii Char
x -> (Int, Located Text) -> Maybe (Int, Located Text)
forall a. a -> Maybe a
Just (Char -> Int
ord Char
x, Located Text
rest)
      | Bool
otherwise -> (Int, Located Text) -> Maybe (Int, Located Text)
forall a. a -> Maybe a
Just (Int
1,     Located Text
rest)
      where
        rest :: Located Text
rest = Located { locPosition :: Position
locPosition = Char -> Position -> Position
move Char
x Position
p, locThing :: Text
locThing = Text
xs }

-- | Generate the correct terminating token given the current lexer state.
eofToken :: Context -> Located Text -> Either (Located String) (Located Token, Located Text)
eofToken :: Context
-> Located Text
-> Either (Located String) (Located Token, Located Text)
eofToken (MlBstrContext Position
p [Text]
_) Located Text
_ = Located String
-> Either (Located String) (Located Token, Located Text)
forall a b. a -> Either a b
Left (Position -> String -> Located String
forall a. Position -> a -> Located a
Located Position
p String
"unterminated multi-line basic string")
eofToken (BstrContext   Position
p [Text]
_) Located Text
_ = Located String
-> Either (Located String) (Located Token, Located Text)
forall a b. a -> Either a b
Left (Position -> String -> Located String
forall a. Position -> a -> Located a
Located Position
p String
"unterminated basic string")
eofToken (MlLstrContext Position
p [Text]
_) Located Text
_ = Located String
-> Either (Located String) (Located Token, Located Text)
forall a b. a -> Either a b
Left (Position -> String -> Located String
forall a. Position -> a -> Located a
Located Position
p String
"unterminated multi-line literal string")
eofToken (LstrContext   Position
p [Text]
_) Located Text
_ = Located String
-> Either (Located String) (Located Token, Located Text)
forall a b. a -> Either a b
Left (Position -> String -> Located String
forall a. Position -> a -> Located a
Located Position
p String
"unterminated literal string")
eofToken Context
_                   Located Text
t = (Located Token, Located Text)
-> Either (Located String) (Located Token, Located Text)
forall a b. b -> Either a b
Right (Token
TokEOF Token -> Located Text -> Located Token
forall a b. a -> Located b -> Located a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Located Text
t, Located Text
t)

failure :: String -> Action
failure :: String -> Action
failure String
err Located Text
t Context
_ = Located String -> Outcome
LexerError (String
err String -> Located Text -> Located String
forall a b. a -> Located b -> Located a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Located Text
t)

-- | Generate an error message given the current string being lexed.
mkError :: String -> String
mkError :: ShowS
mkError String
""    = String
"unexpected end-of-input"
mkError (Char
'\n':String
_) = String
"unexpected end-of-line"
mkError (Char
'\r':Char
'\n':String
_) = String
"unexpected end-of-line"
mkError (Char
x:String
_)
    | Char -> Bool
isControl Char
x = String
"control characters prohibited"
    | Bool
otherwise   = String
"unexpected " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Char -> String
forall a. Show a => a -> String
show Char
x