src/full/Agda/Syntax/Parser/StringLiterals.hs


{-| The code to lex string and character literals. Basically the same code
    as in GHC.
-}
module Agda.Syntax.Parser.StringLiterals
    ( litString, litChar
    ) where

import Control.Monad.State
import Data.Char

import Agda.Syntax.Parser.Alex
import Agda.Syntax.Parser.Monad
import Agda.Syntax.Parser.Tokens
import Agda.Syntax.Parser.LookAhead
import Agda.Syntax.Position
import Agda.Syntax.Literal

import Agda.Utils.Char   ( decDigit, hexDigit, octDigit )
import Agda.Utils.Tuple  ( (-*-) )

{--------------------------------------------------------------------------
    Exported actions
 --------------------------------------------------------------------------}

-- | Lex a string literal. Assumes that a double quote has been lexed.
litString :: LexAction Token
litString = stringToken '"' (\i s ->
              return $ TokLiteral $ LitString (getRange i) s)

{-| Lex a character literal. Assumes that a single quote has been lexed.  A
    character literal is lexed in exactly the same way as a string literal.
    Only before returning the token do we check that the lexed string is of
    length 1. This is maybe not the most efficient way of doing things, but on
    the other hand it will only be inefficient if there is a lexical error.
-}
litChar :: LexAction Token
litChar = stringToken '\'' $ \i s ->
	    do	case s of
		    [c]	-> return $ TokLiteral $ LitChar (getRange i) c
		    _	-> lexError
			    "character literal must contain a single character"


{--------------------------------------------------------------------------
    Errors
 --------------------------------------------------------------------------}

-- | Custom error function.
litError :: String -> LookAhead a
litError msg =
    do	sync
	liftP $ lexError $
	    "Lexical error in string or character literal: " ++ msg


{--------------------------------------------------------------------------
    The meat
 --------------------------------------------------------------------------}

-- | The general function to lex a string or character literal token. The
--   character argument is the delimiter (@\"@ for strings and @\'@ for
--   characters).
stringToken :: Char -> (Interval -> String -> Parser tok) -> LexAction tok
stringToken del mkTok inp inp' n =
    do	setLastPos (backupPos $ lexPos inp')
        setLexInput inp'
        -- TODO: Should setPrevToken be run here? Compare with
        -- Agda.Syntax.Parser.LexActions.token.
	tok <- runLookAhead litError $ lexString del ""
	i   <- getParseInterval
	mkTok i tok


-- | This is where the work happens. The string argument is an accumulating
--   parameter for the string being lexed.
lexString :: Char -> String -> LookAhead String
lexString del s =

    do	c <- nextChar
	case c of

	    c | c == del  -> sync >> return (reverse s)

	    '\\' ->
		do  c' <- nextChar
		    case c' of
			'&'		-> sync >> lexString del s
			c | isSpace c	-> sync >> lexStringGap del s
			_		-> normalChar

	    _ -> normalChar
    where
	normalChar =
	    do	rollback
		c <- lexChar
		lexString del (c:s)


-- | A string gap consists of whitespace (possibly including line breaks)
--   enclosed in backslashes. The gap is not part of the resulting string.
lexStringGap :: Char -> String -> LookAhead String
lexStringGap del s =
    do	c <- eatNextChar
	case c of
	    '\\'	    -> lexString del s
	    c | isSpace c   -> lexStringGap del s
	    _		    -> fail "non-space in string gap"

-- | Lex a single character.
lexChar :: LookAhead Char
lexChar =
    do	c <- eatNextChar
	case c of
	    '\\'    -> lexEscape
	    _	    -> return c

-- | Lex an escaped character. Assumes the backslash has been lexed.
lexEscape :: LookAhead Char
lexEscape =
    do	c <- eatNextChar
	case c of
	    '^'	    -> do c <- eatNextChar
			  if c >= '@' && c <= '_'
			    then return (chr (ord c - ord '@'))
			    else fail "invalid control character"

	    'x'	    -> readNum isHexDigit 16 hexDigit
	    'o'	    -> readNum isOctDigit  8 octDigit
	    x | isDigit x
		    -> readNumAcc isDigit 10 decDigit (decDigit x)

	    c ->
		-- Try to match the input (starting with c) against the
		-- silly escape codes.
		do  esc <- match' c (map (id -*- return) sillyEscapeChars)
				    (fail "bad escape code")
		    sync
		    return esc

-- | Read a number in the specified base.
readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> LookAhead Char
readNum isDigit base conv =
    do	c <- eatNextChar
	if isDigit c
	    then readNumAcc isDigit base conv (conv c)
	    else fail "non-digit in numeral"

-- | Same as 'readNum' but with an accumulating parameter.
readNumAcc :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> LookAhead Char
readNumAcc isDigit base conv i = scan i
    where
	scan i =
	    do	inp <- getInput
		c   <- nextChar
		case c of
		    c | isDigit c -> scan (i*base + conv c)
		    _		  ->
			do  setInput inp
			    sync
			    if i >= ord minBound && i <= ord maxBound
				then return (chr i)
				else fail "character literal out of bounds"

-- | The escape codes.
sillyEscapeChars :: [(String, Char)]
sillyEscapeChars =
    [ ("a", '\a')
    , ("b", '\b')
    , ("f", '\f')
    , ("n", '\n')
    , ("r", '\r')
    , ("t", '\t')
    , ("v", '\v')
    , ("\\", '\\')
    , ("\"", '\"')
    , ("'", '\'')
    , ("NUL", '\NUL')
    , ("SOH", '\SOH')
    , ("STX", '\STX')
    , ("ETX", '\ETX')
    , ("EOT", '\EOT')
    , ("ENQ", '\ENQ')
    , ("ACK", '\ACK')
    , ("BEL", '\BEL')
    , ("BS", '\BS')
    , ("HT", '\HT')
    , ("LF", '\LF')
    , ("VT", '\VT')
    , ("FF", '\FF')
    , ("CR", '\CR')
    , ("SO", '\SO')
    , ("SI", '\SI')
    , ("DLE", '\DLE')
    , ("DC1", '\DC1')
    , ("DC2", '\DC2')
    , ("DC3", '\DC3')
    , ("DC4", '\DC4')
    , ("NAK", '\NAK')
    , ("SYN", '\SYN')
    , ("ETB", '\ETB')
    , ("CAN", '\CAN')
    , ("EM", '\EM')
    , ("SUB", '\SUB')
    , ("ESC", '\ESC')
    , ("FS", '\FS')
    , ("GS", '\GS')
    , ("RS", '\RS')
    , ("US", '\US')
    , ("SP", '\SP')
    , ("DEL", '\DEL')
    ]