src/Curry/Syntax/Lexer.hs

{- |
    Module      :  $Header$
    Description :  A lexer for Curry
    Copyright   :  (c) 1999 - 2004 Wolfgang Lux
                       2005        Martin Engelke
                       2011 - 2013 Björn Peemöller
                       2016        Finn Teegen
                       2016        Jan Tikovsky
    License     :  BSD-3-clause

    Maintainer  :  bjp@informatik.uni-kiel.de
    Stability   :  experimental
    Portability :  portable
-}
module Curry.Syntax.Lexer
  ( -- * Data types for tokens
    Token (..), Category (..), Attributes (..)

    -- * lexing functions
  , lexSource, lexer, fullLexer
  ) where

import Prelude hiding (fail)
import Data.Char
  ( chr, ord, isAlpha, isAlphaNum, isDigit, isHexDigit, isOctDigit
  , isSpace, isUpper, toLower
  )
import Data.List (intercalate)
import qualified Data.Map as Map
  (Map, union, lookup, findWithDefault, fromList)

import Curry.Base.LexComb
import Curry.Base.Position
import Curry.Base.Span

-- ---------------------------------------------------------------------------
-- Tokens. Note that the equality and ordering instances of Token disregard
-- the attributes, as so that the parser decides about accepting a token
-- just by its category.
-- ---------------------------------------------------------------------------

-- |Data type for curry lexer tokens
data Token = Token Category Attributes

instance Eq Token where
  Token c1 _ == Token c2 _ = c1 == c2

instance Ord Token where
  Token c1 _ `compare` Token c2 _ = c1 `compare` c2

instance Symbol Token where
  isEOF (Token c _) = c == EOF

  dist _ (Token VSemicolon         _) = (0,  0)
  dist _ (Token VRightBrace        _) = (0,  0)
  dist _ (Token EOF                _) = (0,  0)
  dist _ (Token DotDot             _) = (0,  1)
  dist _ (Token DoubleColon        _) = (0,  1)
  dist _ (Token LeftArrow          _) = (0,  1)
  dist _ (Token RightArrow         _) = (0,  1)
  dist _ (Token DoubleArrow        _) = (0,  1)
  dist _ (Token KW_do              _) = (0,  1)
  dist _ (Token KW_if              _) = (0,  1)
  dist _ (Token KW_in              _) = (0,  1)
  dist _ (Token KW_of              _) = (0,  1)
  dist _ (Token Id_as              _) = (0,  1)
  dist _ (Token KW_let             _) = (0,  2)
  dist _ (Token PragmaEnd          _) = (0,  2)
  dist _ (Token KW_case            _) = (0,  3)
  dist _ (Token KW_class           _) = (0,  4)
  dist _ (Token KW_data            _) = (0,  3)
  dist _ (Token KW_default         _) = (0,  6)
  dist _ (Token KW_deriving        _) = (0,  7)
  dist _ (Token KW_else            _) = (0,  3)
  dist _ (Token KW_free            _) = (0,  3)
  dist _ (Token KW_then            _) = (0,  3)
  dist _ (Token KW_type            _) = (0,  3)
  dist _ (Token KW_fcase           _) = (0,  4)
  dist _ (Token KW_infix           _) = (0,  4)
  dist _ (Token KW_instance        _) = (0,  7)
  dist _ (Token KW_where           _) = (0,  4)
  dist _ (Token Id_ccall           _) = (0,  4)
  dist _ (Token KW_import          _) = (0,  5)
  dist _ (Token KW_infixl          _) = (0,  5)
  dist _ (Token KW_infixr          _) = (0,  5)
  dist _ (Token KW_module          _) = (0,  5)
  dist _ (Token Id_forall          _) = (0,  5)
  dist _ (Token Id_hiding          _) = (0,  5)
  dist _ (Token KW_newtype         _) = (0,  6)
  dist _ (Token KW_external        _) = (0,  7)
  dist _ (Token Id_interface       _) = (0,  8)
  dist _ (Token Id_primitive       _) = (0,  8)
  dist _ (Token Id_qualified       _) = (0,  8)
  dist _ (Token PragmaHiding       _) = (0,  9)
  dist _ (Token PragmaLanguage     _) = (0, 11)
  dist _ (Token Id                 a) = distAttr False a
  dist _ (Token QId                a) = distAttr False a
  dist _ (Token Sym                a) = distAttr False a
  dist _ (Token QSym               a) = distAttr False a
  dist _ (Token IntTok             a) = distAttr False a
  dist _ (Token FloatTok           a) = distAttr False a
  dist _ (Token CharTok            a) = distAttr False a
  dist c (Token StringTok          a) = updColDist c (distAttr False a)
  dist _ (Token LineComment        a) = distAttr True  a
  dist c (Token NestedComment      a) = updColDist c (distAttr True  a)
  dist _ (Token PragmaOptions      a) = let (ld, cd) = distAttr False a
                                        in  (ld, cd + 11)
  dist _ _                            = (0, 0)

-- TODO: Comment
updColDist :: Int -> Distance -> Distance
updColDist c (ld, cd) = (ld, if ld == 0 then cd else cd - c + 1)

distAttr :: Bool -> Attributes -> Distance
distAttr isComment attr = case attr of
  NoAttributes              -> (0, 0)
  CharAttributes     _ orig -> (0, length orig + 1)
  IntAttributes      _ orig -> (0, length orig - 1)
  FloatAttributes    _ orig -> (0, length orig - 1)
  StringAttributes   _ orig
      -- comment without surrounding quotes
    | isComment             -> (ld, cd)
      -- string with one ending double quote or two surrounding double quotes
      -- (column distance + 1 / + 2)
    | '\n' `elem` orig      -> (ld, cd + 1)
    | otherwise             -> (ld, cd + 2)
    where ld = length (filter    (== '\n') orig)
          cd = length (takeWhile (/= '\n') (reverse orig)) - 1
  IdentAttributes    mid i  -> (0, length (intercalate "." (mid ++ [i])) - 1)
  OptionsAttributes mt args -> case mt of
                                 Nothing -> (0, distArgs + 1)
                                 Just t  -> (0, length t + distArgs + 2)
    where distArgs = length args

-- |Category of curry tokens
data Category
  -- literals
  = CharTok
  | IntTok
  | FloatTok
  | StringTok

  -- identifiers
  | Id   -- identifier
  | QId  -- qualified identifier
  | Sym  -- symbol
  | QSym -- qualified symbol

  -- punctuation symbols
  | LeftParen     -- (
  | RightParen    -- )
  | Semicolon     -- ;
  | LeftBrace     -- {
  | RightBrace    -- }
  | LeftBracket   -- [
  | RightBracket  -- ]
  | Comma         -- ,
  | Underscore    -- _
  | Backquote     -- `

  -- layout
  | VSemicolon         -- virtual ;
  | VRightBrace        -- virtual }

  -- reserved keywords
  | KW_case
  | KW_class
  | KW_data
  | KW_default
  | KW_deriving
  | KW_do
  | KW_else
  | KW_external
  | KW_fcase
  | KW_free
  | KW_if
  | KW_import
  | KW_in
  | KW_infix
  | KW_infixl
  | KW_infixr
  | KW_instance
  | KW_let
  | KW_module
  | KW_newtype
  | KW_of
  | KW_then
  | KW_type
  | KW_where

  -- reserved operators
  | At           -- @
  | Colon        -- :
  | DotDot       -- ..
  | DoubleColon  -- ::
  | Equals       -- =
  | Backslash    -- \
  | Bar          -- |
  | LeftArrow    -- <-
  | RightArrow   -- ->
  | Tilde        -- ~
  | DoubleArrow  -- =>

  -- special identifiers
  | Id_as
  | Id_ccall
  | Id_forall
  | Id_hiding
  | Id_interface
  | Id_primitive
  | Id_qualified

  -- special operators
  | SymDot      -- .
  | SymMinus    -- -

  -- special symbols
  | SymStar -- kind star (*)

  -- pragmas
  | PragmaLanguage -- {-# LANGUAGE
  | PragmaOptions  -- {-# OPTIONS
  | PragmaHiding   -- {-# HIDING
  | PragmaMethod   -- {-# METHOD
  | PragmaModule   -- {-# MODULE
  | PragmaEnd      -- #-}


  -- comments (only for full lexer) inserted by men & bbr
  | LineComment
  | NestedComment

  -- end-of-file token
  | EOF
    deriving (Eq, Ord)

-- There are different kinds of attributes associated with the tokens.
-- Most attributes simply save the string corresponding to the token.
-- However, for qualified identifiers, we also record the list of module
-- qualifiers. The values corresponding to a literal token are properly
-- converted already. To simplify the creation and extraction of
-- attribute values, we make use of records.

-- |Attributes associated to a token
data Attributes
  = NoAttributes
  | CharAttributes    { cval     :: Char        , original :: String }
  | IntAttributes     { ival     :: Integer     , original :: String }
  | FloatAttributes   { fval     :: Double      , original :: String }
  | StringAttributes  { sval     :: String      , original :: String }
  | IdentAttributes   { modulVal :: [String]    , sval     :: String }
  | OptionsAttributes { toolVal  :: Maybe String, toolArgs :: String }

instance Show Attributes where
  showsPrec _ NoAttributes             = showChar '_'
  showsPrec _ (CharAttributes    cv _) = shows cv
  showsPrec _ (IntAttributes     iv _) = shows iv
  showsPrec _ (FloatAttributes   fv _) = shows fv
  showsPrec _ (StringAttributes  sv _) = shows sv
  showsPrec _ (IdentAttributes  mid i) = showsEscaped
                                       $ intercalate "." $ mid ++ [i]
  showsPrec _ (OptionsAttributes mt s) = showsTool mt
                                       . showChar ' ' . showString s
    where showsTool = maybe id (\t -> showChar '_' . showString t)


-- ---------------------------------------------------------------------------
-- The 'Show' instance of 'Token' is designed to display all tokens in their
-- source representation.
-- ---------------------------------------------------------------------------

showsEscaped :: String -> ShowS
showsEscaped s = showChar '`' . showString s . showChar '\''

showsIdent :: Attributes -> ShowS
showsIdent a = showString "identifier " . shows a

showsSpecialIdent :: String -> ShowS
showsSpecialIdent s = showString "identifier " . showsEscaped s

showsOperator :: Attributes -> ShowS
showsOperator a = showString "operator " . shows a

showsSpecialOperator :: String -> ShowS
showsSpecialOperator s = showString "operator " . showsEscaped s

instance Show Token where
  showsPrec _ (Token Id                 a) = showsIdent a
  showsPrec _ (Token QId                a) = showString "qualified "
                                           . showsIdent a
  showsPrec _ (Token Sym                a) = showsOperator a
  showsPrec _ (Token QSym               a) = showString "qualified "
                                           . showsOperator a
  showsPrec _ (Token IntTok             a) = showString "integer "   . shows a
  showsPrec _ (Token FloatTok           a) = showString "float "     . shows a
  showsPrec _ (Token CharTok            a) = showString "character " . shows a
  showsPrec _ (Token StringTok          a) = showString "string "    . shows a
  showsPrec _ (Token LeftParen          _) = showsEscaped "("
  showsPrec _ (Token RightParen         _) = showsEscaped ")"
  showsPrec _ (Token Semicolon          _) = showsEscaped ";"
  showsPrec _ (Token LeftBrace          _) = showsEscaped "{"
  showsPrec _ (Token RightBrace         _) = showsEscaped "}"
  showsPrec _ (Token LeftBracket        _) = showsEscaped "["
  showsPrec _ (Token RightBracket       _) = showsEscaped "]"
  showsPrec _ (Token Comma              _) = showsEscaped ","
  showsPrec _ (Token Underscore         _) = showsEscaped "_"
  showsPrec _ (Token Backquote          _) = showsEscaped "`"
  showsPrec _ (Token VSemicolon         _)
    = showsEscaped ";" . showString " (inserted due to layout)"
  showsPrec _ (Token VRightBrace        _)
    = showsEscaped "}" . showString " (inserted due to layout)"
  showsPrec _ (Token At                 _) = showsEscaped "@"
  showsPrec _ (Token Colon              _) = showsEscaped ":"
  showsPrec _ (Token DotDot             _) = showsEscaped ".."
  showsPrec _ (Token DoubleArrow        _) = showsEscaped "=>"
  showsPrec _ (Token DoubleColon        _) = showsEscaped "::"
  showsPrec _ (Token Equals             _) = showsEscaped "="
  showsPrec _ (Token Backslash          _) = showsEscaped "\\"
  showsPrec _ (Token Bar                _) = showsEscaped "|"
  showsPrec _ (Token LeftArrow          _) = showsEscaped "<-"
  showsPrec _ (Token RightArrow         _) = showsEscaped "->"
  showsPrec _ (Token Tilde              _) = showsEscaped "~"
  showsPrec _ (Token SymDot             _) = showsSpecialOperator "."
  showsPrec _ (Token SymMinus           _) = showsSpecialOperator "-"
  showsPrec _ (Token SymStar            _) = showsEscaped "*"
  showsPrec _ (Token KW_case            _) = showsEscaped "case"
  showsPrec _ (Token KW_class           _) = showsEscaped "class"
  showsPrec _ (Token KW_data            _) = showsEscaped "data"
  showsPrec _ (Token KW_default         _) = showsEscaped "default"
  showsPrec _ (Token KW_deriving        _) = showsEscaped "deriving"
  showsPrec _ (Token KW_do              _) = showsEscaped "do"
  showsPrec _ (Token KW_else            _) = showsEscaped "else"
  showsPrec _ (Token KW_external        _) = showsEscaped "external"
  showsPrec _ (Token KW_fcase           _) = showsEscaped "fcase"
  showsPrec _ (Token KW_free            _) = showsEscaped "free"
  showsPrec _ (Token KW_if              _) = showsEscaped "if"
  showsPrec _ (Token KW_import          _) = showsEscaped "import"
  showsPrec _ (Token KW_in              _) = showsEscaped "in"
  showsPrec _ (Token KW_infix           _) = showsEscaped "infix"
  showsPrec _ (Token KW_infixl          _) = showsEscaped "infixl"
  showsPrec _ (Token KW_infixr          _) = showsEscaped "infixr"
  showsPrec _ (Token KW_instance        _) = showsEscaped "instance"
  showsPrec _ (Token KW_let             _) = showsEscaped "let"
  showsPrec _ (Token KW_module          _) = showsEscaped "module"
  showsPrec _ (Token KW_newtype         _) = showsEscaped "newtype"
  showsPrec _ (Token KW_of              _) = showsEscaped "of"
  showsPrec _ (Token KW_then            _) = showsEscaped "then"
  showsPrec _ (Token KW_type            _) = showsEscaped "type"
  showsPrec _ (Token KW_where           _) = showsEscaped "where"
  showsPrec _ (Token Id_as              _) = showsSpecialIdent "as"
  showsPrec _ (Token Id_ccall           _) = showsSpecialIdent "ccall"
  showsPrec _ (Token Id_forall          _) = showsSpecialIdent "forall"
  showsPrec _ (Token Id_hiding          _) = showsSpecialIdent "hiding"
  showsPrec _ (Token Id_interface       _) = showsSpecialIdent "interface"
  showsPrec _ (Token Id_primitive       _) = showsSpecialIdent "primitive"
  showsPrec _ (Token Id_qualified       _) = showsSpecialIdent "qualified"
  showsPrec _ (Token PragmaLanguage     _) = showString "{-# LANGUAGE"
  showsPrec _ (Token PragmaOptions      a) = showString "{-# OPTIONS"
                                           . shows a
  showsPrec _ (Token PragmaHiding       _) = showString "{-# HIDING"
  showsPrec _ (Token PragmaMethod       _) = showString "{-# METHOD"
  showsPrec _ (Token PragmaModule       _) = showString "{-# MODULE"
  showsPrec _ (Token PragmaEnd          _) = showString "#-}"
  showsPrec _ (Token LineComment        a) = shows a
  showsPrec _ (Token NestedComment      a) = shows a
  showsPrec _ (Token EOF                _) = showString "<end-of-file>"

-- ---------------------------------------------------------------------------
-- The following functions can be used to construct tokens with
-- specific attributes.
-- ---------------------------------------------------------------------------

-- |Construct a simple 'Token' without 'Attributes'
tok :: Category -> Token
tok t = Token t NoAttributes

-- |Construct a 'Token' for a single 'Char'
charTok :: Char -> String -> Token
charTok c o = Token CharTok CharAttributes { cval = c, original = o }

-- |Construct a 'Token' for an int value
intTok :: Integer -> String -> Token
intTok base digits = Token IntTok IntAttributes
  { ival = convertIntegral base digits, original = digits }

-- |Construct a 'Token' for a float value
floatTok :: String -> String -> Int -> String -> Token
floatTok mant frac expo rest = Token FloatTok FloatAttributes
  { fval     = convertFloating mant frac expo
  , original = mant ++ "." ++ frac ++ rest }

-- |Construct a 'Token' for a string value
stringTok :: String -> String -> Token
stringTok cs s = Token StringTok StringAttributes { sval = cs, original = s }

-- |Construct a 'Token' for identifiers
idTok :: Category -> [String] -> String -> Token
idTok t mIdent ident = Token t
  IdentAttributes { modulVal = mIdent, sval = ident }

-- TODO
pragmaOptionsTok :: Maybe String -> String -> Token
pragmaOptionsTok mbTool s = Token PragmaOptions
  OptionsAttributes { toolVal = mbTool, toolArgs = s }

-- |Construct a 'Token' for a line comment
lineCommentTok :: String -> Token
lineCommentTok s = Token LineComment
  StringAttributes { sval = s, original = s }

-- |Construct a 'Token' for a nested comment
nestedCommentTok :: String -> Token
nestedCommentTok s = Token NestedComment
  StringAttributes { sval = s, original = s }

-- ---------------------------------------------------------------------------
-- Tables for reserved operators and identifiers
-- ---------------------------------------------------------------------------

-- |Map of reserved operators
reservedOps:: Map.Map String Category
reservedOps = Map.fromList
  [ ("@" , At         )
  , (":" , Colon      )
  , ("=>", DoubleArrow)
  , ("::", DoubleColon)
  , ("..", DotDot     )
  , ("=" , Equals     )
  , ("\\", Backslash  )
  , ("|" , Bar        )
  , ("<-", LeftArrow  )
  , ("->", RightArrow )
  , ("~" , Tilde      )
  ]

-- |Map of reserved and special operators
reservedSpecialOps :: Map.Map String Category
reservedSpecialOps = Map.union reservedOps $ Map.fromList
  [ ("." , SymDot     )
  , ("-" , SymMinus   )
  , ("*" , SymStar    )
  ]

-- |Map of keywords
keywords :: Map.Map String Category
keywords = Map.fromList
  [ ("case"    , KW_case    )
  , ("class"   , KW_class   )
  , ("data"    , KW_data    )
  , ("default" , KW_default )
  , ("deriving", KW_deriving)
  , ("do"      , KW_do      )
  , ("else"    , KW_else    )
  , ("external", KW_external)
  , ("fcase"   , KW_fcase   )
  , ("free"    , KW_free    )
  , ("if"      , KW_if      )
  , ("import"  , KW_import  )
  , ("in"      , KW_in      )
  , ("infix"   , KW_infix   )
  , ("infixl"  , KW_infixl  )
  , ("infixr"  , KW_infixr  )
  , ("instance", KW_instance)
  , ("let"     , KW_let     )
  , ("module"  , KW_module  )
  , ("newtype" , KW_newtype )
  , ("of"      , KW_of      )
  , ("then"    , KW_then    )
  , ("type"    , KW_type    )
  , ("where"   , KW_where   )
  ]

-- |Map of keywords and special identifiers
keywordsSpecialIds :: Map.Map String Category
keywordsSpecialIds = Map.union keywords $ Map.fromList
  [ ("as"       , Id_as       )
  , ("ccall"    , Id_ccall    )
  , ("forall"   , Id_forall   )
  , ("hiding"   , Id_hiding   )
  , ("interface", Id_interface)
  , ("primitive", Id_primitive)
  , ("qualified", Id_qualified)
  ]

pragmas :: Map.Map String Category
pragmas = Map.fromList
  [ ("language", PragmaLanguage)
  , ("options" , PragmaOptions )
  , ("hiding"  , PragmaHiding  )
  , ("method"  , PragmaMethod  )
  , ("module"  , PragmaModule  )
  ]


-- ---------------------------------------------------------------------------
-- Character classes
-- ---------------------------------------------------------------------------

-- |Check whether a 'Char' is allowed for identifiers
isIdentChar :: Char -> Bool
isIdentChar c = isAlphaNum c || c `elem` "'_"

-- |Check whether a 'Char' is allowed for symbols
isSymbolChar :: Char -> Bool
isSymbolChar c = c `elem` "~!@#$%^&*+-=<>:?./|\\"

-- ---------------------------------------------------------------------------
-- Lexing functions
-- ---------------------------------------------------------------------------

-- |Lex source code
lexSource :: FilePath -> String -> CYM [(Span, Token)]
lexSource = parse (applyLexer fullLexer)

-- |CPS-Lexer for Curry
lexer :: Lexer Token a
lexer = skipWhiteSpace True -- skip comments

-- |CPS-Lexer for Curry which also lexes comments.
-- This lexer is useful for documentation tools.
fullLexer :: Lexer Token a
fullLexer = skipWhiteSpace False -- lex comments

-- |Lex the source code and skip whitespaces
skipWhiteSpace :: Bool -> Lexer Token a
skipWhiteSpace skipComments suc fail = skip
  where
  skip sp   []              bol = suc sp (tok EOF)                   sp            [] bol
  skip sp c@('-':'-':_)     _   = lexLineComment     sucComment fail sp            c  True
  skip sp c@('{':'-':'#':_) bol = lexPragma noPragma suc        fail sp            c  bol
  skip sp c@('{':'-':_)     bol = lexNestedComment   sucComment fail sp            c  bol
  skip sp cs@(c:s)          bol
    | c == '\t'                = warnP sp "Tab character" skip       (tabSpan  sp) s  bol
    | c == '\n'                = skip                                (nlSpan   sp) s  True
    | isSpace c                = skip                                (nextSpan sp) s  bol
    | bol                      = lexBOL             suc        fail  sp            cs bol
    | otherwise                = lexToken           suc        fail  sp            cs bol
  sucComment = if skipComments then (\ _suc _fail -> skip) else suc
  noPragma   = lexNestedComment sucComment fail

-- Lex a line comment
lexLineComment :: Lexer Token a
lexLineComment suc _ sp str = case break (== '\n') str of
--   (_, []) -> fail p "Unterminated line comment" p                   []
  (c, s ) -> suc  sp (lineCommentTok c)          (incrSpan sp $ length c) s

lexPragma :: P a -> Lexer Token a
lexPragma noPragma suc fail sp0 str = pragma (incrSpan sp0 3) (drop 3 str)
  where
  skip = noPragma sp0 str
  pragma sp []         = fail sp0 "Unterminated pragma" sp []
  pragma sp cs@(c : s)
    | c == '\t' = pragma (tabSpan  sp) s
    | c == '\n' = pragma (nlSpan   sp) s
    | isSpace c = pragma (nextSpan sp) s
    | isAlpha c = case Map.lookup (map toLower prag) pragmas of
        Nothing            -> skip
        Just PragmaOptions -> lexOptionsPragma sp0 suc fail sp1 rest
        Just t             -> suc sp0 (tok t)               sp1 rest
    | otherwise = skip
    where
    (prag, rest) = span isAlphaNum cs
    sp1          = incrSpan sp (length prag)

lexOptionsPragma :: Span -> Lexer Token a
lexOptionsPragma sp0 _   fail sp [] = fail sp0 "Unterminated Options pragma" sp []
lexOptionsPragma sp0 suc fail sp (c : s)
  | c == '\t' = lexArgs Nothing (tabSpan  sp) s
  | c == '\n' = lexArgs Nothing (nlSpan   sp) s
  | isSpace c = lexArgs Nothing (nextSpan sp) s
  | c == '_'  = let (tool, s1) = span isIdentChar s
                in  lexArgs (Just tool) (incrSpan sp (length tool + 1)) s1
  | otherwise = fail sp0 "Malformed Options pragma" sp s
  where
  lexArgs mbTool = lexRaw ""
    where
    lexRaw s0 sp1 r = case hash of
      []            -> fail sp0 "End-of-file inside pragma" (incrSpan sp1 len) []
      '#':'-':'}':_ -> token  (trim $ s0 ++ opts) (incrSpan sp1 len)       hash
      _             -> lexRaw (s0 ++ opts ++ "#") (incrSpan sp1 (len + 1)) (drop 1 hash)
      where
      (opts, hash) = span (/= '#') r
      len = length opts
      token = suc sp0 . pragmaOptionsTok mbTool
      trim = reverse . dropWhile isSpace . reverse . dropWhile isSpace

-- Lex a nested comment
lexNestedComment :: Lexer Token a
lexNestedComment suc fail sp0 = lnc (0 :: Integer) id sp0
  where
  -- d   : nesting depth
  -- comm: comment already lexed as functional list
  lnc d comm sp str = case (d, str) of
    (_,        []) -> fail sp0    "Unterminated nested comment"  sp          []
    (1, '-':'}':s) -> suc  sp0    (nestedCommentTok (comm "-}")) (incrSpan sp 2) s
    (_, '{':'-':s) -> cont (d+1) ("{-" ++)                       (incrSpan sp 2) s
    (_, '-':'}':s) -> cont (d-1) ("-}" ++)                       (incrSpan sp 2) s
    (_, c@'\t' :s) -> cont d     (c:)                            (tabSpan    sp) s
    (_, c@'\n' :s) -> cont d     (c:)                            (nlSpan     sp) s
    (_, c      :s) -> cont d     (c:)                            (nextSpan   sp) s
    where cont d' comm' = lnc d' (comm . comm')

-- Lex tokens at the beginning of a line, managing layout.
lexBOL :: Lexer Token a
lexBOL suc fail sp s _ []            = lexToken suc fail sp s False []
lexBOL suc fail sp s _ ctxt@(n:rest)
  | col <  n  = suc sp (tok VRightBrace) sp s True  rest
  | col == n  = suc sp (tok  VSemicolon) sp s False ctxt
  | otherwise = lexToken suc fail        sp s False ctxt
  where col = column (span2Pos sp)

-- Lex a single 'Token'
lexToken :: Lexer Token a
lexToken suc _    sp []       = suc sp (tok EOF) sp []
lexToken suc fail sp cs@(c:s)
  | take 3 cs == "#-}" = suc sp (tok PragmaEnd) (incrSpan sp 3) (drop 3 cs)
  | c == '('           = token LeftParen
  | c == ')'           = token RightParen
  | c == ','           = token Comma
  | c == ';'           = token Semicolon
  | c == '['           = token LeftBracket
  | c == ']'           = token RightBracket
  | c == '_'           = token Underscore
  | c == '`'           = token Backquote
  | c == '{'           = token LeftBrace
  | c == '}'           = lexRightBrace (suc sp) (nextSpan sp) s
  | c == '\''          = lexChar   sp suc fail  (nextSpan sp) s
  | c == '\"'          = lexString sp suc fail  (nextSpan sp) s
  | isAlpha      c     = lexIdent      (suc sp) sp            cs
  | isSymbolChar c     = lexSymbol     (suc sp) sp            cs
  | isDigit      c     = lexNumber     (suc sp) sp            cs
  | otherwise          = fail sp ("Illegal character " ++ show c) sp s
  where token t = suc sp (tok t) (nextSpan sp) s

-- Lex a right brace and pop from the context stack
lexRightBrace :: (Token -> P a) -> P a
lexRightBrace cont sp s bol ctxt = cont (tok RightBrace) sp s bol (drop 1 ctxt)

-- Lex an identifier
lexIdent :: (Token -> P a) -> P a
lexIdent cont sp s = maybe (lexOptQual cont (token Id) [ident]) (cont . token)
                          (Map.lookup ident keywordsSpecialIds)
                          (incrSpan sp $ length ident) rest
  where (ident, rest) = span isIdentChar s
        token t       = idTok t [] ident

-- Lex a symbol
lexSymbol :: (Token -> P a) -> P a
lexSymbol cont sp s = cont
  (idTok (Map.findWithDefault Sym sym reservedSpecialOps) [] sym)
  (incrSpan sp $ length sym) rest
  where (sym, rest) = span isSymbolChar s

-- Lex an optionally qualified entity (identifier or symbol).
lexOptQual :: (Token -> P a) -> Token -> [String] -> P a
lexOptQual cont token mIdent sp cs@('.':c:s)
  | isAlpha  c       = lexQualIdent     cont identCont mIdent (nextSpan sp) (c:s)
  | isSymbolChar c   = lexQualSymbol    cont identCont mIdent (nextSpan sp) (c:s)
--   | c `elem` ":[("   = lexQualPrimitive cont token     mIdent (nextSpan sp) (c:s)
  where identCont _ _ = cont token sp cs
lexOptQual cont token _      sp cs = cont token sp cs

-- Lex a qualified identifier.
lexQualIdent :: (Token -> P a) -> P a -> [String] -> P a
lexQualIdent cont identCont mIdent sp s =
  maybe (lexOptQual cont (idTok QId mIdent ident) (mIdent ++ [ident]))
        (const identCont)
        (Map.lookup ident keywords)
        (incrSpan sp (length ident)) rest
  where (ident, rest) = span isIdentChar s

-- Lex a qualified symbol.
lexQualSymbol :: (Token -> P a) -> P a -> [String] -> P a
lexQualSymbol cont identCont mIdent sp s =
  maybe (cont (idTok QSym mIdent sym)) (const identCont)
        (Map.lookup sym reservedOps)
        (incrSpan sp (length sym)) rest
  where (sym, rest) = span isSymbolChar s

-- ---------------------------------------------------------------------------
-- /Note:/ since Curry allows an unlimited range of integer numbers,
-- read numbers must be converted to Haskell type 'Integer'.
-- ---------------------------------------------------------------------------

-- Lex a numeric literal.
lexNumber :: (Token -> P a) -> P a
lexNumber cont sp ('0':c:s)
  | c `elem` "bB"  = lexBinary      cont nullCont (incrSpan sp 2) s
  | c `elem` "oO"  = lexOctal       cont nullCont (incrSpan sp 2) s
  | c `elem` "xX"  = lexHexadecimal cont nullCont (incrSpan sp 2) s
  where nullCont _ _ = cont (intTok 10 "0") (nextSpan sp) (c:s)
lexNumber cont sp s = lexOptFraction cont (intTok 10 digits) digits
                     (incrSpan sp $ length digits) rest
  where (digits, rest) = span isDigit s

-- Lex a binary literal.
lexBinary :: (Token -> P a) -> P a -> P a
lexBinary cont nullCont sp s
  | null digits = nullCont undefined undefined
  | otherwise   = cont (intTok 2 digits) (incrSpan sp $ length digits) rest
  where (digits, rest) = span isBinDigit s
        isBinDigit c   = c >= '0' && c <= '1'

-- Lex an octal literal.
lexOctal :: (Token -> P a) -> P a -> P a
lexOctal cont nullCont sp s
  | null digits = nullCont undefined undefined
  | otherwise   = cont (intTok 8 digits) (incrSpan sp $ length digits) rest
  where (digits, rest) = span isOctDigit s

-- Lex a hexadecimal literal.
lexHexadecimal :: (Token -> P a) -> P a -> P a
lexHexadecimal cont nullCont sp s
  | null digits = nullCont undefined undefined
  | otherwise   = cont (intTok 16 digits) (incrSpan sp $ length digits) rest
  where (digits, rest) = span isHexDigit s

-- Lex an optional fractional part (float literal).
lexOptFraction :: (Token -> P a) -> Token -> String -> P a
lexOptFraction cont _ mant sp ('.':c:s)
  | isDigit c = lexOptExponent cont (floatTok mant frac 0 "") mant frac
                               (incrSpan sp (length frac+1)) rest
  where (frac,rest) = span isDigit (c:s)
lexOptFraction cont token mant sp (c:s)
  | c `elem` "eE" = lexSignedExponent cont intCont mant "" [c] (nextSpan sp) s
  where intCont _ _ = cont token sp (c:s)
lexOptFraction cont token _ sp s = cont token sp s

-- Lex an optional exponent (float literal).
lexOptExponent :: (Token -> P a) -> Token -> String -> String -> P a
lexOptExponent cont token mant frac sp (c:s)
  | c `elem` "eE" = lexSignedExponent cont floatCont mant frac [c] (nextSpan sp) s
  where floatCont _ _ = cont token sp (c:s)
lexOptExponent cont token _    _    sp s = cont token sp s

-- Lex an exponent with sign (float literal).
lexSignedExponent :: (Token -> P a) -> P a -> String -> String -> String
                  -> P a
lexSignedExponent cont floatCont mant frac e sp str = case str of
  ('+':c:s) | isDigit c -> lexExpo (e ++ "+") id     (nextSpan sp) (c:s)
  ('-':c:s) | isDigit c -> lexExpo (e ++ "-") negate (nextSpan sp) (c:s)
  (c:_)     | isDigit c -> lexExpo e          id     sp            str
  _                     -> floatCont                 sp            str
  where lexExpo = lexExponent cont mant frac

-- Lex an exponent without sign (float literal).
lexExponent :: (Token -> P a) -> String -> String -> String -> (Int -> Int)
            -> P a
lexExponent cont mant frac e expSign sp s =
  cont (floatTok mant frac expo (e ++ digits)) (incrSpan sp $ length digits) rest
  where (digits, rest) = span isDigit s
        expo           = expSign (convertIntegral 10 digits)

-- Lex a character literal.
lexChar :: Span -> Lexer Token a
lexChar sp0 _       fail sp []    = fail sp0 "Illegal character constant" sp []
lexChar sp0 success fail sp (c:s)
  | c == '\\' = lexEscape sp (\d o -> lexCharEnd d o sp0 success fail)
                          fail (nextSpan sp) s
  | c == '\n' = fail sp0 "Illegal character constant" sp (c:s)
  | c == '\t' = lexCharEnd c "\t" sp0 success fail (tabSpan  sp) s
  | otherwise = lexCharEnd c [c]  sp0 success fail (nextSpan sp) s

-- Lex the end of a character literal.
lexCharEnd :: Char -> String -> Span -> Lexer Token a
lexCharEnd c o sp0 suc _    sp ('\'':s) = suc sp0 (charTok c o) (nextSpan sp) s
lexCharEnd _ _ sp0 _   fail sp s        =
  fail sp0 "Improperly terminated character constant" sp s

-- Lex a String literal.
lexString :: Span -> Lexer Token a
lexString sp0 suc fail = lexStringRest "" id
  where
  lexStringRest _  _  sp []    = improperTermination sp
  lexStringRest s0 so sp (c:s)
    | c == '\n' = improperTermination sp
    | c == '\"' = suc sp0 (stringTok (reverse s0) (so "")) (nextSpan sp) s
    | c == '\\' = lexStringEscape sp s0 so lexStringRest fail (nextSpan sp) s
    | c == '\t' = lexStringRest (c:s0) (so . (c:)) (tabSpan  sp) s
    | otherwise = lexStringRest (c:s0) (so . (c:)) (nextSpan sp) s
  improperTermination sp = fail sp0 "Improperly terminated string constant" sp []

-- Lex an escaped character inside a string.
lexStringEscape ::  Span -> String -> (String -> String)
                -> (String -> (String -> String) -> P a)
                -> FailP a -> P a
lexStringEscape sp0 _  _  _   fail sp []      = lexEscape sp0 undefined fail sp []
lexStringEscape sp0 s0 so suc fail sp cs@(c:s)
    -- The escape sequence represents an empty character of length zero
  | c == '&'  = suc s0 (so . ("\\&" ++)) (nextSpan sp) s
  | isSpace c = lexStringGap so (suc s0) fail sp cs
  | otherwise = lexEscape sp0 (\ c' s' -> suc (c': s0) (so . (s' ++))) fail sp cs

-- Lex a string gap.
lexStringGap :: (String -> String) -> ((String -> String) -> P a)
             -> FailP a -> P a
lexStringGap _  _   fail sp []    = fail sp "End-of-file in string gap" sp []
lexStringGap so suc fail sp (c:s)
  | c == '\\' = suc          (so . (c:))          (nextSpan sp) s
  | c == '\t' = lexStringGap (so . (c:)) suc fail (tabSpan  sp) s
  | c == '\n' = lexStringGap (so . (c:)) suc fail (nlSpan   sp) s
  | isSpace c = lexStringGap (so . (c:)) suc fail (nextSpan sp) s
  | otherwise = fail sp ("Illegal character in string gap: " ++ show c) sp s

-- Lex an escaped character.
lexEscape :: Span -> (Char -> String -> P a) -> FailP a -> P a
lexEscape sp0 suc fail sp str = case str of
  -- character escape
  ('a' :s) -> suc '\a' "\\a"  (nextSpan sp) s
  ('b' :s) -> suc '\b' "\\b"  (nextSpan sp) s
  ('f' :s) -> suc '\f' "\\f"  (nextSpan sp) s
  ('n' :s) -> suc '\n' "\\n"  (nextSpan sp) s
  ('r' :s) -> suc '\r' "\\r"  (nextSpan sp) s
  ('t' :s) -> suc '\t' "\\t"  (nextSpan sp) s
  ('v' :s) -> suc '\v' "\\v"  (nextSpan sp) s
  ('\\':s) -> suc '\\' "\\\\" (nextSpan sp) s
  ('"' :s) -> suc '\"' "\\\"" (nextSpan sp) s
  ('\'':s) -> suc '\'' "\\\'" (nextSpan sp) s
  -- control characters
  ('^':c:s) | isControlEsc c -> controlEsc c (incrSpan sp 2) s
  -- numeric escape
  ('o':c:s) | isOctDigit c   -> numEsc  8 isOctDigit ("\\o" ++) (nextSpan sp) (c:s)
  ('x':c:s) | isHexDigit c   -> numEsc 16 isHexDigit ("\\x" ++) (nextSpan sp) (c:s)
  (c:s)     | isDigit    c   -> numEsc 10 isDigit    ("\\"  ++) sp            (c:s)
  -- ascii escape
  _        -> asciiEscape sp0 suc fail sp str
  where numEsc         = numEscape sp0 suc fail
        controlEsc   c = suc (chr (ord c `mod` 32)) ("\\^" ++ [c])
        isControlEsc c = isUpper c || c `elem` "@[\\]^_"

numEscape :: Span -> (Char -> String -> P a) -> FailP a -> Int
          -> (Char -> Bool) -> (String -> String) -> P a
numEscape sp0 suc fail b isDigit' so sp s
  | n >= ord minBound && n <= ord maxBound
   = suc (chr n) (so digits) (incrSpan sp $ length digits) rest
  | otherwise
  = fail sp0 "Numeric escape out-of-range" sp s
  where (digits, rest) = span isDigit' s
        n = convertIntegral b digits

asciiEscape :: Span -> (Char -> String -> P a) -> FailP a -> P a
asciiEscape sp0 suc fail sp str = case str of
  ('N':'U':'L':s) -> suc '\NUL' "\\NUL" (incrSpan sp 3) s
  ('S':'O':'H':s) -> suc '\SOH' "\\SOH" (incrSpan sp 3) s
  ('S':'T':'X':s) -> suc '\STX' "\\STX" (incrSpan sp 3) s
  ('E':'T':'X':s) -> suc '\ETX' "\\ETX" (incrSpan sp 3) s
  ('E':'O':'T':s) -> suc '\EOT' "\\EOT" (incrSpan sp 3) s
  ('E':'N':'Q':s) -> suc '\ENQ' "\\ENQ" (incrSpan sp 3) s
  ('A':'C':'K':s) -> suc '\ACK' "\\ACK" (incrSpan sp 3) s
  ('B':'E':'L':s) -> suc '\BEL' "\\BEL" (incrSpan sp 3) s
  ('B':'S'    :s) -> suc '\BS'  "\\BS"  (incrSpan sp 2) s
  ('H':'T'    :s) -> suc '\HT'  "\\HT"  (incrSpan sp 2) s
  ('L':'F'    :s) -> suc '\LF'  "\\LF"  (incrSpan sp 2) s
  ('V':'T'    :s) -> suc '\VT'  "\\VT"  (incrSpan sp 2) s
  ('F':'F'    :s) -> suc '\FF'  "\\FF"  (incrSpan sp 2) s
  ('C':'R'    :s) -> suc '\CR'  "\\CR"  (incrSpan sp 2) s
  ('S':'O'    :s) -> suc '\SO'  "\\SO"  (incrSpan sp 2) s
  ('S':'I'    :s) -> suc '\SI'  "\\SI"  (incrSpan sp 2) s
  ('D':'L':'E':s) -> suc '\DLE' "\\DLE" (incrSpan sp 3) s
  ('D':'C':'1':s) -> suc '\DC1' "\\DC1" (incrSpan sp 3) s
  ('D':'C':'2':s) -> suc '\DC2' "\\DC2" (incrSpan sp 3) s
  ('D':'C':'3':s) -> suc '\DC3' "\\DC3" (incrSpan sp 3) s
  ('D':'C':'4':s) -> suc '\DC4' "\\DC4" (incrSpan sp 3) s
  ('N':'A':'K':s) -> suc '\NAK' "\\NAK" (incrSpan sp 3) s
  ('S':'Y':'N':s) -> suc '\SYN' "\\SYN" (incrSpan sp 3) s
  ('E':'T':'B':s) -> suc '\ETB' "\\ETB" (incrSpan sp 3) s
  ('C':'A':'N':s) -> suc '\CAN' "\\CAN" (incrSpan sp 3) s
  ('E':'M'    :s) -> suc '\EM'  "\\EM"  (incrSpan sp 2) s
  ('S':'U':'B':s) -> suc '\SUB' "\\SUB" (incrSpan sp 3) s
  ('E':'S':'C':s) -> suc '\ESC' "\\ESC" (incrSpan sp 3) s
  ('F':'S'    :s) -> suc '\FS'  "\\FS"  (incrSpan sp 2) s
  ('G':'S'    :s) -> suc '\GS'  "\\GS"  (incrSpan sp 2) s
  ('R':'S'    :s) -> suc '\RS'  "\\RS"  (incrSpan sp 2) s
  ('U':'S'    :s) -> suc '\US'  "\\US"  (incrSpan sp 2) s
  ('S':'P'    :s) -> suc '\SP'  "\\SP"  (incrSpan sp 2) s
  ('D':'E':'L':s) -> suc '\DEL' "\\DEL" (incrSpan sp 3) s
  s               -> fail sp0 "Illegal escape sequence" sp s