--  C->Haskell Compiler: Lexer for CHS Files
--
--  Author : Manuel M T Chakravarty
--  Created: 13 August 99
--
--  Copyright (c) [1999..2005] Manuel M T Chakravarty
--
--  This file is free software; you can redistribute it and/or modify
--  it under the terms of the GNU General Public License as published by
--  the Free Software Foundation; either version 2 of the License, or
--  (at your option) any later version.
--
--  This file is distributed in the hope that it will be useful,
--  but WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
--  GNU General Public License for more details.
--
--- DESCRIPTION ---------------------------------------------------------------
--
--  Lexer for CHS files; the tokens are only partially recognised.
--
--- DOCU ----------------------------------------------------------------------
--
--  language: Haskell 98
--
--  * CHS files are assumed to be Haskell 98 files that include C2HS binding
--    hooks.
--
--  * Haskell code is not tokenised, but binding hooks (delimited by `{#'and
--    `#}') are analysed.  Therefore the lexer operates in two states
--    (realised as two lexer coupled by meta actions) depending on whether
--    Haskell code or a binding hook is currently read.  The lexer reading
--    Haskell code is called `base lexer'; the other one, `binding-hook
--    lexer'.  In addition, there is a inline-c lexer, which, as the
--    binding-hook lexer, can be triggered from the base lexer.
--
--  * Base lexer:
--
--      haskell -> (inline \\ special)*
--               | special \\ `"'
--               | comment
--               | nested
--               | hstring
--               | '{#'
--               | cpp
--      special -> `(' | `{' | `-' | `"'
--      ctrl    -> `\n' | `\f' | `\r' | `\t' | `\v'
--
--      inline  -> any \\ ctrl
--      any     -> '\0'..'\255'
--
--    Within the base lexer control codes appear as separate tokens in the
--    token list.
--
--    NOTE: It is important that `{' is an extra lexeme and not added as an
--          optional component at the end of the first alternative for
--          `haskell'.  Otherwise, the principle of the longest match will
--          divide `foo {#' into the tokens `foo {' and `#' instead of `foo '
--          and `{#'.
--
--    One line comments are handled by
--
--      comment -> `--' (any \\ `\n')* `\n'
--
--    and nested comments by
--
--      nested -> `{-' any* `-}'
--
--    where `any*' may contain _balanced_ occurrences of `{-' and `-}'.
--
--      hstring -> `"' inhstr* `"'
--      inhstr  -> ` '..`\127' \\ `"'
--               | `\"'
--
--    Pre-precessor directives as well as the switch to inline-C code are
--    formed as follows:
--
--      cpp     -> `\n#' (inline | `\t')* `\n'
--               | `\n#c' (' ' | '\t')* `\n'
--
--    We allow whitespace between the `#' and the actual directive, but in `#c'
--    and `#endc' the directive must immediately follow the `#'.  This might
--    be regarded as a not entirely orthogonal design, but simplifies matters
--    especially for `#endc'.
--
--  * On encountering the lexeme `{#', a meta action in the base lexer
--    transfers control to the following binding-hook lexer:
--
--      ident       -> letter (letter | digit | `\'')*
--      cidenttail  -> digit (letter | digit)*
--      reservedid  -> `add' | `as' | `call' | `class' | `context' | `deriving'
--                   | `enum' | `foreign' | `fun' | `get' | `lib'
--                   | `downcaseFirstLetter' | `finalizer'
--                   | `newtype' | `nocode' | `pointer' | `prefix' | `pure'
--                   | `set' | `sizeof' | `stable' | `struct' | `type'
--                   | `underscoreToCase' | `upcaseFirstLetter' | `unsafe' |
--                   | `with' | `const' | `omit'
--      reservedsym -> `{#' | `#}' | `{' | `}' | `,' | `.' | `->' | `='
--                   | `=>' | '-' | `*' | `&' | `^' | `+' | `%'
--      string      -> `"' instr* `"'
--      verbhs      -> `\`' inhsverb* `\''
--      quoths      -> `\'' inhsverb* `\''
--      instr       -> ` '..`\127' \\ `"'
--      inhsverb    -> ` '..`\127' \\ `\''
--      comment     -> `--' (any \\ `\n')* `\n'
--
--    Control characters, white space, and comments are discarded in the
--    binding-hook lexer.  Nested comments are not allowed in a binding hook.
--    Identifiers can be enclosed in single quotes to avoid collision with
--    C->Haskell keywords, and in this case quoted part could also be followed by
--    what is assumed to be a valid Haskell code, which would be transferred in the
--    output file verbatim.
--
--  * In the binding-hook lexer, the lexeme `#}' transfers control back to the
--    base lexer.  An occurence of the lexeme `{#' inside the binding-hook
--    lexer triggers an error.  The symbol `{#' is not explcitly represented
--    in the resulting token stream.  However, the occurrence of a token
--    representing one of the reserved identifiers `call', `context', `enum',
--    and `field' marks the start of a binding hook.  Strictly speaking, `#}'
--    need also not occur in the token stream, as the next `haskell' token
--    marks a hook's end.  It is, however, useful for producing accurate error
--    messages (in case an hook is closed to early) to have a token
--    representing `#}'.
--
--  * The rule `ident' describes Haskell identifiers, but without
--    distinguishing between variable and constructor identifers (ie, those
--    starting with a lowercase and those starting with an uppercase letter).
--    However, we use it also to scan C identifiers; although, strictly
--    speaking, it is too general for them.  In the case of C identifiers,
--    this should not have any impact on the range of descriptions accepted by
--    the tool, as illegal identifier will never occur in a C header file that
--    is accepted by the C lexer.  In the case of Haskell identifiers, a
--    confusion between variable and constructor identifiers will be noted by
--    the Haskell compiler translating the code generated by c2hs.  Moreover,
--    identifiers that are inside quoted parts (see above) may not contain apostrophes.
--
--  * Any line starting with the character `#' is regarded to be a C
--    preprocessor directive.  With the exception of `#c' and `#endc', which
--    delimit a set of lines containing inline C code.  Hence, in the base
--    lexer, the lexeme `#c' triggers a meta action transferring control to the
--    following inline-C lexer:
--
--      c  -> inline* \\ `\n#endc'
--
--    We do neither treat C strings nor C comments specially.  Hence, if the
--    string "\n#endc" occurs in a comment, we will mistakenly regard it as
--    the end of the inline C code.  Note that the problem cannot happen with
--    strings, as C does not permit strings that extend over multiple lines.
--    At the moment, it just seems not to be worth the effort required to
--    treat this situation more accurately.
--
--    The inline-C lexer also doesn't handle pre-processor directives
--    specially.  Hence, structural pre-processor directives (namely,
--    conditionals) may occur within inline-C code only properly nested.
--
--  Shortcomings
--  ~~~~~~~~~~~~
--  Some lexemes that include single and double quote characters are not lexed
--  correctly.  See the implementation comment at `haskell' for details.
--
--
--- TODO ----------------------------------------------------------------------
--
--  * In `haskell', the case of a single `"' (without a matching second one)
--    is caught by an eplicit error raising rule.  This shouldn't be
--    necessary, but for some strange reason, the lexer otherwise hangs when a
--    single `"' appears in the input.
--
--  * Comments in the "gap" of a string are not yet supported.
--

module C2HS.CHS.Lexer (CHSToken(..), lexCHS, keywordToIdent)
where

import Data.List     ((\\))
import Data.Char     (isDigit, isSpace)

import Language.C.Data.Ident
import Language.C.Data.Name
import Language.C.Data.Position

import Data.Errors    (ErrorLevel(..), makeError)
import Text.Lexers    (Regexp, Lexer, Action, epsilon, char, (+>), lexaction,
                  lexactionErr, lexmeta, (>|<), (>||<), ctrlLexer, star, plus,
                  alt, string, execLexer)
import Control.State (getNameSupply, setNameSupply)
import C2HS.State (CST, raise, raiseError)


-- token definition
-- ----------------

-- | possible tokens
--
data CHSToken = CHSTokArrow   Position          -- `->'
              | CHSTokDArrow  Position          -- `=>'
              | CHSTokDot     Position          -- `.'
              | CHSTokComma   Position          -- `,'
              | CHSTokEqual   Position          -- `='
              | CHSTokMinus   Position          -- `-'
              | CHSTokStar    Position          -- `*'
              | CHSTokAmp     Position          -- `&'
              | CHSTokHat     Position          -- `^'
              | CHSTokPercent Position          -- `%'
              | CHSTokPlus    Position          -- `+'
              | CHSTokPlusS   Position          -- `+S'
              | CHSTokPlusNum Position Int      -- `+<num>'
              | CHSTokLBrace  Position          -- `{'
              | CHSTokRBrace  Position          -- `}'
              | CHSTokLParen  Position          -- `('
              | CHSTokRParen  Position          -- `)'
              | CHSTokLBrack  Position          -- `['
              | CHSTokRBrack  Position          -- `]'
              | CHSTokHook    Position          -- `{#'
              | CHSTokEndHook Position          -- `#}'
              | CHSTokAdd     Position          -- `add'
              | CHSTokAs      Position          -- `as'
              | CHSTokCall    Position          -- `call'
              | CHSTokClass   Position          -- `class'
              | CHSTokConst   Position          -- `const'
              | CHSTokContext Position          -- `context'
              | CHSTokNonGNU  Position          -- `nonGNU'
              | CHSTokTypedef Position          -- `typedef'
              | CHSTokDefault Position          -- `default'
              | CHSTokDerive  Position          -- `deriving'
              | CHSTokDown    Position          -- `downcaseFirstLetter'
              | CHSTokEnum    Position          -- `enum'
              | CHSTokFinal   Position          -- `finalizer'
              | CHSTokForeign Position          -- `foreign'
              | CHSTokFun     Position          -- `fun'
              | CHSTokGet     Position          -- `get'
              | CHSTokImport  Position          -- `import'
              | CHSTokLib     Position          -- `lib'
              | CHSTokNewtype Position          -- `newtype'
              | CHSTokNocode  Position          -- `nocode'
              | CHSTokOffsetof Position         -- `offsetof'
              | CHSTokOmit    Position          -- `omit'
              | CHSTokPointer Position          -- `pointer'
              | CHSTokPrefix  Position          -- `prefix'
              | CHSTokPure    Position          -- `pure'
              | CHSTokQualif  Position          -- `qualified'
              | CHSTokSet     Position          -- `set'
              | CHSTokSizeof  Position          -- `sizeof'
              | CHSTokAlignof Position          -- `alignof'
              | CHSTokStable  Position          -- `stable'
              | CHSTokStruct  Position          -- `struct'
              | CHSTokType    Position          -- `type'
              | CHSTok_2Case  Position          -- `underscoreToCase'
              | CHSTokUnsafe  Position          -- `unsafe'
              | CHSTokUpper   Position          -- `upcaseFirstLetter'
              | CHSTokVariadic Position          -- `variadic'
              | CHSTokWith    Position Ident    -- `with'
              | CHSTokIn      Position          -- `in'
              | CHSTokOut     Position          -- `out'
              | CHSTokString  Position String   -- string
              | CHSTokHSVerb  Position String   -- verbatim Haskell (`...')
              | CHSTokHSQuot  Position String   -- quoted Haskell ('...')
              | CHSTokIdent   Position Ident    -- identifier
              | CHSTokHaskell Position String   -- verbatim Haskell code
              | CHSTokCPP     Position String Bool -- pre-processor directive
              | CHSTokLine    Position          -- line pragma
              | CHSTokC       Position String   -- verbatim C code
              | CHSTokCtrl    Position Char     -- control code
              | CHSTokComment Position String   -- comment
              | CHSTokCIdentTail Position Ident -- C identifier without prefix
              | CHSTokCArg Position String      -- C type argument

instance Pos CHSToken where
  posOf (CHSTokArrow   pos  ) = pos
  posOf (CHSTokDArrow  pos  ) = pos
  posOf (CHSTokDot     pos  ) = pos
  posOf (CHSTokComma   pos  ) = pos
  posOf (CHSTokEqual   pos  ) = pos
  posOf (CHSTokMinus   pos  ) = pos
  posOf (CHSTokStar    pos  ) = pos
  posOf (CHSTokAmp     pos  ) = pos
  posOf (CHSTokHat     pos  ) = pos
  posOf (CHSTokLBrace  pos  ) = pos
  posOf (CHSTokRBrace  pos  ) = pos
  posOf (CHSTokLParen  pos  ) = pos
  posOf (CHSTokRParen  pos  ) = pos
  posOf (CHSTokLBrack  pos  ) = pos
  posOf (CHSTokRBrack  pos  ) = pos
  posOf (CHSTokHook    pos  ) = pos
  posOf (CHSTokEndHook pos  ) = pos
  posOf (CHSTokAdd     pos  ) = pos
  posOf (CHSTokAs      pos  ) = pos
  posOf (CHSTokCall    pos  ) = pos
  posOf (CHSTokClass   pos  ) = pos
  posOf (CHSTokConst   pos  ) = pos
  posOf (CHSTokContext pos  ) = pos
  posOf (CHSTokNonGNU  pos  ) = pos
  posOf (CHSTokDerive  pos  ) = pos
  posOf (CHSTokTypedef pos  ) = pos
  posOf (CHSTokDefault pos  ) = pos
  posOf (CHSTokDown    pos  ) = pos
  posOf (CHSTokEnum    pos  ) = pos
  posOf (CHSTokFinal   pos  ) = pos
  posOf (CHSTokForeign pos  ) = pos
  posOf (CHSTokFun     pos  ) = pos
  posOf (CHSTokGet     pos  ) = pos
  posOf (CHSTokImport  pos  ) = pos
  posOf (CHSTokLib     pos  ) = pos
  posOf (CHSTokNewtype pos  ) = pos
  posOf (CHSTokNocode  pos  ) = pos
  posOf (CHSTokOffsetof pos ) = pos
  posOf (CHSTokOmit    pos  ) = pos
  posOf (CHSTokPointer pos  ) = pos
  posOf (CHSTokPrefix  pos  ) = pos
  posOf (CHSTokPure    pos  ) = pos
  posOf (CHSTokQualif  pos  ) = pos
  posOf (CHSTokSet     pos  ) = pos
  posOf (CHSTokSizeof  pos  ) = pos
  posOf (CHSTokAlignof pos  ) = pos
  posOf (CHSTokStable  pos  ) = pos
  posOf (CHSTokStruct  pos  ) = pos
  posOf (CHSTokType    pos  ) = pos
  posOf (CHSTok_2Case  pos  ) = pos
  posOf (CHSTokUnsafe  pos  ) = pos
  posOf (CHSTokUpper   pos  ) = pos
  posOf (CHSTokVariadic pos  ) = pos
  posOf (CHSTokWith    pos _) = pos
  posOf (CHSTokIn      pos  ) = pos
  posOf (CHSTokOut     pos  ) = pos
  posOf (CHSTokString  pos _) = pos
  posOf (CHSTokHSVerb  pos _) = pos
  posOf (CHSTokHSQuot  pos _) = pos
  posOf (CHSTokIdent   pos _) = pos
  posOf (CHSTokHaskell pos _) = pos
  posOf (CHSTokCPP     pos _ _) = pos
  posOf (CHSTokLine    pos  ) = pos
  posOf (CHSTokC       pos _) = pos
  posOf (CHSTokCtrl    pos _) = pos
  posOf (CHSTokComment pos _) = pos
  posOf (CHSTokCIdentTail pos _) = pos
  posOf (CHSTokCArg    pos _) = pos
  posOf (CHSTokPercent pos) = pos

instance Eq CHSToken where
  (CHSTokArrow    _  ) == (CHSTokArrow    _  ) = True
  (CHSTokDArrow   _  ) == (CHSTokDArrow   _  ) = True
  (CHSTokDot      _  ) == (CHSTokDot      _  ) = True
  (CHSTokComma    _  ) == (CHSTokComma    _  ) = True
  (CHSTokEqual    _  ) == (CHSTokEqual    _  ) = True
  (CHSTokMinus    _  ) == (CHSTokMinus    _  ) = True
  (CHSTokStar     _  ) == (CHSTokStar     _  ) = True
  (CHSTokAmp      _  ) == (CHSTokAmp      _  ) = True
  (CHSTokHat      _  ) == (CHSTokHat      _  ) = True
  (CHSTokLBrace   _  ) == (CHSTokLBrace   _  ) = True
  (CHSTokRBrace   _  ) == (CHSTokRBrace   _  ) = True
  (CHSTokLParen   _  ) == (CHSTokLParen   _  ) = True
  (CHSTokRParen   _  ) == (CHSTokRParen   _  ) = True
  (CHSTokLBrack   _  ) == (CHSTokLBrack   _  ) = True
  (CHSTokRBrack   _  ) == (CHSTokRBrack   _  ) = True
  (CHSTokHook     _  ) == (CHSTokHook     _  ) = True
  (CHSTokEndHook  _  ) == (CHSTokEndHook  _  ) = True
  (CHSTokAdd      _  ) == (CHSTokAdd      _  ) = True
  (CHSTokAs       _  ) == (CHSTokAs       _  ) = True
  (CHSTokCall     _  ) == (CHSTokCall     _  ) = True
  (CHSTokClass    _  ) == (CHSTokClass    _  ) = True
  (CHSTokConst    _  ) == (CHSTokConst    _  ) = True
  (CHSTokContext  _  ) == (CHSTokContext  _  ) = True
  (CHSTokNonGNU   _  ) == (CHSTokNonGNU   _  ) = True
  (CHSTokTypedef  _  ) == (CHSTokTypedef  _  ) = True
  (CHSTokDefault  _  ) == (CHSTokDefault  _  ) = True
  (CHSTokDerive   _  ) == (CHSTokDerive   _  ) = True
  (CHSTokDown     _  ) == (CHSTokDown     _  ) = True
  (CHSTokEnum     _  ) == (CHSTokEnum     _  ) = True
  (CHSTokFinal    _  ) == (CHSTokFinal    _  ) = True
  (CHSTokForeign  _  ) == (CHSTokForeign  _  ) = True
  (CHSTokFun      _  ) == (CHSTokFun      _  ) = True
  (CHSTokGet      _  ) == (CHSTokGet      _  ) = True
  (CHSTokImport   _  ) == (CHSTokImport   _  ) = True
  (CHSTokLib      _  ) == (CHSTokLib      _  ) = True
  (CHSTokNewtype  _  ) == (CHSTokNewtype  _  ) = True
  (CHSTokNocode   _  ) == (CHSTokNocode   _  ) = True
  (CHSTokOffsetof _  ) == (CHSTokOffsetof _  ) = True
  (CHSTokOmit     _  ) == (CHSTokOmit     _  ) = True
  (CHSTokPointer  _  ) == (CHSTokPointer  _  ) = True
  (CHSTokPrefix   _  ) == (CHSTokPrefix   _  ) = True
  (CHSTokPure     _  ) == (CHSTokPure     _  ) = True
  (CHSTokQualif   _  ) == (CHSTokQualif   _  ) = True
  (CHSTokSet      _  ) == (CHSTokSet      _  ) = True
  (CHSTokSizeof   _  ) == (CHSTokSizeof   _  ) = True
  (CHSTokAlignof  _  ) == (CHSTokAlignof  _  ) = True
  (CHSTokStable   _  ) == (CHSTokStable   _  ) = True
  (CHSTokStruct   _  ) == (CHSTokStruct   _  ) = True
  (CHSTokType     _  ) == (CHSTokType     _  ) = True
  (CHSTok_2Case   _  ) == (CHSTok_2Case   _  ) = True
  (CHSTokUnsafe   _  ) == (CHSTokUnsafe   _  ) = True
  (CHSTokUpper    _  ) == (CHSTokUpper    _  ) = True
  (CHSTokVariadic _  ) == (CHSTokVariadic _  ) = True
  (CHSTokWith     _ _) == (CHSTokWith     _ _) = True
  (CHSTokIn       _  ) == (CHSTokIn       _  ) = True
  (CHSTokOut      _  ) == (CHSTokOut      _  ) = True
  (CHSTokString   _ _) == (CHSTokString   _ _) = True
  (CHSTokHSVerb   _ _) == (CHSTokHSVerb   _ _) = True
  (CHSTokHSQuot   _ _) == (CHSTokHSQuot   _ _) = True
  (CHSTokIdent    _ _) == (CHSTokIdent    _ _) = True
  (CHSTokHaskell  _ _) == (CHSTokHaskell  _ _) = True
  (CHSTokCPP    _ _ _) == (CHSTokCPP    _ _ _) = True
  (CHSTokLine     _  ) == (CHSTokLine     _  ) = True
  (CHSTokC        _ _) == (CHSTokC        _ _) = True
  (CHSTokCtrl     _ _) == (CHSTokCtrl     _ _) = True
  (CHSTokComment  _ _) == (CHSTokComment  _ _) = True
  (CHSTokCIdentTail _ _) == (CHSTokCIdentTail _ _) = True
  (CHSTokCArg     _ _) == (CHSTokCArg     _ _) = True
  _                    == _                    = False

instance Show CHSToken where
  showsPrec _ (CHSTokArrow   _  ) = showString "->"
  showsPrec _ (CHSTokDArrow  _  ) = showString "=>"
  showsPrec _ (CHSTokDot     _  ) = showString "."
  showsPrec _ (CHSTokComma   _  ) = showString ","
  showsPrec _ (CHSTokEqual   _  ) = showString "="
  showsPrec _ (CHSTokMinus   _  ) = showString "-"
  showsPrec _ (CHSTokStar    _  ) = showString "*"
  showsPrec _ (CHSTokAmp     _  ) = showString "&"
  showsPrec _ (CHSTokHat     _  ) = showString "^"
  showsPrec _ (CHSTokPercent _  ) = showString "%"
  showsPrec _ (CHSTokPlus    _  ) = showString "+"
  showsPrec _ (CHSTokPlusS   _  ) = showString "+S"
  showsPrec _ (CHSTokPlusNum _  sz) = showString ("+" ++ show sz)
  showsPrec _ (CHSTokLBrace  _  ) = showString "{"
  showsPrec _ (CHSTokRBrace  _  ) = showString "}"
  showsPrec _ (CHSTokLParen  _  ) = showString "("
  showsPrec _ (CHSTokRParen  _  ) = showString ")"
  showsPrec _ (CHSTokLBrack  _  ) = showString "["
  showsPrec _ (CHSTokRBrack  _  ) = showString "]"
  showsPrec _ (CHSTokHook    _  ) = showString "{#"
  showsPrec _ (CHSTokEndHook _  ) = showString "#}"
  showsPrec _ (CHSTokAdd     _  ) = showString "add"
  showsPrec _ (CHSTokAs      _  ) = showString "as"
  showsPrec _ (CHSTokCall    _  ) = showString "call"
  showsPrec _ (CHSTokClass   _  ) = showString "class"
  showsPrec _ (CHSTokConst   _  ) = showString "const"
  showsPrec _ (CHSTokContext _  ) = showString "context"
  showsPrec _ (CHSTokNonGNU  _  ) = showString "nonGNU"
  showsPrec _ (CHSTokTypedef _  ) = showString "typedef"
  showsPrec _ (CHSTokDefault _  ) = showString "default"
  showsPrec _ (CHSTokDerive  _  ) = showString "deriving"
  showsPrec _ (CHSTokDown    _  ) = showString "downcaseFirstLetter"
  showsPrec _ (CHSTokEnum    _  ) = showString "enum"
  showsPrec _ (CHSTokFinal   _  ) = showString "finalizer"
  showsPrec _ (CHSTokForeign _  ) = showString "foreign"
  showsPrec _ (CHSTokFun     _  ) = showString "fun"
  showsPrec _ (CHSTokGet     _  ) = showString "get"
  showsPrec _ (CHSTokImport  _  ) = showString "import"
  showsPrec _ (CHSTokLib     _  ) = showString "lib"
  showsPrec _ (CHSTokNewtype _  ) = showString "newtype"
  showsPrec _ (CHSTokNocode  _  ) = showString "nocode"
  showsPrec _ (CHSTokOffsetof _ ) = showString "offsetof"
  showsPrec _ (CHSTokOmit    _  ) = showString "omit"
  showsPrec _ (CHSTokPointer _  ) = showString "pointer"
  showsPrec _ (CHSTokPrefix  _  ) = showString "prefix"
  showsPrec _ (CHSTokPure    _  ) = showString "pure"
  showsPrec _ (CHSTokQualif  _  ) = showString "qualified"
  showsPrec _ (CHSTokSet     _  ) = showString "set"
  showsPrec _ (CHSTokSizeof  _  ) = showString "sizeof"
  showsPrec _ (CHSTokAlignof _  ) = showString "alignof"
  showsPrec _ (CHSTokStable  _  ) = showString "stable"
  showsPrec _ (CHSTokStruct  _  ) = showString "struct"
  showsPrec _ (CHSTokType    _  ) = showString "type"
  showsPrec _ (CHSTok_2Case  _  ) = showString "underscoreToCase"
  showsPrec _ (CHSTokUnsafe  _  ) = showString "unsafe"
  showsPrec _ (CHSTokUpper   _  ) = showString "upcaseFirstLetter"
  showsPrec _ (CHSTokVariadic _  ) = showString "variadic"
  showsPrec _ (CHSTokWith    _ _) = showString "with"
  showsPrec _ (CHSTokIn      _  ) = showString "in"
  showsPrec _ (CHSTokOut     _  ) = showString "out"
  showsPrec _ (CHSTokString  _ s) = showString ("\"" ++ s ++ "\"")
  showsPrec _ (CHSTokHSVerb  _ s) = showString ("`" ++ s ++ "'")
  showsPrec _ (CHSTokHSQuot  _ s) = showString ("'" ++ s ++ "'")
  showsPrec _ (CHSTokIdent   _ i) = (showString . identToString) i
  showsPrec _ (CHSTokHaskell _ s) = showString s
  showsPrec _ (CHSTokCPP  _ s nl) = showString (if nl then "\n" else "" ++ s)
  showsPrec _ (CHSTokLine    _  ) = id            --TODO show line num?
  showsPrec _ (CHSTokC       _ s) = showString s
  showsPrec _ (CHSTokCtrl    _ c) = showChar c
  showsPrec _ (CHSTokComment _ s) = showString (if null s
                                                then ""
                                                else " --" ++ s ++ "\n")
  showsPrec _ (CHSTokCIdentTail _ i) = (showString . identToString) i
  showsPrec _ (CHSTokCArg    _ s) = showString s

-- lexer state
-- -----------

-- | state threaded through the lexer
--
data CHSLexerState = CHSLS {
                       nestLvl :: Int,   -- nesting depth of nested comments
                       inHook  :: Bool,  -- within a binding hook?
                       namesup :: [Name] -- supply of unique names
                     }

-- | initial state
--
initialState :: [Name] -> CST s CHSLexerState
initialState nameSupply =
  do
    return CHSLS {
                         nestLvl = 0,
                         inHook  = False,
                         namesup = nameSupply
                }

-- | raise an error if the given state is not a final state
--
assertFinalState :: Position -> CHSLexerState -> CST s ()
assertFinalState pos CHSLS {nestLvl = nestLvl', inHook = inHook'}
  | nestLvl' > 0 = raiseError pos ["Unexpected end of file!",
                                   "Unclosed nested comment."]
  | inHook'      = raiseError pos ["Unexpected end of file!",
                                   "Unclosed binding hook."]
  | otherwise    = return ()

-- | lexer and action type used throughout this specification
--
type CHSLexer  = Lexer  CHSLexerState CHSToken
type CHSAction = Action               CHSToken
type CHSRegexp = Regexp CHSLexerState CHSToken

-- | for actions that need a new unique name
--
infixl 3 `lexactionName`
lexactionName :: CHSRegexp
              -> (String -> Position -> Name -> CHSToken)
              -> CHSLexer
re `lexactionName` action = re `lexmeta` action'
  where
    action' str pos state = let name:ns = namesup state
                            in
                            (Just $ Right (action str pos name),
                             incPos pos (length str),
                             state {namesup = ns},
                             Nothing)


-- lexical specification
-- ---------------------

-- | the lexical definition of the tokens (the base lexer)
--
chslexer :: CHSLexer
chslexer  =      haskell        -- Haskell code
            >||< nested         -- nested comments
            >||< ctrl           -- control code (that has to be preserved)
            >||< hook           -- start of a binding hook
            >||< cpp            -- a pre-processor directive (or `#c')
            >||< startmarker    -- marks beginning of input

-- | stream of Haskell code (terminated by a control character or binding hook)
--
haskell :: CHSLexer
--
-- NB: We need to make sure that '"' is not regarded as the beginning of a
--     string; however, we cannot really lex character literals properly
--     without lexing identifiers (as the latter may containing single quotes
--     as part of their lexeme).  Thus, we special case '"'.  This is still a
--     kludge, as a program fragment, such as
--
--       foo'"'strange string"
--
--     will not be handled correctly.
--
haskell  = (    anyButSpecial`star` epsilon
            >|< specialButQuotes
            >|< char '"'  +> inhstr`star` char '"'
            >|< string "'\"'"                           -- special case of "
            >|< string "--" +> anyButNL`star` epsilon   -- comment
           )
           `lexaction` copyVerbatim
           >||< char '"'                                -- this is a bad kludge
                `lexactionErr`
                  \_ pos -> (Left $ makeError LevelError pos
                                              ["Lexical error!",
                                              "Unclosed string."])
           where
             anyButSpecial    = alt (inlineSet \\ specialSet)
             specialButQuotes = alt (specialSet \\ ['"'])
             anyButNL         = alt (anySet \\ ['\n'])
             inhstr           = instr >|< char '\\' >|< string "\\\"" >|< gap
             gap              = char '\\' +> alt (' ':ctrlSet)`plus` char '\\'

-- | action copying the input verbatim to `CHSTokHaskell' tokens
--
copyVerbatim        :: CHSAction
copyVerbatim cs pos  = Just $ CHSTokHaskell pos cs

-- | nested comments
--
nested :: CHSLexer
nested  =
       string "{-"              {- for Haskell emacs mode :-( -}
       `lexmeta` enterComment
  >||<
       string "-}"
       `lexmeta` leaveComment
  where
    enterComment cs pos s =
      (copyVerbatim' cs pos,                    -- collect the lexeme
       incPos pos 2,                            -- advance current position
       s {nestLvl = nestLvl s + 1},             -- increase nesting level
       Just $ inNestedComment)                  -- continue in comment lexer
    --
    leaveComment cs pos s =
      case nestLvl s of
        0 -> (commentCloseErr pos,              -- 0: -} outside comment => err
              incPos pos 2,                     -- advance current position
              s,
              Nothing)
        1 -> (copyVerbatim' cs pos,             -- collect the lexeme
              incPos pos 2,                     -- advance current position
              s {nestLvl = nestLvl s - 1},      -- decrease nesting level
              Just chslexer)                    -- 1: continue with root lexer
        _ -> (copyVerbatim' cs pos,             -- collect the lexeme
              incPos pos 2,                     -- advance current position
              s {nestLvl = nestLvl s - 1},      -- decrease nesting level
              Nothing)                          -- _: cont with comment lexer
    --
    copyVerbatim' cs pos  = Just $ Right (CHSTokHaskell pos cs)
    --
    commentCloseErr pos =
      Just $ Left (makeError LevelError pos
                             ["Lexical error!",
                             "`-}' not preceded by a matching `{-'."])
                             {- for Haskell emacs mode :-( -}


-- | lexer processing the inner of a comment
--
inNestedComment :: CHSLexer
inNestedComment  =      commentInterior         -- inside a comment
                   >||< nested                  -- nested comments
                   >||< ctrl                    -- control code (preserved)

-- | standard characters in a nested comment
--
commentInterior :: CHSLexer
commentInterior  = (    anyButSpecial`star` epsilon
                    >|< special
                   )
                   `lexaction` copyVerbatim
                   where
                     anyButSpecial = alt (inlineSet \\ commentSpecialSet)
                     special       = alt commentSpecialSet

-- | control code in the base lexer (is turned into a token)
--
-- * this covers exactly the same set of characters as contained in `ctrlSet'
--   and `Lexers.ctrlLexer' and advances positions also like the `ctrlLexer'
--
ctrl :: CHSLexer
ctrl  =
       char '\n' `lexmeta` newline
  >||< char '\r' `lexmeta` newline
  >||< char '\v' `lexmeta` newline
  >||< char '\f' `lexmeta` formfeed
  >||< char '\t' `lexmeta` tab
  where
    newline  [c] pos = ctrlResult pos c (retPos pos)
    formfeed [c] pos = ctrlResult pos c (incPos pos 1)
    tab      [c] pos = ctrlResult pos c (incPos pos 8)

    ctrlResult pos c pos' s =
      (Just $ Right (CHSTokCtrl pos c), pos', s, Nothing)

-- | start of a binding hook (ie, enter the binding hook lexer)
--
hook :: CHSLexer
hook  = string "{#"
        `lexmeta` \_ pos s -> (Just $ Right (CHSTokHook pos),
                               incPos pos 2, s, Just bhLexer)

-- | start marker: used to identify pre-processor directive at
-- beginning of input -- this lexer just drops the start marker if it
-- hasn't been used to handle a pre-processor directive
--
startmarker :: CHSLexer
startmarker = char '\000' `lexmeta`
              \_ pos s -> (Nothing, incPos pos 1, s, Just chslexer)

-- | pre-processor directives and `#c'
--
-- * we lex `#c' as a directive and special case it in the action
--
-- * we lex C line number pragmas and special case it in the action
--
cpp :: CHSLexer
cpp = directive
      where
        directive =
          --(string "\n#" >|< string "\0#") +>
          alt "\n\0" +> alt " \t" `star` string "#" +>
          alt ('\t':inlineSet)`star` epsilon
          `lexmeta`
             \t@(ld:spdir) pos s ->      -- strip off the "\n" or "\0"
             let dir = drop 1 $ dropWhile (`elem` " \t") spdir
             in case dir of
                 ['c']                      ->          -- #c
                   (Nothing, incPos pos (length t), s, Just cLexer)
                 -- a #c may be followed by whitespace
                 'c':sp:_ | sp `elem` " \t" ->          -- #c
                   (Nothing, incPos pos (length t), s, Just cLexer)
                 ' ':line@(n:_) | isDigit n ->                 -- C line pragma
                   let pos' = adjustPosByCLinePragma line pos
                    in (Just $ Right (CHSTokLine pos'), pos', s, Nothing)
                 _                            ->        -- CPP directive
                   (Just $ Right (CHSTokCPP pos dir (ld == '\n')),
                    if ld == '\n' then retPos pos else incPos pos (length t),
                    s, Nothing)

adjustPosByCLinePragma :: String -> Position -> Position
adjustPosByCLinePragma str pos =
  if isSourcePos pos
  then position (posOffset pos) fname' row' 1 (posParent pos)
  else pos
  where
    fname           = posFile pos
    str'            = dropWhite str
    (rowStr, str'') = span isDigit str'
    row'            = read rowStr
    str'''          = dropWhite str''
    fnameStr        = takeWhile (/= '"') . drop 1 $ str'''
    fname'          | null str''' || head str''' /= '"' = fname
                    -- try and get more sharing of file name strings
                    | fnameStr == fname                 = fname
                    | otherwise                         = fnameStr
    --
    dropWhite = dropWhile (\c -> c == ' ' || c == '\t')

-- | the binding hook lexer
--
bhLexer :: CHSLexer
bhLexer  =      identOrKW
           >||< symbol
           >||< strlit
           >||< hsverb
           >||< hsquot
           >||< whitespace
           >||< arglist
           >||< endOfHook
           >||< string "--" +> anyButNL`star` epsilon  -- comment
                `lexaction` \cs pos -> Just (CHSTokComment pos (drop 2 cs))
           where
             anyButNL  = alt (anySet \\ ['\n'])
             endOfHook = string "#}"
                         `lexmeta`
                          \_ pos s -> (Just $ Right (CHSTokEndHook pos),
                                       incPos pos 2, s, Just chslexer)
             arglist = string "["
                       `lexmeta` \_ pos s -> (Just $ Right (CHSTokLBrack pos),
                                              incPos pos 1, s, Just alLexer)

-- | lexer for C function types for variadic functions
--
alLexer :: CHSLexer
alLexer =      sym ","  CHSTokComma
          >||< endOfArgList
          >||< cArg
  where sym cs con = string cs `lexaction` \_ pos -> Just (con pos)
        endOfArgList = string "]"
                       `lexmeta`
                       \_ pos s -> (Just $ Right (CHSTokRBrack pos),
                                    incPos pos 1, s, Just bhLexer)
        cArg = ((alt (anySet \\ ",]")) `star` epsilon) `lexaction`
               \cs pos -> Just (CHSTokCArg pos $ trim cs)
        trim = reverse . dropWhile isSpace . reverse . dropWhile isSpace

-- | the inline-C lexer
--
cLexer :: CHSLexer
cLexer =      inlineC                     -- inline C code
         >||< ctrl                        -- control code (preserved)
         >||< string "\n#endc"            -- end of inline C code...
              `lexmeta`                   -- ...preserve '\n' as control token
              \_ pos s -> (Just $ Right (CHSTokCtrl pos '\n'), retPos pos, s,
                           Just chslexer)
         where
           inlineC = alt inlineSet `lexaction` copyVerbatimC
           --
           copyVerbatimC :: CHSAction
           copyVerbatimC cs pos = Just $ CHSTokC pos cs

-- | whitespace
--
-- * horizontal and vertical tabs, newlines, and form feeds are filter out by
--   `Lexers.ctrlLexer'
--
whitespace :: CHSLexer
whitespace  =      (char ' ' `lexaction` \_ _ -> Nothing)
              >||< ctrlLexer

-- | identifiers and keywords
--
identOrKW :: CHSLexer
--
-- the strictness annotations seem to help a bit
--
identOrKW  =
       -- identifier or keyword
       (letter +> (letter >|< digit >|< char '\'')`star` epsilon
       `lexactionName` \cs pos name -> (idkwtok $!pos) cs name)
       >||<
       (digit +> (letter >|< digit)`star` epsilon
        `lexactionName` \cs pos name ->
        CHSTokCIdentTail pos (mkIdent pos cs name))
  where
    idkwtok pos "add"              _    = CHSTokAdd     pos
    idkwtok pos "as"               _    = CHSTokAs      pos
    idkwtok pos "call"             _    = CHSTokCall    pos
    idkwtok pos "class"            _    = CHSTokClass   pos
    idkwtok pos "const"            _    = CHSTokConst   pos
    idkwtok pos "context"          _    = CHSTokContext pos
    idkwtok pos "nonGNU"           _    = CHSTokNonGNU  pos
    idkwtok pos "typedef"          _    = CHSTokTypedef pos
    idkwtok pos "default"          _    = CHSTokDefault pos
    idkwtok pos "deriving"         _    = CHSTokDerive  pos
    idkwtok pos "downcaseFirstLetter" _ = CHSTokDown    pos
    idkwtok pos "enum"             _    = CHSTokEnum    pos
    idkwtok pos "finalizer"        _    = CHSTokFinal   pos
    idkwtok pos "foreign"          _    = CHSTokForeign pos
    idkwtok pos "fun"              _    = CHSTokFun     pos
    idkwtok pos "get"              _    = CHSTokGet     pos
    idkwtok pos "import"           _    = CHSTokImport  pos
    idkwtok pos "lib"              _    = CHSTokLib     pos
    idkwtok pos "newtype"          _    = CHSTokNewtype pos
    idkwtok pos "nocode"           _    = CHSTokNocode  pos
    idkwtok pos "offsetof"         _    = CHSTokOffsetof pos
    idkwtok pos "omit"             _    = CHSTokOmit    pos
    idkwtok pos "pointer"          _    = CHSTokPointer pos
    idkwtok pos "prefix"           _    = CHSTokPrefix  pos
    idkwtok pos "pure"             _    = CHSTokPure    pos
    idkwtok pos "qualified"        _    = CHSTokQualif  pos
    idkwtok pos "set"              _    = CHSTokSet     pos
    idkwtok pos "sizeof"           _    = CHSTokSizeof  pos
    idkwtok pos "alignof"          _    = CHSTokAlignof pos
    idkwtok pos "stable"           _    = CHSTokStable  pos
    idkwtok pos "struct"           _    = CHSTokStruct  pos
    idkwtok pos "type"             _    = CHSTokType    pos
    idkwtok pos "underscoreToCase" _    = CHSTok_2Case  pos
    idkwtok pos "unsafe"           _    = CHSTokUnsafe  pos
    idkwtok pos "upcaseFirstLetter"_    = CHSTokUpper   pos
    idkwtok pos "variadic"         _    = CHSTokVariadic pos
    idkwtok pos "with"             name = mkwith pos name
    idkwtok pos "in"               _    = CHSTokIn      pos
    idkwtok pos "out"              _    = CHSTokOut     pos
    idkwtok pos cs                 name = mkid pos cs name
    --
    mkid pos cs name = CHSTokIdent pos (mkIdent pos cs name)
    mkwith pos name = CHSTokWith pos (mkIdent pos "with" name)

keywordToIdent :: CHSToken -> CHSToken
keywordToIdent tok =
  case tok of
    CHSTokAdd     pos -> mkid pos "add"
    CHSTokAs      pos -> mkid pos "as"
    CHSTokCall    pos -> mkid pos "call"
    CHSTokClass   pos -> mkid pos "class"
    CHSTokConst   pos -> mkid pos "const"
    CHSTokContext pos -> mkid pos "context"
    CHSTokNonGNU  pos -> mkid pos "nonGNU"
    CHSTokTypedef pos -> mkid pos "typedef"
    CHSTokDefault pos -> mkid pos "default"
    CHSTokDerive  pos -> mkid pos "deriving"
    CHSTokDown    pos -> mkid pos "downcaseFirstLetter"
    CHSTokEnum    pos -> mkid pos "enum"
    CHSTokFinal   pos -> mkid pos "finalizer"
    CHSTokForeign pos -> mkid pos "foreign"
    CHSTokFun     pos -> mkid pos "fun"
    CHSTokGet     pos -> mkid pos "get"
    CHSTokImport  pos -> mkid pos "import"
    CHSTokLib     pos -> mkid pos "lib"
    CHSTokNewtype pos -> mkid pos "newtype"
    CHSTokNocode  pos -> mkid pos "nocode"
    CHSTokOffsetof pos -> mkid pos "offsetof"
    CHSTokOmit    pos -> mkid pos "omit"
    CHSTokPointer pos -> mkid pos "pointer"
    CHSTokPrefix  pos -> mkid pos "prefix"
    CHSTokPure    pos -> mkid pos "pure"
    CHSTokQualif  pos -> mkid pos "qualified"
    CHSTokSet     pos -> mkid pos "set"
    CHSTokSizeof  pos -> mkid pos "sizeof"
    CHSTokAlignof pos -> mkid pos "alignof"
    CHSTokStable  pos -> mkid pos "stable"
    CHSTokStruct  pos -> mkid pos "struct"
    CHSTokType    pos -> mkid pos "type"
    CHSTok_2Case  pos -> mkid pos "underscoreToCase"
    CHSTokUnsafe  pos -> mkid pos "unsafe"
    CHSTokUpper   pos -> mkid pos "upcaseFirstLetter"
    CHSTokVariadic pos -> mkid pos "variadic"
    CHSTokWith    pos ide -> CHSTokIdent pos ide
    CHSTokIn      pos -> mkid pos "in"
    CHSTokOut     pos -> mkid pos "out"
    _ -> tok
    where mkid pos str = CHSTokIdent pos (internalIdent str)

-- | reserved symbols
--
symbol :: CHSLexer
symbol  =      sym "->" CHSTokArrow
          >||< sym "=>" CHSTokDArrow
          >||< sym "."  CHSTokDot
          >||< sym ","  CHSTokComma
          >||< sym "="  CHSTokEqual
          >||< sym "-"  CHSTokMinus
          >||< sym "*"  CHSTokStar
          >||< sym "&"  CHSTokAmp
          >||< sym "^"  CHSTokHat
          >||< sym "%"  CHSTokPercent
          >||< sym "+"  CHSTokPlus
          >||< sym "+S" CHSTokPlusS
          >||< sym_with_num "+" CHSTokPlusNum
          >||< sym "{"  CHSTokLBrace
          >||< sym "}"  CHSTokRBrace
          >||< sym "("  CHSTokLParen
          >||< sym ")"  CHSTokRParen
          where
            sym cs con = string cs `lexaction` \_ pos -> Just (con pos)
            sym_with_num cs con =
              string cs +> digit +> digit`star` epsilon
              `lexaction` \(_:ds) pos -> Just (con pos (read ds))

-- | string
--
strlit :: CHSLexer
strlit  = char '"' +> (instr >|< char '\\')`star` char '"'
          `lexaction` \cs pos -> Just (CHSTokString pos (init . tail $ cs))

-- | verbatim code
--
hsverb :: CHSLexer
hsverb  = char '`' +> inhsverb`star` char '\''
          `lexaction` \cs pos -> Just (CHSTokHSVerb pos (init . tail $ cs))

-- | quoted code
--
hsquot :: CHSLexer
hsquot  = char '\'' +> inhsverb`star` char '\''
          `lexaction` \cs pos -> Just (CHSTokHSQuot pos (init . tail $ cs))


-- | regular expressions
--
letter, digit, instr, inhsverb :: Regexp s t
letter    = alt ['a'..'z'] >|< alt ['A'..'Z'] >|< char '_'
digit     = alt ['0'..'9']
instr     = alt ([' '..'\255'] \\ "\"\\")
inhsverb  = alt ([' '..'\127'] \\ "'")

-- | character sets
--
anySet, inlineSet, specialSet, commentSpecialSet, ctrlSet :: [Char]
anySet            = ['\1'..'\255']
inlineSet         = anySet \\ ctrlSet
specialSet        = ['{', '-', '"', '\'']
commentSpecialSet = ['{', '-']
ctrlSet           = ['\n', '\f', '\r', '\t', '\v']


-- main lexing routine
-- -------------------

-- | generate a token sequence out of a string denoting a CHS file
--
-- * the given position is attributed to the first character in the string
--
-- * errors are entered into the compiler state
--
-- * on a successfull parse, the name supply is updated
lexCHS        :: String -> Position -> CST s [CHSToken]
lexCHS cs pos  =
  do
    nameSupply <- getNameSupply
    state <- initialState nameSupply
    let (ts, lstate, errs) = execLexer chslexer ('\0':cs, pos, state)
        (_, pos', state')  = lstate
    mapM_ raise errs
    assertFinalState pos' state'
    setNameSupply $ namesup state'
    return ts