{-# LANGUAGE NoImplicitPrelude #-}

{-
    BNF Converter: Java Antlr4 Lexer generator
    Copyright (C) 2015  Author:  Gabriele Paganelli

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
-}

{-
   **************************************************************
    BNF Converter Module

    Description   : This module generates the Antlr4 input file.
                    Based on CFtoJLex15.hs

    Author        : Gabriele Paganelli (gapag@distruzione.org)

    License       : GPL (GNU General Public License)

    Created       : 15 Oct, 2015

    Modified      :


   **************************************************************
-}

module BNFC.Backend.Java.CFtoAntlr4Lexer ( cf2AntlrLex ) where

import Prelude'

import Text.PrettyPrint
import BNFC.CF
import BNFC.Backend.Java.RegToAntlrLexer
import BNFC.Backend.Java.Utils
import BNFC.Backend.Common.NamedVariables

-- | Creates a lexer grammar.
-- Since antlr token identifiers must start with an uppercase symbol,
-- I prepend "Surrogate_id_SYMB_" to the identifier.
-- This introduces risks of clashes if somebody uses the same identifier for
-- user defined tokens. This is not handled.
-- returns the environment because the parser uses it.
cf2AntlrLex :: String -> CF -> (Doc, SymEnv)
cf2AntlrLex packageBase cf = (vcat
    [ prelude packageBase
    , cMacros
    -- unnamed symbols (those in quotes, not in token definitions)
    , lexSymbols env
    , restOfLexerGrammar cf
    ], env)
  where
    env                    = makeSymEnv (cfgSymbols cf ++ reservedWords cf)
        (0 :: Int)
    makeSymEnv [] _        = []
    makeSymEnv (s:symbs) n = (s, "Surrogate_id_SYMB_" ++ show n)
        : makeSymEnv symbs (n+1)


-- | File prelude
prelude ::  String -> Doc
prelude packageBase = vcat
    [ "// This Antlr4 file was machine-generated by the BNF converter"
    , "lexer grammar" <+> text name <> "Lexer;"
    ]
    where name = getLastInPackage packageBase

--For now all categories are included.
--Optimally only the ones that are used should be generated.
cMacros :: Doc
cMacros = vcat
    [ "// Predefined regular expressions in BNFC"
    , frg "LETTER  : CAPITAL | SMALL"
    , frg "CAPITAL : [A-Z\\u00C0-\\u00D6\\u00D8-\\u00DE]"
    , frg "SMALL   : [a-z\\u00DF-\\u00F6\\u00F8-\\u00FF]"
    , frg "DIGIT   : [0-9]"
    ]
  where frg a = "fragment" <+> a <+> ";"

escapeChars :: String -> String
escapeChars = concatMap escapeChar

-- |
-- >>> lexSymbols [("foo","bar")]
-- bar : 'foo' ;
-- >>> lexSymbols [("\\","bar")]
-- bar : '\\' ;
-- >>> lexSymbols [("/","bar")]
-- bar : '/' ;
-- >>> lexSymbols [("~","bar")]
-- bar : '~' ;
lexSymbols :: SymEnv -> Doc
lexSymbols ss = vcat $  map transSym ss
  where
    transSym (s,r) = text r <>  " : '" <> text (escapeChars s) <> "' ;"

-- | Writes rules for user defined tokens, and, if used, the predefined BNFC tokens.
restOfLexerGrammar :: CF -> Doc
restOfLexerGrammar cf = vcat
    [ lexComments (comments cf)
    , ""
    , userDefTokens
    , ifString strdec
    , ifChar chardec
    , ifC catDouble [
        "// Double predefined token type",
        "DOUBLE : DIGIT+ '.' DIGIT+ ('e' '-'? DIGIT+)?;"
        ]
    , ifC catInteger [
        "//Integer predefined token type",
        "INTEGER : DIGIT+;"
        ]
    , ifC catIdent [
        "// Identifier token type" ,
        "fragment" ,
        "IDENTIFIER_FIRST : LETTER | '_';",
        "IDENT : IDENTIFIER_FIRST (IDENTIFIER_FIRST | DIGIT)*;"
        ]
    , "// Whitespace"
    , "WS : (' ' | '\\r' | '\\t' | '\\n')+ ->  skip;"
    , "// Escapable sequences"
    , "fragment"
    , "Escapable : ('\"' | '\\\\' | 'n' | 't' | 'r');"
    , "ErrorToken : . ;"
    , ifString stringmodes
    , ifChar charmodes
    ]
  where
    ifC cat s     = if isUsedCat cf cat then vcat s else ""
    ifString      = ifC catString
    ifChar        = ifC catChar
    strdec        = [ "// String token type"
                    , "STRING : '\"' -> more, mode(STRINGMODE);"
                    ]
    chardec       = ["CHAR : '\\''   -> more, mode(CHARMODE);"]
    userDefTokens = vcat
        [ text (show name) <>" : " <> text (printRegJLex exp) <> ";"
        | (name, exp) <- tokenPragmas cf ]
    stringmodes   = [ "mode STRESCAPE;"
        , "STRESCAPED : Escapable  -> more, popMode ;"
        , "mode STRINGMODE;"
        , "STRINGESC : '\\\\' -> more , pushMode(STRESCAPE);"
        , "STRINGEND : '\"' ->  type(STRING), mode(DEFAULT_MODE);"
        , "STRINGTEXT : ~[\\\"\\\\] -> more;"
        ]
    charmodes     = [ "mode CHARMODE;"
        , "CHARANY     :  ~[\\'\\\\] -> more, mode(CHAREND);"
        , "CHARESC     :  '\\\\'  -> more, pushMode(CHAREND),pushMode(ESCAPE);"
        , "mode ESCAPE;"
        , "ESCAPED : (Escapable | '\\'')  -> more, popMode ;"
        , "mode CHAREND;"
        , "CHARENDC     :  '\\''  -> type(CHAR), mode(DEFAULT_MODE);"
        ]

lexComments :: ([(String, String)], [String]) -> Doc
lexComments ([],[]) = ""
lexComments (m,s) = vcat
    (prod "COMMENT_antlr_builtin" lexSingleComment s ++
         prod "MULTICOMMENT_antlr_builtin" lexMultiComment m )

  where
    prod bg lc ty = [bg, ": ("] ++ punctuate "|" (map lc ty) ++ skiplex
    skiplex       = [") -> skip;"]

-- | Create lexer rule for single-line comments.
--
-- >>> lexSingleComment "--"
-- '--' ~[\r\n]* (('\r'? '\n')|EOF)
--
-- >>> lexSingleComment "\""
-- '"' ~[\r\n]* (('\r'? '\n')|EOF)
lexSingleComment :: String -> Doc
lexSingleComment c =
    "'" <>text (escapeChars c) <>  "' ~[\\r\\n]* (('\\r'? '\\n')|EOF)"

-- | Create lexer rule for multi-lines comments.
--
-- There might be a possible bug here if a language includes 2 multi-line
-- comments. They could possibly start a comment with one character and end it
-- with another. However this seems rare.
--
-- >>> lexMultiComment ("{-", "-}")
-- '{-' (.)*? '-}'
--
-- >>> lexMultiComment ("\"'", "'\"")
-- '"\'' (.)*? '\'"'
lexMultiComment :: (String, String) -> Doc
lexMultiComment (b,e) =
         "'" <> text (escapeChars b)
        <>"' (.)*? '"<> text (escapeChars e)
        <> "'"