{-# LANGUAGE ScopedTypeVariables, FlexibleInstances, MultiParamTypeClasses
  , DeriveGeneric, DeriveAnyClass, TypeFamilies, FlexibleContexts
  , StandaloneDeriving, OverloadedStrings, DeriveDataTypeable #-}
{-|
  Module      : Text.ANTLR.Grammar
  Description : Grammar data types and API for parsing algorithms
  Copyright   : (c) Karl Cronburg, 2018
  License     : BSD3
  Maintainer  : karl@cs.tufts.edu
  Stability   : experimental
  Portability : POSIX

-}
module Text.ANTLR.Grammar
  ( -- * Data types
    Grammar(..)
  , ProdElem(..), ProdElems
  , Production(..), ProdRHS(..), StateFncn(..)
  , Predicate(..), Mutator(..), Ref(..)
  -- * Basic setter / getter functions:
  , getRHS, getLHS
  , isSem, isAction
  , sameNTs, sameTs
  , isNT, isT, isEps, getNTs, getTs, getEps
  , prodsFor, getProds
  , validGrammar, hasAllNonTerms, hasAllTerms, startIsNonTerm
  , symbols, defaultGrammar
  ) where
import Prelude hiding (pi)
import Data.List (nub, sort)

import System.IO.Unsafe (unsafePerformIO)
import qualified Debug.Trace as D
import Data.Data (Data(..), Typeable(..))
import Language.Haskell.TH.Lift (Lift(..))

import qualified Text.ANTLR.Set as S
import Text.ANTLR.Set
  ( Set(..), empty, fromList, member, union
  , Hashable(..), Generic(..)
  )

import Text.ANTLR.Pretty

uPIO :: IO a -> a
uPIO = unsafePerformIO

----------------------------------------------------------------
-- When we *Show* production elements, they should contain source location
-- information, but when we *compare* them, we should ignore the source info.

-- | Something is "Ref" if it can be symbolized by some symbol in a set of
--   symbols. Symbols are typically Strings, an enum data type, or some other
--   Eq-able (best if finite) set of things.
class Ref v where
  -- | One symbol type for every value type v.
  type Sym v :: *
  -- | Compute (or extract) the symbol for some concrete value.
  getSymbol :: v -> Sym v

compareSymbols :: (Ref ref, Eq (Sym ref)) => ref -> ref -> Bool
compareSymbols a b = getSymbol a == getSymbol b

-- | Nonterminals can be symbolized (for now the types are equivalent, i.e.
--   nt == Sym nt)
sameNTs :: forall nt. (Ref nt, Eq (Sym nt)) => nt -> nt -> Bool
sameNTs = compareSymbols

-- | Terminals can be symbolized (in the current implementation, the input
--   terminal type to a parser is @(t == 'Text.ANTLR.Lex.Tokenizer.Token' n v)@ and the terminal symbol type is
--   @(ts == 'Sym t' == n)@ where @n@ is defined as the name of a token @('Text.ANTLR.Lex.Tokenizer.Token' n v)@.
sameTs :: forall t. (Ref t, Eq (Sym t)) => t -> t -> Bool
sameTs = compareSymbols

instance Ref String where
  type Sym String = String
  getSymbol = id

instance Ref (String, b) where
  type Sym (String, b) = String
  getSymbol = fst

-- | Grammar ProdElems
--   
--   > nts == Non Terminal Symbol (type)
--   > ts == Terminal Symbol (type)
--
--   Production elements are only used in the grammar data structure and parser,
--   therefore these types (nt and ts) are __not__ necessarily equivalent to the
--   terminal types seen by the tokenizer (nonterminals are special because no one
--   sees them until after parsing). Also pushing @(ts = Sym t)@ up to the top of
--   data constructors gets rid of a lot of unnecessary standalone deriving
--   instances. Standalone deriving instances in this case are a programming
--   anti-pattern for allowing you to improperly parametrize your types. In this
--   case a 'ProdElem' cares about the __terminal symbol type__, not the __terminal
--   token type__. In fact it's redundant to say *terminal token* because all
--   tokens are terminals in the grammar. A token is by definition a tokenized
--   __value__ with a __named__ terminal symbol, which is in fact exactly what the
--   'Text.ANTLR.Lex.Tokenizer.Token' type looks like in 'Text.ANTLR.Lex.Tokenizer': @'Text.ANTLR.Lex.Tokenizer.Token' n v@ (name and
--   value). So wherever I see an @n@ type variable in the tokenizer, this is
--   equivalent to @('Sym' t)@ in the parser. And wherever I see a @('Text.ANTLR.Lex.Tokenizer.Token' n v)@ in the
--   tokenizer, this gets passed into the parser as @t@:
--
--   @
--     n           == 'Sym' t
--     ('Text.ANTLR.Lex.Tokenizer.Token' n v) == t
--   @
--
data ProdElem nts ts =
    NT nts -- ^ Nonterminal production element
  | T  ts  -- ^ Terminal production element
  | Eps    -- ^ Empty string production element
  deriving (Eq, Ord, Generic, Hashable, Show, Data, Lift)

instance (Prettify nts, Prettify ts) => Prettify (ProdElem nts ts) where
  prettify (NT nts) = prettify nts
  prettify (T  ts)  = prettify  ts
  prettify Eps      = pStr "ε"

-- | Is the 'ProdElem' a nonterminal?
isNT (NT _) = True
isNT _      = False

-- | Is the 'ProdElem' a terminal?
isT (T _) = True
isT _     = False

-- | Is the 'ProdElem' an epsilon?
isEps Eps = True
isEps _   = False

-- | Get just the nonterminals from a list
getNTs = map (\(NT nt) -> nt) . filter isNT
-- | Get just the terminals from a list
getTs  = map (\(T t) -> t) . filter isT
-- | Get just the epsilons from a list (umm...)
getEps = map (\Eps -> Eps) . filter isEps

-- | Zero or more production elements
type ProdElems nts ts = [ProdElem nts ts]

-- | A function to run when a production rule fires, operating some state @s@.
data StateFncn s =
    Pass                    -- ^ No predicate or mutator
  | Sem    (Predicate ())   -- ^ Semantic predicate
  | Action (Mutator ())     -- ^ Mutator, ProdElems is always empty in this one
  deriving (Eq, Ord, Generic, Hashable, Show, Data, Lift)

instance Prettify (StateFncn s) where
  prettify Pass       = return ()
  prettify (Sem p)    = prettify p
  prettify (Action a) = prettify a

-- | Right-hand side of a single production rule
data ProdRHS s nts ts = Prod (StateFncn s) (ProdElems nts ts)
  deriving (Eq, Ord, Generic, Hashable, Show, Data, Lift)

instance (Prettify s, Prettify nts, Prettify ts) => Prettify (ProdRHS s nts ts) where
  prettify (Prod sf ps) = do
    prettify sf
    prettify ps

-- | Is this 'ProdRHS' a semantic predicate?
isSem (Prod (Sem _) _) = True
isSem _ = False

-- | Is this 'ProdRHS' a mutator?
isAction (Prod (Action _) _) = True
isAction _ = False

-- | Get just the production elements from a bunch of production rules
getProds = map (\(Prod _ ss) -> ss)

-- | A single production rule
data Production s nts ts = Production nts (ProdRHS s nts ts)
  deriving (Eq, Ord, Generic, Hashable, Data, Lift)

instance (Prettify s, Prettify nts, Prettify ts) => Prettify (Production s nts ts) where
  prettify (Production nts (Prod sf ps)) = do
    len <- pCount nts
    -- Put the indentation level after the nonterminal, or just incr by 2 if
    -- lazy...
    incrIndent (len + 4)
    pStr " -> "
    prettify sf
    prettify ps
    incrIndent (-4)

instance (Show s, Show nts, Show ts) => Show (Production s nts ts) where
  show (Production nts rhs) = show nts ++ " -> " ++ show rhs

-- | Inline get 'ProdRHS' of a 'Production'
getRHS :: Production s nts ts -> ProdRHS s nts ts
getRHS (Production lhs rhs) = rhs

-- | Inline get the nonterminal symbol naming a 'Production'
getLHS :: Production s nts ts -> nts
getLHS (Production lhs rhs) = lhs

-- | Get only the productions for the given nonterminal symbol nts:
prodsFor :: forall s nts ts. (Eq nts) => Grammar s nts ts -> nts -> [Production s nts ts]
prodsFor g nts = let
    matchesNT :: Production s nts t -> Bool
    matchesNT (Production nts' _) = nts' == nts
  in filter matchesNT (ps g)

-- TODO: boiler plate auto deriving for "named" of a user defined type?

-- | Predicates and Mutators act over some state. The String
--   identifiers should eventually correspond to source-level
--   e.g. location / allocation site information, i.e. two
--   predicates or mutators are equivalent iff they were
--   constructed from the same production rule.
data Predicate p = Predicate String p
  deriving (Data)

instance (Data s, Typeable s) => Lift (Predicate s)
instance (Data s, Typeable s) => Lift (Mutator s)

instance Eq (Predicate s) where
  Predicate p1 _ == Predicate p2 _ = p1 == p2

instance Ord (Predicate s) where
  Predicate p1 _ `compare` Predicate p2 _ = p1 `compare` p2

instance Show (Predicate s) where
  show (Predicate p1 _) = "π(" ++ show p1 ++ ")"

instance Hashable (Predicate s) where
  hashWithSalt salt (Predicate p1 _) = salt `hashWithSalt` p1

instance Prettify (Predicate s) where
  prettify (Predicate n _) = pStr' n

instance Prettify (Mutator s) where
  prettify (Mutator n _) = pStr' n

-- | Function for mutating the state of the parser when a certain
--   production rule fires.
data Mutator   s = Mutator String ()
  deriving (Data)

instance Eq (Mutator s) where
  Mutator m1 _ == Mutator m2 _ = m1 == m2

instance Ord (Mutator s) where
  Mutator m1 _ `compare` Mutator m2 _ = m1 `compare` m2

instance Show (Mutator s) where
  show (Mutator m1 _) = "µ(" ++ show m1 ++ ")"

instance Hashable (Mutator s) where
  hashWithSalt salt (Mutator m1 _) = salt `hashWithSalt` m1

-- | Core representation of a grammar, as used by the parsing algorithms.
data Grammar s nts ts = G
  { ns  :: Set nts
  , ts  :: Set ts
  , ps  :: [Production s nts ts]
  , s0  :: nts
  , _πs :: Set (Predicate s)
  , _μs :: Set (Mutator   s)
  } deriving (Show, Lift)

instance (Eq s, Eq nts, Eq ts, Hashable nts, Hashable ts, Prettify s, Prettify nts, Prettify ts)
  => Eq (Grammar s nts ts) where
  g1 == g2 = ns g1 == ns g2
          && ts g1 == ts g2
          && eqLists (nub $ ps g1) (nub $ ps g2)
          && s0 g1 == s0 g2
          && _πs g1 == _πs g2
          && _μs g1 == _μs g2

eqLists [] [] = True
eqLists [] vs = False
eqLists vs [] = False
eqLists (v1:vs) vs2 = eqLists vs (filter (/= v1) vs2)

instance (Prettify s, Prettify nts, Prettify ts, Hashable ts, Eq ts, Hashable nts, Eq nts, Ord ts, Ord nts)
  => Prettify (Grammar s nts ts) where
  prettify G {ns = ns, ts = ts, ps = ps, s0 = s0, _πs = _πs, _μs = _μs} = do
    pLine "Grammar:"
    pStr "{ "
    incrIndent 2
    pStr  "  ns = "      ; prettify ns; pLine ""
    pStr  ", ts = "      ; prettify ts; pLine ""
    pStr  ", ps = "      ; pListLines $ sort ps; pLine ""
    pStr  ", s0 = "      ; prettify s0; pLine ""
    pStr  ", _πs = "     ; prettify _πs ; pLine ""
    pStr  ", _μs = "     ; prettify _μs ; pLine ""
    incrIndent (-2)
    pStr "}"

-- | All possible production elements of a given grammar.
symbols
  :: (Ord nts, Ord ts, Hashable s, Hashable nts, Hashable ts)
  => Grammar s nts ts -> Set (ProdElem nts ts)
symbols g = S.insert Eps $ S.map NT (ns g) `union` S.map T (ts g)

-- | The empty grammar - accepts nothing, with one starting nonterminal
--   and nowhere to go.
defaultGrammar
  :: forall s nts ts. (Ord ts, Hashable ts, Hashable nts, Eq nts)
  => nts -> Grammar s nts ts
defaultGrammar start = G
  { ns  = S.singleton start
  , ts  = empty
  , ps  = []
  , _πs = empty
  , _μs = empty
  , s0  = start
  }

-- | Does the given grammar make any sense?
validGrammar
  :: forall s nts ts.
  (Eq nts, Ord nts, Eq ts, Ord ts, Hashable nts, Hashable ts)
  => Grammar s nts ts -> Bool
validGrammar g =
     hasAllNonTerms g
  && hasAllTerms g
  && startIsNonTerm g
--  && distinctTermsNonTerms g

-- | All nonterminals in production rules can be found in the nonterminals list.
hasAllNonTerms
  :: (Eq nts, Ord nts, Hashable nts, Hashable ts)
  => Grammar s nts ts -> Bool
hasAllNonTerms g =
  ns g == (fromList . getNTs . concat . getProds . map getRHS $ ps g)

-- | All terminals in production rules can be found in the terminal list.
hasAllTerms
  :: (Eq ts, Ord ts, Hashable nts, Hashable ts)
  => Grammar s nts ts -> Bool
hasAllTerms g =
  ts g == (fromList . getTs . concat . getProds . map getRHS $ ps g)

-- | The starting symbol is a valid nonterminal.
startIsNonTerm
  :: (Ord nts, Hashable nts)
  => Grammar s nts ts -> Bool
startIsNonTerm g = s0 g `member` ns g

--distinctTermsNonTerms g =
--  (ns g `intersection` ts g) == empty