-- GenI surface realiser
-- Copyright (C) 2005 Carlos Areces and Eric Kow
--
-- This program is free software; you can redistribute it and/or
-- modify it under the terms of the GNU General Public License
-- as published by the Free Software Foundation; either version 2
-- of the License, or (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program; if not, write to the Free Software
-- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

{-# LANGUAGE CPP               #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -fno-warn-unused-do-bind #-}
module NLP.GenI.Parser (
  -- * Test suites
  geniTestSuite, geniSemanticInput, geniTestSuiteString,
  geniDerivations,
  -- * Trees
  geniMacros, geniTagElems,
  -- * Lexicon and morph
  geniLexicon, geniMorphInfo,
  -- * Basics
  geniFeats, geniSemantics, geniValue, geniWords,
  -- * Helpers
  geniWord, geniLanguageDef, tillEof,
  --
  parseFromFile, -- UTF-8 version
  module Text.Parsec,
  module Text.Parsec.Text,
) where


import           Control.Applicative       ((*>), (<$>), (<*), (<*>), pure)
import           Control.Monad             (liftM, when)
import qualified Data.ByteString           as B
import           Data.Functor.Identity     (Identity)
import qualified Data.Map                  as Map
import           Data.Monoid               ((<>))
import           Data.Text                 (Text)
import qualified Data.Text                 as T
import qualified Data.Text.Encoding        as T
import qualified Data.Tree                 as T
import           Text.Parsec
import           Text.Parsec.Text
import           Text.Parsec.Token         (GenLanguageDef (..),
                                            makeTokenParser)
import qualified Text.Parsec.Token         as P

-- import BoolExp
import           Data.FullList             (FullList, Listable (..))
import           NLP.GenI.FeatureStructure (AvPair (..), Flist, sortFlist)
import           NLP.GenI.General          (isGeniIdentLetter)
import           NLP.GenI.GeniShow         (GeniShow (..), geniKeyword)
import           NLP.GenI.GeniVal          (GeniVal, SchemaVal (..), isAnon,
                                            mkGAnon, mkGConst, mkGConstNone,
                                            mkGVar)
import           NLP.GenI.Lexicon          (LexEntry (..), fromLexSem,
                                            mkFullLexEntry)
import           NLP.GenI.Pretty           (above)
import           NLP.GenI.Semantics        (LitConstr, Literal (..), Sem,
                                            SemInput, sortSem)
import           NLP.GenI.Tag              (TagElem (..), setTidnums)
import           NLP.GenI.TestSuite        (TestCase (..))
import           NLP.GenI.TreeSchema       (GNode (..), GType (..), Ptype (..),
                                            AdjunctionConstraint(..),
                                            SchemaTree, Ttree (..))


-- General notes

-- reserved words
#define SEMANTICS       "semantics"
#define SENTENCE        "sentence"
#define OUTPUT          "output"
#define TRACE           "trace"
#define ANCHOR          "anchor"
#define SUBST           "subst"
#define FOOT            "foot"
#define LEX             "lex"
#define TYPE            "type"
#define ACONSTR         "aconstr"
#define INITIAL         "initial"
#define AUXILIARY       "auxiliary"
#define IDXCONSTRAINTS  "idxconstraints"
#define BEGIN           "begin"
#define END             "end"

-- Lexer

geniLanguageDef :: P.GenLanguageDef Text () Identity
geniLanguageDef = LanguageDef
    { commentLine  = "%"
    , commentStart = "/*"
    , commentEnd   = "*/"
    , nestedComments = True
    , opStart  = opLetter geniLanguageDef
    , opLetter = oneOf ""
    , reservedOpNames = []
    , reservedNames =
          [ SEMANTICS , SENTENCE, OUTPUT, IDXCONSTRAINTS, TRACE
          , ANCHOR , SUBST , FOOT , LEX , TYPE , ACONSTR
          , INITIAL , AUXILIARY
          , BEGIN , END
          ]
    , identLetter = identStuff
    , identStart  = identStuff
    , caseSensitive  = True
    }
  where
    identStuff = satisfy isGeniIdentLetter

geniValue :: Parser GeniVal
geniValue =   ((try $ anonymous) <?> "_ or ?_")
          <|> (constants  <?> "a constant or atomic disjunction")
          <|> (variable   <?> "a variable")
  where
    question = "?"
    disjunction = geniAtomicDisjunction
    constants :: Parser GeniVal
    constants = mkGConst <$> disjunction
    variable :: Parser GeniVal
    variable =
      do symbol question
         v <- identifier
         mcs <- option Nothing $ (symbol "/" >> Just `liftM` disjunction)
         return (mkGVar v mcs)
    anonymous :: Parser GeniVal
    anonymous =
      do optional $ symbol question
         symbol "_"
         return mkGAnon

geniAtomicDisjunction :: Parser (FullList Text)
geniAtomicDisjunction = do
    (x:xs) <- atom `sepBy1` (symbol "|")
    return (x !: xs)
  where
    atom = looseFlexiIdentifier

geniFancyDisjunction :: Parser SchemaVal
geniFancyDisjunction = SchemaVal <$> geniValue `sepBy1` symbol ";"

class GeniValLike v where
  geniValueLike :: Parser v

instance GeniValLike GeniVal where
  geniValueLike = geniValue

instance GeniValLike SchemaVal where
  geniValueLike = geniFancyDisjunction

-- We make no attempt to check for / guarantee uniqueness here
-- because the same sort of format is used for things which are
-- not strictly speaking feature structures
geniFeats :: GeniValLike v => Parser (Flist v)
geniFeats = option [] $ squares $ many geniAttVal

geniAttVal :: GeniValLike v => Parser (AvPair v)
geniAttVal = do
    att <- identifierR <?> "an attribute"; colon
    val <- geniValueLike <?> "a GenI value"
    return $ AvPair att val

geniSemantics :: Parser Sem
geniSemantics =
  do sem <- many (geniLiteral <?> "a literal")
     return (sortSem sem)

geniLiteral :: Parser (Literal GeniVal)
geniLiteral = geniLiteral_ mkGAnon geniValue

geniLiteral_ :: a -> Parser a -> Parser (Literal a)
geniLiteral_ zero gv =
    Literal <$> (option zero handleParser <?> "a handle")
            <*> (gv <?> "a predicate")
            <*> (parens (many gv) <?> "some parameters")
  where
    handleParser = try $ gv <* char ':'

geniSemanticInput :: Parser (Sem,Flist GeniVal,[LitConstr])
geniSemanticInput =
  do keywordSemantics
     (sem,litC) <- liftM unzip $ squares $ many literalAndConstraint
     idxC       <- option [] geniIdxConstraints
     --
     let sem2     = createHandles sem
         semlitC2 = [ (s,c) | (s,c) <- zip sem2 litC, (not.null) c ]
     return (createHandles sem, idxC, semlitC2)
  where
     -- set all anonymous handles to some unique value
     -- this is to simplify checking if a result is
     -- semantically complete
     createHandles :: Sem -> Sem
     createHandles = zipWith setHandle ([1..] :: [Int])
     --
     setHandle i (Literal h pred_ par) =
       let h2 = if isAnon h
                then mkGConstNone ("genihandle" `T.append` T.pack (show i))
                else h
       in Literal h2 pred_ par
     --
     literalAndConstraint :: Parser LitConstr
     literalAndConstraint =
       do l <- geniLiteral
          t <- option [] $ squares $ many identifier
          return (l,t)

-- | The original string representation of the semantics (for gui)
geniSemanticInputString :: Parser Text
geniSemanticInputString = do
    keywordSemantics
    s <- squaresString
    whiteSpace
    xs <- option [] geniIdxConstraints
    return (spitBack s xs)
  where
    -- this is a bit embarassing
    spitBack semStr idxC =
        geniKeyword SEMANTICS semStr `above` r
      where
        r | null idxC = ""
          | otherwise = geniKeyword IDXCONSTRAINTS (geniShowText idxC)

geniIdxConstraints :: Parser (Flist GeniVal)
geniIdxConstraints = keyword IDXCONSTRAINTS >> geniFeats

{-
geniLitConstraints :: Parser (BoolExp T.Text)
geniLitConstraints =
   P.buildExpressionParser table piece
 where
   piece =  (Cond <$> identifier)
       <|> do { string "~"; Not `liftM` geniLitConstraints }
       <|> parens geniLitConstraints
   table = [ [ op "&" And P.AssocLeft ]
           , [ op "|" Or  P.AssocLeft ]
           ]
   op s f assoc = P.Infix (do { string s ; return f }) assoc
-}

squaresString :: Parser Text
squaresString =
    between (char '[') (char ']') $ do
       xs <- many1 (nonSq <|> squaresString)
       return $ "[" <> T.concat xs <> "]"
  where
    nonSq :: Parser Text
    nonSq = T.pack <$> many1 (noneOf "[]")

-- the output end of things
-- displaying preformatted semantic input

geniTestSuite :: Parser [TestCase SemInput]
geniTestSuite =
  tillEof (many geniTestCase)

-- | Just the String representations of the semantics
--   in the test suite
geniTestSuiteString :: Parser [Text]
geniTestSuiteString =
    tillEof (many geniTestCaseString)

-- | This is only used by the script genimakesuite
geniDerivations :: Parser [TestCaseOutput]
geniDerivations = tillEof $ many geniOutput

geniTestCase :: Parser (TestCase SemInput)
geniTestCase =
     TestCase <$> (option "" (flexiIdentifier <?> "a test case name"))
              <*> lookAhead geniSemanticInputString
              <*> geniSemanticInput
              <*> many geniSentence
              <*> pure Nothing

-- note that the keyword is NOT optional
type TestCaseOutput = (Text, Map.Map (Text,Text) [Text])
geniOutput :: Parser TestCaseOutput
geniOutput = do
    ws <- keyword OUTPUT >> squares geniWords
    ds <- Map.fromList <$> many geniTraces
    return (ws, ds)

geniTraces :: Parser ((Text,Text), [Text])
geniTraces = do
    keyword TRACE
    squares $ do
        k1 <- withWhite geniWord
        k2 <- withWhite geniWord
        whiteSpace >> char '!' >> whiteSpace
        traces <- geniWord `sepEndBy1` whiteSpace
        return ((k1,k2), traces)

withWhite :: Parser a -> Parser a
withWhite p = p >>= (\a -> whiteSpace >> return a)

geniSentence :: Parser Text
geniSentence = optional (keyword SENTENCE) >> squares geniWords

geniWords :: Parser Text
geniWords =
    T.unwords <$> (sepEndBy1 geniWord whiteSpace <?> "a sentence")

geniWord :: Parser Text
geniWord = T.pack <$> many1 (noneOf "[]\v\f\t\r\n ")

-- | The original string representation of a test case semantics
--   (for gui)
geniTestCaseString :: Parser Text
geniTestCaseString = do
    option "" (flexiIdentifier <?> "a test case name")
    geniSemanticInputString <* (many geniSentence >> many geniOutput)

-- ----------------------------------------------------------------------
-- Lexicon
-- ----------------------------------------------------------------------

geniLexicon :: Parser [LexEntry]
geniLexicon = tillEof $ many1 geniLexicalEntry

geniLexicalEntry :: Parser LexEntry
geniLexicalEntry =
  do lemmas  <- geniAtomicDisjunction <?> "a lemma (or disjunction thereof)"
     family  <- identifier <?> "a tree family"
     (pars, interface) <- option ([],[]) $ parens paramsParser
     equations <- option [] $ do keyword "equations"
                                 geniFeats <?> "path equations"
     filters <- option [] $ do keyword "filters"
                               geniFeats
     keywordSemantics
     (sem, pols) <- fromLexSem <$> squares geniLexSemantics
     --
     return (mkFullLexEntry lemmas family pars interface filters equations sem pols)
  where
    paramsParser :: Parser ([GeniVal], Flist GeniVal)
    paramsParser = do
      pars <- many geniValue <?> "some parameters"
      interface <- option [] $ do symbol "!"
                                  many geniAttVal
      return (pars, interface)

geniLexSemantics :: Parser [Literal PolValue]
geniLexSemantics = sortSem <$> many (geniLexLiteral <?> "a literal")

type PolValue = (GeniVal, Int)

geniLexLiteral :: Parser (Literal PolValue)
geniLexLiteral = geniLiteral_ (mkGAnon,0) geniPolValue

geniPolValue :: Parser (GeniVal, Int)
geniPolValue =
  do p <- geniPolarity
     v <- geniValue
     return (v,p)

-- ----------------------------------------------------------------------
-- Tree schemata
-- ----------------------------------------------------------------------

geniMacros :: Parser [SchemaTree]
geniMacros = tillEof $ many geniTreeDef

initType, auxType :: Parser Ptype
initType = do { reserved INITIAL ; return Initial  }
auxType  = do { reserved AUXILIARY ; return Auxiliar }

geniTreeDef :: Parser SchemaTree
geniTreeDef =
  do sourcePos <- getPosition
     family   <- identifier
     tname    <- option "" (colon *> identifier)
     (pars,iface)   <- geniParams
     theTtype  <- (initType <|> auxType)
     theTree  <- geniTree
     -- sanity checks?
     let treeFail x =
          do setPosition sourcePos -- FIXME does not do what I expect
             fail $ "In tree " ++ T.unpack family ++ ":"
                    ++ T.unpack tname ++ " " ++ show sourcePos ++ ": " ++ x
     let theNodes = T.flatten theTree
         numFeet    = length [ x | x <- theNodes, gtype x == Foot ]
         numAnchors = length [ x | x <- theNodes, ganchor x ]
     when (not $ any ganchor theNodes) $
       treeFail "At least one node in an LTAG tree must be an anchor"
     when (numAnchors > 1) $
       treeFail "There can be no more than 1 anchor node in a tree"
     when (numFeet > 1) $
       treeFail "There can be no more than 1 foot node in a tree"
     when (theTtype == Initial && numFeet > 0) $
       treeFail "Initial trees may not have foot nodes"
     --
     psem     <- option Nothing $ do { keywordSemantics; liftM Just (squares geniSemantics) }
     ptrc     <- option [] $ do { keyword TRACE; squares (many identifier) }
     --
     return TT{ params = pars
              , pfamily = family
              , pidname = tname
              , pinterface = sortFlist iface
              , ptype = theTtype
              , tree = theTree
              , ptrace = ptrc
              , psemantics = psem
              }

geniTree :: (Ord v, GeniValLike v) => Parser (T.Tree (GNode v))
geniTree =
  do node <- geniNode
     kids <- option [] (braces $ many geniTree)
             <?> "child nodes"
     -- sanity checks
     let noKidsAllowed t c = when (c node && (not.null $ kids)) $
             fail $ t ++ " nodes may *not* have any children"
     noKidsAllowed "Anchor"       $ ganchor
     noKidsAllowed "Substitution" $ (== Subs) . gtype
     noKidsAllowed "Foot"         $ (== Foot) . gtype
     --
     return (T.Node node kids)

geniNode :: (Ord v, GeniValLike v) => Parser (GNode v)
geniNode = do
    name      <- identifier
    nodeType  <- geniNodeAnnotation
    lex_   <- if nodeType == AnnoLexeme
                 then (flexiIdentifier `sepBy` symbol "|") <?> "some lexemes"
                 else return []
    constr <- case nodeType of
                  AnnoDefault -> adjConstraintParser
                  AnnoAnchor  -> adjConstraintParser
                  _           -> return ExplicitNoAdj -- maybe InferredNoAdj instead?
    -- features only obligatory for non-lex nodes
    (top,bot) <- if nodeType == AnnoLexeme
                    then option ([],[]) $ try topbotParser
                    else topbotParser
    return $ GN { gnname   = name
                , gtype    = fromAnnotation nodeType
                , gup      = sortFlist top
                , gdown    = sortFlist bot
                , glexeme  = lex_
                , ganchor  = nodeType == AnnoAnchor
                , gaconstr = constr
                , gorigin  = ""
                }
  where
    adjConstraintParser = option MaybeAdj $
        reserved ACONSTR >> char ':' >> symbol "noadj" >> return ExplicitNoAdj
    topbotParser = do
        top <- geniFeats <?> "top features"
        symbol "!"
        bot <- geniFeats <?> "bot features"
        return (top,bot)

-- | Should be purely internal type to help parsing.
--   Injection to 'GType'.
--
--   We don't just use GType directly because the annotations convey
--   subtle distinctions that aren't encoded, particularly between
--   lexemes and anchors
data Annotation = AnnoAnchor
                | AnnoLexeme
                | AnnoSubst
                | AnnoFoot
                | AnnoDefault
  deriving Eq

fromAnnotation :: Annotation -> GType
fromAnnotation AnnoLexeme  = Lex
fromAnnotation AnnoAnchor  = Lex
fromAnnotation AnnoSubst   = Subs
fromAnnotation AnnoFoot    = Foot
fromAnnotation AnnoDefault = Other

geniNodeAnnotation :: Parser Annotation
geniNodeAnnotation =
    (keyword TYPE *> ty)                   <|>
    (reserved ANCHOR >> return AnnoAnchor) <|>
    return AnnoDefault
  where
    ty    = choice [ try (symbol s) >> return t | (s,t) <- table ]
    table =
        [ (ANCHOR, AnnoAnchor)
        , (FOOT,   AnnoFoot)
        , (SUBST,  AnnoSubst)
        , (LEX,    AnnoLexeme)
        ]

-- | This makes it possible to read anchored trees, which may be
--   useful for debugging purposes.
--
--   FIXME: note that this is very rudimentary; we do not set id numbers,
--   parse polarities. You'll have to call
--   some of our helper functions if you want that functionality.
geniTagElems :: Parser [TagElem]
geniTagElems = tillEof $ setTidnums `fmap` many geniTagElem

geniTagElem :: Parser TagElem
geniTagElem = do
    family   <- identifier
    tname    <- option "" $ (colon *> identifier)
    iface    <- (snd `liftM` geniParams) <|> geniFeats
    theType  <- initType <|> auxType
    theTree  <- geniTree
    sem      <- do { keywordSemantics; squares geniSemantics }
    --
    return $ TE { idname      = tname
                , ttreename   = family
                , tinterface  = iface
                , ttype       = theType
                , ttree       = theTree
                , tsemantics  = sem
                , tidnum      = -1 -- provisional id
                , tpolarities = Map.empty
                , tsempols    = []
                , ttrace      = []
                }

-- | 'geniParams' recognises a list of parameters optionally followed by a
--  bang (\verb$!$) and a list of attribute-value pairs.  This whole thing is
--  to wrapped in the parens.
--
--  TODO: deprecate
geniParams :: Parser ([GeniVal], Flist GeniVal)
geniParams = parens $ do
  pars <- many geniValue <?> "some parameters"
  interface <- option [] $ do { symbol "!"; many geniAttVal }
  return (pars, interface)

-- ----------------------------------------------------------------------
-- Morphology
-- ----------------------------------------------------------------------

geniMorphInfo :: Parser [(Text,Flist GeniVal)]
geniMorphInfo = tillEof $ many morphEntry

morphEntry :: Parser (Text,Flist GeniVal)
morphEntry = (,) <$> identifier <*> geniFeats

-- ======================================================================
-- Everything else
-- ======================================================================

-- ----------------------------------------------------------------------
-- Polarities
-- ----------------------------------------------------------------------

-- | 'geniPolarity' associates a numerical value to a polarity symbol,
--  that is, '+' or '-'.
geniPolarity :: Parser Int
geniPolarity = option 0 (plus <|> minus)
  where
    plus  = do { char '+'; return  1   }
    minus = do { char '-'; return (-1) }

-- ----------------------------------------------------------------------
-- keyword
-- ----------------------------------------------------------------------

{-# INLINE keyword #-}
keyword :: Text -> Parser Text
keyword k =
     (try $ do { reserved k; colon; return k }) <?> T.unpack k ++ ":"

{-# INLINE keywordSemantics #-}
keywordSemantics :: Parser Text
keywordSemantics = keyword SEMANTICS

-- ----------------------------------------------------------------------
-- language def helpers
-- ----------------------------------------------------------------------

lexer :: P.GenTokenParser Text () Identity
lexer  = makeTokenParser geniLanguageDef

whiteSpace :: Parser ()
whiteSpace = P.whiteSpace lexer

identifier :: Parser Text
identifier = decode <$> P.identifier lexer

-- | Like 'identifier', but also accepts string literals
flexiIdentifier :: Parser Text
flexiIdentifier = stringLiteral <|> identifier

-- stolen from Parsec code (ident)
-- | Like 'identifier' but allows for reserved words too
looseIdentifier :: Parser Text
looseIdentifier =
    decode <$> do { i <- ident ; whiteSpace; return i }
  where
    ident = do
        { c <- identStart geniLanguageDef
        ; cs <- many (identLetter geniLanguageDef)
        ; return (c:cs) } <?> "identifier"

-- | Accepts: identifiers, bare reserved words, and string literals
looseFlexiIdentifier :: Parser Text
looseFlexiIdentifier = looseIdentifier <|> stringLiteral

colon :: Parser Text
colon = decode <$> P.colon lexer

stringLiteral :: Parser Text
stringLiteral = decode <$> P.stringLiteral lexer

squares, braces, parens :: Parser a -> Parser a
squares = P.squares lexer
braces  = P.braces  lexer
parens  = P.parens  lexer

reserved :: Text -> Parser Text
reserved s = P.reserved lexer (T.unpack s) >> return s

symbol :: Text -> Parser Text
symbol s = P.symbol lexer (T.unpack s) >> return s

decode :: String -> Text
decode = T.pack

-- ----------------------------------------------------------------------
-- parsec helpers
-- ----------------------------------------------------------------------

-- | identifier, permitting reserved words too
identifierR :: Parser Text
identifierR = decode <$> do
    { c <- P.identStart geniLanguageDef
    ; cs <- many (P.identLetter geniLanguageDef)
    ; return (c:cs)
    } <?> "identifier or reserved word"

tillEof :: Parser a -> Parser a
tillEof p = whiteSpace *> p <* eof

-- stolen from Parsec and adapted to use UTF-8 input
parseFromFile :: Parser a -> SourceName -> IO (Either ParseError a)
parseFromFile p fname = do
    { input <- T.decodeUtf8 <$> B.readFile fname
    ; return (parse p fname input)
    }