{-# LANGUAGE DeriveGeneric, OverloadedStrings, ApplicativeDo #-}
module Data.Concrete.Parsers.CONLL
       ( parser
       , singleParser
       , conllxfields
       , conllufields
       , conll2009fields
       ) where

import Data.List (intercalate)
import Data.Concrete.Parsers.Types (Bookkeeper(..), CommunicationParser)
import Data.Concrete.Parsers.Utils (communicationRule)
import Data.Scientific (scientific, Scientific(..))
import Data.Text.Lazy (pack, unpack, Text)
import Data.Functor (($>))
import qualified Data.Map as Map
import Data.Map (Map)
import Data.List.NonEmpty (fromList)
import Text.Megaparsec.Lexer (symbol, lexeme, signed, number)
import Text.Megaparsec.Pos (initialPos, defaultTabWidth)
import Text.Megaparsec.Error (Dec)
import Text.Megaparsec ( parseErrorPretty
                       , (<|>)
                       , space
                       , hexDigitChar
                       , count
                       , manyTill
                       , anyChar
                       , runParser
                       , some
                       , char
                       , choice
                       , sepBy
                       , between
                       , match
                       , ParsecT
                       , runParserT'
                       , State(..)
                       , getParserState
                       , eol
                       , tab
                       , newline
                       , sepBy1
                       , many
                       , noneOf
                       , eof
                       , separatorChar
                       , someTill
                       )

import Text.Megaparsec.Text.Lazy (Parser)
--import Data.Concrete (default_Communication, Communication(..))
import Data.Concrete.Autogen.Communication_Types (default_Communication, Communication(..))
import qualified Control.Monad.State as S
import qualified Control.Monad.Identity as I
--import Data.Concrete.Types
import Data.Concrete.Parsers.Utils (communicationRule, sectionRule, sentenceRule, tokenRule, pushPathComponent, popPathComponent)

-- | Field names for the CONLL-X format
conllxfields = ["ID", "FORM", "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD"] :: [Text]

-- | Field names for the CONLL-U format
conllufields = ["ID", "FORM", "LEMMA", "UPOSTAG", "XPOSTAG", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"] :: [Text]

-- | Field names for the CONLL2009 format
conll2009fields = ["ID", "FORM", "LEMMA", "UPOSTAG", "XPOSTAG", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC", "DEPREL", "PDEPREL", "FILLPRED", "PRED"] ++ (map (\x -> pack $ "APRED" ++ (show x)) [1..16]) :: [Text]

-- | Parses different flavors of CONLL depending on which fields are passed as arguments
--   Each sentence is made into its own Communication
parser :: [Text] -> CommunicationParser ()
parser fs = (communicationRule id (sentence fs)) `sepBy1` sentenceBreak >> return ()

-- | Parses different flavors of CONLL depending on which fields are passed as arguments
--   All input sentences are placed in a single Communication
singleParser :: [Text] -> CommunicationParser ()
singleParser fs = (communicationRule id (sentence fs)) `sepBy1` sentenceBreak >> return ()
  
sentence fs = do
  pushPathComponent "sentence"
  (sectionRule id . sentenceRule id) $ (some (commentLine <|> wordLine fs))
  popPathComponent
  return ()

commentLine = (char '#') >> (manyTill anyChar newline)
  
wordLine fs = (row fs) >> return []

row fs = do
  ls <- mapM (\n -> namedEntry n >> char '\t') (init fs)
  s <- namedEntry (last fs)
  newline
  return ()

-- | Parse a CONLL cell, which if it's "FORM" should be treated as a Token
namedEntry :: Text -> ParsecT Dec Text (S.StateT Bookkeeper IO) Text
namedEntry f = if f == "FORM" then tokenRule id $ pack <$> (some (noneOf ['\t', '\n'])) else pack <$> (some (noneOf ['\t', '\n']))

sentenceBreak = newline