{-# LANGUAGE DeriveGeneric, OverloadedStrings, ApplicativeDo #-}
module Data.Concrete.Parsers.PTB
       ( parser
       ) where

import Data.Char (isSpace)
import Data.List (intercalate)
import Data.Concrete.Parsers.Types (Bookkeeper(..), CommunicationParser)
import Data.Concrete.Parsers.Utils (communicationRule)
import Data.Scientific (scientific, Scientific(..))
import Data.Text.Lazy (pack, Text)
import Data.Functor (($>))
import qualified Data.Map as Map
import Data.Map (Map)
import Data.List.NonEmpty (fromList)
import Text.Megaparsec.Lexer (symbol, lexeme, signed, number)
import Text.Megaparsec.Pos (initialPos, defaultTabWidth)
import Text.Megaparsec.Error (Dec)
import Text.Megaparsec.Lexer (symbol, lexeme, signed, number)
import Text.Megaparsec ( parseErrorPretty
                       , (<|>)
                       , satisfy
                       , space
                       , hexDigitChar
                       , count
                       , manyTill
                       , anyChar
                       , runParser
                       , some
                       , char
                       , choice
                       , sepBy
                       , between
                       , match
                       , ParsecT
                       , runParserT'
                       , State(..)
                       , getParserState
                       , spaceChar
                       , eof
                       , noneOf
                       , try
                       )

import Control.Monad.IO.Class (liftIO)       
import Text.Megaparsec.Text.Lazy (Parser)
import Data.Concrete.Autogen.Communication_Types (default_Communication, Communication(..))
import qualified Control.Monad.State as S
import qualified Control.Monad.Identity as I
import Data.Concrete.Parsers.Utils (communicationRule, sectionRule, sentenceRule, tokenRule, pushPathComponent, popPathComponent)

-- | Top-level parser for Penn Treebank (PTB) bracketed text.
--
--   Skips leading whitespace, consumes one or more 'document's, skips any
--   remaining whitespace and finally requires end of input.
--
--   NOTE: constituent tags and parse structure are not captured yet.
parser :: CommunicationParser ()
parser = space *> some document *> space *> eof

-- | A single document: a parenthesised group of one or more 'sentence's,
--   run under @communicationRule id@, with trailing whitespace consumed.
--   The result of the rule is discarded.
document :: CommunicationParser ()
document = lexeme' (() <$ communicationRule id (parens (some sentence)))

-- | One bracketed sentence of the form @(S phrase+ )@.
--
--   The path component @"sentence"@ is pushed before the body is parsed and
--   popped afterwards; the body itself runs under both @sectionRule id@ and
--   @sentenceRule id@.
--
--   NOTE(review): the opening delimiter is the literal string @"(S"@, so it
--   is matched as a prefix rather than as a complete tag -- confirm this is
--   intended.
sentence =
  pushPathComponent "sentence"
    >> sectionRule id (sentenceRule id bracketedBody)
    >> popPathComponent
  where
    -- The "(S" ... ")" group holding one or more phrases.
    bracketedBody =
      lexeme' (between (symbol' "(S") (symbol' ")") (some phrase))
  
-- | A parenthesised constituent: a 'tag' followed by one or more children,
--   each of which is either a 'token' or a nested 'phrase'.
--
--   The tag and the children are parsed only for their effects; the result
--   is always the empty list.
phrase = lexeme' ([] <$ parens (tag *> some (token <|> phrase)))

-- | A constituent tag. Parsed exactly like any other 'lexicalItem'; no
--   bookkeeping rule is applied to it here.
tag = lexicalItem

-- | A leaf token: a 'lexicalItem' run under @tokenRule id@.
--   NOTE(review): what @tokenRule@ records is defined in
--   "Data.Concrete.Parsers.Utils" -- presumably a token annotation; confirm
--   there.
token = tokenRule id lexicalItem

-- | One or more consecutive characters accepted by 'notSpaceOrParen',
--   wrapped in @lexeme'@ so that whitespace following the item is skipped.
lexicalItem = lexeme' $ some notSpaceOrParen

-- | Accept a single character that is neither whitespace nor one of the
--   parentheses @(@ and @)@.
notSpaceOrParen = satisfy allowed
  where
    -- Reject whitespace and both bracket characters; accept everything else.
    allowed c = not (isSpace c || c == '(' || c == ')')

-- | 'lexeme' specialised to use 'space' as its whitespace consumer.
--   NOTE(review): there is no type signature, so this point-free binding is
--   presumably subject to the monomorphism restriction; eta-expanding it
--   would change the inferred type.
lexeme' = lexeme space
-- | 'symbol' specialised to use 'space' as its whitespace consumer: matches
--   the given literal string.
symbol' = symbol space
-- | Run a parser between an opening @(@ and a closing @)@, both matched
--   with @symbol'@.
parens = between (symbol' "(") (symbol' ")")