{-# LANGUAGE ApplicativeDo #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}

module NLP.SentiwordnetParser
  ( parse
  , SentiWordNet(..)
  , Entry(..)
  , SynsetTerm(..)
  , POS(..)
  , SentiWordNetLookup
  , SentiWordNetLookupItem(..)
  , toSentiWordNetLookup
  -- * internal stuff
  , parseSentiWordNet
  , parsePOS
  , parseDecimal
  , parseInt
  , parseEntry
  , parseSynsetTerm
  , parseComment
  , test
  ) where

import Control.Applicative
import Data.Decimal (Decimal)
import qualified Data.HashMap.Strict as H
import Data.Hashable (Hashable)
import Data.Semigroup
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import GHC.Generics (Generic)
import Safe
import Text.Trifecta

data SynsetTerm = SynsetTerm
  { name :: Text
  , num :: Int
  } deriving (Show, Eq)

data POS
  = Noun
  | Verb
  | Adjective
  | AdjectiveSatellite
  | Adverb
  deriving (Show, Eq, Generic, Hashable)

data Entry = Entry
  { pos :: POS
  , id_ :: Text
  , posScore :: Decimal
  , negScore :: Decimal
  , synsetTerms :: [SynsetTerm]
  , gloss :: Text
  } deriving (Show, Eq)

data SentiWordNet = SentiWordNet
  { items :: [Entry]
  } deriving (Show, Eq)

parsePOS :: Parser POS
parsePOS =
  pure Noun <* char 'n' <|> pure Verb <* char 'v' <|> pure Adjective <* char 'a' <|>
  pure AdjectiveSatellite <* char 's' <|>
  pure Adverb <* char 'r'

parseInt :: Parser Int
parseInt = do
  s <- many alphaNum
  case readMay s of
    Nothing -> fail ("Failed to read an int: " ++ s)
    Just res -> return res

parseDecimal :: Parser Decimal
parseDecimal = do
  s <- many (alphaNum <|> char '.')
  let r = readMay s
  case r of
    Nothing -> fail ("Failed to read decimal: " ++ s)
    Just res -> return res

parseSynsetTerm :: Parser SynsetTerm
parseSynsetTerm = do
  name <- T.pack <$> many (noneOf ['#'])
  _ <- char '#'
  num <- parseInt
  return SynsetTerm {..}

parseEntry :: Parser Entry
parseEntry = do
  _ <- optional (many parseComment)
  pos <- parsePOS
  _ <- tab
  id_ <- T.pack <$> many (noneOf ['\t'])
  _ <- tab
  posScore <- parseDecimal
  _ <- tab
  negScore <- parseDecimal
  _ <- tab
  synsetTerms <- sepBy parseSynsetTerm (char ' ')
  _ <- tab
  gloss <- T.pack <$> many (noneOf ['\r', '\n'])
  _ <- optional newline
  return (Entry {..})

parseComment :: Parser String
parseComment = char '#' *> many (noneOf ['\n']) <* optional newline

parseSentiWordNet :: Parser SentiWordNet
parseSentiWordNet = SentiWordNet <$> some parseEntry

parse :: Text -> Either String SentiWordNet
parse t =
  case parseString parseSentiWordNet mempty . T.unpack $ t of
    Failure e -> Left (show e)
    Success r -> Right r

data SentiWordNetLookupItem = SentiWordNetLookupItem
  { lookPos :: Int
  , lookPosScore :: Decimal
  , lookNegScore :: Decimal
  }

-- | Datastructure for efficient 'lookupScoreByPosAndName' lookups
type SentiWordNetLookup = H.HashMap (POS, Text) [SentiWordNetLookupItem]

-- | Convert function
toSentiWordNetLookup :: SentiWordNet -> SentiWordNetLookup
toSentiWordNetLookup SentiWordNet {..} =
  H.fromListWith (<>) (concatMap convEntry items)
  where
    convEntry Entry {..} = map (convTerm pos posScore negScore) synsetTerms
    convTerm pos posScore negScore SynsetTerm {..} =
      ((pos, name), [SentiWordNetLookupItem num posScore negScore])

test :: IO ()
test = do
  sentiWordNet <-
    T.readFile
      "/home/kb/Downloads/SentiWordNet_3.0.0/SentiWordNet_3.0.0_20130122.txt"
  let res = parse sentiWordNet
  case res of
    Right r -> print (length (items r))
    Left e -> putStrLn e