{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}

-- | The module provides functions for splitting positional tags.
-- They can be used together with the layered disambiguation model.

module NLP.Concraft.Disamb.Positional
( Tier (..)
, Part (..)
, select
, split
, tierConfDefault
) where

import Control.Applicative ((<$>), (<*>))
import Data.Binary (Binary, put, get)
import Data.Text.Binary ()
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Tagset.Positional as TP

-- | A tier description.
data Tier = Tier {
    -- | Does it include the part of speech?
      withPos   :: Bool
    -- | Tier grammatical attributes.
    , withAtts  :: S.Set TP.Attr }

instance Binary Tier where
    put Tier{..} = put withPos >> put withAtts
    get = Tier <$> get <*> get

-- | An atomic part of morphosyntactic tag with optional POS.
data Part = Part
    { pos   :: Maybe TP.POS
    , atts  :: M.Map TP.Attr T.Text }
    deriving (Show, Eq, Ord)

instance Binary Part where
    put Part{..} = put pos >> put atts
    get = Part <$> get <*> get

-- | Select tier attributes.
select :: Tier -> TP.Tag -> Part
select Tier{..} tag = Part
    { pos   = if withPos then Just (TP.pos tag) else Nothing
    , atts  = M.filterWithKey (\k _ -> k `S.member` withAtts) (TP.atts tag) }

-- | Split the positional tag.
split :: [Tier] -> TP.Tag -> [Part]
split tiers tag =
    [ select tier tag
    | tier <- tiers ]

-- | Default tiered tagging configuration.
tierConfDefault :: [Tier]
tierConfDefault =
    [tier1, tier2]
  where
    tier1 = Tier True $ S.fromList ["cas", "per"]
    tier2 = Tier False $ S.fromList
        [ "nmb", "gnd", "deg", "asp" , "ngt", "acm"
        , "acn", "ppr", "agg", "vlc", "dot" ]