{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE PatternGuards #-}

module Data.Tagset.Positional
( Tagset (..)
, Attr
, POS
, Optional
, domain
, rule

, Tag (..)
, expand
, tagSim
) where

import Control.Arrow (first)
import qualified Data.Text as T
import qualified Data.Map as M
import qualified Data.Set as S

-- | Attribute name.
type Attr = T.Text

-- | Part of speech.
type POS = T.Text

-- | Is the attribute optional?
type Optional = Bool

-- | The tagset consists of a domain for each attribute name and of a
-- parsing rule for each part of speech.
data Tagset = Tagset
    { domains   :: M.Map Attr (S.Set T.Text)
    , rules     :: M.Map POS  [(Attr, Optional)]
    } deriving (Show)

-- | Set of potential values for the given attribute.
domain :: Tagset -> Attr -> S.Set T.Text
domain Tagset{..} x =
  case x `M.lookup` domains of
    Just y  -> y
    Nothing -> error $ "domain: unknown attribute " ++ T.unpack x

-- | Parsing rule for the given POS.
rule :: Tagset -> POS -> [(Attr, Optional)]
rule Tagset{..} x =
  case x `M.lookup` rules of
    Just y  -> y
    Nothing -> error $ "rule: unknown POS " ++ T.unpack x

-- | The morphosyntactic tag consists of the POS value and corresponding
-- attribute values.
data Tag = Tag
    { pos   :: POS
    , atts  :: M.Map Attr T.Text
    } deriving (Show, Read, Eq, Ord)

-- | Expand optional attributes of the tag.
expand :: Tagset -> Tag -> [Tag]
expand tagset tag = do
    values <- sequence (map attrVal rl)
    let attrMap = M.fromList $ zip (map fst rl) values
    return $ Tag (pos tag) attrMap
  where
    rl = rule tagset (pos tag)
    attrVal (attr, False) = [atts tag M.! attr]
    attrVal (attr, True)
        | Just x <- M.lookup attr (atts tag) = [x]
        | otherwise = S.toList $ domain tagset attr

-- | Measure of similarity between two tags.
tagSim :: Tag -> Tag -> Int
tagSim t t' =
    S.size (xs `S.intersection` xs')
  where
    xs  = S.fromList $ (Nothing, pos t)  : assocs t
    xs' = S.fromList $ (Nothing, pos t') : assocs t'
    assocs = map (first Just) . M.assocs . atts