-- | Types for representing words and sentences, both plain and labeled,
-- together with a probability distribution ('Dist') used to describe the
-- labels assigned to a word.
module Data.CRF.Chain1.Dataset.External
( Word
, Sent
, Dist (unDist)
, mkDist
, WordL
, annotate
, SentL
) where

-- 'Word' is also exported by the Prelude (base >= 4.8); hide it so that
-- the 'Word' alias defined in this module is unambiguous.
import Prelude hiding (Word)

import qualified Data.Map as M
import qualified Data.Set as S

-- | A word, represented by the set of its observations of type @a@.
-- Not to be confused with the machine-word type of the same name
-- exported by the Prelude.
type Word a = S.Set a

-- | A sentence: a list of words ('Word'), each of which is a set of
-- observations of type @a@.
type Sent a = [Word a]

-- | A probability distribution defined over elements of type @a@.
-- All elements not included in the map have probability equal
-- to 0.
--
-- Use 'mkDist' to build a distribution from raw weights; it rescales
-- the weights so that they sum to 1.
newtype Dist a = Dist { unDist :: M.Map a Double }

-- | Construct the probability distribution from a list of
-- @(element, weight)@ pairs.
--
-- Weights of repeated elements are added together, and every weight is
-- then divided by the total weight so that the result sums to 1.
--
-- When the total weight is exactly 0 (for instance when every weight is
-- 0) the weights are left unscaled, because dividing by zero would fill
-- the distribution with @NaN@ or infinite values.  An empty list yields
-- an empty distribution.
mkDist :: Ord a => [(a, Double)] -> Dist a
mkDist =
    Dist . normalize . M.fromListWith (+)
  where
    normalize dist
        -- Nothing sensible to rescale by; avoid 0/0 = NaN and x/0 = Infinity.
        | z == 0    = dist
        | otherwise = fmap (/ z) dist
      where
        -- Total (unnormalized) weight of all elements.
        z = sum (M.elems dist)

-- | A labeled word: a word (observations of type @a@) paired with a
-- probability distribution ('Dist') over its labels of type @b@.
type WordL a b = (Word a, Dist b)

-- | Label a word with a single label.  The resulting distribution
-- assigns probability 1 to the given label.
annotate :: Word a -> b -> WordL a b
annotate word label =
    (word, certain)
  where
    -- All of the probability mass sits on the one supplied label.
    certain = Dist (M.singleton label 1)

-- | A labeled sentence: a list of labeled words ('WordL'), with
-- observations of type @a@ and labels of type @b@.
type SentL a b = [WordL a b]