-- | External data representation.
module Data.CRF.Chain2.Tiers.Dataset.External
( Word (obs, lbs)
, mkWord
, Sent
, Prob (unProb)
, mkProb
, WordL
, SentL
) where


import           Prelude hiding (Word)
import qualified Data.Set as S
-- The strict API shares the 'M.Map' type with "Data.Map"; it is used so that
-- the sums computed in 'mkProb' are evaluated eagerly.
import qualified Data.Map.Strict as M


-- | A word consists of a set of observations and a set of potential labels.
--
-- The data constructor is not exported; use 'mkWord' to build values.
data Word a b = Word {
    -- | Set of observations.
      obs :: S.Set a
    -- | Non-empty set of potential labels.
    , lbs :: S.Set [b] }
    deriving (Show, Eq, Ord)


-- | A word constructor which checks non-emptiness of the potential
-- set of labels.
--
-- Calls 'error' when the set of potential labels is empty.
mkWord :: S.Set a -> S.Set [b] -> Word a b
mkWord _obs _lbs
    | S.null _lbs   = error "mkWord: empty set of potential labels"
    | otherwise     = Word _obs _lbs


-- | A sentence of words.
type Sent a b = [Word a b]


-- | A probability distribution defined over elements of type a.
-- All elements not included in the map have probability equal
-- to 0.
newtype Prob a = Prob { unProb :: M.Map a Double }
    deriving (Show, Eq, Ord)


-- Previous, normalizing version of 'mkProb', kept for reference:
--
-- -- | Construct the probability distribution.
-- mkProb :: Ord a => [(a, Double)] -> Prob a
-- mkProb =
--     Prob . normalize . M.fromListWith (+) . filter ((>0).snd)
--   where
--     normalize dist
--         | M.null dist =
--             error "mkProb: no elements with positive probability"
--         | otherwise   =
--             let z = sum (M.elems dist)
--             in  fmap (/z) dist


-- | Construct the probability distribution.
--
-- Normalization is not performed because, when working with DAGs, the
-- probability of a specific DAG edge can be lower than 1 (in particular, it can
-- be 0).
--
-- Elements with probability 0 can be filtered out since information that a
-- given label is a potential interpretation of the given word/edge is preserved
-- at the level of the `Word`.  More precisely, every pair whose value is not
-- strictly positive is dropped, so negative values are discarded as well.
--
-- The values of the remaining pairs which share the same key are summed.
mkProb :: Ord a => [(a, Double)] -> Prob a
mkProb = Prob . M.fromListWith (+) . filter ((>0).snd)


-- | A WordL is a labeled word, i.e. a word with probability distribution
-- defined over labels.  We assume that every label from the distribution
-- domain is a member of the set of potential labels corresponding to the
-- word.  TODO: Ensure the assumption using the smart constructor.
type WordL a b = (Word a b, Prob [b])


-- | A sentence of labeled words.
type SentL a b = [WordL a b]