module Data.CRF.Chain1.Constrained.Dataset.External ( Word (..) , unknown , Sent , Prob (unProb) , mkProb , WordL (word, choice) , mkWordL , SentL ) where import Prelude hiding (Word) import qualified Data.Set as S import qualified Data.Map as M -- | A Word is represented by a set of observations -- and a set of potential interpretation labels. -- When the set of potential labels is empty the word -- is considered to be unknown and the default potential -- set is used in its place. data Word a b = Word { obs :: S.Set a -- ^ The set of observations , lbs :: S.Set b -- ^ The set of potential interpretations. } deriving (Show, Eq, Ord) -- | The word is considered to be unknown when the set of potential -- labels is empty. unknown :: Word a b -> Bool unknown x = S.size (lbs x) == 0 {-# INLINE unknown #-} -- | A sentence of words. type Sent a b = [Word a b] -- | A probability distribution defined over elements of type a. -- All elements not included in the map have probability equal -- to 0. newtype Prob a = Prob { unProb :: M.Map a Double } deriving (Show, Eq, Ord) -- -- | Construct the probability distribution. -- mkProb :: Ord a => [(a, Double)] -> Prob a -- mkProb = -- Prob . normalize . M.fromListWith (+) . filter ((>0).snd) -- where -- normalize dist -- | M.null dist = -- error "mkProb: no elements with positive probability" -- | otherwise = -- let z = sum (M.elems dist) -- in fmap (/z) dist -- | Construct the probability distribution. -- -- Normalization is not performed because, when working with DAGs, the -- probability of a specific DAG edge can be lower than 1 (in particular, it can -- be 0). -- -- Elements with probability 0 cab be filtered out since information that a -- given label is a potential interpretation of the given word/edge is preserved -- at the level of the `Word` mkProb :: Ord a => [(a, Double)] -> Prob a mkProb = Prob . M.fromListWith (+) . filter ((>0).snd) -- | A WordL is a labeled word, i.e. a word with probability distribution -- defined over labels. We assume that every label from the distribution -- domain is a member of the set of potential labels corresponding to the -- word. Use the `mkWordL` smart constructor to build `WordL`. data WordL a b = WordL { word :: Word a b , choice :: Prob b } -- | Ensure, that every label from the distribution domain is a member of the -- set of potential labels corresponding to the word. mkWordL :: (Ord b) => Word a b -> Prob b -> WordL a b mkWordL wd cs -- | S.null chosen && S.null (lbs wd) = -- error "mkWordL: no labels assigned to the word" -- <- the above condition can actualy happen it the model does not -- find any probable label for a given word. | S.null (lbs wd) = WordL wd cs | chosen `S.isSubsetOf` lbs wd = WordL wd cs | otherwise = error "mkWordL: chosen labels outside of `lbs`" where chosen = M.keysSet (unProb cs) -- | A sentence of labeled words. type SentL a b = [WordL a b]