module Data.CRF.Chain2.Tiers.Dataset.External
( Word (obs, lbs)
, mkWord
, Sent
, Prob (unProb)
, mkProb
, WordL
, SentL
) where
import Prelude hiding (Word)

import qualified Data.Map as M
import qualified Data.Set as S
-- | A single word: a set of values of type @a@ together with a set of
-- potential labels, each label being a list of @b@ values.
--
-- The module's export list exposes the field accessors but not the data
-- constructor, so code outside this module builds values with 'mkWord'.
data Word a b = Word
    { obs :: S.Set a
      -- ^ The set of @a@ values attached to the word (presumably its
      -- observations, going by the field name).
    , lbs :: S.Set [b]
      -- ^ The set of potential labels; 'mkWord' refuses an empty set here.
    }
    deriving (Eq, Ord, Show)
-- | Smart constructor for 'Word'.
--
-- The first argument becomes the 'obs' field and the second the 'lbs' field.
-- Calls 'error' when the set of potential labels is empty.
mkWord :: S.Set a -> S.Set [b] -> Word a b
mkWord observations labels =
    if S.null labels
        then error "mkWord: empty set of potential labels"
        else Word observations labels
-- | A sentence: a list of 'Word's.
type Sent a b = [Word a b]
-- | A mapping from elements of type @a@ to 'Double' weights, intended to
-- represent a probability distribution.
--
-- The module's export list exposes 'unProb' but not the constructor, so
-- code outside this module builds values with 'mkProb', which normalizes
-- the weights.
newtype Prob a = Prob
    { unProb :: M.Map a Double
      -- ^ Unwrap the underlying map.
    }
    deriving (Eq, Ord, Show)
-- | Construct a probability distribution from a list of weighted elements.
--
-- Pairs whose weight is not strictly positive are dropped (this also drops
-- NaN weights, since @NaN > 0@ is 'False'), weights of repeated elements
-- are added together, and the remaining weights are divided by their sum.
--
-- Calls 'error' when no element has a positive weight, and when the total
-- weight is infinite (an infinite input weight, or a sum that overflows):
-- dividing by an infinite total would silently yield NaN or all-zero
-- values instead of a distribution.
mkProb :: Ord a => [(a, Double)] -> Prob a
mkProb =
    Prob . normalize . M.fromListWith (+) . filter ((> 0) . snd)
  where
    normalize dist
        -- Checked first so that the total is never demanded for an empty map.
        | M.null dist =
            error "mkProb: no elements with positive probability"
        -- All remaining weights are positive, so the total can be infinite
        -- but never NaN; one test covers every non-finite case.
        | isInfinite z =
            error "mkProb: total weight is not finite"
        | otherwise = fmap (/ z) dist
      where
        z = sum (M.elems dist)
-- | A 'Word' paired with a 'Prob' over label lists of type @[b]@.
type WordL a b = (Word a b, Prob [b])
-- | A sentence of labeled words: a list of 'WordL' pairs.
type SentL a b = [WordL a b]