module Data.CRF.Chain1.Dataset.Codec ( Codec , CodecM , encodeWord'Cu , encodeWord'Cn , encodeSent'Cu , encodeSent'Cn , encodeSent , encodeWordL'Cu , encodeWordL'Cn , encodeSentL'Cu , encodeSentL'Cn , encodeSentL , decodeLabel , decodeLabels , mkCodec , encodeData , encodeDataL ) where import Control.Applicative ((<$>), (<*>), pure) import Data.Maybe (catMaybes) import Data.Lens.Common (fstLens, sndLens) import qualified Data.Set as S import qualified Data.Map as M import qualified Data.Vector as V import qualified Control.Monad.Codec as C import Data.CRF.Chain1.Dataset.Internal import Data.CRF.Chain1.Dataset.External -- | A codec. The first component is used to encode observations -- of type a, the second one is used to encode labels of type b. type Codec a b = (C.AtomCodec a, C.AtomCodec b) -- | Type synonym for the codec monad. It is important to notice that by a -- codec we denote here a structure of two 'C.AtomCodec's while in the -- monad-codec package it denotes a monad. type CodecM a b c = C.Codec (Codec a b) c -- | Encode the labeled word and update the codec. encodeWordL'Cu :: (Ord a, Ord b) => WordL a b -> CodecM a b (X, Y) encodeWordL'Cu word = do x <- mkX . map Ob <$> mapM (C.encode' fstLens) (S.toList $ fst word) y <- mkY <$> sequence [ (,) <$> (Lb <$> C.encode sndLens lb) <*> pure pr | (lb, pr) <- (M.toList . unDist) (snd word) ] return (x, y) -- | Encodec the labeled word and do *not* update the codec. -- If the label is not in the codec, use the default value. encodeWordL'Cn :: (Ord a, Ord b) => Int -> WordL a b -> CodecM a b (X, Y) encodeWordL'Cn i word = do x <- mkX . map Ob . catMaybes <$> mapM (C.maybeEncode fstLens) (S.toList $ fst word) y <- mkY <$> sequence [ (,) <$> encodeL i lb <*> pure pr | (lb, pr) <- (M.toList . unDist) (snd word) ] return (x, y) where encodeL j y = Lb . maybe j id <$> C.maybeEncode sndLens y -- | Encode the word and update the codec. encodeWord'Cu :: Ord a => Word a -> CodecM a b X encodeWord'Cu word = mkX . map Ob <$> mapM (C.encode' fstLens) (S.toList word) -- | Encode the word and do *not* update the codec. encodeWord'Cn :: Ord a => Word a -> CodecM a b X encodeWord'Cn word = mkX . map Ob . catMaybes <$> mapM (C.maybeEncode fstLens) (S.toList word) -- | Encode the labeled sentence and update the codec. encodeSentL'Cu :: (Ord a, Ord b) => SentL a b -> CodecM a b (Xs, Ys) encodeSentL'Cu sent = do ps <- mapM encodeWordL'Cu sent return (V.fromList (map fst ps), V.fromList (map snd ps)) -- | Encode the labeled sentence and do *not* update the codec. -- Substitute the default label for any label not present in the codec. encodeSentL'Cn :: (Ord a, Ord b) => b -> SentL a b -> CodecM a b (Xs, Ys) encodeSentL'Cn def sent = do i <- C.maybeEncode sndLens def >>= \mi -> case mi of Just _i -> return _i Nothing -> error "encodeWordL'Cn: default label not in the codec" ps <- mapM (encodeWordL'Cn i) sent return (V.fromList (map fst ps), V.fromList (map snd ps)) -- | Encode the labeled sentence with the given codec. Substitute the -- default label for any label not present in the codec. encodeSentL :: (Ord a, Ord b) => b -> Codec a b -> SentL a b -> (Xs, Ys) encodeSentL def codec = C.evalCodec codec . encodeSentL'Cn def -- | Encode the sentence and update the codec. encodeSent'Cu :: Ord a => Sent a -> CodecM a b Xs encodeSent'Cu = fmap V.fromList . mapM encodeWord'Cu -- | Encode the sentence and do *not* update the codec. encodeSent'Cn :: Ord a => Sent a -> CodecM a b Xs encodeSent'Cn = fmap V.fromList . mapM encodeWord'Cn -- | Encode the sentence using the given codec. encodeSent :: Ord a => Codec a b -> Sent a -> Xs encodeSent codec = C.evalCodec codec . encodeSent'Cn -- | Create the codec on the basis of the labeled dataset, return the -- resultant codec and the encoded dataset. mkCodec :: (Ord a, Ord b) => [SentL a b] -> (Codec a b, [(Xs, Ys)]) mkCodec = let swap (x, y) = (y, x) in swap . C.runCodec (C.empty, C.empty) . mapM encodeSentL'Cu -- | Encode the labeled dataset using the codec. Substitute the default -- label for any label not present in the codec. encodeDataL :: (Ord a, Ord b) => b -> Codec a b -> [SentL a b] -> [(Xs, Ys)] encodeDataL def codec = C.evalCodec codec . mapM (encodeSentL'Cn def) -- | Encode the dataset with the codec. encodeData :: Ord a => Codec a b -> [Sent a] -> [Xs] encodeData codec = map (encodeSent codec) -- | Decode the label. decodeLabel :: Ord b => Codec a b -> Lb -> b decodeLabel codec x = C.evalCodec codec $ C.decode sndLens (unLb x) -- | Decode the sequence of labels. decodeLabels :: Ord b => Codec a b -> [Lb] -> [b] decodeLabels codec xs = C.evalCodec codec $ sequence [C.decode sndLens (unLb x) | x <- xs]