{-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE TypeFamilies #-} -- | Internal core data types. module Data.CRF.Chain2.Tiers.Core ( -- * Basic types Ob (..) , mkOb, unOb , Lb (..) , mkLb, unLb , FeatIx (..) , mkFeatIx, unFeatIx , CbIx -- * Complex label , Cb (..) , mkCb , unCb -- * Input element (word) , X (_unX, _unR) , mkX , unX , unR -- ** Indexing , lbAt -- * Output element (choice) , Y (_unY) , mkY , unY -- * Feature , Feat (..) -- ** Feature generation , obFeats , trFeats1 , trFeats2 , trFeats3 ) where import Control.Applicative ((<$>), (<*>)) import Control.Arrow (second) import Data.Binary (Binary, put, get, putWord8, getWord8) import Data.Ix (Ix) import Data.Int (Int16, Int32) import Data.List (zip4) -- import qualified Data.Array.Unboxed as A import qualified Data.Vector as V import qualified Data.Vector.Unboxed as U import Data.Vector.Unboxed.Deriving -- import qualified Data.Vector.Generic.Base as G -- import qualified Data.Vector.Generic.Mutable as G import qualified Data.Number.LogFloat as L -- import qualified Data.Primitive.ByteArray as BA import Data.CRF.Chain2.Tiers.Array (Bounds) ---------------------------------------------------------------- -- Basic types ---------------------------------------------------------------- -- | An observation. newtype Ob = Ob { _unOb :: Int32 } deriving (Show, Eq, Ord, Binary) -- GeneralizedNewtypeDeriving doesn't work for this in 7.8.2: -- , G.Vector U.Vector, G.MVector U.MVector, U.Unbox ) derivingUnbox "Ob" [t| Ob -> Int32 |] [| _unOb |] [| Ob |] -- | Smart observation constructor. mkOb :: Int -> Ob mkOb = Ob . fromIntegral {-# INLINE mkOb #-} -- | Deconstract observation. unOb :: Ob -> Int unOb = fromIntegral . _unOb {-# INLINE unOb #-} -- | An atomic label. newtype Lb = Lb { _unLb :: Int16 } deriving (Show, Eq, Ord, Binary , Num, Ix, Bounds) derivingUnbox "Lb" [t| Lb -> Int16 |] [| _unLb |] [| Lb |] -- | Smart label constructor. mkLb :: Int -> Lb mkLb = Lb . fromIntegral {-# INLINE mkLb #-} -- | Deconstruct label. unLb :: Lb -> Int unLb = fromIntegral . _unLb {-# INLINE unLb #-} -- | An index of the label. type CbIx = Int -- | A feature index. To every model feature a unique index is assigned. newtype FeatIx = FeatIx { _unFeatIx :: Int32 } deriving (Show, Eq, Ord, Binary) derivingUnbox "FeatIx" [t| FeatIx -> Int32 |] [| _unFeatIx |] [| FeatIx |] -- | Smart feature index constructor. mkFeatIx :: Int -> FeatIx mkFeatIx = FeatIx . fromIntegral {-# INLINE mkFeatIx #-} -- | Deconstract feature index. unFeatIx :: FeatIx -> Int unFeatIx = fromIntegral . _unFeatIx {-# INLINE unFeatIx #-} ---------------------------------------------------------------- -- Complex label ---------------------------------------------------------------- -- TODO: Do we gain anything by representing the -- complex label with a byte array? Complex labels -- should not be directly stored in a model, so if -- there is something to gain here, its not obvious. -- -- Perhaps a list representation would be sufficient? -- -- | A complex label is an array of atomic labels. -- newtype Cb = Cb { unCb :: BA.ByteArray } -- | A complex label is a vector of atomic labels. newtype Cb = Cb { _unCb :: U.Vector Lb } deriving (Show, Eq, Ord, Binary) -- | Smart complex label constructor. mkCb :: [Lb] -> Cb mkCb = Cb . U.fromList -- | Deconstract complex label. unCb :: Cb -> [Lb] unCb = U.toList . _unCb ---------------------------------------------------------------- -- Internal dataset representation ---------------------------------------------------------------- -- | A word is represented by a list of its observations -- and a list of its potential label interpretations. data X = X { -- | A set of observations. _unX :: U.Vector Ob -- | A vector of potential labels. , _unR :: V.Vector Cb } deriving (Show, Eq, Ord) instance Binary X where put X{..} = put _unX >> put _unR get = X <$> get <*> get -- | Smart `X` constructor. mkX :: [Ob] -> [Cb] -> X mkX x r = X (U.fromList x) (V.fromList r) {-# INLINE mkX #-} -- | List of observations. unX :: X -> [Ob] unX = U.toList . _unX {-# INLINE unX #-} -- | List of potential labels. unR :: X -> [Cb] unR = V.toList . _unR {-# INLINE unR #-} -- | Potential label at the given position. lbAt :: X -> CbIx -> Cb lbAt x = (_unR x V.!) {-# INLINE lbAt #-} -- | Vector of chosen labels together with corresponding probabilities in log -- domain. newtype Y = Y { _unY :: V.Vector (Cb, Double) } deriving (Show, Eq, Ord, Binary) -- | Y constructor. mkY :: [(Cb, Double)] -> Y mkY = Y . V.fromList . map (second log) {-# INLINE mkY #-} -- | Y deconstructor symetric to mkY. unY :: Y -> [(Cb, L.LogFloat)] unY = map (second L.logToLogFloat) . V.toList . _unY {-# INLINE unY #-} ---------------------------------------------------------------- -- Feature ---------------------------------------------------------------- -- | Feature; every feature is associated to a layer with `ln` identifier. data Feat -- | Second-order transition feature. = TFeat3 { x1 :: {-# UNPACK #-} !Lb , x2 :: {-# UNPACK #-} !Lb , x3 :: {-# UNPACK #-} !Lb , ln :: {-# UNPACK #-} !Int } -- | First-order transition feature. | TFeat2 { x1 :: {-# UNPACK #-} !Lb , x2 :: {-# UNPACK #-} !Lb , ln :: {-# UNPACK #-} !Int } -- | Zero-order transition feature. | TFeat1 { x1 :: {-# UNPACK #-} !Lb , ln :: {-# UNPACK #-} !Int } -- | Observation feature. | OFeat { ob :: {-# UNPACK #-} !Ob , x1 :: {-# UNPACK #-} !Lb , ln :: {-# UNPACK #-} !Int } deriving (Show, Eq, Ord) instance Binary Feat where put (OFeat o x k) = putWord8 0 >> put o >> put x >> put k put (TFeat3 x y z k) = putWord8 1 >> put x >> put y >> put z >> put k put (TFeat2 x y k) = putWord8 2 >> put x >> put y >> put k put (TFeat1 x k) = putWord8 3 >> put x >> put k get = getWord8 >>= \i -> case i of 0 -> OFeat <$> get <*> get <*> get 1 -> TFeat3 <$> get <*> get <*> get <*> get 2 -> TFeat2 <$> get <*> get <*> get 3 -> TFeat1 <$> get <*> get _ -> error "get feature: unknown code" ---------------------------------------------------- -- Features generation ---------------------------------------------------- -- | Generate observation features. obFeats :: Ob -> Cb -> [Feat] obFeats ob' xs = [ OFeat ob' x k | (x, k) <- zip (unCb xs) [0..] ] -- | Generate zero-order transition features. trFeats1 :: Cb -> [Feat] trFeats1 xs = [ TFeat1 x k | (x, k) <- zip (unCb xs) [0..] ] -- | Generate first-order transition features. trFeats2 :: Cb -> Cb -> [Feat] trFeats2 xs1 xs2 = [ TFeat2 x1' x2' k | (x1', x2', k) <- zip3 (unCb xs1) (unCb xs2) [0..] ] -- | Generate second-order transition features. trFeats3 :: Cb -> Cb -> Cb -> [Feat] trFeats3 xs1 xs2 xs3 = [ TFeat3 x1' x2' x3' k | (x1', x2', x3', k) <- zip4 (unCb xs1) (unCb xs2) (unCb xs3) [0..] ]