{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE PatternSynonyms #-} {-# LANGUAGE ViewPatterns #-} module Numeric.SVM.LibLinear.PackFeatures ( FeatureValue, pattern FeatureValue , Samples(samplesMaxFeature) -- * Accumulating samples , addSample , ToSamplesM , runToSamplesM -- * Reading samples out , toList -- * Getting packed sample representation , withPackedSamples ) where import qualified Data.Vector as V import qualified Data.Vector.Fusion.Bundle as VF import qualified Data.Vector.Generic.Mutable as VGM import qualified Data.Vector.Generic as VG import qualified Data.Vector.Storable as VS import qualified Data.Vector.Storable.Mutable as VSM import qualified Data.Vector.Unboxed as VU import qualified Data.Vector.Unboxed.Mutable as VUM import Data.Label hiding (modify) import Data.Label.Monadic as LM import Control.Category import Control.Monad (when) import Prelude hiding ((.), id) import qualified Control.Foldl as Fold import Control.Monad.Trans import Control.Monad.State (StateT, runStateT) import Control.Monad.Reader (ReaderT, runReaderT, ask) import Control.Monad.State.Class () import Control.Monad.Primitive import qualified Debug.Trace import Foreign.Ptr import Foreign.ForeignPtr.Unsafe import Foreign.Marshal.Array import Foreign.Storable import Numeric.SVM.LibLinear.Internal import Numeric.SVM.LibLinear.Types type FeatureValue = C'feature_node pattern FeatureValue :: FeatureIdx -> Double -> FeatureValue pattern FeatureValue f x <- C'feature_node (FeatureIdx . fromIntegral -> f) (realToFrac -> x) where FeatureValue (FeatureIdx f) x = C'feature_node (fromIntegral f) (realToFrac x) data Samples = Samples { samplesChunks :: V.Vector Chunk , samplesMaxFeature :: FeatureIdx } data Chunk = Chunk { _chunkFeatures :: !(VS.Vector C'feature_node) , _chunkSamples :: !(VU.Vector (Label, Int)) } deriving (Show) data MChunk s = MChunk { _mchunkFeatures :: !(VSM.MVector s C'feature_node) , _mchunkSamples :: !(VUM.MVector s (Label, Int)) , _mchunkSampleOffset :: !Int , _mchunkFeatureOffset :: !Int } data Accum s = Accum { _accumChunks :: [Chunk] , _accumActive :: !(MChunk s) , _accumNSamples :: !Int , _accumMaxFeature :: !FeatureIdx } mkLabels [''Chunk, ''MChunk, ''Accum] data Params = Params { samplesPerChunk :: Int , expectedFeaturesPerSample :: Int } newtype ToSamplesM m a = TSM (ReaderT Params (StateT (Accum (PrimState m)) m) a) deriving (Functor, Applicative, Monad) emptyMChunk :: (PrimMonad m) => Int -> Int -> m (MChunk (PrimState m)) emptyMChunk nFeatures nSamples = do _mchunkFeatures <- VGM.new nFeatures _mchunkSamples <- VGM.new nSamples let _mchunkSampleOffset = 0 _mchunkFeatureOffset = 0 return $ MChunk {..} startNewChunk :: forall m. (PrimMonad m) => Int -- ^ number of features to make room for -> Int -- ^ number of samples to make room for -> ToSamplesM m () startNewChunk nFeatures nSamples = TSM $ do chunk <- lift . lift . freezeChunk =<< gets accumActive LM.modify accumChunks (chunk:) newChunk <- emptyMChunk nFeatures nSamples --trace "new chunk" accumActive =: newChunk where freezeChunk :: MChunk (PrimState m) -> m Chunk freezeChunk (MChunk {..}) = do _chunkFeatures <- VG.take _mchunkFeatureOffset <$> VG.freeze _mchunkFeatures _chunkSamples <- VG.take _mchunkSampleOffset <$> VG.freeze _mchunkSamples pure $ Chunk {..} -- | How many features do we have room for in the current chunk? roomInActiveChunk :: forall m. (PrimMonad m) => ToSamplesM m Int roomInActiveChunk = TSM (roomInChunk <$> gets accumActive) where roomInChunk :: MChunk (PrimState m) -> Int roomInChunk (MChunk {..}) = let featuresRoom = VGM.length _mchunkFeatures - _mchunkFeatureOffset - 1 samplesRoom = VGM.length _mchunkSamples - _mchunkSampleOffset in min featuresRoom samplesRoom trace :: PrimMonad m => String -> m () trace msg = primitive_ $ \s -> case Debug.Trace.trace msg () of () -> s addSample :: forall m. (PrimMonad m) => Label -> [(FeatureIdx, Double)] -> ToSamplesM m () addSample lbl [] = return () addSample lbl feats = do room <- roomInActiveChunk Params {..} <- TSM ask --TSM $ trace $ show (room, nFeats) when (nFeats > room) $ do let newChunkFeatures = maximum [nFeats+1, samplesPerChunk * expectedFeaturesPerSample] startNewChunk newChunkFeatures samplesPerChunk startSample mapM_ writeFeature feats -- write sentinel writeFeature (FeatureIdx (-1), 0) TSM $ modify accumMaxFeature (max maxFeatIdx) where (nFeats, Just maxFeatIdx) = Fold.fold ((,) <$> Fold.length <*> Fold.premap fst Fold.maximum) feats startSample :: ToSamplesM m () startSample = TSM $ do LM.modify accumNSamples succ MChunk {..} <- gets accumActive VGM.write _mchunkSamples _mchunkSampleOffset (lbl, _mchunkFeatureOffset) LM.modify (mchunkSampleOffset . accumActive) succ writeFeature :: (FeatureIdx, Double) -> ToSamplesM m () writeFeature (featIdx, val) = TSM $ do let feat = FeatureValue featIdx val MChunk {..} <- gets accumActive VGM.write _mchunkFeatures _mchunkFeatureOffset feat LM.modify (mchunkFeatureOffset . accumActive) succ runToSamplesM :: (PrimMonad m) => Int -> Int -> ToSamplesM m a -> m (a, Samples) runToSamplesM samplesPerChunk expectedFeaturesPerSample action = do chunk0 <- emptyMChunk expectedFeaturesPerSample 10 let acc0 = Accum { _accumChunks = [] , _accumActive = chunk0 , _accumNSamples = 0 , _accumMaxFeature = FeatureIdx 0 } let params = Params {..} let TSM action' = do r <- action offset <- TSM $ gets $ mchunkSampleOffset . accumActive -- Ensure we flush the last chunk when (offset > 0) $ startNewChunk 1 1 return r (r, acc) <- runStateT (runReaderT action' params) acc0 let chunks = V.reverse $ V.fromList (get accumChunks acc) return (r, Samples chunks (_accumMaxFeature acc)) toList :: Samples -> [(Label, [(FeatureIdx, Double)])] toList = foldMap readChunk . samplesChunks where readChunk :: Chunk -> [(Label, [(FeatureIdx, Double)])] readChunk (Chunk {..}) = map readSamples $ VG.toList _chunkSamples where readSamples :: (Label, Int) -> (Label, [(FeatureIdx, Double)]) readSamples (lbl, offset) = ( lbl , takeWhile ((/= FeatureIdx (-1)) . fst) $ map toPair $ VG.toList $ VG.drop offset _chunkFeatures ) toPair :: C'feature_node -> (FeatureIdx, Double) toPair (FeatureValue x y) = (x, y) -- | Produces a set of packed arrays (and the number of samples represented) -- in a form acceptable to @liblinear@. withPackedSamples :: forall a. () => (Int -> Ptr Label -> Ptr (Ptr C'feature_node) -> IO a) -> Samples -> IO a withPackedSamples action (Samples chunks _) = do let packedLabels = VG.convert $ VG.map fst $ VG.concatMap (VG.convert . _chunkSamples) chunks let packChunk :: Chunk -> VS.Vector (Ptr C'feature_node) packChunk (Chunk {..}) = VG.unstream $ VF.reVector $ VF.map (featureOff . snd) $ VG.stream _chunkSamples where featureOff :: Int -> Ptr C'feature_node featureOff offset = let (baseF, _) = VS.unsafeToForeignPtr0 _chunkFeatures base = unsafeForeignPtrToPtr baseF in base `advancePtr` offset packedFeatures :: VS.Vector (Ptr C'feature_node) packedFeatures = VG.unstream $ VF.concatVectors $ VF.map packChunk $ VG.stream chunks let docCount = VG.length packedLabels r <- VS.unsafeWith packedLabels $ \ptrLabels -> VS.unsafeWith packedFeatures $ \ptrFeatures -> action docCount ptrLabels ptrFeatures -- Ensure that feature vectors aren't GC'd touch chunks return r