Safe Haskell | None |
---|---|
Language | Haskell2010 |
Random projection trees for approximate nearest neighbor search in high-dimensional vector spaces
Synopsis
- tree :: (Monad m, Inner SVector v) => Word64 -> Int -> Int -> Int -> Double -> Int -> ConduitT () (v Double) m () -> m (RPTree Double (Vector (v Double)))
- forest :: (Monad m, Inner SVector v) => Word64 -> Int -> Int -> Int -> Int -> Double -> Int -> ConduitT () (v Double) m () -> m (RPForest Double (Vector (v Double)))
- knn :: (Ord p, Inner SVector v, Unbox d, Real d) => (v2 -> v d -> p) -> Int -> RPForest d (Vector v2) -> v d -> Vector (p, v2)
- serialiseRPForest :: (Serialise d, Serialise a, Unbox d) => RPForest d a -> [ByteString]
- deserialiseRPForest :: (Serialise d, Serialise a, Unbox d) => [ByteString] -> Either String (RPForest d a)
- recallWith :: (Inner SVector v, Unbox a, Fractional a, Ord a, Ord d, Ord p) => (p -> v a -> d) -> RPForest a (Vector p) -> Int -> v a -> a
- levels :: RPTree d a -> Int
- points :: Monoid m => RPTree d m -> m
- leaves :: RPTree d a -> [a]
- candidates :: (Inner SVector v, Unbox d, Ord d, Num d, Semigroup xs) => RPTree d xs -> v d -> xs
- data RPTree d a
- type RPForest d a = IntMap (RPTree d a)
- data SVector a
- fromListSv :: Unbox a => Int -> [(Int, a)] -> SVector a
- fromVectorSv :: Int -> Vector (Int, a) -> SVector a
- data DVector a
- fromListDv :: Unbox a => [a] -> DVector a
- fromVectorDv :: Vector a -> DVector a
- class (Scale u, Scale v) => Inner u v where
- class Scale v where
- innerSS :: (Vector u (Int, a), Vector v (Int, a), Unbox a, Num a) => u (Int, a) -> v (Int, a) -> a
- innerSD :: (Num a, Vector u (Int, a), Vector v a, Unbox a) => u (Int, a) -> v a -> a
- innerDD :: (Vector v a, Num a) => v a -> v a -> a
- metricSSL2 :: (Floating a, Vector u a, Unbox a, Vector u (Int, a), Vector v (Int, a)) => u (Int, a) -> v (Int, a) -> a
- metricSDL2 :: (Floating a, Vector v1 a, Unbox a, Vector v1 (Int, a), Vector v2 a) => v1 (Int, a) -> v2 a -> a
- scaleS :: (Vector v (a, b), Num b) => b -> v (a, b) -> v (a, b)
- scaleD :: (Vector v b, Num b) => b -> v b -> v b
- dataSource :: Monad m => Int -> GenT m a -> ConduitT i a (GenT m) ()
- sparse :: (Monad m, Unbox a) => Double -> Int -> GenT m a -> GenT m (SVector a)
- dense :: (Monad m, Vector Vector a) => Int -> GenT m a -> GenT m (DVector a)
- normal2 :: Monad m => GenT m (DVector Double)
- draw :: (Show a, Boxed a, PrintfArg v) => RPTree v a -> IO ()
- writeCsv :: (Show a, Show b, Unbox a) => FilePath -> [(DVector a, b)] -> IO ()
- randSeed :: MonadIO m => m Word64
- data BenchConfig = BenchConfig {
- bcDescription :: String
- bcMaxTreeDepth :: Int
- bcMinLeafSize :: Int
- bcNumTrees :: Int
- bcChunkSize :: Int
- bcNZDensity :: Double
- bcVectorDim :: Int
- bcDataSize :: Int
- bcNumQueryPoints :: Int
- normalSparse2 :: Monad m => Double -> Int -> GenT m (SVector Double)
- liftC :: (Monad m, MonadTrans t) => ConduitT i o m r -> ConduitT i o (t m) r
Construction
:: (Monad m, Inner SVector v) | |
=> Word64 | random seed |
-> Int | max tree depth |
-> Int | min leaf size |
-> Int | data chunk size |
-> Double | nonzero density of projection vectors |
-> Int | dimension of projection vectors |
-> ConduitT () (v Double) m () | data source |
-> m (RPTree Double (Vector (v Double))) |
Populate a tree from a data stream
Assumptions on the data source:
- non-empty : contains at least one value
- stationary : each chunk is representative of the whole dataset
- bounded : we wait until the end of the stream to produce a result
Throws EmptyResult
if the conduit is empty
:: (Monad m, Inner SVector v) | |
=> Word64 | random seed |
-> Int | max tree depth |
-> Int | min leaf size |
-> Int | number of trees |
-> Int | data chunk size |
-> Double | nonzero density of projection vectors |
-> Int | dimension of projection vectors |
-> ConduitT () (v Double) m () | data source |
-> m (RPForest Double (Vector (v Double))) |
Populate a forest from a data stream
Assumptions on the data source:
- non-empty : contains at least one value
- stationary : each chunk is representative of the whole dataset
- bounded : we wait until the end of the stream to produce a result
Throws EmptyResult
if the conduit is empty
Query
:: (Ord p, Inner SVector v, Unbox d, Real d) | |
=> (v2 -> v d -> p) | distance function |
-> Int | k neighbors |
-> RPForest d (Vector v2) | random projection forest |
-> v d | query point |
-> Vector (p, v2) | ordered in increasing distance order |
k nearest neighbors
I/O
:: (Serialise d, Serialise a, Unbox d) | |
=> RPForest d a | |
-> [ByteString] | the order is undefined |
Serialise each tree in the RPForest
as a separate bytestring
:: (Serialise d, Serialise a, Unbox d) | |
=> [ByteString] | |
-> Either String (RPForest d a) | the order is undefined |
Deserialise each tree in the RPForest
from a separate bytestring and reconstruct
Validation
:: (Inner SVector v, Unbox a, Fractional a, Ord a, Ord d, Ord p) | |
=> (p -> v a -> d) | |
-> RPForest a (Vector p) | |
-> Int | k : number of nearest neighbors to consider |
-> v a | query point |
-> a |
average recall-at-k, computed over a set of trees
Access
Retrieve points nearest to the query
in case of a narrow margin, collect both branches of the tree
Types
RPTree
Random projection trees
The first type parameter corresponds to a floating point scalar value, the second is the type of the data collected at the leaves of the tree (e.g. lists of vectors)
We keep them separate to leverage the Functor instance for postprocessing and visualization
One projection vector per tree level (as suggested in https://www.cs.helsinki.fi/u/ttonteri/pub/bigdata2016.pdf )
Instances
Functor (RPTree d) Source # | |
Foldable (RPTree d) Source # | |
Defined in Data.RPTree.Internal fold :: Monoid m => RPTree d m -> m # foldMap :: Monoid m => (a -> m) -> RPTree d a -> m # foldMap' :: Monoid m => (a -> m) -> RPTree d a -> m # foldr :: (a -> b -> b) -> b -> RPTree d a -> b # foldr' :: (a -> b -> b) -> b -> RPTree d a -> b # foldl :: (b -> a -> b) -> b -> RPTree d a -> b # foldl' :: (b -> a -> b) -> b -> RPTree d a -> b # foldr1 :: (a -> a -> a) -> RPTree d a -> a # foldl1 :: (a -> a -> a) -> RPTree d a -> a # elem :: Eq a => a -> RPTree d a -> Bool # maximum :: Ord a => RPTree d a -> a # minimum :: Ord a => RPTree d a -> a # | |
Traversable (RPTree d) Source # | |
(Unbox d, Eq d, Eq a) => Eq (RPTree d a) Source # | |
(Unbox d, Show d, Show a) => Show (RPTree d a) Source # | |
Generic (RPTree d a) Source # | |
(NFData a, NFData d) => NFData (RPTree d a) Source # | |
Defined in Data.RPTree.Internal | |
(Serialise d, Serialise a, Unbox d) => Serialise (RPTree d a) Source # | |
type Rep (RPTree d a) Source # | |
Defined in Data.RPTree.Internal |
type RPForest d a = IntMap (RPTree d a) Source #
A random projection forest is an ordered set of RPTree
s
This supports efficient updates of the ensemble in the streaming/online setting.
Sparse vectors with unboxed components
Instances
Scale SVector Source # | |
Inner SVector Vector Source # | |
Defined in Data.RPTree.Internal | |
Inner SVector DVector Source # | |
Defined in Data.RPTree.Internal | |
Inner SVector SVector Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Eq a) => Eq (SVector a) Source # | |
(Unbox a, Ord a) => Ord (SVector a) Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Show a) => Show (SVector a) Source # | |
Generic (SVector a) Source # | |
NFData (SVector a) Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Serialise a) => Serialise (SVector a) Source # | |
type Rep (SVector a) Source # | |
Defined in Data.RPTree.Internal type Rep (SVector a) = D1 ('MetaData "SVector" "Data.RPTree.Internal" "rp-tree-0.2.1.0-5AqidgumDDrLxZlp3YUXya" 'False) (C1 ('MetaCons "SV" 'PrefixI 'True) (S1 ('MetaSel ('Just "svDim") 'SourceUnpack 'SourceStrict 'DecidedStrict) (Rec0 Int) :*: S1 ('MetaSel ('Just "svVec") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (Vector (Int, a))))) |
(Unsafe) Pack a SVector
from its vector dimension and components
Note : the relevant invariants are not checked :
- vector components are _assumed_ to be in increasing order
- vector dimension is larger than any component index
Dense vectors with unboxed components
Instances
Scale DVector Source # | |
Inner DVector DVector Source # | |
Defined in Data.RPTree.Internal | |
Inner SVector DVector Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Eq a) => Eq (DVector a) Source # | |
(Unbox a, Ord a) => Ord (DVector a) Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Show a) => Show (DVector a) Source # | |
Generic (DVector a) Source # | |
NFData (DVector a) Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Serialise a) => Serialise (DVector a) Source # | |
type Rep (DVector a) Source # | |
Defined in Data.RPTree.Internal |
fromListDv :: Unbox a => [a] -> DVector a Source #
fromVectorDv :: Vector a -> DVector a Source #
Vector space types
class (Scale u, Scale v) => Inner u v where Source #
Inner product spaces
This typeclass is provided as a convenience for library users to interface their own vector types.
inner :: (Unbox a, Num a) => u a -> v a -> a Source #
metricL2 :: (Unbox a, Floating a) => u a -> v a -> a Source #
helpers for implementing Inner instances
inner product
innerSS :: (Vector u (Int, a), Vector v (Int, a), Unbox a, Num a) => u (Int, a) -> v (Int, a) -> a Source #
sparse-sparse inner product
innerSD :: (Num a, Vector u (Int, a), Vector v a, Unbox a) => u (Int, a) -> v a -> a Source #
sparse-dense inner product
L2 distance
metricSSL2 :: (Floating a, Vector u a, Unbox a, Vector u (Int, a), Vector v (Int, a)) => u (Int, a) -> v (Int, a) -> a Source #
Vector distance induced by the L2 norm (sparse-sparse)
metricSDL2 :: (Floating a, Vector v1 a, Unbox a, Vector v1 (Int, a), Vector v2 a) => v1 (Int, a) -> v2 a -> a Source #
Vector distance induced by the L2 norm (sparse-dense)
Scale
Conduit
:: Monad m | |
=> Int | number of vectors to generate |
-> GenT m a | random generator for the vector components |
-> ConduitT i a (GenT m) () |
Source of random data points
Random generation
vector
:: (Monad m, Unbox a) | |
=> Double | nonzero density |
-> Int | vector dimension |
-> GenT m a | random generator of vector components |
-> GenT m (SVector a) |
Generate a sparse random vector with a given nonzero density and components sampled from the supplied random generator
:: (Monad m, Vector Vector a) | |
=> Int | vector dimension |
-> GenT m a | random generator of vector components |
-> GenT m (DVector a) |
Generate a dense random vector with components sampled from the supplied random generator
Rendering
draw :: (Show a, Boxed a, PrintfArg v) => RPTree v a -> IO () Source #
Render a tree to stdout
Useful for debugging
This should be called only for small trees, otherwise the printed result quickly overflows the screen and becomes hard to read.
NB : prints distance information rounded to two decimal digits
CSV
Encode dataset as CSV and save into file
Testing
data BenchConfig Source #
BenchConfig | |
|
Instances
Show BenchConfig Source # | |
Defined in Data.RPTree.Internal.Testing showsPrec :: Int -> BenchConfig -> ShowS # show :: BenchConfig -> String # showList :: [BenchConfig] -> ShowS # |