rp-tree-0.2.1.0: Random projection trees
Safe HaskellNone
LanguageHaskell2010

Data.RPTree

Description

Random projection trees for approximate nearest neighbor search in high-dimensional vector spaces

Synopsis

Construction

tree Source #

Arguments

:: (Monad m, Inner SVector v) 
=> Word64

random seed

-> Int

max tree depth

-> Int

min leaf size

-> Int

data chunk size

-> Double

nonzero density of projection vectors

-> Int

dimension of projection vectors

-> ConduitT () (v Double) m ()

data source

-> m (RPTree Double (Vector (v Double))) 

Populate a tree from a data stream

Assumptions on the data source:

  • non-empty : contains at least one value
  • stationary : each chunk is representative of the whole dataset
  • bounded : we wait until the end of the stream to produce a result

Throws EmptyResult if the conduit is empty

forest Source #

Arguments

:: (Monad m, Inner SVector v) 
=> Word64

random seed

-> Int

max tree depth

-> Int

min leaf size

-> Int

number of trees

-> Int

data chunk size

-> Double

nonzero density of projection vectors

-> Int

dimension of projection vectors

-> ConduitT () (v Double) m ()

data source

-> m (RPForest Double (Vector (v Double))) 

Populate a forest from a data stream

Assumptions on the data source:

  • non-empty : contains at least one value
  • stationary : each chunk is representative of the whole dataset
  • bounded : we wait until the end of the stream to produce a result

Throws EmptyResult if the conduit is empty

Query

knn Source #

Arguments

:: (Ord p, Inner SVector v, Unbox d, Real d) 
=> (v2 -> v d -> p)

distance function

-> Int

k neighbors

-> RPForest d (Vector v2)

random projection forest

-> v d

query point

-> Vector (p, v2)

ordered in increasing distance order

k nearest neighbors

I/O

serialiseRPForest Source #

Arguments

:: (Serialise d, Serialise a, Unbox d) 
=> RPForest d a 
-> [ByteString]

the order is undefined

Serialise each tree in the RPForest as a separate bytestring

deserialiseRPForest Source #

Arguments

:: (Serialise d, Serialise a, Unbox d) 
=> [ByteString] 
-> Either String (RPForest d a)

the order is undefined

Deserialise each tree in the RPForest from a separate bytestring and reconstruct

Validation

recallWith Source #

Arguments

:: (Inner SVector v, Unbox a, Fractional a, Ord a, Ord d, Ord p) 
=> (p -> v a -> d) 
-> RPForest a (Vector p) 
-> Int

k : number of nearest neighbors to consider

-> v a

query point

-> a 

average recall-at-k, computed over a set of trees

Access

levels :: RPTree d a -> Int Source #

Number of tree levels

points :: Monoid m => RPTree d m -> m Source #

Set of data points used to construct the index

leaves :: RPTree d a -> [a] Source #

candidates Source #

Arguments

:: (Inner SVector v, Unbox d, Ord d, Num d, Semigroup xs) 
=> RPTree d xs 
-> v d

query point

-> xs 

Retrieve points nearest to the query

in case of a narrow margin, collect both branches of the tree

Types

RPTree

data RPTree d a Source #

Random projection trees

The first type parameter corresponds to a floating point scalar value, the second is the type of the data collected at the leaves of the tree (e.g. lists of vectors)

We keep them separate to leverage the Functor instance for postprocessing and visualization

One projection vector per tree level (as suggested in https://www.cs.helsinki.fi/u/ttonteri/pub/bigdata2016.pdf )

Instances

Instances details
Functor (RPTree d) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

fmap :: (a -> b) -> RPTree d a -> RPTree d b #

(<$) :: a -> RPTree d b -> RPTree d a #

Foldable (RPTree d) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

fold :: Monoid m => RPTree d m -> m #

foldMap :: Monoid m => (a -> m) -> RPTree d a -> m #

foldMap' :: Monoid m => (a -> m) -> RPTree d a -> m #

foldr :: (a -> b -> b) -> b -> RPTree d a -> b #

foldr' :: (a -> b -> b) -> b -> RPTree d a -> b #

foldl :: (b -> a -> b) -> b -> RPTree d a -> b #

foldl' :: (b -> a -> b) -> b -> RPTree d a -> b #

foldr1 :: (a -> a -> a) -> RPTree d a -> a #

foldl1 :: (a -> a -> a) -> RPTree d a -> a #

toList :: RPTree d a -> [a] #

null :: RPTree d a -> Bool #

length :: RPTree d a -> Int #

elem :: Eq a => a -> RPTree d a -> Bool #

maximum :: Ord a => RPTree d a -> a #

minimum :: Ord a => RPTree d a -> a #

sum :: Num a => RPTree d a -> a #

product :: Num a => RPTree d a -> a #

Traversable (RPTree d) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

traverse :: Applicative f => (a -> f b) -> RPTree d a -> f (RPTree d b) #

sequenceA :: Applicative f => RPTree d (f a) -> f (RPTree d a) #

mapM :: Monad m => (a -> m b) -> RPTree d a -> m (RPTree d b) #

sequence :: Monad m => RPTree d (m a) -> m (RPTree d a) #

(Unbox d, Eq d, Eq a) => Eq (RPTree d a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

(==) :: RPTree d a -> RPTree d a -> Bool #

(/=) :: RPTree d a -> RPTree d a -> Bool #

(Unbox d, Show d, Show a) => Show (RPTree d a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

showsPrec :: Int -> RPTree d a -> ShowS #

show :: RPTree d a -> String #

showList :: [RPTree d a] -> ShowS #

Generic (RPTree d a) Source # 
Instance details

Defined in Data.RPTree.Internal

Associated Types

type Rep (RPTree d a) :: Type -> Type #

Methods

from :: RPTree d a -> Rep (RPTree d a) x #

to :: Rep (RPTree d a) x -> RPTree d a #

(NFData a, NFData d) => NFData (RPTree d a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

rnf :: RPTree d a -> () #

(Serialise d, Serialise a, Unbox d) => Serialise (RPTree d a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

encode :: RPTree d a -> Encoding #

decode :: Decoder s (RPTree d a) #

encodeList :: [RPTree d a] -> Encoding #

decodeList :: Decoder s [RPTree d a] #

type Rep (RPTree d a) Source # 
Instance details

Defined in Data.RPTree.Internal

type Rep (RPTree d a)

type RPForest d a = IntMap (RPTree d a) Source #

A random projection forest is an ordered set of RPTrees

This supports efficient updates of the ensemble in the streaming/online setting.

data SVector a Source #

Sparse vectors with unboxed components

Instances

Instances details
Scale SVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

(.*) :: (Unbox a, Num a) => a -> SVector a -> SVector a Source #

Inner SVector Vector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

inner :: (Unbox a, Num a) => SVector a -> Vector a -> a Source #

metricL2 :: (Unbox a, Floating a) => SVector a -> Vector a -> a Source #

(^+^) :: (Unbox a, Num a) => SVector a -> Vector a -> SVector a Source #

(^-^) :: (Unbox a, Num a) => SVector a -> Vector a -> SVector a Source #

Inner SVector DVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

inner :: (Unbox a, Num a) => SVector a -> DVector a -> a Source #

metricL2 :: (Unbox a, Floating a) => SVector a -> DVector a -> a Source #

(^+^) :: (Unbox a, Num a) => SVector a -> DVector a -> SVector a Source #

(^-^) :: (Unbox a, Num a) => SVector a -> DVector a -> SVector a Source #

Inner SVector SVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

inner :: (Unbox a, Num a) => SVector a -> SVector a -> a Source #

metricL2 :: (Unbox a, Floating a) => SVector a -> SVector a -> a Source #

(^+^) :: (Unbox a, Num a) => SVector a -> SVector a -> SVector a Source #

(^-^) :: (Unbox a, Num a) => SVector a -> SVector a -> SVector a Source #

(Unbox a, Eq a) => Eq (SVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

(==) :: SVector a -> SVector a -> Bool #

(/=) :: SVector a -> SVector a -> Bool #

(Unbox a, Ord a) => Ord (SVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

compare :: SVector a -> SVector a -> Ordering #

(<) :: SVector a -> SVector a -> Bool #

(<=) :: SVector a -> SVector a -> Bool #

(>) :: SVector a -> SVector a -> Bool #

(>=) :: SVector a -> SVector a -> Bool #

max :: SVector a -> SVector a -> SVector a #

min :: SVector a -> SVector a -> SVector a #

(Unbox a, Show a) => Show (SVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

showsPrec :: Int -> SVector a -> ShowS #

show :: SVector a -> String #

showList :: [SVector a] -> ShowS #

Generic (SVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Associated Types

type Rep (SVector a) :: Type -> Type #

Methods

from :: SVector a -> Rep (SVector a) x #

to :: Rep (SVector a) x -> SVector a #

NFData (SVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

rnf :: SVector a -> () #

(Unbox a, Serialise a) => Serialise (SVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

type Rep (SVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

type Rep (SVector a) = D1 ('MetaData "SVector" "Data.RPTree.Internal" "rp-tree-0.2.1.0-5AqidgumDDrLxZlp3YUXya" 'False) (C1 ('MetaCons "SV" 'PrefixI 'True) (S1 ('MetaSel ('Just "svDim") 'SourceUnpack 'SourceStrict 'DecidedStrict) (Rec0 Int) :*: S1 ('MetaSel ('Just "svVec") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (Vector (Int, a)))))

fromListSv :: Unbox a => Int -> [(Int, a)] -> SVector a Source #

fromVectorSv Source #

Arguments

:: Int

vector dimension

-> Vector (Int, a)

vector components (in increasing order)

-> SVector a 

(Unsafe) Pack a SVector from its vector dimension and components

Note : the relevant invariants are not checked :

  • vector components are _assumed_ to be in increasing order
  • vector dimension is larger than any component index

data DVector a Source #

Dense vectors with unboxed components

Instances

Instances details
Scale DVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

(.*) :: (Unbox a, Num a) => a -> DVector a -> DVector a Source #

Inner DVector DVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

inner :: (Unbox a, Num a) => DVector a -> DVector a -> a Source #

metricL2 :: (Unbox a, Floating a) => DVector a -> DVector a -> a Source #

(^+^) :: (Unbox a, Num a) => DVector a -> DVector a -> DVector a Source #

(^-^) :: (Unbox a, Num a) => DVector a -> DVector a -> DVector a Source #

Inner SVector DVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

inner :: (Unbox a, Num a) => SVector a -> DVector a -> a Source #

metricL2 :: (Unbox a, Floating a) => SVector a -> DVector a -> a Source #

(^+^) :: (Unbox a, Num a) => SVector a -> DVector a -> SVector a Source #

(^-^) :: (Unbox a, Num a) => SVector a -> DVector a -> SVector a Source #

(Unbox a, Eq a) => Eq (DVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

(==) :: DVector a -> DVector a -> Bool #

(/=) :: DVector a -> DVector a -> Bool #

(Unbox a, Ord a) => Ord (DVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

compare :: DVector a -> DVector a -> Ordering #

(<) :: DVector a -> DVector a -> Bool #

(<=) :: DVector a -> DVector a -> Bool #

(>) :: DVector a -> DVector a -> Bool #

(>=) :: DVector a -> DVector a -> Bool #

max :: DVector a -> DVector a -> DVector a #

min :: DVector a -> DVector a -> DVector a #

(Unbox a, Show a) => Show (DVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

showsPrec :: Int -> DVector a -> ShowS #

show :: DVector a -> String #

showList :: [DVector a] -> ShowS #

Generic (DVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Associated Types

type Rep (DVector a) :: Type -> Type #

Methods

from :: DVector a -> Rep (DVector a) x #

to :: Rep (DVector a) x -> DVector a #

NFData (DVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

rnf :: DVector a -> () #

(Unbox a, Serialise a) => Serialise (DVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

type Rep (DVector a) Source # 
Instance details

Defined in Data.RPTree.Internal

type Rep (DVector a) = D1 ('MetaData "DVector" "Data.RPTree.Internal" "rp-tree-0.2.1.0-5AqidgumDDrLxZlp3YUXya" 'True) (C1 ('MetaCons "DV" 'PrefixI 'True) (S1 ('MetaSel ('Just "dvVec") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (Vector a))))

fromListDv :: Unbox a => [a] -> DVector a Source #

Vector space types

class (Scale u, Scale v) => Inner u v where Source #

Inner product spaces

This typeclass is provided as a convenience for library users to interface their own vector types.

Methods

inner :: (Unbox a, Num a) => u a -> v a -> a Source #

metricL2 :: (Unbox a, Floating a) => u a -> v a -> a Source #

(^+^) :: (Unbox a, Num a) => u a -> v a -> u a Source #

(^-^) :: (Unbox a, Num a) => u a -> v a -> u a Source #

Instances

Instances details
Inner DVector DVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

inner :: (Unbox a, Num a) => DVector a -> DVector a -> a Source #

metricL2 :: (Unbox a, Floating a) => DVector a -> DVector a -> a Source #

(^+^) :: (Unbox a, Num a) => DVector a -> DVector a -> DVector a Source #

(^-^) :: (Unbox a, Num a) => DVector a -> DVector a -> DVector a Source #

Inner SVector Vector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

inner :: (Unbox a, Num a) => SVector a -> Vector a -> a Source #

metricL2 :: (Unbox a, Floating a) => SVector a -> Vector a -> a Source #

(^+^) :: (Unbox a, Num a) => SVector a -> Vector a -> SVector a Source #

(^-^) :: (Unbox a, Num a) => SVector a -> Vector a -> SVector a Source #

Inner SVector DVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

inner :: (Unbox a, Num a) => SVector a -> DVector a -> a Source #

metricL2 :: (Unbox a, Floating a) => SVector a -> DVector a -> a Source #

(^+^) :: (Unbox a, Num a) => SVector a -> DVector a -> SVector a Source #

(^-^) :: (Unbox a, Num a) => SVector a -> DVector a -> SVector a Source #

Inner SVector SVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

inner :: (Unbox a, Num a) => SVector a -> SVector a -> a Source #

metricL2 :: (Unbox a, Floating a) => SVector a -> SVector a -> a Source #

(^+^) :: (Unbox a, Num a) => SVector a -> SVector a -> SVector a Source #

(^-^) :: (Unbox a, Num a) => SVector a -> SVector a -> SVector a Source #

class Scale v where Source #

Scale a vector

Methods

(.*) :: (Unbox a, Num a) => a -> v a -> v a Source #

Instances

Instances details
Scale Vector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

(.*) :: (Unbox a, Num a) => a -> Vector a -> Vector a Source #

Scale DVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

(.*) :: (Unbox a, Num a) => a -> DVector a -> DVector a Source #

Scale SVector Source # 
Instance details

Defined in Data.RPTree.Internal

Methods

(.*) :: (Unbox a, Num a) => a -> SVector a -> SVector a Source #

helpers for implementing Inner instances

inner product

innerSS :: (Vector u (Int, a), Vector v (Int, a), Unbox a, Num a) => u (Int, a) -> v (Int, a) -> a Source #

sparse-sparse inner product

innerSD :: (Num a, Vector u (Int, a), Vector v a, Unbox a) => u (Int, a) -> v a -> a Source #

sparse-dense inner product

innerDD :: (Vector v a, Num a) => v a -> v a -> a Source #

L2 distance

metricSSL2 :: (Floating a, Vector u a, Unbox a, Vector u (Int, a), Vector v (Int, a)) => u (Int, a) -> v (Int, a) -> a Source #

Vector distance induced by the L2 norm (sparse-sparse)

metricSDL2 :: (Floating a, Vector v1 a, Unbox a, Vector v1 (Int, a), Vector v2 a) => v1 (Int, a) -> v2 a -> a Source #

Vector distance induced by the L2 norm (sparse-dense)

Scale

scaleS :: (Vector v (a, b), Num b) => b -> v (a, b) -> v (a, b) Source #

scaleD :: (Vector v b, Num b) => b -> v b -> v b Source #

Conduit

dataSource Source #

Arguments

:: Monad m 
=> Int

number of vectors to generate

-> GenT m a

random generator for the vector components

-> ConduitT i a (GenT m) () 

Source of random data points

Random generation

vector

sparse Source #

Arguments

:: (Monad m, Unbox a) 
=> Double

nonzero density

-> Int

vector dimension

-> GenT m a

random generator of vector components

-> GenT m (SVector a) 

Generate a sparse random vector with a given nonzero density and components sampled from the supplied random generator

dense Source #

Arguments

:: (Monad m, Vector Vector a) 
=> Int

vector dimension

-> GenT m a

random generator of vector components

-> GenT m (DVector a) 

Generate a dense random vector with components sampled from the supplied random generator

Rendering

draw :: (Show a, Boxed a, PrintfArg v) => RPTree v a -> IO () Source #

Render a tree to stdout

Useful for debugging

This should be called only for small trees, otherwise the printed result quickly overflows the screen and becomes hard to read.

NB : prints distance information rounded to two decimal digits

CSV

writeCsv Source #

Arguments

:: (Show a, Show b, Unbox a) 
=> FilePath 
-> [(DVector a, b)]

data point, label

-> IO () 

Encode dataset as CSV and save into file

Testing

liftC :: (Monad m, MonadTrans t) => ConduitT i o m r -> ConduitT i o (t m) r Source #