module DenseIntSet ( -- * DenseIntSet DenseIntSet, -- ** Constructors foldable, topValueIndices, filteredIndices, -- *** Composition intersection, union, -- ** Accessors size, lookup, -- *** Vectors presentElementsVector, indexVector, filterVector, -- *** Unfoldrs presentElementsUnfoldr, absentElementsUnfoldr, vectorElementsUnfoldr, -- * Composition DenseIntSetComposition, compose, composeList, ) where import DenseIntSet.Prelude hiding (intersection, union, lookup) import qualified DeferredFolds.Unfoldr as Unfoldr import qualified Data.Vector as Vector import qualified Data.Vector.Unboxed as UnboxedVector import qualified Data.Vector.Generic as GenericVector import qualified Data.Vector.Generic.Mutable as MutableGenericVector import qualified Data.Vector.Algorithms.Intro as IntroVectorAlgorithm -- * DenseIntSet ------------------------- {-| Set of integer values represented as a space-optimized dense array of booleans, where an entry only occupies 1 bit. Compared to IntSet of the \"containers\" library, it trades off the the ability for modification for much better lookup performance. Hence it best fits the usage patterns, where you first create the set and then only use it for lookups. Since there's multiple ways to implement a monoid for this data-structure, the instances are provided for "DenseIntSetComposition", which is open for interpretation of how to compose. -} newtype DenseIntSet = DenseIntSet (UnboxedVector Word64) deriving instance Eq DenseIntSet deriving instance Ord DenseIntSet instance Show DenseIntSet where show = show . toList deriving instance Serialize DenseIntSet instance Hashable DenseIntSet where hashWithSalt salt (DenseIntSet vec) = hashWithSalt salt (GenericVector.toList vec) instance IsList DenseIntSet where type Item DenseIntSet = Int fromList list = foldable (foldl' max 0 list) list toList = toList . presentElementsUnfoldr -- * Constructors ------------------------- {-| Given a maximum int, construct from a foldable of ints, which are smaller or equal to it. It is your responsibility to ensure that the values match this contract. -} foldable :: Foldable foldable => Int -> foldable Int -> DenseIntSet foldable size foldable = let !wordsAmount = divCeiling size 64 in DenseIntSet $ runST $ do indexSetMVec <- MutableGenericVector.new wordsAmount forM_ foldable $ \ index -> let (wordIndex, bitIndex) = divMod index 64 in MutableGenericVector.modify indexSetMVec (flip setBit bitIndex) wordIndex GenericVector.unsafeFreeze indexSetMVec {-| Interpret a composition as an intersection of sets. -} intersection :: DenseIntSetComposition -> DenseIntSet intersection = zipWords (.&.) maxBound {-| Interpret a composition as a union of sets. -} union :: DenseIntSetComposition -> DenseIntSet union = zipWords (.|.) 0 zipWords :: (Word64 -> Word64 -> Word64) -> Word64 -> DenseIntSetComposition -> DenseIntSet zipWords append empty (DenseIntSetComposition minLength vecs) = DenseIntSet $ runST $ let wordIndexUnfoldr = Unfoldr.intsInRange 0 (pred minLength) vecVec = Vector.fromList vecs vecUnfoldr = Unfoldr.foldable vecVec wordUnfoldrAt wordIndex = fmap (flip GenericVector.unsafeIndex wordIndex) vecUnfoldr finalWordAt = foldr append empty . wordUnfoldrAt in do indexSetMVec <- MutableGenericVector.new minLength forM_ wordIndexUnfoldr $ \ index -> MutableGenericVector.unsafeWrite indexSetMVec index (finalWordAt index) GenericVector.unsafeFreeze indexSetMVec {-| Using the provided ordering function, select the indices of the specified amount of top-ordered elements of a generic vector. -} topValueIndices :: (GenericVector.Vector vector a, GenericVector.Vector vector (a, Int)) => (a -> a -> Ordering) -> Int -> vector a -> DenseIntSet topValueIndices compare amount valueVec = let valuesAmount = GenericVector.length valueVec limitedAmount = min amount valuesAmount wordsAmount = divCeiling valuesAmount 64 in runST $ do pairMVec <- GenericVector.unsafeThaw (GenericVector.imap (\ index count -> (count, index)) valueVec) IntroVectorAlgorithm.selectBy (\ a b -> compare (fst b) (fst a)) pairMVec limitedAmount indexSetMVec <- MutableGenericVector.new wordsAmount forM_ (Unfoldr.intsInRange 0 (pred limitedAmount)) $ \ pairIndex -> do (_, index) <- MutableGenericVector.unsafeRead pairMVec pairIndex let (wordIndex, bitIndex) = divMod index 64 MutableGenericVector.modify indexSetMVec (flip setBit bitIndex) wordIndex DenseIntSet <$> GenericVector.unsafeFreeze indexSetMVec {-| Select the indices of vector elements, which match the predicate. -} filteredIndices :: GenericVector.Vector vector a => (a -> Bool) -> vector a -> DenseIntSet filteredIndices predicate valueVec = DenseIntSet $ let valuesAmount = GenericVector.length valueVec wordsAmount = divCeiling valuesAmount 64 indexUnfoldr = do (index, a) <- Unfoldr.vectorWithIndices valueVec guard (predicate a) return (divMod index 64) in runST $ do indexSetMVec <- MutableGenericVector.new wordsAmount forM_ indexUnfoldr $ \ (wordIndex, bitIndex) -> MutableGenericVector.modify indexSetMVec (flip setBit bitIndex) wordIndex GenericVector.unsafeFreeze indexSetMVec -- * Accessors ------------------------- {-| /O(log n)/. Count the amount of present elements in the set. -} size :: DenseIntSet -> Int size (DenseIntSet vec) = getSum (foldMap (Sum . popCount) (Unfoldr.vector vec)) {-| /O(1)/. Check whether an int is a member of the set. -} lookup :: Int -> DenseIntSet -> Bool lookup index (DenseIntSet vec) = let (wordIndex, bitIndex) = divMod index 64 in vec GenericVector.!? wordIndex & maybe False (flip testBit bitIndex) -- ** Vectors ------------------------- {-| Extract the present elements into a vector. -} presentElementsVector :: GenericVector.Vector vector element => DenseIntSet -> (Int -> element) -> vector element presentElementsVector intSet intToIndex = let sizeVal = size intSet unfoldr = Unfoldr.zipWithIndex (presentElementsUnfoldr intSet) in runST $ do mv <- MutableGenericVector.unsafeNew sizeVal forM_ unfoldr $ \ (index, element) -> MutableGenericVector.unsafeWrite mv index (intToIndex element) GenericVector.unsafeFreeze mv {-| Construct a vector, which maps from the original ints into their indices amongst the ones present in the set. -} indexVector :: GenericVector.Vector vector (Maybe index) => DenseIntSet -> (Int -> index) -> vector (Maybe index) indexVector set@(DenseIntSet setVec) intToIndex = let indexVecSize = GenericVector.length setVec * 64 in runST $ do v <- MutableGenericVector.replicate indexVecSize Nothing forM_ (Unfoldr.zipWithIndex (presentElementsUnfoldr set)) $ \ (index, element) -> do MutableGenericVector.unsafeWrite v element (Just (intToIndex index)) GenericVector.unsafeFreeze v {-| Filter a vector, leaving only the entries, under the indices, which are in the set. It is your responsibility to ensure that the indices in the set don't exceed the original vector's bounds. -} filterVector :: GenericVector.Vector vector a => DenseIntSet -> vector a -> vector a filterVector set vector = let !newVectorSize = size set in runST $ do newVector <- MutableGenericVector.unsafeNew newVectorSize forM_ (Unfoldr.zipWithIndex (vectorElementsUnfoldr vector set)) $ \ (newIndex, a) -> do MutableGenericVector.unsafeWrite newVector newIndex a GenericVector.unsafeFreeze newVector -- ** Unfoldr ------------------------- {-| Unfold the present elements. -} presentElementsUnfoldr :: DenseIntSet -> Unfoldr Int presentElementsUnfoldr (DenseIntSet vec) = do (wordIndex, word) <- Unfoldr.vectorWithIndices vec bitIndex <- Unfoldr.setBitIndices word return (wordIndex * 64 + bitIndex) {-| Unfold the absent elements. -} absentElementsUnfoldr :: DenseIntSet -> Unfoldr Int absentElementsUnfoldr (DenseIntSet vec) = do (wordIndex, word) <- Unfoldr.vectorWithIndices vec bitIndex <- Unfoldr.unsetBitIndices word return (wordIndex * 64 + bitIndex) {-| Unfold the elements of a vector by indices in the set. It is your responsibility to ensure that the indices in the set don't exceed the vector's bounds. -} vectorElementsUnfoldr :: (GenericVector.Vector vector a) => vector a -> DenseIntSet -> Unfoldr a vectorElementsUnfoldr vec = fmap (vec GenericVector.!) . presentElementsUnfoldr -- * Composition ------------------------- {-| Abstraction over the composition of sets, which is cheap to append and can be used for interpreted merging of sets. -} data DenseIntSetComposition = DenseIntSetComposition !Int [UnboxedVector Word64] instance Semigroup DenseIntSetComposition where (<>) (DenseIntSetComposition leftMinLength leftVecs) = if null leftVecs then id else \ (DenseIntSetComposition rightMinLength rightVecs) -> if null rightVecs then DenseIntSetComposition leftMinLength leftVecs else DenseIntSetComposition (min leftMinLength rightMinLength) (leftVecs <> rightVecs) instance Monoid DenseIntSetComposition where mempty = DenseIntSetComposition 0 [] mappend = (<>) {-| Lift a set into composition. -} compose :: DenseIntSet -> DenseIntSetComposition compose (DenseIntSet vec) = DenseIntSetComposition (UnboxedVector.length vec) (pure vec) {-| Lift a list of sets into composition. -} composeList :: [DenseIntSet] -> DenseIntSetComposition composeList list = if null list then mempty else let unboxedVec = fmap (\ (DenseIntSet x) -> x) list in DenseIntSetComposition (foldr1 min (fmap UnboxedVector.length unboxedVec)) unboxedVec