module Test.Database.LSMTree.Internal.Vector.Growing (tests) where import Prelude hiding (head, length, tail) import Control.Category ((>>>)) import Control.Monad.ST.Strict (ST, runST) import qualified Data.List as List (length) import Data.Vector (Vector, toList) import Database.LSMTree.Internal.Vector.Growing (GrowingVector, append, freeze, new) import Test.QuickCheck (Arbitrary (arbitrary, shrink), Gen, NonNegative (NonNegative, getNonNegative), Positive (Positive), Property, Small (Small), Testable, chooseInt, coverTable, shrinkList, shrinkMap, shrinkMapBy, sized, tabulate, (===)) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) -- * Tests tests :: TestTree tests = testGroup "Test.Database.LSMTree.Internal.Vector.Growing" $ [ testProperty "Final vector is correct" prop_finalVectorIsCorrect ] -- * Utilities -- ** Segments data Segment a = Replicate Int a deriving stock Show segmentLength :: Segment a -> Int segmentLength (Replicate count _) = max 0 count segmentToList :: Segment a -> [a] segmentToList (Replicate count val) = replicate count val appendSegment :: GrowingVector s a -> Segment a -> ST s () appendSegment vec (Replicate count val) = append vec count val -- ** Vector construction {-| Constructs an ordinary vector by creating a growing vector, appending segments to it, and finally freezing it. -} finalVector :: Int -- Initial buffer size -> [Segment a] -- Segments -> Vector a -- Final vector finalVector initialBufferSize segments = runST $ do vec <- new initialBufferSize mapM_ (appendSegment vec) segments freeze vec {-| Supplies the final contents of a growing vector for constructing a property. The resulting property additionally provides information about the occurrence of buffer size scaling exponents, where a buffer size scaling exponent is the binary logarithm of the ratio between the buffer size after and before appending a segment. If buffer enlargement for an append would happen in cycles where in each cycle the buffer size is doubled, then the buffer size scaling exponent would be equal to the number of such cycles. Therefore, a buffer size scaling exponent tells if the buffer is enlarged and, if yes, how much. -} withFinalVector :: Testable prop => Int -- Initial buffer size -> [Segment a] -- Segments -> (Vector a -> prop) -- Property from final vector -> Property -- Resulting property withFinalVector initialBufferSize segments consumer = coverTable "Buffer size scaling exponents" [(show scalingExp, 10) | scalingExp <- [0 .. 3] :: [Int]] $ tabulate "Buffer size scaling exponents" (show <$> bufferSizeScalingExponents availableBufferSizes 0 segments) $ consumer (finalVector initialBufferSize segments) where availableBufferSizes :: [Int] availableBufferSizes = iterate (* 2) initialBufferSize {- We do not need to account for overflow here, because the lengths of the vectors we construct are small compared to @maxBound :: Int@. -} bufferSizeScalingExponents :: [Int] -> Int -> [Segment a] -> [Int] bufferSizeScalingExponents _ _ [] = [] bufferSizeScalingExponents availableSizes length (head : tail) = List.length insufficient : bufferSizeScalingExponents sufficient length' tail where length' :: Int !length' = length + segmentLength head insufficient, sufficient :: [Int] (insufficient, sufficient) = span (< length') availableSizes -- * Properties to test prop_finalVectorIsCorrect :: Positive (Small Int) -> Segments Integer -> Property prop_finalVectorIsCorrect (Positive (Small initialBufferSize)) (Segments segments) = withFinalVector initialBufferSize segments $ \ vec -> toList vec === concatMap segmentToList segments -- * Test case generation and shrinking generateSegment :: Arbitrary a => Int -> Gen (Segment a) generateSegment maxCount = Replicate <$> chooseInt (0, maxCount) <*> arbitrary shrinkSegment :: Arbitrary a => Segment a -> [Segment a] shrinkSegment (Replicate count val) = [Replicate count' val | count' <- shrinkNat count] ++ [Replicate count val' | val' <- shrink val] where shrinkNat :: Int -> [Int] shrinkNat = shrinkMap getNonNegative NonNegative {-| A list of segments to be appended to an initially empty vector. 'Segments a' is isomorphic to '[Segment a]' but has a special way of generating test cases, which avoids skewing with respect to buffer size scaling exponents. The key issue of segment list generation is how to generate the length of an individual segment. It seems worthwhile to randomly pick a natural number smaller than or equal to some maximum length, using a uniform distribution. The question to be answered is how to choose the maximum lengths for the different segments. A naïve approach is to use a common maximum length for all segments in a particular list, possibly computed from the size parameter. However, this results in most appends not enlarging the buffer, meaning that the buffer size scaling exponent is zero in most cases. This is because, as the buffer size increases, the number of elements needed to cause the next buffer enlargement increases too. Note that it does not help much to choose the maximum segment length large compared to the initial buffer size. If this is done, then the buffer is enlarged very quickly, likely during the first append, and the distributions of buffer size scaling exponents up to this first enlargement may actually be very good. However, after the first enlargement, the buffer size is typically of the same order of magnitude as the maximum segment length, so that the buffer size scaling exponents of the following appends are likely to be zero or close to it, with further buffer enlargements making the situation only worse. A better idea is to choose the maximum length of each segment but the first proportional to the length of the vector just before appending it (for the first segment, such a choice is not appropriate, because it would lead to a maximum length of zero). With this approach, the distribution of buffer size scaling exponents stays roughly the same as soon as the buffer has been enlarged for the first time, because then the ratio between the current buffer size and the current vector length is always in the interval \([1, 2)\) and thus varies only slightly. Reasonable distributions of buffer size scaling exponents can be achieved by choosing the ratio between maximum segment lengths and corresponding current-vector lengths appropriately. A downside of this approach is that it requires tracking the current vector length. This tracking can be avoided by considering only the /expected/ current vector length and choosing the maximum segment length proportional to it. The expected length of a vector is the sum of the expected lengths of the segments constituting it, and the expected length of a segment is proportional to the maximum length used for generating it. Therefore, this modified solution boils down to choosing the maximum length of each segment but the first proportional to the sum of the maximum lengths of the segments preceding it. Note that, with the approach just set out, the maximum lengths of the segments in a segment list almost form an exponential sequence, whose base is determined by the chosen ratio between maximum segment lengths and corresponding sums of maximum predecessor segment lengths. Therefore, a similarly good, but simpler, solution is to have the maximum segment lengths precisely form an exponential sequence. This is the approach that we use in our segment list generation algorithm. The concrete exponential sequences that we employ start with the size parameter and use 3 as the base of the exponentiation. Since initial buffer sizes are generated using a uniform distribution with the size parameter as the maximum, the choice of the size parameter as the first segment’s maximum length leads to non-trivial distributions of buffer size scaling exponents for the appends up to the one that causes the first buffer enlargement. The choice of 3 as the exponentiation’s base leads to a good overall distribution of buffer size scaling exponents, as experiments have shown. Because of the exponential growth of maximum segment lengths, vectors constructed during testing get very large for longer segment lists. Therefore, we do not pick the lengths of segment lists in the usual way but instead make all segment lists have a common, small length. Concretely, we use 9 as this common length, because this leads to vectors that are not trivial but still consume reasonable amounts of memory. -} newtype Segments a = Segments [Segment a] deriving stock Show instance Arbitrary a => Arbitrary (Segments a) where arbitrary = sized $ iterate (* scalingFactor) >>> take count >>> mapM generateSegment >>> fmap Segments where scalingFactor :: Int scalingFactor = 3 count :: Int count = 9 shrink = shrinkMapBy Segments (\ (Segments segments) -> segments) $ shrinkList shrinkSegment