jshook/nosqlbench-2135-lerpfield (#2136)

* implement empirical distribution

* shore up tests for Interpolate

* add licenses
Jonathan Shook 2025-01-10 18:11:15 -06:00 committed by GitHub
parent 71f66a75fc
commit 03319d0405
4 changed files with 237 additions and 10 deletions

io/nosqlbench/virtdata/library/basics/shared/from_long/to_double/EmpiricalDistribution.java

@@ -0,0 +1,93 @@
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_double;
/*
* Copyright (c) nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import io.nosqlbench.nb.api.errors.BasicError;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.Example;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
/// This distribution is an easy-to-use and easy-to-modify distribution which
/// is based directly on observed or expected frequencies. If you imagine
/// drawing a line across a chart and then using that line to model
/// frequencies, that is what this function does.
///
/// Values must be specified as alternating x,y points. The x points draw line
/// segments from left 0.0 to right 1.0 on the unit interval, and the y points
/// plot the magnitude at each x. The LERP table uses 1000 fixed points, which
/// provides substantial precision for most systems-testing purposes.
///
/// It is valid to have y values repeated, which is another way of saying that part
/// of the sampled population will have identical values. The x coordinates must be
/// monotonically increasing, while the y values may be any valid value, even out of order.
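///
/// For example, EmpiricalDistribution(0.0d, 0.0d, 0.5d, 0.1d, 1.0d, 1.0d) maps the
/// lower half of the unit interval to values in [0.0, 0.1] and the upper half to
/// values in [0.1, 1.0].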
@ThreadSafeMapper
@Categories(Category.distributions)
public class EmpiricalDistribution extends Interpolate {
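// Number of fixed points in the generated LERP lookup table.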
private static final int lutSize = 1000;
@Example({
"EmpiricalDistribution(0.0d, 0.0d, 1.0d, 1.0d)",
"Create a uniform distribution, " + "from (x,y)=0,0 to (x,y) = 1,1"
})
@Example({
"EmpiricalDistribution(0.0d, 0.0d, 0.333d, 0.1d, 1.0d, 1.0d)",
"Create a distribution where 1/3 of values range from 0.0 to 0"
+ ".1 and 2/3 range from 0.1 to 1.0"
})
public EmpiricalDistribution(double... values) {
super(genTable(values));
}
private static double[] genTable(double[] values) {
if (values.length < 4) {
throw new BasicError("You must specify at least 2 x,y points, as in "
+ "0.0, 0.0, 1.0, 1.0, which describes a uniform distribution");
}
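// The LUT spans the unit interval with lutSize steps plus one entry for the right edge.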
double[] lut = new double[lutSize + 1];
double[] offsets = new double[values.length >> 1];
double[] magnitudes = new double[values.length >> 1];
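// Unpack alternating (x,y) values: x coordinates into offsets, y values into magnitudes.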
for (int idx = 0; idx < offsets.length; idx++) {
offsets[idx] = values[idx << 1];
magnitudes[idx] = values[(idx << 1) + 1];
}
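// Fill the LUT segment by segment, lerping between adjacent y magnitudes.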
for (int idx = 0; idx < offsets.length - 1; idx++) {
double offsetBase = offsets[idx];
int startIdx = (int) (offsetBase * lutSize);
double unitFraction = (offsets[idx + 1] - offsetBase);
if (unitFraction < 0.0) {
throw new BasicError("offsets must be increasing");
}
int segmentSize = (int) (unitFraction * lutSize);
double[] segment = new double[segmentSize + 1];
double startMagnitude = magnitudes[idx];
double endMagnitude = magnitudes[idx + 1];
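// A two-point Interpolate maps the long input range onto [startMagnitude, endMagnitude].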
Interpolate segmentLine = new Interpolate(startMagnitude, endMagnitude);
for (int ins = 0; ins < segmentSize; ins++) {
double frac = (double) ins / (double) segment.length;
frac = frac * (double) Long.MAX_VALUE;
segment[ins] = segmentLine.applyAsDouble((long) frac);
}
segment[segment.length - 1] = endMagnitude;
System.arraycopy(segment, 0, lut, startIdx, segment.length);
}
return lut;
}
}
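
A usage sketch (illustrative, not part of this commit; the demo class name is hypothetical): hashed long inputs are mapped through the distribution onto the unit interval.

import io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.EmpiricalDistribution;
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_long.Hash;

public class EmpiricalDistributionDemo {
    public static void main(String[] args) {
        // Half of the sampled values fall in [0.0, 0.1], half in [0.1, 1.0].
        EmpiricalDistribution dist =
            new EmpiricalDistribution(0.0d, 0.0d, 0.5d, 0.1d, 1.0d, 1.0d);
        Hash hash = new Hash(); // bias ordinal inputs into pseudo-random longs
        for (long i = 0; i < 5; i++) {
            long prn = hash.applyAsLong(i);
            double sample = dist.applyAsDouble(prn); // value in [0.0, 1.0]
            System.out.println(sample);
        }
    }
}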

io/nosqlbench/virtdata/library/basics/shared/from_long/to_double/Interpolate.java

@@ -51,7 +51,7 @@ public class Interpolate implements LongToDoubleFunction {
private final double resolution;
// The lookup table
- private final double[] lut;
+ public final double[] lut;
/**
* The scale of Long.MAX_VALUE and the unit interval scale factor are pre-combined

io/nosqlbench/virtdata/library/basics/shared/from_long/to_double/EmpiricalDistributionTest.java

@@ -0,0 +1,116 @@
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_double;
/*
* Copyright (c) nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_long.Hash;
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_long.InterpolateTest;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.assertj.core.data.Offset;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import static org.assertj.core.api.Assertions.assertThat;
class EmpiricalDistributionTest {
@Test
@Disabled("performance intensive")
public void testUniform() {
EmpiricalDistribution d =
new EmpiricalDistribution(0.0d, 0.0d, 1.0d, 1.0d);
DescriptiveStatistics data = InterpolateTest.tabulate(new Hash(), d, 1000000000);
assertThat(data.getPercentile(0.0001d)).isCloseTo(0.0d, Offset.offset(0.0001));
assertThat(data.getPercentile(50.0d)).isCloseTo(0.5d, Offset.offset(0.005));
assertThat(data.getPercentile(100.0d)).isCloseTo(1.0d, Offset.offset(0.001));
}
/// Convergence to expected values at different sample counts and LERP resolutions (@samples / resolution):
///
/// @100000 / 100
/// p50 = 0.09961336762080965
/// p55 = 0.4887600943079539
/// p80 = 0.9486573852803234
/// @100000 / 1000
/// p50 = 0.0996064221289679
/// p55 = 0.4887600943079539
/// p80 = 0.9497494462965901
///
/// @1000000 / 100
/// p50 = 0.10105949687725542
/// p55 = 0.49758658404616063
/// p80 = 0.9486389093179619
/// @1000000 / 1000
/// p50 = 0.10105949687725548
/// p55 = 0.49758658404616074
/// p80 = 0.9497305556617565
///
/// @10000000 / 100
/// p50 = 0.1000117051372746
/// p55 = 0.4997387848207568
/// p80 = 0.9487722639153554
/// @10000000 / 1000
/// p50 = 0.10001170513727448
/// p55 = 0.4997387848207569
/// p80 = 0.9498669032551016
///
/// @100000000 / 100
/// p50 = 0.0999966957844636
/// p55 = 0.5001328046490157
/// p80 = 0.9487758571324978
/// @100000000 / 1000
/// p50 = 0.09999663642729828
/// p55 = 0.5001328046490157
/// p80 = 0.9498705771180153
///
/// @1000000000 / 100
/// p50 = 0.09999563860575955
/// p55 = 0.5000398035892097
/// p80 = 0.9487774978532897
/// @1000000000 / 1000
@Test
@Disabled("performance intensive")
public void testPieceWise() {
EmpiricalDistribution d =
new EmpiricalDistribution(0.0d, 0.0d, 0.5d, 0.1d, 0.6d, 0.9d, 1.0d, 1.0d);
DescriptiveStatistics data = InterpolateTest.tabulate(new Hash(), d, 1000000000);
assertThat(data.getPercentile(0.0001d)).isCloseTo(0.0d, Offset.offset(0.01));
assertThat(data.getPercentile(25.0d)).isCloseTo(0.05d, Offset.offset(0.01));
// was 0.101059
double p50 = data.getPercentile(50.0d);
System.out.println("p50 = " + p50);
assertThat(p50).isCloseTo(0.1d, Offset.offset(0.005));
// was 0.4975865
double p55 = data.getPercentile(55.0d);
System.out.println("p55 = " + p55);
assertThat(p55).isCloseTo(0.5d, Offset.offset(0.1));
// was 0.948638
double p80 = data.getPercentile(80.0d);
System.out.println("p80 = " + p80);
assertThat(p80).isCloseTo(0.95d, Offset.offset(0.005));
assertThat(data.getPercentile(100.0d)).isCloseTo(1.0d, Offset.offset(0.001));
}
}

io/nosqlbench/virtdata/library/basics/shared/from_long/to_long/InterpolateTest.java

@@ -22,29 +22,47 @@ import org.apache.logging.log4j.Logger;
import org.assertj.core.data.Offset;
import org.junit.jupiter.api.Test;
import java.util.function.LongToDoubleFunction;
import java.util.function.LongUnaryOperator;
import static org.assertj.core.api.Assertions.assertThat;
public class InterpolateTest {
private final static Logger logger = LogManager.getLogger(InterpolateTest.class);
private static int iterations = 1000000;
@Test
public void testRanging() {
io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.Interpolate interpolate =
new io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.Interpolate (0.0d, 1.0d);
- Hash hf = new Hash();
- DescriptiveStatistics dss = new DescriptiveStatistics();
- long count=10000000;
- for (long i = 0; i < count; i++) {
-     long input = (long) (Long.MAX_VALUE * ((double)i/(double)count));
-     long prn = hf.applyAsLong(input);
-     double v = interpolate.applyAsDouble(prn);
-     dss.addValue(v);
- }
+ DescriptiveStatistics dss = tabulate(new Hash(), interpolate, iterations);
assertThat(dss.getPercentile(0.000001)).isCloseTo(0.0, Offset.offset(0.01));
assertThat(dss.getPercentile(50.0)).isCloseTo(0.5,Offset.offset(0.01));
assertThat(dss.getPercentile(99.99999)).isCloseTo(1.0, Offset.offset(0.01));
}
@Test
public void testShaping() {
double[] shape = new double[]{0.0,0.9,0.95,1.0};
io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.Interpolate interpolate =
new io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.Interpolate (shape);
DescriptiveStatistics dss = tabulate(new Hash(),interpolate, iterations);
assertThat(dss.getPercentile(0.000001)).isCloseTo(0.0, Offset.offset(0.01));
assertThat(dss.getPercentile(50.0)).isCloseTo(0.5,Offset.offset(0.925));
assertThat(dss.getPercentile(99.99999)).isCloseTo(1.0, Offset.offset(0.01));
}
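// Shared harness: drives f over count evenly spaced inputs, biases them through the
// given operator (such as Hash), and collects the outputs for statistical checks.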
public static DescriptiveStatistics tabulate(LongUnaryOperator bias,
LongToDoubleFunction f, int count) {
DescriptiveStatistics dss = new DescriptiveStatistics();
for (long i = 0; i < count; i++) {
long input = (long) (Long.MAX_VALUE * ((double)i/(double)count));
long prn = bias.applyAsLong(input);
double v = f.applyAsDouble(prn);
dss.addValue(v);
}
return dss;
}
@Test
public void testDeciles() {