nosqlbench-824 Add function to provide clustered values with monotonic stepping

This commit is contained in:
Jonathan Shook 2022-12-06 20:34:02 -06:00
parent 2ca4320c24
commit dc74283952
2 changed files with 242 additions and 0 deletions

View File

@ -0,0 +1,96 @@
/*
* Copyright (c) 2022 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_long;
import io.nosqlbench.virtdata.api.annotations.Example;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import java.util.function.LongUnaryOperator;
/**
 * <p>Compute a value which increases monotonically with respect to the cycle value.
 * For every {@code m >= 0}, {@code f(X+m)} is equal to or greater than {@code f(X)}.
 * In effect, this means that with a sequence of monotonic inputs, the results will be
 * monotonic as well as clustered: each result value repeats over a run of consecutive
 * inputs.</p>
 *
 * <p>The input space is divided into fixed windows of {@code average} values. For each
 * window, a size is obtained from a {@link HashRange} constructed with the bounds
 * {@code average-variance} and {@code average+variance}, applied to the base (first input)
 * of that window. Inputs of the window which fall before {@code base + size} yield the
 * window number; the remaining inputs of the window, if any, yield the next window number.
 * The results therefore approximate {@code input/average}, while the length of each run of
 * equal results varies around {@code average}. Assuming {@link HashRange} only yields values
 * within its bounds, run lengths stay within
 * {@code [average-variance, average+variance]}.</p>
 *
 * <p>NOTE(review): earlier documentation described the frequency of run lengths as varying
 * around a simple binomial distribution; the exact shape of the distribution is not
 * established by this class and should be confirmed.</p>
 *
 * <p>The practical effect of this is to be able to compute a sequence of values
 * over inputs which can act as foreign keys, but which are effectively ordered.</p>
 *
 * <p>Negative inputs are not given any special handling.</p>
 *
 * <h3>Call for Ideas</h3>
 * <p>Due to the complexity of generalizing this as a pure function over other distributions,
 * this is the only function of this type for now. If you are interested in this problem
 * domain and have some suggestions for how to extend it to other distributions, please
 * join the project or let us know.</p>
 */
@ThreadSafeMapper
public class TriangularStepFunction implements LongUnaryOperator {

    /** Width of each fixed input window; also the nominal run length of each result value. */
    private final long median;
    /** Maps the base of a window to the size selected for that window. */
    private final LongUnaryOperator sizer;
    /** Maximum distance between a selected window size and {@link #median}. */
    private final long variance;

    /**
     * Create a step function with an explicit variance.
     *
     * @param average  the width of each input window, which must be positive
     * @param variance the maximum amount by which a window size may differ from
     *                 {@code average}; must be in the range {@code [0, average]}
     * @throws IllegalArgumentException if {@code average} is not positive, or if
     *                                  {@code variance} is negative or greater than
     *                                  {@code average}
     */
    @Example({"TriangularStepFunction(100,20)","Create a sequence of values where the average and median is 100, but the range of values is between 80 and 120."})
    @Example({"TriangularStepFunction(80,10)","Create a sequence of values where the average and median is 80, but the range of values is between 70 and 90."})
    public TriangularStepFunction(long average, long variance) {
        // average == 0 must be rejected here, otherwise applyAsLong would divide by zero.
        if (average <= 0 || variance < 0 || variance > average) {
            throw new IllegalArgumentException(
                "The average must be positive, and the variance must be non-negative and no greater than the average. " +
                    "You provided average=" + average + ", variance=" + variance + "."
            );
        }
        this.median = average;
        this.variance = variance;
        this.sizer = new HashRange(average - variance, average + variance);
    }

    /**
     * Create a step function whose variance is half of the average (integer division).
     *
     * @param average the width of each input window, which must be positive
     * @throws IllegalArgumentException if {@code average} is not positive
     */
    public TriangularStepFunction(long average) {
        this(average, average / 2);
    }

    /**
     * Map an input to its step value.
     *
     * @param operand the input (cycle) value
     * @return the number of the window containing {@code operand}, or that number plus one
     *     when {@code operand} lies at or beyond the selected size of its window
     */
    @Override
    public long applyAsLong(long operand) {
        // window number
        long window = operand / median;
        // offset within window
        long offset = operand % median;
        // base (first input) of window
        long base = operand - offset;
        // size selected for this window; depends only on the base, so it is
        // the same for every input of the window
        long windowSize = sizer.applyAsLong(base);
        // first input of this window, if any, which spills over to the next step value
        long boundary = base + windowSize;
        // select current or next window
        return (boundary > operand) ? window : window + 1;
    }

    /**
     * The same computation as {@link #applyAsLong(long)}, written as a single expression.
     *
     * @param operand the input (cycle) value
     * @return the same result as {@link #applyAsLong(long)} for the same input
     */
    public long inlined(long operand) {
        return (operand < operand - operand % median + sizer.applyAsLong(operand - operand % median)) ? operand / median : operand / median + 1;
    }

    @Override
    public String toString() {
        return this.getClass().getSimpleName() + "{median=" + median + ",variance=" + variance + "}";
    }
}

View File

@ -0,0 +1,146 @@
/*
* Copyright (c) 2022 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_long;
import org.junit.jupiter.api.Test;
import java.security.InvalidParameterException;
import java.util.Arrays;
import java.util.LongSummaryStatistics;
import java.util.stream.IntStream;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Tests for {@link TriangularStepFunction}. Most tests work on run lengths: the number of
 * consecutive inputs which map to the same result value.
 */
public class TriangularStepFunctionTest {

    // Column indices of the {label, frequency} rows produced by histoFor.
    private static final int LABEL = 0;
    private static final int FREQUENCY = 1;

    /** Run lengths for (100, 20) are expected to span exactly 80 to 120. */
    @Test
    public void testExample1() {
        TriangularStepFunction e1 = new TriangularStepFunction(100, 20);
        int[] runLengths = this.rleStatsFor(e1, 0, 10000);
        System.out.println(Arrays.toString(runLengths));
        assertThat(IntStream.of(runLengths).min().orElseThrow()).isEqualTo(80L);
        assertThat(IntStream.of(runLengths).max().orElseThrow()).isEqualTo(120L);
    }

    /** Run lengths for (80, 10) are expected to span exactly 70 to 90. */
    @Test
    public void testExample2() {
        TriangularStepFunction e1 = new TriangularStepFunction(80, 10);
        int[] runLengths = this.rleStatsFor(e1, 0, 10000);
        System.out.println(Arrays.toString(runLengths));
        assertThat(IntStream.of(runLengths).min().orElseThrow()).isEqualTo(70L);
        assertThat(IntStream.of(runLengths).max().orElseThrow()).isEqualTo(90L);
    }

    /**
     * Tallies how often each run length occurs for the single-argument constructor, and
     * checks that no completed run is shorter or longer than the default variance allows.
     */
    @Test
    public void testStepSlice() {
        int avgsize = 10;
        // the single-argument constructor uses average/2 as the variance
        int variance = avgsize / 2;
        TriangularStepFunction f = new TriangularStepFunction(avgsize);
        // ary[n] is the number of completed runs of length n
        int[] ary = new int[avgsize * 2];
        long current = 0L;
        int count = 0;
        for (int i = 0; i < 10000; i++) {
            long result = f.applyAsLong(i);
            if (result == current) {
                count++;
            } else {
                ary[count]++;
                current = result;
                // the input which changed the value is the first element of the new run
                count = 1;
            }
        }
        for (int i = 0; i < ary.length; i++) {
            System.out.println("bucket " + i + ", count " + ary[i]);
            if (i < avgsize - variance || i > avgsize + variance) {
                assertThat(ary[i]).isEqualTo(0);
            }
        }
    }

    /** With zero variance, the function is expected to step exactly every 100 inputs. */
    @Test
    public void testIncrementalVariance() {
        TriangularStepFunction f = new TriangularStepFunction(100, 0);
        assertThat(f.applyAsLong(0L)).isEqualTo(0L);
        assertThat(f.applyAsLong(1L)).isEqualTo(0L);
        assertThat(f.applyAsLong(99L)).isEqualTo(0L);
        assertThat(f.applyAsLong(100L)).isEqualTo(1L);
    }

    /** With a variance of 1, run lengths are expected to span exactly 99 to 101. */
    @Test
    public void testVariance() {
        TriangularStepFunction f = new TriangularStepFunction(100, 1);
        var rlestats = rleStatsFor(f, 0, 100000);
        LongSummaryStatistics stats99to101 = statsForRle(rlestats);
        assertThat(stats99to101.getMin()).isEqualTo(99L);
        assertThat(stats99to101.getMax()).isEqualTo(101L);
        int[][] histo = histoFor(rlestats);
        // Averages the distinct run-length labels only; frequencies are not weighted in.
        LongSummaryStatistics histoStats = new LongSummaryStatistics();
        for (int[] ints : histo) {
            histoStats.accept(ints[LABEL]);
        }
        assertThat(histoStats.getAverage()).isEqualTo(100);
    }

    /**
     * Count how many inputs in {@code [firstTrialIncl, lastTrialExcl)} map to each result
     * value.
     *
     * @param f               the function under test
     * @param firstTrialIncl  the first input, inclusive
     * @param lastTrialExcl   the last input, exclusive
     * @return the counts, indexed by result value relative to the result for
     *     {@code firstTrialIncl}, with the final (possibly partial) entry removed
     * @throws InvalidParameterException if the first or last result value exceeds the int range
     */
    private int[] rleStatsFor(TriangularStepFunction f, long firstTrialIncl, long lastTrialExcl) {
        long firstBucket = f.applyAsLong(firstTrialIncl);
        long lastBucket = f.applyAsLong(lastTrialExcl);
        if (firstBucket > Integer.MAX_VALUE || lastBucket > Integer.MAX_VALUE) {
            throw new InvalidParameterException("can't fit result data into range of ints from long [" + firstBucket + "," + lastBucket + "]");
        }
        int base = (int) firstBucket;
        int[] counts = new int[(((int) lastBucket - (int) firstBucket)) + 1];
        for (long trial = firstTrialIncl; trial < lastTrialExcl; trial++) {
            long result = f.applyAsLong(trial);
            counts[(int) (result - base)]++;
        }
        // remove last partial, as only the front initial partial is compensated
        counts = Arrays.copyOfRange(counts, 0, counts.length - 1);
        return counts;
    }

    /**
     * Build a histogram over the given counts.
     *
     * @param counts the values to tally; must not be empty
     * @return one {@code {label, frequency}} row for each value from the minimum to the
     *     maximum of {@code counts}, inclusive
     */
    private int[][] histoFor(int[] counts) {
        var minval = IntStream.of(counts).min().orElseThrow();
        var maxval = IntStream.of(counts).max().orElseThrow();
        int[][] histo = new int[(maxval - minval) + 1][2];
        // label every row; the bound is the number of rows, not the width of a row
        for (int i = 0; i < histo.length; i++) {
            histo[i][LABEL] = i + minval;
        }
        for (int count : counts) {
            histo[count - minval][FREQUENCY]++;
        }
        return histo;
    }

    /**
     * Summarize the non-zero entries of the given counts.
     *
     * @param counts the run lengths to summarize
     * @return summary statistics over every entry of {@code counts} which is not zero
     */
    private LongSummaryStatistics statsForRle(int[] counts) {
        LongSummaryStatistics stats = new LongSummaryStatistics();
        for (int count : counts) {
            if (count == 0) {
                continue;
            }
            stats.accept(count);
        }
        return stats;
    }
}