improved SequenceOf coverage

added a proper alias method int sampler
added a long->double hash interval function
added basic ScaledDouble function
made bind point parser honor \n conversions
improve docs
This commit is contained in:
Jonathan Shook 2024-10-04 18:35:54 -05:00
parent e338c94bec
commit e2e897ecf4
11 changed files with 414 additions and 14 deletions

View File

@ -26,6 +26,8 @@ import java.util.function.BiFunction;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.regex.Matcher.quoteReplacement;
/**
* BindPointParser parses a user-provided string template into spans. It builds a simple list of
* BindPoints, and provides both the parsed spans and the BindPoints in a result.
@ -51,10 +53,10 @@ public class BindPointParser implements BiFunction<String, Map<String, String>,
List<String> spans = new ArrayList<>();
List<BindPoint> bindpoints = new ArrayList<>();
int genid=0;
int genid = 0;
while (m.find()) {
String pre = template.substring(lastMatch, m.start());
spans.add(pre);
spans.add(unescape(pre));
lastMatch = m.end();
String reference = m.group("reference");
@ -64,21 +66,29 @@ public class BindPointParser implements BiFunction<String, Map<String, String>,
bindpoints.add(BindPoint.of(reference, bindings.getOrDefault(reference, null), BindPoint.Type.reference));
spans.add(reference);
} else if (inline1 != null) {
bindpoints.add(BindPoint.of(DEFINITION, inline1, BindPoint.Type.definition));
bindpoints.add(BindPoint.of(DEFINITION, unescape(inline1), BindPoint.Type.definition));
spans.add(inline1);
} else if (inline2 != null) {
bindpoints.add(BindPoint.of(DEFINITION, inline2, BindPoint.Type.definition));
bindpoints.add(BindPoint.of(DEFINITION, unescape(inline2), BindPoint.Type.definition));
spans.add(inline2);
} else {
throw new BasicError("Unable to parse: " + template);
}
}
spans.add(lastMatch >= 0 ? template.substring(lastMatch) : template);
spans.add(lastMatch >= 0 ? unescape(template.substring(lastMatch)) : unescape(template));
for (String span : spans) {
}
return new Result(spans, bindpoints);
}
/**
 * Converts every literal two-character sequence of a backslash followed by {@code n}
 * in the given text into a single line-feed character.
 *
 * @param s the raw text to unescape; must not be null
 * @return the text with each backslash-n sequence replaced by a newline
 */
private static String unescape(String s) {
    // A fixed literal is being replaced, so the non-regex String.replace is sufficient
    // and avoids compiling a pattern on every call.
    return s.replace("\\n", "\n");
}
public final static class Result {
private final List<String> spans;

View File

@ -88,4 +88,18 @@ public class BindPointParserTest {
}
@Test
public void testUnescapingNewlines() {
    // The inline binding definition appears in the template, in the spans, and in the bind point.
    String definition = "Template(\"-{}-\",Combinations(\"a-z\"))";
    BindPointParser parser = new BindPointParser();

    // The trailing template text holds a literal backslash-n, which is expected
    // to come back as a real newline in the final span.
    BindPointParser.Result expected = new BindPointParser.Result(
        List.of("a", definition, "\nb"),
        List.of(BindPoint.of(BindPointParser.DEFINITION, definition, BindPoint.Type.definition))
    );

    assertThat(parser.apply("a{{" + definition + "}}\\nb", Map.of())).isEqualTo(expected);
}
}

View File

@ -38,9 +38,9 @@ public class AliasSamplerDoubleInt implements DoubleToIntFunction {
private final ByteBuffer stats; // tuples of double,int,int (unfair coin, direct pointers to referents)
private final double slotCount; // The number of fair die-roll slotCount that contain unfair coin probabilities
private static final int _r0=0;
private static final int _r1=_r0+Double.BYTES;
private static final int _r2=_r1+Integer.BYTES;
public static int RECORD_LEN = _r2 + Integer.BYTES; // Record size for the above.
private static final int _r1=_r0+Double.BYTES; // unfair coin
private static final int _r2=_r1+Integer.BYTES; // + referent 1
public static int RECORD_LEN = _r2 + Integer.BYTES; // + referent 2 = Record size for the above.
// for testing
AliasSamplerDoubleInt(ByteBuffer stats) {

View File

@ -61,7 +61,9 @@ import java.util.stream.Collectors;
* the values will appear monotonically as you scan through the unit interval of all long values.
* Specifically, 0L represents 0.0d in the unit interval on input, and Long.MAX_VALUE represents
* 1.0 on the unit interval.) This mode is only recommended for advanced scenarios and should otherwise be
* avoided. You will know if you need this mode.
* avoided. You will know if you need this mode. For alias sampling, the values may not always occur
* in the order specified due to the alias table construction. However, the values will be clustered in the order
* they appear in that table.
*
*/
@Categories(Category.general)

View File

@ -0,0 +1,95 @@
package io.nosqlbench.virtdata.library.basics.shared.distributions;
/*
* Copyright (c) 2022 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import io.nosqlbench.nb.api.errors.BasicError;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.basics.core.stathelpers.AliasSamplerDoubleInt;
import io.nosqlbench.virtdata.library.basics.core.stathelpers.EvProbD;
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.HashInterval;
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.HashRange;
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.ScaledDouble;
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_long.Hash;
import java.util.ArrayList;
import java.util.List;
import java.util.function.LongToDoubleFunction;
import java.util.function.LongToIntFunction;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Maps a long input to one of a fixed set of int values, chosen through an
 * {@link AliasSamplerDoubleInt} built from the weights in the spec.
 *
 * <p>The spec is a list of {@code value} or {@code value:weight} entries, optionally
 * separated by spaces, commas, or semicolons, for example {@code "10:10 20:20 30:30 40:40"}.
 * An entry with no weight is given a weight of 1.0. Text that does not match an entry
 * is skipped by the parser rather than rejected.</p>
 *
 * <p>By default the input is hashed into a double with {@code HashInterval(0.0, 1.0)} before
 * sampling. When the {@code "map"} modifier is given, the input is instead scaled directly
 * with {@link ScaledDouble}. ScaledDouble divides the input by a positive constant, so
 * negative inputs yield negative sample points in that mode.
 * NOTE(review): how the sampler treats negative sample points is not visible here.</p>
 */
@ThreadSafeMapper
@Categories(Category.distributions)
public class WeightedInts implements LongToIntFunction {

    /** Matches one {@code value[:weight]} entry plus any trailing separator characters. */
    private static final Pattern WEIGHT_PATTERN = Pattern.compile(
        "(?<value>\\d+)(:(?<weight>[0-9.]+))?([; ,]+)?"
    );

    private final AliasSamplerDoubleInt sampler;
    private final LongToIntFunction function;

    /**
     * @param spec      the weighted value spec, e.g. {@code "10:10 20:20 30:30 40:40"}
     * @param modifiers optional modifiers; only {@code "map"} is recognized
     * @throws BasicError if a modifier is not recognized or the spec contains no entries
     */
    public WeightedInts(String spec, String... modifiers) {
        sampler = new AliasSamplerDoubleInt(parseWeights(spec));
        this.function = applyModifiers(sampler, modifiers);
    }

    /**
     * Builds the long-to-int function by choosing how the long input is converted
     * to the double that is handed to the alias sampler.
     */
    private static LongToIntFunction applyModifiers(AliasSamplerDoubleInt aliasSampler, String[] modifiers) {
        boolean mapMode = false;
        for (String modifier : modifiers) {
            if ("map".equals(modifier)) {
                mapMode = true;
            } else {
                throw new BasicError("Unrecognized modifier: " + modifier);
            }
        }

        // Hashing is the default; "map" switches to direct scaling of the input.
        LongToDoubleFunction toSamplePoint = mapMode ? new ScaledDouble() : new HashInterval(0.0d, 1.0d);
        return (long l) -> aliasSampler.applyAsInt(toSamplePoint.applyAsDouble(l));
    }

    /**
     * Extracts every {@code value[:weight]} entry found in the spec.
     *
     * @throws BasicError if no entry can be found in the spec
     */
    private static List<EvProbD> parseWeights(String spec) {
        List<EvProbD> events = new ArrayList<>();
        Matcher matcher = WEIGHT_PATTERN.matcher(spec);
        while (matcher.find()) {
            int value = Integer.parseInt(matcher.group("value"));
            String weightSpec = matcher.group("weight");
            double eventWeight = (weightSpec != null) ? Double.parseDouble(weightSpec) : 1.0d;
            events.add(new EvProbD(value, eventWeight));
        }
        if (events.isEmpty()) {
            // Fail at construction time with a clear message instead of building a sampler over nothing.
            throw new BasicError("No weighted values could be parsed from spec '" + spec + "'");
        }
        return events;
    }

    @Override
    public int applyAsInt(long value) {
        return function.applyAsInt(value);
    }
}

View File

@ -0,0 +1,58 @@
/*
* Copyright (c) 2022 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_double;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_long.Hash;
import java.util.function.LongToDoubleFunction;
/**
 * Hashes the long input and projects the hash linearly onto the span that starts at
 * {@code min} and has width {@code max - min}.
 *
 * <p>The hash is divided by a constant slightly larger than {@code Long.MAX_VALUE}, so the
 * intent is a strict interval: for the bounds 0.0 and 1.0 it can yield values from 0.0 up to,
 * but not including, 1.0. NOTE(review): this presumes the hash is never negative; confirm
 * against the Hash function.</p>
 */
@ThreadSafeMapper
@Categories({Category.general})
public class HashInterval implements LongToDoubleFunction {

    /** Divisor for the hashed value; padded above Long.MAX_VALUE so the quotient stays below 1.0. */
    private final static double MAX_DOUBLE_VIA_LONG_PHI = ((double) Long.MAX_VALUE) + 1026d;

    private final Hash hash = new Hash();
    private final double min;
    private final double max;
    private final double interval;

    /**
     * @param min the lower bound of the output span
     * @param max the upper bound of the output span; must not be less than min
     * @throws RuntimeException if min is greater than max
     */
    public HashInterval(double min, double max) {
        if (min > max) {
            throw new RuntimeException("min must be less than or equal to max");
        }
        this.min = min;
        this.max = max;
        this.interval = max - min;
    }

    @Override
    public double applyAsDouble(long value) {
        long hashed = hash.applyAsLong(value);
        // Scale the hash to a unit fraction, then stretch and shift it onto the configured span.
        return min + (interval * (hashed / MAX_DOUBLE_VIA_LONG_PHI));
    }
}

View File

@ -0,0 +1,61 @@
/*
* Copyright (c) 2022-2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_double;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_long.Hash;
import java.util.function.LongToDoubleFunction;
/*
* <p>This function attempts to take a double
* unit interval value from a long/long division over the whole
* range of long values but via double value types, thus providing
* a very linear sample. This means that the range of double
* values to be accessed will not fall along all possible doubles,
* but will still provide suitable values for ranges close to
* high-precision points in the IEEE floating point number line.
* This suffices for most reasonable ranges in practice outside
* of scientific computing, where large exponents put adjacent
* IEEE floating point values much further apart.</p>
*
* <p>This should be considered the default double range sampling
* function for most uses, when the exponent is not needed for
* readability.</p>
*/
/**
 * Scale a long input to a double by dividing it by {@link #MAX_DOUBLE_VIA_LONG_PHI},
 * a constant padded slightly above {@code Long.MAX_VALUE}.
 * For the non-negative longs this maps onto the double unit interval [0.0d - 1.0d):
 * 0 yields 0.0 and Long.MAX_VALUE yields a value just below 1.0.
 * Negative inputs are not hashed or folded, so they yield negative results.
 */
@ThreadSafeMapper
@Categories({Category.general})
public class ScaledDouble implements LongToDoubleFunction {

    /** The divisor applied to every input. */
    public static final double MAX_DOUBLE_VIA_LONG_PHI = ((double) Long.MAX_VALUE) + 1026d;

    public ScaledDouble() {
    }

    @Override
    public double applyAsDouble(long value) {
        return value / MAX_DOUBLE_VIA_LONG_PHI;
    }
}

View File

@ -0,0 +1,74 @@
package io.nosqlbench.virtdata.library.basics.shared.distributions;
/*
* Copyright (c) 2022 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.assertj.core.data.Offset;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Tests for {@link WeightedInts}: endpoint behavior in "map" mode and the sampled
 * distribution in the default mode.
 */
public class WeightedIntsTest {

    /**
     * In "map" mode, inputs at the low end of the long range select the value 10
     * and inputs at the high end select the value 40.
     */
    @Test
    public void testWeightedInts() {
        WeightedInts weightedInts = new WeightedInts("10:10 20:20: 30:30 40:40", "map");
        assertThat(weightedInts.applyAsInt(0L)).isEqualTo(10);
        assertThat(weightedInts.applyAsInt(1L)).isEqualTo(10);
        assertThat(weightedInts.applyAsInt(Long.MAX_VALUE)).isEqualTo(40);
        assertThat(weightedInts.applyAsInt(Long.MAX_VALUE - 1L)).isEqualTo(40);
    }

    /**
     * In the default mode, the observed frequency of each value over one million
     * consecutive inputs should match its share of the total weight.
     */
    @Test
    public void testDistributionError() {
        WeightedInts weightedInts = new WeightedInts("10:10 20:20: 30:30 40:40");
        // Indexed directly by the sampled value, so the array must be larger than the largest value (40).
        double[] observed = new double[100];
        long count = 1000000;
        for (long i = 0; i < count; i++) {
            int value = weightedInts.applyAsInt(i);
            observed[value]++;
        }

        // Verify that each value has been sampled at a frequency within an absolute
        // tolerance of 0.1% of the total sample count (count/1000) of its expected frequency.
        Offset<Double> tolerance = Offset.offset(((double) count) / 1000d);
        assertThat(observed[10]).isCloseTo(((double) count) * (10.d / 100.d), tolerance);
        assertThat(observed[20]).isCloseTo(((double) count) * (20.d / 100.d), tolerance);
        assertThat(observed[30]).isCloseTo(((double) count) * (30.d / 100.d), tolerance);
        assertThat(observed[40]).isCloseTo(((double) count) * (40.d / 100.d), tolerance);
    }

    /**
     * Searches for the smallest padding which, when added to Long.MAX_VALUE as a double,
     * makes Long.MAX_VALUE divided by the padded value fall strictly below 1.0.
     */
    @Test
    @Disabled("leaving here to show boundary check logic for PHI")
    public void boundaryCheck() {
        for (long i = 0; i < 100000000; i++) {
            double pad = ((double) i) * 1.0;
            double denominator = ((double) Long.MAX_VALUE) + pad;
            double scaled = ((double) Long.MAX_VALUE) / denominator;
            if (scaled < 1.0d) {
                System.out.println("phi:" + i);
                break;
            }
        }
    }
}

View File

@ -17,6 +17,7 @@
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_long;
import io.nosqlbench.nb.api.errors.BasicError;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import static org.assertj.core.api.Assertions.assertThat;
@ -37,4 +38,15 @@ public class HashIntervalTest {
assertThatExceptionOfType(BasicError.class)
.isThrownBy(() -> new HashInterval(3L, 3L));
}
@Test
@Disabled("exhaustive boundary check, expensive")
public void testDoubleHashIntervalBounds() {
    // Fully qualified because this test class lives in the to_long package,
    // while the function under test is the to_double variant.
    io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.HashInterval unitInterval =
        new io.nosqlbench.virtdata.library.basics.shared.from_long.to_double.HashInterval(0.0, 1.0);

    for (long input = 0; input < 100000000; input++) {
        double sample = unitInterval.applyAsDouble(input);
        // Every sample must fall in [0.0, 1.0).
        assertThat(sample).isGreaterThanOrEqualTo(0.0d).isLessThan(1.0d);
    }
}
}

View File

@ -2,13 +2,13 @@ package io.nosqlbench.virtdata.library.basics.shared.from_long.to_long;
/*
* Copyright (c) 2022 nosqlbench
*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@ -26,7 +26,7 @@ import static org.junit.jupiter.api.Assertions.*;
public class SequenceOfTest {
@Test
public void testSequenceSimple() {
public void testSequenceSimpleToLong() {
SequenceOf so = new SequenceOf(1L,"0 1 2 3 4 5 6 7 8 9");
long[] results = new long[10];
for (int i = 0; i < 10; i++) {
@ -38,7 +38,20 @@ public class SequenceOfTest {
}
@Test
public void testSequenceWeighted() {
public void testSequenceSimpleToInt() {
    // Ten equally weighted values should be replayed in the order given for inputs 0..9.
    io.nosqlbench.virtdata.library.basics.shared.from_long.to_int.SequenceOf sequence =
        new io.nosqlbench.virtdata.library.basics.shared.from_long.to_int.SequenceOf(1,"0 1 2 3 4 5 6 7 8 9");

    long[] observed = new long[10];
    for (int cycle = 0; cycle < observed.length; cycle++) {
        observed[cycle] = sequence.applyAsInt(cycle);
    }

    assertArrayEquals(new long[]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, observed);
}
@Test
public void testSequenceWeightedToLong() {
SequenceOf so = new SequenceOf(1L,"0:6 1 2 3 4");
int samples = 100;
long[] results = new long[samples];
@ -56,7 +69,29 @@ public class SequenceOfTest {
assertThat(results[8]).isEqualTo(3);
assertThat(results[9]).isEqualTo(4);
assertThat(results[10]).isEqualTo(0);
}
@Test
public void testSequenceWeightedToInt() {
    io.nosqlbench.virtdata.library.basics.shared.from_long.to_int.SequenceOf sequence =
        new io.nosqlbench.virtdata.library.basics.shared.from_long.to_int.SequenceOf(1,"0:6 1 2 3 4");

    int samples = 100;
    long[] observed = new long[samples];
    for (int cycle = 0; cycle < samples; cycle++) {
        observed[cycle] = sequence.applyAsInt(cycle);
    }

    // The value 0 has weight 6 and so fills the first six slots, followed by 1, 2, 3, 4;
    // the eleventh result shows the sequence starting over.
    long[] expectedPrefix = {0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 0};
    for (int slot = 0; slot < expectedPrefix.length; slot++) {
        assertThat(observed[slot]).isEqualTo(expectedPrefix[slot]);
    }
}
}

View File

@ -21,6 +21,45 @@ import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import org.apache.commons.math4.legacy.distribution.EnumeratedIntegerDistribution;
/**
* Create a sampler based on enumeration of integer values and sample over them
* using the EnumeratedInts distribution curve provided by Apache Commons Math.
* This version will roughly produce the distribution, but since it also relies on
* interpolation by default, non-step values may appear at low frequencies. If this
* is a desired effect, then this function is suitable. For example: consider this
* result:
* <pre>{@code
* nb5 run driver=stdout op="{{EnumeratedInts('10:10 20:20 30:30 40:40')}}\n" cycles=10000 | sort -n | uniq -c
* 1 STDOUT0 (pending,current,complete)=(0,0,10000) 100.00% (last report)
* 1 9
* 1036 10
* 2 11
* 2 13
* 1 14
* 3 15
* 2 16
* 1 18
* 1 19
* 1937 20
* 1 21
* 1 23
* 1 24
* 1 25
* 1 28
* 1 29
* 3077 30
* 1 31
* 1 33
* 1 34
* 2 35
* 1 37
* 1 39
* 3924 40
* }</pre>
*
* The values here which are not multiples of 10 are not specified, yet they appear. For some testing, this is
* helpful as a fuzzer, but for more precise step-value sampling, see {@link AliasSampler}
*/
@ThreadSafeMapper
@Categories({Category.distributions})
public class EnumeratedInts extends IntToIntDiscreteCurve {