Improve efficiency of random number generation (discussion #8).

Use a more efficient algorithm that can be in-lined, and keep the
generator state in a local variable.
This commit is contained in:
Martin Whitaker 2022-03-05 20:04:32 +00:00
parent 5e2ab9289b
commit e92f488753
4 changed files with 41 additions and 82 deletions

View File

@ -34,19 +34,20 @@ int test_mov_inv_random(int my_cpu)
{ {
int ticks = 0; int ticks = 0;
uint64_t seed; testword_t seed;
if (cpuid_info.flags.rdtsc) { if (cpuid_info.flags.rdtsc) {
seed = get_tsc(); seed = get_tsc();
} else { } else {
seed = UINT64_C(0x12345678) * (1 + pass_num); seed = 1 + pass_num;
} }
seed *= 0x87654321;
if (my_cpu == master_cpu) { if (my_cpu == master_cpu) {
display_test_pattern_value(seed); display_test_pattern_value(seed);
} }
// Initialize memory with the initial pattern. // Initialize memory with the initial pattern.
random_seed(my_cpu, seed); testword_t prsg_state = seed;
for (int i = 0; i < vm_map_size; i++) { for (int i = 0; i < vm_map_size; i++) {
testword_t *start, *end; testword_t *start, *end;
calculate_chunk(&start, &end, my_cpu, i, sizeof(testword_t)); calculate_chunk(&start, &end, my_cpu, i, sizeof(testword_t));
@ -69,7 +70,8 @@ int test_mov_inv_random(int my_cpu)
} }
test_addr[my_cpu] = (uintptr_t)p; test_addr[my_cpu] = (uintptr_t)p;
do { do {
write_word(p, random(my_cpu)); prsg_state = prsg(prsg_state);
write_word(p, prsg_state);
} while (p++ < pe); // test before increment in case pointer overflows } while (p++ < pe); // test before increment in case pointer overflows
do_tick(my_cpu); do_tick(my_cpu);
BAILOUT; BAILOUT;
@ -82,7 +84,7 @@ int test_mov_inv_random(int my_cpu)
for (int i = 0; i < 2; i++) { for (int i = 0; i < 2; i++) {
flush_caches(my_cpu); flush_caches(my_cpu);
random_seed(my_cpu, seed); prsg_state = seed;
for (int j = 0; j < vm_map_size; j++) { for (int j = 0; j < vm_map_size; j++) {
testword_t *start, *end; testword_t *start, *end;
calculate_chunk(&start, &end, my_cpu, j, sizeof(testword_t)); calculate_chunk(&start, &end, my_cpu, j, sizeof(testword_t));
@ -105,7 +107,8 @@ int test_mov_inv_random(int my_cpu)
} }
test_addr[my_cpu] = (uintptr_t)p; test_addr[my_cpu] = (uintptr_t)p;
do { do {
testword_t expect = random(my_cpu) ^ invert; prsg_state = prsg(prsg_state);
testword_t expect = prsg_state ^ invert;
testword_t actual = read_word(p); testword_t actual = read_word(p);
if (unlikely(actual != expect)) { if (unlikely(actual != expect)) {
data_error(p, expect, actual, true); data_error(p, expect, actual, true);

View File

@ -25,71 +25,10 @@
#include "test_helper.h" #include "test_helper.h"
//------------------------------------------------------------------------------
// Types
//------------------------------------------------------------------------------
// We keep a separate LFSR for each CPU. Space them out by at least a cache line,
// otherwise performance suffers.
typedef struct {
// Current contents of the 64-bit shift register for one CPU.
uint64_t lfsr;
// Unused filler: together with lfsr this makes each entry 8 x 8 = 64 bytes.
uint64_t pad[7];
} prsg_state_t;
//------------------------------------------------------------------------------
// Private Variables
//------------------------------------------------------------------------------
// One generator state per CPU; prsg() and random_seed() index it with my_cpu.
static prsg_state_t prsg_state[MAX_CPUS];
//------------------------------------------------------------------------------
// Private Functions
//------------------------------------------------------------------------------
// Advances the LFSR for my_cpu by 32 bits and returns those 32 new bits.
// my_cpu is used directly as an index into prsg_state with no range check
// here; random() rejects negative values before calling.
static inline uint32_t prsg(int my_cpu)
{
// This implements a 64 bit linear feedback shift register with XNOR
// feedback from taps 64, 63, 61, 60. It generates 32 new bits each
// time the function is called. Because the feedback taps are all in
// the upper 32 bits, we can generate the new bits in parallel.
uint64_t lfsr = prsg_state[my_cpu].lfsr;
// The shifts line up register bits 63, 62, 60 and 59 at bit 31 of the
// result (and correspondingly for the lower bits); the ~ turns the XOR
// into XNOR, and the assignment to uint32_t keeps only the low 32 bits.
uint32_t feedback = ~((lfsr >> 32) ^ (lfsr >> 31) ^ (lfsr >> 29) ^ (lfsr >> 28));
// The old low half moves to the high half; the new bits become the low half.
prsg_state[my_cpu].lfsr = (lfsr << 32) | feedback;
return feedback;
}
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Public Functions // Public Functions
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
// Sets the generator state for my_cpu to seed. A negative my_cpu is ignored
// and leaves all state untouched.
void random_seed(int my_cpu, uint64_t seed)
{
if (my_cpu < 0) {
return;
}
// Avoid the PRSG illegal state.
// An all-ones register would reproduce itself under the XNOR feedback in
// prsg(), so an all-ones seed is replaced with zero.
if (~seed == 0) {
seed = 0;
}
prsg_state[my_cpu].lfsr = seed;
}
// Returns the next pseudo-random test word for my_cpu, advancing that CPU's
// generator state. Returns 0 without touching any state if my_cpu is negative.
testword_t random(int my_cpu)
{
if (my_cpu < 0) {
return 0;
}
testword_t value = prsg(my_cpu);
// For test words wider than 32 bits, take a second 32-bit output: the first
// one becomes the upper half and the second one the lower half.
#if TESTWORD_WIDTH > 32
value = value << 32 | prsg(my_cpu);
#endif
return value;
}
void calculate_chunk(testword_t **start, testword_t **end, int my_cpu, int segment, size_t chunk_align) void calculate_chunk(testword_t **start, testword_t **end, int my_cpu, int segment, size_t chunk_align)
{ {
if (my_cpu < 0) { if (my_cpu < 0) {

View File

@ -63,16 +63,23 @@ static inline uintptr_t round_up(uintptr_t value, size_t align_size)
} }
/** /**
* Seeds the psuedo-random number generator for my_cpu. * Returns the next word in a pseudo-random sequence where state was the
* previous word in that sequence.
*/ */
void random_seed(int my_cpu, uint64_t seed); static inline testword_t prsg(testword_t state)
{
/** // This uses the algorithms described at https://en.wikipedia.org/wiki/Xorshift
* Returns a psuedo-random number for my_cpu. The sequence of numbers returned #ifdef __x86_64__
* is repeatable for a given starting seed. The sequence repeats after 2^64 - 1 state ^= state << 13;
* numbers. Within that period, no number is repeated. state ^= state >> 7;
*/ state ^= state << 17;
testword_t random(int my_cpu); #else
state ^= state << 13;
state ^= state >> 17;
state ^= state << 5;
#endif
return state;
}
/** /**
* Calculates the start and end word address for the chunk of segment that is * Calculates the start and end word address for the chunk of segment that is

View File

@ -106,6 +106,8 @@ int run_test(int my_cpu, int test, int stage, int iterations)
} }
BARRIER; BARRIER;
testword_t prsg_state;
int ticks = 0; int ticks = 0;
switch (test) { switch (test) {
@ -168,12 +170,16 @@ int run_test(int my_cpu, int test, int stage, int iterations)
// Moving inversions, fixed random pattern. // Moving inversions, fixed random pattern.
case 5: case 5:
if (cpuid_info.flags.rdtsc) { if (cpuid_info.flags.rdtsc) {
random_seed(my_cpu, get_tsc()); prsg_state = get_tsc();
} else { } else {
random_seed(my_cpu, UINT64_C(0x12345678) * (1 + pass_num)); prsg_state = 1 + pass_num;
} }
prsg_state *= 0x12345678;
for (int i = 0; i < iterations; i++) { for (int i = 0; i < iterations; i++) {
testword_t pattern1 = random(my_cpu); prsg_state = prsg(prsg_state);
testword_t pattern1 = prsg_state;
testword_t pattern2 = ~pattern1; testword_t pattern2 = ~pattern1;
BARRIER; BARRIER;
@ -213,13 +219,17 @@ int run_test(int my_cpu, int test, int stage, int iterations)
// Modulo 20 check, fixed random pattern. // Modulo 20 check, fixed random pattern.
case 9: case 9:
if (cpuid_info.flags.rdtsc) { if (cpuid_info.flags.rdtsc) {
random_seed(my_cpu, get_tsc()); prsg_state = get_tsc();
} else { } else {
random_seed(my_cpu, UINT64_C(0x12345678) * (1 + pass_num)); prsg_state = 1 + pass_num;
} }
prsg_state *= 0x87654321;
for (int i = 0; i < iterations; i++) { for (int i = 0; i < iterations; i++) {
for (int offset = 0; offset < MODULO_N; offset++) { for (int offset = 0; offset < MODULO_N; offset++) {
testword_t pattern1 = random(my_cpu); prsg_state = prsg(prsg_state);
testword_t pattern1 = prsg_state;
testword_t pattern2 = ~pattern1; testword_t pattern2 = ~pattern1;
BARRIER; BARRIER;