Add an experimental mode that uses nontemporal stores (movnt[iq]) in the own address test. This is the only test where it seems to help performance uniformly across the processors I have access to, both single- and multi-socket. Per #79, it saves several dozen minutes on my 4S Opteron 62xx / 63xx servers equipped with 256 GB of RAM.

This commit is contained in:
Lionel Debroux
2022-07-13 10:27:44 +02:00
parent ed8ea7cdc4
commit a373e97175
6 changed files with 85 additions and 15 deletions

View File

@@ -97,6 +97,7 @@ bool enable_trace = false;
bool enable_sm = true;
bool enable_bench = true;
// Off by default; set to true when the "nontemporal" option is parsed.
// The own address test checks this flag (together with the CPU's SSE2
// flag) to decide whether to fill memory with non-temporal stores.
bool enable_nontemporal = false;
// On by default; set to false when the "nopause" option is parsed.
bool pause_at_start = true;
@@ -209,6 +210,8 @@ static void parse_option(const char *option, const char *params)
enable_big_status = false;
} else if (strncmp(option, "noehci", 7) == 0) {
usb_init_options |= USB_IGNORE_EHCI;
} else if (strncmp(option, "nontemporal", 12) == 0) {
enable_nontemporal = true;
} else if (strncmp(option, "nopause", 8) == 0) {
pause_at_start = false;
} else if (strncmp(option, "nosm", 5) == 0) {

View File

@@ -58,6 +58,7 @@ extern bool enable_trace;
extern bool enable_sm;
extern bool enable_tty;
extern bool enable_bench;
extern bool enable_nontemporal;
extern bool pause_at_start;

View File

@@ -44,6 +44,20 @@ static inline void write32(const volatile uint32_t *ptr, uint32_t val)
);
}
/**
 * Stores val into the 32-bit memory location addressed by ptr using a
 * non-temporal MOVNTI store. No fence instruction is issued here, so a
 * caller that needs one must emit it itself.
 */
static inline void write32_nt(const volatile uint32_t *ptr, uint32_t val)
{
    __asm__ __volatile__(
        "movntil %[value], %[target]"
        : /* no outputs: the destination is passed as a memory input */
        : [target] "m" (*ptr),
          [value]  "r" (val)
        : "memory" /* tells the compiler that memory has been modified */
    );
}
/**
* Writes val to the 32-bit memory location pointed to by ptr. Reads it
* back (and discards it) to ensure the write is complete.

View File

@@ -44,6 +44,20 @@ static inline void write64(const volatile uint64_t *ptr, uint64_t val)
);
}
/**
 * Stores val into the 64-bit memory location addressed by ptr using a
 * non-temporal MOVNTI store. No fence instruction is issued here, so a
 * caller that needs one must emit it itself.
 */
static inline void write64_nt(const volatile uint64_t *ptr, uint64_t val)
{
    __asm__ __volatile__(
        "movnti %[value], %[target]"
        : /* no outputs: the destination is passed as a memory input */
        : [target] "m" (*ptr),
          [value]  "r" (val)
        : "memory" /* tells the compiler that memory has been modified */
    );
}
/**
* Writes val to the 64-bit memory location pointed to by ptr. Reads it
* back (and discards it) to ensure the write is complete.

View File

@@ -21,6 +21,8 @@
#include "display.h"
#include "error.h"
#include "test.h"
#include "config.h"
#include "cpuid.h"
#include "test_funcs.h"
#include "test_helper.h"
@@ -37,7 +39,7 @@ static int pattern_fill(int my_cpu, testword_t offset)
display_test_pattern_name("own address");
}
// Write each address with it's own address.
// Write each address with its own address.
for (int i = 0; i < vm_map_size; i++) {
testword_t *start = vm_map[i].start;
testword_t *end = vm_map[i].end;
@@ -59,9 +61,32 @@ static int pattern_fill(int my_cpu, testword_t offset)
continue;
}
test_addr[my_cpu] = (uintptr_t)p;
do {
write_word(p, (testword_t)p + offset);
} while (p++ < pe); // test before increment in case pointer overflows
if (!offset) {
if (enable_nontemporal && cpuid_info.flags.sse2) {
do {
write_word_nt(p, (testword_t)p);
} while (p++ < pe); // test before increment in case pointer overflows
__asm__ __volatile__ ("mfence");
}
else {
do {
write_word(p, (testword_t)p);
} while (p++ < pe); // test before increment in case pointer overflows
}
}
else {
if (enable_nontemporal && cpuid_info.flags.sse2) {
do {
write_word_nt(p, (testword_t)p + offset);
} while (p++ < pe); // test before increment in case pointer overflows
__asm__ __volatile__ ("mfence");
}
else {
do {
write_word(p, (testword_t)p + offset);
} while (p++ < pe); // test before increment in case pointer overflows
}
}
do_tick(my_cpu);
BAILOUT;
} while (!at_end && ++pe); // advance pe to next start point
@@ -98,13 +123,24 @@ static int pattern_check(int my_cpu, testword_t offset)
continue;
}
test_addr[my_cpu] = (uintptr_t)p;
do {
testword_t expect = (testword_t)p + offset;
testword_t actual = read_word(p);
if (unlikely(actual != expect)) {
data_error(p, expect, actual, true);
}
} while (p++ < pe); // test before increment in case pointer overflows
if (!offset) {
do {
testword_t expect = (testword_t)p;
testword_t actual = read_word(p);
if (unlikely(actual != expect)) {
data_error(p, expect, actual, true);
}
} while (p++ < pe); // test before increment in case pointer overflows
}
else {
do {
testword_t expect = (testword_t)p + offset;
testword_t actual = read_word(p);
if (unlikely(actual != expect)) {
data_error(p, expect, actual, true);
}
} while (p++ < pe); // test before increment in case pointer overflows
}
do_tick(my_cpu);
BAILOUT;
} while (!at_end && ++pe); // advance pe to next start point

View File

@@ -21,12 +21,14 @@
*/
// Map the generic word accessor names (read_word, write_word, write_word_nt)
// onto the 64-bit implementations when building for x86-64, and onto the
// 32-bit implementations otherwise.
#ifdef __x86_64__
#include "memrw64.h"
#define read_word read64
#define write_word write64
#define read_word read64
#define write_word write64
#define write_word_nt write64_nt
#else
#include "memrw32.h"
#define read_word read32
#define write_word write32
#define read_word read32
#define write_word write32
#define write_word_nt write32_nt
#endif
/**