Add an experimental mode with nontemporal stores (movnt[iq]) in the own-address test, the only test where it helps performance across most processors I have access to, both single- and multi-socket. Per #79, it saves several dozen minutes on my 4S Opteron 62xx / 63xx servers equipped with 256 GB of RAM.

This commit is contained in:
Lionel Debroux 2022-07-13 10:27:44 +02:00
parent 5a046291fa
commit 52a589fd29
6 changed files with 102 additions and 12 deletions

View File

@ -99,6 +99,7 @@ bool enable_sm = true;
bool enable_bench = true;
bool enable_mch_read = true;
bool enable_numa = false;
bool enable_nontemporal = false;
bool enable_ecc_polling = false;
@ -244,6 +245,8 @@ static void parse_option(const char *option, const char *params)
usb_init_options |= USB_IGNORE_EHCI;
} else if (strncmp(option, "nomch", 6) == 0) {
enable_mch_read = false;
} else if (strncmp(option, "nontemporal", 12) == 0) {
enable_nontemporal = true;
} else if (strncmp(option, "nopause", 8) == 0) {
pause_at_start = false;
} else if (strncmp(option, "nosm", 5) == 0) {

View File

@ -63,6 +63,7 @@ extern bool enable_bench;
extern bool enable_mch_read;
extern bool enable_ecc_polling;
extern bool enable_numa;
extern bool enable_nontemporal;
extern bool pause_at_start;

View File

@ -226,6 +226,20 @@ static inline void cpuid(uint32_t op, uint32_t count, uint32_t *eax, uint32_t *e
"2" (*ecx)
);
}
/**
 * Returns whether the processor supports nontemporal writes.
 *
 * On x86, movnti is an SSE2 instruction, so this reduces to the SSE2
 * CPUID feature flag.
 */
#define nontemporal_writes_supported() (cpuid_info.flags.sse2)
#elif defined(__loongarch_lp64)
/**
 * Returns whether the processor supports nontemporal writes.
 */
// TODO: add LoongArch detection; until then, report no support so the
// nontemporal path is never taken on this architecture.
#define nontemporal_writes_supported() (0)
#endif
#endif // CPUID_H

View File

@ -23,6 +23,7 @@
#define __MEMRW_SUFFIX_64BIT "q"
#define __MEMRW_READ_INSTRUCTIONS(bitwidth) "mov" __MEMRW_SUFFIX_##bitwidth##BIT " %1, %0"
#define __MEMRW_WRITE_INSTRUCTIONS(bitwidth) "mov" __MEMRW_SUFFIX_##bitwidth##BIT " %1, %0"
#define __MEMRW_WRITENT_INSTRUCTIONS(bitwidth) "movnti" __MEMRW_SUFFIX_##bitwidth##BIT " %1, %0"
#define __MEMRW_FLUSH_INSTRUCTIONS(bitwidth) "mov" __MEMRW_SUFFIX_##bitwidth##BIT " %1, %0; mov" __MEMRW_SUFFIX_##bitwidth##BIT " %0, %1"
#elif defined(__loongarch_lp64)
@ -62,6 +63,18 @@ static inline void write##bitwidth(const volatile uint##bitwidth##_t *ptr, uint#
); \
}
/*
 * Generates an inline nontemporal store function write<bitwidth>nt(ptr, val),
 * mirroring __MEMRW_WRITE_FUNC but emitting the architecture's streaming
 * store instruction (__MEMRW_WRITENT_INSTRUCTIONS, e.g. movnti/movntiq on
 * x86). Nontemporal stores are weakly ordered, so callers must issue a
 * store fence (e.g. mfence on x86, as the own-address test does) before
 * relying on the written data being globally visible.
 */
#define __MEMRW_WRITENT_FUNC(bitwidth) \
static inline void write##bitwidth##nt(const volatile uint##bitwidth##_t *ptr, uint##bitwidth##_t val) \
{ \
__asm__ __volatile__( \
__MEMRW_WRITENT_INSTRUCTIONS(bitwidth) \
: \
: "m" (*ptr), \
"r" (val) \
: "memory" \
); \
}
#define __MEMRW_FLUSH_FUNC(bitwidth) \
static inline void flush##bitwidth(const volatile uint##bitwidth##_t *ptr, uint##bitwidth##_t val) \
{ \
@ -108,6 +121,15 @@ __MEMRW_WRITE_FUNC(32)
*/
__MEMRW_WRITE_FUNC(64)
/**
* Writes val to the 32-bit memory location pointed to by ptr, using non-temporal hint.
*/
__MEMRW_WRITENT_FUNC(32)
/**
* Writes val to the 64-bit memory location pointed to by ptr, using non-temporal hint.
*/
__MEMRW_WRITENT_FUNC(64)
/**
* Writes val to the 8-bit memory location pointed to by ptr. Only returns when the write is complete.
*/

View File

@ -20,6 +20,8 @@
#include "display.h"
#include "error.h"
#include "test.h"
#include "config.h"
#include "cpuid.h"
#include "test_funcs.h"
#include "test_helper.h"
@ -36,7 +38,7 @@ static int pattern_fill(int my_cpu, testword_t offset)
display_test_pattern_name("own address");
}
// Write each address with it's own address.
// Write each address with its own address.
for (int i = 0; i < vm_map_size; i++) {
testword_t *start = vm_map[i].start;
testword_t *end = vm_map[i].end;
@ -58,6 +60,41 @@ static int pattern_fill(int my_cpu, testword_t offset)
continue;
}
test_addr[my_cpu] = (uintptr_t)p;
if (!offset) {
if (enable_nontemporal && nontemporal_writes_supported()) {
do {
write_word_nt(p, (testword_t)p);
} while (p++ < pe); // test before increment in case pointer overflows
#if defined(__i386__) || defined(__x86_64__)
__asm__ __volatile__ ("mfence");
#elif defined(__loongarch_lp64)
// TODO LoongArch barrier
#endif
}
else {
do {
write_word(p, (testword_t)p);
} while (p++ < pe); // test before increment in case pointer overflows
}
}
else {
if (enable_nontemporal && nontemporal_writes_supported()) {
do {
write_word_nt(p, (testword_t)p + offset);
} while (p++ < pe); // test before increment in case pointer overflows
#if defined(__i386__) || defined(__x86_64__)
__asm__ __volatile__ ("mfence");
#elif defined(__loongarch_lp64)
// TODO LoongArch barrier
#endif
}
else {
do {
write_word(p, (testword_t)p + offset);
} while (p++ < pe); // test before increment in case pointer overflows
}
}
do {
write_word(p, (testword_t)p + offset);
} while (p++ < pe); // test before increment in case pointer overflows
@ -97,6 +134,16 @@ static int pattern_check(int my_cpu, testword_t offset)
continue;
}
test_addr[my_cpu] = (uintptr_t)p;
if (!offset) {
do {
testword_t expect = (testword_t)p;
testword_t actual = read_word(p);
if (unlikely(actual != expect)) {
data_error(p, expect, actual, true);
}
} while (p++ < pe); // test before increment in case pointer overflows
}
else {
do {
testword_t expect = (testword_t)p + offset;
testword_t actual = read_word(p);
@ -104,6 +151,7 @@ static int pattern_check(int my_cpu, testword_t offset)
data_error(p, expect, actual, true);
}
} while (p++ < pe); // test before increment in case pointer overflows
}
do_tick(my_cpu);
BAILOUT;
} while (!at_end && ++pe); // advance pe to next start point

View File

@ -23,9 +23,11 @@
/* Map the generic word-sized accessors onto the native word width:
 * read_word/write_word/write_word_nt become the 64- or 32-bit variants
 * (write_word_nt is the nontemporal-hint store added for the own-address
 * test). */
#if (ARCH_BITS == 64)
#define read_word read64
#define write_word write64
#define write_word_nt write64nt
#else
#define read_word read32
#define write_word write32
#define write_word_nt write32nt
#endif
/**