From 52a589fd291fd38617e8eb03a5ea2401fd9dbd62 Mon Sep 17 00:00:00 2001 From: Lionel Debroux Date: Wed, 13 Jul 2022 10:27:44 +0200 Subject: [PATCH] Add experimental mode with nontemporal stores (movnt[iq]) in own addr test, the only one where it helps with performance across most processors I have access to, both single and multi-socket. Per #79, it saves several dozen minutes on my 4S Opteron 62xx / 63xx servers equipped with 256 GB of RAM. --- app/config.c | 3 +++ app/config.h | 1 + system/cpuid.h | 14 ++++++++++ system/memrw.h | 22 ++++++++++++++++ tests/own_addr.c | 64 +++++++++++++++++++++++++++++++++++++++------ tests/test_helper.h | 10 ++++--- 6 files changed, 102 insertions(+), 12 deletions(-) diff --git a/app/config.c b/app/config.c index d9a0823..ed92967 100644 --- a/app/config.c +++ b/app/config.c @@ -99,6 +99,7 @@ bool enable_sm = true; bool enable_bench = true; bool enable_mch_read = true; bool enable_numa = false; +bool enable_nontemporal = false; bool enable_ecc_polling = false; @@ -244,6 +245,8 @@ static void parse_option(const char *option, const char *params) usb_init_options |= USB_IGNORE_EHCI; } else if (strncmp(option, "nomch", 6) == 0) { enable_mch_read = false; + } else if (strncmp(option, "nontemporal", 12) == 0) { + enable_nontemporal = true; } else if (strncmp(option, "nopause", 8) == 0) { pause_at_start = false; } else if (strncmp(option, "nosm", 5) == 0) { diff --git a/app/config.h b/app/config.h index b13a7c8..9192cad 100644 --- a/app/config.h +++ b/app/config.h @@ -63,6 +63,7 @@ extern bool enable_bench; extern bool enable_mch_read; extern bool enable_ecc_polling; extern bool enable_numa; +extern bool enable_nontemporal; extern bool pause_at_start; diff --git a/system/cpuid.h b/system/cpuid.h index 1e0b2cb..6e81645 100644 --- a/system/cpuid.h +++ b/system/cpuid.h @@ -226,6 +226,20 @@ static inline void cpuid(uint32_t op, uint32_t count, uint32_t *eax, uint32_t *e "2" (*ecx) ); } + +/** + * Returns whether the processor supports
nontemporal writes + */ +#define nontemporal_writes_supported() (cpuid_info.flags.sse2) + +#elif defined(__loongarch_lp64) + +/** + * Returns whether the processor supports nontemporal writes + */ +// TODO +#define nontemporal_writes_supported() (0) + #endif #endif // CPUID_H diff --git a/system/memrw.h b/system/memrw.h index cd8b9c9..23eb97f 100644 --- a/system/memrw.h +++ b/system/memrw.h @@ -23,6 +23,7 @@ #define __MEMRW_SUFFIX_64BIT "q" #define __MEMRW_READ_INSTRUCTIONS(bitwidth) "mov" __MEMRW_SUFFIX_##bitwidth##BIT " %1, %0" #define __MEMRW_WRITE_INSTRUCTIONS(bitwidth) "mov" __MEMRW_SUFFIX_##bitwidth##BIT " %1, %0" +#define __MEMRW_WRITENT_INSTRUCTIONS(bitwidth) "movnti" __MEMRW_SUFFIX_##bitwidth##BIT " %1, %0" #define __MEMRW_FLUSH_INSTRUCTIONS(bitwidth) "mov" __MEMRW_SUFFIX_##bitwidth##BIT " %1, %0; mov" __MEMRW_SUFFIX_##bitwidth##BIT " %0, %1" #elif defined(__loongarch_lp64) @@ -62,6 +63,18 @@ static inline void write##bitwidth(const volatile uint##bitwidth##_t *ptr, uint# ); \ } +#define __MEMRW_WRITENT_FUNC(bitwidth) \ +static inline void write##bitwidth##nt(const volatile uint##bitwidth##_t *ptr, uint##bitwidth##_t val) \ +{ \ + __asm__ __volatile__( \ + __MEMRW_WRITENT_INSTRUCTIONS(bitwidth) \ + : \ + : "m" (*ptr), \ + "r" (val) \ + : "memory" \ + ); \ +} + #define __MEMRW_FLUSH_FUNC(bitwidth) \ static inline void flush##bitwidth(const volatile uint##bitwidth##_t *ptr, uint##bitwidth##_t val) \ { \ @@ -108,6 +121,15 @@ __MEMRW_WRITE_FUNC(32) */ __MEMRW_WRITE_FUNC(64) +/** + * Writes val to the 32-bit memory location pointed to by ptr, using non-temporal hint. + */ +__MEMRW_WRITENT_FUNC(32) +/** + * Writes val to the 64-bit memory location pointed to by ptr, using non-temporal hint. + */ +__MEMRW_WRITENT_FUNC(64) + /** * Writes val to the 8-bit memory location pointed to by ptr. Only returns when the write is complete. 
*/ diff --git a/tests/own_addr.c b/tests/own_addr.c index 612c1cd..8d99373 100644 --- a/tests/own_addr.c +++ b/tests/own_addr.c @@ -20,6 +20,8 @@ #include "display.h" #include "error.h" #include "test.h" +#include "config.h" +#include "cpuid.h" #include "test_funcs.h" #include "test_helper.h" @@ -36,7 +38,7 @@ static int pattern_fill(int my_cpu, testword_t offset) display_test_pattern_name("own address"); } - // Write each address with it's own address. + // Write each address with its own address. for (int i = 0; i < vm_map_size; i++) { testword_t *start = vm_map[i].start; testword_t *end = vm_map[i].end; @@ -58,6 +60,37 @@ static int pattern_fill(int my_cpu, testword_t offset) continue; } test_addr[my_cpu] = (uintptr_t)p; - do { - write_word(p, (testword_t)p + offset); - } while (p++ < pe); // test before increment in case pointer overflows + if (!offset) { + if (enable_nontemporal && nontemporal_writes_supported()) { + do { + write_word_nt(p, (testword_t)p); + } while (p++ < pe); // test before increment in case pointer overflows +#if defined(__i386__) || defined(__x86_64__) + __asm__ __volatile__ ("mfence"); +#elif defined(__loongarch_lp64) + // TODO LoongArch barrier +#endif + } + else { + do { + write_word(p, (testword_t)p); + } while (p++ < pe); // test before increment in case pointer overflows + } + } + else { + if (enable_nontemporal && nontemporal_writes_supported()) { + do { + write_word_nt(p, (testword_t)p + offset); + } while (p++ < pe); // test before increment in case pointer overflows +#if defined(__i386__) || defined(__x86_64__) + __asm__ __volatile__ ("mfence"); +#elif defined(__loongarch_lp64) + // TODO LoongArch barrier +#endif + } + else { + do { + write_word(p, (testword_t)p + offset); + } while (p++ < pe); // test before increment in case pointer overflows + } + } @@ -97,13 +130,24 @@ static int pattern_check(int my_cpu, testword_t offset) continue; } test_addr[my_cpu] = (uintptr_t)p; - do { - testword_t expect = (testword_t)p + offset; -
testword_t actual = read_word(p); - if (unlikely(actual != expect)) { - data_error(p, expect, actual, true); - } - } while (p++ < pe); // test before increment in case pointer overflows + if (!offset) { + do { + testword_t expect = (testword_t)p; + testword_t actual = read_word(p); + if (unlikely(actual != expect)) { + data_error(p, expect, actual, true); + } + } while (p++ < pe); // test before increment in case pointer overflows + } + else { + do { + testword_t expect = (testword_t)p + offset; + testword_t actual = read_word(p); + if (unlikely(actual != expect)) { + data_error(p, expect, actual, true); + } + } while (p++ < pe); // test before increment in case pointer overflows + } do_tick(my_cpu); BAILOUT; } while (!at_end && ++pe); // advance pe to next start point diff --git a/tests/test_helper.h b/tests/test_helper.h index f3ecbf3..90f26ec 100644 --- a/tests/test_helper.h +++ b/tests/test_helper.h @@ -21,11 +21,13 @@ */ #include "memrw.h" #if (ARCH_BITS == 64) -#define read_word read64 -#define write_word write64 +#define read_word read64 +#define write_word write64 +#define write_word_nt write64nt #else -#define read_word read32 -#define write_word write32 +#define read_word read32 +#define write_word write32 +#define write_word_nt write32nt #endif /**