From a373e971752d4739656670a74f381bb49f55554b Mon Sep 17 00:00:00 2001 From: Lionel Debroux Date: Wed, 13 Jul 2022 10:27:44 +0200 Subject: [PATCH] Add experimental mode with nontemporal stores (movnti) in own addr test, the only one where it seems to uniformly help with performance across a number of processors I have access to, both single and multi-socket. Per #79, it saves several dozen minutes on my 4S Opteron 62xx / 63xx servers equipped with 256 GB of RAM. --- app/config.c | 3 +++ app/config.h | 1 + system/memrw32.h | 14 +++++++++++ system/memrw64.h | 14 +++++++++++ tests/own_addr.c | 58 ++++++++++++++++++++++++++++++++++++--------- tests/test_helper.h | 10 ++++---- 6 files changed, 85 insertions(+), 15 deletions(-) diff --git a/app/config.c b/app/config.c index 61a216c..048e6d6 100644 --- a/app/config.c +++ b/app/config.c @@ -97,6 +97,7 @@ bool enable_trace = false; bool enable_sm = true; bool enable_bench = true; +bool enable_nontemporal = false; bool pause_at_start = true; @@ -209,6 +210,8 @@ static void parse_option(const char *option, const char *params) enable_big_status = false; } else if (strncmp(option, "noehci", 7) == 0) { usb_init_options |= USB_IGNORE_EHCI; + } else if (strncmp(option, "nontemporal", 12) == 0) { + enable_nontemporal = true; } else if (strncmp(option, "nopause", 8) == 0) { pause_at_start = false; } else if (strncmp(option, "nosm", 5) == 0) { diff --git a/app/config.h b/app/config.h index 3d6b82a..33b1ad9 100644 --- a/app/config.h +++ b/app/config.h @@ -58,6 +58,7 @@ extern bool enable_trace; extern bool enable_sm; extern bool enable_tty; extern bool enable_bench; +extern bool enable_nontemporal; extern bool pause_at_start; diff --git a/system/memrw32.h b/system/memrw32.h index e9316ec..53a5ac5 100644 --- a/system/memrw32.h +++ b/system/memrw32.h @@ -44,6 +44,20 @@ static inline void write32(const volatile uint32_t *ptr, uint32_t val) ); } +/** + * Writes val to the 32-bit memory location pointed to by ptr, using non-temporal 
hint. + */ +static inline void write32_nt(const volatile uint32_t *ptr, uint32_t val) +{ + __asm__ __volatile__( + "movntil %1, %0" + : + : "m" (*ptr), + "r" (val) + : "memory" + ); +} + /** * Writes val to the 32-bit memory location pointed to by ptr. Reads it * back (and discards it) to ensure the write is complete. diff --git a/system/memrw64.h b/system/memrw64.h index f45d36b..3019aa5 100644 --- a/system/memrw64.h +++ b/system/memrw64.h @@ -44,6 +44,20 @@ static inline void write64(const volatile uint64_t *ptr, uint64_t val) ); } +/** + * Writes val to the 64-bit memory location pointed to by ptr, using non-temporal hint. + */ +static inline void write64_nt(const volatile uint64_t *ptr, uint64_t val) +{ + __asm__ __volatile__( + "movnti %1, %0" + : + : "m" (*ptr), + "r" (val) + : "memory" + ); +} + /** * Writes val to the 64-bit memory location pointed to by ptr. Reads it * back (and discards it) to ensure the write is complete. diff --git a/tests/own_addr.c b/tests/own_addr.c index 249cf74..d19cb0e 100644 --- a/tests/own_addr.c +++ b/tests/own_addr.c @@ -21,6 +21,8 @@ #include "display.h" #include "error.h" #include "test.h" +#include "config.h" +#include "cpuid.h" #include "test_funcs.h" #include "test_helper.h" @@ -37,7 +39,7 @@ static int pattern_fill(int my_cpu, testword_t offset) display_test_pattern_name("own address"); } - // Write each address with it's own address. + // Write each address with its own address. 
for (int i = 0; i < vm_map_size; i++) { testword_t *start = vm_map[i].start; testword_t *end = vm_map[i].end; @@ -59,9 +61,32 @@ static int pattern_fill(int my_cpu, testword_t offset) continue; } test_addr[my_cpu] = (uintptr_t)p; - do { - write_word(p, (testword_t)p + offset); - } while (p++ < pe); // test before increment in case pointer overflows + if (!offset) { + if (enable_nontemporal && cpuid_info.flags.sse2) { + do { + write_word_nt(p, (testword_t)p); + } while (p++ < pe); // test before increment in case pointer overflows + __asm__ __volatile__ ("mfence"); + } + else { + do { + write_word(p, (testword_t)p); + } while (p++ < pe); // test before increment in case pointer overflows + } + } + else { + if (enable_nontemporal && cpuid_info.flags.sse2) { + do { + write_word_nt(p, (testword_t)p + offset); + } while (p++ < pe); // test before increment in case pointer overflows + __asm__ __volatile__ ("mfence"); + } + else { + do { + write_word(p, (testword_t)p + offset); + } while (p++ < pe); // test before increment in case pointer overflows + } + } do_tick(my_cpu); BAILOUT; } while (!at_end && ++pe); // advance pe to next start point @@ -98,13 +123,24 @@ static int pattern_check(int my_cpu, testword_t offset) continue; } test_addr[my_cpu] = (uintptr_t)p; - do { - testword_t expect = (testword_t)p + offset; - testword_t actual = read_word(p); - if (unlikely(actual != expect)) { - data_error(p, expect, actual, true); - } - } while (p++ < pe); // test before increment in case pointer overflows + if (!offset) { + do { + testword_t expect = (testword_t)p; + testword_t actual = read_word(p); + if (unlikely(actual != expect)) { + data_error(p, expect, actual, true); + } + } while (p++ < pe); // test before increment in case pointer overflows + } + else { + do { + testword_t expect = (testword_t)p + offset; + testword_t actual = read_word(p); + if (unlikely(actual != expect)) { + data_error(p, expect, actual, true); + } + } while (p++ < pe); // test before increment in 
case pointer overflows + } do_tick(my_cpu); BAILOUT; } while (!at_end && ++pe); // advance pe to next start point diff --git a/tests/test_helper.h b/tests/test_helper.h index 25a49f1..4f41411 100644 --- a/tests/test_helper.h +++ b/tests/test_helper.h @@ -21,12 +21,14 @@ */ #ifdef __x86_64__ #include "memrw64.h" -#define read_word read64 -#define write_word write64 +#define read_word read64 +#define write_word write64 +#define write_word_nt write64_nt #else #include "memrw32.h" -#define read_word read32 -#define write_word write32 +#define read_word read32 +#define write_word write32 +#define write_word_nt write32_nt #endif /**