mirror of
https://github.com/memtest86plus/memtest86plus.git
synced 2025-02-25 18:55:23 -06:00
WIP BROKEN Add SIMD tests for x86 & x86-64: MMX, SSE, SSE2, AVX ( #98 ).
This commit is contained in:
parent
4821c1bdba
commit
e16118505f
@ -172,6 +172,12 @@ typedef enum {
|
||||
prints(5, 39, str); \
|
||||
}
|
||||
|
||||
#define display_test_pattern_names(str, step) \
|
||||
{ \
|
||||
clear_screen_region(5, 39, 5, SCREEN_WIDTH - 1); \
|
||||
printf(5, 39, "%s - %i", str, step); \
|
||||
}
|
||||
|
||||
#define display_test_pattern_value(pattern) \
|
||||
{ \
|
||||
clear_screen_region(5, 39, 5, SCREEN_WIDTH - 1); \
|
||||
|
@ -365,6 +365,15 @@ void data_error(testword_t *addr, testword_t good, testword_t bad, bool use_for_
|
||||
common_err(DATA_ERROR, (uintptr_t)addr, good, bad, use_for_badram);
|
||||
}
|
||||
|
||||
/**
 * Adds one or more data errors to the error reports, version for data types
 * wider than the native test word (e.g. one 128/256-bit SIMD chunk).
 *
 * addr points to the first testword-sized lane of the wide value; good and
 * bad point to parallel arrays of 'width' expected/actual lane values.
 * One error is reported per mismatching lane.
 */
void data_error_wide(testword_t *addr, testword_t * good, testword_t * bad, unsigned int width, bool use_for_badram)
{
    // Bug fix: the original advanced only 'addr', so every lane was compared
    // against (and reported with) the first lane's good/bad values. All three
    // pointers must walk in step.
    for (unsigned int i = 0; i < width; i++, addr++, good++, bad++) {
        if (*good != *bad) {
            common_err(DATA_ERROR, (uintptr_t)addr, *good, *bad, use_for_badram);
        }
    }
}
|
||||
|
||||
void ecc_error()
|
||||
{
|
||||
common_err(CECC_ERROR, ecc_status.addr, 0, 0, false);
|
||||
|
@ -40,6 +40,11 @@ void addr_error(testword_t *addr1, testword_t *addr2, testword_t good, testword_
|
||||
*/
|
||||
void data_error(testword_t *addr, testword_t good, testword_t bad, bool use_for_badram);
|
||||
|
||||
/**
|
||||
* Adds one or more data errors to the error reports, version for data types wider than 64 bits.
|
||||
*/
|
||||
void data_error_wide(testword_t *addr, testword_t * good, testword_t * bad, unsigned int width, bool use_for_badram);
|
||||
|
||||
/**
|
||||
* Adds an ECC error to the error reports.
|
||||
* ECC Error details are stored in ecc_status
|
||||
|
@ -52,11 +52,9 @@
|
||||
#define OPCODE_WRMSR 0x300F
|
||||
|
||||
#ifdef __x86_64__
|
||||
#define REG_PREFIX "r"
|
||||
#define REG_DIGITS "16"
|
||||
#define ADR_DIGITS "12"
|
||||
#else
|
||||
#define REG_PREFIX "e"
|
||||
#define REG_DIGITS "8"
|
||||
#define ADR_DIGITS "8"
|
||||
#endif
|
||||
|
34
app/main.c
34
app/main.c
@ -607,6 +607,40 @@ void main(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
// Enable SSE and AVX, if available.
|
||||
if (cpuid_info.flags.sse) {
|
||||
uintptr_t temp;
|
||||
__asm__ __volatile__(
|
||||
"mov %%cr0, %0\n"
|
||||
"andb $0xfb, %b0\n" // clear coprocessor emulation bit
|
||||
"orb $0x02, %b0\n" // set coprocessor monitoring bit
|
||||
"mov %0, %%cr0\n"
|
||||
"mov %%cr4, %0\n"
|
||||
"orb $0x06, %h0" // set OSFXSR and OSXMMEXCPT
|
||||
: "=a" (temp)
|
||||
);
|
||||
if (cpuid_info.flags.xsave && cpuid_info.flags.avx) {
|
||||
__asm__ __volatile__(
|
||||
"bts $18,%k0\n" // set OSXSAVE bit
|
||||
"mov %0, %%cr4\n"
|
||||
"xor %%" REG_PREFIX "cx, %%" REG_PREFIX "cx\n"
|
||||
"xgetbv\n" // Load XCR0 register
|
||||
"orb $0x07, %%al\n" // Set AVX, SSE, X87 bits
|
||||
"xsetbv" // Save back to XCR0
|
||||
: "=a" (temp) : "a" (temp)
|
||||
: REG_PREFIX "cx", REG_PREFIX "dx"
|
||||
);
|
||||
}
|
||||
else {
|
||||
__asm__ __volatile__(
|
||||
"mov %0, %%cr4"
|
||||
: : "a" (temp)
|
||||
);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Due to the need to relocate ourselves in the middle of tests, the following
|
||||
// code cannot be written in the natural way as a set of nested loops. So we
|
||||
// have a single loop and use global state variables to allow us to restart
|
||||
|
12
app/test.h
12
app/test.h
@ -65,6 +65,12 @@ extern spinlock_t *error_mutex;
|
||||
* The string representation of TESTWORDS_DIGITS
|
||||
*/
|
||||
#define TESTWORD_DIGITS_STR "16"
|
||||
#if defined(__x86_64__)
|
||||
/**
|
||||
* The register prefix for full-sized registers.
|
||||
*/
|
||||
#define REG_PREFIX "r"
|
||||
#endif
|
||||
#else
|
||||
/**
|
||||
* The word width (in bits) used for memory testing.
|
||||
@ -78,6 +84,12 @@ extern spinlock_t *error_mutex;
|
||||
* The string representation of TESTWORDS_DIGITS
|
||||
*/
|
||||
#define TESTWORD_DIGITS_STR "8"
|
||||
#if defined(__i386__)
|
||||
/**
|
||||
* The register prefix for full-sized registers.
|
||||
*/
|
||||
#define REG_PREFIX "e"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
@ -258,18 +258,6 @@ flush: movw $KERNEL_DS, %ax
|
||||
|
||||
finit
|
||||
|
||||
#if 0
|
||||
# Enable SSE.
|
||||
|
||||
movq %cr0, %rax
|
||||
andb $0xfb, %al # clear coprocessor emulation bit
|
||||
orb $0x02, %al # set coprocessor monitoring bit
|
||||
mov %rax, %cr0
|
||||
movq %cr4, %rax
|
||||
orw $0x0600, %ax # set OSFXSR and OSXMMEXCPT
|
||||
movq %rax, %cr4
|
||||
#endif
|
||||
|
||||
# Call the dynamic linker to fix up the addresses in the GOT.
|
||||
|
||||
call reloc
|
||||
|
@ -10,8 +10,8 @@ else
|
||||
GIT_AVAILABLE = true
|
||||
endif
|
||||
|
||||
CFLAGS = -std=gnu11 -Wall -Wextra -Wshadow -m32 -march=i586 -fpic -fno-builtin \
|
||||
-ffreestanding -fomit-frame-pointer -fno-stack-protector \
|
||||
CFLAGS = -std=gnu11 -Wall -Wextra -Wshadow -m32 -march=i586 -mno-mmx -mno-sse -mno-sse2 \
|
||||
-fpic -fno-builtin -ffreestanding -fomit-frame-pointer -fno-stack-protector \
|
||||
-fexcess-precision=standard -DARCH_BITS=32
|
||||
|
||||
ifeq ($(DEBUG), 1)
|
||||
|
@ -134,6 +134,44 @@ Init() {
|
||||
QEMU_FLAGS+=" -drive if=pflash,format=raw,readonly=on,file=OVMF32_CODE.fd"
|
||||
QEMU_FLAGS+=" -drive if=pflash,format=raw,file=OVMF32_VARS.fd"
|
||||
|
||||
if [ "x$MACHINE" = "x4S4CN270" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu n270-v1"
|
||||
QEMU_FLAGS+=" -smp 4,sockets=4,cores=1,maxcpus=4"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-3"
|
||||
|
||||
elif [ "x$MACHINE" = "x4S4CP3" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu pentium3-v1"
|
||||
QEMU_FLAGS+=" -smp 4,sockets=4,cores=1,maxcpus=4"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-3"
|
||||
|
||||
elif [ "x$MACHINE" = "x1S6CBroadwell" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Broadwell-v4"
|
||||
QEMU_FLAGS+=" -smp 6,sockets=1,cores=6,maxcpus=6"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-5"
|
||||
|
||||
elif [ "x$MACHINE" = "x1S1CAthlon" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu athlon-v1"
|
||||
QEMU_FLAGS+=" -smp 1,sockets=1,cores=1,maxcpus=1"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0"
|
||||
|
||||
fi
|
||||
|
||||
# Define offsets for loading of symbol-table
|
||||
IMAGEBASE=0x200000
|
||||
BASEOFCODE=0x1000
|
||||
|
@ -135,6 +135,152 @@ Init() {
|
||||
QEMU_FLAGS+=" -hda fat:rw:hda-contents -net none"
|
||||
QEMU_FLAGS+=" -drive if=pflash,format=raw,readonly=on,file=OVMF_CODE.fd"
|
||||
QEMU_FLAGS+=" -drive if=pflash,format=raw,file=OVMF_VARS.fd"
|
||||
# QEMU_FLAGS+=" -machine q35 -acpitable file=../ACPI/RS904A/hpet.dat:../ACPI/RS904A/srat_32c.dat:../ACPI/RS904A/apic_32c.dat:../ACPI/RS904A/slit.dat -smbios file=../SPD/dmidecode_rs904a2_dump.bin"
|
||||
# QEMU_FLAGS+=" -machine q35 -acpitable file=../ACPI/RS904A/apic_32c.dat:../ACPI/RS904A/slit.dat -smbios file=../SPD/dmidecode_rs904a2_dump.bin"
|
||||
QEMU_FLAGS+=" -machine q35" # q35,accel=kvm
|
||||
|
||||
if [ "x$MACHINE" = "x4S32CBulldozer" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Opteron_G4" # -cpu host Opteron_G5
|
||||
QEMU_FLAGS+=" -smp 32,sockets=4,cores=8,maxcpus=32"
|
||||
|
||||
MEMSIZE8=$((MEMSIZE / 8))
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m0,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m1,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m2,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m3,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m4,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m5,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m6,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m7,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-3"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=1,memdev=m1,cpus=4-7"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=2,memdev=m2,cpus=8-11"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=3,memdev=m3,cpus=12-15"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=4,memdev=m4,cpus=16-19"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=5,memdev=m5,cpus=20-23"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=6,memdev=m6,cpus=24-27"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=7,memdev=m7,cpus=28-31"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=1,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=3,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=4,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=5,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=7,val=22"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=0,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=3,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=4,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=5,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=6,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=7,val=16"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=0,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=1,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=3,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=4,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=5,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=7,val=16"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=0,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=1,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=4,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=5,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=7,val=16"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=0,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=1,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=3,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=5,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=7,val=22"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=0,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=1,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=2,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=3,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=4,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=7,val=22"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=0,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=1,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=3,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=4,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=5,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=7,val=16"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=0,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=1,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=3,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=4,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=5,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=6,val=16"
|
||||
|
||||
#[02Ch 0044 8] Locality 0 : 0A 10 10 16 10 16 10 16
|
||||
#[034h 0052 8] Locality 1 : 10 0A 10 16 16 10 16 10
|
||||
#[03Ch 0060 8] Locality 2 : 10 10 0A 10 10 16 10 10
|
||||
#[044h 0068 8] Locality 3 : 16 16 10 0A 16 10 10 10
|
||||
#[04Ch 0076 8] Locality 4 : 10 16 10 16 0A 10 10 16
|
||||
#[054h 0084 8] Locality 5 : 16 10 16 10 10 0A 10 16
|
||||
#[05Ch 0092 8] Locality 6 : 10 16 10 10 10 10 0A 10
|
||||
#[064h 0100 8] Locality 7 : 16 10 10 10 16 16 10 0A
|
||||
|
||||
elif [ "x$MACHINE" = "x2S12CWestmere" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Westmere-v2"
|
||||
QEMU_FLAGS+=" -smp 12,sockets=2,cores=6,maxcpus=12"
|
||||
|
||||
MEMSIZE2=$((MEMSIZE / 2))
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m0,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m1,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-5"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=1,memdev=m1,cpus=6-11"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=1,val=20"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=0,val=20"
|
||||
|
||||
elif [ "x$MACHINE" = "x2S2CPenryn" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Penryn-v1"
|
||||
QEMU_FLAGS+=" -smp 2,sockets=2,cores=1,maxcpus=2"
|
||||
|
||||
MEMSIZE2=$((MEMSIZE / 2))
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m0,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m1,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=1,memdev=m1,cpus=1"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=1,val=21"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=0,val=21"
|
||||
|
||||
elif [ "x$MACHINE" = "x1S6CBroadwell" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Broadwell-v4"
|
||||
QEMU_FLAGS+=" -smp 6,sockets=1,cores=6,maxcpus=6"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-5"
|
||||
|
||||
elif [ "x$MACHINE" = "x1S1CPhenom" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu phenom-v1"
|
||||
QEMU_FLAGS+=" -smp 1,sockets=1,cores=1,maxcpus=1"
|
||||
|
||||
fi
|
||||
|
||||
# Define offsets for loading of symbol-table
|
||||
IMAGEBASE=0x200000
|
||||
|
@ -56,7 +56,7 @@ typedef union {
|
||||
typedef union {
|
||||
uint32_t raw[3];
|
||||
struct {
|
||||
uint32_t fpu : 1; // EDX feature flags, bit 0 */
|
||||
uint32_t fpu : 1; // EDX feature flags, bit 0
|
||||
uint32_t vme : 1;
|
||||
uint32_t de : 1;
|
||||
uint32_t pse : 1;
|
||||
@ -88,6 +88,7 @@ typedef union {
|
||||
uint32_t tm : 1;
|
||||
uint32_t bit30 : 1;
|
||||
uint32_t pbe : 1; // EDX feature flags, bit 31
|
||||
|
||||
uint32_t sse3 : 1; // ECX feature flags, bit 0
|
||||
uint32_t mulq : 1;
|
||||
uint32_t bit2 : 1;
|
||||
@ -99,7 +100,12 @@ typedef union {
|
||||
uint32_t tm2 : 1;
|
||||
uint32_t : 12; // ECX feature flags, bit 20
|
||||
uint32_t x2apic : 1;
|
||||
uint32_t : 10; // ECX feature flags, bit 31
|
||||
uint32_t : 4; // ECX feature flags, bit 22
|
||||
uint32_t xsave : 1;
|
||||
uint32_t osxsave : 1;
|
||||
uint32_t avx : 1;
|
||||
uint32_t : 3; // ECX feature flags, bit 31
|
||||
|
||||
uint32_t : 19; // EDX extended feature flags, bit 0
|
||||
uint32_t nx : 1;
|
||||
uint32_t : 9;
|
||||
|
304
system/memrw_simd.h
Normal file
304
system/memrw_simd.h
Normal file
@ -0,0 +1,304 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#ifndef MEMRW_SIMD_H
|
||||
#define MEMRW_SIMD_H
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
// -------------------- COMMON TO x86 & x86-64 --------------------
|
||||
|
||||
/**
 * Reads and returns the value stored in the 128-bit memory location pointed to by ptr.
 * ptr must be 16-byte aligned (movaps requires aligned operands).
 */
#pragma GCC target ("sse")
static inline __m128 read128_sse(const volatile __m128 *ptr)
{
    __m128 val;
    __asm__ __volatile__(
        "movaps %1, %0"         // aligned 128-bit packed-single load
        : "=x" (val)            // destination: any XMM register
        : "m" (*ptr)            // memory source; inline asm bypasses the volatile qualifier
        : "memory"              // keep ordering w.r.t. surrounding memory accesses
    );
    return val;
}
|
||||
|
||||
/**
 * Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand.
 * ptr must be 16-byte aligned (movaps requires aligned operands).
 */
#pragma GCC target ("sse")
static inline void write128_sse(const volatile __m128 *ptr, __m128 val)
{
    // NOTE(review): the written location appears only as an "m" *input*
    // operand; correctness relies on the "memory" clobber below. Expressing
    // it as an "=m" output would require dropping const from ptr -- confirm
    // before changing, since that alters the parameter type.
    __asm__ __volatile__(
        "movaps %1, %0"
        :
        : "m" (*ptr),
          "x" (val)             // "x": any XMM register
        : "memory"
    );
}
|
||||
|
||||
/**
 * Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand, using non-temporal hint.
 * ptr must be 16-byte aligned. The store bypasses the cache hierarchy;
 * the write loops issue __sync_synchronize() afterwards to order it.
 */
#pragma GCC target ("sse")
static inline void write128nt_sse(const volatile __m128 *ptr, __m128 val)
{
    __asm__ __volatile__(
        "movntps %1, %0"        // non-temporal (cache-bypassing) aligned store
        :
        : "m" (*ptr),           // NOTE(review): written location as input operand; relies on "memory" clobber
          "x" (val)
        : "memory"
    );
}
|
||||
|
||||
#pragma GCC target ("sse")
/**
 * Compares two 128-bit values lane-wise as four packed floats and returns a
 * 4-bit mask, one bit per 32-bit lane (0xF = all lanes equal).
 *
 * NOTE(review): cmpeq_ps is a floating-point compare, so a test pattern whose
 * bits encode a NaN compares unequal to itself and would be reported as a
 * false memory error. Confirm the patterns used can never encode NaNs (SSE1
 * offers no integer compare to avoid this).
 */
static inline int compare128_sse(__m128 val1, __m128 val2)
{
    return _mm_movemask_ps(_mm_cmpeq_ps(val1, val2));
}
|
||||
|
||||
/**
|
||||
* Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand.
|
||||
*/
|
||||
#pragma GCC target ("sse2")
|
||||
static inline void write128_sse2(const volatile __m128 *ptr, __m128 val)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
"movdqa %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"x" (val)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand, using non-temporal hint.
|
||||
*/
|
||||
#pragma GCC target ("sse2")
|
||||
static inline void write128nt_sse2(const volatile __m128 *ptr, __m128 val)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
"movntdq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"x" (val)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Reads and returns the value stored in the 128-bit memory location pointed to by ptr.
 * SSE2 variant: uses the integer-domain movdqa instead of movaps.
 * ptr must be 16-byte aligned.
 */
#pragma GCC target ("sse2")
static inline __m128 read128_sse2(const volatile __m128 *ptr)
{
    __m128 val;
    __asm__ __volatile__(
        "movdqa %1, %0"         // aligned 128-bit integer load
        : "=x" (val)            // destination: any XMM register
        : "m" (*ptr)            // memory source; inline asm bypasses the volatile qualifier
        : "memory"              // keep ordering w.r.t. surrounding memory accesses
    );
    return val;
}
|
||||
|
||||
#pragma GCC target ("sse2")
/**
 * Compares two 128-bit values as two packed doubles and returns a 2-bit mask,
 * one bit per 64-bit lane (0x3 = both lanes equal).
 *
 * NOTE(review): cmpeq_pd is a floating-point compare; bit patterns encoding a
 * NaN compare unequal and would be flagged as false errors. An integer compare
 * (_mm_cmpeq_epi32 + _mm_movemask_epi8) would avoid this, but it changes the
 * mask value callers test against (COMPARE_TARGET) -- confirm before switching.
 */
static inline int compare128_sse2(__m128 val1, __m128 val2)
{
    return _mm_movemask_pd(_mm_cmpeq_pd((__m128d)val1, (__m128d)val2));
}
|
||||
|
||||
/**
 * Reads and returns the value stored in the 256-bit memory location pointed to by ptr.
 * (Fixed doc: this is the 256-bit AVX variant, not 128-bit.)
 * ptr must be 32-byte aligned (vmovdqa requires aligned operands).
 */
#pragma GCC target ("avx")
static inline __m256 read256_avx(const volatile __m256 *ptr)
{
    __m256 val;
    __asm__ __volatile__(
        "vmovdqa %1, %0"        // aligned 256-bit integer-domain load
        : "=x" (val)            // destination: any YMM register
        : "m" (*ptr)            // memory source; inline asm bypasses the volatile qualifier
        : "memory"              // keep ordering w.r.t. surrounding memory accesses
    );
    return val;
}
|
||||
|
||||
/**
|
||||
* Writes val to the 256-bit memory location pointed to by ptr, using AVX register as source operand.
|
||||
*/
|
||||
#pragma GCC target ("avx")
|
||||
static inline void write256_avx(const volatile __m256 *ptr, __m256 val)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
"vmovdqa %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"x" (val)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes val to the 256-bit memory location pointed to by ptr, using AVX register as source operand, using non-temporal hint.
|
||||
*/
|
||||
#pragma GCC target ("avx")
|
||||
static inline void write256nt_avx(const volatile __m256 *ptr, __m256 val)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
"vmovntdq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"x" (val)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
/**
 * Compares two 256-bit values as four packed doubles and returns a 4-bit mask,
 * one bit per 64-bit lane (0xF = all lanes equal).
 *
 * NOTE(review): predicate 0 is an ordered equality compare, so NaN-encoding
 * bit patterns compare unequal -- same false-error caveat as the SSE variants.
 */
static inline int compare256_avx(__m256 val1, __m256 val2)
{
    return _mm256_movemask_pd(_mm256_cmp_pd((__m256d)val1, (__m256d)val2, 0));
}
|
||||
|
||||
#endif
|
||||
|
||||
// -------------------- SEPARATE FOR x86 & x86-64 --------------------
|
||||
|
||||
#if defined(__i386__)
|
||||
|
||||
#if 0
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline __m64 convert_testword_to_simd64_mmx(uint32_t val)
|
||||
{
|
||||
return _mm_set1_pi32(val);
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand.
 * The 32-bit test word is broadcast into both halves of the MMX register.
 * Callers must execute "emms" after the MMX sequence before any FPU use
 * (the write loop does this).
 */
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
static inline void write64_mmx(const volatile uint32_t *ptr, uint32_t val)
{
    __m64 val2 = _mm_set1_pi32(val);    // duplicate val into both 32-bit halves
    // NOTE(review): "m"(*ptr) describes only 4 bytes while movq stores 8, and
    // the written location is an input operand; both rely on the "memory"
    // clobber to stay correct -- confirm intended.
    __asm__ __volatile__(
        "movq %1, %0"
        :
        : "m" (*ptr),
          "y" (val2)            // "y": any MMX register
        : "memory"
    );
}
|
||||
|
||||
/**
|
||||
* Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand, using non-temporal hint.
|
||||
*/
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline void write64nt_mmx(const volatile uint32_t *ptr, uint32_t val)
|
||||
{
|
||||
__m64 val2 = _mm_set1_pi32(val);
|
||||
__asm__ __volatile__(
|
||||
"movntq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"y" (val2)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#pragma GCC target ("sse")
/**
 * Broadcasts a 32-bit test word into all four lanes of an XMM register.
 * SSE1 has no integer set intrinsic, so the word is staged through an
 * aligned float array and loaded with movaps.
 */
static inline __m128 convert_testword_to_simd128_sse(uint32_t val)
{
    __attribute__((aligned(16))) float tmp[4];
    float * tmp2 = tmp;
    // NOTE(review): storing through (uint32_t *) into a float array violates
    // strict aliasing; confirm the build uses -fno-strict-aliasing or switch
    // to memcpy/union if not.
    *(uint32_t *)tmp2++ = val;
    *(uint32_t *)tmp2++ = val;
    *(uint32_t *)tmp2++ = val;
    *(uint32_t *)tmp2++ = val;
    return _mm_load_ps(tmp);    // aligned 128-bit load of the four copies
}
|
||||
|
||||
#pragma GCC target ("sse2")
|
||||
static inline __m128 convert_testword_to_simd128_sse2(uint32_t val)
|
||||
{
|
||||
return (__m128)_mm_set1_epi32(val);
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static inline __m256 convert_testword_to_simd256_avx(uint32_t val)
|
||||
{
|
||||
return (__m256)_mm256_set1_epi32(val);
|
||||
}
|
||||
|
||||
#elif defined(__x86_64__)
|
||||
|
||||
// XXX how to make this work without producing GCC error: 'SSE register return with SSE disabled' ?
|
||||
#if 0
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline __m64 convert_testword_to_simd64_mmx(uint64_t val)
|
||||
{
|
||||
return (__m64)val;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand.
|
||||
*/
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline void write64_mmx(const volatile uint64_t *ptr, uint64_t val)
|
||||
{
|
||||
__m64 val2 = (__m64)val;
|
||||
__asm__ __volatile__(
|
||||
"movq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"y" (val2)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand, using non-temporal hint.
|
||||
*/
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline void write64nt_mmx(const volatile uint64_t *ptr, uint64_t val)
|
||||
{
|
||||
__m64 val2 = (__m64)val;
|
||||
__asm__ __volatile__(
|
||||
"movntq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"y" (val2)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#pragma GCC target ("sse")
|
||||
static inline __m128 convert_testword_to_simd128_sse(uint64_t val)
|
||||
{
|
||||
__attribute__((aligned(16))) float tmp[4];
|
||||
double * tmp2 = (double *)tmp;
|
||||
*(uint64_t *)tmp2++ = val;
|
||||
*(uint64_t *)tmp2++ = val;
|
||||
return _mm_load_ps(tmp);
|
||||
}
|
||||
|
||||
#pragma GCC target ("sse2")
|
||||
static inline __m128 convert_testword_to_simd128_sse2(uint64_t val)
|
||||
{
|
||||
return (__m128)_mm_set1_epi64x(val);
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static inline __m256 convert_testword_to_simd256_avx(uint64_t val)
|
||||
{
|
||||
return (__m256)_mm256_set1_epi64x(val);
|
||||
}
|
||||
|
||||
#endif // __i386__ or __x86_64__
|
||||
|
||||
#endif // MEMRW_SIMD_H
|
@ -28,12 +28,12 @@
|
||||
// Public Functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
int test_block_move(int my_cpu, int iterations)
|
||||
int test_block_move(int my_cpu, int iterations, int simd)
|
||||
{
|
||||
int ticks = 0;
|
||||
|
||||
if (my_cpu == master_cpu) {
|
||||
display_test_pattern_name("block move");
|
||||
display_test_pattern_names("block move", simd);
|
||||
}
|
||||
|
||||
// Initialize memory with the initial pattern.
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "display.h"
|
||||
#include "error.h"
|
||||
#include "test.h"
|
||||
#include "config.h"
|
||||
|
||||
#include "test_funcs.h"
|
||||
#include "test_helper.h"
|
||||
@ -30,19 +31,279 @@
|
||||
// Public Functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2)
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
|
||||
#include "memrw_simd.h"
|
||||
|
||||
// ==================== vvvvv MMX vvvvv ====================
|
||||
|
||||
#if 0
|
||||
// XXX how to make this work without producing GCC error: 'SSE register return with SSE disabled' ?
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * write_loops_simd64(testword_t *p, testword_t *pe, testword_t pattern1)
|
||||
{
|
||||
register __m64 mdpattern1 __asm__("%mm0") = convert_testword_to_simd64(pattern1);
|
||||
if (enable_nontemporal) {
|
||||
do {
|
||||
write64nt_simd((__m64 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 1 : 0;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
else {
|
||||
do {
|
||||
write64_simd((__m64 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 1 : 0;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
__asm__ __volatile__ ("emms");
|
||||
__sync_synchronize();
|
||||
return p;
|
||||
}
|
||||
#endif
|
||||
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
/**
 * Fills the inclusive range [p, pe] with pattern1 using 64-bit MMX stores.
 * Uses non-temporal stores when enable_nontemporal is set.
 * Returns the pointer advanced one element past the last word written.
 */
static __attribute__((noinline)) testword_t * write_loops_simd64_mmx(testword_t *p, testword_t *pe, testword_t pattern1)
{
    if (enable_nontemporal) {
        do {
            write64nt_mmx(p, pattern1);
            // Each store covers 64 bits: skip one extra word on 32-bit builds.
            p += (sizeof(*p) < 8) ? 1 : 0;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    else {
        do {
            write64_mmx(p, pattern1);
            p += (sizeof(*p) < 8) ? 1 : 0;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    __asm__ __volatile__ ("emms");  // leave MMX state so the FPU is usable again
    __sync_synchronize();           // full barrier: order the (non-temporal) stores
    return p;
}
|
||||
|
||||
// ? TODO ? read1_loops_simd64
|
||||
// ? TODO ? read2_loops_simd64
|
||||
|
||||
// ==================== ^^^^^ MMX ^^^^^ ====================
|
||||
|
||||
|
||||
// ==================== vvvvv SSE vvvvv ====================
|
||||
|
||||
#pragma GCC target ("sse", "no-sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * write_loops_simd128_sse(testword_t *p, testword_t *pe, testword_t pattern1)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse(pattern1);
|
||||
if (enable_nontemporal) {
|
||||
do {
|
||||
write128nt_sse((__m128 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
else {
|
||||
do {
|
||||
write128_sse((__m128 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
__sync_synchronize();
|
||||
return p;
|
||||
}
|
||||
|
||||
#define COMPARE_TARGET 0xF
|
||||
|
||||
#pragma GCC target ("sse", "no-sse2", "no-avx")
/**
 * Moving-inversions read pass (ascending): for each 128-bit chunk in [p, pe],
 * checks it still holds pattern1, then overwrites it with pattern2.
 * Lane mismatches are reported via data_error_wide.
 * Returns the pointer advanced past the last chunk processed.
 */
static __attribute__((noinline)) testword_t * read1_loops_simd128_sse(testword_t *p, testword_t *pe, testword_t pattern1, testword_t pattern2)
{
    __m128 mdpattern1 = convert_testword_to_simd128_sse(pattern1);
    __m128 mdpattern2 = convert_testword_to_simd128_sse(pattern2);
    do {
        __m128 actual = read128_sse((__m128 *)p);
        int compar_result = compare128_sse(mdpattern1, actual);
        // Write the next pattern before branching on the compare result.
        write128_sse((__m128 *)p, mdpattern2);
        if (unlikely(compar_result != COMPARE_TARGET)) {   // COMPARE_TARGET = all 4 lanes equal
            // Spill the SIMD values so the error reporter can walk them lane by lane.
            __m128 good = mdpattern1;
            __m128 bad = actual;
            data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
        }
        // 128 bits per iteration: 4 words on 32-bit builds, 2 on 64-bit.
        p += (sizeof(*p) < 8) ? 3 : 1;
    } while (p++ < pe); // test before increment in case pointer overflows
    return p;
}
|
||||
|
||||
#pragma GCC target ("sse", "no-sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * read2_loops_simd128_sse(testword_t *p, testword_t *ps, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse(pattern1);
|
||||
__m128 mdpattern2 = convert_testword_to_simd128_sse(pattern2);
|
||||
do {
|
||||
__m128 actual = read128_sse((__m128 *)p);
|
||||
int compar_result = compare128_sse(mdpattern2, actual);
|
||||
write128_sse((__m128 *)p, mdpattern1);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m128 good = mdpattern2;
|
||||
__m128 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p -= (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#undef COMPARE_TARGET
|
||||
|
||||
// ==================== ^^^^^ SSE ^^^^^ ====================
|
||||
|
||||
|
||||
// ==================== vvvvv SSE2 vvvvv ====================
|
||||
|
||||
#pragma GCC target ("sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * write_loops_simd128_sse2(testword_t *p, testword_t *pe, testword_t pattern1)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse2(pattern1);
|
||||
if (enable_nontemporal) {
|
||||
do {
|
||||
write128nt_sse2((__m128 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
else {
|
||||
do {
|
||||
write128_sse2((__m128 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
__sync_synchronize();
|
||||
return p;
|
||||
}
|
||||
|
||||
#define COMPARE_TARGET 0x3
|
||||
|
||||
#pragma GCC target ("sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * read1_loops_simd128_sse2(testword_t *p, testword_t *pe, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse2(pattern1);
|
||||
__m128 mdpattern2 = convert_testword_to_simd128_sse2(pattern2);
|
||||
do {
|
||||
__m128 actual = read128_sse2((__m128 *)p);
|
||||
int compar_result = compare128_sse2(mdpattern1, actual);
|
||||
write128_sse2((__m128 *)p, mdpattern2);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m128 good = mdpattern1;
|
||||
__m128 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#pragma GCC target ("sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * read2_loops_simd128_sse2(testword_t *p, testword_t *ps, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse2(pattern1);
|
||||
__m128 mdpattern2 = convert_testword_to_simd128_sse2(pattern2);
|
||||
do {
|
||||
__m128 actual = read128_sse2((__m128 *)p);
|
||||
int compar_result = compare128_sse2(mdpattern2, actual);
|
||||
write128_sse2((__m128 *)p, mdpattern1);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m128 good = mdpattern2;
|
||||
__m128 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p -= (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#undef COMPARE_TARGET
|
||||
|
||||
// ==================== ^^^^^ SSE2 ^^^^^ ====================
|
||||
|
||||
|
||||
// ==================== vvvvv AVX vvvvv ====================
|
||||
|
||||
#define COMPARE_TARGET 0xF
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static __attribute__((noinline)) testword_t * write_loops_simd256_avx(testword_t *p, testword_t *pe, testword_t pattern1)
|
||||
{
|
||||
__m256 mdpattern1 = convert_testword_to_simd256_avx(pattern1);
|
||||
if (enable_nontemporal) {
|
||||
do {
|
||||
write256nt_avx((__m256 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 7 : 3;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
else {
|
||||
do {
|
||||
write256_avx((__m256 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 7 : 3;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
__sync_synchronize();
|
||||
return p;
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static __attribute__((noinline)) testword_t * read1_loops_simd256_avx(testword_t *p, testword_t *pe, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m256 mdpattern1 = convert_testword_to_simd256_avx(pattern1);
|
||||
__m256 mdpattern2 = convert_testword_to_simd256_avx(pattern2);
|
||||
do {
|
||||
__m256 actual = read256_avx((__m256 *)p);
|
||||
int compar_result = compare256_avx(mdpattern1, actual);
|
||||
write256_avx((__m256 *)p, mdpattern2);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m256 good = mdpattern1;
|
||||
__m256 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 256 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p += (sizeof(*p) < 8) ? 7 : 3;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static __attribute__((noinline)) testword_t * read2_loops_simd256_avx(testword_t *p, testword_t *ps, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m256 mdpattern1 = convert_testword_to_simd256_avx(pattern1);
|
||||
__m256 mdpattern2 = convert_testword_to_simd256_avx(pattern2);
|
||||
do {
|
||||
__m256 actual = read256_avx((__m256 *)p);
|
||||
int compar_result = compare256_avx(mdpattern2, actual);
|
||||
write256_avx((__m256 *)p, mdpattern1);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m256 good = mdpattern2;
|
||||
__m256 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 256 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p -= (sizeof(*p) < 8) ? 7 : 3;
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#undef COMPARE_TARGET
|
||||
|
||||
// ==================== ^^^^^ AVX ^^^^^ ====================
|
||||
|
||||
#endif
|
||||
|
||||
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2, int simd)
|
||||
{
|
||||
int ticks = 0;
|
||||
|
||||
size_t chunk_align = simd == 1 ? 64/8 : ((simd == 2 || simd == 3) ? 128/8 : (simd == 4 ? 256/8 : sizeof(testword_t)));
|
||||
if (my_cpu == master_cpu) {
|
||||
display_test_pattern_value(pattern1);
|
||||
display_test_pattern_values(pattern1, simd);
|
||||
}
|
||||
|
||||
// Initialize memory with the initial pattern.
|
||||
for (int i = 0; i < vm_map_size; i++) {
|
||||
testword_t *start, *end;
|
||||
calculate_chunk(&start, &end, my_cpu, i, sizeof(testword_t));
|
||||
if (end < start) SKIP_RANGE(1) // we need at least one word for this test
|
||||
calculate_chunk(&start, &end, my_cpu, i, chunk_align);
|
||||
__asm__ volatile("nop");
|
||||
if (end < start) SKIP_RANGE(1) // we need enough words for this test
|
||||
|
||||
testword_t *p = start;
|
||||
testword_t *pe = start;
|
||||
@ -60,46 +321,64 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
if (my_cpu < 0) {
|
||||
continue;
|
||||
}
|
||||
//do_trace(0, "W p %016x -> pe %016x", (uintptr_t)p, (uintptr_t) pe);
|
||||
test_addr[my_cpu] = (uintptr_t)p;
|
||||
if (!simd || ((end - start) < (int)((32/8 << simd) / sizeof(testword_t)) - 1)) {
|
||||
#if HAND_OPTIMISED
|
||||
#if defined(__x86_64__)
|
||||
uint64_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"rep \n\t"
|
||||
"stosq \n\t"
|
||||
:
|
||||
: "c" (length), "D" (p), "a" (pattern1)
|
||||
:
|
||||
);
|
||||
p = pe;
|
||||
uint64_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"rep \n\t"
|
||||
"stosq \n\t"
|
||||
:
|
||||
: "c" (length), "D" (p), "a" (pattern1)
|
||||
:
|
||||
);
|
||||
p = pe;
|
||||
#elif defined(__i386__)
|
||||
uint32_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"rep \n\t"
|
||||
"stosl \n\t"
|
||||
:
|
||||
: "c" (length), "D" (p), "a" (pattern1)
|
||||
:
|
||||
);
|
||||
p = pe;
|
||||
uint32_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"rep \n\t"
|
||||
"stosl \n\t"
|
||||
:
|
||||
: "c" (length), "D" (p), "a" (pattern1)
|
||||
:
|
||||
);
|
||||
p = pe;
|
||||
#elif defined(__loongarch_lp64)
|
||||
uint64_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"loop: \n\t"
|
||||
"st.d %2, %1, 0x0 \n\t"
|
||||
"addi.d %1, %1, 0x8 \n\t"
|
||||
"addi.d %0, %0, -0x1 \n\t"
|
||||
"bnez %0, loop \n\t"
|
||||
:
|
||||
: "r" (length), "r" (p), "r" (pattern1)
|
||||
: "memory"
|
||||
);
|
||||
p = pe;
|
||||
uint64_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"loop: \n\t"
|
||||
"st.d %2, %1, 0x0 \n\t"
|
||||
"addi.d %1, %1, 0x8 \n\t"
|
||||
"addi.d %0, %0, -0x1 \n\t"
|
||||
"bnez %0, loop \n\t"
|
||||
:
|
||||
: "r" (length), "r" (p), "r" (pattern1)
|
||||
: "memory"
|
||||
);
|
||||
p = pe;
|
||||
#endif
|
||||
#else
|
||||
do {
|
||||
write_word(p, pattern1);
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
do {
|
||||
write_word(p, pattern1);
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
#endif
|
||||
}
|
||||
// SIMD code paths
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
else if (simd == 1) {
|
||||
p = write_loops_simd64_mmx(p, pe, pattern1);
|
||||
}
|
||||
else if (simd == 2) {
|
||||
p = write_loops_simd128_sse(p, pe, pattern1);
|
||||
}
|
||||
else if (simd == 3) {
|
||||
p = write_loops_simd128_sse2(p, pe, pattern1);
|
||||
}
|
||||
else if (simd == 4) {
|
||||
p = write_loops_simd256_avx(p, pe, pattern1);
|
||||
}
|
||||
#endif
|
||||
do_tick(my_cpu);
|
||||
BAILOUT;
|
||||
@ -113,8 +392,8 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
|
||||
for (int j = 0; j < vm_map_size; j++) {
|
||||
testword_t *start, *end;
|
||||
calculate_chunk(&start, &end, my_cpu, j, sizeof(testword_t));
|
||||
if (end < start) SKIP_RANGE(1) // we need at least one word for this test
|
||||
calculate_chunk(&start, &end, my_cpu, j, chunk_align);
|
||||
if (end < start) SKIP_RANGE(1) // we need enough words for this test
|
||||
|
||||
testword_t *p = start;
|
||||
testword_t *pe = start;
|
||||
@ -132,14 +411,29 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
if (my_cpu < 0) {
|
||||
continue;
|
||||
}
|
||||
//do_trace(0, "R1 p %016x -> pe %016x", (uintptr_t)p, (uintptr_t)pe);
|
||||
test_addr[my_cpu] = (uintptr_t)p;
|
||||
do {
|
||||
testword_t actual = read_word(p);
|
||||
if (unlikely(actual != pattern1)) {
|
||||
data_error(p, pattern1, actual, true);
|
||||
}
|
||||
write_word(p, pattern2);
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
if (simd == 0 || simd == 1 || ((end - start) < (int)((32/8 << simd) / sizeof(testword_t)) - 1)) {
|
||||
do {
|
||||
testword_t actual = read_word(p);
|
||||
if (unlikely(actual != pattern1)) {
|
||||
data_error(p, pattern1, actual, true);
|
||||
}
|
||||
write_word(p, pattern2);
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
// SIMD code paths
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
else if (simd == 2) {
|
||||
p = read1_loops_simd128_sse(p, pe, pattern1, pattern2);
|
||||
}
|
||||
else if (simd == 3) {
|
||||
p = read1_loops_simd128_sse2(p, pe, pattern1, pattern2);
|
||||
}
|
||||
else if (simd == 4) {
|
||||
p = read1_loops_simd256_avx(p, pe, pattern1, pattern2);
|
||||
}
|
||||
#endif
|
||||
do_tick(my_cpu);
|
||||
BAILOUT;
|
||||
} while (!at_end && ++pe); // advance pe to next start point
|
||||
@ -149,8 +443,8 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
|
||||
for (int j = vm_map_size - 1; j >= 0; j--) {
|
||||
testword_t *start, *end;
|
||||
calculate_chunk(&start, &end, my_cpu, j, sizeof(testword_t));
|
||||
if (end < start) SKIP_RANGE(1) // we need at least one word for this test
|
||||
calculate_chunk(&start, &end, my_cpu, j, chunk_align);
|
||||
if (end < start) SKIP_RANGE(1) // we need enough words for this test
|
||||
|
||||
testword_t *p = end;
|
||||
testword_t *ps = end;
|
||||
@ -168,14 +462,29 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
if (my_cpu < 0) {
|
||||
continue;
|
||||
}
|
||||
//do_trace(0, "R2 ps %016x -> p %016x", (uintptr_t)ps, (uintptr_t)p);
|
||||
test_addr[my_cpu] = (uintptr_t)p;
|
||||
do {
|
||||
testword_t actual = read_word(p);
|
||||
if (unlikely(actual != pattern2)) {
|
||||
data_error(p, pattern2, actual, true);
|
||||
}
|
||||
write_word(p, pattern1);
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
if (simd == 0 || simd == 1 || ((end - start) < (int)((32/8 << simd) / sizeof(testword_t)) - 1)) {
|
||||
do {
|
||||
testword_t actual = read_word(p);
|
||||
if (unlikely(actual != pattern2)) {
|
||||
data_error(p, pattern2, actual, true);
|
||||
}
|
||||
write_word(p, pattern1);
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
}
|
||||
// SIMD code paths
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
else if (simd == 2) {
|
||||
p = read2_loops_simd128_sse((testword_t *)((uintptr_t)p & ~(uintptr_t)0xF), ps, pattern1, pattern2);
|
||||
}
|
||||
else if (simd == 3) {
|
||||
p = read2_loops_simd128_sse2((testword_t *)((uintptr_t)p & ~(uintptr_t)0xF), ps, pattern1, pattern2);
|
||||
}
|
||||
else if (simd == 4) {
|
||||
p = read2_loops_simd256_avx((testword_t *)((uintptr_t)p & ~(uintptr_t)0x1F), ps, pattern1, pattern2);
|
||||
}
|
||||
#endif
|
||||
do_tick(my_cpu);
|
||||
BAILOUT;
|
||||
} while (!at_start && --ps); // advance ps to next start point
|
||||
|
@ -21,7 +21,7 @@ int test_own_addr1(int my_cpu);
|
||||
|
||||
int test_own_addr2(int my_cpu, int stage);
|
||||
|
||||
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2);
|
||||
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2, int simd);
|
||||
|
||||
int test_mov_inv_walk1(int my_cpu, int iterations, int offset, bool inverse);
|
||||
|
||||
@ -29,7 +29,7 @@ int test_mov_inv_random(int my_cpu);
|
||||
|
||||
int test_modulo_n(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2, int n, int offset);
|
||||
|
||||
int test_block_move(int my_cpu, int iterations);
|
||||
int test_block_move(int my_cpu, int iterations, int simd);
|
||||
|
||||
int test_bit_fade(int my_cpu, int stage, int sleep_secs);
|
||||
|
||||
|
@ -112,6 +112,21 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
testword_t prsg_state;
|
||||
|
||||
int ticks = 0;
|
||||
int simd = 0;
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (cpuid_info.flags.mmx) {
|
||||
simd++;
|
||||
if (cpuid_info.flags.sse) {
|
||||
simd++;
|
||||
if (cpuid_info.flags.sse2) {
|
||||
simd++;
|
||||
if (cpuid_info.flags.avx) {
|
||||
simd++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
switch (test) {
|
||||
// Address test, walking ones.
|
||||
@ -140,11 +155,11 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
testword_t pattern2 = ~pattern1;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2);
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2, 0);
|
||||
BAILOUT;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1);
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1, 0);
|
||||
BAILOUT;
|
||||
} break;
|
||||
|
||||
@ -159,11 +174,11 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
testword_t pattern2 = ~pattern1;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2);
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2, 0);
|
||||
BAILOUT;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1);
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1, 0);
|
||||
BAILOUT;
|
||||
|
||||
pattern1 >>= 1;
|
||||
@ -171,6 +186,7 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
} break;
|
||||
|
||||
// Moving inversions, fixed random pattern.
|
||||
// SIMD variants after rep stos[lq] variant.
|
||||
case 5:
|
||||
if (cpuid_info.flags.rdtsc) {
|
||||
prsg_state = get_tsc();
|
||||
@ -179,15 +195,19 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
}
|
||||
prsg_state *= 0x12345678;
|
||||
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
prsg_state = prsg(prsg_state);
|
||||
{
|
||||
for (int j = 0; j <= simd; j++) {
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
prsg_state = prsg(prsg_state);
|
||||
|
||||
testword_t pattern1 = prsg_state;
|
||||
testword_t pattern2 = ~pattern1;
|
||||
testword_t pattern1 = prsg_state;
|
||||
testword_t pattern2 = ~pattern1;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, 2, pattern1, pattern2);
|
||||
BAILOUT;
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, 2, pattern1, pattern2, j);
|
||||
BAILOUT;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@ -206,7 +226,11 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
|
||||
// Block move.
|
||||
case 7:
|
||||
ticks += test_block_move(my_cpu, iterations);
|
||||
{
|
||||
for (int j = 0; j <= simd; j++) {
|
||||
ticks += test_block_move(my_cpu, iterations, simd);
|
||||
}
|
||||
}
|
||||
BAILOUT;
|
||||
break;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user