WIP BROKEN Add SIMD tests for x86 & x86-64: MMX, SSE, SSE2, AVX ( #98 ).

This commit is contained in:
Lionel Debroux 2023-09-28 23:43:56 +02:00
parent 4821c1bdba
commit e16118505f
16 changed files with 968 additions and 89 deletions

View File

@ -172,6 +172,12 @@ typedef enum {
prints(5, 39, str); \
}
// Displays the current test pattern name together with a step/SIMD-variant
// index on the status line (row 5, starting at column 39), clearing the old
// text first.
#define display_test_pattern_names(str, step) \
{ \
    clear_screen_region(5, 39, 5, SCREEN_WIDTH - 1); \
    printf(5, 39, "%s - %i", str, step); \
}
#define display_test_pattern_value(pattern) \
{ \
clear_screen_region(5, 39, 5, SCREEN_WIDTH - 1); \

View File

@ -365,6 +365,15 @@ void data_error(testword_t *addr, testword_t good, testword_t bad, bool use_for_
common_err(DATA_ERROR, (uintptr_t)addr, good, bad, use_for_badram);
}
/**
 * Adds one data error per mismatching word to the error reports, for SIMD
 * values wider than a testword_t. addr points to the first word of the
 * tested chunk; good and bad point to arrays of 'width' words holding the
 * expected and observed values.
 */
void data_error_wide(testword_t *addr, testword_t * good, testword_t * bad, unsigned int width, bool use_for_badram)
{
    for (unsigned int i = 0; i < width; i++, addr++) {
        // Compare the i-th lane: the original code never advanced good/bad
        // alongside addr, so every lane was checked against lane 0.
        if (good[i] != bad[i]) {
            common_err(DATA_ERROR, (uintptr_t)addr, good[i], bad[i], use_for_badram);
        }
    }
}
void ecc_error()
{
common_err(CECC_ERROR, ecc_status.addr, 0, 0, false);

View File

@ -40,6 +40,11 @@ void addr_error(testword_t *addr1, testword_t *addr2, testword_t good, testword_
*/
void data_error(testword_t *addr, testword_t good, testword_t bad, bool use_for_badram);
/**
* Adds one or more data errors to the error reports, version for data types wider than 64 bits.
*/
void data_error_wide(testword_t *addr, testword_t * good, testword_t * bad, unsigned int width, bool use_for_badram);
/**
* Adds an ECC error to the error reports.
* ECC Error details are stored in ecc_status

View File

@ -52,11 +52,9 @@
#define OPCODE_WRMSR 0x300F
#ifdef __x86_64__
#define REG_PREFIX "r"
#define REG_DIGITS "16"
#define ADR_DIGITS "12"
#else
#define REG_PREFIX "e"
#define REG_DIGITS "8"
#define ADR_DIGITS "8"
#endif

View File

@ -607,6 +607,40 @@ void main(void)
}
#endif
#if defined(__i386__) || defined(__x86_64__)
// Enable SSE and AVX, if available.
if (cpuid_info.flags.sse) {
uintptr_t temp;
__asm__ __volatile__(
"mov %%cr0, %0\n"
"andb $0xfb, %b0\n" // clear coprocessor emulation bit
"orb $0x02, %b0\n" // set coprocessor monitoring bit
"mov %0, %%cr0\n"
"mov %%cr4, %0\n"
"orb $0x06, %h0" // set OSFXSR and OSXMMEXCPT
: "=a" (temp)
);
if (cpuid_info.flags.xsave && cpuid_info.flags.avx) {
__asm__ __volatile__(
"bts $18,%k0\n" // set OSXSAVE bit
"mov %0, %%cr4\n"
"xor %%" REG_PREFIX "cx, %%" REG_PREFIX "cx\n"
"xgetbv\n" // Load XCR0 register
"orb $0x07, %%al\n" // Set AVX, SSE, X87 bits
"xsetbv" // Save back to XCR0
: "=a" (temp) : "a" (temp)
: REG_PREFIX "cx", REG_PREFIX "dx"
);
}
else {
__asm__ __volatile__(
"mov %0, %%cr4"
: : "a" (temp)
);
}
}
#endif
// Due to the need to relocate ourselves in the middle of tests, the following
// code cannot be written in the natural way as a set of nested loops. So we
// have a single loop and use global state variables to allow us to restart

View File

@ -65,6 +65,12 @@ extern spinlock_t *error_mutex;
* The string representation of TESTWORDS_DIGITS
*/
#define TESTWORD_DIGITS_STR "16"
#if defined(__x86_64__)
/**
* The register prefix for full-sized registers.
*/
#define REG_PREFIX "r"
#endif
#else
/**
* The word width (in bits) used for memory testing.
@ -78,6 +84,12 @@ extern spinlock_t *error_mutex;
* The string representation of TESTWORDS_DIGITS
*/
#define TESTWORD_DIGITS_STR "8"
#if defined(__i386__)
/**
* The register prefix for full-sized registers.
*/
#define REG_PREFIX "e"
#endif
#endif
/**

View File

@ -258,18 +258,6 @@ flush: movw $KERNEL_DS, %ax
finit
#if 0
# Enable SSE.
movq %cr0, %rax
andb $0xfb, %al # clear coprocessor emulation bit
orb $0x02, %al # set coprocessor monitoring bit
mov %rax, %cr0
movq %cr4, %rax
orw $0x0600, %ax # set OSFXSR and OSXMMEXCPT
movq %rax, %cr4
#endif
# Call the dynamic linker to fix up the addresses in the GOT.
call reloc

View File

@ -10,8 +10,8 @@ else
GIT_AVAILABLE = true
endif
CFLAGS = -std=gnu11 -Wall -Wextra -Wshadow -m32 -march=i586 -fpic -fno-builtin \
-ffreestanding -fomit-frame-pointer -fno-stack-protector \
CFLAGS = -std=gnu11 -Wall -Wextra -Wshadow -m32 -march=i586 -mno-mmx -mno-sse -mno-sse2 \
-fpic -fno-builtin -ffreestanding -fomit-frame-pointer -fno-stack-protector \
-fexcess-precision=standard -DARCH_BITS=32
ifeq ($(DEBUG), 1)

View File

@ -134,6 +134,44 @@ Init() {
QEMU_FLAGS+=" -drive if=pflash,format=raw,readonly=on,file=OVMF32_CODE.fd"
QEMU_FLAGS+=" -drive if=pflash,format=raw,file=OVMF32_VARS.fd"
if [ "x$MACHINE" = "x4S4CN270" ]; then
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu n270-v1"
QEMU_FLAGS+=" -smp 4,sockets=4,cores=1,maxcpus=4"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-3"
elif [ "x$MACHINE" = "x4S4CP3" ]; then
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu pentium3-v1"
QEMU_FLAGS+=" -smp 4,sockets=4,cores=1,maxcpus=4"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-3"
elif [ "x$MACHINE" = "x1S6CBroadwell" ]; then
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Broadwell-v4"
QEMU_FLAGS+=" -smp 6,sockets=1,cores=6,maxcpus=6"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-5"
elif [ "x$MACHINE" = "x1S1CAthlon" ]; then
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu athlon-v1"
QEMU_FLAGS+=" -smp 1,sockets=1,cores=1,maxcpus=1"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0"
fi
# Define offsets for loading of symbol-table
IMAGEBASE=0x200000
BASEOFCODE=0x1000

View File

@ -135,6 +135,152 @@ Init() {
QEMU_FLAGS+=" -hda fat:rw:hda-contents -net none"
QEMU_FLAGS+=" -drive if=pflash,format=raw,readonly=on,file=OVMF_CODE.fd"
QEMU_FLAGS+=" -drive if=pflash,format=raw,file=OVMF_VARS.fd"
# QEMU_FLAGS+=" -machine q35 -acpitable file=../ACPI/RS904A/hpet.dat:../ACPI/RS904A/srat_32c.dat:../ACPI/RS904A/apic_32c.dat:../ACPI/RS904A/slit.dat -smbios file=../SPD/dmidecode_rs904a2_dump.bin"
# QEMU_FLAGS+=" -machine q35 -acpitable file=../ACPI/RS904A/apic_32c.dat:../ACPI/RS904A/slit.dat -smbios file=../SPD/dmidecode_rs904a2_dump.bin"
QEMU_FLAGS+=" -machine q35" # q35,accel=kvm
if [ "x$MACHINE" = "x4S32CBulldozer" ]; then
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Opteron_G4" # -cpu host Opteron_G5
QEMU_FLAGS+=" -smp 32,sockets=4,cores=8,maxcpus=32"
MEMSIZE8=$((MEMSIZE / 8))
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m0,prealloc=on"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m1,prealloc=on"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m2,prealloc=on"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m3,prealloc=on"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m4,prealloc=on"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m5,prealloc=on"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m6,prealloc=on"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m7,prealloc=on"
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-3"
QEMU_FLAGS+=" -numa node,nodeid=1,memdev=m1,cpus=4-7"
QEMU_FLAGS+=" -numa node,nodeid=2,memdev=m2,cpus=8-11"
QEMU_FLAGS+=" -numa node,nodeid=3,memdev=m3,cpus=12-15"
QEMU_FLAGS+=" -numa node,nodeid=4,memdev=m4,cpus=16-19"
QEMU_FLAGS+=" -numa node,nodeid=5,memdev=m5,cpus=20-23"
QEMU_FLAGS+=" -numa node,nodeid=6,memdev=m6,cpus=24-27"
QEMU_FLAGS+=" -numa node,nodeid=7,memdev=m7,cpus=28-31"
QEMU_FLAGS+=" -numa dist,src=0,dst=1,val=16"
QEMU_FLAGS+=" -numa dist,src=0,dst=2,val=16"
QEMU_FLAGS+=" -numa dist,src=0,dst=3,val=22"
QEMU_FLAGS+=" -numa dist,src=0,dst=4,val=16"
QEMU_FLAGS+=" -numa dist,src=0,dst=5,val=22"
QEMU_FLAGS+=" -numa dist,src=0,dst=6,val=16"
QEMU_FLAGS+=" -numa dist,src=0,dst=7,val=22"
QEMU_FLAGS+=" -numa dist,src=1,dst=0,val=16"
QEMU_FLAGS+=" -numa dist,src=1,dst=2,val=16"
QEMU_FLAGS+=" -numa dist,src=1,dst=3,val=22"
QEMU_FLAGS+=" -numa dist,src=1,dst=4,val=22"
QEMU_FLAGS+=" -numa dist,src=1,dst=5,val=16"
QEMU_FLAGS+=" -numa dist,src=1,dst=6,val=22"
QEMU_FLAGS+=" -numa dist,src=1,dst=7,val=16"
QEMU_FLAGS+=" -numa dist,src=2,dst=0,val=16"
QEMU_FLAGS+=" -numa dist,src=2,dst=1,val=16"
QEMU_FLAGS+=" -numa dist,src=2,dst=3,val=16"
QEMU_FLAGS+=" -numa dist,src=2,dst=4,val=16"
QEMU_FLAGS+=" -numa dist,src=2,dst=5,val=22"
QEMU_FLAGS+=" -numa dist,src=2,dst=6,val=16"
QEMU_FLAGS+=" -numa dist,src=2,dst=7,val=16"
QEMU_FLAGS+=" -numa dist,src=3,dst=0,val=22"
QEMU_FLAGS+=" -numa dist,src=3,dst=1,val=22"
QEMU_FLAGS+=" -numa dist,src=3,dst=2,val=16"
QEMU_FLAGS+=" -numa dist,src=3,dst=4,val=22"
QEMU_FLAGS+=" -numa dist,src=3,dst=5,val=16"
QEMU_FLAGS+=" -numa dist,src=3,dst=6,val=16"
QEMU_FLAGS+=" -numa dist,src=3,dst=7,val=16"
QEMU_FLAGS+=" -numa dist,src=4,dst=0,val=16"
QEMU_FLAGS+=" -numa dist,src=4,dst=1,val=22"
QEMU_FLAGS+=" -numa dist,src=4,dst=2,val=16"
QEMU_FLAGS+=" -numa dist,src=4,dst=3,val=22"
QEMU_FLAGS+=" -numa dist,src=4,dst=5,val=16"
QEMU_FLAGS+=" -numa dist,src=4,dst=6,val=16"
QEMU_FLAGS+=" -numa dist,src=4,dst=7,val=22"
QEMU_FLAGS+=" -numa dist,src=5,dst=0,val=22"
QEMU_FLAGS+=" -numa dist,src=5,dst=1,val=16"
QEMU_FLAGS+=" -numa dist,src=5,dst=2,val=22"
QEMU_FLAGS+=" -numa dist,src=5,dst=3,val=16"
QEMU_FLAGS+=" -numa dist,src=5,dst=4,val=16"
QEMU_FLAGS+=" -numa dist,src=5,dst=6,val=16"
QEMU_FLAGS+=" -numa dist,src=5,dst=7,val=22"
QEMU_FLAGS+=" -numa dist,src=6,dst=0,val=16"
QEMU_FLAGS+=" -numa dist,src=6,dst=1,val=22"
QEMU_FLAGS+=" -numa dist,src=6,dst=2,val=16"
QEMU_FLAGS+=" -numa dist,src=6,dst=3,val=16"
QEMU_FLAGS+=" -numa dist,src=6,dst=4,val=16"
QEMU_FLAGS+=" -numa dist,src=6,dst=5,val=16"
QEMU_FLAGS+=" -numa dist,src=6,dst=7,val=16"
QEMU_FLAGS+=" -numa dist,src=7,dst=0,val=22"
QEMU_FLAGS+=" -numa dist,src=7,dst=1,val=16"
QEMU_FLAGS+=" -numa dist,src=7,dst=2,val=16"
QEMU_FLAGS+=" -numa dist,src=7,dst=3,val=16"
QEMU_FLAGS+=" -numa dist,src=7,dst=4,val=22"
QEMU_FLAGS+=" -numa dist,src=7,dst=5,val=22"
QEMU_FLAGS+=" -numa dist,src=7,dst=6,val=16"
#[02Ch 0044 8] Locality 0 : 0A 10 10 16 10 16 10 16
#[034h 0052 8] Locality 1 : 10 0A 10 16 16 10 16 10
#[03Ch 0060 8] Locality 2 : 10 10 0A 10 10 16 10 10
#[044h 0068 8] Locality 3 : 16 16 10 0A 16 10 10 10
#[04Ch 0076 8] Locality 4 : 10 16 10 16 0A 10 10 16
#[054h 0084 8] Locality 5 : 16 10 16 10 10 0A 10 16
#[05Ch 0092 8] Locality 6 : 10 16 10 10 10 10 0A 10
#[064h 0100 8] Locality 7 : 16 10 10 10 16 16 10 0A
elif [ "x$MACHINE" = "x2S12CWestmere" ]; then
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Westmere-v2"
QEMU_FLAGS+=" -smp 12,sockets=2,cores=6,maxcpus=12"
MEMSIZE2=$((MEMSIZE / 2))
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m0,prealloc=on"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m1,prealloc=on"
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-5"
QEMU_FLAGS+=" -numa node,nodeid=1,memdev=m1,cpus=6-11"
QEMU_FLAGS+=" -numa dist,src=0,dst=1,val=20"
QEMU_FLAGS+=" -numa dist,src=1,dst=0,val=20"
elif [ "x$MACHINE" = "x2S2CPenryn" ]; then
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Penryn-v1"
QEMU_FLAGS+=" -smp 2,sockets=2,cores=1,maxcpus=2"
MEMSIZE2=$((MEMSIZE / 2))
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m0,prealloc=on"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m1,prealloc=on"
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0"
QEMU_FLAGS+=" -numa node,nodeid=1,memdev=m1,cpus=1"
QEMU_FLAGS+=" -numa dist,src=0,dst=1,val=21"
QEMU_FLAGS+=" -numa dist,src=1,dst=0,val=21"
elif [ "x$MACHINE" = "x1S6CBroadwell" ]; then
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Broadwell-v4"
QEMU_FLAGS+=" -smp 6,sockets=1,cores=6,maxcpus=6"
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-5"
elif [ "x$MACHINE" = "x1S1CPhenom" ]; then
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu phenom-v1"
QEMU_FLAGS+=" -smp 1,sockets=1,cores=1,maxcpus=1"
fi
# Define offsets for loading of symbol-table
IMAGEBASE=0x200000

View File

@ -56,7 +56,7 @@ typedef union {
typedef union {
uint32_t raw[3];
struct {
uint32_t fpu : 1; // EDX feature flags, bit 0 */
uint32_t fpu : 1; // EDX feature flags, bit 0
uint32_t vme : 1;
uint32_t de : 1;
uint32_t pse : 1;
@ -88,6 +88,7 @@ typedef union {
uint32_t tm : 1;
uint32_t bit30 : 1;
uint32_t pbe : 1; // EDX feature flags, bit 31
uint32_t sse3 : 1; // ECX feature flags, bit 0
uint32_t mulq : 1;
uint32_t bit2 : 1;
@ -99,7 +100,12 @@ typedef union {
uint32_t tm2 : 1;
uint32_t : 12; // ECX feature flags, bit 20
uint32_t x2apic : 1;
uint32_t : 10; // ECX feature flags, bit 31
uint32_t : 4; // ECX feature flags, bit 22
uint32_t xsave : 1;
uint32_t osxsave : 1;
uint32_t avx : 1;
uint32_t : 3; // ECX feature flags, bit 31
uint32_t : 19; // EDX extended feature flags, bit 0
uint32_t nx : 1;
uint32_t : 9;

304
system/memrw_simd.h Normal file
View File

@ -0,0 +1,304 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef MEMRW_SIMD_H
#define MEMRW_SIMD_H
#if defined(__i386__) || defined(__x86_64__)
#include <immintrin.h>
// -------------------- COMMON TO x86 & x86-64 --------------------
/**
 * Reads and returns the value stored in the 128-bit memory location pointed to by ptr.
 */
#pragma GCC target ("sse")
static inline __m128 read128_sse(const volatile __m128 *ptr)
{
    __m128 val;
    // movaps forces a real 16-byte aligned load; the volatile pointer and
    // the "memory" clobber keep the compiler from caching or elinding it.
    __asm__ __volatile__(
        "movaps %1, %0"
        : "=x" (val)
        : "m" (*ptr)
        : "memory"
    );
    return val;
}
/**
 * Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand.
 */
#pragma GCC target ("sse")
static inline void write128_sse(const volatile __m128 *ptr, __m128 val)
{
    // NOTE(review): the stored-to location is declared as an *input* "m"
    // operand (a proper output would be "=m" on a non-const pointer); the
    // "memory" clobber is what informs the compiler that memory changed.
    // Confirm this asymmetry is intentional.
    __asm__ __volatile__(
        "movaps %1, %0"
        :
        : "m" (*ptr),
          "x" (val)
        : "memory"
    );
}
/**
 * Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand, using non-temporal hint.
 */
#pragma GCC target ("sse")
static inline void write128nt_sse(const volatile __m128 *ptr, __m128 val)
{
    // movntps bypasses the cache hierarchy; callers are expected to fence
    // (e.g. __sync_synchronize) before re-reading the data.
    __asm__ __volatile__(
        "movntps %1, %0"
        :
        : "m" (*ptr),
          "x" (val)
        : "memory"
    );
}
#pragma GCC target ("sse")
/**
 * Compares two 128-bit values as four 32-bit lanes and returns a 4-bit mask
 * with bit i set when lane i is bitwise equal; 0xF means all lanes equal.
 *
 * A bitwise comparison is used instead of _mm_cmpeq_ps: a floating-point
 * compare treats NaN encodings (e.g. the all-ones test pattern) as unequal
 * even when the bit patterns match, causing false error reports.
 */
static inline int compare128_sse(__m128 val1, __m128 val2)
{
    union { float f[4]; uint32_t u[4]; } a, b;
    _mm_storeu_ps(a.f, val1);
    _mm_storeu_ps(b.f, val2);
    int mask = 0;
    for (int i = 0; i < 4; i++) {
        mask |= (int)(a.u[i] == b.u[i]) << i;
    }
    return mask;
}
/**
 * Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand.
 */
#pragma GCC target ("sse2")
static inline void write128_sse2(const volatile __m128 *ptr, __m128 val)
{
    // movdqa is the integer-domain aligned 128-bit store (vs. movaps in the
    // SSE variant); the "memory" clobber informs the compiler of the write.
    __asm__ __volatile__(
        "movdqa %1, %0"
        :
        : "m" (*ptr),
          "x" (val)
        : "memory"
    );
}
/**
 * Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand, using non-temporal hint.
 */
#pragma GCC target ("sse2")
static inline void write128nt_sse2(const volatile __m128 *ptr, __m128 val)
{
    // movntdq: integer-domain non-temporal store, bypasses the caches;
    // callers fence before re-reading.
    __asm__ __volatile__(
        "movntdq %1, %0"
        :
        : "m" (*ptr),
          "x" (val)
        : "memory"
    );
}
/**
 * Reads and returns the value stored in the 128-bit memory location pointed to by ptr.
 */
#pragma GCC target ("sse2")
static inline __m128 read128_sse2(const volatile __m128 *ptr)
{
    __m128 val;
    // movdqa forces a real 16-byte aligned integer-domain load from memory.
    __asm__ __volatile__(
        "movdqa %1, %0"
        : "=x" (val)
        : "m" (*ptr)
        : "memory"
    );
    return val;
}
#pragma GCC target ("sse2")
static inline int compare128_sse2(__m128 val1, __m128 val2)
{
return _mm_movemask_pd(_mm_cmpeq_pd((__m128d)val1, (__m128d)val2));
}
/**
 * Reads and returns the value stored in the 256-bit memory location pointed to by ptr.
 */
#pragma GCC target ("avx")
static inline __m256 read256_avx(const volatile __m256 *ptr)
{
    __m256 val;
    // vmovdqa forces a real 32-byte aligned load from memory.
    __asm__ __volatile__(
        "vmovdqa %1, %0"
        : "=x" (val)
        : "m" (*ptr)
        : "memory"
    );
    return val;
}
/**
 * Writes val to the 256-bit memory location pointed to by ptr, using AVX register as source operand.
 */
#pragma GCC target ("avx")
static inline void write256_avx(const volatile __m256 *ptr, __m256 val)
{
    // Aligned 32-byte store; the "memory" clobber informs the compiler.
    __asm__ __volatile__(
        "vmovdqa %1, %0"
        :
        : "m" (*ptr),
          "x" (val)
        : "memory"
    );
}
/**
 * Writes val to the 256-bit memory location pointed to by ptr, using AVX register as source operand, using non-temporal hint.
 */
#pragma GCC target ("avx")
static inline void write256nt_avx(const volatile __m256 *ptr, __m256 val)
{
    // Non-temporal 32-byte store, bypasses the caches; callers fence before
    // re-reading.
    __asm__ __volatile__(
        "vmovntdq %1, %0"
        :
        : "m" (*ptr),
          "x" (val)
        : "memory"
    );
}
#pragma GCC target ("avx")
/**
 * Compares two 256-bit values as four 64-bit lanes and returns a 4-bit mask
 * with bit i set when lane i is bitwise equal; 0xF means all lanes equal.
 *
 * A bitwise comparison is used instead of _mm256_cmp_pd: a floating-point
 * compare treats NaN encodings (e.g. the all-ones test pattern) as unequal
 * even when the bit patterns match, causing false error reports. AVX1 has
 * no 256-bit integer compare, so the lanes are compared via memory.
 */
static inline int compare256_avx(__m256 val1, __m256 val2)
{
    union { float f[8]; uint64_t u[4]; } a, b;
    _mm256_storeu_ps(a.f, val1);
    _mm256_storeu_ps(b.f, val2);
    int mask = 0;
    for (int i = 0; i < 4; i++) {
        mask |= (int)(a.u[i] == b.u[i]) << i;
    }
    return mask;
}
#endif
// -------------------- SEPARATE FOR x86 & x86-64 --------------------
#if defined(__i386__)
#if 0
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
static inline __m64 convert_testword_to_simd64_mmx(uint32_t val)
{
return _mm_set1_pi32(val);
}
#endif
/**
 * Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand.
 */
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
static inline void write64_mmx(const volatile uint32_t *ptr, uint32_t val)
{
    // Broadcast the 32-bit word into both halves of an MMX register.
    __m64 val2 = _mm_set1_pi32(val);
    // NOTE(review): movq writes 8 bytes but the "m" operand only describes
    // 4; the "memory" clobber covers the rest. Callers must execute emms
    // before using x87 again.
    __asm__ __volatile__(
        "movq %1, %0"
        :
        : "m" (*ptr),
          "y" (val2)
        : "memory"
    );
}
/**
 * Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand, using non-temporal hint.
 */
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
static inline void write64nt_mmx(const volatile uint32_t *ptr, uint32_t val)
{
    __m64 val2 = _mm_set1_pi32(val);
    // movntq: non-temporal 8-byte store (SSE-era extension to MMX);
    // bypasses the caches. Callers must emms and fence afterwards.
    __asm__ __volatile__(
        "movntq %1, %0"
        :
        : "m" (*ptr),
          "y" (val2)
        : "memory"
    );
}
#pragma GCC target ("sse")
/**
 * Broadcasts a 32-bit test word into all four lanes of an SSE register,
 * preserving the exact bit pattern.
 *
 * A union is used for the reinterpretation: the original wrote uint32_t
 * values through a casted float pointer, which violates strict aliasing.
 */
static inline __m128 convert_testword_to_simd128_sse(uint32_t val)
{
    union { float f[4]; uint32_t u[4]; } tmp __attribute__((aligned(16)));
    for (int i = 0; i < 4; i++) {
        tmp.u[i] = val;
    }
    return _mm_load_ps(tmp.f);
}
#pragma GCC target ("sse2")
/**
 * Broadcasts a 32-bit test word into all four lanes, preserving the bit
 * pattern. Uses the dedicated cast intrinsic (well-defined on all
 * compilers) instead of a bare GCC vector cast, and makes the
 * unsigned-to-int conversion explicit.
 */
static inline __m128 convert_testword_to_simd128_sse2(uint32_t val)
{
    return _mm_castsi128_ps(_mm_set1_epi32((int)val));
}
#pragma GCC target ("avx")
/**
 * Broadcasts a 32-bit test word into all eight lanes of an AVX register,
 * preserving the bit pattern. Uses the dedicated cast intrinsic instead of
 * a bare GCC vector cast.
 */
static inline __m256 convert_testword_to_simd256_avx(uint32_t val)
{
    return _mm256_castsi256_ps(_mm256_set1_epi32((int)val));
}
#elif defined(__x86_64__)
// XXX how to make this work without producing GCC error: 'SSE register return with SSE disabled' ?
#if 0
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
static inline __m64 convert_testword_to_simd64_mmx(uint64_t val)
{
return (__m64)val;
}
#endif
/**
 * Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand.
 */
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
static inline void write64_mmx(const volatile uint64_t *ptr, uint64_t val)
{
    // Reinterpret the 64-bit word directly as an MMX value.
    __m64 val2 = (__m64)val;
    // Callers must execute emms before using x87 again.
    __asm__ __volatile__(
        "movq %1, %0"
        :
        : "m" (*ptr),
          "y" (val2)
        : "memory"
    );
}
/**
 * Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand, using non-temporal hint.
 */
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
static inline void write64nt_mmx(const volatile uint64_t *ptr, uint64_t val)
{
    __m64 val2 = (__m64)val;
    // movntq bypasses the caches; callers must emms and fence afterwards.
    __asm__ __volatile__(
        "movntq %1, %0"
        :
        : "m" (*ptr),
          "y" (val2)
        : "memory"
    );
}
#pragma GCC target ("sse")
/**
 * Broadcasts a 64-bit test word into both 64-bit lanes of an SSE register,
 * preserving the exact bit pattern.
 *
 * A union is used for the reinterpretation: the original wrote uint64_t
 * values through a casted double pointer, which violates strict aliasing.
 */
static inline __m128 convert_testword_to_simd128_sse(uint64_t val)
{
    union { float f[4]; uint64_t u[2]; } tmp __attribute__((aligned(16)));
    tmp.u[0] = val;
    tmp.u[1] = val;
    return _mm_load_ps(tmp.f);
}
#pragma GCC target ("sse2")
/**
 * Broadcasts a 64-bit test word into both 64-bit lanes, preserving the bit
 * pattern. Uses the dedicated cast intrinsic instead of a bare GCC vector
 * cast, with an explicit conversion to the intrinsic's signed argument type.
 */
static inline __m128 convert_testword_to_simd128_sse2(uint64_t val)
{
    return _mm_castsi128_ps(_mm_set1_epi64x((long long)val));
}
#pragma GCC target ("avx")
/**
 * Broadcasts a 64-bit test word into all four 64-bit lanes of an AVX
 * register, preserving the bit pattern. Uses the dedicated cast intrinsic
 * instead of a bare GCC vector cast.
 */
static inline __m256 convert_testword_to_simd256_avx(uint64_t val)
{
    return _mm256_castsi256_ps(_mm256_set1_epi64x((long long)val));
}
#endif // __i386__ or __x86_64__
#endif // MEMRW_SIMD_H

View File

@ -28,12 +28,12 @@
// Public Functions
//------------------------------------------------------------------------------
int test_block_move(int my_cpu, int iterations)
int test_block_move(int my_cpu, int iterations, int simd)
{
int ticks = 0;
if (my_cpu == master_cpu) {
display_test_pattern_name("block move");
display_test_pattern_names("block move", simd);
}
// Initialize memory with the initial pattern.

View File

@ -20,6 +20,7 @@
#include "display.h"
#include "error.h"
#include "test.h"
#include "config.h"
#include "test_funcs.h"
#include "test_helper.h"
@ -30,19 +31,279 @@
// Public Functions
//------------------------------------------------------------------------------
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2)
#if defined(__i386__) || defined(__x86_64__)
#include "memrw_simd.h"
// ==================== vvvvv MMX vvvvv ====================
#if 0
// XXX how to make this work without producing GCC error: 'SSE register return with SSE disabled' ?
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
static __attribute__((noinline)) testword_t * write_loops_simd64(testword_t *p, testword_t *pe, testword_t pattern1)
{
register __m64 mdpattern1 __asm__("%mm0") = convert_testword_to_simd64(pattern1);
if (enable_nontemporal) {
do {
write64nt_simd((__m64 *)p, mdpattern1);
p += (sizeof(*p) < 8) ? 1 : 0;
} while (p++ < pe); // test before increment in case pointer overflows
}
else {
do {
write64_simd((__m64 *)p, mdpattern1);
p += (sizeof(*p) < 8) ? 1 : 0;
} while (p++ < pe); // test before increment in case pointer overflows
}
__asm__ __volatile__ ("emms");
__sync_synchronize();
return p;
}
#endif
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
// Fill pass: writes pattern1 over [p, pe] using 64-bit MMX stores (8 bytes
// per iteration), optionally non-temporal. Returns one past the last word.
static __attribute__((noinline)) testword_t * write_loops_simd64_mmx(testword_t *p, testword_t *pe, testword_t pattern1)
{
    if (enable_nontemporal) {
        do {
            write64nt_mmx(p, pattern1);
            // One 64-bit store covers two 32-bit words; skip the extra one.
            p += (sizeof(*p) < 8) ? 1 : 0;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    else {
        do {
            write64_mmx(p, pattern1);
            p += (sizeof(*p) < 8) ? 1 : 0;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    __asm__ __volatile__ ("emms"); // leave MMX state so the x87 FPU is usable again
    __sync_synchronize();
    return p;
}
// ? TODO ? read1_loops_simd64
// ? TODO ? read2_loops_simd64
// ==================== ^^^^^ MMX ^^^^^ ====================
// ==================== vvvvv SSE vvvvv ====================
#pragma GCC target ("sse", "no-sse2", "no-avx")
// Fill pass: writes pattern1 over [p, pe] using 128-bit SSE stores (16
// bytes per iteration), optionally non-temporal. Returns one past the end.
static __attribute__((noinline)) testword_t * write_loops_simd128_sse(testword_t *p, testword_t *pe, testword_t pattern1)
{
    __m128 mdpattern1 = convert_testword_to_simd128_sse(pattern1);
    if (enable_nontemporal) {
        do {
            write128nt_sse((__m128 *)p, mdpattern1);
            // One store covers 4 x 32-bit or 2 x 64-bit words.
            p += (sizeof(*p) < 8) ? 3 : 1;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    else {
        do {
            write128_sse((__m128 *)p, mdpattern1);
            p += (sizeof(*p) < 8) ? 3 : 1;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    __sync_synchronize(); // make the (possibly non-temporal) stores visible
    return p;
}
#define COMPARE_TARGET 0xF
#pragma GCC target ("sse", "no-sse2", "no-avx")
// Moving-inversion read pass (ascending): verify each 128-bit chunk equals
// pattern1, report mismatches lane-by-lane, then overwrite with pattern2.
static __attribute__((noinline)) testword_t * read1_loops_simd128_sse(testword_t *p, testword_t *pe, testword_t pattern1, testword_t pattern2)
{
    __m128 mdpattern1 = convert_testword_to_simd128_sse(pattern1);
    __m128 mdpattern2 = convert_testword_to_simd128_sse(pattern2);
    do {
        __m128 actual = read128_sse((__m128 *)p);
        int compar_result = compare128_sse(mdpattern1, actual);
        // The inverse pattern is written before the (unlikely) error branch.
        write128_sse((__m128 *)p, mdpattern2);
        if (unlikely(compar_result != COMPARE_TARGET)) {
            __m128 good = mdpattern1;
            __m128 bad = actual;
            data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
        }
        p += (sizeof(*p) < 8) ? 3 : 1;
    } while (p++ < pe); // test before increment in case pointer overflows
    return p;
}
#pragma GCC target ("sse", "no-sse2", "no-avx")
// Moving-inversion read pass (descending): verify each 128-bit chunk equals
// pattern2, report mismatches lane-by-lane, then restore pattern1.
static __attribute__((noinline)) testword_t * read2_loops_simd128_sse(testword_t *p, testword_t *ps, testword_t pattern1, testword_t pattern2)
{
    __m128 mdpattern1 = convert_testword_to_simd128_sse(pattern1);
    __m128 mdpattern2 = convert_testword_to_simd128_sse(pattern2);
    do {
        __m128 actual = read128_sse((__m128 *)p);
        int compar_result = compare128_sse(mdpattern2, actual);
        write128_sse((__m128 *)p, mdpattern1);
        if (unlikely(compar_result != COMPARE_TARGET)) {
            __m128 good = mdpattern2;
            __m128 bad = actual;
            data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
        }
        p -= (sizeof(*p) < 8) ? 3 : 1;
    } while (p-- > ps); // test before decrement in case pointer overflows
    return p;
}
#undef COMPARE_TARGET
// ==================== ^^^^^ SSE ^^^^^ ====================
// ==================== vvvvv SSE2 vvvvv ====================
#pragma GCC target ("sse2", "no-avx")
// Fill pass: writes pattern1 over [p, pe] using 128-bit SSE2 integer
// stores, optionally non-temporal. Returns one past the last word written.
static __attribute__((noinline)) testword_t * write_loops_simd128_sse2(testword_t *p, testword_t *pe, testword_t pattern1)
{
    __m128 mdpattern1 = convert_testword_to_simd128_sse2(pattern1);
    if (enable_nontemporal) {
        do {
            write128nt_sse2((__m128 *)p, mdpattern1);
            // One store covers 4 x 32-bit or 2 x 64-bit words.
            p += (sizeof(*p) < 8) ? 3 : 1;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    else {
        do {
            write128_sse2((__m128 *)p, mdpattern1);
            p += (sizeof(*p) < 8) ? 3 : 1;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    __sync_synchronize(); // make the (possibly non-temporal) stores visible
    return p;
}
#define COMPARE_TARGET 0x3
#pragma GCC target ("sse2", "no-avx")
// Moving-inversion read pass (ascending), SSE2 variant: the compare mask
// has one bit per 64-bit lane, hence target 0x3.
static __attribute__((noinline)) testword_t * read1_loops_simd128_sse2(testword_t *p, testword_t *pe, testword_t pattern1, testword_t pattern2)
{
    __m128 mdpattern1 = convert_testword_to_simd128_sse2(pattern1);
    __m128 mdpattern2 = convert_testword_to_simd128_sse2(pattern2);
    do {
        __m128 actual = read128_sse2((__m128 *)p);
        int compar_result = compare128_sse2(mdpattern1, actual);
        write128_sse2((__m128 *)p, mdpattern2);
        if (unlikely(compar_result != COMPARE_TARGET)) {
            __m128 good = mdpattern1;
            __m128 bad = actual;
            data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
        }
        p += (sizeof(*p) < 8) ? 3 : 1;
    } while (p++ < pe); // test before increment in case pointer overflows
    return p;
}
#pragma GCC target ("sse2", "no-avx")
// Moving-inversion read pass (descending), SSE2 variant: verify pattern2,
// restore pattern1, stepping 16 bytes down per iteration.
static __attribute__((noinline)) testword_t * read2_loops_simd128_sse2(testword_t *p, testword_t *ps, testword_t pattern1, testword_t pattern2)
{
    __m128 mdpattern1 = convert_testword_to_simd128_sse2(pattern1);
    __m128 mdpattern2 = convert_testword_to_simd128_sse2(pattern2);
    do {
        __m128 actual = read128_sse2((__m128 *)p);
        int compar_result = compare128_sse2(mdpattern2, actual);
        write128_sse2((__m128 *)p, mdpattern1);
        if (unlikely(compar_result != COMPARE_TARGET)) {
            __m128 good = mdpattern2;
            __m128 bad = actual;
            data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
        }
        p -= (sizeof(*p) < 8) ? 3 : 1;
    } while (p-- > ps); // test before decrement in case pointer overflows
    return p;
}
#undef COMPARE_TARGET
// ==================== ^^^^^ SSE2 ^^^^^ ====================
// ==================== vvvvv AVX vvvvv ====================
#define COMPARE_TARGET 0xF
#pragma GCC target ("avx")
// Fill pass: writes pattern1 over [p, pe] using 256-bit AVX stores (32
// bytes per iteration), optionally non-temporal. Returns one past the end.
static __attribute__((noinline)) testword_t * write_loops_simd256_avx(testword_t *p, testword_t *pe, testword_t pattern1)
{
    __m256 mdpattern1 = convert_testword_to_simd256_avx(pattern1);
    if (enable_nontemporal) {
        do {
            write256nt_avx((__m256 *)p, mdpattern1);
            // One store covers 8 x 32-bit or 4 x 64-bit words.
            p += (sizeof(*p) < 8) ? 7 : 3;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    else {
        do {
            write256_avx((__m256 *)p, mdpattern1);
            p += (sizeof(*p) < 8) ? 7 : 3;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    __sync_synchronize(); // make the (possibly non-temporal) stores visible
    return p;
}
#pragma GCC target ("avx")
// Moving-inversion read pass (ascending), AVX variant: verify each 256-bit
// chunk equals pattern1 (one mask bit per 64-bit lane), write pattern2.
static __attribute__((noinline)) testword_t * read1_loops_simd256_avx(testword_t *p, testword_t *pe, testword_t pattern1, testword_t pattern2)
{
    __m256 mdpattern1 = convert_testword_to_simd256_avx(pattern1);
    __m256 mdpattern2 = convert_testword_to_simd256_avx(pattern2);
    do {
        __m256 actual = read256_avx((__m256 *)p);
        int compar_result = compare256_avx(mdpattern1, actual);
        write256_avx((__m256 *)p, mdpattern2);
        if (unlikely(compar_result != COMPARE_TARGET)) {
            __m256 good = mdpattern1;
            __m256 bad = actual;
            data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 256 / (8 * sizeof(*p)), true);
        }
        p += (sizeof(*p) < 8) ? 7 : 3;
    } while (p++ < pe); // test before increment in case pointer overflows
    return p;
}
#pragma GCC target ("avx")
// Moving-inversion read pass (descending), AVX variant: verify pattern2,
// restore pattern1, stepping 32 bytes down per iteration.
static __attribute__((noinline)) testword_t * read2_loops_simd256_avx(testword_t *p, testword_t *ps, testword_t pattern1, testword_t pattern2)
{
    __m256 mdpattern1 = convert_testword_to_simd256_avx(pattern1);
    __m256 mdpattern2 = convert_testword_to_simd256_avx(pattern2);
    do {
        __m256 actual = read256_avx((__m256 *)p);
        int compar_result = compare256_avx(mdpattern2, actual);
        write256_avx((__m256 *)p, mdpattern1);
        if (unlikely(compar_result != COMPARE_TARGET)) {
            __m256 good = mdpattern2;
            __m256 bad = actual;
            data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 256 / (8 * sizeof(*p)), true);
        }
        p -= (sizeof(*p) < 8) ? 7 : 3;
    } while (p-- > ps); // test before decrement in case pointer overflows
    return p;
}
#undef COMPARE_TARGET
// ==================== ^^^^^ AVX ^^^^^ ====================
#endif
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2, int simd)
{
int ticks = 0;
size_t chunk_align = simd == 1 ? 64/8 : ((simd == 2 || simd == 3) ? 128/8 : (simd == 4 ? 256/8 : sizeof(testword_t)));
if (my_cpu == master_cpu) {
display_test_pattern_value(pattern1);
display_test_pattern_values(pattern1, simd);
}
// Initialize memory with the initial pattern.
for (int i = 0; i < vm_map_size; i++) {
testword_t *start, *end;
calculate_chunk(&start, &end, my_cpu, i, sizeof(testword_t));
if (end < start) SKIP_RANGE(1) // we need at least one word for this test
calculate_chunk(&start, &end, my_cpu, i, chunk_align);
__asm__ volatile("nop");
if (end < start) SKIP_RANGE(1) // we need enough words for this test
testword_t *p = start;
testword_t *pe = start;
@ -60,46 +321,64 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
if (my_cpu < 0) {
continue;
}
//do_trace(0, "W p %016x -> pe %016x", (uintptr_t)p, (uintptr_t) pe);
test_addr[my_cpu] = (uintptr_t)p;
if (!simd || ((end - start) < (int)((32/8 << simd) / sizeof(testword_t)) - 1)) {
#if HAND_OPTIMISED
#if defined(__x86_64__)
uint64_t length = pe - p + 1;
__asm__ __volatile__ ("\t"
"rep \n\t"
"stosq \n\t"
:
: "c" (length), "D" (p), "a" (pattern1)
:
);
p = pe;
uint64_t length = pe - p + 1;
__asm__ __volatile__ ("\t"
"rep \n\t"
"stosq \n\t"
:
: "c" (length), "D" (p), "a" (pattern1)
:
);
p = pe;
#elif defined(__i386__)
uint32_t length = pe - p + 1;
__asm__ __volatile__ ("\t"
"rep \n\t"
"stosl \n\t"
:
: "c" (length), "D" (p), "a" (pattern1)
:
);
p = pe;
uint32_t length = pe - p + 1;
__asm__ __volatile__ ("\t"
"rep \n\t"
"stosl \n\t"
:
: "c" (length), "D" (p), "a" (pattern1)
:
);
p = pe;
#elif defined(__loongarch_lp64)
uint64_t length = pe - p + 1;
__asm__ __volatile__ ("\t"
"loop: \n\t"
"st.d %2, %1, 0x0 \n\t"
"addi.d %1, %1, 0x8 \n\t"
"addi.d %0, %0, -0x1 \n\t"
"bnez %0, loop \n\t"
:
: "r" (length), "r" (p), "r" (pattern1)
: "memory"
);
p = pe;
uint64_t length = pe - p + 1;
__asm__ __volatile__ ("\t"
"loop: \n\t"
"st.d %2, %1, 0x0 \n\t"
"addi.d %1, %1, 0x8 \n\t"
"addi.d %0, %0, -0x1 \n\t"
"bnez %0, loop \n\t"
:
: "r" (length), "r" (p), "r" (pattern1)
: "memory"
);
p = pe;
#endif
#else
do {
write_word(p, pattern1);
} while (p++ < pe); // test before increment in case pointer overflows
do {
write_word(p, pattern1);
} while (p++ < pe); // test before increment in case pointer overflows
#endif
}
// SIMD code paths
#if defined(__i386__) || defined(__x86_64__)
else if (simd == 1) {
p = write_loops_simd64_mmx(p, pe, pattern1);
}
else if (simd == 2) {
p = write_loops_simd128_sse(p, pe, pattern1);
}
else if (simd == 3) {
p = write_loops_simd128_sse2(p, pe, pattern1);
}
else if (simd == 4) {
p = write_loops_simd256_avx(p, pe, pattern1);
}
#endif
do_tick(my_cpu);
BAILOUT;
@ -113,8 +392,8 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
for (int j = 0; j < vm_map_size; j++) {
testword_t *start, *end;
calculate_chunk(&start, &end, my_cpu, j, sizeof(testword_t));
if (end < start) SKIP_RANGE(1) // we need at least one word for this test
calculate_chunk(&start, &end, my_cpu, j, chunk_align);
if (end < start) SKIP_RANGE(1) // we need enough words for this test
testword_t *p = start;
testword_t *pe = start;
@ -132,14 +411,29 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
if (my_cpu < 0) {
continue;
}
//do_trace(0, "R1 p %016x -> pe %016x", (uintptr_t)p, (uintptr_t)pe);
test_addr[my_cpu] = (uintptr_t)p;
do {
testword_t actual = read_word(p);
if (unlikely(actual != pattern1)) {
data_error(p, pattern1, actual, true);
}
write_word(p, pattern2);
} while (p++ < pe); // test before increment in case pointer overflows
if (simd == 0 || simd == 1 || ((end - start) < (int)((32/8 << simd) / sizeof(testword_t)) - 1)) {
do {
testword_t actual = read_word(p);
if (unlikely(actual != pattern1)) {
data_error(p, pattern1, actual, true);
}
write_word(p, pattern2);
} while (p++ < pe); // test before increment in case pointer overflows
}
// SIMD code paths
#if defined(__i386__) || defined(__x86_64__)
else if (simd == 2) {
p = read1_loops_simd128_sse(p, pe, pattern1, pattern2);
}
else if (simd == 3) {
p = read1_loops_simd128_sse2(p, pe, pattern1, pattern2);
}
else if (simd == 4) {
p = read1_loops_simd256_avx(p, pe, pattern1, pattern2);
}
#endif
do_tick(my_cpu);
BAILOUT;
} while (!at_end && ++pe); // advance pe to next start point
@ -149,8 +443,8 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
for (int j = vm_map_size - 1; j >= 0; j--) {
testword_t *start, *end;
calculate_chunk(&start, &end, my_cpu, j, sizeof(testword_t));
if (end < start) SKIP_RANGE(1) // we need at least one word for this test
calculate_chunk(&start, &end, my_cpu, j, chunk_align);
if (end < start) SKIP_RANGE(1) // we need enough words for this test
testword_t *p = end;
testword_t *ps = end;
@ -168,14 +462,29 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
if (my_cpu < 0) {
continue;
}
//do_trace(0, "R2 ps %016x -> p %016x", (uintptr_t)ps, (uintptr_t)p);
test_addr[my_cpu] = (uintptr_t)p;
do {
testword_t actual = read_word(p);
if (unlikely(actual != pattern2)) {
data_error(p, pattern2, actual, true);
}
write_word(p, pattern1);
} while (p-- > ps); // test before decrement in case pointer overflows
if (simd == 0 || simd == 1 || ((end - start) < (int)((32/8 << simd) / sizeof(testword_t)) - 1)) {
do {
testword_t actual = read_word(p);
if (unlikely(actual != pattern2)) {
data_error(p, pattern2, actual, true);
}
write_word(p, pattern1);
} while (p-- > ps); // test before decrement in case pointer overflows
}
// SIMD code paths
#if defined(__i386__) || defined(__x86_64__)
else if (simd == 2) {
p = read2_loops_simd128_sse((testword_t *)((uintptr_t)p & ~(uintptr_t)0xF), ps, pattern1, pattern2);
}
else if (simd == 3) {
p = read2_loops_simd128_sse2((testword_t *)((uintptr_t)p & ~(uintptr_t)0xF), ps, pattern1, pattern2);
}
else if (simd == 4) {
p = read2_loops_simd256_avx((testword_t *)((uintptr_t)p & ~(uintptr_t)0x1F), ps, pattern1, pattern2);
}
#endif
do_tick(my_cpu);
BAILOUT;
} while (!at_start && --ps); // advance ps to next start point

View File

@ -21,7 +21,7 @@ int test_own_addr1(int my_cpu);
int test_own_addr2(int my_cpu, int stage);
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2);
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2, int simd);
int test_mov_inv_walk1(int my_cpu, int iterations, int offset, bool inverse);
@ -29,7 +29,7 @@ int test_mov_inv_random(int my_cpu);
int test_modulo_n(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2, int n, int offset);
int test_block_move(int my_cpu, int iterations);
int test_block_move(int my_cpu, int iterations, int simd);
int test_bit_fade(int my_cpu, int stage, int sleep_secs);

View File

@ -112,6 +112,21 @@ int run_test(int my_cpu, int test, int stage, int iterations)
testword_t prsg_state;
int ticks = 0;
int simd = 0;
#if defined(__i386__) || defined(__x86_64__)
if (cpuid_info.flags.mmx) {
simd++;
if (cpuid_info.flags.sse) {
simd++;
if (cpuid_info.flags.sse2) {
simd++;
if (cpuid_info.flags.avx) {
simd++;
}
}
}
}
#endif
switch (test) {
// Address test, walking ones.
@ -140,11 +155,11 @@ int run_test(int my_cpu, int test, int stage, int iterations)
testword_t pattern2 = ~pattern1;
BARRIER;
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2);
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2, 0);
BAILOUT;
BARRIER;
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1);
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1, 0);
BAILOUT;
} break;
@ -159,11 +174,11 @@ int run_test(int my_cpu, int test, int stage, int iterations)
testword_t pattern2 = ~pattern1;
BARRIER;
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2);
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2, 0);
BAILOUT;
BARRIER;
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1);
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1, 0);
BAILOUT;
pattern1 >>= 1;
@ -171,6 +186,7 @@ int run_test(int my_cpu, int test, int stage, int iterations)
} break;
// Moving inversions, fixed random pattern.
// SIMD variants after rep stos[lq] variant.
case 5:
if (cpuid_info.flags.rdtsc) {
prsg_state = get_tsc();
@ -179,15 +195,19 @@ int run_test(int my_cpu, int test, int stage, int iterations)
}
prsg_state *= 0x12345678;
for (int i = 0; i < iterations; i++) {
prsg_state = prsg(prsg_state);
{
for (int j = 0; j <= simd; j++) {
for (int i = 0; i < iterations; i++) {
prsg_state = prsg(prsg_state);
testword_t pattern1 = prsg_state;
testword_t pattern2 = ~pattern1;
testword_t pattern1 = prsg_state;
testword_t pattern2 = ~pattern1;
BARRIER;
ticks += test_mov_inv_fixed(my_cpu, 2, pattern1, pattern2);
BAILOUT;
BARRIER;
ticks += test_mov_inv_fixed(my_cpu, 2, pattern1, pattern2, j);
BAILOUT;
}
}
}
break;
@ -206,7 +226,11 @@ int run_test(int my_cpu, int test, int stage, int iterations)
// Block move.
case 7:
ticks += test_block_move(my_cpu, iterations);
{
for (int j = 0; j <= simd; j++) {
ticks += test_block_move(my_cpu, iterations, simd);
}
}
BAILOUT;
break;