mirror of
https://github.com/memtest86plus/memtest86plus.git
synced 2025-02-25 18:55:23 -06:00
WIP BROKEN Add SIMD tests for x86 & x86-64: MMX, SSE, SSE2, AVX ( #98 ).
This commit is contained in:
parent
4821c1bdba
commit
e16118505f
@ -172,6 +172,12 @@ typedef enum {
|
||||
prints(5, 39, str); \
|
||||
}
|
||||
|
||||
#define display_test_pattern_names(str, step) \
|
||||
{ \
|
||||
clear_screen_region(5, 39, 5, SCREEN_WIDTH - 1); \
|
||||
printf(5, 39, "%s - %i", str, step); \
|
||||
}
|
||||
|
||||
#define display_test_pattern_value(pattern) \
|
||||
{ \
|
||||
clear_screen_region(5, 39, 5, SCREEN_WIDTH - 1); \
|
||||
|
@ -365,6 +365,15 @@ void data_error(testword_t *addr, testword_t good, testword_t bad, bool use_for_
|
||||
common_err(DATA_ERROR, (uintptr_t)addr, good, bad, use_for_badram);
|
||||
}
|
||||
|
||||
/**
 * Adds one or more data errors to the error reports, version for data types
 * wider than the native test word (e.g. one 128/256-bit SIMD chunk).
 *
 * addr points to the first testword-sized lane of the wide value; good and
 * bad point to parallel arrays of 'width' expected/actual lane values.
 * One error is reported per mismatching lane.
 */
void data_error_wide(testword_t *addr, testword_t * good, testword_t * bad, unsigned int width, bool use_for_badram)
{
    // Bug fix: the original advanced only 'addr', so every lane was compared
    // against (and reported with) the first lane's good/bad values. All three
    // pointers must walk in step.
    for (unsigned int i = 0; i < width; i++, addr++, good++, bad++) {
        if (*good != *bad) {
            common_err(DATA_ERROR, (uintptr_t)addr, *good, *bad, use_for_badram);
        }
    }
}
|
||||
|
||||
void ecc_error()
|
||||
{
|
||||
common_err(CECC_ERROR, ecc_status.addr, 0, 0, false);
|
||||
|
@ -40,6 +40,11 @@ void addr_error(testword_t *addr1, testword_t *addr2, testword_t good, testword_
|
||||
*/
|
||||
void data_error(testword_t *addr, testword_t good, testword_t bad, bool use_for_badram);
|
||||
|
||||
/**
|
||||
* Adds one or more data errors to the error reports, version for data types wider than 64 bits.
|
||||
*/
|
||||
void data_error_wide(testword_t *addr, testword_t * good, testword_t * bad, unsigned int width, bool use_for_badram);
|
||||
|
||||
/**
|
||||
* Adds an ECC error to the error reports.
|
||||
* ECC Error details are stored in ecc_status
|
||||
|
@ -52,11 +52,9 @@
|
||||
#define OPCODE_WRMSR 0x300F
|
||||
|
||||
#ifdef __x86_64__
|
||||
#define REG_PREFIX "r"
|
||||
#define REG_DIGITS "16"
|
||||
#define ADR_DIGITS "12"
|
||||
#else
|
||||
#define REG_PREFIX "e"
|
||||
#define REG_DIGITS "8"
|
||||
#define ADR_DIGITS "8"
|
||||
#endif
|
||||
|
34
app/main.c
34
app/main.c
@ -607,6 +607,40 @@ void main(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
// Enable SSE and AVX, if available.
|
||||
if (cpuid_info.flags.sse) {
|
||||
uintptr_t temp;
|
||||
__asm__ __volatile__(
|
||||
"mov %%cr0, %0\n"
|
||||
"andb $0xfb, %b0\n" // clear coprocessor emulation bit
|
||||
"orb $0x02, %b0\n" // set coprocessor monitoring bit
|
||||
"mov %0, %%cr0\n"
|
||||
"mov %%cr4, %0\n"
|
||||
"orb $0x06, %h0" // set OSFXSR and OSXMMEXCPT
|
||||
: "=a" (temp)
|
||||
);
|
||||
if (cpuid_info.flags.xsave && cpuid_info.flags.avx) {
|
||||
__asm__ __volatile__(
|
||||
"bts $18,%k0\n" // set OSXSAVE bit
|
||||
"mov %0, %%cr4\n"
|
||||
"xor %%" REG_PREFIX "cx, %%" REG_PREFIX "cx\n"
|
||||
"xgetbv\n" // Load XCR0 register
|
||||
"orb $0x07, %%al\n" // Set AVX, SSE, X87 bits
|
||||
"xsetbv" // Save back to XCR0
|
||||
: "=a" (temp) : "a" (temp)
|
||||
: REG_PREFIX "cx", REG_PREFIX "dx"
|
||||
);
|
||||
}
|
||||
else {
|
||||
__asm__ __volatile__(
|
||||
"mov %0, %%cr4"
|
||||
: : "a" (temp)
|
||||
);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Due to the need to relocate ourselves in the middle of tests, the following
|
||||
// code cannot be written in the natural way as a set of nested loops. So we
|
||||
// have a single loop and use global state variables to allow us to restart
|
||||
|
12
app/test.h
12
app/test.h
@ -65,6 +65,12 @@ extern spinlock_t *error_mutex;
|
||||
* The string representation of TESTWORDS_DIGITS
|
||||
*/
|
||||
#define TESTWORD_DIGITS_STR "16"
|
||||
#if defined(__x86_64__)
|
||||
/**
|
||||
* The register prefix for full-sized registers.
|
||||
*/
|
||||
#define REG_PREFIX "r"
|
||||
#endif
|
||||
#else
|
||||
/**
|
||||
* The word width (in bits) used for memory testing.
|
||||
@ -78,6 +84,12 @@ extern spinlock_t *error_mutex;
|
||||
* The string representation of TESTWORDS_DIGITS
|
||||
*/
|
||||
#define TESTWORD_DIGITS_STR "8"
|
||||
#if defined(__i386__)
|
||||
/**
|
||||
* The register prefix for full-sized registers.
|
||||
*/
|
||||
#define REG_PREFIX "e"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
@ -258,18 +258,6 @@ flush: movw $KERNEL_DS, %ax
|
||||
|
||||
finit
|
||||
|
||||
#if 0
|
||||
# Enable SSE.
|
||||
|
||||
movq %cr0, %rax
|
||||
andb $0xfb, %al # clear coprocessor emulation bit
|
||||
orb $0x02, %al # set coprocessor monitoring bit
|
||||
mov %rax, %cr0
|
||||
movq %cr4, %rax
|
||||
orw $0x0600, %ax # set OSFXSR and OSXMMEXCPT
|
||||
movq %rax, %cr4
|
||||
#endif
|
||||
|
||||
# Call the dynamic linker to fix up the addresses in the GOT.
|
||||
|
||||
call reloc
|
||||
|
@ -10,8 +10,8 @@ else
|
||||
GIT_AVAILABLE = true
|
||||
endif
|
||||
|
||||
CFLAGS = -std=gnu11 -Wall -Wextra -Wshadow -m32 -march=i586 -fpic -fno-builtin \
|
||||
-ffreestanding -fomit-frame-pointer -fno-stack-protector \
|
||||
CFLAGS = -std=gnu11 -Wall -Wextra -Wshadow -m32 -march=i586 -mno-mmx -mno-sse -mno-sse2 \
|
||||
-fpic -fno-builtin -ffreestanding -fomit-frame-pointer -fno-stack-protector \
|
||||
-fexcess-precision=standard -DARCH_BITS=32
|
||||
|
||||
ifeq ($(DEBUG), 1)
|
||||
|
@ -134,6 +134,44 @@ Init() {
|
||||
QEMU_FLAGS+=" -drive if=pflash,format=raw,readonly=on,file=OVMF32_CODE.fd"
|
||||
QEMU_FLAGS+=" -drive if=pflash,format=raw,file=OVMF32_VARS.fd"
|
||||
|
||||
if [ "x$MACHINE" = "x4S4CN270" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu n270-v1"
|
||||
QEMU_FLAGS+=" -smp 4,sockets=4,cores=1,maxcpus=4"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-3"
|
||||
|
||||
elif [ "x$MACHINE" = "x4S4CP3" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu pentium3-v1"
|
||||
QEMU_FLAGS+=" -smp 4,sockets=4,cores=1,maxcpus=4"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-3"
|
||||
|
||||
elif [ "x$MACHINE" = "x1S6CBroadwell" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Broadwell-v4"
|
||||
QEMU_FLAGS+=" -smp 6,sockets=1,cores=6,maxcpus=6"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-5"
|
||||
|
||||
elif [ "x$MACHINE" = "x1S1CAthlon" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu athlon-v1"
|
||||
QEMU_FLAGS+=" -smp 1,sockets=1,cores=1,maxcpus=1"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0"
|
||||
|
||||
fi
|
||||
|
||||
# Define offsets for loading of symbol-table
|
||||
IMAGEBASE=0x200000
|
||||
BASEOFCODE=0x1000
|
||||
|
@ -135,6 +135,152 @@ Init() {
|
||||
QEMU_FLAGS+=" -hda fat:rw:hda-contents -net none"
|
||||
QEMU_FLAGS+=" -drive if=pflash,format=raw,readonly=on,file=OVMF_CODE.fd"
|
||||
QEMU_FLAGS+=" -drive if=pflash,format=raw,file=OVMF_VARS.fd"
|
||||
# QEMU_FLAGS+=" -machine q35 -acpitable file=../ACPI/RS904A/hpet.dat:../ACPI/RS904A/srat_32c.dat:../ACPI/RS904A/apic_32c.dat:../ACPI/RS904A/slit.dat -smbios file=../SPD/dmidecode_rs904a2_dump.bin"
|
||||
# QEMU_FLAGS+=" -machine q35 -acpitable file=../ACPI/RS904A/apic_32c.dat:../ACPI/RS904A/slit.dat -smbios file=../SPD/dmidecode_rs904a2_dump.bin"
|
||||
QEMU_FLAGS+=" -machine q35" # q35,accel=kvm
|
||||
|
||||
if [ "x$MACHINE" = "x4S32CBulldozer" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Opteron_G4" # -cpu host Opteron_G5
|
||||
QEMU_FLAGS+=" -smp 32,sockets=4,cores=8,maxcpus=32"
|
||||
|
||||
MEMSIZE8=$((MEMSIZE / 8))
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m0,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m1,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m2,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m3,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m4,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m5,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m6,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE8}M,id=m7,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-3"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=1,memdev=m1,cpus=4-7"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=2,memdev=m2,cpus=8-11"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=3,memdev=m3,cpus=12-15"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=4,memdev=m4,cpus=16-19"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=5,memdev=m5,cpus=20-23"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=6,memdev=m6,cpus=24-27"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=7,memdev=m7,cpus=28-31"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=1,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=3,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=4,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=5,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=7,val=22"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=0,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=3,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=4,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=5,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=6,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=7,val=16"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=0,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=1,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=3,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=4,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=5,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=2,dst=7,val=16"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=0,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=1,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=4,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=5,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=3,dst=7,val=16"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=0,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=1,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=3,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=5,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=4,dst=7,val=22"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=0,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=1,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=2,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=3,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=4,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=6,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=5,dst=7,val=22"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=0,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=1,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=3,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=4,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=5,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=6,dst=7,val=16"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=0,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=1,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=2,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=3,val=16"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=4,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=5,val=22"
|
||||
QEMU_FLAGS+=" -numa dist,src=7,dst=6,val=16"
|
||||
|
||||
#[02Ch 0044 8] Locality 0 : 0A 10 10 16 10 16 10 16
|
||||
#[034h 0052 8] Locality 1 : 10 0A 10 16 16 10 16 10
|
||||
#[03Ch 0060 8] Locality 2 : 10 10 0A 10 10 16 10 10
|
||||
#[044h 0068 8] Locality 3 : 16 16 10 0A 16 10 10 10
|
||||
#[04Ch 0076 8] Locality 4 : 10 16 10 16 0A 10 10 16
|
||||
#[054h 0084 8] Locality 5 : 16 10 16 10 10 0A 10 16
|
||||
#[05Ch 0092 8] Locality 6 : 10 16 10 10 10 10 0A 10
|
||||
#[064h 0100 8] Locality 7 : 16 10 10 10 16 16 10 0A
|
||||
|
||||
elif [ "x$MACHINE" = "x2S12CWestmere" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Westmere-v2"
|
||||
QEMU_FLAGS+=" -smp 12,sockets=2,cores=6,maxcpus=12"
|
||||
|
||||
MEMSIZE2=$((MEMSIZE / 2))
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m0,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m1,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-5"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=1,memdev=m1,cpus=6-11"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=1,val=20"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=0,val=20"
|
||||
|
||||
elif [ "x$MACHINE" = "x2S2CPenryn" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Penryn-v1"
|
||||
QEMU_FLAGS+=" -smp 2,sockets=2,cores=1,maxcpus=2"
|
||||
|
||||
MEMSIZE2=$((MEMSIZE / 2))
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m0,prealloc=on"
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE2}M,id=m1,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0"
|
||||
QEMU_FLAGS+=" -numa node,nodeid=1,memdev=m1,cpus=1"
|
||||
|
||||
QEMU_FLAGS+=" -numa dist,src=0,dst=1,val=21"
|
||||
QEMU_FLAGS+=" -numa dist,src=1,dst=0,val=21"
|
||||
|
||||
elif [ "x$MACHINE" = "x1S6CBroadwell" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu Broadwell-v4"
|
||||
QEMU_FLAGS+=" -smp 6,sockets=1,cores=6,maxcpus=6"
|
||||
|
||||
QEMU_FLAGS+=" -object memory-backend-ram,size=${MEMSIZE}M,id=m0,prealloc=on"
|
||||
|
||||
QEMU_FLAGS+=" -numa node,nodeid=0,memdev=m0,cpus=0-5"
|
||||
|
||||
elif [ "x$MACHINE" = "x1S1CPhenom" ]; then
|
||||
|
||||
QEMU_FLAGS+=" -m ${MEMSIZE}M -cpu phenom-v1"
|
||||
QEMU_FLAGS+=" -smp 1,sockets=1,cores=1,maxcpus=1"
|
||||
|
||||
fi
|
||||
|
||||
# Define offsets for loading of symbol-table
|
||||
IMAGEBASE=0x200000
|
||||
|
@ -56,7 +56,7 @@ typedef union {
|
||||
typedef union {
|
||||
uint32_t raw[3];
|
||||
struct {
|
||||
uint32_t fpu : 1; // EDX feature flags, bit 0 */
|
||||
uint32_t fpu : 1; // EDX feature flags, bit 0
|
||||
uint32_t vme : 1;
|
||||
uint32_t de : 1;
|
||||
uint32_t pse : 1;
|
||||
@ -88,6 +88,7 @@ typedef union {
|
||||
uint32_t tm : 1;
|
||||
uint32_t bit30 : 1;
|
||||
uint32_t pbe : 1; // EDX feature flags, bit 31
|
||||
|
||||
uint32_t sse3 : 1; // ECX feature flags, bit 0
|
||||
uint32_t mulq : 1;
|
||||
uint32_t bit2 : 1;
|
||||
@ -99,7 +100,12 @@ typedef union {
|
||||
uint32_t tm2 : 1;
|
||||
uint32_t : 12; // ECX feature flags, bit 20
|
||||
uint32_t x2apic : 1;
|
||||
uint32_t : 10; // ECX feature flags, bit 31
|
||||
uint32_t : 4; // ECX feature flags, bit 22
|
||||
uint32_t xsave : 1;
|
||||
uint32_t osxsave : 1;
|
||||
uint32_t avx : 1;
|
||||
uint32_t : 3; // ECX feature flags, bit 31
|
||||
|
||||
uint32_t : 19; // EDX extended feature flags, bit 0
|
||||
uint32_t nx : 1;
|
||||
uint32_t : 9;
|
||||
|
304
system/memrw_simd.h
Normal file
304
system/memrw_simd.h
Normal file
@ -0,0 +1,304 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#ifndef MEMRW_SIMD_H
|
||||
#define MEMRW_SIMD_H
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
// -------------------- COMMON TO x86 & x86-64 --------------------
|
||||
|
||||
/**
 * Reads and returns the value stored in the 128-bit memory location pointed to by ptr.
 * ptr must be 16-byte aligned (movaps requires aligned operands).
 */
#pragma GCC target ("sse")
static inline __m128 read128_sse(const volatile __m128 *ptr)
{
    __m128 val;
    __asm__ __volatile__(
        "movaps %1, %0"         // aligned 128-bit packed-single load
        : "=x" (val)            // destination: any XMM register
        : "m" (*ptr)            // memory source; inline asm bypasses the volatile qualifier
        : "memory"              // keep ordering w.r.t. surrounding memory accesses
    );
    return val;
}
|
||||
|
||||
/**
 * Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand.
 * ptr must be 16-byte aligned (movaps requires aligned operands).
 */
#pragma GCC target ("sse")
static inline void write128_sse(const volatile __m128 *ptr, __m128 val)
{
    // NOTE(review): the written location appears only as an "m" *input*
    // operand; correctness relies on the "memory" clobber below. Expressing
    // it as an "=m" output would require dropping const from ptr -- confirm
    // before changing, since that alters the parameter type.
    __asm__ __volatile__(
        "movaps %1, %0"
        :
        : "m" (*ptr),
          "x" (val)             // "x": any XMM register
        : "memory"
    );
}
|
||||
|
||||
/**
 * Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand, using non-temporal hint.
 * ptr must be 16-byte aligned. The store bypasses the cache hierarchy;
 * the write loops issue __sync_synchronize() afterwards to order it.
 */
#pragma GCC target ("sse")
static inline void write128nt_sse(const volatile __m128 *ptr, __m128 val)
{
    __asm__ __volatile__(
        "movntps %1, %0"        // non-temporal (cache-bypassing) aligned store
        :
        : "m" (*ptr),           // NOTE(review): written location as input operand; relies on "memory" clobber
          "x" (val)
        : "memory"
    );
}
|
||||
|
||||
#pragma GCC target ("sse")
/**
 * Compares two 128-bit values lane-wise as four packed floats and returns a
 * 4-bit mask, one bit per 32-bit lane (0xF = all lanes equal).
 *
 * NOTE(review): cmpeq_ps is a floating-point compare, so a test pattern whose
 * bits encode a NaN compares unequal to itself and would be reported as a
 * false memory error. Confirm the patterns used can never encode NaNs (SSE1
 * offers no integer compare to avoid this).
 */
static inline int compare128_sse(__m128 val1, __m128 val2)
{
    return _mm_movemask_ps(_mm_cmpeq_ps(val1, val2));
}
|
||||
|
||||
/**
|
||||
* Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand.
|
||||
*/
|
||||
#pragma GCC target ("sse2")
|
||||
static inline void write128_sse2(const volatile __m128 *ptr, __m128 val)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
"movdqa %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"x" (val)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes val to the 128-bit memory location pointed to by ptr, using SSE register as source operand, using non-temporal hint.
|
||||
*/
|
||||
#pragma GCC target ("sse2")
|
||||
static inline void write128nt_sse2(const volatile __m128 *ptr, __m128 val)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
"movntdq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"x" (val)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Reads and returns the value stored in the 128-bit memory location pointed to by ptr.
 * SSE2 variant: uses the integer-domain movdqa instead of movaps.
 * ptr must be 16-byte aligned.
 */
#pragma GCC target ("sse2")
static inline __m128 read128_sse2(const volatile __m128 *ptr)
{
    __m128 val;
    __asm__ __volatile__(
        "movdqa %1, %0"         // aligned 128-bit integer load
        : "=x" (val)            // destination: any XMM register
        : "m" (*ptr)            // memory source; inline asm bypasses the volatile qualifier
        : "memory"              // keep ordering w.r.t. surrounding memory accesses
    );
    return val;
}
|
||||
|
||||
#pragma GCC target ("sse2")
/**
 * Compares two 128-bit values as two packed doubles and returns a 2-bit mask,
 * one bit per 64-bit lane (0x3 = both lanes equal).
 *
 * NOTE(review): cmpeq_pd is a floating-point compare; bit patterns encoding a
 * NaN compare unequal and would be flagged as false errors. An integer compare
 * (_mm_cmpeq_epi32 + _mm_movemask_epi8) would avoid this, but it changes the
 * mask value callers test against (COMPARE_TARGET) -- confirm before switching.
 */
static inline int compare128_sse2(__m128 val1, __m128 val2)
{
    return _mm_movemask_pd(_mm_cmpeq_pd((__m128d)val1, (__m128d)val2));
}
|
||||
|
||||
/**
 * Reads and returns the value stored in the 256-bit memory location pointed to by ptr.
 * (Fixed doc: this is the 256-bit AVX variant, not 128-bit.)
 * ptr must be 32-byte aligned (vmovdqa requires aligned operands).
 */
#pragma GCC target ("avx")
static inline __m256 read256_avx(const volatile __m256 *ptr)
{
    __m256 val;
    __asm__ __volatile__(
        "vmovdqa %1, %0"        // aligned 256-bit integer-domain load
        : "=x" (val)            // destination: any YMM register
        : "m" (*ptr)            // memory source; inline asm bypasses the volatile qualifier
        : "memory"              // keep ordering w.r.t. surrounding memory accesses
    );
    return val;
}
|
||||
|
||||
/**
|
||||
* Writes val to the 256-bit memory location pointed to by ptr, using AVX register as source operand.
|
||||
*/
|
||||
#pragma GCC target ("avx")
|
||||
static inline void write256_avx(const volatile __m256 *ptr, __m256 val)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
"vmovdqa %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"x" (val)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes val to the 256-bit memory location pointed to by ptr, using AVX register as source operand, using non-temporal hint.
|
||||
*/
|
||||
#pragma GCC target ("avx")
|
||||
static inline void write256nt_avx(const volatile __m256 *ptr, __m256 val)
|
||||
{
|
||||
__asm__ __volatile__(
|
||||
"vmovntdq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"x" (val)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
/**
 * Compares two 256-bit values as four packed doubles and returns a 4-bit mask,
 * one bit per 64-bit lane (0xF = all lanes equal).
 *
 * NOTE(review): predicate 0 is an ordered equality compare, so NaN-encoding
 * bit patterns compare unequal -- same false-error caveat as the SSE variants.
 */
static inline int compare256_avx(__m256 val1, __m256 val2)
{
    return _mm256_movemask_pd(_mm256_cmp_pd((__m256d)val1, (__m256d)val2, 0));
}
|
||||
|
||||
#endif
|
||||
|
||||
// -------------------- SEPARATE FOR x86 & x86-64 --------------------
|
||||
|
||||
#if defined(__i386__)
|
||||
|
||||
#if 0
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline __m64 convert_testword_to_simd64_mmx(uint32_t val)
|
||||
{
|
||||
return _mm_set1_pi32(val);
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand.
 * The 32-bit test word is broadcast into both halves of the MMX register.
 * Callers must execute "emms" after the MMX sequence before any FPU use
 * (the write loop does this).
 */
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
static inline void write64_mmx(const volatile uint32_t *ptr, uint32_t val)
{
    __m64 val2 = _mm_set1_pi32(val);    // duplicate val into both 32-bit halves
    // NOTE(review): "m"(*ptr) describes only 4 bytes while movq stores 8, and
    // the written location is an input operand; both rely on the "memory"
    // clobber to stay correct -- confirm intended.
    __asm__ __volatile__(
        "movq %1, %0"
        :
        : "m" (*ptr),
          "y" (val2)            // "y": any MMX register
        : "memory"
    );
}
|
||||
|
||||
/**
|
||||
* Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand, using non-temporal hint.
|
||||
*/
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline void write64nt_mmx(const volatile uint32_t *ptr, uint32_t val)
|
||||
{
|
||||
__m64 val2 = _mm_set1_pi32(val);
|
||||
__asm__ __volatile__(
|
||||
"movntq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"y" (val2)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#pragma GCC target ("sse")
/**
 * Broadcasts a 32-bit test word into all four lanes of an XMM register.
 * SSE1 has no integer set intrinsic, so the word is staged through an
 * aligned float array and loaded with movaps.
 */
static inline __m128 convert_testword_to_simd128_sse(uint32_t val)
{
    __attribute__((aligned(16))) float tmp[4];
    float * tmp2 = tmp;
    // NOTE(review): storing through (uint32_t *) into a float array violates
    // strict aliasing; confirm the build uses -fno-strict-aliasing or switch
    // to memcpy/union if not.
    *(uint32_t *)tmp2++ = val;
    *(uint32_t *)tmp2++ = val;
    *(uint32_t *)tmp2++ = val;
    *(uint32_t *)tmp2++ = val;
    return _mm_load_ps(tmp);    // aligned 128-bit load of the four copies
}
|
||||
|
||||
#pragma GCC target ("sse2")
|
||||
static inline __m128 convert_testword_to_simd128_sse2(uint32_t val)
|
||||
{
|
||||
return (__m128)_mm_set1_epi32(val);
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static inline __m256 convert_testword_to_simd256_avx(uint32_t val)
|
||||
{
|
||||
return (__m256)_mm256_set1_epi32(val);
|
||||
}
|
||||
|
||||
#elif defined(__x86_64__)
|
||||
|
||||
// XXX how to make this work without producing GCC error: 'SSE register return with SSE disabled' ?
|
||||
#if 0
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline __m64 convert_testword_to_simd64_mmx(uint64_t val)
|
||||
{
|
||||
return (__m64)val;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand.
|
||||
*/
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline void write64_mmx(const volatile uint64_t *ptr, uint64_t val)
|
||||
{
|
||||
__m64 val2 = (__m64)val;
|
||||
__asm__ __volatile__(
|
||||
"movq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"y" (val2)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes val to the 64-bit memory location pointed to by ptr, using MMX register as source operand, using non-temporal hint.
|
||||
*/
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static inline void write64nt_mmx(const volatile uint64_t *ptr, uint64_t val)
|
||||
{
|
||||
__m64 val2 = (__m64)val;
|
||||
__asm__ __volatile__(
|
||||
"movntq %1, %0"
|
||||
:
|
||||
: "m" (*ptr),
|
||||
"y" (val2)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#pragma GCC target ("sse")
|
||||
static inline __m128 convert_testword_to_simd128_sse(uint64_t val)
|
||||
{
|
||||
__attribute__((aligned(16))) float tmp[4];
|
||||
double * tmp2 = (double *)tmp;
|
||||
*(uint64_t *)tmp2++ = val;
|
||||
*(uint64_t *)tmp2++ = val;
|
||||
return _mm_load_ps(tmp);
|
||||
}
|
||||
|
||||
#pragma GCC target ("sse2")
|
||||
static inline __m128 convert_testword_to_simd128_sse2(uint64_t val)
|
||||
{
|
||||
return (__m128)_mm_set1_epi64x(val);
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static inline __m256 convert_testword_to_simd256_avx(uint64_t val)
|
||||
{
|
||||
return (__m256)_mm256_set1_epi64x(val);
|
||||
}
|
||||
|
||||
#endif // __i386__ or __x86_64__
|
||||
|
||||
#endif // MEMRW_SIMD_H
|
@ -28,12 +28,12 @@
|
||||
// Public Functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
int test_block_move(int my_cpu, int iterations)
|
||||
int test_block_move(int my_cpu, int iterations, int simd)
|
||||
{
|
||||
int ticks = 0;
|
||||
|
||||
if (my_cpu == master_cpu) {
|
||||
display_test_pattern_name("block move");
|
||||
display_test_pattern_names("block move", simd);
|
||||
}
|
||||
|
||||
// Initialize memory with the initial pattern.
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "display.h"
|
||||
#include "error.h"
|
||||
#include "test.h"
|
||||
#include "config.h"
|
||||
|
||||
#include "test_funcs.h"
|
||||
#include "test_helper.h"
|
||||
@ -30,19 +31,279 @@
|
||||
// Public Functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2)
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
|
||||
#include "memrw_simd.h"
|
||||
|
||||
// ==================== vvvvv MMX vvvvv ====================
|
||||
|
||||
#if 0
|
||||
// XXX how to make this work without producing GCC error: 'SSE register return with SSE disabled' ?
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * write_loops_simd64(testword_t *p, testword_t *pe, testword_t pattern1)
|
||||
{
|
||||
register __m64 mdpattern1 __asm__("%mm0") = convert_testword_to_simd64(pattern1);
|
||||
if (enable_nontemporal) {
|
||||
do {
|
||||
write64nt_simd((__m64 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 1 : 0;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
else {
|
||||
do {
|
||||
write64_simd((__m64 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 1 : 0;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
__asm__ __volatile__ ("emms");
|
||||
__sync_synchronize();
|
||||
return p;
|
||||
}
|
||||
#endif
|
||||
|
||||
#pragma GCC target ("mmx", "no-sse", "no-sse2", "no-avx")
/**
 * Fills the inclusive range [p, pe] with pattern1 using 64-bit MMX stores.
 * Uses non-temporal stores when enable_nontemporal is set.
 * Returns the pointer advanced one element past the last word written.
 */
static __attribute__((noinline)) testword_t * write_loops_simd64_mmx(testword_t *p, testword_t *pe, testword_t pattern1)
{
    if (enable_nontemporal) {
        do {
            write64nt_mmx(p, pattern1);
            // Each store covers 64 bits: skip one extra word on 32-bit builds.
            p += (sizeof(*p) < 8) ? 1 : 0;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    else {
        do {
            write64_mmx(p, pattern1);
            p += (sizeof(*p) < 8) ? 1 : 0;
        } while (p++ < pe); // test before increment in case pointer overflows
    }
    __asm__ __volatile__ ("emms");  // leave MMX state so the FPU is usable again
    __sync_synchronize();           // full barrier: order the (non-temporal) stores
    return p;
}
|
||||
|
||||
// ? TODO ? read1_loops_simd64
|
||||
// ? TODO ? read2_loops_simd64
|
||||
|
||||
// ==================== ^^^^^ MMX ^^^^^ ====================
|
||||
|
||||
|
||||
// ==================== vvvvv SSE vvvvv ====================
|
||||
|
||||
#pragma GCC target ("sse", "no-sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * write_loops_simd128_sse(testword_t *p, testword_t *pe, testword_t pattern1)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse(pattern1);
|
||||
if (enable_nontemporal) {
|
||||
do {
|
||||
write128nt_sse((__m128 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
else {
|
||||
do {
|
||||
write128_sse((__m128 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
__sync_synchronize();
|
||||
return p;
|
||||
}
|
||||
|
||||
#define COMPARE_TARGET 0xF
|
||||
|
||||
#pragma GCC target ("sse", "no-sse2", "no-avx")
/**
 * Moving-inversions read pass (ascending): for each 128-bit chunk in [p, pe],
 * checks it still holds pattern1, then overwrites it with pattern2.
 * Lane mismatches are reported via data_error_wide.
 * Returns the pointer advanced past the last chunk processed.
 */
static __attribute__((noinline)) testword_t * read1_loops_simd128_sse(testword_t *p, testword_t *pe, testword_t pattern1, testword_t pattern2)
{
    __m128 mdpattern1 = convert_testword_to_simd128_sse(pattern1);
    __m128 mdpattern2 = convert_testword_to_simd128_sse(pattern2);
    do {
        __m128 actual = read128_sse((__m128 *)p);
        int compar_result = compare128_sse(mdpattern1, actual);
        // Write the next pattern before branching on the compare result.
        write128_sse((__m128 *)p, mdpattern2);
        if (unlikely(compar_result != COMPARE_TARGET)) {   // COMPARE_TARGET = all 4 lanes equal
            // Spill the SIMD values so the error reporter can walk them lane by lane.
            __m128 good = mdpattern1;
            __m128 bad = actual;
            data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
        }
        // 128 bits per iteration: 4 words on 32-bit builds, 2 on 64-bit.
        p += (sizeof(*p) < 8) ? 3 : 1;
    } while (p++ < pe); // test before increment in case pointer overflows
    return p;
}
|
||||
|
||||
#pragma GCC target ("sse", "no-sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * read2_loops_simd128_sse(testword_t *p, testword_t *ps, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse(pattern1);
|
||||
__m128 mdpattern2 = convert_testword_to_simd128_sse(pattern2);
|
||||
do {
|
||||
__m128 actual = read128_sse((__m128 *)p);
|
||||
int compar_result = compare128_sse(mdpattern2, actual);
|
||||
write128_sse((__m128 *)p, mdpattern1);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m128 good = mdpattern2;
|
||||
__m128 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p -= (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#undef COMPARE_TARGET
|
||||
|
||||
// ==================== ^^^^^ SSE ^^^^^ ====================
|
||||
|
||||
|
||||
// ==================== vvvvv SSE2 vvvvv ====================
|
||||
|
||||
#pragma GCC target ("sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * write_loops_simd128_sse2(testword_t *p, testword_t *pe, testword_t pattern1)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse2(pattern1);
|
||||
if (enable_nontemporal) {
|
||||
do {
|
||||
write128nt_sse2((__m128 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
else {
|
||||
do {
|
||||
write128_sse2((__m128 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
__sync_synchronize();
|
||||
return p;
|
||||
}
|
||||
|
||||
#define COMPARE_TARGET 0x3
|
||||
|
||||
#pragma GCC target ("sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * read1_loops_simd128_sse2(testword_t *p, testword_t *pe, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse2(pattern1);
|
||||
__m128 mdpattern2 = convert_testword_to_simd128_sse2(pattern2);
|
||||
do {
|
||||
__m128 actual = read128_sse2((__m128 *)p);
|
||||
int compar_result = compare128_sse2(mdpattern1, actual);
|
||||
write128_sse2((__m128 *)p, mdpattern2);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m128 good = mdpattern1;
|
||||
__m128 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p += (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#pragma GCC target ("sse2", "no-avx")
|
||||
static __attribute__((noinline)) testword_t * read2_loops_simd128_sse2(testword_t *p, testword_t *ps, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m128 mdpattern1 = convert_testword_to_simd128_sse2(pattern1);
|
||||
__m128 mdpattern2 = convert_testword_to_simd128_sse2(pattern2);
|
||||
do {
|
||||
__m128 actual = read128_sse2((__m128 *)p);
|
||||
int compar_result = compare128_sse2(mdpattern2, actual);
|
||||
write128_sse2((__m128 *)p, mdpattern1);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m128 good = mdpattern2;
|
||||
__m128 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 128 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p -= (sizeof(*p) < 8) ? 3 : 1;
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#undef COMPARE_TARGET
|
||||
|
||||
// ==================== ^^^^^ SSE2 ^^^^^ ====================
|
||||
|
||||
|
||||
// ==================== vvvvv AVX vvvvv ====================
|
||||
|
||||
#define COMPARE_TARGET 0xF
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static __attribute__((noinline)) testword_t * write_loops_simd256_avx(testword_t *p, testword_t *pe, testword_t pattern1)
|
||||
{
|
||||
__m256 mdpattern1 = convert_testword_to_simd256_avx(pattern1);
|
||||
if (enable_nontemporal) {
|
||||
do {
|
||||
write256nt_avx((__m256 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 7 : 3;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
else {
|
||||
do {
|
||||
write256_avx((__m256 *)p, mdpattern1);
|
||||
p += (sizeof(*p) < 8) ? 7 : 3;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
__sync_synchronize();
|
||||
return p;
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static __attribute__((noinline)) testword_t * read1_loops_simd256_avx(testword_t *p, testword_t *pe, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m256 mdpattern1 = convert_testword_to_simd256_avx(pattern1);
|
||||
__m256 mdpattern2 = convert_testword_to_simd256_avx(pattern2);
|
||||
do {
|
||||
__m256 actual = read256_avx((__m256 *)p);
|
||||
int compar_result = compare256_avx(mdpattern1, actual);
|
||||
write256_avx((__m256 *)p, mdpattern2);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m256 good = mdpattern1;
|
||||
__m256 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 256 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p += (sizeof(*p) < 8) ? 7 : 3;
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#pragma GCC target ("avx")
|
||||
static __attribute__((noinline)) testword_t * read2_loops_simd256_avx(testword_t *p, testword_t *ps, testword_t pattern1, testword_t pattern2)
|
||||
{
|
||||
__m256 mdpattern1 = convert_testword_to_simd256_avx(pattern1);
|
||||
__m256 mdpattern2 = convert_testword_to_simd256_avx(pattern2);
|
||||
do {
|
||||
__m256 actual = read256_avx((__m256 *)p);
|
||||
int compar_result = compare256_avx(mdpattern2, actual);
|
||||
write256_avx((__m256 *)p, mdpattern1);
|
||||
if (unlikely(compar_result != COMPARE_TARGET)) {
|
||||
__m256 good = mdpattern2;
|
||||
__m256 bad = actual;
|
||||
data_error_wide(p, (testword_t *)&good, (testword_t *)&bad, 256 / (8 * sizeof(*p)), true);
|
||||
}
|
||||
p -= (sizeof(*p) < 8) ? 7 : 3;
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
return p;
|
||||
}
|
||||
|
||||
#undef COMPARE_TARGET
|
||||
|
||||
// ==================== ^^^^^ AVX ^^^^^ ====================
|
||||
|
||||
#endif
|
||||
|
||||
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2, int simd)
|
||||
{
|
||||
int ticks = 0;
|
||||
|
||||
size_t chunk_align = simd == 1 ? 64/8 : ((simd == 2 || simd == 3) ? 128/8 : (simd == 4 ? 256/8 : sizeof(testword_t)));
|
||||
if (my_cpu == master_cpu) {
|
||||
display_test_pattern_value(pattern1);
|
||||
display_test_pattern_values(pattern1, simd);
|
||||
}
|
||||
|
||||
// Initialize memory with the initial pattern.
|
||||
for (int i = 0; i < vm_map_size; i++) {
|
||||
testword_t *start, *end;
|
||||
calculate_chunk(&start, &end, my_cpu, i, sizeof(testword_t));
|
||||
if (end < start) SKIP_RANGE(1) // we need at least one word for this test
|
||||
calculate_chunk(&start, &end, my_cpu, i, chunk_align);
|
||||
__asm__ volatile("nop");
|
||||
if (end < start) SKIP_RANGE(1) // we need enough words for this test
|
||||
|
||||
testword_t *p = start;
|
||||
testword_t *pe = start;
|
||||
@ -60,46 +321,64 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
if (my_cpu < 0) {
|
||||
continue;
|
||||
}
|
||||
//do_trace(0, "W p %016x -> pe %016x", (uintptr_t)p, (uintptr_t) pe);
|
||||
test_addr[my_cpu] = (uintptr_t)p;
|
||||
if (!simd || ((end - start) < (int)((32/8 << simd) / sizeof(testword_t)) - 1)) {
|
||||
#if HAND_OPTIMISED
|
||||
#if defined(__x86_64__)
|
||||
uint64_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"rep \n\t"
|
||||
"stosq \n\t"
|
||||
:
|
||||
: "c" (length), "D" (p), "a" (pattern1)
|
||||
:
|
||||
);
|
||||
p = pe;
|
||||
uint64_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"rep \n\t"
|
||||
"stosq \n\t"
|
||||
:
|
||||
: "c" (length), "D" (p), "a" (pattern1)
|
||||
:
|
||||
);
|
||||
p = pe;
|
||||
#elif defined(__i386__)
|
||||
uint32_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"rep \n\t"
|
||||
"stosl \n\t"
|
||||
:
|
||||
: "c" (length), "D" (p), "a" (pattern1)
|
||||
:
|
||||
);
|
||||
p = pe;
|
||||
uint32_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"rep \n\t"
|
||||
"stosl \n\t"
|
||||
:
|
||||
: "c" (length), "D" (p), "a" (pattern1)
|
||||
:
|
||||
);
|
||||
p = pe;
|
||||
#elif defined(__loongarch_lp64)
|
||||
uint64_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"loop: \n\t"
|
||||
"st.d %2, %1, 0x0 \n\t"
|
||||
"addi.d %1, %1, 0x8 \n\t"
|
||||
"addi.d %0, %0, -0x1 \n\t"
|
||||
"bnez %0, loop \n\t"
|
||||
:
|
||||
: "r" (length), "r" (p), "r" (pattern1)
|
||||
: "memory"
|
||||
);
|
||||
p = pe;
|
||||
uint64_t length = pe - p + 1;
|
||||
__asm__ __volatile__ ("\t"
|
||||
"loop: \n\t"
|
||||
"st.d %2, %1, 0x0 \n\t"
|
||||
"addi.d %1, %1, 0x8 \n\t"
|
||||
"addi.d %0, %0, -0x1 \n\t"
|
||||
"bnez %0, loop \n\t"
|
||||
:
|
||||
: "r" (length), "r" (p), "r" (pattern1)
|
||||
: "memory"
|
||||
);
|
||||
p = pe;
|
||||
#endif
|
||||
#else
|
||||
do {
|
||||
write_word(p, pattern1);
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
do {
|
||||
write_word(p, pattern1);
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
#endif
|
||||
}
|
||||
// SIMD code paths
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
else if (simd == 1) {
|
||||
p = write_loops_simd64_mmx(p, pe, pattern1);
|
||||
}
|
||||
else if (simd == 2) {
|
||||
p = write_loops_simd128_sse(p, pe, pattern1);
|
||||
}
|
||||
else if (simd == 3) {
|
||||
p = write_loops_simd128_sse2(p, pe, pattern1);
|
||||
}
|
||||
else if (simd == 4) {
|
||||
p = write_loops_simd256_avx(p, pe, pattern1);
|
||||
}
|
||||
#endif
|
||||
do_tick(my_cpu);
|
||||
BAILOUT;
|
||||
@ -113,8 +392,8 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
|
||||
for (int j = 0; j < vm_map_size; j++) {
|
||||
testword_t *start, *end;
|
||||
calculate_chunk(&start, &end, my_cpu, j, sizeof(testword_t));
|
||||
if (end < start) SKIP_RANGE(1) // we need at least one word for this test
|
||||
calculate_chunk(&start, &end, my_cpu, j, chunk_align);
|
||||
if (end < start) SKIP_RANGE(1) // we need enough words for this test
|
||||
|
||||
testword_t *p = start;
|
||||
testword_t *pe = start;
|
||||
@ -132,14 +411,29 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
if (my_cpu < 0) {
|
||||
continue;
|
||||
}
|
||||
//do_trace(0, "R1 p %016x -> pe %016x", (uintptr_t)p, (uintptr_t)pe);
|
||||
test_addr[my_cpu] = (uintptr_t)p;
|
||||
do {
|
||||
testword_t actual = read_word(p);
|
||||
if (unlikely(actual != pattern1)) {
|
||||
data_error(p, pattern1, actual, true);
|
||||
}
|
||||
write_word(p, pattern2);
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
if (simd == 0 || simd == 1 || ((end - start) < (int)((32/8 << simd) / sizeof(testword_t)) - 1)) {
|
||||
do {
|
||||
testword_t actual = read_word(p);
|
||||
if (unlikely(actual != pattern1)) {
|
||||
data_error(p, pattern1, actual, true);
|
||||
}
|
||||
write_word(p, pattern2);
|
||||
} while (p++ < pe); // test before increment in case pointer overflows
|
||||
}
|
||||
// SIMD code paths
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
else if (simd == 2) {
|
||||
p = read1_loops_simd128_sse(p, pe, pattern1, pattern2);
|
||||
}
|
||||
else if (simd == 3) {
|
||||
p = read1_loops_simd128_sse2(p, pe, pattern1, pattern2);
|
||||
}
|
||||
else if (simd == 4) {
|
||||
p = read1_loops_simd256_avx(p, pe, pattern1, pattern2);
|
||||
}
|
||||
#endif
|
||||
do_tick(my_cpu);
|
||||
BAILOUT;
|
||||
} while (!at_end && ++pe); // advance pe to next start point
|
||||
@ -149,8 +443,8 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
|
||||
for (int j = vm_map_size - 1; j >= 0; j--) {
|
||||
testword_t *start, *end;
|
||||
calculate_chunk(&start, &end, my_cpu, j, sizeof(testword_t));
|
||||
if (end < start) SKIP_RANGE(1) // we need at least one word for this test
|
||||
calculate_chunk(&start, &end, my_cpu, j, chunk_align);
|
||||
if (end < start) SKIP_RANGE(1) // we need enough words for this test
|
||||
|
||||
testword_t *p = end;
|
||||
testword_t *ps = end;
|
||||
@ -168,14 +462,29 @@ int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword
|
||||
if (my_cpu < 0) {
|
||||
continue;
|
||||
}
|
||||
//do_trace(0, "R2 ps %016x -> p %016x", (uintptr_t)ps, (uintptr_t)p);
|
||||
test_addr[my_cpu] = (uintptr_t)p;
|
||||
do {
|
||||
testword_t actual = read_word(p);
|
||||
if (unlikely(actual != pattern2)) {
|
||||
data_error(p, pattern2, actual, true);
|
||||
}
|
||||
write_word(p, pattern1);
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
if (simd == 0 || simd == 1 || ((end - start) < (int)((32/8 << simd) / sizeof(testword_t)) - 1)) {
|
||||
do {
|
||||
testword_t actual = read_word(p);
|
||||
if (unlikely(actual != pattern2)) {
|
||||
data_error(p, pattern2, actual, true);
|
||||
}
|
||||
write_word(p, pattern1);
|
||||
} while (p-- > ps); // test before decrement in case pointer overflows
|
||||
}
|
||||
// SIMD code paths
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
else if (simd == 2) {
|
||||
p = read2_loops_simd128_sse((testword_t *)((uintptr_t)p & ~(uintptr_t)0xF), ps, pattern1, pattern2);
|
||||
}
|
||||
else if (simd == 3) {
|
||||
p = read2_loops_simd128_sse2((testword_t *)((uintptr_t)p & ~(uintptr_t)0xF), ps, pattern1, pattern2);
|
||||
}
|
||||
else if (simd == 4) {
|
||||
p = read2_loops_simd256_avx((testword_t *)((uintptr_t)p & ~(uintptr_t)0x1F), ps, pattern1, pattern2);
|
||||
}
|
||||
#endif
|
||||
do_tick(my_cpu);
|
||||
BAILOUT;
|
||||
} while (!at_start && --ps); // advance ps to next start point
|
||||
|
@ -21,7 +21,7 @@ int test_own_addr1(int my_cpu);
|
||||
|
||||
int test_own_addr2(int my_cpu, int stage);
|
||||
|
||||
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2);
|
||||
int test_mov_inv_fixed(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2, int simd);
|
||||
|
||||
int test_mov_inv_walk1(int my_cpu, int iterations, int offset, bool inverse);
|
||||
|
||||
@ -29,7 +29,7 @@ int test_mov_inv_random(int my_cpu);
|
||||
|
||||
int test_modulo_n(int my_cpu, int iterations, testword_t pattern1, testword_t pattern2, int n, int offset);
|
||||
|
||||
int test_block_move(int my_cpu, int iterations);
|
||||
int test_block_move(int my_cpu, int iterations, int simd);
|
||||
|
||||
int test_bit_fade(int my_cpu, int stage, int sleep_secs);
|
||||
|
||||
|
@ -112,6 +112,21 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
testword_t prsg_state;
|
||||
|
||||
int ticks = 0;
|
||||
int simd = 0;
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (cpuid_info.flags.mmx) {
|
||||
simd++;
|
||||
if (cpuid_info.flags.sse) {
|
||||
simd++;
|
||||
if (cpuid_info.flags.sse2) {
|
||||
simd++;
|
||||
if (cpuid_info.flags.avx) {
|
||||
simd++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
switch (test) {
|
||||
// Address test, walking ones.
|
||||
@ -140,11 +155,11 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
testword_t pattern2 = ~pattern1;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2);
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2, 0);
|
||||
BAILOUT;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1);
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1, 0);
|
||||
BAILOUT;
|
||||
} break;
|
||||
|
||||
@ -159,11 +174,11 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
testword_t pattern2 = ~pattern1;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2);
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern1, pattern2, 0);
|
||||
BAILOUT;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1);
|
||||
ticks += test_mov_inv_fixed(my_cpu, iterations, pattern2, pattern1, 0);
|
||||
BAILOUT;
|
||||
|
||||
pattern1 >>= 1;
|
||||
@ -171,6 +186,7 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
} break;
|
||||
|
||||
// Moving inversions, fixed random pattern.
|
||||
// SIMD variants after rep stos[lq] variant.
|
||||
case 5:
|
||||
if (cpuid_info.flags.rdtsc) {
|
||||
prsg_state = get_tsc();
|
||||
@ -179,15 +195,19 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
}
|
||||
prsg_state *= 0x12345678;
|
||||
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
prsg_state = prsg(prsg_state);
|
||||
{
|
||||
for (int j = 0; j <= simd; j++) {
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
prsg_state = prsg(prsg_state);
|
||||
|
||||
testword_t pattern1 = prsg_state;
|
||||
testword_t pattern2 = ~pattern1;
|
||||
testword_t pattern1 = prsg_state;
|
||||
testword_t pattern2 = ~pattern1;
|
||||
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, 2, pattern1, pattern2);
|
||||
BAILOUT;
|
||||
BARRIER;
|
||||
ticks += test_mov_inv_fixed(my_cpu, 2, pattern1, pattern2, j);
|
||||
BAILOUT;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@ -206,7 +226,11 @@ int run_test(int my_cpu, int test, int stage, int iterations)
|
||||
|
||||
// Block move.
|
||||
case 7:
|
||||
ticks += test_block_move(my_cpu, iterations);
|
||||
{
|
||||
for (int j = 0; j <= simd; j++) {
|
||||
ticks += test_block_move(my_cpu, iterations, simd);
|
||||
}
|
||||
}
|
||||
BAILOUT;
|
||||
break;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user