[CPU] Fixed has_subnormals behavior for negative zero values (#19360)

This commit is contained in:
Gorokhov Dmitriy 2023-08-29 09:53:05 +04:00 committed by GitHub
parent 82afb47e36
commit d32b6904bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 37 additions and 20 deletions

View File

@ -92,19 +92,20 @@ protected:
pop(rsi);
}
// Emits AVX2 code that tests the 8 packed 32-bit values at [src] for subnormal floats.
// A lane is treated as subnormal when (lane & exponent_mask) == 0 and (lane & mantissa_mask) != 0.
// The result is returned in the carry flag set by the final vptest:
//   CF = 1 -> no lane matched, CF = 0 -> at least one lane matched.
// ymm1, ymm2 and ymm3 are used as scratch registers.
// NOTE(review): assumes `zero` holds all-zero bits and that the two masks together do not cover
// the sign bit, so that -0.0 is not reported as subnormal - confirm against the callers.
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &mask, const Xbyak::Ymm &zero) {
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &exponent_mask, const Xbyak::Ymm &mantissa_mask, const Xbyak::Ymm &zero) {
auto a = ymm1;
auto b = ymm2;
auto c = ymm3;
vmovdqu(a, yword[src]); // load 8 floats (unaligned 256-bit load)
vpcmpeqd(b, a, zero); // per lane: b = 0xFFFFFFFF if (a == 0) else 0
vpand(c, a, mask); // c = a & 01111111100000000000000000000000
vpand(b, a, mantissa_mask); // b = a & 00000000011111111111111111111111 (mantissa bits only)
vpcmpeqd(b, b, zero); // per lane: b = 0xFFFFFFFF if (mantissa == 0) else 0
vpand(c, a, exponent_mask); // c = a & 01111111100000000000000000000000 (exponent bits only)
vpcmpeqd(c, c, zero); // per lane: c = 0xFFFFFFFF if (exponent == 0) else 0
vptest(b, c); // CF = 1 if ((~b & c) == 0), i.e. no lane has zero exponent with non-zero mantissa; else CF = 0
}
// 128-bit counterpart of the subnormal check: tests the 4 packed 32-bit values at [src].
// A lane is treated as subnormal when (lane & exponent_mask) == 0 and (lane & mantissa_mask) != 0.
// The result is returned in the carry flag set by the final test instruction:
//   CF = 1 -> no lane matched, CF = 0 -> at least one lane matched.
// xmm1, xmm2 and xmm3 are used as scratch registers; the value is copied into b and c first
// because the and/compare steps below are written in destructive two-operand form (dst == src1).
// NOTE(review): assumes `zero` holds all-zero bits - confirm against the callers.
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &mask, const Xbyak::Xmm &zero) {
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &exponent_mask, const Xbyak::Xmm &mantissa_mask, const Xbyak::Xmm &zero) {
auto a = xmm1;
auto b = xmm2;
auto c = xmm3;
@ -112,8 +113,9 @@ protected:
uni_vmovdqu(a, xword[src]); // load 4 floats
uni_vmovdqu(b, a); // b = a
uni_vmovdqu(c, a); // c = a
uni_vpcmpeqd(b, b, zero); // per lane: b = 0xFFFFFFFF if (a == 0) else 0
uni_vpand(c, c, mask); // c = a & 01111111100000000000000000000000
uni_vpand(b, b, mantissa_mask); // b = a & 00000000011111111111111111111111 (mantissa bits only)
uni_vpcmpeqd(b, b, zero); // per lane: b = 0xFFFFFFFF if (mantissa == 0) else 0
uni_vpand(c, c, exponent_mask); // c = a & 01111111100000000000000000000000 (exponent bits only)
uni_vpcmpeqd(c, c, zero); // per lane: c = 0xFFFFFFFF if (exponent == 0) else 0
uni_vtestps(b, c); // CF = 1 if ((~b & c) == 0), i.e. no lane has zero exponent with non-zero mantissa; else CF = 0
}
@ -130,12 +132,18 @@ protected:
const Reg64 &reg_idx = rsi;
const Reg64 &reg_mask_addr = r15;
static const uint32_t mask_data[8];
static const uint32_t exponent_mask_data[8];
static const uint32_t mantissa_mask_data[8];
};
const uint32_t jit_has_subnormals_base::mask_data[8] = {
0xFF << 23, 0xFF << 23, 0xFF << 23, 0xFF << 23,
0xFF << 23, 0xFF << 23, 0xFF << 23, 0xFF << 23
// Per-lane mask selecting bits 23..30 of a 32-bit value (0x7f800000 == 0xFF << 23),
// i.e. the exponent field of a single-precision float; the sign bit (bit 31) is excluded.
// Eight identical entries so that one 256-bit load fills every lane of a vector register.
const uint32_t jit_has_subnormals_base::exponent_mask_data[8] = {
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000,
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
};
// Per-lane mask selecting bits 0..22 of a 32-bit value, i.e. the mantissa (fraction) field
// of a single-precision float. Together with the exponent mask this covers every bit except the sign.
const uint32_t jit_has_subnormals_base::mantissa_mask_data[8] = {
0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff,
0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
};
template<>
@ -143,6 +151,7 @@ struct jit_has_subnormals_base::reg<cpu_isa_t::avx2> {
constexpr static uint32_t length = 8;
constexpr static const Xbyak::Ymm & rmm4 = Xbyak::util::ymm4;
constexpr static const Xbyak::Ymm & rmm5 = Xbyak::util::ymm5;
constexpr static const Xbyak::Ymm & rmm6 = Xbyak::util::ymm6;
};
template<>
@ -150,6 +159,7 @@ struct jit_has_subnormals_base::reg<cpu_isa_t::sse41> {
constexpr static uint32_t length = 4;
constexpr static const Xbyak::Xmm & rmm4 = Xbyak::util::xmm4;
constexpr static const Xbyak::Xmm & rmm5 = Xbyak::util::xmm5;
constexpr static const Xbyak::Xmm & rmm6 = Xbyak::util::xmm6;
};
template<cpu_isa_t isa>
@ -159,7 +169,8 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
const int sh_bits = std::ilogb(vlen);
auto zero = reg<isa>::rmm4;
auto mask = reg<isa>::rmm5;
auto exponent_mask = reg<isa>::rmm5;
auto mantissa_mask = reg<isa>::rmm6;
preamble();
@ -167,11 +178,13 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
mov(reg_src, ptr[param1 + offsetof(args_t, src)]);
lea(reg_dst, ptr[param1 + offsetof(args_t, hasSubnormals)]);
mov(reg_sz, ptr[param1 + offsetof(args_t, count)]);
mov(reg_mask_addr, (size_t)mask_data);
// Initialize necessary consts
uni_vpxor(zero, zero, zero);
uni_vmovdqu(mask, ptr[reg_mask_addr]);
mov(reg_mask_addr, (size_t)exponent_mask_data);
uni_vmovdqu(exponent_mask, ptr[reg_mask_addr]);
mov(reg_mask_addr, (size_t)mantissa_mask_data);
uni_vmovdqu(mantissa_mask, ptr[reg_mask_addr]);
// Main loop
xor_(reg_idx, reg_idx);
@ -179,7 +192,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
shr(r8, sh_bits);
foreach(reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) {
check_subnormals(reg_src, mask, zero);
check_subnormals(reg_src, exponent_mask, mantissa_mask, zero);
jnc(has_subnormals);
add(reg_src, sizeof(float) * vlen);
});
@ -197,7 +210,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
uni_vmovdqu(ptr[r8], zero);
copy_floats(r8, reg_src, reg_sz);
check_subnormals(r8, mask, zero);
check_subnormals(r8, exponent_mask, mantissa_mask, zero);
jc(no_subnormals);
add(rsp, vlen * sizeof(float));
@ -331,8 +344,10 @@ void Input::cloneBlobIfRequired() {
}
#endif
uint32_t mantissaMask = 0x007fffff;
uint32_t exponentMask = 0x7f800000;
for (size_t i = 0; i < size; ++i) {
if (u32data[i] && (u32data[i] & (0xFF << 23)) == 0) {
if ((u32data[i] & exponentMask) == 0 && (u32data[i] & mantissaMask) != 0) {
return true;
}
}

View File

@ -43,10 +43,11 @@ void validate() override {
ASSERT_FALSE(actualOutputs.empty());
auto& outTensor = actualOutputs.front();
ASSERT_EQ(ov::element::f32, outTensor.get_element_type()) << "Unexpected element type";
const uint32_t* data = reinterpret_cast<const uint32_t*>(outTensor.data());
const float* data = reinterpret_cast<const float*>(outTensor.data());
bool hasDenormals = false;
for (size_t i = 0; i < outTensor.get_size(); ++i) {
if (data[i] && (data[i] & (0xff << 23)) == 0) {
if (std::abs(data[i]) >= std::numeric_limits<float>::denorm_min() &&
std::abs(data[i]) < std::numeric_limits<float>::min()) {
hasDenormals = true;
}
}
@ -108,4 +109,5 @@ TEST_F(DenormalNullifyCheck, smoke_CPU_Denormal_Check) {
run();
}
}
}// namespace SubgraphTestsDefinitions