[CPU] Fixed has_subnormals behavior for negative zero values (#19360)

This commit is contained in:
Gorokhov Dmitriy 2023-08-29 09:53:05 +04:00 committed by GitHub
parent 82afb47e36
commit d32b6904bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 37 additions and 20 deletions

View File

@@ -92,19 +92,20 @@ protected:
pop(rsi); pop(rsi);
} }
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &mask, const Xbyak::Ymm &zero) { void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &exponent_mask, const Xbyak::Ymm &mantissa_mask, const Xbyak::Ymm &zero) {
auto a = ymm1; auto a = ymm1;
auto b = ymm2; auto b = ymm2;
auto c = ymm3; auto c = ymm3;
vmovdqu(a, yword[src]); // load 8 floats vmovdqu(a, yword[src]); // load 8 floats
vpcmpeqd(b, a, zero); // if (a == 0) b = 1 else b = 0 vpand(b, a, mantissa_mask); // b = a & 00000000011111111111111111111111
vpand(c, a, mask); // c = a & 01111111100000000000000000000000 vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0
vpand(c, a, exponent_mask); // c = a & 01111111100000000000000000000000
vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
vptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 vptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
} }
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &mask, const Xbyak::Xmm &zero) { void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &exponent_mask, const Xbyak::Xmm &mantissa_mask, const Xbyak::Xmm &zero) {
auto a = xmm1; auto a = xmm1;
auto b = xmm2; auto b = xmm2;
auto c = xmm3; auto c = xmm3;
@@ -112,8 +113,9 @@ protected:
uni_vmovdqu(a, xword[src]); // load 4 floats uni_vmovdqu(a, xword[src]); // load 4 floats
uni_vmovdqu(b, a); // b = a uni_vmovdqu(b, a); // b = a
uni_vmovdqu(c, a); // c = a uni_vmovdqu(c, a); // c = a
uni_vpcmpeqd(b, b, zero); // if (a == 0) b = 1 else b = 0 uni_vpand(b, b, mantissa_mask); // b = a & 00000000011111111111111111111111
uni_vpand(c, c, mask); // c = a & 01111111100000000000000000000000 uni_vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0
uni_vpand(c, c, exponent_mask); // c = a & 01111111100000000000000000000000
uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
} }
@@ -130,12 +132,18 @@ protected:
const Reg64 &reg_idx = rsi; const Reg64 &reg_idx = rsi;
const Reg64 &reg_mask_addr = r15; const Reg64 &reg_mask_addr = r15;
static const uint32_t mask_data[8]; static const uint32_t exponent_mask_data[8];
static const uint32_t mantissa_mask_data[8];
}; };
const uint32_t jit_has_subnormals_base::mask_data[8] = { const uint32_t jit_has_subnormals_base::exponent_mask_data[8] = {
0xFF << 23, 0xFF << 23, 0xFF << 23, 0xFF << 23, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000,
0xFF << 23, 0xFF << 23, 0xFF << 23, 0xFF << 23 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
};
const uint32_t jit_has_subnormals_base::mantissa_mask_data[8] = {
0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff,
0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
}; };
template<> template<>
@@ -143,6 +151,7 @@ struct jit_has_subnormals_base::reg<cpu_isa_t::avx2> {
constexpr static uint32_t length = 8; constexpr static uint32_t length = 8;
constexpr static const Xbyak::Ymm & rmm4 = Xbyak::util::ymm4; constexpr static const Xbyak::Ymm & rmm4 = Xbyak::util::ymm4;
constexpr static const Xbyak::Ymm & rmm5 = Xbyak::util::ymm5; constexpr static const Xbyak::Ymm & rmm5 = Xbyak::util::ymm5;
constexpr static const Xbyak::Ymm & rmm6 = Xbyak::util::ymm6;
}; };
template<> template<>
@@ -150,6 +159,7 @@ struct jit_has_subnormals_base::reg<cpu_isa_t::sse41> {
constexpr static uint32_t length = 4; constexpr static uint32_t length = 4;
constexpr static const Xbyak::Xmm & rmm4 = Xbyak::util::xmm4; constexpr static const Xbyak::Xmm & rmm4 = Xbyak::util::xmm4;
constexpr static const Xbyak::Xmm & rmm5 = Xbyak::util::xmm5; constexpr static const Xbyak::Xmm & rmm5 = Xbyak::util::xmm5;
constexpr static const Xbyak::Xmm & rmm6 = Xbyak::util::xmm6;
}; };
template<cpu_isa_t isa> template<cpu_isa_t isa>
@@ -159,7 +169,8 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
const int sh_bits = std::ilogb(vlen); const int sh_bits = std::ilogb(vlen);
auto zero = reg<isa>::rmm4; auto zero = reg<isa>::rmm4;
auto mask = reg<isa>::rmm5; auto exponent_mask = reg<isa>::rmm5;
auto mantissa_mask = reg<isa>::rmm6;
preamble(); preamble();
@@ -167,11 +178,13 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
mov(reg_src, ptr[param1 + offsetof(args_t, src)]); mov(reg_src, ptr[param1 + offsetof(args_t, src)]);
lea(reg_dst, ptr[param1 + offsetof(args_t, hasSubnormals)]); lea(reg_dst, ptr[param1 + offsetof(args_t, hasSubnormals)]);
mov(reg_sz, ptr[param1 + offsetof(args_t, count)]); mov(reg_sz, ptr[param1 + offsetof(args_t, count)]);
mov(reg_mask_addr, (size_t)mask_data);
// Initialize necessary consts // Initialize necessary consts
uni_vpxor(zero, zero, zero); uni_vpxor(zero, zero, zero);
uni_vmovdqu(mask, ptr[reg_mask_addr]); mov(reg_mask_addr, (size_t)exponent_mask_data);
uni_vmovdqu(exponent_mask, ptr[reg_mask_addr]);
mov(reg_mask_addr, (size_t)mantissa_mask_data);
uni_vmovdqu(mantissa_mask, ptr[reg_mask_addr]);
// Main loop // Main loop
xor_(reg_idx, reg_idx); xor_(reg_idx, reg_idx);
@@ -179,7 +192,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
shr(r8, sh_bits); shr(r8, sh_bits);
foreach(reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) { foreach(reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) {
check_subnormals(reg_src, mask, zero); check_subnormals(reg_src, exponent_mask, mantissa_mask, zero);
jnc(has_subnormals); jnc(has_subnormals);
add(reg_src, sizeof(float) * vlen); add(reg_src, sizeof(float) * vlen);
}); });
@@ -197,7 +210,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
uni_vmovdqu(ptr[r8], zero); uni_vmovdqu(ptr[r8], zero);
copy_floats(r8, reg_src, reg_sz); copy_floats(r8, reg_src, reg_sz);
check_subnormals(r8, mask, zero); check_subnormals(r8, exponent_mask, mantissa_mask, zero);
jc(no_subnormals); jc(no_subnormals);
add(rsp, vlen * sizeof(float)); add(rsp, vlen * sizeof(float));
@@ -331,8 +344,10 @@ void Input::cloneBlobIfRequired() {
} }
#endif #endif
uint32_t mantissaMask = 0x007fffff;
uint32_t exponentMask = 0x7f800000;
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
if (u32data[i] && (u32data[i] & (0xFF << 23)) == 0) { if ((u32data[i] & exponentMask) == 0 && (u32data[i] & mantissaMask) != 0) {
return true; return true;
} }
} }

View File

@@ -43,10 +43,11 @@ void validate() override {
ASSERT_FALSE(actualOutputs.empty()); ASSERT_FALSE(actualOutputs.empty());
auto& outTensor = actualOutputs.front(); auto& outTensor = actualOutputs.front();
ASSERT_EQ(ov::element::f32, outTensor.get_element_type()) << "Unexpected element type"; ASSERT_EQ(ov::element::f32, outTensor.get_element_type()) << "Unexpected element type";
const uint32_t* data = reinterpret_cast<const uint32_t*>(outTensor.data()); const float* data = reinterpret_cast<const float*>(outTensor.data());
bool hasDenormals = false; bool hasDenormals = false;
for (size_t i = 0; i < outTensor.get_size(); ++i) { for (size_t i = 0; i < outTensor.get_size(); ++i) {
if (data[i] && (data[i] & (0xff << 23)) == 0) { if (std::abs(data[i]) >= std::numeric_limits<float>::denorm_min() &&
std::abs(data[i]) < std::numeric_limits<float>::min()) {
hasDenormals = true; hasDenormals = true;
} }
} }
@@ -108,4 +109,5 @@ TEST_F(DenormalNullifyCheck, smoke_CPU_Denormal_Check) {
run(); run();
} }
} }
}// namespace SubgraphTestsDefinitions }// namespace SubgraphTestsDefinitions