[CPU] Fixed has_subnormals behavior for negative zero values (#19360)
This commit is contained in:
parent
82afb47e36
commit
d32b6904bd
@ -92,19 +92,20 @@ protected:
|
||||
pop(rsi);
|
||||
}
|
||||
|
||||
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &mask, const Xbyak::Ymm &zero) {
|
||||
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &exponent_mask, const Xbyak::Ymm &mantissa_mask, const Xbyak::Ymm &zero) {
|
||||
auto a = ymm1;
|
||||
auto b = ymm2;
|
||||
auto c = ymm3;
|
||||
|
||||
vmovdqu(a, yword[src]); // load 8 floats
|
||||
vpcmpeqd(b, a, zero); // if (a == 0) b = 1 else b = 0
|
||||
vpand(c, a, mask); // c = a & 01111111100000000000000000000000
|
||||
vpand(b, a, mantissa_mask); // b = a & 00000000011111111111111111111111
|
||||
vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0
|
||||
vpand(c, a, exponent_mask); // c = a & 01111111100000000000000000000000
|
||||
vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
|
||||
vptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
|
||||
}
|
||||
|
||||
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &mask, const Xbyak::Xmm &zero) {
|
||||
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &exponent_mask, const Xbyak::Xmm &mantissa_mask, const Xbyak::Xmm &zero) {
|
||||
auto a = xmm1;
|
||||
auto b = xmm2;
|
||||
auto c = xmm3;
|
||||
@ -112,10 +113,11 @@ protected:
|
||||
uni_vmovdqu(a, xword[src]); // load 4 floats
|
||||
uni_vmovdqu(b, a); // b = a
|
||||
uni_vmovdqu(c, a); // c = a
|
||||
uni_vpcmpeqd(b, b, zero); // if (a == 0) b = 1 else b = 0
|
||||
uni_vpand(c, c, mask); // c = a & 01111111100000000000000000000000
|
||||
uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
|
||||
uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
|
||||
uni_vpand(b, b, mantissa_mask); // b = a & 00000000011111111111111111111111
|
||||
uni_vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0
|
||||
uni_vpand(c, c, exponent_mask); // c = a & 01111111100000000000000000000000
|
||||
uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
|
||||
uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
|
||||
}
|
||||
|
||||
template<cpu_isa_t isa>
|
||||
@ -130,12 +132,18 @@ protected:
|
||||
const Reg64 &reg_idx = rsi;
|
||||
const Reg64 &reg_mask_addr = r15;
|
||||
|
||||
static const uint32_t mask_data[8];
|
||||
static const uint32_t exponent_mask_data[8];
|
||||
static const uint32_t mantissa_mask_data[8];
|
||||
};
|
||||
|
||||
const uint32_t jit_has_subnormals_base::mask_data[8] = {
|
||||
0xFF << 23, 0xFF << 23, 0xFF << 23, 0xFF << 23,
|
||||
0xFF << 23, 0xFF << 23, 0xFF << 23, 0xFF << 23
|
||||
const uint32_t jit_has_subnormals_base::exponent_mask_data[8] = {
|
||||
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000,
|
||||
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
|
||||
};
|
||||
|
||||
const uint32_t jit_has_subnormals_base::mantissa_mask_data[8] = {
|
||||
0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff,
|
||||
0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
|
||||
};
|
||||
|
||||
template<>
|
||||
@ -143,6 +151,7 @@ struct jit_has_subnormals_base::reg<cpu_isa_t::avx2> {
|
||||
constexpr static uint32_t length = 8;
|
||||
constexpr static const Xbyak::Ymm & rmm4 = Xbyak::util::ymm4;
|
||||
constexpr static const Xbyak::Ymm & rmm5 = Xbyak::util::ymm5;
|
||||
constexpr static const Xbyak::Ymm & rmm6 = Xbyak::util::ymm6;
|
||||
};
|
||||
|
||||
template<>
|
||||
@ -150,6 +159,7 @@ struct jit_has_subnormals_base::reg<cpu_isa_t::sse41> {
|
||||
constexpr static uint32_t length = 4;
|
||||
constexpr static const Xbyak::Xmm & rmm4 = Xbyak::util::xmm4;
|
||||
constexpr static const Xbyak::Xmm & rmm5 = Xbyak::util::xmm5;
|
||||
constexpr static const Xbyak::Xmm & rmm6 = Xbyak::util::xmm6;
|
||||
};
|
||||
|
||||
template<cpu_isa_t isa>
|
||||
@ -159,7 +169,8 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
|
||||
const int sh_bits = std::ilogb(vlen);
|
||||
|
||||
auto zero = reg<isa>::rmm4;
|
||||
auto mask = reg<isa>::rmm5;
|
||||
auto exponent_mask = reg<isa>::rmm5;
|
||||
auto mantissa_mask = reg<isa>::rmm6;
|
||||
|
||||
preamble();
|
||||
|
||||
@ -167,11 +178,13 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
|
||||
mov(reg_src, ptr[param1 + offsetof(args_t, src)]);
|
||||
lea(reg_dst, ptr[param1 + offsetof(args_t, hasSubnormals)]);
|
||||
mov(reg_sz, ptr[param1 + offsetof(args_t, count)]);
|
||||
mov(reg_mask_addr, (size_t)mask_data);
|
||||
|
||||
// Initialize necessary consts
|
||||
uni_vpxor(zero, zero, zero);
|
||||
uni_vmovdqu(mask, ptr[reg_mask_addr]);
|
||||
mov(reg_mask_addr, (size_t)exponent_mask_data);
|
||||
uni_vmovdqu(exponent_mask, ptr[reg_mask_addr]);
|
||||
mov(reg_mask_addr, (size_t)mantissa_mask_data);
|
||||
uni_vmovdqu(mantissa_mask, ptr[reg_mask_addr]);
|
||||
|
||||
// Main loop
|
||||
xor_(reg_idx, reg_idx);
|
||||
@ -179,7 +192,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
|
||||
shr(r8, sh_bits);
|
||||
|
||||
foreach(reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) {
|
||||
check_subnormals(reg_src, mask, zero);
|
||||
check_subnormals(reg_src, exponent_mask, mantissa_mask, zero);
|
||||
jnc(has_subnormals);
|
||||
add(reg_src, sizeof(float) * vlen);
|
||||
});
|
||||
@ -197,7 +210,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
|
||||
uni_vmovdqu(ptr[r8], zero);
|
||||
|
||||
copy_floats(r8, reg_src, reg_sz);
|
||||
check_subnormals(r8, mask, zero);
|
||||
check_subnormals(r8, exponent_mask, mantissa_mask, zero);
|
||||
jc(no_subnormals);
|
||||
add(rsp, vlen * sizeof(float));
|
||||
|
||||
@ -331,8 +344,10 @@ void Input::cloneBlobIfRequired() {
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32_t mantissaMask = 0x007fffff;
|
||||
uint32_t exponentMask = 0x7f800000;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
if (u32data[i] && (u32data[i] & (0xFF << 23)) == 0) {
|
||||
if ((u32data[i] & exponentMask) == 0 && (u32data[i] & mantissaMask) != 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -43,10 +43,11 @@ void validate() override {
|
||||
ASSERT_FALSE(actualOutputs.empty());
|
||||
auto& outTensor = actualOutputs.front();
|
||||
ASSERT_EQ(ov::element::f32, outTensor.get_element_type()) << "Unexpected element type";
|
||||
const uint32_t* data = reinterpret_cast<const uint32_t*>(outTensor.data());
|
||||
const float* data = reinterpret_cast<const float*>(outTensor.data());
|
||||
bool hasDenormals = false;
|
||||
for (size_t i = 0; i < outTensor.get_size(); ++i) {
|
||||
if (data[i] && (data[i] & (0xff << 23)) == 0) {
|
||||
if (std::abs(data[i]) >= std::numeric_limits<float>::denorm_min() &&
|
||||
std::abs(data[i]) < std::numeric_limits<float>::min()) {
|
||||
hasDenormals = true;
|
||||
}
|
||||
}
|
||||
@ -108,4 +109,5 @@ TEST_F(DenormalNullifyCheck, smoke_CPU_Denormal_Check) {
|
||||
run();
|
||||
}
|
||||
}
|
||||
|
||||
}// namespace SubgraphTestsDefinitions
|
||||
|
Loading…
Reference in New Issue
Block a user