[CPU] Fixed has_subnormals behavior for negative zero values (#19360)
This commit is contained in:
parent
82afb47e36
commit
d32b6904bd
@ -92,19 +92,20 @@ protected:
|
|||||||
pop(rsi);
|
pop(rsi);
|
||||||
}
|
}
|
||||||
|
|
||||||
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &mask, const Xbyak::Ymm &zero) {
|
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &exponent_mask, const Xbyak::Ymm &mantissa_mask, const Xbyak::Ymm &zero) {
|
||||||
auto a = ymm1;
|
auto a = ymm1;
|
||||||
auto b = ymm2;
|
auto b = ymm2;
|
||||||
auto c = ymm3;
|
auto c = ymm3;
|
||||||
|
|
||||||
vmovdqu(a, yword[src]); // load 8 floats
|
vmovdqu(a, yword[src]); // load 8 floats
|
||||||
vpcmpeqd(b, a, zero); // if (a == 0) b = 1 else b = 0
|
vpand(b, a, mantissa_mask); // b = a & 00000000011111111111111111111111
|
||||||
vpand(c, a, mask); // c = a & 01111111100000000000000000000000
|
vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0
|
||||||
|
vpand(c, a, exponent_mask); // c = a & 01111111100000000000000000000000
|
||||||
vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
|
vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
|
||||||
vptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
|
vptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &mask, const Xbyak::Xmm &zero) {
|
void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &exponent_mask, const Xbyak::Xmm &mantissa_mask, const Xbyak::Xmm &zero) {
|
||||||
auto a = xmm1;
|
auto a = xmm1;
|
||||||
auto b = xmm2;
|
auto b = xmm2;
|
||||||
auto c = xmm3;
|
auto c = xmm3;
|
||||||
@ -112,10 +113,11 @@ protected:
|
|||||||
uni_vmovdqu(a, xword[src]); // load 4 floats
|
uni_vmovdqu(a, xword[src]); // load 4 floats
|
||||||
uni_vmovdqu(b, a); // b = a
|
uni_vmovdqu(b, a); // b = a
|
||||||
uni_vmovdqu(c, a); // c = a
|
uni_vmovdqu(c, a); // c = a
|
||||||
uni_vpcmpeqd(b, b, zero); // if (a == 0) b = 1 else b = 0
|
uni_vpand(b, b, mantissa_mask); // b = a & 00000000011111111111111111111111
|
||||||
uni_vpand(c, c, mask); // c = a & 01111111100000000000000000000000
|
uni_vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0
|
||||||
uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
|
uni_vpand(c, c, exponent_mask); // c = a & 01111111100000000000000000000000
|
||||||
uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
|
uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0
|
||||||
|
uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
template<cpu_isa_t isa>
|
template<cpu_isa_t isa>
|
||||||
@ -130,12 +132,18 @@ protected:
|
|||||||
const Reg64 &reg_idx = rsi;
|
const Reg64 &reg_idx = rsi;
|
||||||
const Reg64 &reg_mask_addr = r15;
|
const Reg64 &reg_mask_addr = r15;
|
||||||
|
|
||||||
static const uint32_t mask_data[8];
|
static const uint32_t exponent_mask_data[8];
|
||||||
|
static const uint32_t mantissa_mask_data[8];
|
||||||
};
|
};
|
||||||
|
|
||||||
const uint32_t jit_has_subnormals_base::mask_data[8] = {
|
const uint32_t jit_has_subnormals_base::exponent_mask_data[8] = {
|
||||||
0xFF << 23, 0xFF << 23, 0xFF << 23, 0xFF << 23,
|
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000,
|
||||||
0xFF << 23, 0xFF << 23, 0xFF << 23, 0xFF << 23
|
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
|
||||||
|
};
|
||||||
|
|
||||||
|
const uint32_t jit_has_subnormals_base::mantissa_mask_data[8] = {
|
||||||
|
0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff,
|
||||||
|
0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
|
||||||
};
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@ -143,6 +151,7 @@ struct jit_has_subnormals_base::reg<cpu_isa_t::avx2> {
|
|||||||
constexpr static uint32_t length = 8;
|
constexpr static uint32_t length = 8;
|
||||||
constexpr static const Xbyak::Ymm & rmm4 = Xbyak::util::ymm4;
|
constexpr static const Xbyak::Ymm & rmm4 = Xbyak::util::ymm4;
|
||||||
constexpr static const Xbyak::Ymm & rmm5 = Xbyak::util::ymm5;
|
constexpr static const Xbyak::Ymm & rmm5 = Xbyak::util::ymm5;
|
||||||
|
constexpr static const Xbyak::Ymm & rmm6 = Xbyak::util::ymm6;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@ -150,6 +159,7 @@ struct jit_has_subnormals_base::reg<cpu_isa_t::sse41> {
|
|||||||
constexpr static uint32_t length = 4;
|
constexpr static uint32_t length = 4;
|
||||||
constexpr static const Xbyak::Xmm & rmm4 = Xbyak::util::xmm4;
|
constexpr static const Xbyak::Xmm & rmm4 = Xbyak::util::xmm4;
|
||||||
constexpr static const Xbyak::Xmm & rmm5 = Xbyak::util::xmm5;
|
constexpr static const Xbyak::Xmm & rmm5 = Xbyak::util::xmm5;
|
||||||
|
constexpr static const Xbyak::Xmm & rmm6 = Xbyak::util::xmm6;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<cpu_isa_t isa>
|
template<cpu_isa_t isa>
|
||||||
@ -159,7 +169,8 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
|
|||||||
const int sh_bits = std::ilogb(vlen);
|
const int sh_bits = std::ilogb(vlen);
|
||||||
|
|
||||||
auto zero = reg<isa>::rmm4;
|
auto zero = reg<isa>::rmm4;
|
||||||
auto mask = reg<isa>::rmm5;
|
auto exponent_mask = reg<isa>::rmm5;
|
||||||
|
auto mantissa_mask = reg<isa>::rmm6;
|
||||||
|
|
||||||
preamble();
|
preamble();
|
||||||
|
|
||||||
@ -167,11 +178,13 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
|
|||||||
mov(reg_src, ptr[param1 + offsetof(args_t, src)]);
|
mov(reg_src, ptr[param1 + offsetof(args_t, src)]);
|
||||||
lea(reg_dst, ptr[param1 + offsetof(args_t, hasSubnormals)]);
|
lea(reg_dst, ptr[param1 + offsetof(args_t, hasSubnormals)]);
|
||||||
mov(reg_sz, ptr[param1 + offsetof(args_t, count)]);
|
mov(reg_sz, ptr[param1 + offsetof(args_t, count)]);
|
||||||
mov(reg_mask_addr, (size_t)mask_data);
|
|
||||||
|
|
||||||
// Initialize necessary consts
|
// Initialize necessary consts
|
||||||
uni_vpxor(zero, zero, zero);
|
uni_vpxor(zero, zero, zero);
|
||||||
uni_vmovdqu(mask, ptr[reg_mask_addr]);
|
mov(reg_mask_addr, (size_t)exponent_mask_data);
|
||||||
|
uni_vmovdqu(exponent_mask, ptr[reg_mask_addr]);
|
||||||
|
mov(reg_mask_addr, (size_t)mantissa_mask_data);
|
||||||
|
uni_vmovdqu(mantissa_mask, ptr[reg_mask_addr]);
|
||||||
|
|
||||||
// Main loop
|
// Main loop
|
||||||
xor_(reg_idx, reg_idx);
|
xor_(reg_idx, reg_idx);
|
||||||
@ -179,7 +192,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
|
|||||||
shr(r8, sh_bits);
|
shr(r8, sh_bits);
|
||||||
|
|
||||||
foreach(reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) {
|
foreach(reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) {
|
||||||
check_subnormals(reg_src, mask, zero);
|
check_subnormals(reg_src, exponent_mask, mantissa_mask, zero);
|
||||||
jnc(has_subnormals);
|
jnc(has_subnormals);
|
||||||
add(reg_src, sizeof(float) * vlen);
|
add(reg_src, sizeof(float) * vlen);
|
||||||
});
|
});
|
||||||
@ -197,7 +210,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base {
|
|||||||
uni_vmovdqu(ptr[r8], zero);
|
uni_vmovdqu(ptr[r8], zero);
|
||||||
|
|
||||||
copy_floats(r8, reg_src, reg_sz);
|
copy_floats(r8, reg_src, reg_sz);
|
||||||
check_subnormals(r8, mask, zero);
|
check_subnormals(r8, exponent_mask, mantissa_mask, zero);
|
||||||
jc(no_subnormals);
|
jc(no_subnormals);
|
||||||
add(rsp, vlen * sizeof(float));
|
add(rsp, vlen * sizeof(float));
|
||||||
|
|
||||||
@ -331,8 +344,10 @@ void Input::cloneBlobIfRequired() {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
uint32_t mantissaMask = 0x007fffff;
|
||||||
|
uint32_t exponentMask = 0x7f800000;
|
||||||
for (size_t i = 0; i < size; ++i) {
|
for (size_t i = 0; i < size; ++i) {
|
||||||
if (u32data[i] && (u32data[i] & (0xFF << 23)) == 0) {
|
if ((u32data[i] & exponentMask) == 0 && (u32data[i] & mantissaMask) != 0) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -43,10 +43,11 @@ void validate() override {
|
|||||||
ASSERT_FALSE(actualOutputs.empty());
|
ASSERT_FALSE(actualOutputs.empty());
|
||||||
auto& outTensor = actualOutputs.front();
|
auto& outTensor = actualOutputs.front();
|
||||||
ASSERT_EQ(ov::element::f32, outTensor.get_element_type()) << "Unexpected element type";
|
ASSERT_EQ(ov::element::f32, outTensor.get_element_type()) << "Unexpected element type";
|
||||||
const uint32_t* data = reinterpret_cast<const uint32_t*>(outTensor.data());
|
const float* data = reinterpret_cast<const float*>(outTensor.data());
|
||||||
bool hasDenormals = false;
|
bool hasDenormals = false;
|
||||||
for (size_t i = 0; i < outTensor.get_size(); ++i) {
|
for (size_t i = 0; i < outTensor.get_size(); ++i) {
|
||||||
if (data[i] && (data[i] & (0xff << 23)) == 0) {
|
if (std::abs(data[i]) >= std::numeric_limits<float>::denorm_min() &&
|
||||||
|
std::abs(data[i]) < std::numeric_limits<float>::min()) {
|
||||||
hasDenormals = true;
|
hasDenormals = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -108,4 +109,5 @@ TEST_F(DenormalNullifyCheck, smoke_CPU_Denormal_Check) {
|
|||||||
run();
|
run();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}// namespace SubgraphTestsDefinitions
|
}// namespace SubgraphTestsDefinitions
|
||||||
|
Loading…
Reference in New Issue
Block a user