From 188dda668f1ce5450727a50e3fedff5f563930c1 Mon Sep 17 00:00:00 2001 From: Nikolay Shchegolev Date: Tue, 7 Feb 2023 17:57:34 +0400 Subject: [PATCH] [CPU] Fix sporadic SIGFAULT in GridSample. (#15009) --- .../intel_cpu/src/nodes/grid_sample.cpp | 6 +- .../src/nodes/kernels/grid_sample.cpp | 147 ++++++++++-------- .../src/nodes/kernels/jit_kernel_base.cpp | 90 ++++++++--- .../src/nodes/kernels/jit_kernel_base.hpp | 22 ++- .../skip_tests_config.cpp | 3 - .../single_layer_tests/grid_sample.cpp | 28 +++- 6 files changed, 209 insertions(+), 87 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp index 39c10f890f2..4744fa06279 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp @@ -202,12 +202,16 @@ void GridSample::prepareParams() { auto& p = execParamsPerThread[ithr]; + p.workAmount = dstEnd - dstStart; + if (p.workAmount == 0lu) { + return; + } + p.batchNum = srcDataShape[0]; p.channelsNum = srcDataShape[1]; p.srcHeightF[0] = srcDataShape[2]; p.srcWidthF[0] = srcDataShape[3]; - p.workAmount = dstEnd - dstStart; p.gridStartB = dstStart * 2 * gridTypeSize; p.dstStartB = dstStart * dataTypeSize; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/kernels/grid_sample.cpp index 313b981870e..4d47b414076 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/grid_sample.cpp @@ -76,10 +76,8 @@ void GridSampleKernel::initVectors() { mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); uni_vpbroadcastd(vSrcHeightF, ptr[rAux]); - if (one_of(jcp.paddingMode, GridSamplePaddingMode::ZEROS, GridSamplePaddingMode::BORDER)) { - vZeros = getVmm(); - uni_vpxor(vZeros, vZeros, vZeros); - } + vZeros = getVmm(); + uni_vpxor(vZeros, vZeros, vZeros); if (one_of(jcp.interpolationMode, GridSampleInterpolationMode::BICUBIC, GridSampleInterpolationMode::BILINEAR)) { vOnesF = getVmm(); @@ -430,7 +428,7 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, cmp(rAux, 0); jle(lEnd, T_NEAR); - fillRestWorkMask(kTailMask, vAux, rAux); + fillRestWorkMask(kTailMask, rAux); uni_vmovups((Vmm)vAux | kTailMask, ptr[regGrid]); vpermd(vAux, vGridPermMask, vAux); Xbyak::Ymm ymmAux(vAux.getIdx()); @@ -441,7 +439,7 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, } L(lRest); { - fillRestWorkMask(kTailMask, vAux, rAux); + fillRestWorkMask(kTailMask, rAux); uni_vmovups(vWCoord | kTailMask, ptr[regGrid]); vpermd(vWCoord, vGridPermMask, vWCoord); vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component @@ -454,7 +452,7 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, L(lEnd); - fillRestWorkMask(kTailMask, vAux, regWorkAmount); + fillRestWorkMask(kTailMask, regWorkAmount); } template <> @@ -672,14 +670,14 @@ void GridSampleKernel::denormalizeRawCoordinates(const Vmm& vWCoord, const template <> void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord) { - vcmpps(kDst, vCoord, vSrcWidthF, 0x1); // vCoord < vUpperBound - vcmpps(kDst | kDst, vZeros, vCoord, 0x2); // vCoord >= vZeros + vcmpps(kDst, vCoord, vSrcWidthF, CMP_LT_PS); // vCoord < vUpperBound + vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros } template <> void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord, const Vmask& kMaskW) { - vcmpps(kDst | kMaskW, vCoord, vSrcHeightF, 0x1); // vCoord < vUpperBound - vcmpps(kDst | kDst, vZeros, vCoord, 0x2); // vCoord >= vZeros + vcmpps(kDst | kMaskW, vCoord, vSrcHeightF, CMP_LT_PS); // vCoord < vUpperBound + vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros } template <> @@ -693,15 +691,15 @@ void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& v auto vAux = getVmm(); if (vSrcWidthF.isInitialized()) { - uni_vcmpps(vAux, vWCoord, vSrcWidthF, 0x1); // vWCoord < vSrcWidthF + uni_vcmpps(vAux, vWCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]); - uni_vcmpps(vAux, vWCoord, ptr[rAux], 0x1); // vWCoord < vSrcWidthF + uni_vcmpps(vAux, vWCoord, ptr[rAux], CMP_LT_PS); // vWCoord < vSrcWidthF } uni_vpxor(kDst, kDst, kDst); - uni_vcmpps(kDst, kDst, vWCoord, 0x2); // vWCoord >= vZeros + uni_vcmpps(kDst, kDst, vWCoord, CMP_LE_PS); // vWCoord >= vZeros uni_vpand(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF } @@ -710,17 +708,17 @@ void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& v auto vAux = getVmm(); if (vSrcHeightF.isInitialized()) { - uni_vcmpps(vAux, vHCoord, vSrcHeightF, 0x1); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vHCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); - uni_vcmpps(vAux, vHCoord, ptr[rAux], 0x1); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vHCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF } uni_vmovups(kDst, kMaskW); uni_vpand(kDst, kDst, vAux); // vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF uni_vpxor(vAux, vAux, vAux); - uni_vcmpps(vAux, vAux, vHCoord, 0x2); // vHCoord >= vZeros + uni_vcmpps(vAux, vAux, vHCoord, CMP_LE_PS); // vHCoord >= vZeros uni_vpand(kDst, kDst, vAux); // vZeros <= vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF } @@ -744,14 +742,14 @@ void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord) } if (vSrcWidthF.isInitialized()) { - uni_vcmpps(vAux, vCoord, vSrcWidthF, 0x1); // vWCoord < vSrcWidthF + uni_vcmpps(vAux, vCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]); - uni_vcmpps(vAux, vCoord, ptr[rAux], 0x1); // vWCoord < vSrcWidthF + uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vWCoord < vSrcWidthF } - uni_vcmpps(kDst, vZerosTmp, vCoord, 0x2); // vWCoord >= vZeros + uni_vcmpps(kDst, vZerosTmp, vCoord, CMP_LE_PS); // vWCoord >= vZeros uni_vandps(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF } @@ -769,15 +767,15 @@ void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord, } if (vSrcHeightF.isInitialized()) { - uni_vcmpps(vAux, vCoord, vSrcHeightF, 0x1); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); - uni_vcmpps(vAux, vCoord, ptr[rAux], 0x1); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF } uni_vandps(kDst, kMaskW, vAux); - uni_vcmpps(vAux, vZerosTmp, vCoord, 0x2); // vHCoord >= vZeros + uni_vcmpps(vAux, vZerosTmp, vCoord, CMP_LE_PS); // vHCoord >= vZeros uni_vandps(kDst, kDst, vAux); } @@ -831,7 +829,7 @@ void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoor } } - uni_vcmpps(vAux, vCoordOrigin, vSub1F, 0x2); // vCoord <= vUpperBound + uni_vcmpps(vAux, vCoordOrigin, vSub1F, CMP_LE_PS); // vCoord <= vUpperBound uni_vandps(vCoordDst, vCoordOrigin, vAux); uni_vandnps(vAux, vAux, vSub1F); uni_vaddps(vCoordDst, vCoordDst, vAux); @@ -857,14 +855,20 @@ void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoor template <> void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) { auto vAux = getVmm(); + auto kAux = getMask(); const auto& vSrcDimMul2Sub1F = dim == coord::w ? vSrcWidthMul2Sub1F : vSrcHeightMul2Sub1F; if (jcp.alignCorners) { // abs(x) % D21 uni_vandps(vCoordDst, vCoordOrigin, vAbsMask); // abs(x) uni_vdivps(vAux, vCoordDst, vSrcDimMul2Sub1F); - uni_vroundps(vAux, vAux, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2Sub1F); // abs(x) % D21 + uni_vroundps(vAux, vAux, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2Sub1F); // abs(x) % D21 + + // Check that the result does not exceed the divisor. + vcmpps(kAux, vSrcDimMul2Sub1F, vCoordDst, CMP_LE_PS); + uni_vmovups(vCoordDst | kAux, vZeros); + vrangeps(vCoordDst, vCoordDst, vZeros, 0x1); } else { const auto& vSrcDimMul2F = dim == coord::w ? vSrcWidthMul2F : vSrcHeightMul2F; // (x % D2 + D2) % D2 @@ -877,12 +881,16 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, uni_vdivps(vAux, vCoordDst, vSrcDimMul2F); uni_vroundps(vAux, vAux, 0x3); // Truncation uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // (x % D2 + D2) % D2 + + // Check that the result does not exceed the divisor. + vcmpps(kAux, vSrcDimMul2F, vCoordDst, CMP_LE_PS); + uni_vmovups(vCoordDst | kAux, vZeros); + vrangeps(vCoordDst, vCoordDst, vZeros, 0x1); } - auto kAux = getMask(); uni_vsubps(vAux, vSrcDimMul2Sub1F, vCoordDst); - vcmpps(kAux, dim == coord::w ? vSrcWidthF : vSrcHeightF, vCoordDst, 0x2); // vCoordDst >= vSrcDimF - vmovups(vCoordDst | kAux, vAux); + vcmpps(kAux, dim == coord::w ? vSrcWidthF : vSrcHeightF, vCoordDst, CMP_LE_PS); // vCoordDst >= vSrcDimF + uni_vmovups(vCoordDst | kAux, vAux); } template // Works for AVX2, AVX, SSE41 @@ -925,6 +933,14 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v uni_vdivps(vAux0, vCoordDst, vMul2Sub1); uni_vroundps(vAux0, vAux0, 0x3); // Truncation uni_vfnmadd231ps(vCoordDst, vAux0, vMul2Sub1); // abs(x) % D21 + + // Check that the result does not exceed the divisor. + uni_vcmpps(vAux0, vCoordDst, vMul2Sub1, CMP_LT_PS); + uni_vandps(vCoordDst, vCoordDst, vAux0); + uni_vxorps(vAux0, vAux0, vAux0); + uni_vcmpps(vAux0, vAux0, vCoordDst, CMP_LE_PS); + uni_vandps(vCoordDst, vCoordDst, vAux0); + uni_vsubps(vAux0, vCoordDst, vMul2Sub1); // abs(x) % D21 - D21 } else { // x' = (x % D2 + D2) % D2 - D21 @@ -956,6 +972,13 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v uni_vroundps(vAux0, vAux0, 0x3); // Truncation uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // (x % D2 + D2) % D2 + // Check that the result does not exceed the divisor. + uni_vcmpps(vAux0, vCoordDst, vMul2, CMP_LT_PS); + uni_vandps(vCoordDst, vCoordDst, vAux0); + uni_vxorps(vAux0, vAux0, vAux0); + uni_vcmpps(vAux0, vAux0, vCoordDst, CMP_LE_PS); + uni_vandps(vCoordDst, vCoordDst, vAux0); + if (dim == coord::w) { if (vSrcWidthMul2Sub1F.isInitialized()) { uni_vsubps(vAux0, vCoordDst, vSrcWidthMul2Sub1F); @@ -975,17 +998,17 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v if (dim == coord::w) { if (vSrcWidthF.isInitialized()) { - uni_vcmpps(vAux1, vCoordDst, vSrcWidthF, 0x1); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, vSrcWidthF, CMP_LT_PS); // vCoordDst < vUpperBound } else { mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]); - uni_vcmpps(vAux1, vCoordDst, ptr[rAux], 0x1); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound } } else { if (vSrcHeightF.isInitialized()) { - uni_vcmpps(vAux1, vCoordDst, vSrcHeightF, 0x1); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, vSrcHeightF, CMP_LT_PS); // vCoordDst < vUpperBound } else { mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); - uni_vcmpps(vAux1, vCoordDst, ptr[rAux], 0x1); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound } } @@ -1246,23 +1269,21 @@ void GridSampleKernel::nearestInterpolation(const Vmm& vWCoord, const Vmm& template <> void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { - auto vDX = getVmm(); - auto vDY = getVmm(); - const auto& shift00 = vWCoord; - const auto& shift01 = vHCoord; - auto shift10 = getVmm(); - auto shift11 = getVmm(); - auto vAux = getVmm(); + const auto& vDX = vWCoord; + const auto& vDY = vHCoord; + auto shift00 = getVmm(); + auto shift01 = getVmm(); + auto shift10 = getVmm(); + auto shift11 = getVmm(); + auto vAux = getVmm(); RegistersPool::Reg kMask00, kMask01, kMask10, kMask11; - uni_vmovups(vDX, vWCoord); - uni_vmovups(vDY, vHCoord); - uni_vroundps(vWCoord, vWCoord, 0x1); // Round floor - uni_vroundps(vHCoord, vHCoord, 0x1); // Round floor - uni_vsubps(vDX, vDX, vWCoord); - uni_vsubps(vDY, vDY, vHCoord); - uni_vaddps(shift10, vWCoord, vOnesF); - uni_vaddps(shift11, vHCoord, vOnesF); + uni_vroundps(shift00, vWCoord, 0x1); // Round floor + uni_vroundps(shift01, vHCoord, 0x1); // Round floor + uni_vsubps(vDX, vWCoord, shift00); + uni_vsubps(vDY, vHCoord, shift01); + uni_vaddps(shift10, shift00, vOnesF); + uni_vaddps(shift11, shift01, vOnesF); bool useMask = false, zeroFill = false; if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { @@ -1272,31 +1293,31 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor kMask10 = getMask(); kMask11 = getMask(); - zerosPadding(kMask00, vHCoord, vWCoord); // (y; x) - zerosPadding(kMask01, vHCoord, shift10); // (y; x + 1) + zerosPadding(kMask00, shift01, shift00); // (y; x) + zerosPadding(kMask01, shift01, shift10); // (y; x + 1) zerosPadding(kMask11, shift11, shift10); // (y + 1; x + 1) - zerosPadding(kMask10, shift11, vWCoord); // (y + 1; x) + zerosPadding(kMask10, shift11, shift00); // (y + 1; x) - hwShiftPs2dq(shift00, vHCoord, vWCoord, vSrcWidthF); + hwShiftPs2dq(shift00, shift01, shift00, vSrcWidthF); uni_vpaddd(shift01, shift00, vDataTypeSizeB); - uni_vpaddd(shift10, shift00, vSrcWidthB); // shift11?? - uni_vpaddd(shift11, shift10, vDataTypeSizeB); // sub?? + uni_vpaddd(shift10, shift00, vSrcWidthB); + uni_vpaddd(shift11, shift10, vDataTypeSizeB); } else if (jcp.paddingMode == GridSamplePaddingMode::BORDER) { - borderPadding(vWCoord, vWCoord, coord::w); - borderPadding(vHCoord, vHCoord, coord::h); + borderPadding(shift00, shift00, coord::w); + borderPadding(shift01, shift01, coord::h); borderPadding(shift10, shift10, coord::w); borderPadding(shift11, shift11, coord::h); } else if (jcp.paddingMode == GridSamplePaddingMode::REFLECTION) { - reflectionPadding(vWCoord, vWCoord, coord::w); - reflectionPadding(vHCoord, vHCoord, coord::h); + reflectionPadding(shift00, shift00, coord::w); + reflectionPadding(shift01, shift01, coord::h); reflectionPadding(shift10, shift10, coord::w); reflectionPadding(shift11, shift11, coord::h); } if (jcp.paddingMode == GridSamplePaddingMode::BORDER || jcp.paddingMode == GridSamplePaddingMode::REFLECTION) { // W * y + x - hwShiftPs2dq(vAux, shift11, vWCoord, vSrcWidthF); - hwShiftPs2dq(vWCoord, vHCoord, vWCoord, vSrcWidthF); - hwShiftPs2dq(vHCoord, vHCoord, shift10, vSrcWidthF); + hwShiftPs2dq(vAux, shift11, shift00, vSrcWidthF); + hwShiftPs2dq(shift00, shift01, shift00, vSrcWidthF); + hwShiftPs2dq(shift01, shift01, shift10, vSrcWidthF); hwShiftPs2dq(shift11, shift11, shift10, vSrcWidthF); uni_vmovups(shift10, vAux); } @@ -1658,8 +1679,8 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord // (y - 1 + h; x - 1) if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { Xbyak::Opmask maskH = kMaskH; - vcmpps(kMaskH, vHCoord, vSrcHeightF, 0x1); - vcmpps(maskH | maskH, vZeros, vHCoord, 0x2); + vcmpps(kMaskH, vHCoord, vSrcHeightF, CMP_LT_PS); + vcmpps(maskH | maskH, vZeros, vHCoord, CMP_LE_PS); kandw(kAuxMask, kMaskH, wMasks[0]); uni_vmulps(vSrcShift0, vHCoord, vSrcWidthF); uni_vmovups(vWCoord, vWLeft); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/jit_kernel_base.cpp b/src/plugins/intel_cpu/src/nodes/kernels/jit_kernel_base.cpp index 0516329ba83..82d32af7961 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/jit_kernel_base.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/jit_kernel_base.cpp @@ -286,26 +286,80 @@ void JitKernelBase::uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand & } void JitKernelBase::fillRestWorkMask(const Xbyak::Opmask& dstMask, - const Xbyak::Zmm& zAux, - const Xbyak::Reg64& rWorkRest) { - auto rAux0 = getReg64(); - auto rAux1 = getReg64(); - Xbyak::Label lKmov; - Xbyak::Reg32 rOnes(rAux1.getIdx()); - const uint64_t typeSize = 4; - const uint64_t elPerVec = x64::cpu_isa_traits::vlen / typeSize; + const Xbyak::Reg64& rWorkRest) { + auto rOnes = getReg64(); - mov(rOnes, 0x0000FFFF); - cmp(rWorkRest, elPerVec); - jge(lKmov); - { - Xbyak::Reg32 rShift(rAux0.getIdx()); - mov(rShift, elPerVec); - sub(rShift, rWorkRest); - shrx(rOnes, rOnes, rShift); + mov(rOnes, 0xFFFFFFFFFFFFFFFF); + shlx(rOnes, rOnes, rWorkRest); + not_(rOnes); + kmovq(dstMask, rOnes); +} + +void JitKernelBase::fillRestWorkMask(const Xbyak::Xmm& xmmDstMask, + const Xbyak::Reg64& rWorkRest, + const uint64_t typeSize) { + if (!one_of(typeSize, 1, 2, 4, 8)) { + IE_THROW() << "Could not fill data with type size " << typeSize; } - L(lKmov); - kmovw(dstMask, rOnes); + Xbyak::Label lEnd; + auto r32Ones = getReg32(); + Xbyak::Reg64 r64Ones(r32Ones.getIdx()); + auto elPerVec = x64::cpu_isa_traits::vlen / typeSize; + + mov(r64Ones, 0xFFFFFFFFFFFFFFFF); + for (uint8_t i = 0; i < elPerVec; i++) { + cmp(rWorkRest, i); + jle(lEnd, T_NEAR); + + if (typeSize == 1) { + pinsrb(xmmDstMask, r32Ones, i); + } else if (typeSize == 2) { + pinsrw(xmmDstMask, r32Ones, i); + } else if (typeSize == 4) { + pinsrd(xmmDstMask, r32Ones, i); + } else if (typeSize == 8) { + pinsrq(xmmDstMask, r64Ones, i); + } + } + L(lEnd); +} + +void JitKernelBase::fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, + const Xbyak::Reg64& rWorkRest, + const uint64_t typeSize) { + if (!one_of(typeSize, 1, 2, 4, 8)) { + IE_THROW() << "Could not fill data with type size " << typeSize; + } + Xbyak::Label lEnd; + auto elPerVec = x64::cpu_isa_traits::vlen / typeSize; + auto r32Ones = getReg32(); + Xbyak::Reg64 r64Ones(r32Ones.getIdx()); + Xbyak::Xmm xmmDstMask(ymmDstMask.getIdx()); + + mov(r64Ones, 0xFFFFFFFFFFFFFFFF); + uni_vpxor(ymmDstMask, ymmDstMask, ymmDstMask); + for (uint8_t i = 0; i < 2; i++) { + Xbyak::Label lPerm; + for (uint8_t j = 0; j < elPerVec; j++) { + cmp(rWorkRest, i * elPerVec + j); + jle(i == 0 ? lEnd : lPerm, T_NEAR); + + if (typeSize == 1) { + pinsrb(xmmDstMask, r32Ones, j); + } else if (typeSize == 2) { + pinsrw(xmmDstMask, r32Ones, j); + } else if (typeSize == 4) { + pinsrd(xmmDstMask, r32Ones, j); + } else if (typeSize == 8) { + pinsrq(xmmDstMask, r64Ones, j); + } + } + cmp(rWorkRest, elPerVec); + je(lEnd, T_NEAR); + L(lPerm); + vperm2f128(ymmDstMask, ymmDstMask, ymmDstMask, 0x1); + } + L(lEnd); } void JitKernelBase::load(const Xbyak::Xmm& vDst, diff --git a/src/plugins/intel_cpu/src/nodes/kernels/jit_kernel_base.hpp b/src/plugins/intel_cpu/src/nodes/kernels/jit_kernel_base.hpp index 9784f9a05d1..b49b0d8ff35 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/jit_kernel_base.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/jit_kernel_base.hpp @@ -11,6 +11,7 @@ namespace ov { namespace intel_cpu { #define getReg64() RegistersPool::Reg(registersPool) +#define getReg32() RegistersPool::Reg(registersPool) #define getVmm() RegistersPool::Reg(registersPool) #define getMask() RegistersPool::Reg(registersPool) @@ -84,9 +85,16 @@ public: const bool zeroFill = false); void fillRestWorkMask(const Xbyak::Opmask& kDstMask, - const Xbyak::Zmm& zAux, const Xbyak::Reg64& rWorkRest); + void fillRestWorkMask(const Xbyak::Xmm& ymmDstMask, + const Xbyak::Reg64& rWorkRest, + const uint64_t typeSize = 4); + + void fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, + const Xbyak::Reg64& rWorkRest, + const uint64_t typeSize = 4); + void load(const Xbyak::Xmm& vDst, const Xbyak::Address& srcAddr, const Xbyak::Reg64& rLoadNum, @@ -133,6 +141,18 @@ protected: } RegistersPool::Ptr registersPool; + + enum { + // Comparison predicate operand (immediate byte) for single-precision floating-point values. + CMP_EQ_PS = 0, // Equal (ordered, non-signaling) + CMP_LT_PS, // Less-than (ordered, signaling) + CMP_LE_PS, // Less-than-or-equal (ordered, signaling) + CMP_UNORD_PS, // Unordered (non-signaling) + CMP_NEQ_PS, // Not-equal (unordered, non-signaling) + CMP_NLT_PS, // Not-less-than (unordered, signaling) + CMP_NLE_PS, // Not-less-than-or-equal (unordered, signaling) + CMP_ORD_PS // Ordered (non-signaling) + }; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index fd697eedd77..a60c67b9065 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -184,9 +184,6 @@ std::vector disabledTestPatterns() { // The kernel does not have such garbage. The diff 0.000000745 is taken into account in calculations and affects further type conversion. // Reorder->GridSample->Reorder also does not work here. Potential fix is to use nearest conversion instead of truncation. R"(.*GridSampleLayerTestCPU.*(BILINEAR|BICUBIC).*(i32|i8).*)", - // 94989. BF16 Reference produces different results. - // GridSample regression on bf16 data. - R"(.*GridSampleLayerTestCPU.*(BILINEAR|BICUBIC).*bf16.*)", // // Issue: 95915 R"(smoke_dynamic/AUGRUCellCPUTest.CompareWithRefs/IS=\(\[\?\.1\]_\[\?\.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT R"(smoke_dynamic/GRUCellCPUTest.CompareWithRefs/IS=\(\[\?.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/grid_sample.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/grid_sample.cpp index fa7935eb6a9..86690a12ea4 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/grid_sample.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/grid_sample.cpp @@ -94,6 +94,9 @@ protected: auto execType = dataPrecision == ov::element::i32 ? ov::element::i32 : ov::element::f32; selectedType = makeSelectedTypeStr(selectedType, execType); } + if (gridPrecision == ov::element::bf16) { + rel_threshold = 0.01f; + } auto params = ngraph::builder::makeDynamicParams({dataPrecision, gridPrecision}, inputDynamicShapes); params[0]->set_friendly_name("data"); @@ -272,12 +275,35 @@ INSTANTIATE_TEST_SUITE_P(smoke_static, GridSampleLayerTestCPU, ::testing::ValuesIn(interpolateMode), ::testing::ValuesIn(paddingMode), ::testing::ValuesIn(alignCorners), - ::testing::ValuesIn({ElementType::f32, ElementType::bf16, ElementType::i32, ElementType::i8}), + ::testing::ValuesIn({ElementType::f32, ElementType::i32}), + ::testing::ValuesIn({ElementType::f32}), + ::testing::ValuesIn(getCPUInfo()), + ::testing::Values(additionalConfig[0])), + GridSampleLayerTestCPU::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(nightly_static_1, GridSampleLayerTestCPU, + ::testing::Combine( + ::testing::ValuesIn(getStaticShapes()), + ::testing::ValuesIn(interpolateMode), + ::testing::ValuesIn(paddingMode), + ::testing::ValuesIn(alignCorners), + ::testing::ValuesIn({ElementType::bf16, ElementType::i8}), ::testing::ValuesIn({ElementType::f32, ElementType::bf16}), ::testing::ValuesIn(getCPUInfo()), ::testing::Values(additionalConfig[0])), GridSampleLayerTestCPU::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(nightly_static_2, GridSampleLayerTestCPU, + ::testing::Combine( + ::testing::ValuesIn(getStaticShapes()), + ::testing::ValuesIn(interpolateMode), + ::testing::ValuesIn(paddingMode), + ::testing::ValuesIn(alignCorners), + ::testing::ValuesIn({ElementType::f32}), + ::testing::ValuesIn({ElementType::bf16}), + ::testing::ValuesIn(getCPUInfo()), + ::testing::Values(additionalConfig[0])), + GridSampleLayerTestCPU::getTestCaseName); const std::vector> dynamicInSapes = { { { { ov::Dimension(1, 15), -1, -1, -1 }, // Dynamic shape 0