[CPU] Fix sporadic segfault (SIGSEGV) in GridSample. (#15009)
This commit is contained in:
parent
a48b4fc2b5
commit
188dda668f
@ -202,12 +202,16 @@ void GridSample::prepareParams() {
|
||||
|
||||
auto& p = execParamsPerThread[ithr];
|
||||
|
||||
p.workAmount = dstEnd - dstStart;
|
||||
if (p.workAmount == 0lu) {
|
||||
return;
|
||||
}
|
||||
|
||||
p.batchNum = srcDataShape[0];
|
||||
p.channelsNum = srcDataShape[1];
|
||||
p.srcHeightF[0] = srcDataShape[2];
|
||||
p.srcWidthF[0] = srcDataShape[3];
|
||||
|
||||
p.workAmount = dstEnd - dstStart;
|
||||
p.gridStartB = dstStart * 2 * gridTypeSize;
|
||||
p.dstStartB = dstStart * dataTypeSize;
|
||||
|
||||
|
@ -76,10 +76,8 @@ void GridSampleKernel<x64::avx512_core>::initVectors() {
|
||||
mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]);
|
||||
uni_vpbroadcastd(vSrcHeightF, ptr[rAux]);
|
||||
|
||||
if (one_of(jcp.paddingMode, GridSamplePaddingMode::ZEROS, GridSamplePaddingMode::BORDER)) {
|
||||
vZeros = getVmm();
|
||||
uni_vpxor(vZeros, vZeros, vZeros);
|
||||
}
|
||||
vZeros = getVmm();
|
||||
uni_vpxor(vZeros, vZeros, vZeros);
|
||||
|
||||
if (one_of(jcp.interpolationMode, GridSampleInterpolationMode::BICUBIC, GridSampleInterpolationMode::BILINEAR)) {
|
||||
vOnesF = getVmm();
|
||||
@ -430,7 +428,7 @@ void GridSampleKernel<x64::avx512_core>::getTailCoordinates(const Vmm& vHCoord,
|
||||
cmp(rAux, 0);
|
||||
jle(lEnd, T_NEAR);
|
||||
|
||||
fillRestWorkMask(kTailMask, vAux, rAux);
|
||||
fillRestWorkMask(kTailMask, rAux);
|
||||
uni_vmovups((Vmm)vAux | kTailMask, ptr[regGrid]);
|
||||
vpermd(vAux, vGridPermMask, vAux);
|
||||
Xbyak::Ymm ymmAux(vAux.getIdx());
|
||||
@ -441,7 +439,7 @@ void GridSampleKernel<x64::avx512_core>::getTailCoordinates(const Vmm& vHCoord,
|
||||
}
|
||||
L(lRest);
|
||||
{
|
||||
fillRestWorkMask(kTailMask, vAux, rAux);
|
||||
fillRestWorkMask(kTailMask, rAux);
|
||||
uni_vmovups(vWCoord | kTailMask, ptr[regGrid]);
|
||||
vpermd(vWCoord, vGridPermMask, vWCoord);
|
||||
vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component
|
||||
@ -454,7 +452,7 @@ void GridSampleKernel<x64::avx512_core>::getTailCoordinates(const Vmm& vHCoord,
|
||||
|
||||
L(lEnd);
|
||||
|
||||
fillRestWorkMask(kTailMask, vAux, regWorkAmount);
|
||||
fillRestWorkMask(kTailMask, regWorkAmount);
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -672,14 +670,14 @@ void GridSampleKernel<isa>::denormalizeRawCoordinates(const Vmm& vWCoord, const
|
||||
|
||||
// AVX-512 specialization of the zero-padding bounds test along the width axis.
// Produces in kDst the lanes whose coordinate satisfies vZeros <= vCoord < vSrcWidthF.
//
// NOTE(review): this is a rendered diff without +/- markers, so the pair of compares
// appears twice: once with raw predicate immediates (0x1 / 0x2) and once with the
// named constants CMP_LT_PS / CMP_LE_PS. In the enum shown later on this page,
// CMP_LT_PS and CMP_LE_PS are the 2nd and 3rd enumerators after CMP_EQ_PS = 0,
// i.e. the same values 1 and 2. Presumably only the named pair exists in the
// resulting source file - confirm against the repository.
template <>
|
||||
void GridSampleKernel<x64::avx512_core>::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord) {
|
||||
// Upper-bound test: kDst = (vCoord < vSrcWidthF).
vcmpps(kDst, vCoord, vSrcWidthF, 0x1); // vCoord < vUpperBound
|
||||
// Lower-bound test issued under write-mask kDst, so only lanes that already
// passed the upper-bound test can remain set.
vcmpps(kDst | kDst, vZeros, vCoord, 0x2); // vCoord >= vZeros
|
||||
// Same upper-bound test, spelled with the named predicate.
vcmpps(kDst, vCoord, vSrcWidthF, CMP_LT_PS); // vCoord < vUpperBound
|
||||
// Same masked lower-bound test (vZeros <= vCoord), spelled with the named predicate.
vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros
|
||||
}
|
||||
|
||||
// AVX-512 specialization of the zero-padding bounds test along the height axis.
// Starts from the width mask kMaskW and leaves in kDst the lanes that are set in
// kMaskW and additionally satisfy vZeros <= vCoord < vSrcHeightF.
//
// NOTE(review): this is a rendered diff without +/- markers, so the pair of compares
// appears twice: once with raw predicate immediates (0x1 / 0x2) and once with the
// named constants CMP_LT_PS / CMP_LE_PS (values 1 and 2 per the enum shown later on
// this page). Presumably only the named pair exists in the resulting source file -
// confirm against the repository.
template <>
|
||||
void GridSampleKernel<x64::avx512_core>::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord, const Vmask& kMaskW) {
|
||||
// Upper-bound test under write-mask kMaskW: lanes rejected by the width test stay cleared.
vcmpps(kDst | kMaskW, vCoord, vSrcHeightF, 0x1); // vCoord < vUpperBound
|
||||
// Lower-bound test under write-mask kDst, narrowing the result further.
vcmpps(kDst | kDst, vZeros, vCoord, 0x2); // vCoord >= vZeros
|
||||
// Same upper-bound test, spelled with the named predicate.
vcmpps(kDst | kMaskW, vCoord, vSrcHeightF, CMP_LT_PS); // vCoord < vUpperBound
|
||||
// Same masked lower-bound test (vZeros <= vCoord), spelled with the named predicate.
vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -693,15 +691,15 @@ void GridSampleKernel<x64::sse41>::zerosPaddingW(const Vmask& kDst, const Vmm& v
|
||||
auto vAux = getVmm();
|
||||
|
||||
if (vSrcWidthF.isInitialized()) {
|
||||
uni_vcmpps(vAux, vWCoord, vSrcWidthF, 0x1); // vWCoord < vSrcWidthF
|
||||
uni_vcmpps(vAux, vWCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF
|
||||
} else {
|
||||
auto rAux = getReg64();
|
||||
mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]);
|
||||
uni_vcmpps(vAux, vWCoord, ptr[rAux], 0x1); // vWCoord < vSrcWidthF
|
||||
uni_vcmpps(vAux, vWCoord, ptr[rAux], CMP_LT_PS); // vWCoord < vSrcWidthF
|
||||
}
|
||||
|
||||
uni_vpxor(kDst, kDst, kDst);
|
||||
uni_vcmpps(kDst, kDst, vWCoord, 0x2); // vWCoord >= vZeros
|
||||
uni_vcmpps(kDst, kDst, vWCoord, CMP_LE_PS); // vWCoord >= vZeros
|
||||
uni_vpand(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF
|
||||
}
|
||||
|
||||
@ -710,17 +708,17 @@ void GridSampleKernel<x64::sse41>::zerosPaddingH(const Vmask& kDst, const Vmm& v
|
||||
auto vAux = getVmm();
|
||||
|
||||
if (vSrcHeightF.isInitialized()) {
|
||||
uni_vcmpps(vAux, vHCoord, vSrcHeightF, 0x1); // vHCoord < vSrcHeightF
|
||||
uni_vcmpps(vAux, vHCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF
|
||||
} else {
|
||||
auto rAux = getReg64();
|
||||
mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]);
|
||||
uni_vcmpps(vAux, vHCoord, ptr[rAux], 0x1); // vHCoord < vSrcHeightF
|
||||
uni_vcmpps(vAux, vHCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF
|
||||
}
|
||||
|
||||
uni_vmovups(kDst, kMaskW);
|
||||
uni_vpand(kDst, kDst, vAux); // vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF
|
||||
uni_vpxor(vAux, vAux, vAux);
|
||||
uni_vcmpps(vAux, vAux, vHCoord, 0x2); // vHCoord >= vZeros
|
||||
uni_vcmpps(vAux, vAux, vHCoord, CMP_LE_PS); // vHCoord >= vZeros
|
||||
uni_vpand(kDst, kDst, vAux); // vZeros <= vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF
|
||||
}
|
||||
|
||||
@ -744,14 +742,14 @@ void GridSampleKernel<isa>::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord)
|
||||
}
|
||||
|
||||
if (vSrcWidthF.isInitialized()) {
|
||||
uni_vcmpps(vAux, vCoord, vSrcWidthF, 0x1); // vWCoord < vSrcWidthF
|
||||
uni_vcmpps(vAux, vCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF
|
||||
} else {
|
||||
auto rAux = getReg64();
|
||||
mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]);
|
||||
uni_vcmpps(vAux, vCoord, ptr[rAux], 0x1); // vWCoord < vSrcWidthF
|
||||
uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vWCoord < vSrcWidthF
|
||||
}
|
||||
|
||||
uni_vcmpps(kDst, vZerosTmp, vCoord, 0x2); // vWCoord >= vZeros
|
||||
uni_vcmpps(kDst, vZerosTmp, vCoord, CMP_LE_PS); // vWCoord >= vZeros
|
||||
uni_vandps(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF
|
||||
}
|
||||
|
||||
@ -769,15 +767,15 @@ void GridSampleKernel<isa>::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord,
|
||||
}
|
||||
|
||||
if (vSrcHeightF.isInitialized()) {
|
||||
uni_vcmpps(vAux, vCoord, vSrcHeightF, 0x1); // vHCoord < vSrcHeightF
|
||||
uni_vcmpps(vAux, vCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF
|
||||
} else {
|
||||
auto rAux = getReg64();
|
||||
mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]);
|
||||
uni_vcmpps(vAux, vCoord, ptr[rAux], 0x1); // vHCoord < vSrcHeightF
|
||||
uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF
|
||||
}
|
||||
|
||||
uni_vandps(kDst, kMaskW, vAux);
|
||||
uni_vcmpps(vAux, vZerosTmp, vCoord, 0x2); // vHCoord >= vZeros
|
||||
uni_vcmpps(vAux, vZerosTmp, vCoord, CMP_LE_PS); // vHCoord >= vZeros
|
||||
uni_vandps(kDst, kDst, vAux);
|
||||
}
|
||||
|
||||
@ -831,7 +829,7 @@ void GridSampleKernel<isa>::borderPadding(const Vmm& vCoordDst, const Vmm& vCoor
|
||||
}
|
||||
}
|
||||
|
||||
uni_vcmpps(vAux, vCoordOrigin, vSub1F, 0x2); // vCoord <= vUpperBound
|
||||
uni_vcmpps(vAux, vCoordOrigin, vSub1F, CMP_LE_PS); // vCoord <= vUpperBound
|
||||
uni_vandps(vCoordDst, vCoordOrigin, vAux);
|
||||
uni_vandnps(vAux, vAux, vSub1F);
|
||||
uni_vaddps(vCoordDst, vCoordDst, vAux);
|
||||
@ -857,14 +855,20 @@ void GridSampleKernel<isa>::borderPadding(const Vmm& vCoordDst, const Vmm& vCoor
|
||||
template <>
|
||||
void GridSampleKernel<x64::avx512_core>::reflectionPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) {
|
||||
auto vAux = getVmm();
|
||||
auto kAux = getMask();
|
||||
const auto& vSrcDimMul2Sub1F = dim == coord::w ? vSrcWidthMul2Sub1F : vSrcHeightMul2Sub1F;
|
||||
|
||||
if (jcp.alignCorners) {
|
||||
// abs(x) % D21
|
||||
uni_vandps(vCoordDst, vCoordOrigin, vAbsMask); // abs(x)
|
||||
uni_vdivps(vAux, vCoordDst, vSrcDimMul2Sub1F);
|
||||
uni_vroundps(vAux, vAux, 0x3); // Truncation
|
||||
uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2Sub1F); // abs(x) % D21
|
||||
uni_vroundps(vAux, vAux, 0x3); // Truncation
|
||||
uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2Sub1F); // abs(x) % D21
|
||||
|
||||
// Check that the result does not exceed the divisor.
|
||||
vcmpps(kAux, vSrcDimMul2Sub1F, vCoordDst, CMP_LE_PS);
|
||||
uni_vmovups(vCoordDst | kAux, vZeros);
|
||||
vrangeps(vCoordDst, vCoordDst, vZeros, 0x1);
|
||||
} else {
|
||||
const auto& vSrcDimMul2F = dim == coord::w ? vSrcWidthMul2F : vSrcHeightMul2F;
|
||||
// (x % D2 + D2) % D2
|
||||
@ -877,12 +881,16 @@ void GridSampleKernel<x64::avx512_core>::reflectionPadding(const Vmm& vCoordDst,
|
||||
uni_vdivps(vAux, vCoordDst, vSrcDimMul2F);
|
||||
uni_vroundps(vAux, vAux, 0x3); // Truncation
|
||||
uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // (x % D2 + D2) % D2
|
||||
|
||||
// Check that the result does not exceed the divisor.
|
||||
vcmpps(kAux, vSrcDimMul2F, vCoordDst, CMP_LE_PS);
|
||||
uni_vmovups(vCoordDst | kAux, vZeros);
|
||||
vrangeps(vCoordDst, vCoordDst, vZeros, 0x1);
|
||||
}
|
||||
|
||||
auto kAux = getMask();
|
||||
uni_vsubps(vAux, vSrcDimMul2Sub1F, vCoordDst);
|
||||
vcmpps(kAux, dim == coord::w ? vSrcWidthF : vSrcHeightF, vCoordDst, 0x2); // vCoordDst >= vSrcDimF
|
||||
vmovups(vCoordDst | kAux, vAux);
|
||||
vcmpps(kAux, dim == coord::w ? vSrcWidthF : vSrcHeightF, vCoordDst, CMP_LE_PS); // vCoordDst >= vSrcDimF
|
||||
uni_vmovups(vCoordDst | kAux, vAux);
|
||||
}
|
||||
|
||||
template <x64::cpu_isa_t isa> // Works for AVX2, AVX, SSE41
|
||||
@ -925,6 +933,14 @@ void GridSampleKernel<isa>::reflectionPadding(const Vmm& vCoordDst, const Vmm& v
|
||||
uni_vdivps(vAux0, vCoordDst, vMul2Sub1);
|
||||
uni_vroundps(vAux0, vAux0, 0x3); // Truncation
|
||||
uni_vfnmadd231ps(vCoordDst, vAux0, vMul2Sub1); // abs(x) % D21
|
||||
|
||||
// Check that the result does not exceed the divisor.
|
||||
uni_vcmpps(vAux0, vCoordDst, vMul2Sub1, CMP_LT_PS);
|
||||
uni_vandps(vCoordDst, vCoordDst, vAux0);
|
||||
uni_vxorps(vAux0, vAux0, vAux0);
|
||||
uni_vcmpps(vAux0, vAux0, vCoordDst, CMP_LE_PS);
|
||||
uni_vandps(vCoordDst, vCoordDst, vAux0);
|
||||
|
||||
uni_vsubps(vAux0, vCoordDst, vMul2Sub1); // abs(x) % D21 - D21
|
||||
} else {
|
||||
// x' = (x % D2 + D2) % D2 - D21
|
||||
@ -956,6 +972,13 @@ void GridSampleKernel<isa>::reflectionPadding(const Vmm& vCoordDst, const Vmm& v
|
||||
uni_vroundps(vAux0, vAux0, 0x3); // Truncation
|
||||
uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // (x % D2 + D2) % D2
|
||||
|
||||
// Check that the result does not exceed the divisor.
|
||||
uni_vcmpps(vAux0, vCoordDst, vMul2, CMP_LT_PS);
|
||||
uni_vandps(vCoordDst, vCoordDst, vAux0);
|
||||
uni_vxorps(vAux0, vAux0, vAux0);
|
||||
uni_vcmpps(vAux0, vAux0, vCoordDst, CMP_LE_PS);
|
||||
uni_vandps(vCoordDst, vCoordDst, vAux0);
|
||||
|
||||
if (dim == coord::w) {
|
||||
if (vSrcWidthMul2Sub1F.isInitialized()) {
|
||||
uni_vsubps(vAux0, vCoordDst, vSrcWidthMul2Sub1F);
|
||||
@ -975,17 +998,17 @@ void GridSampleKernel<isa>::reflectionPadding(const Vmm& vCoordDst, const Vmm& v
|
||||
|
||||
if (dim == coord::w) {
|
||||
if (vSrcWidthF.isInitialized()) {
|
||||
uni_vcmpps(vAux1, vCoordDst, vSrcWidthF, 0x1); // vCoordDst < vUpperBound
|
||||
uni_vcmpps(vAux1, vCoordDst, vSrcWidthF, CMP_LT_PS); // vCoordDst < vUpperBound
|
||||
} else {
|
||||
mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]);
|
||||
uni_vcmpps(vAux1, vCoordDst, ptr[rAux], 0x1); // vCoordDst < vUpperBound
|
||||
uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound
|
||||
}
|
||||
} else {
|
||||
if (vSrcHeightF.isInitialized()) {
|
||||
uni_vcmpps(vAux1, vCoordDst, vSrcHeightF, 0x1); // vCoordDst < vUpperBound
|
||||
uni_vcmpps(vAux1, vCoordDst, vSrcHeightF, CMP_LT_PS); // vCoordDst < vUpperBound
|
||||
} else {
|
||||
mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]);
|
||||
uni_vcmpps(vAux1, vCoordDst, ptr[rAux], 0x1); // vCoordDst < vUpperBound
|
||||
uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound
|
||||
}
|
||||
}
|
||||
|
||||
@ -1246,23 +1269,21 @@ void GridSampleKernel<isa>::nearestInterpolation(const Vmm& vWCoord, const Vmm&
|
||||
|
||||
template <>
|
||||
void GridSampleKernel<x64::avx512_core>::bilinearInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) {
|
||||
auto vDX = getVmm();
|
||||
auto vDY = getVmm();
|
||||
const auto& shift00 = vWCoord;
|
||||
const auto& shift01 = vHCoord;
|
||||
auto shift10 = getVmm();
|
||||
auto shift11 = getVmm();
|
||||
auto vAux = getVmm();
|
||||
const auto& vDX = vWCoord;
|
||||
const auto& vDY = vHCoord;
|
||||
auto shift00 = getVmm();
|
||||
auto shift01 = getVmm();
|
||||
auto shift10 = getVmm();
|
||||
auto shift11 = getVmm();
|
||||
auto vAux = getVmm();
|
||||
RegistersPool::Reg<Vmask> kMask00, kMask01, kMask10, kMask11;
|
||||
|
||||
uni_vmovups(vDX, vWCoord);
|
||||
uni_vmovups(vDY, vHCoord);
|
||||
uni_vroundps(vWCoord, vWCoord, 0x1); // Round floor
|
||||
uni_vroundps(vHCoord, vHCoord, 0x1); // Round floor
|
||||
uni_vsubps(vDX, vDX, vWCoord);
|
||||
uni_vsubps(vDY, vDY, vHCoord);
|
||||
uni_vaddps(shift10, vWCoord, vOnesF);
|
||||
uni_vaddps(shift11, vHCoord, vOnesF);
|
||||
uni_vroundps(shift00, vWCoord, 0x1); // Round floor
|
||||
uni_vroundps(shift01, vHCoord, 0x1); // Round floor
|
||||
uni_vsubps(vDX, vWCoord, shift00);
|
||||
uni_vsubps(vDY, vHCoord, shift01);
|
||||
uni_vaddps(shift10, shift00, vOnesF);
|
||||
uni_vaddps(shift11, shift01, vOnesF);
|
||||
|
||||
bool useMask = false, zeroFill = false;
|
||||
if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) {
|
||||
@ -1272,31 +1293,31 @@ void GridSampleKernel<x64::avx512_core>::bilinearInterpolation(const Vmm& vWCoor
|
||||
kMask10 = getMask();
|
||||
kMask11 = getMask();
|
||||
|
||||
zerosPadding(kMask00, vHCoord, vWCoord); // (y; x)
|
||||
zerosPadding(kMask01, vHCoord, shift10); // (y; x + 1)
|
||||
zerosPadding(kMask00, shift01, shift00); // (y; x)
|
||||
zerosPadding(kMask01, shift01, shift10); // (y; x + 1)
|
||||
zerosPadding(kMask11, shift11, shift10); // (y + 1; x + 1)
|
||||
zerosPadding(kMask10, shift11, vWCoord); // (y + 1; x)
|
||||
zerosPadding(kMask10, shift11, shift00); // (y + 1; x)
|
||||
|
||||
hwShiftPs2dq(shift00, vHCoord, vWCoord, vSrcWidthF);
|
||||
hwShiftPs2dq(shift00, shift01, shift00, vSrcWidthF);
|
||||
uni_vpaddd(shift01, shift00, vDataTypeSizeB);
|
||||
uni_vpaddd(shift10, shift00, vSrcWidthB); // shift11??
|
||||
uni_vpaddd(shift11, shift10, vDataTypeSizeB); // sub??
|
||||
uni_vpaddd(shift10, shift00, vSrcWidthB);
|
||||
uni_vpaddd(shift11, shift10, vDataTypeSizeB);
|
||||
} else if (jcp.paddingMode == GridSamplePaddingMode::BORDER) {
|
||||
borderPadding(vWCoord, vWCoord, coord::w);
|
||||
borderPadding(vHCoord, vHCoord, coord::h);
|
||||
borderPadding(shift00, shift00, coord::w);
|
||||
borderPadding(shift01, shift01, coord::h);
|
||||
borderPadding(shift10, shift10, coord::w);
|
||||
borderPadding(shift11, shift11, coord::h);
|
||||
} else if (jcp.paddingMode == GridSamplePaddingMode::REFLECTION) {
|
||||
reflectionPadding(vWCoord, vWCoord, coord::w);
|
||||
reflectionPadding(vHCoord, vHCoord, coord::h);
|
||||
reflectionPadding(shift00, shift00, coord::w);
|
||||
reflectionPadding(shift01, shift01, coord::h);
|
||||
reflectionPadding(shift10, shift10, coord::w);
|
||||
reflectionPadding(shift11, shift11, coord::h);
|
||||
}
|
||||
if (jcp.paddingMode == GridSamplePaddingMode::BORDER || jcp.paddingMode == GridSamplePaddingMode::REFLECTION) {
|
||||
// W * y + x
|
||||
hwShiftPs2dq(vAux, shift11, vWCoord, vSrcWidthF);
|
||||
hwShiftPs2dq(vWCoord, vHCoord, vWCoord, vSrcWidthF);
|
||||
hwShiftPs2dq(vHCoord, vHCoord, shift10, vSrcWidthF);
|
||||
hwShiftPs2dq(vAux, shift11, shift00, vSrcWidthF);
|
||||
hwShiftPs2dq(shift00, shift01, shift00, vSrcWidthF);
|
||||
hwShiftPs2dq(shift01, shift01, shift10, vSrcWidthF);
|
||||
hwShiftPs2dq(shift11, shift11, shift10, vSrcWidthF);
|
||||
uni_vmovups(shift10, vAux);
|
||||
}
|
||||
@ -1658,8 +1679,8 @@ void GridSampleKernel<x64::avx512_core>::bicubicInterpolation(const Vmm& vWCoord
|
||||
// (y - 1 + h; x - 1)
|
||||
if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) {
|
||||
Xbyak::Opmask maskH = kMaskH;
|
||||
vcmpps(kMaskH, vHCoord, vSrcHeightF, 0x1);
|
||||
vcmpps(maskH | maskH, vZeros, vHCoord, 0x2);
|
||||
vcmpps(kMaskH, vHCoord, vSrcHeightF, CMP_LT_PS);
|
||||
vcmpps(maskH | maskH, vZeros, vHCoord, CMP_LE_PS);
|
||||
kandw(kAuxMask, kMaskH, wMasks[0]);
|
||||
uni_vmulps(vSrcShift0, vHCoord, vSrcWidthF);
|
||||
uni_vmovups(vWCoord, vWLeft);
|
||||
|
@ -286,26 +286,80 @@ void JitKernelBase::uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &
|
||||
}
|
||||
|
||||
// Emits code that fills the AVX-512 opmask dstMask according to the number of
// remaining work items held in rWorkRest.
//
// NOTE(review): this is a rendered diff without +/- markers; two versions of the
// function are interleaved below (two parameter-list endings, two declarations of
// rOnes). The call sites on this page change from
// fillRestWorkMask(kTailMask, vAux, rAux) to fillRestWorkMask(kTailMask, rAux),
// which suggests the zAux signature together with the lKmov / shrx sequence is the
// removed version and the two-parameter signature with the shlx / not_ / kmovq
// sequence is the added one - confirm against the repository.
void JitKernelBase::fillRestWorkMask(const Xbyak::Opmask& dstMask,
|
||||
// Three-parameter variant: zAux is not referenced by any statement visible in this block.
const Xbyak::Zmm& zAux,
|
||||
const Xbyak::Reg64& rWorkRest) {
|
||||
// Variant A: two pooled 64-bit registers, viewed below as 32-bit rShift / rOnes.
auto rAux0 = getReg64();
|
||||
auto rAux1 = getReg64();
|
||||
Xbyak::Label lKmov;
|
||||
Xbyak::Reg32 rOnes(rAux1.getIdx());
|
||||
// Element size is fixed at 4 bytes here; elPerVec is the AVX-512 vector length divided by it.
const uint64_t typeSize = 4;
|
||||
const uint64_t elPerVec = x64::cpu_isa_traits<x64::avx512_core>::vlen / typeSize;
|
||||
// Two-parameter variant of the signature (no auxiliary vector register).
const Xbyak::Reg64& rWorkRest) {
|
||||
// Variant B: a single pooled 64-bit register holds the bit pattern.
auto rOnes = getReg64();
|
||||
|
||||
// Variant A: start from 0x0000FFFF and, when rWorkRest < elPerVec, shift right by
// (elPerVec - rWorkRest); otherwise jump straight to lKmov.
mov(rOnes, 0x0000FFFF);
|
||||
cmp(rWorkRest, elPerVec);
|
||||
jge(lKmov);
|
||||
{
|
||||
Xbyak::Reg32 rShift(rAux0.getIdx());
|
||||
mov(rShift, elPerVec);
|
||||
// NOTE(review): rShift is a 32-bit view while rWorkRest is a 64-bit register.
sub(rShift, rWorkRest);
|
||||
shrx(rOnes, rOnes, rShift);
|
||||
// Variant B: all ones shifted left by rWorkRest and then inverted, i.e. the low
// rWorkRest bits end up set; the 64-bit value is then moved into dstMask.
mov(rOnes, 0xFFFFFFFFFFFFFFFF);
|
||||
// NOTE(review): per the Intel SDM, shlx with a 64-bit operand uses only the low
// 6 bits of the count, so rWorkRest >= 64 would not produce an all-ones mask -
// confirm callers keep rWorkRest below 64.
shlx(rOnes, rOnes, rWorkRest);
|
||||
not_(rOnes);
|
||||
kmovq(dstMask, rOnes);
|
||||
}
|
||||
|
||||
// Emits code that builds a per-element mask in an Xmm register: for each element
// index i below the count in rWorkRest, element i of xmmDstMask is overwritten with
// all-ones bits.
//
// xmmDstMask - destination vector register receiving the mask.
// rWorkRest  - register holding the number of remaining elements at run time.
// typeSize   - element size in bytes; must be 1, 2, 4 or 8, otherwise IE_THROW() is raised
//              while the code is being generated.
//
// NOTE(review): unlike the Ymm overload (which issues uni_vpxor on its destination),
// this overload does not clear xmmDstMask before inserting, so elements at or beyond
// rWorkRest keep whatever the register held - confirm callers zero it beforehand.
void JitKernelBase::fillRestWorkMask(const Xbyak::Xmm& xmmDstMask,
|
||||
const Xbyak::Reg64& rWorkRest,
|
||||
const uint64_t typeSize) {
|
||||
if (!one_of(typeSize, 1, 2, 4, 8)) {
|
||||
IE_THROW() << "Could not fill data with type size " << typeSize;
|
||||
}
|
||||
// NOTE(review): the next two statements use lKmov, dstMask and rOnes, none of which
// is declared in this overload. They look like removed rows of the Opmask overload
// that the diff rendering interleaved here - confirm against the repository.
L(lKmov);
|
||||
kmovw(dstMask, rOnes);
|
||||
Xbyak::Label lEnd;
|
||||
// r64Ones is the 64-bit view of the same pooled register as r32Ones (same index).
auto r32Ones = getReg32();
|
||||
Xbyak::Reg64 r64Ones(r32Ones.getIdx());
|
||||
// Number of elements of the given size in one sse41-length vector.
auto elPerVec = x64::cpu_isa_traits<x64::sse41>::vlen / typeSize;
|
||||
|
||||
// Source value for the inserts: every bit set.
mov(r64Ones, 0xFFFFFFFFFFFFFFFF);
|
||||
// The loop is unrolled at generation time: one compare/branch/insert group per element.
for (uint8_t i = 0; i < elPerVec; i++) {
|
||||
// Run-time exit as soon as rWorkRest <= i (signed comparison).
cmp(rWorkRest, i);
|
||||
jle(lEnd, T_NEAR);
|
||||
|
||||
// Pick the insert instruction matching the element width.
if (typeSize == 1) {
|
||||
pinsrb(xmmDstMask, r32Ones, i);
|
||||
} else if (typeSize == 2) {
|
||||
pinsrw(xmmDstMask, r32Ones, i);
|
||||
} else if (typeSize == 4) {
|
||||
pinsrd(xmmDstMask, r32Ones, i);
|
||||
} else if (typeSize == 8) {
|
||||
pinsrq(xmmDstMask, r64Ones, i);
|
||||
}
|
||||
}
|
||||
L(lEnd);
|
||||
}
|
||||
|
||||
// Emits code that builds a per-element mask in a Ymm register: the register is
// zeroed, then all-ones elements are inserted for indices below the count in
// rWorkRest. Inserts always target the Xmm view of the register; vperm2f128 is used
// to move data between the two halves.
//
// ymmDstMask - destination vector register receiving the mask.
// rWorkRest  - register holding the number of remaining elements at run time.
// typeSize   - element size in bytes; must be 1, 2, 4 or 8, otherwise IE_THROW() is raised
//              while the code is being generated.
void JitKernelBase::fillRestWorkMask(const Xbyak::Ymm& ymmDstMask,
|
||||
const Xbyak::Reg64& rWorkRest,
|
||||
const uint64_t typeSize) {
|
||||
if (!one_of(typeSize, 1, 2, 4, 8)) {
|
||||
IE_THROW() << "Could not fill data with type size " << typeSize;
|
||||
}
|
||||
Xbyak::Label lEnd;
|
||||
// Element count is derived from the sse41 vector length, not the Ymm length -
// presumably the number of elements in one half of the register; the outer loop
// below runs twice to cover both halves.
auto elPerVec = x64::cpu_isa_traits<x64::sse41>::vlen / typeSize;
|
||||
// r64Ones is the 64-bit view of the same pooled register as r32Ones (same index).
auto r32Ones = getReg32();
|
||||
Xbyak::Reg64 r64Ones(r32Ones.getIdx());
|
||||
// Xmm view with the same register index as ymmDstMask; target of the pinsr* inserts.
Xbyak::Xmm xmmDstMask(ymmDstMask.getIdx());
|
||||
|
||||
mov(r64Ones, 0xFFFFFFFFFFFFFFFF);
|
||||
// Clear the whole destination so elements that are never inserted read as zero.
uni_vpxor(ymmDstMask, ymmDstMask, ymmDstMask);
|
||||
// Both loops are unrolled at generation time; i selects the pass, j the element within it.
for (uint8_t i = 0; i < 2; i++) {
|
||||
// A fresh label per pass; bound just before this pass's vperm2f128.
Xbyak::Label lPerm;
|
||||
for (uint8_t j = 0; j < elPerVec; j++) {
|
||||
// Run-time exit once rWorkRest <= i * elPerVec + j: the first pass leaves via
// lEnd, the second pass via its lPerm so the permute below still executes.
cmp(rWorkRest, i * elPerVec + j);
|
||||
jle(i == 0 ? lEnd : lPerm, T_NEAR);
|
||||
|
||||
// Pick the insert instruction matching the element width.
if (typeSize == 1) {
|
||||
pinsrb(xmmDstMask, r32Ones, j);
|
||||
} else if (typeSize == 2) {
|
||||
pinsrw(xmmDstMask, r32Ones, j);
|
||||
} else if (typeSize == 4) {
|
||||
pinsrd(xmmDstMask, r32Ones, j);
|
||||
} else if (typeSize == 8) {
|
||||
pinsrq(xmmDstMask, r64Ones, j);
|
||||
}
|
||||
}
|
||||
// Exactly elPerVec elements requested: finish without permuting.
cmp(rWorkRest, elPerVec);
|
||||
je(lEnd, T_NEAR);
|
||||
L(lPerm);
|
||||
// With identical sources and immediate 0x1 this exchanges the two 128-bit halves
// (see the Intel SDM entry for VPERM2F128).
vperm2f128(ymmDstMask, ymmDstMask, ymmDstMask, 0x1);
|
||||
}
|
||||
L(lEnd);
|
||||
}
|
||||
|
||||
void JitKernelBase::load(const Xbyak::Xmm& vDst,
|
||||
|
@ -11,6 +11,7 @@ namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
// Shorthand macros: each expands to the construction of a RegistersPool::Reg<T>
// handle from the `registersPool` object visible at the point of use.
// 64-bit general-purpose register handle.
#define getReg64() RegistersPool::Reg<Xbyak::Reg64>(registersPool)
|
||||
// 32-bit general-purpose register handle.
#define getReg32() RegistersPool::Reg<Xbyak::Reg32>(registersPool)
|
||||
// Vector register handle; Vmm must be a type name in scope where the macro is expanded.
#define getVmm() RegistersPool::Reg<Vmm>(registersPool)
|
||||
// Mask register handle; Vmask must be a type name in scope where the macro is expanded.
#define getMask() RegistersPool::Reg<Vmask>(registersPool)
|
||||
|
||||
@ -84,9 +85,16 @@ public:
|
||||
const bool zeroFill = false);
|
||||
|
||||
void fillRestWorkMask(const Xbyak::Opmask& kDstMask,
|
||||
const Xbyak::Zmm& zAux,
|
||||
const Xbyak::Reg64& rWorkRest);
|
||||
|
||||
void fillRestWorkMask(const Xbyak::Xmm& ymmDstMask,
|
||||
const Xbyak::Reg64& rWorkRest,
|
||||
const uint64_t typeSize = 4);
|
||||
|
||||
void fillRestWorkMask(const Xbyak::Ymm& ymmDstMask,
|
||||
const Xbyak::Reg64& rWorkRest,
|
||||
const uint64_t typeSize = 4);
|
||||
|
||||
void load(const Xbyak::Xmm& vDst,
|
||||
const Xbyak::Address& srcAddr,
|
||||
const Xbyak::Reg64& rLoadNum,
|
||||
@ -133,6 +141,18 @@ protected:
|
||||
}
|
||||
|
||||
RegistersPool::Ptr registersPool;
|
||||
|
||||
// Named values for the predicate immediate passed to the packed single-precision
// compare emitters (vcmpps / uni_vcmpps) used by the kernels.
// Only the first enumerator has an explicit value; the rest follow implicitly,
// so the constants take the values 0 through 7 in declaration order.
enum {
|
||||
// Comparison predicate operand (immediate byte) for single-precision floating-point values.
|
||||
CMP_EQ_PS = 0, // Equal (ordered, non-signaling)
|
||||
CMP_LT_PS, // Less-than (ordered, signaling)
|
||||
CMP_LE_PS, // Less-than-or-equal (ordered, signaling)
|
||||
CMP_UNORD_PS, // Unordered (non-signaling)
|
||||
CMP_NEQ_PS, // Not-equal (unordered, non-signaling)
|
||||
CMP_NLT_PS, // Not-less-than (unordered, signaling)
|
||||
CMP_NLE_PS, // Not-less-than-or-equal (unordered, signaling)
|
||||
CMP_ORD_PS // Ordered (non-signaling)
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
|
@ -184,9 +184,6 @@ std::vector<std::string> disabledTestPatterns() {
|
||||
// The kernel does not have such garbage. The diff 0.000000745 is taken into account in calculations and affects further type conversion.
|
||||
// Reorder->GridSample->Reorder also does not work here. Potential fix is to use nearest conversion instead of truncation.
|
||||
R"(.*GridSampleLayerTestCPU.*(BILINEAR|BICUBIC).*(i32|i8).*)",
|
||||
// 94989. BF16 Reference produces different results.
|
||||
// GridSample regression on bf16 data.
|
||||
R"(.*GridSampleLayerTestCPU.*(BILINEAR|BICUBIC).*bf16.*)",
|
||||
// // Issue: 95915
|
||||
R"(smoke_dynamic/AUGRUCellCPUTest.CompareWithRefs/IS=\(\[\?\.1\]_\[\?\.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT
|
||||
R"(smoke_dynamic/GRUCellCPUTest.CompareWithRefs/IS=\(\[\?.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT
|
||||
|
@ -94,6 +94,9 @@ protected:
|
||||
auto execType = dataPrecision == ov::element::i32 ? ov::element::i32 : ov::element::f32;
|
||||
selectedType = makeSelectedTypeStr(selectedType, execType);
|
||||
}
|
||||
if (gridPrecision == ov::element::bf16) {
|
||||
rel_threshold = 0.01f;
|
||||
}
|
||||
|
||||
auto params = ngraph::builder::makeDynamicParams({dataPrecision, gridPrecision}, inputDynamicShapes);
|
||||
params[0]->set_friendly_name("data");
|
||||
@ -272,12 +275,35 @@ INSTANTIATE_TEST_SUITE_P(smoke_static, GridSampleLayerTestCPU,
|
||||
::testing::ValuesIn(interpolateMode),
|
||||
::testing::ValuesIn(paddingMode),
|
||||
::testing::ValuesIn(alignCorners),
|
||||
::testing::ValuesIn({ElementType::f32, ElementType::bf16, ElementType::i32, ElementType::i8}),
|
||||
::testing::ValuesIn({ElementType::f32, ElementType::i32}),
|
||||
::testing::ValuesIn({ElementType::f32}),
|
||||
::testing::ValuesIn(getCPUInfo()),
|
||||
::testing::Values(additionalConfig[0])),
|
||||
GridSampleLayerTestCPU::getTestCaseName);
|
||||
|
||||
// Test instantiation "nightly_static_1": the cartesian product of the static shapes,
// every interpolation mode, padding mode and align-corners setting, with the two
// element-type lists below, all CPU-info entries and the first additional config.
INSTANTIATE_TEST_SUITE_P(nightly_static_1, GridSampleLayerTestCPU,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(getStaticShapes()),
|
||||
::testing::ValuesIn(interpolateMode),
|
||||
::testing::ValuesIn(paddingMode),
|
||||
::testing::ValuesIn(alignCorners),
|
||||
// Presumably the data precision (first of the two element-type slots) - confirm
// against the test's parameter tuple.
::testing::ValuesIn({ElementType::bf16, ElementType::i8}),
|
||||
// Presumably the grid precision (second element-type slot) - confirm likewise.
::testing::ValuesIn({ElementType::f32, ElementType::bf16}),
|
||||
::testing::ValuesIn(getCPUInfo()),
|
||||
::testing::Values(additionalConfig[0])),
|
||||
// Generates the readable test-case name.
GridSampleLayerTestCPU::getTestCaseName);
|
||||
|
||||
// Test instantiation "nightly_static_2": the cartesian product of the static shapes,
// every interpolation mode, padding mode and align-corners setting, with f32 in the
// first element-type slot and bf16 in the second, all CPU-info entries and the first
// additional config.
INSTANTIATE_TEST_SUITE_P(nightly_static_2, GridSampleLayerTestCPU,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(getStaticShapes()),
|
||||
::testing::ValuesIn(interpolateMode),
|
||||
::testing::ValuesIn(paddingMode),
|
||||
::testing::ValuesIn(alignCorners),
|
||||
// Presumably the data precision (first of the two element-type slots) - confirm
// against the test's parameter tuple.
::testing::ValuesIn({ElementType::f32}),
|
||||
// Presumably the grid precision (second element-type slot) - confirm likewise.
::testing::ValuesIn({ElementType::bf16}),
|
||||
::testing::ValuesIn(getCPUInfo()),
|
||||
::testing::Values(additionalConfig[0])),
|
||||
// Generates the readable test-case name.
GridSampleLayerTestCPU::getTestCaseName);
|
||||
|
||||
const std::vector<std::vector<InputShape>> dynamicInSapes = {
|
||||
{ { { ov::Dimension(1, 15), -1, -1, -1 }, // Dynamic shape 0
|
||||
|
Loading…
Reference in New Issue
Block a user