[CPU] Fix of invalid read in DefConv (#10481)
This commit is contained in:
parent
bdee939fe0
commit
14d11a8998
@ -115,6 +115,11 @@ private:
|
|||||||
|
|
||||||
Xbyak::Label l_table;
|
Xbyak::Label l_table;
|
||||||
|
|
||||||
|
// Emits a test of register x1 against itself, followed by a jump-if-zero to
// nullifyLabel.
//
// x1           - register to test. The call sites visible in this diff pass a
//                register that was just filled by uni_vbroadcastss from
//                aux_reg_sampled_wei (under the "check zero markers" comment).
// nullifyLabel - jump target taken when the test sets ZF. At the visible call
//                sites this label sits in front of a uni_vpxor that clears the
//                value register, so the input load and multiply emitted right
//                after the call are bypassed when the jump is taken.
//
// NOTE(review): uni_vtestps is defined outside this view. If it lowers to
// VTESTPS on AVX targets, only the sign bits of the operands are examined, so
// ZF would be set for every non-negative value, not just for zero. PTEST
// examines all bits. Confirm which instruction is emitted and that it matches
// the intended "weight is zero" check.
// NOTE(review): jz is emitted without T_NEAR, while the surrounding jmp/jl
// calls pass T_NEAR. Presumably the target is always within short-jump range;
// confirm.
inline void checkZeroWei(const Xbyak::Xmm &x1, Label &nullifyLabel) {
|
||||||
|
uni_vtestps(x1, x1);  // sets flags from x1 tested against itself
|
||||||
|
jz(nullifyLabel);  // taken when the preceding test left ZF set
|
||||||
|
}
|
||||||
|
|
||||||
void ow_loop() {
|
void ow_loop() {
|
||||||
Label ow_loop_main;
|
Label ow_loop_main;
|
||||||
Label ow_tail;
|
Label ow_tail;
|
||||||
@ -280,6 +285,22 @@ private:
|
|||||||
Label ic_loop_main;
|
Label ic_loop_main;
|
||||||
Label ic_loop_tail;
|
Label ic_loop_tail;
|
||||||
Label loop_end;
|
Label loop_end;
|
||||||
|
Label nullify_v1;
|
||||||
|
Label nullify_v2;
|
||||||
|
Label nullify_v3;
|
||||||
|
Label nullify_v4;
|
||||||
|
Label nullify_v1_end;
|
||||||
|
Label nullify_v2_end;
|
||||||
|
Label nullify_v3_end;
|
||||||
|
Label nullify_v4_end;
|
||||||
|
Label nullify_v1_tail;
|
||||||
|
Label nullify_v2_tail;
|
||||||
|
Label nullify_v3_tail;
|
||||||
|
Label nullify_v4_tail;
|
||||||
|
Label nullify_v1_end_tail;
|
||||||
|
Label nullify_v2_end_tail;
|
||||||
|
Label nullify_v3_end_tail;
|
||||||
|
Label nullify_v4_end_tail;
|
||||||
|
|
||||||
mov(aux2_reg_input, aux_reg_input);
|
mov(aux2_reg_input, aux_reg_input);
|
||||||
add(aux2_reg_input, (ow * jcp_.stride_w * jcp_.ic) * jcp_.typesize_in);
|
add(aux2_reg_input, (ow * jcp_.stride_w * jcp_.ic) * jcp_.typesize_in);
|
||||||
@ -337,35 +358,69 @@ private:
|
|||||||
cmp(reg_ic_iter, simd_w);
|
cmp(reg_ic_iter, simd_w);
|
||||||
jl(ic_loop_tail, T_NEAR);
|
jl(ic_loop_tail, T_NEAR);
|
||||||
|
|
||||||
|
// check zero markers
|
||||||
|
uni_vbroadcastss(xmm_v1, dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]);
|
||||||
|
uni_vbroadcastss(xmm_v2, dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]);
|
||||||
|
uni_vbroadcastss(xmm_v3, dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]);
|
||||||
|
uni_vbroadcastss(xmm_v4, dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]);
|
||||||
|
|
||||||
size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic;
|
size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic;
|
||||||
|
|
||||||
uni_vpmovsxdq(xmm_v1_off, xmm_v1_off);
|
uni_vpmovsxdq(xmm_v1_off, xmm_v1_off);
|
||||||
uni_vmovq(reg_tmp_64, xmm_v1_off);
|
uni_vmovq(reg_tmp_64, xmm_v1_off);
|
||||||
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
||||||
add(reg_tmp_64, aux2_reg_input);
|
add(reg_tmp_64, aux2_reg_input);
|
||||||
|
checkZeroWei(xmm_v1, nullify_v1);
|
||||||
uni_vmovups(vmm_v1, ptr[reg_tmp_64]);
|
uni_vmovups(vmm_v1, ptr[reg_tmp_64]);
|
||||||
uni_vmulps(vmm_v1, vmm_v1, vmm_w1);
|
uni_vmulps(vmm_v1, vmm_v1, vmm_w1);
|
||||||
|
jmp(nullify_v1_end, T_NEAR);
|
||||||
|
L(nullify_v1);
|
||||||
|
{
|
||||||
|
uni_vpxor(vmm_v1, vmm_v1, vmm_v1);
|
||||||
|
}
|
||||||
|
L(nullify_v1_end);
|
||||||
|
|
||||||
uni_vpmovsxdq(xmm_v2_off, xmm_v2_off);
|
uni_vpmovsxdq(xmm_v2_off, xmm_v2_off);
|
||||||
uni_vmovq(reg_tmp_64, xmm_v2_off);
|
uni_vmovq(reg_tmp_64, xmm_v2_off);
|
||||||
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
||||||
add(reg_tmp_64, aux2_reg_input);
|
add(reg_tmp_64, aux2_reg_input);
|
||||||
|
checkZeroWei(xmm_v2, nullify_v2);
|
||||||
uni_vmovups(vmm_v2, ptr[reg_tmp_64]);
|
uni_vmovups(vmm_v2, ptr[reg_tmp_64]);
|
||||||
uni_vmulps(vmm_v2, vmm_v2, vmm_w2);
|
uni_vmulps(vmm_v2, vmm_v2, vmm_w2);
|
||||||
|
jmp(nullify_v2_end, T_NEAR);
|
||||||
|
L(nullify_v2);
|
||||||
|
{
|
||||||
|
uni_vpxor(vmm_v2, vmm_v2, vmm_v2);
|
||||||
|
}
|
||||||
|
L(nullify_v2_end);
|
||||||
|
|
||||||
uni_vpmovsxdq(xmm_v3_off, xmm_v3_off);
|
uni_vpmovsxdq(xmm_v3_off, xmm_v3_off);
|
||||||
uni_vmovq(reg_tmp_64, xmm_v3_off);
|
uni_vmovq(reg_tmp_64, xmm_v3_off);
|
||||||
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
||||||
add(reg_tmp_64, aux2_reg_input);
|
add(reg_tmp_64, aux2_reg_input);
|
||||||
|
checkZeroWei(xmm_v3, nullify_v3);
|
||||||
uni_vmovups(vmm_v3, ptr[reg_tmp_64]);
|
uni_vmovups(vmm_v3, ptr[reg_tmp_64]);
|
||||||
uni_vmulps(vmm_v3, vmm_v3, vmm_w3);
|
uni_vmulps(vmm_v3, vmm_v3, vmm_w3);
|
||||||
|
jmp(nullify_v3_end, T_NEAR);
|
||||||
|
L(nullify_v3);
|
||||||
|
{
|
||||||
|
uni_vpxor(vmm_v3, vmm_v3, vmm_v3);
|
||||||
|
}
|
||||||
|
L(nullify_v3_end);
|
||||||
|
|
||||||
uni_vpmovsxdq(xmm_v4_off, xmm_v4_off);
|
uni_vpmovsxdq(xmm_v4_off, xmm_v4_off);
|
||||||
uni_vmovq(reg_tmp_64, xmm_v4_off);
|
uni_vmovq(reg_tmp_64, xmm_v4_off);
|
||||||
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
||||||
add(reg_tmp_64, aux2_reg_input);
|
add(reg_tmp_64, aux2_reg_input);
|
||||||
|
checkZeroWei(xmm_v4, nullify_v4);
|
||||||
uni_vmovups(vmm_v4, ptr[reg_tmp_64]);
|
uni_vmovups(vmm_v4, ptr[reg_tmp_64]);
|
||||||
uni_vmulps(vmm_v4, vmm_v4, vmm_w4);
|
uni_vmulps(vmm_v4, vmm_v4, vmm_w4);
|
||||||
|
jmp(nullify_v4_end, T_NEAR);
|
||||||
|
L(nullify_v4);
|
||||||
|
{
|
||||||
|
uni_vpxor(vmm_v4, vmm_v4, vmm_v4);
|
||||||
|
}
|
||||||
|
L(nullify_v4_end);
|
||||||
|
|
||||||
uni_vaddps(vmm_v1, vmm_v1, vmm_v2);
|
uni_vaddps(vmm_v1, vmm_v1, vmm_v2);
|
||||||
uni_vaddps(vmm_v1, vmm_v1, vmm_v3);
|
uni_vaddps(vmm_v1, vmm_v1, vmm_v3);
|
||||||
@ -383,34 +438,68 @@ private:
|
|||||||
cmp(reg_ic_iter, 1);
|
cmp(reg_ic_iter, 1);
|
||||||
jl(loop_end, T_NEAR);
|
jl(loop_end, T_NEAR);
|
||||||
|
|
||||||
|
// check zero markers
|
||||||
|
uni_vbroadcastss(xmm_v1, dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]);
|
||||||
|
uni_vbroadcastss(xmm_v2, dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]);
|
||||||
|
uni_vbroadcastss(xmm_v3, dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]);
|
||||||
|
uni_vbroadcastss(xmm_v4, dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]);
|
||||||
|
|
||||||
size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic;
|
size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic;
|
||||||
uni_vpmovsxdq(xmm_v1_off, xmm_v1_off);
|
uni_vpmovsxdq(xmm_v1_off, xmm_v1_off);
|
||||||
uni_vmovq(reg_tmp_64, xmm_v1_off);
|
uni_vmovq(reg_tmp_64, xmm_v1_off);
|
||||||
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
||||||
add(reg_tmp_64, aux2_reg_input);
|
add(reg_tmp_64, aux2_reg_input);
|
||||||
|
checkZeroWei(xmm_v1, nullify_v1_tail);
|
||||||
uni_vmovss(xmm_v1, ptr[reg_tmp_64]);
|
uni_vmovss(xmm_v1, ptr[reg_tmp_64]);
|
||||||
uni_vmulss(xmm_v1, xmm_v1, xmm_w1);
|
uni_vmulss(xmm_v1, xmm_v1, xmm_w1);
|
||||||
|
jmp(nullify_v1_end_tail, T_NEAR);
|
||||||
|
L(nullify_v1_tail);
|
||||||
|
{
|
||||||
|
uni_vpxor(xmm_v1, xmm_v1, xmm_v1);
|
||||||
|
}
|
||||||
|
L(nullify_v1_end_tail);
|
||||||
|
|
||||||
uni_vpmovsxdq(xmm_v2_off, xmm_v2_off);
|
uni_vpmovsxdq(xmm_v2_off, xmm_v2_off);
|
||||||
uni_vmovq(reg_tmp_64, xmm_v2_off);
|
uni_vmovq(reg_tmp_64, xmm_v2_off);
|
||||||
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
||||||
add(reg_tmp_64, aux2_reg_input);
|
add(reg_tmp_64, aux2_reg_input);
|
||||||
|
checkZeroWei(xmm_v2, nullify_v2_tail);
|
||||||
uni_vmovss(xmm_v2, ptr[reg_tmp_64]);
|
uni_vmovss(xmm_v2, ptr[reg_tmp_64]);
|
||||||
uni_vmulss(xmm_v2, xmm_v2, xmm_w2);
|
uni_vmulss(xmm_v2, xmm_v2, xmm_w2);
|
||||||
|
jmp(nullify_v2_end_tail, T_NEAR);
|
||||||
|
L(nullify_v2_tail);
|
||||||
|
{
|
||||||
|
uni_vpxor(xmm_v2, xmm_v2, xmm_v2);
|
||||||
|
}
|
||||||
|
L(nullify_v2_end_tail);
|
||||||
|
|
||||||
uni_vpmovsxdq(xmm_v3_off, xmm_v3_off);
|
uni_vpmovsxdq(xmm_v3_off, xmm_v3_off);
|
||||||
uni_vmovq(reg_tmp_64, xmm_v3_off);
|
uni_vmovq(reg_tmp_64, xmm_v3_off);
|
||||||
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
||||||
add(reg_tmp_64, aux2_reg_input);
|
add(reg_tmp_64, aux2_reg_input);
|
||||||
|
checkZeroWei(xmm_v3, nullify_v3_tail);
|
||||||
uni_vmovss(xmm_v3, ptr[reg_tmp_64]);
|
uni_vmovss(xmm_v3, ptr[reg_tmp_64]);
|
||||||
uni_vmulss(xmm_v3, xmm_v3, xmm_w3);
|
uni_vmulss(xmm_v3, xmm_v3, xmm_w3);
|
||||||
|
jmp(nullify_v3_end_tail, T_NEAR);
|
||||||
|
L(nullify_v3_tail);
|
||||||
|
{
|
||||||
|
uni_vpxor(xmm_v3, xmm_v3, xmm_v3);
|
||||||
|
}
|
||||||
|
L(nullify_v3_end_tail);
|
||||||
|
|
||||||
uni_vpmovsxdq(xmm_v4_off, xmm_v4_off);
|
uni_vpmovsxdq(xmm_v4_off, xmm_v4_off);
|
||||||
uni_vmovq(reg_tmp_64, xmm_v4_off);
|
uni_vmovq(reg_tmp_64, xmm_v4_off);
|
||||||
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in);
|
||||||
add(reg_tmp_64, aux2_reg_input);
|
add(reg_tmp_64, aux2_reg_input);
|
||||||
|
checkZeroWei(xmm_v4, nullify_v4_tail);
|
||||||
uni_vmovss(xmm_v4, ptr[reg_tmp_64]);
|
uni_vmovss(xmm_v4, ptr[reg_tmp_64]);
|
||||||
uni_vmulss(xmm_v4, xmm_v4, xmm_w4);
|
uni_vmulss(xmm_v4, xmm_v4, xmm_w4);
|
||||||
|
jmp(nullify_v4_end_tail, T_NEAR);
|
||||||
|
L(nullify_v4_tail);
|
||||||
|
{
|
||||||
|
uni_vpxor(xmm_v4, xmm_v4, xmm_v4);
|
||||||
|
}
|
||||||
|
L(nullify_v4_end_tail);
|
||||||
|
|
||||||
uni_vaddss(xmm_v1, xmm_v1, xmm_v2);
|
uni_vaddss(xmm_v1, xmm_v1, xmm_v2);
|
||||||
uni_vaddss(xmm_v1, xmm_v1, xmm_v3);
|
uni_vaddss(xmm_v1, xmm_v1, xmm_v3);
|
||||||
|
@ -141,9 +141,6 @@ std::vector<std::string> disabledTestPatterns() {
|
|||||||
*IS=_TS=\(\(4\.5\.6\.7\)\)_RS=\(\(1\.1\.6\.1\)\)_\(\(1\.5\.6\.1\)\)_\(\(1\.1\.1\.1\)\)_\(\(1\.1\.6\.1\)\).*)",
|
*IS=_TS=\(\(4\.5\.6\.7\)\)_RS=\(\(1\.1\.6\.1\)\)_\(\(1\.5\.6\.1\)\)_\(\(1\.1\.1\.1\)\)_\(\(1\.1\.6\.1\)\).*)",
|
||||||
// Issue: 69222
|
// Issue: 69222
|
||||||
R"(.*smoke_PriorBoxClustered.*PriorBoxClusteredLayerCPUTest.*_netPRC=f16_.*)",
|
R"(.*smoke_PriorBoxClustered.*PriorBoxClusteredLayerCPUTest.*_netPRC=f16_.*)",
|
||||||
// Issue: 74817
|
|
||||||
// Sporadic failings with NAN on Dynamic shape cases with jit implementation
|
|
||||||
R"(.*DefConvLayoutTest7.*)",
|
|
||||||
// Issue: 71968
|
// Issue: 71968
|
||||||
R"(.*LSTMSequenceCommonZeroClip.*PURE.*CONST.*hidden_size=10.*sigmoid.sigmoid.sigmoid.*reverse.*FP32_targetDevice=CPU.*)",
|
R"(.*LSTMSequenceCommonZeroClip.*PURE.*CONST.*hidden_size=10.*sigmoid.sigmoid.sigmoid.*reverse.*FP32_targetDevice=CPU.*)",
|
||||||
// Issue: 72005
|
// Issue: 72005
|
||||||
|
Loading…
Reference in New Issue
Block a user