// Patch summary (reviewer commentary placed ahead of the first "diff --git" header).
//
// Files touched, per the diff headers below:
//   1. src/plugins/intel_cpu/src/nodes/def_conv.cpp
//   2. src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp
//
// def_conv.cpp
//   * Adds the helper checkZeroWei(x1, nullifyLabel). It emits uni_vtestps(x1, x1)
//     followed by jz(nullifyLabel), i.e. control transfers to nullifyLabel when the
//     test leaves ZF set.
//   * Declares sixteen new labels: nullify_v1..v4 / nullify_v1_end..v4_end for the
//     first hunk, and the *_tail / *_end_tail counterparts for the second hunk.
//   * Hunk following "cmp(reg_ic_iter, simd_w); jl(ic_loop_tail)": four dwords are
//     broadcast from aux_reg_sampled_wei (offsets ind_off_ll / ind_off_hl /
//     ind_off_lh / ind_off_hh, each scaled by jcp_.typesize_sampled_wei) into
//     xmm_v1..xmm_v4. For each of the four values, once the address is formed in
//     reg_tmp_64, checkZeroWei is emitted. The fall-through path loads vmm_vN from
//     ptr[reg_tmp_64], multiplies it by vmm_wN and jumps to nullify_vN_end. The
//     nullify_vN path clears vmm_vN with uni_vpxor instead, so no load from
//     reg_tmp_64 is executed on that path.
//   * Hunk following "cmp(reg_ic_iter, 1); jl(loop_end)": the same sequence using
//     the scalar forms (uni_vmovss / uni_vmulss) on xmm_vN and the *_tail labels.
//   * xmm_vN first holds the broadcast marker and is then overwritten by either the
//     loaded value or zero; the marker is consumed by checkZeroWei before that.
//
// skip_tests_config.cpp
//   * Removes the R"(.*DefConvLayoutTest7.*)" entry from disabledTestPatterns(),
//     together with its two comment lines ("Issue: 74817" and the note about
//     sporadic NaN failures on dynamic-shape cases with the jit implementation).
//
// NOTE(review): presumably a zero sampled weight marks a sample whose address may
//   not be safe to read, and skipping the load avoids NaN * 0 - inferred from the
//   removed skip-list comment; confirm against the code that fills sampled_wei.
// NOTE(review): if uni_vtestps lowers to VTESTPS on AVX-capable targets, only the
//   sign bits are examined, so a positive non-zero weight would also set ZF and be
//   nullified. The lowering of uni_vtestps is not visible here - TODO confirm that
//   a full-bit test (PTEST-style) is what is actually emitted.
// NOTE(review): jz(nullifyLabel) is emitted without T_NEAR while the surrounding
//   jumps pass T_NEAR explicitly - confirm the default jump type is intended.
// NOTE(review): this text appears to be a scraped copy of the patch (newlines
//   collapsed; "std::vector disabledTestPatterns()" seems to have lost its template
//   argument) - verify against the original commit before applying.
diff --git a/src/plugins/intel_cpu/src/nodes/def_conv.cpp b/src/plugins/intel_cpu/src/nodes/def_conv.cpp index fb82f296a80..87281c49beb 100644 --- a/src/plugins/intel_cpu/src/nodes/def_conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/def_conv.cpp @@ -115,6 +115,11 @@ private: Xbyak::Label l_table; + inline void checkZeroWei(const Xbyak::Xmm &x1, Label &nullifyLabel) { + uni_vtestps(x1, x1); + jz(nullifyLabel); + } + void ow_loop() { Label ow_loop_main; Label ow_tail; @@ -280,6 +285,22 @@ private: Label ic_loop_main; Label ic_loop_tail; Label loop_end; + Label nullify_v1; + Label nullify_v2; + Label nullify_v3; + Label nullify_v4; + Label nullify_v1_end; + Label nullify_v2_end; + Label nullify_v3_end; + Label nullify_v4_end; + Label nullify_v1_tail; + Label nullify_v2_tail; + Label nullify_v3_tail; + Label nullify_v4_tail; + Label nullify_v1_end_tail; + Label nullify_v2_end_tail; + Label nullify_v3_end_tail; + Label nullify_v4_end_tail; mov(aux2_reg_input, aux_reg_input); add(aux2_reg_input, (ow * jcp_.stride_w * jcp_.ic) * jcp_.typesize_in); @@ -337,35 +358,69 @@ private: cmp(reg_ic_iter, simd_w); jl(ic_loop_tail, T_NEAR); + // check zero markers + uni_vbroadcastss(xmm_v1, dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v2, dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v3, dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v4, dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); + size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; uni_vpmovsxdq(xmm_v1_off, xmm_v1_off); uni_vmovq(reg_tmp_64, xmm_v1_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); add(reg_tmp_64, aux2_reg_input); + checkZeroWei(xmm_v1, nullify_v1); uni_vmovups(vmm_v1, ptr[reg_tmp_64]); uni_vmulps(vmm_v1, vmm_v1, vmm_w1); + jmp(nullify_v1_end, T_NEAR); + L(nullify_v1); + { + uni_vpxor(vmm_v1, vmm_v1, vmm_v1); + } + 
L(nullify_v1_end); uni_vpmovsxdq(xmm_v2_off, xmm_v2_off); uni_vmovq(reg_tmp_64, xmm_v2_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); add(reg_tmp_64, aux2_reg_input); + checkZeroWei(xmm_v2, nullify_v2); uni_vmovups(vmm_v2, ptr[reg_tmp_64]); uni_vmulps(vmm_v2, vmm_v2, vmm_w2); + jmp(nullify_v2_end, T_NEAR); + L(nullify_v2); + { + uni_vpxor(vmm_v2, vmm_v2, vmm_v2); + } + L(nullify_v2_end); uni_vpmovsxdq(xmm_v3_off, xmm_v3_off); uni_vmovq(reg_tmp_64, xmm_v3_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); add(reg_tmp_64, aux2_reg_input); + checkZeroWei(xmm_v3, nullify_v3); uni_vmovups(vmm_v3, ptr[reg_tmp_64]); uni_vmulps(vmm_v3, vmm_v3, vmm_w3); + jmp(nullify_v3_end, T_NEAR); + L(nullify_v3); + { + uni_vpxor(vmm_v3, vmm_v3, vmm_v3); + } + L(nullify_v3_end); uni_vpmovsxdq(xmm_v4_off, xmm_v4_off); uni_vmovq(reg_tmp_64, xmm_v4_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); add(reg_tmp_64, aux2_reg_input); + checkZeroWei(xmm_v4, nullify_v4); uni_vmovups(vmm_v4, ptr[reg_tmp_64]); uni_vmulps(vmm_v4, vmm_v4, vmm_w4); + jmp(nullify_v4_end, T_NEAR); + L(nullify_v4); + { + uni_vpxor(vmm_v4, vmm_v4, vmm_v4); + } + L(nullify_v4_end); uni_vaddps(vmm_v1, vmm_v1, vmm_v2); uni_vaddps(vmm_v1, vmm_v1, vmm_v3); @@ -383,34 +438,68 @@ private: cmp(reg_ic_iter, 1); jl(loop_end, T_NEAR); + // check zero markers + uni_vbroadcastss(xmm_v1, dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v2, dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v3, dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v4, dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); + size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; uni_vpmovsxdq(xmm_v1_off, xmm_v1_off); uni_vmovq(reg_tmp_64, xmm_v1_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); add(reg_tmp_64, aux2_reg_input); + 
checkZeroWei(xmm_v1, nullify_v1_tail); uni_vmovss(xmm_v1, ptr[reg_tmp_64]); uni_vmulss(xmm_v1, xmm_v1, xmm_w1); + jmp(nullify_v1_end_tail, T_NEAR); + L(nullify_v1_tail); + { + uni_vpxor(xmm_v1, xmm_v1, xmm_v1); + } + L(nullify_v1_end_tail); uni_vpmovsxdq(xmm_v2_off, xmm_v2_off); uni_vmovq(reg_tmp_64, xmm_v2_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); add(reg_tmp_64, aux2_reg_input); + checkZeroWei(xmm_v2, nullify_v2_tail); uni_vmovss(xmm_v2, ptr[reg_tmp_64]); uni_vmulss(xmm_v2, xmm_v2, xmm_w2); + jmp(nullify_v2_end_tail, T_NEAR); + L(nullify_v2_tail); + { + uni_vpxor(xmm_v2, xmm_v2, xmm_v2); + } + L(nullify_v2_end_tail); uni_vpmovsxdq(xmm_v3_off, xmm_v3_off); uni_vmovq(reg_tmp_64, xmm_v3_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); add(reg_tmp_64, aux2_reg_input); + checkZeroWei(xmm_v3, nullify_v3_tail); uni_vmovss(xmm_v3, ptr[reg_tmp_64]); uni_vmulss(xmm_v3, xmm_v3, xmm_w3); + jmp(nullify_v3_end_tail, T_NEAR); + L(nullify_v3_tail); + { + uni_vpxor(xmm_v3, xmm_v3, xmm_v3); + } + L(nullify_v3_end_tail); uni_vpmovsxdq(xmm_v4_off, xmm_v4_off); uni_vmovq(reg_tmp_64, xmm_v4_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); add(reg_tmp_64, aux2_reg_input); + checkZeroWei(xmm_v4, nullify_v4_tail); uni_vmovss(xmm_v4, ptr[reg_tmp_64]); uni_vmulss(xmm_v4, xmm_v4, xmm_w4); + jmp(nullify_v4_end_tail, T_NEAR); + L(nullify_v4_tail); + { + uni_vpxor(xmm_v4, xmm_v4, xmm_v4); + } + L(nullify_v4_end_tail); uni_vaddss(xmm_v1, xmm_v1, xmm_v2); uni_vaddss(xmm_v1, xmm_v1, xmm_v3); diff --git a/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp b/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp index ff4f7c36838..d98c578ee67 100644 --- a/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp +++ b/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp @@ -141,9 +141,6 @@ std::vector disabledTestPatterns() { 
*IS=_TS=\(\(4\.5\.6\.7\)\)_RS=\(\(1\.1\.6\.1\)\)_\(\(1\.5\.6\.1\)\)_\(\(1\.1\.1\.1\)\)_\(\(1\.1\.6\.1\)\).*)", // Issue: 69222 R"(.*smoke_PriorBoxClustered.*PriorBoxClusteredLayerCPUTest.*_netPRC=f16_.*)", - // Issue: 74817 - // Sporadic failings with NAN on Dynamic shape cases with jit implementation - R"(.*DefConvLayoutTest7.*)", // Issue: 71968 R"(.*LSTMSequenceCommonZeroClip.*PURE.*CONST.*hidden_size=10.*sigmoid.sigmoid.sigmoid.*reverse.*FP32_targetDevice=CPU.*)", // Issue: 72005