From 6d634d09a441b676236f9709eaa4d5003424aca9 Mon Sep 17 00:00:00 2001 From: Egor Duplensky Date: Wed, 22 Sep 2021 10:46:37 +0300 Subject: [PATCH] [CPU] Fix mixing VEX and non-VEX instructions (#7238) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Quote: The Skylake microarchitecture implements a different state machine than prior generations to manage the YMM state transition associated with mixing SSE and AVX instructions. It no longer saves the entire upper YMM state when executing an SSE instruction when in “Modified and Unsaved” state, but saves the upper bits of individual register. As a result, mixing SSE and AVX instructions will experience a penalty associated with partial register dependency of the destination registers being used and additional blend operation on the upper bits of the destination registers. Penalties of this kind have a huge impact on openvino's and oneDNN's kernels. Basically, mixing VEX and non-VEX instructions should be avoided. 
--- .../emitters/jit_eltwise_emitters.cpp | 6 +- .../emitters/jit_load_store_emitters.cpp | 41 +++++---- .../nodes/common/permute_kernel.cpp | 20 ++--- .../nodes/mkldnn_bin_conv_node.cpp | 10 +-- .../nodes/mkldnn_eltwise_node.cpp | 8 +- .../nodes/mkldnn_fake_quantize_node.cpp | 26 +++--- .../mkldnn_plugin/nodes/mkldnn_input_node.cpp | 14 ++-- .../nodes/mkldnn_interpolate_node.cpp | 16 ++-- .../mkldnn_plugin/nodes/mkldnn_mvn_node.cpp | 20 ++--- .../nodes/mkldnn_normalize_node.cpp | 35 ++++---- .../nodes/mkldnn_reduce_node.cpp | 84 +++++++++---------- .../nodes/mkldnn_region_yolo_node.cpp | 8 +- .../nodes/mkldnn_roi_pooling_node.cpp | 4 +- inference-engine/thirdparty/mkl-dnn | 2 +- 14 files changed, 148 insertions(+), 146 deletions(-) diff --git a/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp index 29c17d3f172..cf026dae9cd 100644 --- a/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp @@ -85,8 +85,8 @@ void jit_mul_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const if (isa == cpu::x64::sse41) { h->uni_vmovups(vmm_dst, vmm_src0); - h->mulps(vmm_dst, vmm_src1); - h->addps(vmm_dst, vmm_src2); + h->uni_vmulps(vmm_dst, vmm_src1); + h->uni_vaddps(vmm_dst, vmm_dst, vmm_src2); } else { Vmm vmm_mul0; if (vmm_dst.getIdx() == vmm_src0.getIdx()) { @@ -656,7 +656,7 @@ void jit_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const s } else if (isa == cpu::x64::avx2) { h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1); h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + h->uni_vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); } else { h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_eq_oq); h->uni_vmovups(vmm_dst, table_val("zero")); diff --git a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp 
b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp index 83bc04c530d..ab1bb949e96 100644 --- a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp @@ -190,7 +190,7 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 &reg, int o } if (bytes_to_load >= 8 && bytes_to_load < 16) - h->pinsrq(xmm, addr(start_bytes), 0); + h->uni_vpinsrq(xmm, xmm, addr(start_bytes), 0); else if (bytes_to_load == 16) h->uni_vmovdqu(xmm, addr(start_bytes)); @@ -202,17 +202,17 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 &reg, int o h->uni_vpinsrw(xmm, xmm, addr(start_bytes), 0); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 2), 2); break; - case 4: h->pinsrd(xmm, addr(start_bytes), 0); break; + case 4: h->uni_vpinsrd(xmm, xmm, addr(start_bytes), 0); break; case 5: - h->pinsrd(xmm, addr(start_bytes), 0); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes), 0); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 4), 4); break; case 6: - h->pinsrd(xmm, addr(start_bytes), 0); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes), 0); h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); break; case 7: - h->pinsrd(xmm, addr(start_bytes), 0); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes), 0); h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 6), 6); break; @@ -223,17 +223,17 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 &reg, int o h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 10), 10); break; - case 12: h->pinsrd(xmm, addr(start_bytes + 8), 2); break; + case 12: h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); break; case 13: - h->pinsrd(xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 12), 12); break; case 14: - h->pinsrd(xmm, addr(start_bytes + 8), 2); + 
h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); break; case 15: - h->pinsrd(xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 14), 14); break; @@ -465,10 +465,7 @@ template if (is_xmm || is_ymm) { uint8 imm = 1; imm = ~((imm << load_num) - imm); // shift load_num bit - if (is_xmm) - h->blendps(vmm, table_val(fill_value), imm); - else - h->vblendps(vmm, vmm, table_val(fill_value), imm); + h->uni_vblendps(vmm, vmm, table_val(fill_value), imm); } else if (is_zmm) { uint64_t tail_mask = 1; tail_mask = ~((tail_mask << load_num) - tail_mask); @@ -668,7 +665,7 @@ template } if (bytes_to_store >= 8 && bytes_to_store < 16) - h->pextrq(addr(start_bytes), xmm, 0); + h->uni_vpextrq(addr(start_bytes), xmm, 0); else if (bytes_to_store == 16) h->uni_vmovdqu(addr(start_bytes), xmm); @@ -682,17 +679,17 @@ template h->uni_vpextrw(addr(start_bytes), xmm, 0); h->uni_vpextrb(addr(start_bytes + 2), xmm, 2); break; - case 4: h->pextrd(addr(start_bytes), xmm, 0); break; + case 4: h->uni_vpextrd(addr(start_bytes), xmm, 0); break; case 5: - h->pextrd(addr(start_bytes), xmm, 0); + h->uni_vpextrd(addr(start_bytes), xmm, 0); h->uni_vpextrb(addr(start_bytes + 4), xmm, 4); break; case 6: - h->pextrd(addr(start_bytes), xmm, 0); + h->uni_vpextrd(addr(start_bytes), xmm, 0); h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); break; case 7: - h->pextrd(addr(start_bytes), xmm, 0); + h->uni_vpextrd(addr(start_bytes), xmm, 0); h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); h->uni_vpextrb(addr(start_bytes + 6), xmm, 6); break; @@ -703,17 +700,17 @@ template h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); h->uni_vpextrb(addr(start_bytes + 10), xmm, 10); break; - case 12: h->pextrd(addr(start_bytes + 8), xmm, 2); break; + case 12: h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); break; case 13: - h->pextrd(addr(start_bytes + 
8), xmm, 2); + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); h->uni_vpextrb(addr(start_bytes + 12), xmm, 12); break; case 14: - h->pextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); break; case 15: - h->pextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); h->uni_vpextrb(addr(start_bytes + 14), xmm, 14); break; diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/permute_kernel.cpp b/inference-engine/src/mkldnn_plugin/nodes/common/permute_kernel.cpp index 4babccb5186..e5a079dfb92 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/permute_kernel.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/common/permute_kernel.cpp @@ -47,21 +47,21 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge void load(const Xbyak::Xmm &xmm, const Xbyak::Address &addr) { switch (jcp.data_size) { - case 16: movups(xmm, addr); break; - case 8: movsd(xmm, addr); break; - case 4: movss(xmm, addr); break; - case 2: pinsrw(xmm, addr, 0x0); break; - case 1: pinsrb(xmm, addr, 0x0); break; + case 16: uni_vmovups(xmm, addr); break; + case 8: uni_vmovsd(xmm, addr); break; + case 4: uni_vmovss(xmm, addr); break; + case 2: uni_vpinsrw(xmm, xmm, addr, 0x0); break; + case 1: uni_vpinsrb(xmm, xmm, addr, 0x0); break; } } void store(const Xbyak::Address &addr, const Xbyak::Xmm &xmm) { switch (jcp.data_size) { - case 16: movups(addr, xmm); break; - case 8: movsd(addr, xmm); break; - case 4: movss(addr, xmm); break; - case 2: pextrw(addr, xmm, 0x0); break; - case 1: pextrb(addr, xmm, 0x0); break; + case 16: uni_vmovups(addr, xmm); break; + case 8: uni_vmovsd(addr, xmm); break; + case 4: uni_vmovss(addr, xmm); break; + case 2: uni_vpextrw(addr, xmm, 0x0); break; + case 1: uni_vpextrb(addr, xmm, 0x0); break; } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp index 517066d6f32..72b2c6b401f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp @@ -215,7 +215,7 @@ private: case memory::data_type::s32: if (scalar_load) { mov(reg_tmp_32, op); - movq(xmm_in, reg_tmp_64); + uni_vmovq(xmm_in, reg_tmp_64); } else { uni_vmovups(vmm_in, op); } @@ -223,7 +223,7 @@ private: case memory::data_type::s8: if (scalar_load) { movsx(reg_tmp_32, op); - movq(xmm_in, reg_tmp_64); + uni_vmovq(xmm_in, reg_tmp_64); } else { uni_vpmovsxbd(vmm_in, op); } @@ -231,7 +231,7 @@ private: case memory::data_type::u8: if (scalar_load) { movzx(reg_tmp_32, op); - movq(xmm_in, reg_tmp_64); + uni_vmovq(xmm_in, reg_tmp_64); } else { uni_vpmovzxbd(vmm_in, op); } @@ -541,7 +541,7 @@ private: if (jcp_.exclude_pad) { mov(reg_shift, kw_padding[jj]); imul(reg_shift, reg_tmp_32); - movq(Xmm(vmm_shift.getIdx()), reg_shift); + uni_vmovq(Xmm(vmm_shift.getIdx()), reg_shift); uni_vbroadcastss(vmm_shift, Xmm(vmm_shift.getIdx())); uni_vcvtdq2ps(vmm_shift, vmm_shift); } @@ -612,7 +612,7 @@ private: } else { Ymm ymm_prev_dst = Ymm(vmm_sum.getIdx()); vperm2i128(ymm_prev_dst, ymm_prev_dst, ymm_prev_dst, 0x01); - vpslldq(vmm_sum, vmm_sum, (oc - jcp_.oc_block / 2) * sizeof(float)); + uni_vpslldq(vmm_sum, vmm_sum, (oc - jcp_.oc_block / 2) * sizeof(float)); } uni_vaddps(vmm_dst, vmm_dst, vmm_sum); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index 9dd250d7b96..8c9d610bd80 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -583,7 +583,7 @@ private: switch (src_prc) { case Precision::FP32: case Precision::I32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case Precision::BF16: uni_vpinsrw(xmm_src, xmm_src, op, 0); 
@@ -599,11 +599,11 @@ private: break; case Precision::I8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case Precision::U8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; default: assert(!"unknown src_prc"); @@ -730,7 +730,7 @@ private: switch (dst_prc) { case Precision::FP32: case Precision::I32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case Precision::BF16: uni_vpsrld(xmm_dst, xmm_dst, 16); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp index b12bed6a476..838a47d4085 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp @@ -150,9 +150,9 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vpxor(xmm_wei(0), xmm_wei(0), xmm_wei(0)); uni_vpxor(xmm_mask(0), xmm_mask(0), xmm_mask(0)); - movss(xmm_src(0), ptr[reg_from + c * sizeof(float)]); - movss(xmm_wei(0), ptr[reg_thresholds + c * sizeof(float)]); - movss(xmm_mask(0), ptr[reg_output_mask + c * sizeof(float)]); + uni_vmovss(xmm_src(0), ptr[reg_from + c * sizeof(float)]); + uni_vmovss(xmm_wei(0), ptr[reg_thresholds + c * sizeof(float)]); + uni_vmovss(xmm_mask(0), ptr[reg_output_mask + c * sizeof(float)]); uni_vcmpgtps(xmm_src(0), xmm_src(0), xmm_wei(0)); uni_vpcmpeqd(xmm_src(0), xmm_src(0), xmm_mask(0)); uni_vmovmskps(reg_src_32, xmm_src(0)); @@ -591,13 +591,13 @@ private: jle(exit_label, T_NEAR); for (int i = 0; i < jqp_.c % tail4_simd_w; i++) { - movss(xmm_crop_low(0), ptr[reg_crop_low + i * wei_type_size]); - movss(xmm_crop_high(0), ptr[reg_crop_high + i * wei_type_size]); - movss(xmm_input_scale(0), ptr[reg_input_scale + i * wei_type_size]); - movss(xmm_input_shift(0), ptr[reg_input_shift + i * wei_type_size]); + uni_vmovss(xmm_crop_low(0), ptr[reg_crop_low + 
i * wei_type_size]); + uni_vmovss(xmm_crop_high(0), ptr[reg_crop_high + i * wei_type_size]); + uni_vmovss(xmm_input_scale(0), ptr[reg_input_scale + i * wei_type_size]); + uni_vmovss(xmm_input_shift(0), ptr[reg_input_shift + i * wei_type_size]); if (do_dequantization) { - movss(xmm_output_scale(0), ptr[reg_output_scale + i * wei_type_size]); - movss(xmm_output_shift(0), ptr[reg_output_shift + i * wei_type_size]); + uni_vmovss(xmm_output_scale(0), ptr[reg_output_scale + i * wei_type_size]); + uni_vmovss(xmm_output_shift(0), ptr[reg_output_shift + i * wei_type_size]); } load_scalar(xmm_val(0), ptr[aux_reg_from + i * src_type_size], jqp_.src_prc); @@ -688,15 +688,15 @@ private: switch (src_prc) { case Precision::FP32: case Precision::I32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case Precision::I8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case Precision::U8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; default: assert(!"unknown src_prc"); @@ -797,7 +797,7 @@ private: switch (dst_prc) { case Precision::FP32: case Precision::I32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case Precision::I8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp index 1926914f074..786d5dc41ac 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp @@ -104,13 +104,13 @@ protected: auto b = xmm2; auto c = xmm3; - movdqu(a, xword[src]); // load 4 floats - movdqu(b, a); // b = a - movdqu(c, a); // c = a - pcmpeqd(b, zero); // if (a == 0) b = 1 else b = 0 - pand(c, mask); // c = a & 01111111100000000000000000000000 - pcmpeqd(c, zero); // if (c == 0) c = 1 else c = 0 - ptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 + uni_vmovdqu(a, xword[src]); // 
load 4 floats + uni_vmovdqu(b, a); // b = a + uni_vmovdqu(c, a); // c = a + uni_vpcmpeqd(b, b, zero); // if (a == 0) b = 1 else b = 0 + uni_vpand(c, c, mask); // c = a & 01111111100000000000000000000000 + uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 + uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 } template diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp index f6873d88d34..6a2d8564bd3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp @@ -1199,10 +1199,10 @@ private: jl(tail_loop_end_label, T_NEAR); // get idx for input - movss(Xmm(vmm_tbl_y.getIdx()), ptr[reg_tbl_y]); + uni_vmovss(Xmm(vmm_tbl_y.getIdx()), ptr[reg_tbl_y]); gather_i32_indices(vmm_index_in_y, reg_index_y, 0, vmm_tbl_y, 1, memory::data_type::s32, true); - movss(Xmm(vmm_val.getIdx()), ptr[reg_tbl_x]); + uni_vmovss(Xmm(vmm_val.getIdx()), ptr[reg_tbl_x]); gather_i32_indices(vmm_index_in_x, reg_index, 0, vmm_val, 1, memory::data_type::s32, true); // gather weightX by input idx, used in y0-y3 gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, memory::data_type::f32, true); @@ -1430,18 +1430,18 @@ private: switch (src_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case memory::data_type::s8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::u8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::bf16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; default: @@ -1536,7 +1536,7 @@ private: switch (dst_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(op, xmm_dst); + 
uni_vmovss(op, xmm_dst); break; case memory::data_type::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); @@ -1552,7 +1552,7 @@ private: break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; default: assert(!"unknown dst_dt"); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp index f27a40e3bd2..f1801d72b61 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp @@ -102,17 +102,17 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k Xbyak::Ymm ymm_sum = Xbyak::Ymm(vmm_dst.getIdx()); vextractf128(xmm_aux1, ymm_sum, 0); vextractf128(xmm_aux2, ymm_sum, 1); - addps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); hsum_store(xmm_aux1); } else { Xbyak::Zmm zmm_sum = Xbyak::Zmm(vmm_dst.getIdx()); vextractf32x4(xmm_aux1, zmm_sum, 0); vextractf32x4(xmm_aux2, zmm_sum, 1); - addps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); vextractf32x4(xmm_aux2, zmm_sum, 2); vextractf32x4(xmm_aux3, zmm_sum, 3); - addps(xmm_aux2, xmm_aux3); - addps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux2, xmm_aux2, xmm_aux3); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); hsum_store(xmm_aux1); } } else { @@ -342,14 +342,14 @@ private: } inline void hsum_store(Xbyak::Xmm xmm_sum) { - movshdup(xmm_aux3, xmm_sum); // sum:1,2,3,4; aux3:2,2,4,4 - addps(xmm_sum, xmm_aux3); // sum:1+2,2+2,3+4,4+4 - movhlps(xmm_aux3, xmm_sum); // aux3:3+4,4+4,4,4 - addps(xmm_sum, xmm_aux3); // sum:1+2+3+4,... + uni_vmovshdup(xmm_aux3, xmm_sum); // sum:1,2,3,4; aux3:2,2,4,4 + uni_vaddps(xmm_sum, xmm_sum, xmm_aux3); // sum:1+2,2+2,3+4,4+4 + uni_vmovhlps(xmm_aux3, xmm_sum); // aux3:3+4,4+4,4,4 + uni_vaddps(xmm_sum, xmm_sum, xmm_aux3); // sum:1+2+3+4,... 
if (jcp_.normalize_variance) { - movss(ptr[reg_variance], xmm_sum); + uni_vmovss(ptr[reg_variance], xmm_sum); } else { - movss(ptr[reg_sum], xmm_sum); + uni_vmovss(ptr[reg_sum], xmm_sum); } } }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp index ff95f416573..50345af1821 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp @@ -88,17 +88,21 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker Xbyak::Ymm ymm_sqr_sum = Xbyak::Ymm(vmm_sqr_sum.getIdx()); vextractf128(xmm_aux1, ymm_sqr_sum, 0); vextractf128(xmm_aux2, ymm_sqr_sum, 1); - addps(xmm_aux1, xmm_aux2); + // vaddps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); hsum_store(xmm_aux1); } else { Xbyak::Zmm zmm_sqr_sum = Xbyak::Zmm(vmm_sqr_sum.getIdx()); vextractf32x4(xmm_aux1, zmm_sqr_sum, 0); vextractf32x4(xmm_aux2, zmm_sqr_sum, 1); - addps(xmm_aux1, xmm_aux2); + // vaddps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); vextractf32x4(xmm_aux2, zmm_sqr_sum, 2); vextractf32x4(xmm_aux3, zmm_sqr_sum, 3); - addps(xmm_aux2, xmm_aux3); - addps(xmm_aux1, xmm_aux2); + // vaddps(xmm_aux2, xmm_aux3); + // vaddps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux2, xmm_aux2, xmm_aux3); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); hsum_store(xmm_aux1); } } @@ -124,11 +128,11 @@ private: Xbyak::Xmm xmm_aux3 = Xbyak::Xmm(4); inline void hsum_store(Xbyak::Xmm xmm_sqr_sum) { - movshdup(xmm_aux3, xmm_sqr_sum); // sqrt_sum:1,2,3,4; aux3:2,2,4,4 - addps(xmm_sqr_sum, xmm_aux3); // sqrt_sum:1+2,2+2,3+4,4+4 - movhlps(xmm_aux3, xmm_sqr_sum); // aux3:3+4,4+4,4,4 - addps(xmm_sqr_sum, xmm_aux3); // sqrt_sum:1+2+3+4,... 
- movss(ptr[reg_modulo], xmm_sqr_sum); + uni_vmovshdup(xmm_aux3, xmm_sqr_sum); // sqrt_sum:1,2,3,4; aux3:2,2,4,4 + uni_vaddps(xmm_sqr_sum, xmm_sqr_sum, xmm_aux3); // sqrt_sum:1+2,2+2,3+4,4+4 + uni_vmovhlps(xmm_aux3, xmm_sqr_sum); // aux3:3+4,4+4,4,4 + uni_vaddps(xmm_sqr_sum, xmm_sqr_sum, xmm_aux3); // sqrt_sum:1+2+3+4,... + uni_vmovss(ptr[reg_modulo], xmm_sqr_sum); } inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) { @@ -359,6 +363,7 @@ private: load_scalar(xmm_val, ptr[reg_src], jcp_.src_dt); uni_vmulps(xmm_val, xmm_val, xmm_fused_factor); + if (attr_.post_ops_.len() != 0) { apply_post_ops(jcp_.dst_dt, 0); add(reg_oc_off, step * sizeof(float)); @@ -493,19 +498,19 @@ private: switch (src_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case memory::data_type::bf16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; case memory::data_type::s8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::u8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; default: assert(!"unknown dst_dt"); @@ -568,11 +573,11 @@ private: switch (dst_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; case memory::data_type::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp index 7828bc55f27..e9c3a1a619b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp @@ -607,19 +607,19 @@ private: switch 
(src_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case memory::data_type::bf16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; case memory::data_type::s8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::u8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; default: assert(!"unknown src_dt"); @@ -692,11 +692,11 @@ private: switch (dst_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; case memory::data_type::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); @@ -707,7 +707,7 @@ private: case memory::data_type::u8: uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); + uni_vmovq(reg_tmp_64, xmm_dst); // uni_ form: a raw VEX vmovq is illegal on SSE-only targets mov(op, reg_tmp_8); break; default: @@ -738,9 +738,9 @@ private: } inline void load_embedded_horiz_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt) { - movshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 + uni_vmovshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2),f(2,2),f(3,4),f(4,4) - movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 + uni_vmovhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... 
load_scalar(xmm_aux3, ptr[reg_dst], dst_dt); @@ -753,21 +753,21 @@ private: case memory::data_type::s32: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); - movss(ptr[reg_dst], xmm_dst); + uni_vmovss(ptr[reg_dst], xmm_dst); break; case memory::data_type::u8: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; case memory::data_type::s8: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; default: assert(!"unknown dst_dt"); @@ -777,7 +777,7 @@ private: inline void horiz_ps(const Xmm& xmm, const Operand& op) { switch (jcp_.reduce_mode) { case ReduceAnd: - andps(xmm, op); + uni_vandps(xmm, xmm, op); break; case ReduceL1: case ReduceL2: @@ -786,19 +786,19 @@ private: case ReduceSum: case ReduceSumSquare: case ReduceLogSumExp: - addps(xmm, op); + uni_vaddps(xmm, xmm, op); break; case ReduceMax: - maxps(xmm, op); + uni_vmaxps(xmm, op); break; case ReduceMin: - minps(xmm, op); + uni_vminps(xmm, op); break; case ReduceOr: - orps(xmm, op); + uni_vorps(xmm, xmm, op); break; case ReduceProd: - mulps(xmm, op); + uni_vmulps(xmm, op); break; default: assert(!"unsupported reduce mode"); @@ -1074,19 +1074,19 @@ private: switch (src_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case memory::data_type::bf16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; case memory::data_type::s8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::u8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); 
break; default: assert(!"unknown src_dt"); @@ -1159,11 +1159,11 @@ private: switch (dst_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; case memory::data_type::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); @@ -1205,33 +1205,33 @@ private: } inline void horize_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt) { - movshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 + uni_vmovshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2),f(2,2),f(3,4),f(4,4) - movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 + uni_vmovhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... switch (dst_dt) { case memory::data_type::f32: - movss(ptr[reg_dst], xmm_dst); + uni_vmovss(ptr[reg_dst], xmm_dst); break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(ptr[reg_dst], xmm_dst, 0x0); + uni_vpextrw(ptr[reg_dst], xmm_dst, 0x0); break; case memory::data_type::s32: uni_vcvtps2dq(xmm_dst, xmm_dst); - movss(ptr[reg_dst], xmm_dst); + uni_vmovss(ptr[reg_dst], xmm_dst); break; case memory::data_type::u8: uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; case memory::data_type::s8: uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; default: assert(!"unknown dst_dt"); @@ -1261,9 +1261,9 @@ private: } inline void load_embedded_horiz_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt) { - movshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 + uni_vmovshdup(xmm_aux3, xmm_dst); // 
dst:1,2,3,4; aux3:2,2,4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2),f(2,2),f(3,4),f(4,4) - movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 + uni_vmovhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... load_scalar(xmm_aux3, ptr[reg_dst], dst_dt); @@ -1276,21 +1276,21 @@ private: case memory::data_type::s32: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); - movss(ptr[reg_dst], xmm_dst); + uni_vmovss(ptr[reg_dst], xmm_dst); break; case memory::data_type::u8: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; case memory::data_type::s8: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; default: assert(!"unknown dst_dt"); @@ -1300,7 +1300,7 @@ private: inline void horiz_ps(const Xmm& xmm, const Operand& op) { switch (jcp_.reduce_mode) { case ReduceAnd: - andps(xmm, op); + uni_vandps(xmm, xmm, op); break; case ReduceL1: case ReduceL2: @@ -1309,19 +1309,19 @@ private: case ReduceSum: case ReduceSumSquare: case ReduceLogSumExp: - addps(xmm, op); + uni_vaddps(xmm, xmm, op); break; case ReduceMax: - maxps(xmm, op); + uni_vmaxps(xmm, op); break; case ReduceMin: - minps(xmm, op); + uni_vminps(xmm, op); break; case ReduceOr: - orps(xmm, op); + uni_vorps(xmm, xmm, op); break; case ReduceProd: - mulps(xmm, op); + uni_vmulps(xmm, op); break; default: assert(!"unsupported reduce mode"); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_region_yolo_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_region_yolo_node.cpp index af1159bb07d..32379a8f0f3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_region_yolo_node.cpp +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_region_yolo_node.cpp @@ -202,10 +202,10 @@ private: inline void load_scalar(Xbyak::Xmm xmm_src, const Xbyak::Address &op, InferenceEngine::Precision src_dt) { switch (src_dt) { case InferenceEngine::Precision::FP32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case InferenceEngine::Precision::BF16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; default: @@ -215,11 +215,11 @@ private: inline void store_scalar(const Xbyak::Address &op, Xbyak::Xmm xmm_dst, InferenceEngine::Precision dst_dt) { switch (dst_dt) { case InferenceEngine::Precision::FP32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case InferenceEngine::Precision::BF16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; default: assert(!"unknown dst_dt"); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp index 346bc1079a9..f98b5609982 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp @@ -210,9 +210,9 @@ private: } void roi_pool_bilinear(int c_blocks) { - movq(xmm_yf, reg_yf); + uni_vmovq(xmm_yf, reg_yf); uni_vbroadcastss(vmm_yf, xmm_yf); - movq(xmm_xf, reg_xf); + uni_vmovq(xmm_xf, reg_xf); uni_vbroadcastss(vmm_xf, xmm_xf); Vmm vmm_src00 = get_src_reg(0); diff --git a/inference-engine/thirdparty/mkl-dnn b/inference-engine/thirdparty/mkl-dnn index 60f41b3a998..e03f65ca65a 160000 --- a/inference-engine/thirdparty/mkl-dnn +++ b/inference-engine/thirdparty/mkl-dnn @@ -1 +1 @@ -Subproject commit 60f41b3a9988ce7b1bc85c4f1ce7f9443bc91c9d +Subproject commit e03f65ca65aa771f3541163a33a57d7eea9ea83d