diff --git a/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp index 29c17d3f172..cf026dae9cd 100644 --- a/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp @@ -85,8 +85,8 @@ void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const if (isa == cpu::x64::sse41) { h->uni_vmovups(vmm_dst, vmm_src0); - h->mulps(vmm_dst, vmm_src1); - h->addps(vmm_dst, vmm_src2); + h->uni_vmulps(vmm_dst, vmm_src1); + h->uni_vaddps(vmm_dst, vmm_dst, vmm_src2); } else { Vmm vmm_mul0; if (vmm_dst.getIdx() == vmm_src0.getIdx()) { @@ -656,7 +656,7 @@ void jit_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const s } else if (isa == cpu::x64::avx2) { h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1); h->uni_vmovups(vmm_dst, table_val("zero")); - h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); + h->uni_vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); } else { h->vcmpps(k_mask, vmm_src0, vmm_src1, _cmp_eq_oq); h->uni_vmovups(vmm_dst, table_val("zero")); diff --git a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp index 83bc04c530d..ab1bb949e96 100644 --- a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp @@ -190,7 +190,7 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int o } if (bytes_to_load >= 8 && bytes_to_load < 16) - h->pinsrq(xmm, addr(start_bytes), 0); + h->uni_vpinsrq(xmm, xmm, addr(start_bytes), 0); else if (bytes_to_load == 16) h->uni_vmovdqu(xmm, addr(start_bytes)); @@ -202,17 +202,17 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int o h->uni_vpinsrw(xmm, xmm, addr(start_bytes), 0); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 2), 2); break; - case 4: h->pinsrd(xmm, addr(start_bytes), 0); break; + case 4: h->uni_vpinsrd(xmm, xmm, addr(start_bytes), 0); break; case 5: - h->pinsrd(xmm, addr(start_bytes), 0); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes), 0); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 4), 4); break; case 6: - h->pinsrd(xmm, addr(start_bytes), 0); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes), 0); h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); break; case 7: - h->pinsrd(xmm, addr(start_bytes), 0); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes), 0); h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 6), 6); break; @@ -223,17 +223,17 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int o h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 10), 10); break; - case 12: h->pinsrd(xmm, addr(start_bytes + 8), 2); break; + case 12: h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); break; case 13: - h->pinsrd(xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 12), 12); break; case 14: - h->pinsrd(xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); break; case 15: - h->pinsrd(xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 14), 14); break; @@ -465,10 +465,7 @@ template if (is_xmm || is_ymm) { uint8 imm = 1; imm = ~((imm << load_num) - imm); // shift load_num bit - if (is_xmm) - h->blendps(vmm, table_val(fill_value), imm); - else - h->vblendps(vmm, vmm, table_val(fill_value), imm); + h->uni_vblendps(vmm, vmm, table_val(fill_value), imm); } else if (is_zmm) { uint64_t tail_mask = 1; tail_mask = ~((tail_mask << load_num) - tail_mask); @@ -668,7 +665,7 @@ template } if (bytes_to_store >= 8 && bytes_to_store < 16) - h->pextrq(addr(start_bytes), xmm, 0); + h->uni_vpextrq(addr(start_bytes), xmm, 0); else if (bytes_to_store == 16) h->uni_vmovdqu(addr(start_bytes), xmm); @@ -682,17 +679,17 @@ template h->uni_vpextrw(addr(start_bytes), xmm, 0); h->uni_vpextrb(addr(start_bytes + 2), xmm, 2); break; - case 4: h->pextrd(addr(start_bytes), xmm, 0); break; + case 4: h->uni_vpextrd(addr(start_bytes), xmm, 0); break; case 5: - h->pextrd(addr(start_bytes), xmm, 0); + h->uni_vpextrd(addr(start_bytes), xmm, 0); h->uni_vpextrb(addr(start_bytes + 4), xmm, 4); break; case 6: - h->pextrd(addr(start_bytes), xmm, 0); + h->uni_vpextrd(addr(start_bytes), xmm, 0); h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); break; case 7: - h->pextrd(addr(start_bytes), xmm, 0); + h->uni_vpextrd(addr(start_bytes), xmm, 0); h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); h->uni_vpextrb(addr(start_bytes + 6), xmm, 6); break; @@ -703,17 +700,17 @@ template h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); h->uni_vpextrb(addr(start_bytes + 10), xmm, 10); break; - case 12: h->pextrd(addr(start_bytes + 8), xmm, 2); break; + case 12: h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); break; case 13: - h->pextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); h->uni_vpextrb(addr(start_bytes + 12), xmm, 12); break; case 14: - h->pextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); break; case 15: - h->pextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); h->uni_vpextrb(addr(start_bytes + 14), xmm, 14); break; diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/permute_kernel.cpp b/inference-engine/src/mkldnn_plugin/nodes/common/permute_kernel.cpp index 4babccb5186..e5a079dfb92 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/permute_kernel.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/common/permute_kernel.cpp @@ -47,21 +47,21 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge void load(const Xbyak::Xmm &xmm, const Xbyak::Address &addr) { switch (jcp.data_size) { - case 16: movups(xmm, addr); break; - case 8: movsd(xmm, addr); break; - case 4: movss(xmm, addr); break; - case 2: pinsrw(xmm, addr, 0x0); break; - case 1: pinsrb(xmm, addr, 0x0); break; + case 16: uni_vmovups(xmm, addr); break; + case 8: uni_vmovsd(xmm, addr); break; + case 4: uni_vmovss(xmm, addr); break; + case 2: uni_vpinsrw(xmm, xmm, addr, 0x0); break; + case 1: uni_vpinsrb(xmm, xmm, addr, 0x0); break; } } void store(const Xbyak::Address &addr, const Xbyak::Xmm &xmm) { switch (jcp.data_size) { - case 16: movups(addr, xmm); break; - case 8: movsd(addr, xmm); break; - case 4: movss(addr, xmm); break; - case 2: pextrw(addr, xmm, 0x0); break; - case 1: pextrb(addr, xmm, 0x0); break; + case 16: uni_vmovups(addr, xmm); break; + case 8: uni_vmovsd(addr, xmm); break; + case 4: uni_vmovss(addr, xmm); break; + case 2: uni_vpextrw(addr, xmm, 0x0); break; + case 1: uni_vpextrb(addr, xmm, 0x0); break; } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp index 517066d6f32..72b2c6b401f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.cpp @@ -215,7 +215,7 @@ private: case memory::data_type::s32: if (scalar_load) { mov(reg_tmp_32, op); - movq(xmm_in, reg_tmp_64); + uni_vmovq(xmm_in, reg_tmp_64); } else { uni_vmovups(vmm_in, op); } @@ -223,7 +223,7 @@ private: case memory::data_type::s8: if (scalar_load) { movsx(reg_tmp_32, op); - movq(xmm_in, reg_tmp_64); + uni_vmovq(xmm_in, reg_tmp_64); } else { uni_vpmovsxbd(vmm_in, op); } @@ -231,7 +231,7 @@ private: case memory::data_type::u8: if (scalar_load) { movzx(reg_tmp_32, op); - movq(xmm_in, reg_tmp_64); + uni_vmovq(xmm_in, reg_tmp_64); } else { uni_vpmovzxbd(vmm_in, op); } @@ -541,7 +541,7 @@ private: if (jcp_.exclude_pad) { mov(reg_shift, kw_padding[jj]); imul(reg_shift, reg_tmp_32); - movq(Xmm(vmm_shift.getIdx()), reg_shift); + uni_vmovq(Xmm(vmm_shift.getIdx()), reg_shift); uni_vbroadcastss(vmm_shift, Xmm(vmm_shift.getIdx())); uni_vcvtdq2ps(vmm_shift, vmm_shift); } @@ -612,7 +612,7 @@ private: } else { Ymm ymm_prev_dst = Ymm(vmm_sum.getIdx()); vperm2i128(ymm_prev_dst, ymm_prev_dst, ymm_prev_dst, 0x01); - vpslldq(vmm_sum, vmm_sum, (oc - jcp_.oc_block / 2) * sizeof(float)); + uni_vpslldq(vmm_sum, vmm_sum, (oc - jcp_.oc_block / 2) * sizeof(float)); } uni_vaddps(vmm_dst, vmm_dst, vmm_sum); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index 9dd250d7b96..8c9d610bd80 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -583,7 +583,7 @@ private: switch (src_prc) { case Precision::FP32: case Precision::I32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case Precision::BF16: uni_vpinsrw(xmm_src, xmm_src, op, 0); @@ -599,11 +599,11 @@ private: break; case Precision::I8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case Precision::U8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; default: assert(!"unknown src_prc"); @@ -730,7 +730,7 @@ private: switch (dst_prc) { case Precision::FP32: case Precision::I32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case Precision::BF16: uni_vpsrld(xmm_dst, xmm_dst, 16); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp index b12bed6a476..838a47d4085 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fake_quantize_node.cpp @@ -150,9 +150,9 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vpxor(xmm_wei(0), xmm_wei(0), xmm_wei(0)); uni_vpxor(xmm_mask(0), xmm_mask(0), xmm_mask(0)); - movss(xmm_src(0), ptr[reg_from + c * sizeof(float)]); - movss(xmm_wei(0), ptr[reg_thresholds + c * sizeof(float)]); - movss(xmm_mask(0), ptr[reg_output_mask + c * sizeof(float)]); + uni_vmovss(xmm_src(0), ptr[reg_from + c * sizeof(float)]); + uni_vmovss(xmm_wei(0), ptr[reg_thresholds + c * sizeof(float)]); + uni_vmovss(xmm_mask(0), ptr[reg_output_mask + c * sizeof(float)]); uni_vcmpgtps(xmm_src(0), xmm_src(0), xmm_wei(0)); uni_vpcmpeqd(xmm_src(0), xmm_src(0), xmm_mask(0)); uni_vmovmskps(reg_src_32, xmm_src(0)); @@ -591,13 +591,13 @@ private: jle(exit_label, T_NEAR); for (int i = 0; i < jqp_.c % tail4_simd_w; i++) { - movss(xmm_crop_low(0), ptr[reg_crop_low + i * wei_type_size]); - movss(xmm_crop_high(0), ptr[reg_crop_high + i * wei_type_size]); - movss(xmm_input_scale(0), ptr[reg_input_scale + i * wei_type_size]); - movss(xmm_input_shift(0), ptr[reg_input_shift + i * wei_type_size]); + uni_vmovss(xmm_crop_low(0), ptr[reg_crop_low + i * wei_type_size]); + uni_vmovss(xmm_crop_high(0), ptr[reg_crop_high + i * wei_type_size]); + uni_vmovss(xmm_input_scale(0), ptr[reg_input_scale + i * wei_type_size]); + uni_vmovss(xmm_input_shift(0), ptr[reg_input_shift + i * wei_type_size]); if (do_dequantization) { - movss(xmm_output_scale(0), ptr[reg_output_scale + i * wei_type_size]); - movss(xmm_output_shift(0), ptr[reg_output_shift + i * wei_type_size]); + uni_vmovss(xmm_output_scale(0), ptr[reg_output_scale + i * wei_type_size]); + uni_vmovss(xmm_output_shift(0), ptr[reg_output_shift + i * wei_type_size]); } load_scalar(xmm_val(0), ptr[aux_reg_from + i * src_type_size], jqp_.src_prc); @@ -688,15 +688,15 @@ private: switch (src_prc) { case Precision::FP32: case Precision::I32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case Precision::I8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case Precision::U8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; default: assert(!"unknown src_prc"); @@ -797,7 +797,7 @@ private: switch (dst_prc) { case Precision::FP32: case Precision::I32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case Precision::I8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp index 1926914f074..786d5dc41ac 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp @@ -104,13 +104,13 @@ protected: auto b = xmm2; auto c = xmm3; - movdqu(a, xword[src]); // load 4 floats - movdqu(b, a); // b = a - movdqu(c, a); // c = a - pcmpeqd(b, zero); // if (a == 0) b = 1 else b = 0 - pand(c, mask); // c = a & 01111111100000000000000000000000 - pcmpeqd(c, zero); // if (c == 0) c = 1 else c = 0 - ptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 + uni_vmovdqu(a, xword[src]); // load 4 floats + uni_vmovdqu(b, a); // b = a + uni_vmovdqu(c, a); // c = a + uni_vpcmpeqd(b, b, zero); // if (a == 0) b = 1 else b = 0 + uni_vpand(c, c, mask); // c = a & 01111111100000000000000000000000 + uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 + uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 } template diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp index f6873d88d34..6a2d8564bd3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp @@ -1199,10 +1199,10 @@ private: jl(tail_loop_end_label, T_NEAR); // get idx for input - movss(Xmm(vmm_tbl_y.getIdx()), ptr[reg_tbl_y]); + uni_vmovss(Xmm(vmm_tbl_y.getIdx()), ptr[reg_tbl_y]); gather_i32_indices(vmm_index_in_y, reg_index_y, 0, vmm_tbl_y, 1, memory::data_type::s32, true); - movss(Xmm(vmm_val.getIdx()), ptr[reg_tbl_x]); + uni_vmovss(Xmm(vmm_val.getIdx()), ptr[reg_tbl_x]); gather_i32_indices(vmm_index_in_x, reg_index, 0, vmm_val, 1, memory::data_type::s32, true); // gather weightX by input idx, used in y0-y3 gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, memory::data_type::f32, true); @@ -1430,18 +1430,18 @@ private: switch (src_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case memory::data_type::s8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::u8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::bf16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; default: @@ -1536,7 +1536,7 @@ private: switch (dst_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case memory::data_type::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); @@ -1552,7 +1552,7 @@ private: break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; default: assert(!"unknown dst_dt"); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp index f27a40e3bd2..f1801d72b61 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp @@ -102,17 +102,17 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k Xbyak::Ymm ymm_sum = Xbyak::Ymm(vmm_dst.getIdx()); vextractf128(xmm_aux1, ymm_sum, 0); vextractf128(xmm_aux2, ymm_sum, 1); - addps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); hsum_store(xmm_aux1); } else { Xbyak::Zmm zmm_sum = Xbyak::Zmm(vmm_dst.getIdx()); vextractf32x4(xmm_aux1, zmm_sum, 0); vextractf32x4(xmm_aux2, zmm_sum, 1); - addps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); vextractf32x4(xmm_aux2, zmm_sum, 2); vextractf32x4(xmm_aux3, zmm_sum, 3); - addps(xmm_aux2, xmm_aux3); - addps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux2, xmm_aux2, xmm_aux3); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); hsum_store(xmm_aux1); } } else { @@ -342,14 +342,14 @@ private: } inline void hsum_store(Xbyak::Xmm xmm_sum) { - movshdup(xmm_aux3, xmm_sum); // sum:1,2,3,4; aux3:2,2,4,4 - addps(xmm_sum, xmm_aux3); // sum:1+2,2+2,3+4,4+4 - movhlps(xmm_aux3, xmm_sum); // aux3:3+4,4+4,4,4 - addps(xmm_sum, xmm_aux3); // sum:1+2+3+4,... + uni_vmovshdup(xmm_aux3, xmm_sum); // sum:1,2,3,4; aux3:2,2,4,4 + uni_vaddps(xmm_sum, xmm_sum, xmm_aux3); // sum:1+2,2+2,3+4,4+4 + uni_vmovhlps(xmm_aux3, xmm_sum); // aux3:3+4,4+4,4,4 + uni_vaddps(xmm_sum, xmm_sum, xmm_aux3); // sum:1+2+3+4,... if (jcp_.normalize_variance) { - movss(ptr[reg_variance], xmm_sum); + uni_vmovss(ptr[reg_variance], xmm_sum); } else { - movss(ptr[reg_sum], xmm_sum); + uni_vmovss(ptr[reg_sum], xmm_sum); } } }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp index ff95f416573..50345af1821 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp @@ -88,17 +88,21 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker Xbyak::Ymm ymm_sqr_sum = Xbyak::Ymm(vmm_sqr_sum.getIdx()); vextractf128(xmm_aux1, ymm_sqr_sum, 0); vextractf128(xmm_aux2, ymm_sqr_sum, 1); - addps(xmm_aux1, xmm_aux2); + // vaddps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); hsum_store(xmm_aux1); } else { Xbyak::Zmm zmm_sqr_sum = Xbyak::Zmm(vmm_sqr_sum.getIdx()); vextractf32x4(xmm_aux1, zmm_sqr_sum, 0); vextractf32x4(xmm_aux2, zmm_sqr_sum, 1); - addps(xmm_aux1, xmm_aux2); + // vaddps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); vextractf32x4(xmm_aux2, zmm_sqr_sum, 2); vextractf32x4(xmm_aux3, zmm_sqr_sum, 3); - addps(xmm_aux2, xmm_aux3); - addps(xmm_aux1, xmm_aux2); + // vaddps(xmm_aux2, xmm_aux3); + // vaddps(xmm_aux1, xmm_aux2); + uni_vaddps(xmm_aux2, xmm_aux2, xmm_aux3); + uni_vaddps(xmm_aux1, xmm_aux1, xmm_aux2); hsum_store(xmm_aux1); } } @@ -124,11 +128,11 @@ private: Xbyak::Xmm xmm_aux3 = Xbyak::Xmm(4); inline void hsum_store(Xbyak::Xmm xmm_sqr_sum) { - movshdup(xmm_aux3, xmm_sqr_sum); // sqrt_sum:1,2,3,4; aux3:2,2,4,4 - addps(xmm_sqr_sum, xmm_aux3); // sqrt_sum:1+2,2+2,3+4,4+4 - movhlps(xmm_aux3, xmm_sqr_sum); // aux3:3+4,4+4,4,4 - addps(xmm_sqr_sum, xmm_aux3); // sqrt_sum:1+2+3+4,... - movss(ptr[reg_modulo], xmm_sqr_sum); + uni_vmovshdup(xmm_aux3, xmm_sqr_sum); // sqrt_sum:1,2,3,4; aux3:2,2,4,4 + uni_vaddps(xmm_sqr_sum, xmm_sqr_sum, xmm_aux3); // sqrt_sum:1+2,2+2,3+4,4+4 + uni_vmovhlps(xmm_aux3, xmm_sqr_sum); // aux3:3+4,4+4,4,4 + uni_vaddps(xmm_sqr_sum, xmm_sqr_sum, xmm_aux3); // sqrt_sum:1+2+3+4,... + uni_vmovss(ptr[reg_modulo], xmm_sqr_sum); } inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) { @@ -359,6 +363,7 @@ private: load_scalar(xmm_val, ptr[reg_src], jcp_.src_dt); uni_vmulps(xmm_val, xmm_val, xmm_fused_factor); + if (attr_.post_ops_.len() != 0) { apply_post_ops(jcp_.dst_dt, 0); add(reg_oc_off, step * sizeof(float)); @@ -493,19 +498,19 @@ private: switch (src_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case memory::data_type::bf16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; case memory::data_type::s8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::u8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; default: assert(!"unknown dst_dt"); @@ -568,11 +573,11 @@ private: switch (dst_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; case memory::data_type::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp index 7828bc55f27..e9c3a1a619b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp @@ -607,19 +607,19 @@ private: switch (src_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case memory::data_type::bf16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; case memory::data_type::s8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::u8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; default: assert(!"unknown src_dt"); @@ -692,11 +692,11 @@ private: switch (dst_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; case memory::data_type::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); @@ -707,7 +707,7 @@ private: case memory::data_type::u8: uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); + vmovq(reg_tmp_64, xmm_dst); mov(op, reg_tmp_8); break; default: @@ -738,9 +738,9 @@ private: } inline void load_embedded_horiz_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt) { - movshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 + uni_vmovshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2),f(2,2),f(3,4),f(4,4) - movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 + uni_vmovhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... load_scalar(xmm_aux3, ptr[reg_dst], dst_dt); @@ -753,21 +753,21 @@ private: case memory::data_type::s32: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); - movss(ptr[reg_dst], xmm_dst); + uni_vmovss(ptr[reg_dst], xmm_dst); break; case memory::data_type::u8: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; case memory::data_type::s8: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; default: assert(!"unknown dst_dt"); @@ -777,7 +777,7 @@ private: inline void horiz_ps(const Xmm& xmm, const Operand& op) { switch (jcp_.reduce_mode) { case ReduceAnd: - andps(xmm, op); + uni_vandps(xmm, xmm, op); break; case ReduceL1: case ReduceL2: @@ -786,19 +786,19 @@ private: case ReduceSum: case ReduceSumSquare: case ReduceLogSumExp: - addps(xmm, op); + uni_vaddps(xmm, xmm, op); break; case ReduceMax: - maxps(xmm, op); + uni_vmaxps(xmm, op); break; case ReduceMin: - minps(xmm, op); + uni_vminps(xmm, op); break; case ReduceOr: - orps(xmm, op); + uni_vorps(xmm, xmm, op); break; case ReduceProd: - mulps(xmm, op); + uni_vmulps(xmm, op); break; default: assert(!"unsupported reduce mode"); @@ -1074,19 +1074,19 @@ private: switch (src_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case memory::data_type::bf16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; case memory::data_type::s8: movsx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; case memory::data_type::u8: movzx(reg_tmp_32, op); - movq(xmm_src, reg_tmp_64); + uni_vmovq(xmm_src, reg_tmp_64); break; default: assert(!"unknown src_dt"); @@ -1159,11 +1159,11 @@ private: switch (dst_dt) { case memory::data_type::f32: case memory::data_type::s32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; case memory::data_type::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); @@ -1205,33 +1205,33 @@ private: } inline void horize_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt) { - movshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 + uni_vmovshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2),f(2,2),f(3,4),f(4,4) - movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 + uni_vmovhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... switch (dst_dt) { case memory::data_type::f32: - movss(ptr[reg_dst], xmm_dst); + uni_vmovss(ptr[reg_dst], xmm_dst); break; case memory::data_type::bf16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(ptr[reg_dst], xmm_dst, 0x0); + uni_vpextrw(ptr[reg_dst], xmm_dst, 0x0); break; case memory::data_type::s32: uni_vcvtps2dq(xmm_dst, xmm_dst); - movss(ptr[reg_dst], xmm_dst); + uni_vmovss(ptr[reg_dst], xmm_dst); break; case memory::data_type::u8: uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; case memory::data_type::s8: uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; default: assert(!"unknown dst_dt"); @@ -1261,9 +1261,9 @@ private: } inline void load_embedded_horiz_store(Xbyak::Xmm xmm_dst, memory::data_type dst_dt) { - movshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 + uni_vmovshdup(xmm_aux3, xmm_dst); // dst:1,2,3,4; aux3:2,2,4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2),f(2,2),f(3,4),f(4,4) - movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 + uni_vmovhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4 horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... load_scalar(xmm_aux3, ptr[reg_dst], dst_dt); @@ -1276,21 +1276,21 @@ private: case memory::data_type::s32: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); - movss(ptr[reg_dst], xmm_dst); + uni_vmovss(ptr[reg_dst], xmm_dst); break; case memory::data_type::u8: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; case memory::data_type::s8: horiz_ps(xmm_dst, xmm_aux3); uni_vcvtps2dq(xmm_dst, xmm_dst); uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - pextrb(ptr[reg_dst], xmm_dst, 0); + uni_vpextrb(ptr[reg_dst], xmm_dst, 0); break; default: assert(!"unknown dst_dt"); @@ -1300,7 +1300,7 @@ private: inline void horiz_ps(const Xmm& xmm, const Operand& op) { switch (jcp_.reduce_mode) { case ReduceAnd: - andps(xmm, op); + uni_vandps(xmm, xmm, op); break; case ReduceL1: case ReduceL2: @@ -1309,19 +1309,19 @@ private: case ReduceSum: case ReduceSumSquare: case ReduceLogSumExp: - addps(xmm, op); + uni_vaddps(xmm, xmm, op); break; case ReduceMax: - maxps(xmm, op); + uni_vmaxps(xmm, op); break; case ReduceMin: - minps(xmm, op); + uni_vminps(xmm, op); break; case ReduceOr: - orps(xmm, op); + uni_vorps(xmm, xmm, op); break; case ReduceProd: - mulps(xmm, op); + uni_vmulps(xmm, op); break; default: assert(!"unsupported reduce mode"); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_region_yolo_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_region_yolo_node.cpp index af1159bb07d..32379a8f0f3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_region_yolo_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_region_yolo_node.cpp @@ -202,10 +202,10 @@ private: inline void load_scalar(Xbyak::Xmm xmm_src, const Xbyak::Address &op, InferenceEngine::Precision src_dt) { switch (src_dt) { case InferenceEngine::Precision::FP32: - movss(xmm_src, op); + uni_vmovss(xmm_src, op); break; case InferenceEngine::Precision::BF16: - pinsrw(xmm_src, op, 0x0); + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); uni_vpslld(xmm_src, xmm_src, 16); break; default: @@ -215,11 +215,11 @@ private: inline void store_scalar(const Xbyak::Address &op, Xbyak::Xmm xmm_dst, InferenceEngine::Precision dst_dt) { switch (dst_dt) { case InferenceEngine::Precision::FP32: - movss(op, xmm_dst); + uni_vmovss(op, xmm_dst); break; case InferenceEngine::Precision::BF16: uni_vpsrld(xmm_dst, xmm_dst, 16); - pextrw(op, xmm_dst, 0x0); + uni_vpextrw(op, xmm_dst, 0x0); break; default: assert(!"unknown dst_dt"); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp index 346bc1079a9..f98b5609982 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp @@ -210,9 +210,9 @@ private: } void roi_pool_bilinear(int c_blocks) { - movq(xmm_yf, reg_yf); + uni_vmovq(xmm_yf, reg_yf); uni_vbroadcastss(vmm_yf, xmm_yf); - movq(xmm_xf, reg_xf); + uni_vmovq(xmm_xf, reg_xf); uni_vbroadcastss(vmm_xf, xmm_xf); Vmm vmm_src00 = get_src_reg(0); diff --git a/inference-engine/thirdparty/mkl-dnn b/inference-engine/thirdparty/mkl-dnn index 60f41b3a998..e03f65ca65a 160000 --- a/inference-engine/thirdparty/mkl-dnn +++ b/inference-engine/thirdparty/mkl-dnn @@ -1 +1 @@ -Subproject commit 60f41b3a9988ce7b1bc85c4f1ce7f9443bc91c9d +Subproject commit e03f65ca65aa771f3541163a33a57d7eea9ea83d