diff --git a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp index 20d48e3ac18..3f8b5f12ad9 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp @@ -3,24 +3,23 @@ // #include "jit_eltwise_emitters.hpp" -#include -#include -#include -#include using namespace InferenceEngine; using namespace dnnl::impl::utils; -using namespace dnnl::impl; -using namespace dnnl::impl::cpu::x64; +using namespace dnnl::impl::cpu; using namespace Xbyak; +#define CONST_1_F 0x3f800000 // 1.f +#define INF_MASK 0x7F800000 +#define INF_NEG_MASK 0xFF800000 + namespace ov { namespace intel_cpu { /// ADD /// -jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_add_emitter::jit_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_add_emitter::jit_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_add_emitter::get_inputs_num() const { return 2; } @@ -28,20 +27,20 @@ size_t jit_add_emitter::get_inputs_num() const { return 2; } void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -54,7 +53,7 @@ void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std } }; - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->uni_vmovups(vmm_dst, vmm_src0); uni_vadd(vmm_dst, vmm_dst, vmm_src1); } else { @@ -62,14 +61,14 @@ void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std } } -std::set jit_add_emitter::get_supported_precisions() { +std::set jit_add_emitter::get_supported_precisions() { return {Precision::FP32, Precision::I32}; } /// MUL_ADD /// -jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_mul_add_emitter::get_inputs_num() const { return 3; } @@ -77,20 +76,20 @@ size_t jit_mul_add_emitter::get_inputs_num() const { return 3; } void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_src2 = Vmm(in_vec_idxs[2]); @@ -144,7 +143,7 @@ void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const } }; - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { uni_vfmadd231_xmm(vmm_dst, vmm_src0, vmm_src1, vmm_src2); } else { uni_vfmadd231_vmm(vmm_dst, vmm_src0, vmm_src1, vmm_src2); @@ -155,14 +154,14 @@ size_t jit_mul_add_emitter::aux_vecs_count() const { return 1; } -std::set jit_mul_add_emitter::get_supported_precisions() { +std::set jit_mul_add_emitter::get_supported_precisions() { return {Precision::FP32, Precision::I32}; } /// SUB /// -jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_subtract_emitter::get_inputs_num() const { return 2; } @@ -170,20 +169,20 @@ size_t jit_subtract_emitter::get_inputs_num() const { return 2; } void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -196,7 +195,7 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons } }; - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->uni_vmovups(vmm_dst, vmm_src0); uni_vsub(vmm_dst, vmm_dst, vmm_src1); } else { @@ -204,14 +203,14 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons } } -std::set jit_subtract_emitter::get_supported_precisions() { +std::set jit_subtract_emitter::get_supported_precisions() { return {Precision::FP32, Precision::I32}; } /// MULTIPLY /// -jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_multiply_emitter::get_inputs_num() const { return 2; } @@ -219,20 +218,20 @@ size_t jit_multiply_emitter::get_inputs_num() const { return 2; } void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -245,7 +244,7 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons } }; - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->uni_vmovups(vmm_dst, vmm_src0); uni_vmul(vmm_dst, vmm_dst, vmm_src1); } else { @@ -253,14 +252,14 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons } } -std::set jit_multiply_emitter::get_supported_precisions() { +std::set jit_multiply_emitter::get_supported_precisions() { return {Precision::FP32, Precision::I32}; } /// DIVIDE /// -jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_divide_emitter::get_inputs_num() const { return 2; } @@ -268,20 +267,20 @@ size_t jit_divide_emitter::get_inputs_num() const { return 2; } void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -308,7 +307,7 @@ void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const } }; - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->uni_vmovups(vmm_dst, vmm_src0); uni_vdiv(vmm_dst, vmm_dst, vmm_src1); } else { @@ -316,7 +315,7 @@ void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -std::set jit_divide_emitter::get_supported_precisions() { +std::set jit_divide_emitter::get_supported_precisions() { return {Precision::FP32, Precision::I32}; } @@ -325,9 +324,9 @@ size_t jit_divide_emitter::aux_vecs_count() const { } /// FLOOR /// -jit_floor_emitter::jit_floor_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_floor_emitter::jit_floor_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_floor_emitter::get_inputs_num() const { return 1; } @@ -335,29 +334,29 @@ size_t jit_floor_emitter::get_inputs_num() const { return 1; } void jit_floor_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); h->uni_vroundps(vmm_dst, vmm_src, 1); } /// CEILING /// -jit_ceiling_emitter::jit_ceiling_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_ceiling_emitter::jit_ceiling_emitter(jit_generator* host, cpu_isa_t host_isa, Precision exec_prc) +jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_ceiling_emitter::get_inputs_num() const { return 1; } @@ -366,29 +365,29 @@ void jit_ceiling_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_ceiling_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); h->uni_vroundps(vmm_dst, vmm_src, 2); } /// FLOOR_MOD /// -jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_floor_mod_emitter::jit_floor_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_floor_mod_emitter::jit_floor_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_floor_mod_emitter::get_inputs_num() const { return 2; } @@ -396,26 +395,26 @@ size_t jit_floor_mod_emitter::get_inputs_num() const { return 2; } void jit_floor_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { if (vmm_dst.getIdx() != vmm_src0.getIdx()) h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vmovups(vmm_aux0, vmm_src0); @@ -438,9 +437,9 @@ size_t jit_floor_mod_emitter::aux_vecs_count() const { } /// MOD /// -jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_mod_emitter::get_inputs_num() const { return 2; } @@ -448,26 +447,26 @@ size_t jit_mod_emitter::get_inputs_num() const { return 2; } void jit_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { if (vmm_dst.getIdx() != vmm_src0.getIdx()) h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vmovups(vmm_aux0, vmm_src0); @@ -490,9 +489,9 @@ size_t jit_mod_emitter::aux_vecs_count() const { } /// MAXIMUM /// -jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_maximum_emitter::get_inputs_num() const { return 2; } @@ -500,20 +499,20 @@ size_t jit_maximum_emitter::get_inputs_num() const { return 2; } void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -526,7 +525,7 @@ void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const } }; - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { if (vmm_src0.getIdx() != vmm_dst.getIdx()) h->uni_vmovups(vmm_dst, vmm_src0); uni_vmax(vmm_dst, vmm_dst, vmm_src1); @@ -535,14 +534,14 @@ void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -std::set jit_maximum_emitter::get_supported_precisions() { +std::set jit_maximum_emitter::get_supported_precisions() { return {Precision::FP32, Precision::I32}; } /// MINIMUM /// -jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_minimum_emitter::get_inputs_num() const { return 2; } @@ -550,20 +549,20 @@ size_t jit_minimum_emitter::get_inputs_num() const { return 2; } void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -576,7 +575,7 @@ void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const } }; - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { if (vmm_src0.getIdx() != vmm_dst.getIdx()) h->uni_vmovups(vmm_dst, vmm_src0); uni_vmin(vmm_dst, vmm_dst, vmm_src1); @@ -585,15 +584,15 @@ void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -std::set jit_minimum_emitter::get_supported_precisions() { +std::set jit_minimum_emitter::get_supported_precisions() { return {Precision::FP32, Precision::I32}; } /// SQUARED_DIFFERENCE /// jit_squared_difference_emitter::jit_squared_difference_emitter( - jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) + x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_squared_difference_emitter::get_inputs_num() const { return 2; } @@ -601,20 +600,20 @@ size_t jit_squared_difference_emitter::get_inputs_num() const { return 2; } void jit_squared_difference_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -633,7 +632,7 @@ void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_ } }; - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { if (vmm_src0.getIdx() != vmm_dst.getIdx()) h->uni_vmovups(vmm_dst, vmm_src0); uni_vsqdiff(vmm_dst, vmm_dst, vmm_src1); @@ -642,35 +641,36 @@ void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_ } } -std::set jit_squared_difference_emitter::get_supported_precisions() { +std::set jit_squared_difference_emitter::get_supported_precisions() { return {Precision::FP32, Precision::I32}; } /// POWER_DYNAMIC /// -jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} -jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + Precision exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) {} +jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} size_t jit_power_dynamic_emitter::get_inputs_num() const { return 2; } void jit_power_dynamic_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -689,10 +689,10 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, // caller obligation to save k-regs as callee may use them size_t n_k_regs_to_save = 8; - if (isa == cpu::x64::avx512_core || isa == cpu::x64::avx512_core) { + if (isa == x64::avx512_core || isa == x64::avx512_core) { h->sub(h->rsp, n_k_regs_to_save * k_mask_size); for (size_t i = 0; i < n_k_regs_to_save; ++i) { - if (mayiuse(avx512_core)) + if (x64::mayiuse(x64::avx512_core)) h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(i)); else h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(i)); @@ -738,9 +738,9 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length()); // restore k registers - if (isa == cpu::x64::avx512_core || isa == cpu::x64::avx512_core) { + if (isa == x64::avx512_core || isa == x64::avx512_core) { for (int i = n_k_regs_to_save - 1; i >= 0; --i) { - if (mayiuse(avx512_core)) + if (x64::mayiuse(x64::avx512_core)) h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]); else h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]); @@ -756,11 +756,11 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, /// EQUAL /// -jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -770,33 +770,33 @@ size_t jit_equal_emitter::get_inputs_num() const { return 2; } void jit_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->movups(vmm_aux0, vmm_src0); h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); h->movups(vmm_aux1, table_val("one")); h->pxor(vmm_dst, vmm_dst); h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1); h->uni_vmovups(vmm_dst, table_val("zero")); h->uni_vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); @@ -809,7 +809,7 @@ void jit_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const s void jit_equal_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_equal_emitter::aux_vecs_count() const { @@ -817,11 +817,11 @@ size_t jit_equal_emitter::aux_vecs_count() const { } /// NOT_EQUAL /// -jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_not_equal_emitter::jit_not_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_not_equal_emitter::jit_not_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -831,33 +831,33 @@ size_t jit_not_equal_emitter::get_inputs_num() const { return 2; } void jit_not_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_not_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->movups(vmm_aux0, vmm_src0); h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq); h->movups(vmm_dst, table_val("one")); h->pxor(vmm_aux1, vmm_aux1); h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1); h->uni_vmovups(vmm_dst, table_val("one")); h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); @@ -870,7 +870,7 @@ void jit_not_equal_emitter::emit_isa(const std::vector &in_vec_idxs, con void jit_not_equal_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_not_equal_emitter::aux_vecs_count() const { @@ -878,11 +878,11 @@ size_t jit_not_equal_emitter::aux_vecs_count() const { } /// GREATER /// -jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -892,33 +892,33 @@ size_t jit_greater_emitter::get_inputs_num() const { return 2; } void jit_greater_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_greater_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->movups(vmm_aux0, vmm_src0); h->cmpps(vmm_aux0, vmm_src1, _cmp_gt_os); h->movups(vmm_aux1, table_val("one")); h->pxor(vmm_dst, vmm_dst); h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpgtps(vmm_aux0, vmm_src0, vmm_src1); h->uni_vmovups(vmm_dst, table_val("zero")); h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); @@ -931,7 +931,7 @@ void jit_greater_emitter::emit_isa(const std::vector &in_vec_idxs, const void jit_greater_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_greater_emitter::aux_vecs_count() const { @@ -939,11 +939,12 @@ size_t jit_greater_emitter::aux_vecs_count() const { } /// GREATER_EQUAL /// -jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, + Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -953,33 +954,33 @@ size_t jit_greater_equal_emitter::get_inputs_num() const { return 2; } void jit_greater_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_greater_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->movups(vmm_aux0, vmm_src0); h->cmpps(vmm_aux0, vmm_src1, _cmp_ge_os); h->movups(vmm_aux1, table_val("one")); h->pxor(vmm_dst, vmm_dst); h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpgeps(vmm_aux0, vmm_src0, vmm_src1); h->uni_vmovups(vmm_dst, table_val("zero")); h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); @@ -992,7 +993,7 @@ void jit_greater_equal_emitter::emit_isa(const std::vector &in_vec_idxs, void jit_greater_equal_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_greater_equal_emitter::aux_vecs_count() const { @@ -1000,11 +1001,11 @@ size_t jit_greater_equal_emitter::aux_vecs_count() const { } /// LESS /// -jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1014,33 +1015,33 @@ size_t jit_less_emitter::get_inputs_num() const { return 2; } void jit_less_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_less_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->movups(vmm_aux0, vmm_src0); h->cmpps(vmm_aux0, vmm_src1, _cmp_lt_os); h->movups(vmm_aux1, table_val("one")); h->pxor(vmm_dst, vmm_dst); h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpltps(vmm_aux0, vmm_src0, vmm_src1); h->uni_vmovups(vmm_dst, table_val("zero")); h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); @@ -1053,7 +1054,7 @@ void jit_less_emitter::emit_isa(const std::vector &in_vec_idxs, const st void jit_less_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_less_emitter::aux_vecs_count() const { @@ -1061,11 +1062,11 @@ size_t jit_less_emitter::aux_vecs_count() const { } /// LESS_EQUAL /// -jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_less_equal_emitter::jit_less_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_less_equal_emitter::jit_less_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1075,20 +1076,20 @@ size_t jit_less_equal_emitter::get_inputs_num() const { return 2; } void jit_less_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_less_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1096,13 +1097,13 @@ void jit_less_equal_emitter::emit_isa(const std::vector &in_vec_idxs, co Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->movups(vmm_aux0, vmm_src0); h->cmpps(vmm_aux0, vmm_src1, _cmp_le_os); h->movups(vmm_aux1, table_val("one")); h->pxor(vmm_dst, vmm_dst); h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpleps(vmm_aux0, vmm_src0, vmm_src1); h->uni_vmovups(vmm_dst, table_val("zero")); h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); @@ -1115,7 +1116,7 @@ void jit_less_equal_emitter::emit_isa(const std::vector &in_vec_idxs, co void jit_less_equal_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_less_equal_emitter::aux_vecs_count() const { @@ -1123,11 +1124,11 @@ size_t jit_less_equal_emitter::aux_vecs_count() const { } /// LOGICAL_AND /// -jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_logical_and_emitter::jit_logical_and_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_logical_and_emitter::jit_logical_and_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1137,20 +1138,20 @@ size_t jit_logical_and_emitter::get_inputs_num() const { return 2; } void jit_logical_and_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1158,7 +1159,7 @@ void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, c Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->pxor(vmm_aux0, vmm_aux0); h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); h->movups(vmm_dst, table_val("one")); @@ -1172,7 +1173,7 @@ void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, c h->blendvps(vmm_aux2, vmm_aux1); h->uni_vandps(vmm_dst, vmm_dst, vmm_aux2); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); h->uni_vmovups(vmm_dst, table_val("one")); h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); @@ -1196,7 +1197,7 @@ void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, c void jit_logical_and_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_logical_and_emitter::aux_vecs_count() const { @@ -1205,11 +1206,11 @@ size_t jit_logical_and_emitter::aux_vecs_count() const { /// LOGICAL_OR /// -jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_logical_or_emitter::jit_logical_or_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_logical_or_emitter::jit_logical_or_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1219,20 +1220,20 @@ size_t jit_logical_or_emitter::get_inputs_num() const { return 2; } void jit_logical_or_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1240,7 +1241,7 @@ void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, co Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->pxor(vmm_aux0, vmm_aux0); h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); h->movups(vmm_dst, table_val("one")); @@ -1254,7 +1255,7 @@ void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, co h->blendvps(vmm_aux2, vmm_aux1); h->uni_vorps(vmm_dst, vmm_dst, vmm_aux2); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); h->uni_vmovups(vmm_dst, table_val("one")); h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); @@ -1278,7 +1279,7 @@ void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, co void jit_logical_or_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_logical_or_emitter::aux_vecs_count() const { @@ -1286,11 +1287,11 @@ size_t jit_logical_or_emitter::aux_vecs_count() const { } /// LOGICAL_XOR /// -jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_logical_xor_emitter::jit_logical_xor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_logical_xor_emitter::jit_logical_xor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1300,20 +1301,20 @@ size_t jit_logical_xor_emitter::get_inputs_num() const { return 2; } void jit_logical_xor_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1321,7 +1322,7 @@ void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, c Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->pxor(vmm_aux0, vmm_aux0); h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); h->movups(vmm_dst, table_val("one")); @@ -1335,7 +1336,7 @@ void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, c h->blendvps(vmm_aux2, vmm_aux1); h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux2); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); h->uni_vmovups(vmm_dst, table_val("one")); h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0); @@ -1359,7 +1360,7 @@ void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, c void jit_logical_xor_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_logical_xor_emitter::aux_vecs_count() const { @@ -1367,11 +1368,11 @@ size_t jit_logical_xor_emitter::aux_vecs_count() const { } /// LOGICAL_NOT /// -jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_logical_not_emitter::jit_logical_not_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_logical_not_emitter::jit_logical_not_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1381,32 +1382,32 @@ size_t jit_logical_not_emitter::get_inputs_num() const { return 1; } void jit_logical_not_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_logical_not_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->pxor(vmm_aux0, vmm_aux0); h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq); h->movups(vmm_aux1, table_val("one")); h->pxor(vmm_dst, vmm_dst); h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero")); h->uni_vmovups(vmm_dst, table_val("zero")); h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0); @@ -1419,7 +1420,7 @@ void jit_logical_not_emitter::emit_isa(const std::vector &in_vec_idxs, c void jit_logical_not_emitter::register_table_entries() { push_arg_entry_of("zero", 0x00000000, true); - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); } size_t jit_logical_not_emitter::aux_vecs_count() const { @@ -1427,7 +1428,7 @@ size_t jit_logical_not_emitter::aux_vecs_count() const { } /// POWER_STATIC /// -jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { auto powerStaticNode = ov::as_type_ptr(node); if (powerStaticNode == nullptr) { @@ -1441,7 +1442,7 @@ jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_ prepare_table(); } -jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, +jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, float inpPower, float inpScale, float inpShift, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc), power(inpPower), scale(inpScale), shift(inpShift) { @@ -1453,20 +1454,20 @@ size_t jit_power_static_emitter::get_inputs_num() const { return 1; } void jit_power_static_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); @@ -1474,7 +1475,7 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, Xmm xmm0 = Xmm(0), xmm1 = Xmm(1); if (scale != 1.f || shift != 0.f) { - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->uni_vmovups(vmm_aux0, table_val("scale")); h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src0); h->uni_vmovups(vmm_dst, table_val("shift")); @@ -1500,7 +1501,7 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, if (power < 0.f) { h->uni_vmovups(vmm_aux0, table_val("one")); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst); h->uni_vmovups(vmm_dst, vmm_aux0); } else { @@ -1516,7 +1517,7 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, if (power < 0.f) { h->uni_vmovups(vmm_aux0, table_val("one")); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst); h->uni_vmovups(vmm_dst, vmm_aux0); } else { @@ -1538,10 +1539,10 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, // caller obligation to save k-regs as callee may use them size_t n_k_regs_to_save = 8; - if (isa == cpu::x64::avx512_core || isa == cpu::x64::avx512_core) { + if (isa == x64::avx512_core || isa == x64::avx512_core) { h->sub(h->rsp, n_k_regs_to_save * k_mask_size); for (size_t i = 0; i < n_k_regs_to_save; ++i) { - if (mayiuse(avx512_core)) + if (x64::mayiuse(x64::avx512_core)) h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(i)); else h->kmovw(h->ptr[h->rsp + i * k_mask_size], Opmask(i)); @@ -1587,9 +1588,9 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length()); // restore k registers - if (isa == cpu::x64::avx512_core || isa == cpu::x64::avx512_core) { + if (isa == x64::avx512_core || isa == x64::avx512_core) { for (int i = n_k_regs_to_save - 1; i >= 0; --i) { - if (mayiuse(avx512_core)) + if (x64::mayiuse(x64::avx512_core)) h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]); else h->kmovw(Opmask(i), h->ptr[h->rsp + i * k_mask_size]); @@ -1605,10 +1606,10 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, } void jit_power_static_emitter::register_table_entries() { - push_arg_entry_of("power", cpu::x64::float2int(power), true); - push_arg_entry_of("scale", cpu::x64::float2int(scale), true); - push_arg_entry_of("shift", cpu::x64::float2int(shift), true); - push_arg_entry_of("one", cpu::x64::float2int(1.f), true); + push_arg_entry_of("power", x64::float2int(power), true); + push_arg_entry_of("scale", x64::float2int(scale), true); + push_arg_entry_of("shift", x64::float2int(shift), true); + push_arg_entry_of("one", x64::float2int(1.f), true); } size_t jit_power_static_emitter::aux_vecs_count() const { @@ -1616,11 +1617,11 @@ size_t jit_power_static_emitter::aux_vecs_count() const { } /// PRELU /// -jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_prelu_emitter::jit_prelu_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_prelu_emitter::jit_prelu_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1629,27 +1630,27 @@ size_t jit_prelu_emitter::get_inputs_num() const { return 2; } void jit_prelu_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_prelu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]); - if (isa == cpu::x64::sse41) { + if (isa == x64::sse41) { h->pxor(vmm_aux0, vmm_aux0); h->cmpps(vmm_aux0, vmm_src0, _cmp_gt_os); h->movups(vmm_aux1, vmm_src1); @@ -1657,12 +1658,12 @@ void jit_prelu_emitter::emit_isa(const std::vector &in_vec_idxs, const s if (vmm_src0.getIdx() != vmm_dst.getIdx()) h->movups(vmm_dst, vmm_src0); h->blendvps(vmm_dst, vmm_aux1); - } else if (isa == cpu::x64::avx2) { + } else if (isa == x64::avx2) { h->vmulps(vmm_aux0, vmm_src0, vmm_src1); h->vxorps(vmm_aux1, vmm_aux1, vmm_aux1); h->vcmpgtps(vmm_aux1, vmm_src0, vmm_aux1); h->vblendvps(vmm_dst, vmm_aux0, vmm_src0, vmm_aux1); - } else if (isa == cpu::x64::avx512_core) { + } else if (isa == x64::avx512_core) { h->vxorpd(vmm_aux0, vmm_aux0, vmm_aux0); if (vmm_src0.getIdx() != vmm_dst.getIdx()) h->vmovups(vmm_dst, vmm_src0); @@ -1676,9 +1677,9 @@ size_t jit_prelu_emitter::aux_vecs_count() const { } /// SQRT /// -jit_sqrt_emitter::jit_sqrt_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_sqrt_emitter::jit_sqrt_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) {} size_t jit_sqrt_emitter::get_inputs_num() const { return 1; } @@ -1686,20 +1687,20 @@ size_t jit_sqrt_emitter::get_inputs_num() const { return 1; } void jit_sqrt_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_sqrt_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1707,7 +1708,7 @@ void jit_sqrt_emitter::emit_isa(const std::vector &in_vec_idxs, const st } /// Negate /// -jit_negative_emitter::jit_negative_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_negative_emitter::jit_negative_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} size_t jit_negative_emitter::get_inputs_num() const { return 1; } @@ -1715,20 +1716,20 @@ size_t jit_negative_emitter::get_inputs_num() const { return 1; } void jit_negative_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_negative_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); h->uni_vpxor(vmm_dst, vmm_dst, vmm_dst); @@ -1736,12 +1737,12 @@ void jit_negative_emitter::emit_isa(const std::vector &in_vec_idxs, cons } /// ERF /// -jit_erf_emitter::jit_erf_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_erf_emitter::jit_erf_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_erf_emitter::jit_erf_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_erf_emitter::jit_erf_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } @@ -1754,20 +1755,20 @@ void jit_erf_emitter::emit_impl( const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1780,7 +1781,7 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std auto compute_cmp_mask = [&](const Vmm &vmm_src, const Xbyak::Operand &compare_operand, int cmp_predicate) { - if (host_isa_ == cpu::x64::avx512_core) { + if (host_isa_ == x64::avx512_core) { h->vcmpps(k_mask, vmm_src, compare_operand, cmp_predicate); } else { h->uni_vcmpps(vmm_mask, vmm_src, compare_operand, cmp_predicate); @@ -1788,7 +1789,7 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std }; auto blend_with_mask = [&](const Vmm &vmm_dst, const Xbyak::Operand &src) { - if (host_isa_ == cpu::x64::avx512_core) { + if (host_isa_ == x64::avx512_core) { h->vblendmps(vmm_dst | k_mask, vmm_dst, src); } else { h->uni_vblendvps(vmm_dst, vmm_dst, src, vmm_mask); @@ -1903,7 +1904,7 @@ void jit_erf_emitter::register_table_entries() { push_arg_entry_of("erf_pol4", 0xbfba00e3, true); // p4 = -1.453152027f push_arg_entry_of("erf_pol5", 0x3f87dc22, true); // p5 = 1.061405429f - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); push_arg_entry_of("half", 0x3f000000, true); push_arg_entry_of("exp_log2ef", 0x3fb8aa3b, true); @@ -1920,11 +1921,11 @@ size_t jit_erf_emitter::aux_vecs_count() const { } /// SOFT SIGN /// -jit_soft_sign_emitter::jit_soft_sign_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +jit_soft_sign_emitter::jit_soft_sign_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_soft_sign_emitter::jit_soft_sign_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +jit_soft_sign_emitter::jit_soft_sign_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, Precision exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1934,20 +1935,20 @@ size_t jit_soft_sign_emitter::get_inputs_num() const { return 1; } void jit_soft_sign_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { - if (host_isa_ == cpu::x64::sse41) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx2) { - emit_isa(in_vec_idxs, out_vec_idxs); - } else if (host_isa_ == cpu::x64::avx512_core) { - emit_isa(in_vec_idxs, out_vec_idxs); + if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); } else { assert(!"unsupported isa"); } } -template +template void jit_soft_sign_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1958,8 +1959,189 @@ void jit_soft_sign_emitter::emit_isa(const std::vector &in_vec_idxs, con } void jit_soft_sign_emitter::register_table_entries() { - push_arg_entry_of("one", 0x3f800000, true); + push_arg_entry_of("one", CONST_1_F, true); push_arg_entry_of("positive_mask", 0x7fffffff, true); } + +/// IS_FINITE /// +template <> +void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + auto vmm_src = Zmm(in_vec_idxs[0]); + auto vmm_dst = Zmm(out_vec_idxs[0]); + auto &ones_mask = h->k1; + auto reg32_one = Reg32(aux_gpr_idxs[0]); + + h->mov(reg32_one, CONST_1_F); + h->vfpclassps(ones_mask, vmm_src, 0B10111001); + h->knotd(ones_mask, ones_mask); + h->vpbroadcastd(vmm_dst | ones_mask | h->T_z, reg32_one); +} + +template +void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional::type; + auto vmm_src = Vmm(in_vec_idxs[0]); + auto vmm_dst = Vmm(out_vec_idxs[0]); + + h->uni_vandps(vmm_src, vmm_src, table_val("inf")); + h->uni_vcmpps(vmm_src, vmm_src, table_val("inf"), 0B00000100); // NEq + + if (isa == x64::avx2) { + h->uni_vandps(vmm_dst, vmm_src, table_val("one")); + } else { + if (vmm_src.getIdx() != vmm_dst.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src); + } + h->uni_vandps(vmm_dst, vmm_dst, table_val("one")); + } +} + +void jit_is_finite_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const { + if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + IE_THROW() << "jit_is_finite_emitter doesn't support ISA " << host_isa_; + } +} + +void jit_is_finite_emitter::register_table_entries() { + if (host_isa_ != x64::avx512_core) { + push_arg_entry_of("one", CONST_1_F, true); + push_arg_entry_of("inf", INF_MASK, true); + } +} + +/// IS_INF /// +template <> +void jit_is_inf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + Zmm vmm_src = Zmm(in_vec_idxs[0]); + Zmm vmm_dst = Zmm(out_vec_idxs[0]); + + if (detect_negative || detect_positive) { + auto &ones_mask = h->k1; + auto reg32_one = Reg32(aux_gpr_idxs[0]); + uint8_t imm = detect_negative ? 0B00010000 : 0B00000000; + if (detect_positive) { + imm |= 0B00001000; + } + + h->mov(reg32_one, CONST_1_F); + h->vfpclassps(ones_mask, vmm_src, imm); + h->vpbroadcastd(vmm_dst | ones_mask | h->T_z, reg32_one); + } else { + h->uni_vxorps(vmm_dst, vmm_dst, vmm_dst); + } +} + +template +void jit_is_inf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional::type; + + if (detect_negative || detect_positive) { + auto vmm_src = Vmm(in_vec_idxs[0]); + auto vmm_dst = Vmm(out_vec_idxs[0]); + + if (detect_positive) { + if (detect_negative) { + h->uni_vpslld(vmm_src, vmm_src, 1); + h->uni_vpsrld(vmm_src, vmm_src, 1); + } + h->uni_vpcmpeqd(vmm_src, vmm_src, table_val("inf")); + } else if (detect_negative) { + h->uni_vpcmpeqd(vmm_src, vmm_src, table_val("inf_neg")); + } + + if (isa == x64::avx2) { + h->uni_vandps(vmm_dst, vmm_src, table_val("one")); + } else { + if (vmm_src.getIdx() != vmm_dst.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src); + } + h->uni_vandps(vmm_dst, vmm_dst, table_val("one")); + } + } else { + auto vmm_dst = Vmm(out_vec_idxs[0]); + h->uni_vxorps(vmm_dst, vmm_dst, vmm_dst); + } +} + +void jit_is_inf_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const { + if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + IE_THROW() << "jit_is_inf_emitter doesn't support ISA " << host_isa_; + } +} + +void jit_is_inf_emitter::register_table_entries() { + if (host_isa_ != x64::avx512_core) { + push_arg_entry_of("one", CONST_1_F, true); + push_arg_entry_of("inf", INF_MASK, true); + push_arg_entry_of("inf_neg", INF_NEG_MASK, true); + } +} + +/// IS_NAN /// +template <> +void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + auto vmm_src = Zmm(in_vec_idxs[0]); + auto vmm_dst = Zmm(out_vec_idxs[0]); + auto &ones_mask = h->k1; + auto reg32_one = Reg32(aux_gpr_idxs[0]); + + h->mov(reg32_one, CONST_1_F); + h->vfpclassps(ones_mask, vmm_src, 0B10000001); + h->vpbroadcastd(vmm_dst | ones_mask | h->T_z, reg32_one); +} + +template +void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional::type; + auto vmm_src = Vmm(in_vec_idxs[0]); + auto vmm_dst = Vmm(out_vec_idxs[0]); + + h->uni_vcmpps(vmm_src, vmm_src, vmm_src, 0B00000100); + if (isa == x64::avx2) { + h->uni_vandps(vmm_dst, vmm_src, table_val("one")); + } else { + if (vmm_dst.getIdx() != vmm_src.getIdx()) { + h->uni_vmovups(vmm_dst, vmm_src); + } + h->uni_vandps(vmm_dst, vmm_dst, table_val("one")); + } +} + +void jit_is_nan_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const { + if (host_isa_ == x64::avx512_core) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + IE_THROW() << "jit_is_nan_emitter doesn't support ISA " << host_isa_; + } +} + +void jit_is_nan_emitter::register_table_entries() { + if (host_isa_ != x64::avx512_core) { + push_arg_entry_of("one", CONST_1_F, true); + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp index 0a374a418f8..f56373435ec 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp @@ -4,7 +4,6 @@ #pragma once -#include #include "jit_emitter.hpp" namespace ov { @@ -14,7 +13,7 @@ class jit_add_emitter : public jit_emitter { public: jit_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -33,7 +32,7 @@ class jit_mul_add_emitter : public jit_emitter { public: jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -55,7 +54,7 @@ class jit_subtract_emitter : public jit_emitter { public: jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -75,7 +74,7 @@ class jit_multiply_emitter : public jit_emitter { public: jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -95,7 +94,7 @@ class jit_divide_emitter : public jit_emitter { public: jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -115,7 +114,7 @@ class jit_floor_emitter : public jit_emitter { public: jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -133,7 +132,7 @@ class jit_ceiling_emitter : public jit_emitter { public: jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -151,7 +150,7 @@ class jit_floor_mod_emitter : public jit_emitter { public: jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -171,7 +170,7 @@ class jit_mod_emitter : public jit_emitter { public: jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -191,7 +190,7 @@ class jit_maximum_emitter : public jit_emitter { public: jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -211,7 +210,7 @@ class jit_minimum_emitter : public jit_emitter { public: jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -232,7 +231,7 @@ public: jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, + const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -252,7 +251,7 @@ class jit_power_dynamic_emitter : public jit_emitter { public: jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -271,7 +270,7 @@ class jit_equal_emitter : public jit_emitter { public: jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -293,7 +292,7 @@ class jit_not_equal_emitter : public jit_emitter { public: jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -315,7 +314,7 @@ class jit_greater_emitter : public jit_emitter { public: jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -337,7 +336,7 @@ class jit_greater_equal_emitter : public jit_emitter { public: jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -359,7 +358,7 @@ class jit_less_emitter : public jit_emitter { public: jit_less_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_less_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_less_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -382,7 +381,7 @@ public: jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -404,7 +403,7 @@ class jit_logical_and_emitter : public jit_emitter { public: jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -426,7 +425,7 @@ class jit_logical_or_emitter : public jit_emitter { public: jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -448,7 +447,7 @@ class jit_logical_xor_emitter : public jit_emitter { public: jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -469,7 +468,7 @@ class jit_logical_not_emitter : public jit_emitter { public: jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -491,7 +490,7 @@ public: jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, float inpPower, float inpScale, float inpShift, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -516,7 +515,7 @@ class jit_prelu_emitter : public jit_emitter { public: jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -536,7 +535,7 @@ class jit_sqrt_emitter : public jit_emitter { public: jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -552,7 +551,7 @@ private: class jit_negative_emitter : public jit_emitter { public: - jit_negative_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_negative_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -571,7 +570,7 @@ public: jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -595,7 +594,7 @@ class jit_soft_sign_emitter : public jit_emitter { public: jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); size_t get_inputs_num() const override; @@ -611,5 +610,87 @@ private: void register_table_entries() override; }; +class jit_is_finite_emitter : public jit_emitter { +public: + jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + InferenceEngine::Precision execPrc = InferenceEngine::Precision::FP32) : jit_emitter(host, hostIsa, execPrc) { + prepare_table(); + } + jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, const std::shared_ptr& node, + InferenceEngine::Precision execPrc = InferenceEngine::Precision::FP32) : jit_emitter(host, hostIsa, node, execPrc) { + prepare_table(); + } + + size_t get_inputs_num() const override { return 1; }; + +protected: + size_t aux_gprs_count() const override { return (entry_map_.empty() ? 0 : 1) + 1; } + void register_table_entries() override; + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emitContext) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + +class jit_is_inf_emitter : public jit_emitter { +public: + jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + InferenceEngine::Precision execPrc = InferenceEngine::Precision::FP32, bool detect_negative = true, bool detect_positive = true) + : jit_emitter(host, hostIsa, execPrc), detect_negative(detect_negative), detect_positive(detect_positive) { + prepare_table(); + } + jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, const std::shared_ptr& node, + InferenceEngine::Precision execPrc = InferenceEngine::Precision::FP32): jit_emitter(host, hostIsa, node, execPrc) { + prepare_table(); + } + + size_t get_inputs_num() const override { return 1; }; + +protected: + size_t aux_gprs_count() const override { return (entry_map_.empty() ? 0 : 1) + 1; } + void register_table_entries() override; + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emitContext) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + bool detect_negative; + bool detect_positive; +}; + +class jit_is_nan_emitter : public jit_emitter { +public: + jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + InferenceEngine::Precision execPrc = InferenceEngine::Precision::FP32) : jit_emitter(host, hostIsa, execPrc) { + prepare_table(); + } + jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, const std::shared_ptr& node, + InferenceEngine::Precision execPrc = InferenceEngine::Precision::FP32) : jit_emitter(host, hostIsa, node, execPrc) { + prepare_table(); + } + + size_t get_inputs_num() const override { return 1; } + +protected: + size_t aux_gprs_count() const override { return (entry_map_.empty() ? 0 : 1) + 1; } + void register_table_entries() override; + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emitContext) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index 6c9762605f6..49a5b34face 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -94,6 +94,13 @@ struct EltwiseEmitter { } }; +template<> +struct EltwiseEmitter { + void operator()(EltwiseEmitterContext & ctx) { + ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.exec_prc, ctx.opData.alpha, ctx.opData.beta); + } +}; + /** * Implements Eltwise shape inference algorithm. The algorithm is based on broadcasting all the input shapes * according to the NUMPY broadcast rule. This implementation is more lightweight than the ngraph one. @@ -518,7 +525,10 @@ private: OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter)); + OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter)); if (precisions.empty()) IE_THROW() << "Unsupported operation type for Eltwise emitter"; @@ -576,7 +586,10 @@ private: OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter)); + OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter)); if (!ctx.emitter) IE_THROW() << "Unsupported operation type for Eltwise emitter"; @@ -1750,8 +1763,6 @@ void Eltwise::initSupportedPrimitiveDescriptors() { // if dim rank is greater than the maximum possible, we should use the reference execution canUseOptimizedImpl = mayiuse(x64::sse41) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK; - // 98206 to add JIT implementation. - canUseOptimizedImpl &= !one_of(getAlgorithm(), Algorithm::EltwiseIsFinite, Algorithm::EltwiseIsInf, Algorithm::EltwiseIsNaN); if (!canUseOptimizedImpl && !fusedWith.empty()) { IE_THROW(Unexpected) << "Eltwise node with name '" << getName() << "' uses reference impl, but unexpectedly fused with other ops"; diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/comparison.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/comparison.cpp index a66e4fe6cc2..448c2b09411 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/comparison.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/comparison.cpp @@ -55,14 +55,31 @@ const auto ComparisonTestParams = ::testing::Combine( INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs, ComparisonLayerTest, ComparisonTestParams, ComparisonLayerTest::getTestCaseName); + +std::vector inputShapesIsOps = { + {{1}, {1}}, + {{1, 2}, {1}}, + {{3, 1}, {1}}, + {{2, 2}, {1}}, + {{1, 5, 1}, {1}}, + {{2, 1, 1, 3, 1}, {1}}, + {{7, 1, 1, 1, 1}, {1}}, + {{2, 2, 2}, {1}}, + {{3, 1, 3, 3}, {1}}, + {{17}, {1}}, + {{2, 18}, {1}}, + {{1, 3, 20}, {1}}, + {{2, 200}, {1}}, + {{2, 17, 3, 4}, {1}} +}; + std::vector comparisonOpTypesIs = { ngraph::helpers::ComparisonTypes::IS_FINITE, - ngraph::helpers::ComparisonTypes::IS_INF, ngraph::helpers::ComparisonTypes::IS_NAN }; const auto ComparisonTestParamsIs = ::testing::Combine( - ::testing::ValuesIn(CommonTestUtils::combineParams(inputShapes)), + ::testing::ValuesIn(inputShapesIsOps), ::testing::Values(InferenceEngine::Precision::FP32), ::testing::ValuesIn(comparisonOpTypesIs), ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), @@ -73,4 +90,4 @@ const auto ComparisonTestParamsIs = ::testing::Combine( INSTANTIATE_TEST_SUITE_P(smoke_IsOp, ComparisonLayerTest, ComparisonTestParamsIs, ComparisonLayerTest::getTestCaseName); -} // namespace \ No newline at end of file +} // namespace diff --git a/src/tests/functional/shared_test_classes/src/single_layer/comparison.cpp b/src/tests/functional/shared_test_classes/src/single_layer/comparison.cpp index 6500661e7d6..9928a0bb687 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/comparison.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/comparison.cpp @@ -78,24 +78,34 @@ void ComparisonLayerTest::SetUp() { } InferenceEngine::Blob::Ptr ComparisonLayerTest::GenerateInput(const InferenceEngine::InputInfo &inputInfo) const { - auto blob = LayerTestsUtils::LayerTestsCommon::GenerateInput(inputInfo); + InferenceEngine::Blob::Ptr blob; if (comparisonOpType == ComparisonTypes::IS_FINITE || comparisonOpType == ComparisonTypes::IS_NAN) { - auto *dataPtr = blob->buffer().as(); - auto range = blob->size(); + blob = make_blob_with_precision(inputInfo.getTensorDesc()); + blob->allocate(); + auto dataPtr = blob->buffer().as(); + auto dataPtrInt = blob->buffer().as(); + const auto range = blob->size(); + const float start = -static_cast(range) / 2.f; testing::internal::Random random(1); - if (comparisonOpType == ComparisonTypes::IS_FINITE) { - for (size_t i = 0; i < range / 2; i++) { - dataPtr[random.Generate(range)] = - i % 3 == 0 ? std::numeric_limits::infinity() : i % 3 == 1 ? -std::numeric_limits::infinity() : - std::numeric_limits::quiet_NaN(); - } - } else { - for (size_t i = 0; i < range / 2; i++) { - dataPtr[random.Generate(range)] = std::numeric_limits::quiet_NaN(); + for (size_t i = 0; i < range; i++) { + if (i % 7 == 0) { + dataPtr[i] = std::numeric_limits::infinity(); + } else if (i % 7 == 1) { + dataPtr[i] = -std::numeric_limits::infinity(); + } else if (i % 7 == 2) { + dataPtrInt[i] = 0x7F800000 + random.Generate(range); + } else if (i % 7 == 3) { + dataPtr[i] = std::numeric_limits::quiet_NaN(); + } else if (i % 7 == 5) { + dataPtr[i] = -std::numeric_limits::quiet_NaN(); + } else { + dataPtr[i] = start + static_cast(random.Generate(range)); } } + } else { + blob = LayerTestsUtils::LayerTestsCommon::GenerateInput(inputInfo); } return blob; diff --git a/src/tests/functional/shared_test_classes/src/single_layer/is_inf.cpp b/src/tests/functional/shared_test_classes/src/single_layer/is_inf.cpp index e8d436b5494..6c2c1107dfd 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/is_inf.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/is_inf.cpp @@ -76,15 +76,25 @@ void IsInfLayerTest::generate_inputs(const std::vector& targetInputSt const auto& funcInputs = function->inputs(); const auto& input = funcInputs[0]; - int32_t range = std::accumulate(targetInputStaticShapes[0].begin(), targetInputStaticShapes[0].end(), 1u, std::multiplies()); - auto tensor = utils::create_and_fill_tensor( - input.get_element_type(), targetInputStaticShapes[0], range, -range / 2, 1); + int32_t range = std::accumulate(targetInputStaticShapes[0].begin(), targetInputStaticShapes[0].end(), 1, std::multiplies()); + float startFrom = -static_cast(range) / 2.f; + auto tensor = ov::Tensor{ input.get_element_type(), targetInputStaticShapes[0]}; auto pointer = tensor.data::value_type>(); testing::internal::Random random(1); - for (size_t i = 0; i < range / 2; i++) { - pointer[random.Generate(range)] = i % 2 == 0 ? std::numeric_limits::infinity() : -std::numeric_limits::infinity(); + for (size_t i = 0; i < range; i++) { + if (i % 7 == 0) { + pointer[i] = std::numeric_limits::infinity(); + } else if (i % 7 == 1) { + pointer[i] = std::numeric_limits::quiet_NaN(); + } else if (i % 7 == 3) { + pointer[i] = -std::numeric_limits::infinity(); + } else if (i % 7 == 5) { + pointer[i] = -std::numeric_limits::quiet_NaN(); + } else { + pointer[i] = startFrom + static_cast(random.Generate(range)); + } } inputs.insert({input.get_node_shared_ptr(), tensor});