[CPU] Fixed integer compute for arithmetic operations (#13556)

Authored by Gorokhov Dmitriy on 2022-10-24 15:54:52 +04:00, committed by GitHub
parent be1b72d1e9
commit 9f40eb7196
8 changed files with 246 additions and 107 deletions
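The changes below give the arithmetic JIT emitters a dedicated I32 path (uni_vpaddd, uni_vpsubd, uni_vpmulld alongside the existing uni_vaddps, uni_vsubps, uni_vmulps), so integer tensors are no longer computed through FP32. A standalone sketch of the rounding error this avoids (illustrative values chosen for this example, not taken from the change):

#include <cstdint>
#include <cstdio>

int main() {
    // 2^24 + 1 is the first integer a 32-bit float cannot represent exactly.
    int32_t a = 16777217;
    int32_t b = 1;
    int32_t exact = a + b;  // 16777218: what an exact I32 path computes
    // float(a) rounds to 16777216, and adding 1.0f rounds back to 16777216,
    // so an FP32 fallback would return 16777216 instead of 16777218.
    int32_t via_fp32 = static_cast<int32_t>(static_cast<float>(a) + static_cast<float>(b));
    std::printf("exact=%d via_fp32=%d\n", exact, via_fp32);
    return 0;
}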

View File

@@ -120,7 +120,7 @@ auto is_supported_op(const std::shared_ptr<const Node> &n) -> bool {
auto has_supported_in_out(const std::shared_ptr<const Node> &n) -> bool {
auto supported = [](descriptor::Tensor& t) -> bool {
static const std::set<ngraph::element::Type> supported_data_types =
{ ngraph::element::f32, ngraph::element::i32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 };
{ ngraph::element::f32, ngraph::element::bf16, ngraph::element::i8, ngraph::element::u8 };
return t.get_partial_shape().is_static() && supported_data_types.count(t.get_element_type()) != 0;
};
const auto & inputs = n->inputs();

View File

@@ -46,14 +46,26 @@ void jit_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
auto uni_vadd = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) {
switch (exec_prc_) {
case Precision::FP32: h->uni_vaddps(vmm_dst, vmm_src0, vmm_src1); break;
case Precision::I32: h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1); break;
default: assert(!"unsupported precision");
}
};
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vaddps(vmm_dst, vmm_dst, vmm_src1);
uni_vadd(vmm_dst, vmm_dst, vmm_src1);
} else {
h->uni_vaddps(vmm_dst, vmm_src0, vmm_src1);
uni_vadd(vmm_dst, vmm_src0, vmm_src1);
}
}
std::set<InferenceEngine::Precision> jit_add_emitter::get_supported_precisions() {
return {Precision::FP32, Precision::I32};
}
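// Note (editorial, not part of the change): uni_vaddps lowers to addps/vaddps (packed
// single-precision add), while uni_vpaddd lowers to paddd/vpaddd (packed 32-bit integer
// add with two's-complement wrap-around), so selecting on exec_prc_ keeps I32 addition
// exact instead of routing it through the FP32 rounding path.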
/// MUL_ADD ///
jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
@@ -85,30 +97,57 @@ void jit_mul_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::x64::sse41) {
auto uni_vfmadd231_xmm = [this](Xmm vmm_dst, Xmm vmm_src0, Xmm vmm_src1, Xmm vmm_src2) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1);
h->uni_vaddps(vmm_dst, vmm_dst, vmm_src2);
switch (exec_prc_) {
case Precision::FP32: {
h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1);
h->uni_vaddps(vmm_dst, vmm_dst, vmm_src2);
} break;
case Precision::I32: {
h->uni_vpmulld(vmm_dst, vmm_dst, vmm_src1);
h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2);
} break;
default: assert(!"unsupported precision");
}
};
auto uni_vfmadd231_vmm = [this, vmm_aux0](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1, Vmm vmm_src2) {
switch (exec_prc_) {
case Precision::FP32: {
Vmm vmm_mul0;
if (vmm_dst.getIdx() == vmm_src0.getIdx()) {
h->uni_vmovups(vmm_aux0, vmm_src0);
vmm_mul0 = vmm_aux0;
} else {
vmm_mul0 = vmm_src0;
}
Vmm vmm_mul1;
if (vmm_dst.getIdx() == vmm_src1.getIdx()) {
h->uni_vmovups(vmm_aux0, vmm_src1);
vmm_mul1 = vmm_aux0;
} else {
vmm_mul1 = vmm_src1;
}
if (vmm_dst.getIdx() != vmm_src2.getIdx())
h->uni_vmovups(vmm_dst, vmm_src2);
h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1);
} break;
case Precision::I32: {
h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1);
h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2);
} break;
default: assert(!"unsupported precision");
}
};
if (isa == cpu::x64::sse41) {
uni_vfmadd231_xmm(vmm_dst, vmm_src0, vmm_src1, vmm_src2);
} else {
Vmm vmm_mul0;
if (vmm_dst.getIdx() == vmm_src0.getIdx()) {
h->uni_vmovups(vmm_aux0, vmm_src0);
vmm_mul0 = vmm_aux0;
} else {
vmm_mul0 = vmm_src0;
}
Vmm vmm_mul1;
if (vmm_dst.getIdx() == vmm_src1.getIdx()) {
h->uni_vmovups(vmm_aux0, vmm_src1);
vmm_mul1 = vmm_aux0;
} else {
vmm_mul1 = vmm_src1;
}
if (vmm_dst.getIdx() != vmm_src2.getIdx())
h->uni_vmovups(vmm_dst, vmm_src2);
h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1);
uni_vfmadd231_vmm(vmm_dst, vmm_src0, vmm_src1, vmm_src2);
}
}
@@ -116,6 +155,10 @@ size_t jit_mul_add_emitter::aux_vecs_count() const {
return 1;
}
std::set<InferenceEngine::Precision> jit_mul_add_emitter::get_supported_precisions() {
return {Precision::FP32, Precision::I32};
}
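// Note (editorial, not part of the change): SSE4.1/AVX2 provide no fused multiply-add for
// packed 32-bit integers, so the I32 path above emits uni_vpmulld followed by uni_vpaddd.
// Unlike FP32, splitting the multiply and the add costs no accuracy here, since integer
// lanes wrap modulo 2^32 either way.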
/// SUB ///
jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
@@ -145,14 +188,25 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, cons
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
auto uni_vsub = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) {
switch (exec_prc_) {
case Precision::FP32: h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); break;
case Precision::I32: h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); break;
default: assert(!"unsupported precision");
}
};
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vsubps(vmm_dst, vmm_dst, vmm_src1);
uni_vsub(vmm_dst, vmm_dst, vmm_src1);
} else {
h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1);
uni_vsub(vmm_dst, vmm_src0, vmm_src1);
}
}
std::set<InferenceEngine::Precision> jit_subtract_emitter::get_supported_precisions() {
return {Precision::FP32, Precision::I32};
}
/// MULTIPLY ///
jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
@@ -183,14 +237,25 @@ void jit_multiply_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, cons
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
auto uni_vmul = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) {
switch (exec_prc_) {
case Precision::FP32: h->uni_vmulps(vmm_dst, vmm_src0, vmm_src1); break;
case Precision::I32: h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); break;
default: assert(!"unsupported precision");
}
};
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1);
uni_vmul(vmm_dst, vmm_dst, vmm_src1);
} else {
h->uni_vmulps(vmm_dst, vmm_src0, vmm_src1);
uni_vmul(vmm_dst, vmm_src0, vmm_src1);
}
}
std::set<InferenceEngine::Precision> jit_multiply_emitter::get_supported_precisions() {
return {Precision::FP32, Precision::I32};
}
/// DIVIDE ///
jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)
@@ -554,17 +619,32 @@ void jit_squared_difference_emitter::emit_isa(const std::vector<size_t> &in_vec_
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
auto uni_vsqdiff = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) {
switch (exec_prc_) {
case Precision::FP32: {
h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1);
h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst);
} break;
case Precision::I32: {
h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1);
h->uni_vpmulld(vmm_dst, vmm_dst, vmm_dst);
} break;
default: assert(!"unsupported precision");
}
};
if (isa == cpu::x64::sse41) {
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vsubps(vmm_dst, vmm_dst, vmm_src1);
h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst);
uni_vsqdiff(vmm_dst, vmm_dst, vmm_src1);
} else {
h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1);
h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst);
uni_vsqdiff(vmm_dst, vmm_src0, vmm_src1);
}
}
std::set<InferenceEngine::Precision> jit_squared_difference_emitter::get_supported_precisions() {
return {Precision::FP32, Precision::I32};
}
/// POWER_DYNAMIC ///
jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& node, Precision exec_prc)

View File

@@ -18,6 +18,7 @@ public:
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() const override;
static std::set<InferenceEngine::Precision> get_supported_precisions();
private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
@@ -36,6 +37,7 @@ public:
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() const override;
static std::set<InferenceEngine::Precision> get_supported_precisions();
private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
@@ -57,6 +59,7 @@ public:
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() const override;
static std::set<InferenceEngine::Precision> get_supported_precisions();
private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
@@ -76,6 +79,7 @@ public:
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() const override;
static std::set<InferenceEngine::Precision> get_supported_precisions();
private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
@@ -232,6 +236,7 @@ public:
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() const override;
static std::set<InferenceEngine::Precision> get_supported_precisions();
private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,

View File

@@ -2252,16 +2252,19 @@ void Eltwise::appendBinPostOps(dnnl::post_ops& ops, const VectorDims& postOpDims
}
bool Eltwise::canFuse(const NodePtr& node) const {
auto isSuitableNode = [this](const Eltwise* node) {
// [WA] Since execution precision change from I32 to FP32 for Divide operation may lead to incorrect results
// we disable its fusing, otherwise there is no guarantee it will be executed in I32
// [TODO] We need to rewrite support for different precisions at all to avoid implicit conversions to FP32
// (all should be handled via explicit convert operations)
if (node->getAlgorithm() == Algorithm::EltwiseDivide) {
for (const auto &originalInputPrecision : getOriginalInputPrecisions()) {
if (originalInputPrecision == Precision::I32) {
return false;
}
auto isIntegerComputeSupported = [this](const Node* node) {
if (!one_of(node->getAlgorithm(), Algorithm::EltwiseAdd,
Algorithm::EltwiseMultiply,
Algorithm::EltwiseMulAdd,
Algorithm::EltwiseSubtract,
Algorithm::EltwiseDivide,
Algorithm::EltwiseSquaredDifference)) {
return false;
}
for (const auto &originalInputPrecision : node->getOriginalInputPrecisions()) {
if (originalInputPrecision != Precision::I32) {
return false;
}
}
@@ -2271,9 +2274,10 @@ bool Eltwise::canFuse(const NodePtr& node) const {
if (!mayiuse(x64::sse41) || getInputShapeAtPort(0).getRank() > MAX_ELTWISE_DIM_RANK)
return false;
if (!isSuitableNode(this)) {
bool isIntegerNode = isIntegerComputeSupported(this);
if (isIntegerNode && node->getType() != Type::Eltwise)
return false;
}
// FQ inputs with quantization parameters will be hidden inside the post_op object, so they will not increase the number of inputs
size_t addedInputEdgesNum = node->getType() != Type::FakeQuantize ? (node->getParentEdges().size() - 1) : 0;
@@ -2281,6 +2285,16 @@
return false;
if (node->getType() == Type::Eltwise) {
// [WA] Since changing the execution precision from I32 to FP32 for arithmetic operations may lead to incorrect results,
// we disable fusing in cases which could cause invalid precision conversions inside the kernel.
// [TODO] Support for different precisions needs to be reworked entirely to avoid implicit conversions to FP32
// (everything should be handled via explicit Convert operations).
bool isIntegerFusingNode = isIntegerComputeSupported(node.get());
if (isIntegerNode && !isIntegerFusingNode ||
!isIntegerNode && isIntegerFusingNode) {
return false;
}
if (node->getParentEdgesAtPort(0)[0]->getParent().get() != this) {
// Eltwise jitter doesn't respect the commutative property, so fusing is disabled when it is not applied to the 0-th port.
if (one_of(node->getAlgorithm(), Algorithm::EltwiseSubtract,
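Condensed, the new fusing rule above amounts to a precision-agreement check; a minimal sketch (names are illustrative, not from the change):

// Integer-compute Eltwise nodes only fuse with other integer-compute Eltwise nodes,
// so the generated kernel never mixes the exact I32 path with FP32 post-ops implicitly.
bool precisionsAgreeForFusing(bool thisIsInteger, bool fusingIsEltwise, bool fusingIsInteger) {
    if (thisIsInteger && !fusingIsEltwise)
        return false;                                   // non-Eltwise post-ops would force FP32
    if (fusingIsEltwise && (thisIsInteger != fusingIsInteger))
        return false;                                   // both sides must pick the same compute path
    return true;
}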

View File

@@ -13,23 +13,19 @@ namespace snippets {
namespace {
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_Convert = {
{ { ov::element::f32 }, { ov::element::i32 } },
{ { ov::element::f32 }, { ov::element::bf16 } },
{ { ov::element::f32 }, { ov::element::u8 } },
{ { ov::element::f32 }, { ov::element::i8 } },
{ { ov::element::bf16 }, { ov::element::f32 } },
{ { ov::element::bf16 }, { ov::element::i32 } },
{ { ov::element::bf16 }, { ov::element::i8 } },
{ { ov::element::bf16 }, { ov::element::u8 } },
{ { ov::element::i8 }, { ov::element::f32 } },
{ { ov::element::i8 }, { ov::element::i32 } },
{ { ov::element::i8 }, { ov::element::bf16 } },
{ { ov::element::i8 }, { ov::element::u8 } },
{ { ov::element::u8 }, { ov::element::f32 } },
{ { ov::element::u8 }, { ov::element::i32 } },
{ { ov::element::u8 }, { ov::element::bf16 } },
{ { ov::element::u8 }, { ov::element::i8 } },
};
@@ -50,17 +46,14 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Convert, Convert,
Convert::getTestCaseName);
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertInput = {
{ { ov::element::f32 }, { ov::element::i32 } },
{ { ov::element::f32 }, { ov::element::bf16 } },
{ { ov::element::bf16 }, { ov::element::f32 } },
{ { ov::element::i8 }, { ov::element::f32 } },
{ { ov::element::i8 }, { ov::element::i32 } },
{ { ov::element::i8 }, { ov::element::bf16 } },
{ { ov::element::u8 }, { ov::element::f32 } },
{ { ov::element::u8 }, { ov::element::i32 } },
{ { ov::element::u8 }, { ov::element::bf16 } },
};
@@ -98,8 +91,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertStub, ConvertStub,
Convert::getTestCaseName);
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertPartialInputsAndResults = {
{ { ov::element::i8, ov::element::i32, ov::element::f32 }, { ov::element::f32, ov::element::i8 } },
{ { ov::element::bf16, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::bf16 } },
{ { ov::element::i8, ov::element::i8, ov::element::f32 }, { ov::element::f32, ov::element::i8 } },
};
const std::vector<std::vector<ov::Shape>> inputShapes_ConvertPartialInputsAndResults = {
@@ -118,10 +110,9 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertPartialInputsAndResults, ConvertP
Convert::getTestCaseName);
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertMany = {
{ { ov::element::i32, ov::element::u8}, {} },
{ { ov::element::i32, ov::element::u8, ov::element::i32 }, {} },
{ { ov::element::i32, ov::element::f32, ov::element::i32, ov::element::i8 }, {} },
{ { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 }, {} },
{ { ov::element::f32, ov::element::u8}, {} },
{ { ov::element::f32, ov::element::u8, ov::element::i8 }, {} },
{ { ov::element::f32, ov::element::f32, ov::element::i8, ov::element::i8 }, {} },
};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputs, ConvertManyOnInputs,
@@ -137,21 +128,21 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnOutputs, ConvertManyOnOutpu
::testing::Combine(
::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
::testing::ValuesIn(types_ConvertMany),
::testing::Values(5), // sinh + subgraph + reorders for sinh
::testing::Values(2), // sinh + subgraph
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);
const std::vector<std::pair<std::vector<ov::element::Type>, std::vector<ov::element::Type>>> types_ConvertManyIO = {
{ { ov::element::i32, ov::element::u8}, {ov::element::i32} },
{ { ov::element::i32, ov::element::u8, ov::element::i32 }, { ov::element::i32, ov::element::i8, ov::element::i32, ov::element::f32 } },
{ { ov::element::f32, ov::element::u8}, {ov::element::i8} },
{ { ov::element::f32, ov::element::u8, ov::element::i8 }, { ov::element::u8, ov::element::i8, ov::element::f32, ov::element::f32 } },
};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputOutput, ConvertManyOnInputOutput,
::testing::Combine(
::testing::Values(std::vector<ov::Shape>{ov::Shape{5, 5, 5, 5}}),
::testing::ValuesIn(types_ConvertManyIO),
::testing::Values(5), // sinh + subgraph + reorders for sinh
::testing::Values(2), // sinh + subgraph
::testing::Values(1),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Convert::getTestCaseName);

View File

@@ -37,38 +37,47 @@ public:
}
protected:
ov::Tensor generate_eltwise_input(const ov::element::Type& type, const ngraph::Shape& shape) {
struct gen_params {
uint32_t range;
int32_t start_from;
int32_t resolution;
gen_params(uint32_t range = 10, int32_t start_from = 0, int32_t resolution = 1)
: range(range), start_from(start_from), resolution(resolution) {}
};
gen_params params = gen_params();
if (type.is_real()) {
switch (eltwiseType) {
case ngraph::helpers::EltwiseTypes::POWER:
case ngraph::helpers::EltwiseTypes::MOD:
case ngraph::helpers::EltwiseTypes::FLOOR_MOD:
params = gen_params(2, 2, 8);
break;
case ngraph::helpers::EltwiseTypes::DIVIDE:
params = gen_params(2, 2, 8);
break;
case ngraph::helpers::EltwiseTypes::ERF:
params = gen_params(6, -3);
break;
default:
params = gen_params(80, 0, 8);
break;
}
} else {
params = gen_params(INT32_MAX, INT32_MIN);
}
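// Note (editorial): integer element types get much wider generator settings than the FP32
// cases (range INT32_MAX starting from INT32_MIN), so the exact I32 kernels are exercised
// at magnitudes beyond float's 2^24 exact-integer range.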
return ov::test::utils::create_and_fill_tensor(type, shape, params.range, params.start_from, params.resolution);
}
void generate_inputs(const std::vector<ngraph::Shape>& targetInputStaticShapes) override {
inputs.clear();
const auto& funcInputs = function->inputs();
for (int i = 0; i < funcInputs.size(); ++i) {
const auto& funcInput = funcInputs[i];
ov::Tensor tensor;
bool isReal = funcInput.get_element_type().is_real();
switch (eltwiseType) {
case ngraph::helpers::EltwiseTypes::POWER:
case ngraph::helpers::EltwiseTypes::MOD:
case ngraph::helpers::EltwiseTypes::FLOOR_MOD:
tensor = isReal ?
ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 2, 2, 8) :
ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 4, 2);
break;
case ngraph::helpers::EltwiseTypes::DIVIDE:
tensor = isReal ?
ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 2, 2, 8) :
ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 100, 101);
break;
case ngraph::helpers::EltwiseTypes::ERF:
tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 6, -3);
break;
default:
if (funcInput.get_element_type().is_real()) {
tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 80, 0, 8);
} else {
tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]);
}
break;
}
inputs.insert({funcInput.get_node_shared_ptr(), tensor});
inputs.insert({funcInput.get_node_shared_ptr(), generate_eltwise_input(funcInput.get_element_type(), targetInputStaticShapes[i])});
}
}
@@ -88,6 +97,8 @@ protected:
if (ElementType::bf16 == netType) {
rel_threshold = 2e-2f;
} else if (ElementType::i32 == netType) {
abs_threshold = 0;
}
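// Note (editorial): results of the exact I32 path are expected to match the reference
// exactly, hence the zero absolute threshold (paired with the inclusive comparison added
// to the comparator in the last file section below).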
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
@@ -134,29 +145,21 @@ protected:
}
}
}
if (eltwiseType == ngraph::helpers::EltwiseTypes::DIVIDE ||
eltwiseType == ngraph::helpers::EltwiseTypes::MOD) {
std::vector<float> data(ngraph::shape_size(shape));
data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(ngraph::shape_size(shape), 10, 2);
if (netType == ElementType::i32) {
auto data_tensor = generate_eltwise_input(ElementType::i32, shape);
auto data_ptr = reinterpret_cast<int32_t*>(data_tensor.data());
std::vector<int32_t> data(data_ptr, data_ptr + ngraph::shape_size(shape));
secondaryInput = ngraph::builder::makeConstant(netType, shape, data);
} else if (eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD) {
auto negative_data_size = ngraph::shape_size(shape) / 2;
auto positive_data_size = ngraph::shape_size(shape) - negative_data_size;
std::vector<float> negative_data(negative_data_size);
std::vector<float> data(positive_data_size);
negative_data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(negative_data_size, -10, -2);
data = NGraphFunctions::Utils::generateVector<ngraph::element::Type_t::f32>(positive_data_size, 10, 2);
data.insert(data.end(), negative_data.begin(), negative_data.end());
secondaryInput = ngraph::builder::makeConstant(netType, shape, data);
} else if (eltwiseType == ngraph::helpers::EltwiseTypes::POWER) {
secondaryInput = ngraph::builder::makeConstant<float>(netType, shape, {}, true, 3);
} else {
secondaryInput = ngraph::builder::makeConstant<float>(netType, shape, {}, true);
auto data_tensor = generate_eltwise_input(ElementType::f32, shape);
auto data_ptr = reinterpret_cast<float*>(data_tensor.data());
std::vector<float> data(data_ptr, data_ptr + ngraph::shape_size(shape));
secondaryInput = ngraph::builder::makeConstant(netType, shape, data);
}
}
auto eltwise = ngraph::builder::makeEltwise(parameters[0], secondaryInput, eltwiseType);
function = makeNgraphFunction(netType, parameters, eltwise, "Eltwise");
}
@@ -325,6 +328,36 @@ const auto params_5D_emptyCPUSpec = ::testing::Combine(
INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D, EltwiseLayerCPUTest, params_5D_emptyCPUSpec, EltwiseLayerCPUTest::getTestCaseName);
std::vector<ngraph::helpers::EltwiseTypes> eltwiseOpTypesI32 = {
ngraph::helpers::EltwiseTypes::ADD,
ngraph::helpers::EltwiseTypes::MULTIPLY,
ngraph::helpers::EltwiseTypes::SUBTRACT,
ngraph::helpers::EltwiseTypes::DIVIDE,
ngraph::helpers::EltwiseTypes::SQUARED_DIFF,
};
const std::vector<fusingSpecificParams> fusingParamsSetI32{
emptyFusingSpec,
fusingMultiplyAddPerChannel,
};
const auto params_5D_emptyCPUSpec_I32 = ::testing::Combine(
::testing::Combine(
::testing::ValuesIn(static_shapes_to_test_representation(inShapes_5D)),
::testing::ValuesIn(eltwiseOpTypesI32),
::testing::ValuesIn(secondaryInputTypes),
::testing::ValuesIn(opTypes),
::testing::Values(ElementType::i32),
::testing::Values(ElementType::i32),
::testing::Values(ElementType::i32),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(additional_config)),
::testing::Values(emptyCPUSpec),
::testing::ValuesIn(fusingParamsSetI32));
INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_I32, EltwiseLayerCPUTest, params_5D_emptyCPUSpec_I32, EltwiseLayerCPUTest::getTestCaseName);
std::vector<std::vector<ov::Shape>> inShapes_4D_Blocked_Planar = {
{{2, 17, 31, 3}, {2, 1, 31, 3}},
{{2, 17, 5, 1}, {2, 1, 1, 4}},

View File

@@ -361,6 +361,18 @@ const auto fusingMultiplyPerChannel = fusingSpecificParams{std::make_shared<post
return std::make_shared<ngraph::opset1::Multiply>(cfg.input, secondMultInput);
}, "Multiply(PerChannel)"}}), {"Multiply"}};
const auto fusingMultiplyAddPerChannel = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](postNodeConfig& cfg) {
ngraph::Shape newShape = generatePerChannelShape(cfg.input);
auto constNode = ngraph::builder::makeConstant(cfg.type, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Multiply>(cfg.input, constNode);
}, "Multiply(PerChannel)"},
{[](postNodeConfig& cfg) {
ngraph::Shape newShape = generatePerChannelShape(cfg.input);
auto constNode = ngraph::builder::makeConstant(cfg.type, newShape, std::vector<float>{}, true);
return std::make_shared<ngraph::opset1::Add>(cfg.input, constNode);
}, "Add(PerChannel)"}}), {"Add"} };
const auto fusingAddPerTensor = fusingSpecificParams{std::make_shared<postNodesMgr>(std::vector<postNodeBuilder>{
{[](postNodeConfig& cfg){
ngraph::Shape secondMultInShape(1, 1);

View File

@@ -230,6 +230,10 @@ void compare(const ov::Tensor& expected,
auto eps = std::numeric_limits<double>::epsilon();
return (b - a) > (std::fmax(std::fabs(a), std::fabs(b)) * eps);
};
auto less_or_equal = [] (double a, double b) {
auto eps = std::numeric_limits<double>::epsilon();
return (b - a) >= (std::fmax(std::fabs(a), std::fabs(b)) * eps);
};
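// Note (editorial): with abs_threshold set to 0 for exact I32 comparisons (see above),
// a strict check can never pass even when abs_error.max is exactly 0, because
// less(0, 0) evaluates to false; the inclusive variant lets a zero error meet a zero threshold.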
for (size_t i = 0; i < shape_size(expected_shape); i++) {
double expected_value = expected_data[i];
double actual_value = actual_data[i];
@@ -258,7 +262,7 @@
}
abs_error.mean /= shape_size(expected_shape);
rel_error.mean /= shape_size(expected_shape);
if (!(less(abs_error.max, abs_threshold) && less(rel_error.max, rel_threshold))) {
if (!(less_or_equal(abs_error.max, abs_threshold) && less_or_equal(rel_error.max, rel_threshold))) {
std::ostringstream out_stream;
out_stream << "abs_max < abs_threshold && rel_max < rel_threshold" <<
"\n\t abs_max: " << abs_error.max <<