[CPU] Fixed Divide operation support for I32 precision (#3721)

* [CPU] Fixed Divide operation support for I32 precision
2020-12-28 19:18:19 +03:00 · 2020-12-28 19:18:19 +03:00 · 37b6e75730
commit 37b6e75730
parent a8daab3377
7 changed files with 66 additions and 7 deletions
--- a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp
@ -204,14 +204,43 @@ void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
    Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
    Vmm vmm_dst = Vmm(out_vec_idxs[0]);

+    auto uni_vdiv = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) {
+        switch (exec_prc_) {
+            case Precision::FP32: {
+                h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1);
+                break;
+            }
+            case Precision::I32: {
+                Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+
+                // The instruction set doesn't contain a vector instruction for the integer divide operation.
+                // As a workaround, we emulate its behavior via an FP divide followed by rounding toward zero.
+                h->uni_vcvtdq2ps(vmm_dst, vmm_src0);
+                h->uni_vcvtdq2ps(vmm_aux0, vmm_src1);
+                h->uni_vdivps(vmm_dst, vmm_dst, vmm_aux0);
+                h->uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero
+                h->uni_vcvtps2dq(vmm_dst, vmm_dst);
+                break;
+            }
+            default: assert(!"unsupported precision");
+        }
+    };
+
    if (isa == cpu::sse42) {
        h->uni_vmovups(vmm_dst, vmm_src0);
-        h->uni_vdivps(vmm_dst, vmm_dst, vmm_src1);
+        uni_vdiv(vmm_dst, vmm_dst, vmm_src1);
    } else {
-        h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1);
+        uni_vdiv(vmm_dst, vmm_src0, vmm_src1);
    }
 }

+std::set<InferenceEngine::Precision> jit_divide_emitter::get_supported_precisions() {
+    return {Precision::FP32, Precision::I32};
+}
+
+size_t jit_divide_emitter::aux_vecs_count() const {
+    return exec_prc_ == Precision::I32 ? 1 : 0;
+}

 /// FLOOR_MOD ///
 jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
--- a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp
@ -81,6 +81,7 @@ public:
                       InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);

    size_t get_inputs_num() override;
+    static std::set<InferenceEngine::Precision> get_supported_precisions();

 private:
    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
@ -88,6 +89,7 @@ private:

    template <mkldnn::impl::cpu::cpu_isa_t isa>
    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+    size_t aux_vecs_count() const override;
 };


--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp
@ -70,9 +70,12 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
        for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) {
            if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) {
                std::set<Precision> prcs = get_supported_precisions(*eltwiseNode.getFusedWith()[i].get());
+                std::set<Precision> prcs_intersect = {};

                std::set_intersection(supported_precision_intersection.begin(), supported_precision_intersection.end(),
-                                      prcs.begin(), prcs.end(), std::inserter(supported_precision_intersection, supported_precision_intersection.begin()));
+                                      prcs.begin(), prcs.end(), std::inserter(prcs_intersect, prcs_intersect.begin()));
+
+                supported_precision_intersection = prcs_intersect;
            }
        }

@ -1723,9 +1726,29 @@ bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
        return false;
    };

+    auto isSuitableNode = [](const MKLDNNEltwiseNode* node) {
+        // [WA] Changing the execution precision of a Divide operation from I32 to FP32 may lead to incorrect results,
+        // so we disable its fusing; otherwise there is no guarantee that it will be executed in I32.
+        // [TODO] Support for different precisions needs to be rewritten entirely to avoid implicit conversions to FP32
+        // (everything should be handled via explicit convert operations).
+        if (node->getOpType() == Divide) {
+            for (int i = 0; i < node->getCnnLayer()->insData.size(); i++) {
+                if (node->getCnnLayer()->insData[i].lock()->getPrecision() == Precision::I32) {
+                    return false;
+                }
+            }
+        }
+
+        return true;
+    };
+
    if (!mayiuse(cpu::sse42))
        return false;

+    if (!isSuitableNode(this)) {
+        return false;
+    }
+
    // FQ inputs with quantization parameters will be hided inside post_op object, so will not increase inputs number
    size_t addedInputEdgesNum = node->getType() != Quantize ? (node->getParentEdges().size() - 1) : 0;
    if (getParentEdges().size() + addedInputEdgesNum > MAX_ELTWISE_INPUTS)
@ -1734,6 +1757,10 @@ bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
    if (node->getType() == Eltwise) {
        auto eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
        if (eltwiseNode->getParentEdgesAtPort(0)[0]->getParent().get() != this) {
+            if (!isSuitableNode(this)) {
+                return false;
+            }
+
            // Eltwise jitter doesn't respect commutative property, so fusing is disabled in case it applied not for 0-th port.
            if (isOneOf(eltwiseNode->getOpType(), {Subtract, Divide, FloorMod, Mod, PowerDynamic, Greater, GreaterEqual, Less, LessEqual})) {
                return false;
--- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/eltwise.cpp
+++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/eltwise.cpp
@ -28,6 +28,7 @@ std::vector<std::vector<std::vector<size_t>>> inShapes = {
 std::vector<InferenceEngine::Precision> netPrecisions = {
        InferenceEngine::Precision::FP32,
        InferenceEngine::Precision::FP16,
+        InferenceEngine::Precision::I32,
 };

 std::vector<ngraph::helpers::InputLayerType> secondaryInputTypes = {
--- a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp
+++ b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp
@ -136,7 +136,7 @@ std::vector<std::vector<InferenceEngine::Precision>> inputPrecisions = {

 std::vector<std::vector<EltwiseTypes>> eltwiseOps = {
        { EltwiseTypes::ADD, EltwiseTypes::MULTIPLY, EltwiseTypes::SUBTRACT },
-        { EltwiseTypes::DIVIDE, EltwiseTypes::POWER, EltwiseTypes::ADD },
+        { EltwiseTypes::DIVIDE, EltwiseTypes::SQUARED_DIFF, EltwiseTypes::ADD },
 };

 INSTANTIATE_TEST_CASE_P(smoke_EltwiseChain, EltwiseChainTest,
--- a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp
+++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp
@ -168,7 +168,8 @@ protected:
            }

            const auto max = std::max(CommonTestUtils::ie_abs(res), CommonTestUtils::ie_abs(ref));
-            ASSERT_TRUE(max != 0 && ((absoluteDifference / max) <= threshold))
+            float diff = static_cast<float>(absoluteDifference) / static_cast<float>(max);
+            ASSERT_TRUE(max != 0 && (diff <= static_cast<float>(threshold)))
                                        << "Relative comparison of values expected: " << ref << " and actual: " << res
                                        << " at index " << i << " with threshold " << threshold
                                        << " failed";
--- a/ngraph/python/tests/test_onnx/test_backend.py
+++ b/ngraph/python/tests/test_onnx/test_backend.py
@ -635,8 +635,7 @@ tests_expected_to_fail = [
        "OnnxBackendNodeModelTest.test_adagrad_multiple_cpu",
        "OnnxBackendNodeModelTest.test_adagrad_cpu"),
    (xfail_issue_41894,
-        "OnnxBackendNodeModelTest.test_max_uint16_cpu",
-        "OnnxBackendNodeModelTest.test_mod_int64_fmod_cpu"),
+        "OnnxBackendNodeModelTest.test_max_uint16_cpu"),
    (xfail_issue_43523,
        "OnnxBackendNodeModelTest.test_reduce_sum_do_not_keepdims_example_cpu",
        "OnnxBackendNodeModelTest.test_reduce_sum_do_not_keepdims_random_cpu",