From cfc235bd653616020aed1234f935ee09943c01ac Mon Sep 17 00:00:00 2001
From: Egor Duplensky
Date: Wed, 12 May 2021 11:21:05 +0300
Subject: [PATCH] [CPU] Do not set BF16 on input port for Eltwise after Input
 (#5542)

* [CPU] Do not set BF16 on input port for Eltwise after Input

Since Eltwise supports conversion to BF16, the unnecessary Reorder is avoided.

* Create a separate function for enforcing BF16 on ports

* Add test to verify that no extra Reorder is inserted

Also:
- update legacy test
- remove extra code which is no longer applicable

* Correct expected precision in legacy test
---
 .../src/mkldnn_plugin/mkldnn_graph.cpp        | 58 +++++++++--------
 .../src/mkldnn_plugin/mkldnn_graph.h          |  1 +
 .../mkldnn_plugin/mkldnn_graph_optimizer.cpp  |  7 ---
 .../gather_x2_add_mul_relu_concat_matmul.cpp  |  2 +-
 .../src/input_noreorder_eltwise_bf16.cpp      | 62 +++++++++++++++++++
 5 files changed, 95 insertions(+), 35 deletions(-)
 create mode 100644 inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/input_noreorder_eltwise_bf16.cpp

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
index a80c788f5ec..1caedcaba75 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
@@ -262,33 +262,8 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana
         graphNodes.push_back(outNode);
     }
 
-    // We set all non const data paths precision to BF16 in case enforceBF16 flag is switched on.
-    if (config.enforceBF16) {
-        bool isQuantizedModel = false;
-        for (auto& node : graphNodes) {
-            if (node->getType() == FakeQuantize)
-                isQuantizedModel = true;
-        }
-
-        // Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision
-        // only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default
-        if (implication(isQuantizedModel, config.manualEnforceBF16)) {
-            for (auto &node : graphNodes) {
-                if (node->getType() != Input && node->getType() != Output) {
-                    for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
-                        auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
-                        if (!(parent->getType() == Input && parent->isConstant()) && node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
-                            node->setOriginalInputPrecisionAtPort(i, Precision::BF16);
-                    }
-
-                    for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
-                        if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
-                            node->setOriginalOutputPrecisionAtPort(i, Precision::BF16);
-                    }
-                }
-            }
-        }
-    }
+    if (config.enforceBF16)
+        EnforceBF16();
 
     // change precision for input/output nodes to avoid extra data conversion when set input/output blobs
     // also we need to change input/output precisions for consumers/producers to avoid inserting reorder
@@ -1201,6 +1176,35 @@ bool MKLDNNGraph::InsertNode(MKLDNNNodePtr parent, MKLDNNNodePtr child, MKLDNNNo
     return true;
 }
 
+// Set all non const data paths precision to BF16
+void MKLDNNGraph::EnforceBF16() {
+    bool isQuantizedModel = false;
+    for (auto& node : graphNodes) {
+        if (node->getType() == FakeQuantize)
+            isQuantizedModel = true;
+    }
+
+    // Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision
+    // only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default
+    if (implication(isQuantizedModel, config.manualEnforceBF16)) {
+        for (auto &node : graphNodes) {
+            if (node->getType() != Input && node->getType() != Output) {
+                for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
+                    auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
+                    if (!(parent->getType() == Input && parent->isConstant()) &&        // exclude nodes after Constant Inputs
+                        !(parent->getType() == Input && node->getType() == Eltwise) &&  // exclude Eltwise after Input since it supports conversion to BF16
+                        node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
+                        node->setOriginalInputPrecisionAtPort(i, Precision::BF16);
+                }
+
+                for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
+                    if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
+                        node->setOriginalOutputPrecisionAtPort(i, Precision::BF16);
+                }
+            }
+        }
+    }
+}
 InferenceEngine::CNNNetwork MKLDNNGraph::dump() const {
     return dump_graph_as_ie_ngraph_net(*this);
 }
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h
index 4a82f9c26b0..29c07120fbf 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h
@@ -204,6 +204,7 @@ protected:
     friend InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph);
 
 private:
+    void EnforceBF16();
     void printGraphInfo() const;
 };
 
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
index d5f2e3819be..f773507b657 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
@@ -142,13 +142,6 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
     DropDoubleReorders(graph);
     graph.RemoveDroppedNodes();
 
-#if 0
-    /* disable, since there is no use case for it at the moment
-     * should be enabled after ngraph migration */
-    DropConvertReorder(graph);
-    graph.RemoveDroppedNodes();
-#endif
-
     MergeTransposeAndReorder(graph);
     graph.RemoveDroppedNodes();
 
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp
index 74b50d158d7..492faef5314 100644
--- a/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp
+++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp
@@ -123,7 +123,7 @@ protected:
         // performance counters
         expectedPrecisions["Matmul_0"] = "BF16";
         expectedPrecisions["Mul_1"] = "BF16";
-        expectedPrecisions["Add_1"] = "BF16";
+        expectedPrecisions["Add_1"] = netPrecision.name();  // FP32->BF16 in case of FP32 net, BF16->BF16 in case of BF16 net
         expectedPrecisions["Relu_1"] = "ndef";
         expectedPrecisions["Conc_1"] = "BF16";
         expectedPrecisions["Matmul_1"] = "BF16";
diff --git a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/input_noreorder_eltwise_bf16.cpp b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/input_noreorder_eltwise_bf16.cpp
new file mode 100644
index 00000000000..f09724a596a
--- /dev/null
+++ b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/input_noreorder_eltwise_bf16.cpp
@@ -0,0 +1,62 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <ngraph_functions/builders.hpp>
+#include "ie_common.h"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+#include "test_utils/cpu_test_utils.hpp"
+
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+
+namespace CPULayerTestsDefinitions {
+
+class InputNoReorderEltwiseBF16 : virtual public LayerTestsUtils::LayerTestsCommon,
+                                  public CPUTestsBase {
+protected:
+    void SetUp() {
+        auto netPrecision = inPrc = Precision::FP32;
+        outPrc = Precision::BF16;
+        targetDevice = CommonTestUtils::DEVICE_CPU;
+        std::map<std::string, std::string> additional_config{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}};
+        configuration.insert(additional_config.begin(), additional_config.end());
+
+        std::vector<size_t> inputShape {2, 4, 4, 1};
+        std::vector<size_t> outputShape = inputShape;
+        auto eltwiseType = ngraph::helpers::EltwiseTypes::ADD;
+        auto secondaryInputType = ngraph::helpers::InputLayerType::CONSTANT;
+
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+        auto input = ngraph::builder::makeParams(ngPrc, {inputShape});
+        std::shared_ptr<ngraph::Node> secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, inputShape);
+        auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType);
+
+        function = makeNgraphFunction(ngPrc, input, eltwise, "Eltwise");
+    }
+};
+
+/* FP32 network with enforced BF16 precision.
+ * Test that no Reorder (or Convert) is inserted after Input.
+ * Eltwise performs the conversion by itself.
+
+    Input[FP32]    Constant[FP32]
+         \              /
+          \            /
+        X  No Reorder  X
+           \          /
+        Eltwise[FP32->BF16]
+               |
+               |
+          Output[BF16]
+*/
+TEST_F(InputNoReorderEltwiseBF16, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    Run();
+
+    CheckNodeOfTypeCount(executableNetwork, "Reorder", 0);
+    CheckNodeOfTypeCount(executableNetwork, "Convert", 0);
+    CheckNodeOfTypeCount(executableNetwork, "Eltwise", 1);
+}
+} // namespace CPULayerTestsDefinitions
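
For context, the EnforceBF16() pass above is reached when a caller sets the ENFORCE_BF16 plugin
config key when loading a network on the CPU device. The snippet below is a minimal usage sketch,
not part of this patch: "model.xml" is a placeholder path to an FP32 IR, and error handling is
omitted; it only illustrates the public Inference Engine API that the new test exercises through
its test-harness configuration.

    #include <ie_core.hpp>
    #include <ie_plugin_config.hpp>

    int main() {
        InferenceEngine::Core ie;
        // "model.xml" is a placeholder path to an FP32 IR model.
        auto network = ie.ReadNetwork("model.xml");
        // Requesting BF16 enforcement lets the CPU plugin switch FP32 data paths to BF16
        // (the EnforceBF16() graph pass above) on bfloat16-capable hardware.
        auto execNet = ie.LoadNetwork(network, "CPU",
            {{InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16,
              InferenceEngine::PluginConfigParams::YES}});
        auto request = execNet.CreateInferRequest();
        request.Infer();
        return 0;
    }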