[CPU] Do not set BF16 on input port for Eltwise after Input (#5542)

* [CPU] Do not set BF16 on input port for Eltwise after Input

Eltwise supports conversion to BF16 on its own,
so the unnecessary Reorder after Input is avoided.

* Create a separate function for enforcing BF16 on ports

* Add test to verify that no extra Reorder is inserted

Also:
- update the legacy test
- remove leftover code that is no longer applicable

* Correct expected precision in legacy test
Egor Duplensky 2021-05-12 11:21:05 +03:00 committed by GitHub
parent 4c452b8bb6
commit cfc235bd65
5 changed files with 95 additions and 35 deletions
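For context, the enforcement this commit refactors is driven by a plugin config key rather than by the model itself. A minimal sketch of how a user of the 2021-era Inference Engine API would switch BF16 enforcement on; the model path is a placeholder, and only KEY_ENFORCE_BF16 itself appears in this diff:

    #include <inference_engine.hpp>

    int main() {
        InferenceEngine::Core ie;
        // Placeholder path to any FP32 IR model
        InferenceEngine::CNNNetwork network = ie.ReadNetwork("model.xml");
        // ENFORCE_BF16 = YES makes the CPU plugin run the BF16 pass
        // (MKLDNNGraph::EnforceBF16 below) over non-constant data paths
        auto execNetwork = ie.LoadNetwork(network, "CPU",
            {{InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16,
              InferenceEngine::PluginConfigParams::YES}});
        return 0;
    }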


@@ -262,33 +262,8 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMan
         graphNodes.push_back(outNode);
     }
 
-    // We set all non const data paths precision to BF16 in case enforceBF16 flag is switched on.
-    if (config.enforceBF16) {
-        bool isQuantizedModel = false;
-        for (auto& node : graphNodes) {
-            if (node->getType() == FakeQuantize)
-                isQuantizedModel = true;
-        }
-
-        // Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision
-        // only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default
-        if (implication(isQuantizedModel, config.manualEnforceBF16)) {
-            for (auto &node : graphNodes) {
-                if (node->getType() != Input && node->getType() != Output) {
-                    for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
-                        auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
-                        if (!(parent->getType() == Input && parent->isConstant()) && node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
-                            node->setOriginalInputPrecisionAtPort(i, Precision::BF16);
-                    }
-
-                    for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
-                        if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
-                            node->setOriginalOutputPrecisionAtPort(i, Precision::BF16);
-                    }
-                }
-            }
-        }
-    }
+    if (config.enforceBF16)
+        EnforceBF16();
 
     // change precision for input/output nodes to avoid extra data conversion when set input/output blobs
     // also we need to change input/output precisions for consumers/producers to avoid inserting reorder
@@ -1201,6 +1176,35 @@ bool MKLDNNGraph::InsertNode(MKLDNNNodePtr parent, MKLDNNNodePtr child, MKLDNNNo
     return true;
 }
 
+// Set all non const data paths precision to BF16
+void MKLDNNGraph::EnforceBF16() {
+    bool isQuantizedModel = false;
+    for (auto& node : graphNodes) {
+        if (node->getType() == FakeQuantize)
+            isQuantizedModel = true;
+    }
+
+    // Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision
+    // only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default
+    if (implication(isQuantizedModel, config.manualEnforceBF16)) {
+        for (auto &node : graphNodes) {
+            if (node->getType() != Input && node->getType() != Output) {
+                for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
+                    auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
+                    if (!(parent->getType() == Input && parent->isConstant()) &&       // exclude nodes after Constant Inputs
+                        !(parent->getType() == Input && node->getType() == Eltwise) && // exclude Eltwise after Input since it supports conversion to BF16
+                        node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
+                        node->setOriginalInputPrecisionAtPort(i, Precision::BF16);
+                }
+
+                for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
+                    if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
+                        node->setOriginalOutputPrecisionAtPort(i, Precision::BF16);
+                }
+            }
+        }
+    }
+}
+
 InferenceEngine::CNNNetwork MKLDNNGraph::dump() const {
     return dump_graph_as_ie_ngraph_net(*this);
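For reference, implication() used in the guard above is a helper from the plugin's utility headers; assuming it is plain logical implication, a minimal sketch is:

    // Presumed definition (an assumption, not part of this diff):
    // logical implication "cause => cond", true whenever cause is false.
    template <typename T, typename U>
    inline bool implication(T cause, U cond) {
        return !cause || cond;
    }

Read that way, the guard says: if the model is quantized, enforce BF16 only when the user requested it manually (config.manualEnforceBF16).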


@@ -204,6 +204,7 @@ protected:
     friend InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph);
 
 private:
+    void EnforceBF16();
     void printGraphInfo() const;
 };


@@ -142,13 +142,6 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
     DropDoubleReorders(graph);
     graph.RemoveDroppedNodes();
 
-#if 0
-    /* disable, since there is no use case for it at the moment
-     * should be enabled after ngraph migration */
-    DropConvertReorder(graph);
-    graph.RemoveDroppedNodes();
-#endif
-
     MergeTransposeAndReorder(graph);
     graph.RemoveDroppedNodes();


@@ -123,7 +123,7 @@ protected:
         // performance counters
         expectedPrecisions["Matmul_0"] = "BF16";
         expectedPrecisions["Mul_1"] = "BF16";
-        expectedPrecisions["Add_1"] = "BF16";
+        expectedPrecisions["Add_1"] = netPrecision.name(); // FP32->BF16 in case of FP32 net, BF16->BF16 in case of BF16 net
         expectedPrecisions["Relu_1"] = "ndef";
         expectedPrecisions["Conc_1"] = "BF16";
         expectedPrecisions["Matmul_1"] = "BF16";


@@ -0,0 +1,62 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <ngraph_functions/builders.hpp>
+#include "ie_common.h"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+#include "test_utils/cpu_test_utils.hpp"
+
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+
+namespace CPULayerTestsDefinitions {
+
+class InputNoReorderEltwiseBF16 : virtual public LayerTestsUtils::LayerTestsCommon,
+                                  public CPUTestsBase {
+protected:
+    void SetUp() {
+        auto netPrecision = inPrc = Precision::FP32;
+        outPrc = Precision::BF16;
+        targetDevice = CommonTestUtils::DEVICE_CPU;
+        std::map<std::string, std::string> additional_config{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}};
+        configuration.insert(additional_config.begin(), additional_config.end());
+
+        std::vector<size_t> inputShape {2, 4, 4, 1};
+        std::vector<size_t> outputShape = inputShape;
+        auto eltwiseType = ngraph::helpers::EltwiseTypes::ADD;
+        auto secondaryInputType = ngraph::helpers::InputLayerType::CONSTANT;
+
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+        auto input = ngraph::builder::makeParams(ngPrc, {inputShape});
+        std::shared_ptr<ngraph::Node> secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, inputShape);
+        auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType);
+
+        function = makeNgraphFunction(ngPrc, input, eltwise, "Eltwise");
+    }
+};
+
+/* FP32 network with enforced BF16 precision.
+ * Test that no Reorder (or Convert) is inserted after Input.
+ * Eltwise performs the conversion by itself.
+
+    Input[FP32]      Constant[FP32]
+         \                 /
+          \               /
+           X  No Reorder X
+            \           /
+          Eltwise[FP32->BF16]
+                 |
+                 |
+           Output[BF16]
+*/
+TEST_F(InputNoReorderEltwiseBF16, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    Run();
+
+    CheckNodeOfTypeCount(executableNetwork, "Reorder", 0);
+    CheckNodeOfTypeCount(executableNetwork, "Convert", 0);
+    CheckNodeOfTypeCount(executableNetwork, "Eltwise", 1);
+}
+} // namespace CPULayerTestsDefinitions
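The assertions rely on CheckNodeOfTypeCount from test_utils/cpu_test_utils.hpp. A hedged sketch of what such a helper presumably does, counting nodes of a given runtime type in the execution graph; the exact utility implementation may differ:

    void CheckNodeOfTypeCount(InferenceEngine::ExecutableNetwork &execNet,
                              const std::string &nodeType, size_t expectedCount) {
        // Query the runtime (execution) graph of the compiled network
        auto function = execNet.GetExecGraphInfo().getFunction();
        ASSERT_NE(nullptr, function);
        size_t actualCount = 0;
        for (const auto &node : function->get_ops()) {
            // Runtime nodes carry their plugin-level type in rt_info
            const auto &rtInfo = node->get_rt_info();
            auto it = rtInfo.find(ExecGraphInfoSerialization::LAYER_TYPE);
            ASSERT_NE(rtInfo.end(), it);
            auto type = std::dynamic_pointer_cast<ngraph::VariantImpl<std::string>>(it->second);
            ASSERT_NE(nullptr, type);
            if (type->get() == nodeType)
                actualCount++;
        }
        ASSERT_EQ(expectedCount, actualCount) << "Unexpected count of " << nodeType << " nodes";
    }

With both Reorder and Convert expected zero times, the test fails if a data conversion node between Input and Eltwise ever reappears.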