[CPU] Do not set BF16 on input port for Eltwise after Input (#5542)
* [CPU] Do not set BF16 on input port for Eltwise after Input.
  Since Eltwise supports conversion to BF16 itself, the unnecessary Reorder is avoided.
* Create a separate function for enforcing BF16 on ports.
* Add a test to verify that no extra Reorder is inserted. Also:
  - update the legacy test
  - remove extra code which is not applicable anymore
* Correct the expected precision in the legacy test.
parent 4c452b8bb6
commit cfc235bd65
@@ -262,33 +262,8 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana
         graphNodes.push_back(outNode);
     }
 
-    // We set all non const data paths precision to BF16 in case enforceBF16 flag is switched on.
-    if (config.enforceBF16) {
-        bool isQuantizedModel = false;
-        for (auto& node : graphNodes) {
-            if (node->getType() == FakeQuantize)
-                isQuantizedModel = true;
-        }
-
-        // Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision
-        // only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default
-        if (implication(isQuantizedModel, config.manualEnforceBF16)) {
-            for (auto &node : graphNodes) {
-                if (node->getType() != Input && node->getType() != Output) {
-                    for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
-                        auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
-                        if (!(parent->getType() == Input && parent->isConstant()) && node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
-                            node->setOriginalInputPrecisionAtPort(i, Precision::BF16);
-                    }
-
-                    for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
-                        if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
-                            node->setOriginalOutputPrecisionAtPort(i, Precision::BF16);
-                    }
-                }
-            }
-        }
-    }
+    if (config.enforceBF16)
+        EnforceBF16();
 
     // change precision for input/output nodes to avoid extra data conversion when set input/output blobs
     // also we need to change input/output precisions for consumers/producers to avoid inserting reorder
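The BF16 enforcement that moves into EnforceBF16() below is gated by the plugin's implication helper. As a quick reference, a minimal self-contained sketch of the semantics it reduces to (the real definition lives in the plugin's utility headers, so treat this as an assumption about its behavior, not its code):

#include <cassert>

// implication(cause, cond) is logical implication "cause => cond":
// false only when cause holds and cond does not.
static bool implication(bool cause, bool cond) {
    return !cause || cond;
}

int main() {
    // Non-quantized model (cause == false): BF16 is enforced either way.
    assert(implication(false, false) && implication(false, true));
    // Quantized model (cause == true): BF16 is enforced only when
    // manualEnforceBF16 was set explicitly by the user.
    assert(!implication(true, false) && implication(true, true));
    return 0;
}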
@@ -1201,6 +1176,35 @@ bool MKLDNNGraph::InsertNode(MKLDNNNodePtr parent, MKLDNNNodePtr child, MKLDNNNo
     return true;
 }
 
+// Set all non const data paths precision to BF16
+void MKLDNNGraph::EnforceBF16() {
+    bool isQuantizedModel = false;
+    for (auto& node : graphNodes) {
+        if (node->getType() == FakeQuantize)
+            isQuantizedModel = true;
+    }
+
+    // Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision
+    // only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default
+    if (implication(isQuantizedModel, config.manualEnforceBF16)) {
+        for (auto &node : graphNodes) {
+            if (node->getType() != Input && node->getType() != Output) {
+                for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
+                    auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
+                    if (!(parent->getType() == Input && parent->isConstant()) &&       // exclude nodes after Constant Inputs
+                        !(parent->getType() == Input && node->getType() == Eltwise) && // exclude Eltwise after Input since it supports conversion to BF16
+                        node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
+                        node->setOriginalInputPrecisionAtPort(i, Precision::BF16);
+                }
+
+                for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
+                    if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
+                        node->setOriginalOutputPrecisionAtPort(i, Precision::BF16);
+                }
+            }
+        }
+    }
+}
 
 InferenceEngine::CNNNetwork MKLDNNGraph::dump() const {
     return dump_graph_as_ie_ngraph_net(*this);
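To make the new exclusion concrete, here is a self-contained sketch of the per-input-port decision EnforceBF16() now makes. NodeKind and PortCtx are illustrative stand-ins, not plugin types:

#include <cassert>

enum class NodeKind { Input, Output, Eltwise, Other };

struct PortCtx {
    NodeKind parentKind;      // type of the node feeding this input port
    bool parentIsConstant;    // whether that parent is a constant Input
    NodeKind nodeKind;        // type of the consuming node
    bool portIsFP32;          // original precision on this port is FP32
};

// Returns true when the port precision should be switched from FP32 to BF16.
static bool shouldEnforceBF16(const PortCtx& c) {
    const bool afterConstInput   = c.parentKind == NodeKind::Input && c.parentIsConstant;
    const bool eltwiseAfterInput = c.parentKind == NodeKind::Input && c.nodeKind == NodeKind::Eltwise;
    return !afterConstInput && !eltwiseAfterInput && c.portIsFP32;
}

int main() {
    // Eltwise fed directly by a (non-constant) network Input keeps FP32 on
    // that port: no Reorder is needed, Eltwise converts to BF16 internally.
    assert(!shouldEnforceBF16({NodeKind::Input, false, NodeKind::Eltwise, true}));
    // Any other FP32 data path is still switched to BF16.
    assert(shouldEnforceBF16({NodeKind::Other, false, NodeKind::Eltwise, true}));
    // Ports after constant Inputs remain excluded, as before.
    assert(!shouldEnforceBF16({NodeKind::Input, true, NodeKind::Other, true}));
    return 0;
}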
@@ -204,6 +204,7 @@ protected:
     friend InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph);
 
 private:
+    void EnforceBF16();
     void printGraphInfo() const;
 };
 
@@ -142,13 +142,6 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
     DropDoubleReorders(graph);
     graph.RemoveDroppedNodes();
 
-#if 0
-    /* disable, since there is no use case for it at the moment
-     * should be enabled after ngraph migration */
-    DropConvertReorder(graph);
-    graph.RemoveDroppedNodes();
-#endif
-
     MergeTransposeAndReorder(graph);
     graph.RemoveDroppedNodes();
 
@@ -123,7 +123,7 @@ protected:
     // performance counters
     expectedPrecisions["Matmul_0"] = "BF16";
     expectedPrecisions["Mul_1"] = "BF16";
-    expectedPrecisions["Add_1"] = "BF16";
+    expectedPrecisions["Add_1"] = netPrecision.name(); // FP32->BF16 in case of FP32 net, BF16->BF16 in case of BF16 net
     expectedPrecisions["Relu_1"] = "ndef";
     expectedPrecisions["Conc_1"] = "BF16";
     expectedPrecisions["Matmul_1"] = "BF16";
@@ -0,0 +1,62 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <ngraph_functions/builders.hpp>
+#include "ie_common.h"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+#include "test_utils/cpu_test_utils.hpp"
+
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+
+namespace CPULayerTestsDefinitions {
+
+class InputNoReorderEltwiseBF16 : virtual public LayerTestsUtils::LayerTestsCommon,
+                                  public CPUTestsBase {
+protected:
+    void SetUp() {
+        auto netPrecision = inPrc = Precision::FP32;
+        outPrc = Precision::BF16;
+        targetDevice = CommonTestUtils::DEVICE_CPU;
+        std::map<std::string, std::string> additional_config{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}};
+        configuration.insert(additional_config.begin(), additional_config.end());
+
+        std::vector<size_t> inputShape {2, 4, 4, 1};
+        std::vector<size_t> outputShape = inputShape;
+        auto eltwiseType = ngraph::helpers::EltwiseTypes::ADD;
+        auto secondaryInputType = ngraph::helpers::InputLayerType::CONSTANT;
+
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+        auto input = ngraph::builder::makeParams(ngPrc, {inputShape});
+        std::shared_ptr<ngraph::Node> secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, inputShape);
+        auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType);
+
+        function = makeNgraphFunction(ngPrc, input, eltwise, "Eltwise");
+    }
+};
+
+/* FP32 network with enforced BF16 precision.
+ * Test that no Reorder (or Convert) is inserted after Input.
+ * Eltwise performs the conversion by itself.
+
+    Input[FP32]        Constant[FP32]
+         \                  /
+          \                /
+           X  No Reorder  X
+            \            /
+          Eltwise[FP32->BF16]
+                  |
+                  |
+            Output[BF16]
+*/
+TEST_F(InputNoReorderEltwiseBF16, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    Run();
+
+    CheckNodeOfTypeCount(executableNetwork, "Reorder", 0);
+    CheckNodeOfTypeCount(executableNetwork, "Convert", 0);
+    CheckNodeOfTypeCount(executableNetwork, "Eltwise", 1);
+}
+} // namespace CPULayerTestsDefinitions
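CheckNodeOfTypeCount comes from CPUTestUtils and counts nodes of a given layer type in the runtime (execution) graph. A rough sketch of what such a count boils down to, assuming the execution-graph API of this OpenVINO generation; this is an approximation, not the helper's actual code:

#include <string>
#include <memory>
#include <cpp/ie_executable_network.hpp>
#include <exec_graph_info.hpp>
#include <ngraph/ngraph.hpp>

// Counts runtime nodes whose execution-graph "layerType" attribute equals `type`.
static size_t countNodesOfType(InferenceEngine::ExecutableNetwork& execNet, const std::string& type) {
    size_t count = 0;
    auto function = execNet.GetExecGraphInfo().getFunction();
    for (const auto& node : function->get_ops()) {
        const auto& rtInfo = node->get_rt_info();
        auto it = rtInfo.find(ExecGraphInfoSerialization::LAYER_TYPE);
        if (it == rtInfo.end())
            continue;
        auto value = std::dynamic_pointer_cast<ngraph::VariantImpl<std::string>>(it->second);
        if (value && value->get() == type)
            ++count;
    }
    return count;
}

With EnforceBF16() skipping the Eltwise-after-Input port, the expected counts above hold: zero Reorder and Convert nodes, and a single Eltwise that performs the FP32-to-BF16 conversion itself.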