From cfc235bd653616020aed1234f935ee09943c01ac Mon Sep 17 00:00:00 2001
From: Egor Duplensky
Date: Wed, 12 May 2021 11:21:05 +0300
Subject: [PATCH] [CPU] Do not set BF16 on input port for Eltwise after Input
 (#5542)

* [CPU] Do not set BF16 on input port for Eltwise after Input

Since Eltwise supports conversion to BF16, the unnecessary Reorder is avoided.

* Create a separate function for enforcing BF16 on ports

* Add test to verify that no extra Reorder is inserted

Also:
- update legacy test
- remove extra code which is no longer applicable

* Correct expected precision in legacy test
---
 .../src/mkldnn_plugin/mkldnn_graph.cpp        | 58 +++++++++--------
 .../src/mkldnn_plugin/mkldnn_graph.h          |  1 +
 .../mkldnn_plugin/mkldnn_graph_optimizer.cpp  |  7 ---
 .../gather_x2_add_mul_relu_concat_matmul.cpp  |  2 +-
 .../src/input_noreorder_eltwise_bf16.cpp      | 62 +++++++++++++++++++
 5 files changed, 95 insertions(+), 35 deletions(-)
 create mode 100644 inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/input_noreorder_eltwise_bf16.cpp

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
index a80c788f5ec..1caedcaba75 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
@@ -262,33 +262,8 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana
         graphNodes.push_back(outNode);
     }
 
-    // We set all non const data paths precision to BF16 in case enforceBF16 flag is switched on.
-    if (config.enforceBF16) {
-        bool isQuantizedModel = false;
-        for (auto& node : graphNodes) {
-            if (node->getType() == FakeQuantize)
-                isQuantizedModel = true;
-        }
-
-        // Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision
-        // only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default
-        if (implication(isQuantizedModel, config.manualEnforceBF16)) {
-            for (auto &node : graphNodes) {
-                if (node->getType() != Input && node->getType() != Output) {
-                    for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
-                        auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
-                        if (!(parent->getType() == Input && parent->isConstant()) && node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
-                            node->setOriginalInputPrecisionAtPort(i, Precision::BF16);
-                    }
-
-                    for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
-                        if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
-                            node->setOriginalOutputPrecisionAtPort(i, Precision::BF16);
-                    }
-                }
-            }
-        }
-    }
+    if (config.enforceBF16)
+        EnforceBF16();
 
     // change precision for input/output nodes to avoid extra data conversion when set input/output blobs
     // also we need to change input/output precisions for consumers/producers to avoid inserting reorder
@@ -1201,6 +1176,35 @@ bool MKLDNNGraph::InsertNode(MKLDNNNodePtr parent, MKLDNNNodePtr child, MKLDNNNo
     return true;
 }
 
+// Set all non const data paths precision to BF16
+void MKLDNNGraph::EnforceBF16() {
+    bool isQuantizedModel = false;
+    for (auto& node : graphNodes) {
+        if (node->getType() == FakeQuantize)
+            isQuantizedModel = true;
+    }
+
+    // Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision
+    // only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default
+    if (implication(isQuantizedModel, config.manualEnforceBF16)) {
+        for (auto &node : graphNodes) {
+            if (node->getType() != Input && node->getType() != Output) {
+                for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
+                    auto &parent = node->getParentEdgesAtPort(i)[0]->getParent();
+                    if (!(parent->getType() == Input && parent->isConstant()) &&        // exclude nodes after Constant Inputs
+                        !(parent->getType() == Input && node->getType() == Eltwise) &&  // exclude Eltwise after Input since it supports conversion to BF16
+                        node->getOriginalInputPrecisionAtPort(i) == Precision::FP32)
+                        node->setOriginalInputPrecisionAtPort(i, Precision::BF16);
+                }
+
+                for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) {
+                    if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32)
+                        node->setOriginalOutputPrecisionAtPort(i, Precision::BF16);
+                }
+            }
+        }
+    }
+}
 InferenceEngine::CNNNetwork MKLDNNGraph::dump() const {
     return dump_graph_as_ie_ngraph_net(*this);
 }
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h
index 4a82f9c26b0..29c07120fbf 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h
@@ -204,6 +204,7 @@ protected:
     friend InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph);
 
 private:
+    void EnforceBF16();
     void printGraphInfo() const;
 };
 
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
index d5f2e3819be..f773507b657 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
@@ -142,13 +142,6 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
     DropDoubleReorders(graph);
     graph.RemoveDroppedNodes();
 
-#if 0
-    /* disable, since there is no use case for it at the moment
-     * should be enabled after ngraph migration */
-    DropConvertReorder(graph);
-    graph.RemoveDroppedNodes();
-#endif
-
     MergeTransposeAndReorder(graph);
     graph.RemoveDroppedNodes();
 
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp
index 74b50d158d7..492faef5314 100644
--- a/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp
+++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp
@@ -123,7 +123,7 @@ protected:
         // performance counters
         expectedPrecisions["Matmul_0"] = "BF16";
         expectedPrecisions["Mul_1"] = "BF16";
-        expectedPrecisions["Add_1"] = "BF16";
+        expectedPrecisions["Add_1"] = netPrecision.name();  // FP32->BF16 in case of FP32 net, BF16->BF16 in case of BF16 net
         expectedPrecisions["Relu_1"] = "ndef";
         expectedPrecisions["Conc_1"] = "BF16";
         expectedPrecisions["Matmul_1"] = "BF16";
diff --git a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/input_noreorder_eltwise_bf16.cpp b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/input_noreorder_eltwise_bf16.cpp
new file mode 100644
index 00000000000..f09724a596a
--- /dev/null
+++ b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/input_noreorder_eltwise_bf16.cpp
@@ -0,0 +1,62 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <ngraph_functions/builders.hpp>
+#include "ie_common.h"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+#include "test_utils/cpu_test_utils.hpp"
+
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+
+namespace CPULayerTestsDefinitions {
+
+class InputNoReorderEltwiseBF16 : virtual public LayerTestsUtils::LayerTestsCommon,
+                                  public CPUTestsBase {
+protected:
+    void SetUp() {
+        auto netPrecision = inPrc = Precision::FP32;
+        outPrc = Precision::BF16;
+        targetDevice = CommonTestUtils::DEVICE_CPU;
+        std::map<std::string, std::string> additional_config{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}};
+        configuration.insert(additional_config.begin(), additional_config.end());
+
+        std::vector<size_t> inputShape {2, 4, 4, 1};
+        std::vector<size_t> outputShape = inputShape;
+        auto eltwiseType = ngraph::helpers::EltwiseTypes::ADD;
+        auto secondaryInputType = ngraph::helpers::InputLayerType::CONSTANT;
+
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+        auto input = ngraph::builder::makeParams(ngPrc, {inputShape});
+        std::shared_ptr<ngraph::Node> secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, inputShape);
+        auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType);
+
+        function = makeNgraphFunction(ngPrc, input, eltwise, "Eltwise");
+    }
+};
+
+/* FP32 network with enforced BF16 precision.
+ * Test that no Reorder (or Convert) is inserted after Input.
+ * Eltwise performs the conversion by itself.
+
+    Input[FP32]    Constant[FP32]
+         \              /
+          \            /
+        X  No Reorder  X
+           \          /
+        Eltwise[FP32->BF16]
+               |
+               |
+          Output[BF16]
+*/
+TEST_F(InputNoReorderEltwiseBF16, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    Run();
+
+    CheckNodeOfTypeCount(executableNetwork, "Reorder", 0);
+    CheckNodeOfTypeCount(executableNetwork, "Convert", 0);
+    CheckNodeOfTypeCount(executableNetwork, "Eltwise", 1);
+}
+} // namespace CPULayerTestsDefinitions
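
For context, the EnforceBF16() pass above is reached when a caller sets the ENFORCE_BF16 plugin
config key when loading a network on the CPU device. The snippet below is a minimal usage sketch,
not part of this patch: "model.xml" is a placeholder path to an FP32 IR, and error handling is
omitted; it only illustrates the public Inference Engine API that the new test exercises through
its test-harness configuration.

    #include <ie_core.hpp>
    #include <ie_plugin_config.hpp>

    int main() {
        InferenceEngine::Core ie;
        // "model.xml" is a placeholder path to an FP32 IR model.
        auto network = ie.ReadNetwork("model.xml");
        // Requesting BF16 enforcement lets the CPU plugin switch FP32 data paths to BF16
        // (the EnforceBF16() graph pass above) on bfloat16-capable hardware.
        auto execNet = ie.LoadNetwork(network, "CPU",
            {{InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16,
              InferenceEngine::PluginConfigParams::YES}});
        auto request = execNet.CreateInferRequest();
        request.Infer();
        return 0;
    }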