From 87b9a5b6e94378d1e659f08335fb922bae798efc Mon Sep 17 00:00:00 2001 From: Yury Gaydaychuk Date: Tue, 26 Oct 2021 12:55:08 +0300 Subject: [PATCH 1/5] [CPU] Dynamic support for ShapeOf (#7875) --- .../src/mkldnn_plugin/cpu_types.cpp | 3 + .../src/mkldnn_plugin/cpu_types.h | 1 + .../mkldnn_plugin/nodes/mkldnn_shapeof.cpp | 79 ++++++++ .../src/mkldnn_plugin/nodes/mkldnn_shapeof.h | 39 ++++ .../plugin/cpu/single_layer_tests/shapeof.cpp | 175 ++++++++++++++++++ 5 files changed, 297 insertions(+) create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_shapeof.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_shapeof.h create mode 100644 inference-engine/tests/functional/plugin/cpu/single_layer_tests/shapeof.cpp diff --git a/inference-engine/src/mkldnn_plugin/cpu_types.cpp b/inference-engine/src/mkldnn_plugin/cpu_types.cpp index 4af6683bf78..00b22d90937 100644 --- a/inference-engine/src/mkldnn_plugin/cpu_types.cpp +++ b/inference-engine/src/mkldnn_plugin/cpu_types.cpp @@ -65,6 +65,7 @@ const InferenceEngine::details::caseless_unordered_map<std::string, Type> type_t { "Reshape", Reshape }, { "Squeeze", Reshape }, { "Unsqueeze", Reshape }, + { "ShapeOf", ShapeOf }, { "Softmax", Softmax }, { "Reorder", Reorder }, { "BatchToSpace", BatchToSpace }, @@ -225,6 +226,8 @@ std::string NameFromType(const Type type) { return "StridedSlice"; case Reshape: return "Reshape"; + case ShapeOf: + return "ShapeOf"; case Tile: return "Tile"; case ROIAlign: diff --git a/inference-engine/src/mkldnn_plugin/cpu_types.h b/inference-engine/src/mkldnn_plugin/cpu_types.h index 95371b6c847..0062c034c5b 100644 --- a/inference-engine/src/mkldnn_plugin/cpu_types.h +++ b/inference-engine/src/mkldnn_plugin/cpu_types.h @@ -33,6 +33,7 @@ enum Type { Eltwise, MatMul, Reshape, + ShapeOf, Tile, ROIAlign, ROIPooling, diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shapeof.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shapeof.cpp new file mode 100644 index 00000000000..28097fcaf58 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shapeof.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_shapeof.h" +#include + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNShapeOfNode::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept { + try { + if (!one_of(op->get_type_info(), + ngraph::op::v0::ShapeOf::get_type_info_static(), + ngraph::op::v3::ShapeOf::get_type_info_static())) { + errorMessage = "Node is not an instance of ShapeOf from the operation set v1 or v3."; + return false; + } + } catch (...) 
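+ // editor's note: the catch-all below deliberately maps any exception thrown while
+ // querying the type info to "unsupported operation", since this helper is noexcept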
{ + return false; + } + return true; +} + +MKLDNNShapeOfNode::MKLDNNShapeOfNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (isSupportedOperation(op, errorMessage)) { + errorPrefix = "ShapeOf layer with name '" + getName() + "' "; + if (op->get_input_partial_shape(0).size() == 0) + IE_THROW() << errorPrefix << "gets unsupported input 0D tensor (scalar)"; + } else { + IE_THROW(NotImplemented) << errorMessage; + } +} + +void MKLDNNShapeOfNode::getSupportedDescriptors() { + if (!descs.empty()) + return; + if (getParentEdges().size() != 1) + IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size(); + if (getChildEdges().empty()) + IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getChildEdges().size(); +} + +void MKLDNNShapeOfNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + Precision precision = getOriginalInputPrecisionAtPort(0); + + const LayoutType dataFormats[4] = { LayoutType::ncsp, LayoutType::nspc, LayoutType::nCsp16c, LayoutType::nCsp8c }; + for (const auto &df : dataFormats) { + addSupportedPrimDesc({{df, precision}}, + {{LayoutType::ncsp, Precision::I32}}, + impl_desc_type::ref); + } +} + +void MKLDNNShapeOfNode::execute(mkldnn::stream strm) { + auto inPtr = getParentEdgeAt(0)->getMemoryPtr(); + auto outPtr = getChildEdgeAt(0)->getMemoryPtr(); + auto inDims = inPtr->getStaticDims(); + size_t dimsCount = inDims.size(); + if (outPtr->getStaticDims().size() != 1 || dimsCount != outPtr->getStaticDims()[0]) + IE_THROW() << errorPrefix << "has inconsistent input shape and output size"; + + auto *dst = reinterpret_cast<int *>(getChildEdgeAt(0)->getMemoryPtr()->GetPtr()); + + for (size_t i = 0; i < dimsCount; i++) { + dst[i] = inDims[i]; + } +} + +bool MKLDNNShapeOfNode::created() const { + return getType() == ShapeOf; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNShapeOfNode, ShapeOf) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shapeof.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shapeof.h new file mode 100644 index 00000000000..2a7eb9560e6 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shapeof.h @@ -0,0 +1,39 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNShapeOfNode : public MKLDNNNode { +public: + MKLDNNShapeOfNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override { + if (inputShapesDefined()) + updateLastInputDims(); + }; + void execute(mkldnn::stream strm) override; + bool created() const override; + bool needPrepareParams() const override {return false;}; + void executeDynamicImpl(mkldnn::stream strm) override { execute(strm); } + std::vector<VectorDims> shapeInfer() const override { + return {VectorDims{getParentEdgesAtPort(0)[0]->getMemory().getStaticDims().size()}}; + } + + static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept; + +private: + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/shapeof.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/shapeof.cpp new file mode 100644 
index 00000000000..204f50453f3 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/shapeof.cpp @@ -0,0 +1,175 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils/cpu_test_utils.hpp" + +#include "ngraph_functions/builders.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; + +namespace CPULayerTestsDefinitions { +typedef std::tuple< + std::pair<std::vector<ngraph::PartialShape>, std::vector<std::vector<ngraph::Shape>>> // input shape +> ShapeOfSpecificParams; + +typedef std::tuple< + ShapeOfSpecificParams, + InferenceEngine::Precision, // Net precision + LayerTestsUtils::TargetDevice // Device name > ShapeOfLayerTestParams; + +typedef std::tuple< + CPULayerTestsDefinitions::ShapeOfLayerTestParams, + CPUSpecificParams> ShapeOfLayerCPUTestParamsSet; + +class ShapeOfLayerCPUTest : public testing::WithParamInterface<ShapeOfLayerCPUTestParamsSet>, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo<ShapeOfLayerCPUTestParamsSet> obj) { + CPULayerTestsDefinitions::ShapeOfLayerTestParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = obj.param; + std::string td; + Precision netPr; + std::pair<std::vector<ngraph::PartialShape>, std::vector<std::vector<ngraph::Shape>>> shapes; + + ShapeOfSpecificParams shapeOfPar; + std::tie(shapeOfPar, netPr, td) = basicParamsSet; + std::tie(shapes) = shapeOfPar; + std::ostringstream result; + result << "ShapeOfTest_"; + result << std::to_string(obj.index) << "_"; + result << "Prec=" << netPr.name() << "_"; + result << CPUTestsBase::getTestCaseName(cpuParams) << "_"; + result << "IS="; + for (const auto& shape : shapes.second) { + result << "("; + for (const auto& item : shape) { + result << CommonTestUtils::vec2str(item); + } + result << ")_"; + } + return result.str(); + } +protected: + void SetUp() override { + CPULayerTestsDefinitions::ShapeOfLayerTestParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = this->GetParam(); + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + + CPULayerTestsDefinitions::ShapeOfSpecificParams shapeOfParams; + auto netPrecision = InferenceEngine::Precision::UNSPECIFIED; + std::pair<std::vector<ngraph::PartialShape>, std::vector<std::vector<ngraph::Shape>>> shapes; + std::tie(shapeOfParams, netPrecision, targetDevice) = basicParamsSet; + inPrc = netPrecision; + outPrc = Precision::I32; + std::tie(shapes) = shapeOfParams; + targetStaticShapes = shapes.second; + inputDynamicShapes = shapes.first; + + auto inType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto param = ngraph::builder::makeParams(inType, {targetStaticShapes.front().front()}); + auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(param)); + auto shapeOf = std::make_shared<ngraph::opset3::ShapeOf>(paramOuts[0], ngraph::element::i32); + shapeOf->get_rt_info() = getCPUInfo(); + selectedType = std::string("ref_") + inPrc.name(); + + const ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(shapeOf)}; + function = std::make_shared<ngraph::Function>(results, param, "ShapeOf"); + functionRefs = ngraph::clone_function(*function); + } +}; + +TEST_P(ShapeOfLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + Run(); + CheckPluginRelatedResults(executableNetwork, "ShapeOf"); +} + +namespace { + +/* CPU PARAMS */ +std::vector<CPUSpecificParams> filterCPUInfoForDevice(const size_t dimsCount = 3) { + std::vector<CPUSpecificParams> resCPUParams; + if (dimsCount == 5) { + resCPUParams.push_back(CPUSpecificParams{{nCdhw16c}, {x}, {}, {}}); + 
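+ // the remaining 5D entries below add the blocked 8-channel, planar and
+ // channels-last layouts; each variant still yields the same planar 1D I32 output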
resCPUParams.push_back(CPUSpecificParams{{nCdhw8c}, {x}, {}, {}}); + resCPUParams.push_back(CPUSpecificParams{{ncdhw}, {x}, {}, {}}); + resCPUParams.push_back(CPUSpecificParams{{ndhwc}, {x}, {}, {}}); + } else if (dimsCount == 4) { + resCPUParams.push_back(CPUSpecificParams{{nChw16c}, {x}, {}, {}}); + resCPUParams.push_back(CPUSpecificParams{{nChw8c}, {x}, {}, {}}); + resCPUParams.push_back(CPUSpecificParams{{nchw}, {x}, {}, {}}); + resCPUParams.push_back(CPUSpecificParams{{nhwc}, {x}, {}, {}}); + } else { + resCPUParams.push_back(CPUSpecificParams{{nCw16c}, {x}, {}, {}}); + resCPUParams.push_back(CPUSpecificParams{{nCw8c}, {x}, {}, {}}); + resCPUParams.push_back(CPUSpecificParams{{abc}, {x}, {}, {}}); + resCPUParams.push_back(CPUSpecificParams{{acb}, {x}, {}, {}}); + } + + return resCPUParams; +} + +const std::vector<InferenceEngine::Precision> netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::BF16, + InferenceEngine::Precision::I32, + InferenceEngine::Precision::I8 +}; + +std::vector<std::pair<std::vector<ngraph::PartialShape>, std::vector<std::vector<ngraph::Shape>>>> inShapesDynamic3d = { + {{ngraph::PartialShape{-1, -1, -1}}, + {{{ 8, 5, 4 }, { 8, 5, 3 }, { 8, 5, 2 }}}}, + {{ngraph::PartialShape{-1, -1, -1}}, + {{{ 1, 2, 4 }, { 1, 2, 3 }, { 1, 2, 2 }}}} +}; +std::vector<std::pair<std::vector<ngraph::PartialShape>, std::vector<std::vector<ngraph::Shape>>>> inShapesDynamic4d = { + {{ngraph::PartialShape{-1, -1, -1, -1}}, + {{{ 8, 5, 3, 4 }, { 8, 5, 3, 3 }, { 8, 5, 3, 2 }}}}, + {{ngraph::PartialShape{-1, -1, -1, -1}}, + {{{ 1, 2, 3, 4 }, { 1, 2, 3, 3 }, { 1, 2, 3, 2 }}}} +}; +std::vector<std::pair<std::vector<ngraph::PartialShape>, std::vector<std::vector<ngraph::Shape>>>> inShapesDynamic5d = { + {{ngraph::PartialShape{-1, -1, -1, -1, -1}}, + {{{ 8, 5, 3, 2, 4 }, { 8, 5, 3, 2, 3 }, { 8, 5, 3, 2, 2 }}}}, + {{ngraph::PartialShape{-1, -1, -1, -1, -1}}, + {{{ 1, 2, 3, 4, 4 }, { 1, 2, 3, 4, 3 }, { 1, 2, 3, 4, 2 }}}} +}; +const auto params5dDynamic = ::testing::Combine( + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapesDynamic5d)), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDevice(5))); + +const auto params4dDynamic = ::testing::Combine( + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapesDynamic4d)), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDevice(4))); + +const auto params3dDynamic = ::testing::Combine( + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapesDynamic3d)), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUInfoForDevice(3))); + +// We don't check static case, because of constant folding +INSTANTIATE_TEST_SUITE_P(smoke_ShapeOf3dDynamicLayoutTest, ShapeOfLayerCPUTest, + params3dDynamic, ShapeOfLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_ShapeOf4dDynamicLayoutTest, ShapeOfLayerCPUTest, + params4dDynamic, ShapeOfLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_ShapeOf5dDynamicLayoutTest, ShapeOfLayerCPUTest, + params5dDynamic, ShapeOfLayerCPUTest::getTestCaseName); +} // namespace +} // namespace CPULayerTestsDefinitions From bd2fdca9de8eb59ea06ae62cc5a9d2789936bf43 Mon Sep 17 00:00:00 2001 From: Bartek Szmelczynski Date: Tue, 26 Oct 2021 12:54:47 +0200 Subject: [PATCH 2/5] remove CT from ref impl (#7959) --- .../reference/scatter_elements_update.hpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/scatter_elements_update.hpp 
b/ngraph/core/reference/include/ngraph/runtime/reference/scatter_elements_update.hpp index d907201defe..6449e115d98 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/scatter_elements_update.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/scatter_elements_update.hpp @@ -29,21 +29,19 @@ void scatter_elem_update(const DataType* input_data, // output[i][indices[i][j][k]][k] = updates[i][j][k] if axis = 1, // output[i][j][indices[i][j][k]] = updates[i][j][k] if axis = 2 - NGRAPH_SUPPRESS_DEPRECATED_START - CoordinateTransform indices_transform{indices_shape}; - CoordinateTransform data_transform{data_shape}; + CoordinateTransformBasic indices_transform{indices_shape}; + CoordinateTransformBasic data_transform{data_shape}; + const auto indices_strides = row_major_strides(indices_shape); + const auto data_strides = row_major_strides(data_shape); for (const Coordinate& indices_cord : indices_transform) { - const size_t indices_idx = indices_transform.index(indices_cord); + const size_t indices_idx = + std::inner_product(indices_cord.begin(), indices_cord.end(), indices_strides.begin(), 0); Coordinate out_cord(indices_cord); out_cord.at(axis) = indices[indices_idx]; - NGRAPH_CHECK(data_transform.has_source_coordinate(out_cord), - "Provided index coordinates are out of input data bounds: ", - out_cord, - "."); - out_buf[data_transform.index(out_cord)] = updates[indices_idx]; + const auto out_idx = std::inner_product(out_cord.begin(), out_cord.end(), data_strides.begin(), 0); + out_buf[out_idx] = updates[indices_idx]; } - NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime From 4a96d14adce5a0152d11b04e6aa601060850c530 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 26 Oct 2021 14:22:29 +0300 Subject: [PATCH 3/5] [nGraph] Reshape: upper_bound propagation fix (#8177) --- ngraph/core/src/op/reshape.cpp | 9 ++++++++- ngraph/test/type_prop/reshape.cpp | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/ngraph/core/src/op/reshape.cpp b/ngraph/core/src/op/reshape.cpp index 257cc6bce4a..bd0f2fa5f5a 100644 --- a/ngraph/core/src/op/reshape.cpp +++ b/ngraph/core/src/op/reshape.cpp @@ -81,7 +81,7 @@ void op::v1::Reshape::validate_and_infer_types() { std::tie(lb, ub) = evaluate_both_bounds(get_input_source_output(1)); if (lb && ub) { const auto lower_bound = std::make_shared<op::v0::Constant>(lb)->cast_vector<int64_t>(); - const auto upper_bound = std::make_shared<op::v0::Constant>(ub)->cast_vector<int64_t>(); + auto upper_bound = std::make_shared<op::v0::Constant>(ub)->cast_vector<int64_t>(); shape_can_be_calculated = true; NGRAPH_CHECK(lower_bound.size() == upper_bound.size()); for (size_t i = 0; i < lower_bound.size(); ++i) { @@ -94,6 +94,13 @@ void op::v1::Reshape::validate_and_infer_types() { NODE_VALIDATION_CHECK(this, minus_one_idx == -1, "More than one dimension has size of -1"); minus_one_idx = static_cast<int64_t>(i); } + + // We must handle i32 fully dynamic dimension in a special way + if (get_input_element_type(1) == element::i32 && + upper_bound[i] == std::numeric_limits<std::int32_t>::max()) { + upper_bound[i] = std::numeric_limits<std::int64_t>::max(); + } + + reshape_pattern.emplace_back(lower_bound[i], upper_bound[i]); } // For scalar case reshape_pattern should be empty but scalar reshape pattern should be empty diff --git a/ngraph/test/type_prop/reshape.cpp b/ngraph/test/type_prop/reshape.cpp index 083dddc5e5e..0d74ac2e4db 100644 --- a/ngraph/test/type_prop/reshape.cpp +++ b/ngraph/test/type_prop/reshape.cpp @@ -561,3 +561,17 @@ TEST(type_prop, reshape_to_scalar_3) { make_shared<op::v1::Reshape>(param, 
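 // tail of the pre-existing reshape_to_scalar_3 test: a rank-0 pattern constant
 // is invalid, so the enclosing assertion expects construction to throw std::exception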
op::Constant::create(element::i64, {}, std::vector<int64_t>{100}), false), std::exception); } + +TEST(type_prop, dynamic_shape_propagation_with_i32_precision) { + auto param = make_shared<op::Parameter>(element::f32, PartialShape{1, -1, -1}); + auto shape_of = std::make_shared<op::v3::ShapeOf>(param, element::i32); + + auto indices = op::Constant::create(element::i32, {3}, {1, 2, 0}); + auto axis = op::Constant::create(element::i32, {1}, {0}); + auto gather = std::make_shared<op::v1::Gather>(shape_of, indices, axis); + + auto reshape = std::make_shared<op::v1::Reshape>(param, gather, true); + + ASSERT_EQ(reshape->get_element_type(), element::f32); + ASSERT_EQ(reshape->get_output_partial_shape(0), (PartialShape{-1, -1, 1})); +} From a02eafb397b23416078afdd1f3eabb84a0fb77a1 Mon Sep 17 00:00:00 2001 From: Egor Duplensky Date: Tue, 26 Oct 2021 14:49:36 +0300 Subject: [PATCH 4/5] [CPU] [BF16] Do not enforce BF16 for graph tail (#6114) --- .../src/mkldnn_plugin/mkldnn_graph.cpp | 66 +++++++++++++++---- .../src/mkldnn_plugin/mkldnn_node.cpp | 6 ++ .../src/mkldnn_plugin/mkldnn_node.h | 1 + .../plugin/cpu/bfloat16/gather_multiply.cpp | 2 +- .../plugin/cpu/test_utils/cpu_test_utils.cpp | 3 + 5 files changed, 63 insertions(+), 15 deletions(-) diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp index e962d362293..61928e183f5 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp @@ -1209,21 +1209,59 @@ bool MKLDNNGraph::InsertNode(MKLDNNNodePtr parent, MKLDNNNodePtr child, MKLDNNNo void MKLDNNGraph::EnforceBF16() { // Floating point parts of FP32 + INT8 or FP32 + BIN mixed precision models will be executed in BF16 precision // only if enforceBF16 flag was set manually because current performance is not good enough to enable it by default - if (implication(isQuantized(), config.manualEnforceBF16)) { - for (auto &node : graphNodes) { - if (node->getType() != Input && node->getType() != Output) { - for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) { - auto &parent = node->getParentEdgesAtPort(i)[0]->getParent(); - if (!(parent->getType() == Input && parent->isConstant()) && // exclude nodes after Constant Inputs - !(parent->getType() == Input && node->getType() == Eltwise) && // exclude Eltwise after Input since it supports conversion to BF16 - node->getOriginalInputPrecisionAtPort(i) == Precision::FP32) - node->setOriginalInputPrecisionAtPort(i, Precision::BF16); - } + if (!implication(isQuantized(), config.manualEnforceBF16)) + return; + /* list of node types that must be forced to be executed in BF16 precision + * because of performance gains */ + static const std::unordered_set<Type, std::hash<int>> significantNodes { // std::hash<int> is necessary for old compilers (defect in the C++11 standard) + Convolution, // conv nets + FullyConnected, // conv / bert nets + RNNCell, // recurrent nets + RNNSeq, // recurrent nets + MatMul, // bert nets + ROIPooling, // object detection nets + Interpolate, // super resolution nets + }; 

- for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) { - if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32) - node->setOriginalOutputPrecisionAtPort(i, Precision::BF16); - } + std::function<void(const MKLDNNNodePtr&, std::unordered_set<MKLDNNNodePtr>& skipNodes)> searchForNodesToSkip; + searchForNodesToSkip = [&](const MKLDNNNodePtr& node, std::unordered_set<MKLDNNNodePtr>& skipNodes) -> void { + for (size_t i = 0; i < node->getParentEdges().size(); i++) { + const auto& parent = node->getParentEdgeAt(i)->getParent(); + if (significantNodes.count(parent->getType())) // stop at significant nodes + 
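+ // a parent listed in significantNodes keeps BF16 anyway, so the backward walk
+ // stops here: neither that parent nor anything it consumes counts as graph tail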
continue; + + const auto res = skipNodes.insert(parent); + if (res.second) // node not visited yet + searchForNodesToSkip(parent, skipNodes); + } + }; + + /* Skip BF16 enforcement for tail of the graph by forming set of nodes to skip. + * Necessary to maintain accuracy. + * Experiments show zero performance impact on average */ + std::unordered_set<MKLDNNNodePtr> nodesToSkip; + // starting from output nodes + for (const auto& entry : outputNodesMap) { + const auto& node = entry.second; + searchForNodesToSkip(node, nodesToSkip); + } + + for (const auto& node : graphNodes) { + if (nodesToSkip.count(node) && !node->enforceBF16evenForGraphTail) + continue; + + if (node->getType() != Input && node->getType() != Output) { + for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) { + const auto &parent = node->getParentEdgesAtPort(i)[0]->getParent(); + if (!(parent->getType() == Input && parent->isConstant()) && // exclude nodes after Constant Inputs + !(parent->getType() == Input && node->getType() == Eltwise) && // exclude Eltwise after Input since it supports conversion to BF16 + node->getOriginalInputPrecisionAtPort(i) == Precision::FP32) + node->setOriginalInputPrecisionAtPort(i, Precision::BF16); + } + + for (size_t i = 0; i < node->getOriginalOutputsNumber(); i++) { + if (node->getOriginalOutputPrecisionAtPort(i) == Precision::FP32) + node->setOriginalOutputPrecisionAtPort(i, Precision::BF16); } } } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index d095b02d1f2..31d36aece4a 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -159,6 +159,12 @@ MKLDNNNode::MKLDNNNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::en } } } + + const auto it = rtInfo.find("enforceBF16evenForGraphTail"); + if (it != rtInfo.end()) { + if (const auto value = std::dynamic_pointer_cast<ngraph::VariantImpl<bool>>(it->second)) + enforceBF16evenForGraphTail = value->get(); + } } MKLDNNNode::MKLDNNNode(const std::string& type, const std::string& name, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &w_cache) diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index b7a3622cb77..7e089789720 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -593,6 +593,7 @@ protected: std::vector implPriorities; std::vector inputMemoryFormatsFilter; std::vector outputMemoryFormatsFilter; + bool enforceBF16evenForGraphTail = false; std::string originalLayers; // contains names of the original layers separated by comma diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_multiply.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_multiply.cpp index e4283a18931..84c9824f22b 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_multiply.cpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_multiply.cpp @@ -100,7 +100,7 @@ protected: // performance counters expectedPrecisions["Matmul_0"] = "BF16"; - expectedPrecisions["Mul_1"] = "BF16"; + expectedPrecisions["Mul_1"] = netPrecision.name(); // tail kept in FP32 precision } }; diff --git a/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp b/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp index f01bd40b96b..4b515dcd144 100644 --- a/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp +++ 
b/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp @@ -4,6 +4,7 @@ #include "cpu_test_utils.hpp" #include "utils/rt_info/memory_formats_attribute.hpp" +#include namespace CPUTestUtils { @@ -257,6 +258,8 @@ CPUTestsBase::makeCPUInfo(std::vector inFmts, std::vector>(impls2str(priority))}); } + cpuInfo.insert({"enforceBF16evenForGraphTail", ov::make_variant(true)}); + return cpuInfo; } From ac5b0e881a08cb01b160fec983710e0bbb37d94c Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Tue, 26 Oct 2021 15:24:37 +0300 Subject: [PATCH 5/5] [IE GPU] Skip sporadic failure in CI (#8109) --- .../plugin/gpu/shared_tests_instances/skip_tests_config.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp index 5f1ff96d654..23a373f94ff 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp @@ -88,5 +88,7 @@ std::vector disabledTestPatterns() { R"(.*CanSetInBlobWithDifferentPrecision/netPRC=BIN.*)", R"(.*CanSetOutBlobWithDifferentPrecision/netPRC=(I4|U4).*)", R"(.*CanSetOutBlobWithDifferentPrecision/netPRC=BIN.*)", + // TODO: Issue: 68712 + R"(.*.MatMul.*CompareWithRefs.*IS0=\(1.5\)_IS1=\(1.5\).*transpose_a=0.*transpose_b=1.*CONSTANT.*FP16.*UNSPECIFIED.*UNSPECIFIED.*ANY.*)", }; }
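Editor's note on the skip list above: the harness code that consumes disabledTestPatterns() is not part of this patch. As a rough, hypothetical sketch (the function name and matching semantics are assumptions; the real FuncTestUtils logic may differ), a runner could apply the regular expressions to the full test name like this:

#include <regex>
#include <string>
#include <vector>

// Returns true when the currently running test matches any disabled pattern.
// Assumed behaviour: each entry is treated as an ECMAScript regex and tested
// against the fully qualified gtest name, as the ".*...*" patterns above suggest.
bool currentTestIsDisabled(const std::string& fullTestName,
                           const std::vector<std::string>& disabledPatterns) {
    for (const auto& pattern : disabledPatterns) {
        if (std::regex_search(fullTestName, std::regex(pattern)))
            return true;  // skip this test
    }
    return false;
}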