Compare commits

...

7 Commits

Author SHA1 Message Date
Maxim Shevtsov
834755680d fp32 outputs fixup to properly handle negative values (#2529) 2020-10-05 13:51:41 +03:00
Maxim Shevtsov
c8b783f644 Pre.2021.1.submission (#2094)
* fixed code and updated unit tests to accommodate auto-reshaping graphs, to unlock full validation

* [CPU][BF16] bf16 for Gemm or MatMul was enabled (#1920)

# Conflicts:
#	inference-engine/thirdparty/mkl-dnn

* Fuse EmbeddingBag

* [IE CLDNN] Fix result storing in leftover's branch (#2050)

Co-authored-by: Alexey Varyzgin <alexey.varyzgin@intel.com>
Co-authored-by: Vafin, Maxim <maxim.vafin@intel.com>
Co-authored-by: Ilya Znamenskiy <ilya.znamenskiy@intel.com>
2020-09-07 17:24:59 +03:00
Maxim Shevtsov
1e6ca0627a MLPerf's pre.2021.1.submission branch update (DO NOT REVIEW) (#2083)
* fixed code and updated unit tests to accommodate auto-reshaping graphs, to unlock full validation

* [CPU][BF16] bf16 for Gemm or MatMul was enabled (#1920)

# Conflicts:
#	inference-engine/thirdparty/mkl-dnn

* Fuse EmbeddingBag

Co-authored-by: Alexey Varyzgin <alexey.varyzgin@intel.com>
Co-authored-by: Vafin, Maxim <maxim.vafin@intel.com>
2020-09-04 18:20:25 +03:00
Maxim Shevtsov
05a57ebd8e fixed code and updated unit tests to accommodate auto-reshaping graphs, to unlock full validation (#1808) 2020-08-17 20:17:30 +03:00
Maxim Shevtsov
e8a178e196 fixed unit tests to accommodate auto-reshaping graphs, to unlock full validation (#1795) 2020-08-14 21:52:22 +03:00
myshevts
0aead5c070 Fuses duplicated QuantizeLinear and DequantizeLinear nodes, (redundancy in the official NV's int8 MLPerf BERT model that is not good for the OV), per discussion with NV reps 2020-08-14 15:57:51 +03:00
myshevts
dcfaeedb6f multi-graph for automatic dynamic sequence handling via auto-pre-reshaping 2020-08-14 15:57:51 +03:00
19 changed files with 503 additions and 111 deletions

View File

@@ -189,6 +189,15 @@ DECLARE_CONFIG_VALUE(NO);
*/
DECLARE_CONFIG_KEY(CPU_THREADS_NUM);
/**
* @brief If set, enables dynamic sequence recognition for 1D inputs for the CPU, the value defines the min seq boundary
*/
DECLARE_CONFIG_KEY(CPU_DYNAMIC_SEQUENCE);
/**
* @brief Step to pre-reshape for the dynamic sequence recognition
*/
DECLARE_CONFIG_KEY(CPU_DYNAMIC_SEQUENCE_STEP);
/**
* @brief The name for setting CPU affinity per thread option.
*

View File

@@ -224,7 +224,7 @@ public:
(precisionInfo.value == Precision::Q78) || (precisionInfo.value == Precision::I16) ||
(precisionInfo.value == Precision::I8) || (precisionInfo.value == Precision::I32) ||
(precisionInfo.value == Precision::I64) || (precisionInfo.value == Precision::BIN) ||
(precisionInfo.value == Precision::CUSTOM);
(precisionInfo.value == Precision::BF16) || (precisionInfo.value == Precision::CUSTOM);
}
protected:

View File

@@ -13,7 +13,7 @@ namespace MKLDNNPlugin {
class BF16Transformer {
const InferenceEngine::details::caseless_set<std::string> _initbf16 =
{ "convolution", "fullyconnected", "innerproduct" };
{ "convolution", "fullyconnected", "innerproduct", "gemm" };
const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
{ "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "logistic",
"exp", "gelu", "clamp", "swish", "prelu", "pooling", "norm", "gather", "memory" };

View File

@@ -57,6 +57,14 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
// zero and any negative value will be treated
// as default batch size
batchLimit = std::max(val_i, 0);
} else if (key == PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE) {
int val_i = std::stoi(val);
// zero and any negative value will be treated
// as default sequence size, so no auto-reshaping will happen
dynamicSequence = std::max(val_i, 0);
} else if (key == PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE_STEP) {
int val_i = std::stoi(val);
dynamicSequenceStep = std::max(val_i, 0);
} else if (key == PluginConfigParams::KEY_PERF_COUNT) {
if (val == PluginConfigParams::YES) collectPerfCounters = true;
else if (val == PluginConfigParams::NO) collectPerfCounters = false;
@@ -110,6 +118,15 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
}
if (exclusiveAsyncRequests) // Exclusive request feature disables the streams
streamExecutorConfig._streams = 1;
if (dynamicSequence && !dynamicSequenceStep) {
THROW_IE_EXCEPTION << "Dynamic sequence recognition is enabled, but the "
<< PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE_STEP
<< " is not set!";
} else if (!dynamicSequence && dynamicSequenceStep) {
THROW_IE_EXCEPTION << "Dynamic sequence recognition " << PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE
<< " is not enabled while the " << PluginConfigParams::KEY_CPU_DYNAMIC_SEQUENCE_STEP << " is set!";
}
updateProperties();
}

View File

@@ -21,6 +21,8 @@ struct Config {
bool collectPerfCounters = false;
bool exclusiveAsyncRequests = false;
bool enableDynamicBatch = false;
int dynamicSequence = 0;
int dynamicSequenceStep = 0;
std::string dumpToDot = "";
std::string dumpQuantizedGraphToDot = "";
std::string dumpQuantizedGraphToIr = "";

View File

@@ -38,70 +38,76 @@ MKLDNNExecNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap network
return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs, std::static_pointer_cast<MKLDNNExecNetwork>(shared_from_this()));
}
MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network,
MKLDNNExecNetwork::MKLDNNExecNetwork(ReshapedCNNNetworks networks,
const Config &cfg,
const MKLDNNExtensionManager::Ptr& extMgr,
NumaNodesWeights &numaNodesWeights) :
InferenceEngine::ExecutableNetworkThreadSafeDefault{nullptr, nullptr},
extensionManager(extMgr),
_cfg{cfg},
_name{network.getName()} {
_name{networks.begin()->second.getName()} {
OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, "MKLDNNExecNetwork::MKLDNNExecNetwork");
// we are cloning network if we have statistics and we can transform network.
_clonedNetwork = cloneNet(network);
typedef std::map<int, InferenceEngine::details::CNNNetworkImplPtr, sorting_order> PluginInternalNetworks;
PluginInternalNetworks plugin_internal_networks;
for (auto n : networks) {
// we are cloning network if we have statistics and we can transform network.
auto _clonedNetwork = cloneNet(n.second);
if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
auto params = LayerTransformation::Params(true, // updatePrecisions
true, // quantizeOutputs
true, // weightsToConst
LayerTransformation::QuantizedTensorAlignment::UpdateLevel, // quantizedTensorAlignmentOnActivations
LayerTransformation::QuantizedTensorAlignment::None, // quantizedTensorAlignmentOnWeights
true, // roundQuantizedValues
true, // updateBiases
true); // supportAsymmetricQuantization
LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params).
add<ConvolutionTransformation>(LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }), "Convolution").
addCleanup<ScaleShiftToConvolutionTransformation>(
LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }),
"ScaleShift"));
transformer.transform(*_clonedNetwork);
if (_cfg.lpTransformsMode == Config::LPTransformsMode::On) {
auto params = LayerTransformation::Params(true, // updatePrecisions
true, // quantizeOutputs
true, // weightsToConst
LayerTransformation::QuantizedTensorAlignment::UpdateLevel, // quantizedTensorAlignmentOnActivations
LayerTransformation::QuantizedTensorAlignment::None, // quantizedTensorAlignmentOnWeights
true, // roundQuantizedValues
true, // updateBiases
true); // supportAsymmetricQuantization
LowPrecisionTransformer transformer(LowPrecisionTransformer::getAllTransformations(params).
add<ConvolutionTransformation>(
LayerTransformation::Params(params).setPrecisionsOnActivations({Precision::U8}), "Convolution").
addCleanup<ScaleShiftToConvolutionTransformation>(
LayerTransformation::Params(params).setPrecisionsOnActivations({Precision::U8}),
"ScaleShift"));
transformer.transform(*_clonedNetwork);
// Check if network is INT8 or Binary.
// BF16 transformations were disabled since CPU plug-in doesn't support mixed precision execution:
// BF16 + INT8 or BF16 + BIN.
bool isFloatModel = true;
CNNNetworkIterator i(&network);
while (i != CNNNetworkIterator()) {
if (CaselessEq<std::string>()((*i)->type, "FakeQuantize")) {
isFloatModel = false;
break;
// Check if network is INT8 or Binary.
// BF16 transformations were disabled since CPU plug-in doesn't support mixed precision execution:
// BF16 + INT8 or BF16 + BIN.
bool isFloatModel = true;
CNNNetworkIterator i(&n.second.operator InferenceEngine::ICNNNetwork &());
while (i != CNNNetworkIterator()) {
if (CaselessEq<std::string>()((*i)->type, "FakeQuantize")) {
isFloatModel = false;
break;
}
i++;
}
if (with_cpu_x86_bfloat16() && isFloatModel) {
BF16Transformer bf16Transformer;
CNNNetwork cnnetwork(_clonedNetwork);
// If enforceBF16 flag was set, BF16 transformation applies for all layers supported by CPU plugin.
// Overwise, only layers marked as BF16 in 'cnnetwork' will be performed in bfloat16 mode.
// CPU plugin throws an exception, if marked as BF16 layers have not supported by CPU plugin.
if (cfg.enforceBF16 == true)
bf16Transformer.convertToBFloat16(cnnetwork);
} else {
BF16Transformer bf16Transformer;
CNNNetwork cnnetwork(_clonedNetwork);
bf16Transformer.convertToFloat(cnnetwork);
}
i++;
}
if (with_cpu_x86_bfloat16() && isFloatModel) {
BF16Transformer bf16Transformer;
CNNNetwork cnnetwork(_clonedNetwork);
// If enforceBF16 flag was set, BF16 transformation applies for all layers supported by CPU plugin.
// Overwise, only layers marked as BF16 in 'cnnetwork' will be performed in bfloat16 mode.
// CPU plugin throws an exception, if marked as BF16 layers have not supported by CPU plugin.
if (cfg.enforceBF16 == true)
bf16Transformer.convertToBFloat16(cnnetwork);
} else {
BF16Transformer bf16Transformer;
CNNNetwork cnnetwork(_clonedNetwork);
bf16Transformer.convertToFloat(cnnetwork);
}
}
MKLDNNGraph::ApplyUnrollPasses(static_cast<ICNNNetwork&>(*_clonedNetwork));
MKLDNNGraph::ApplyUnrollPasses(static_cast<ICNNNetwork &>(*_clonedNetwork));
if (_cfg.enableDynamicBatch) {
// check topology for applicability
if (!CanProcessDynBatch(*_clonedNetwork)) {
THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
// check topology for applicability
if (!CanProcessDynBatch(*_clonedNetwork)) {
THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
}
}
plugin_internal_networks[n.first] = _clonedNetwork;
}
if (cfg.exclusiveAsyncRequests) {
@@ -131,19 +137,23 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
_graphs = decltype(_graphs){[&] {
// TODO: Remove `cloneNet` to `localNetwork` when `MKLDNNGraph::CreateGraph`
// is fixed and does not change content of network passed (CVS-26420)
auto localNetwork = cloneNet(static_cast<ICNNNetwork&>(*_clonedNetwork));
auto graph = std::make_shared<MKLDNNGraph>();
{
std::unique_lock<std::mutex> lock{_cfgMutex};
graph->setConfig(_cfg);
SequenceGraphs m;
for (auto n : plugin_internal_networks) {
auto localNetwork = cloneNet(static_cast<ICNNNetwork&>(*n.second.get()));
auto graph = std::make_shared<MKLDNNGraph>();
{
std::unique_lock<std::mutex> lock{_cfgMutex};
graph->setConfig(_cfg);
}
int numaNode = 0;
auto *streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor *>(_taskExecutor.get());
if (nullptr != streamExecutor) {
numaNode = streamExecutor->GetNumaNodeId();
}
graph->CreateGraph(static_cast<ICNNNetwork &>(*localNetwork), extensionManager, numaNodesWeights[numaNode]);
m[n.first] = graph;
}
int numaNode = 0;
auto* streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(_taskExecutor.get());
if (nullptr != streamExecutor) {
numaNode = streamExecutor->GetNumaNodeId();
}
graph->CreateGraph(static_cast<ICNNNetwork&>(*localNetwork), extensionManager, numaNodesWeights[numaNode]);
return graph;
return m;
}};
_taskExecutor->runAndWait({std::thread::hardware_concurrency(), [this] {_graphs.local();}});
@@ -152,7 +162,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
// of MemoryLayer implementation. It uses output edge of MemoryLayer
// producer as storage for tensor to keep it between infer calls.
if (_graphs.size() == 1) {
for (auto &node : _graphs.begin()->get()->GetNodes()) {
for (auto &node : _graphs.begin()->begin()->second->GetNodes()) {
if (node->getType() == MemoryInput) {
auto memoryNode = dynamic_cast<MKLDNNMemoryInputNode*>(node.get());
auto state_store = memoryNode->getStore();
@@ -174,9 +184,9 @@ void MKLDNNExecNetwork::setProperty(const std::map<std::string, std::string> &pr
std::lock_guard<std::mutex> lock{_cfgMutex};
_cfg.readProperties(properties);
}
for (auto g : _graphs) {
g->setProperty(properties);
}
for (auto g : _graphs)
for (auto s : g)
s.second->setProperty(properties);
}
void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) {
@@ -193,13 +203,13 @@ void MKLDNNExecNetwork::GetExecGraphInfo(InferenceEngine::ICNNNetwork::Ptr &grap
if (_graphs.size() == 0)
THROW_IE_EXCEPTION << "No graph was found";
graphPtr = _graphs.begin()->get()->dump();
graphPtr = _graphs.begin()->begin()->second->dump();
}
void MKLDNNExecNetwork::GetConfig(const std::string &name, Parameter &result, ResponseDesc *resp) const {
if (_graphs.size() == 0)
THROW_IE_EXCEPTION << "No graph was found";
Config engConfig = _graphs.begin()->get()->getProperty();
Config engConfig = _graphs.begin()->begin()->second->getProperty();
auto option = engConfig._config.find(name);
if (option != engConfig._config.end()) {
result = option->second;
@@ -213,9 +223,9 @@ void MKLDNNExecNetwork::GetMetric(const std::string &name, Parameter &result, Re
THROW_IE_EXCEPTION << "No graph was found";
if (name == METRIC_KEY(NETWORK_NAME)) {
if (_graphs.begin()->get()->dump() == nullptr)
if (_graphs.begin()->begin()->second->dump() == nullptr)
THROW_IE_EXCEPTION << "Invalid graph dump";
result = IE_SET_METRIC(NETWORK_NAME, _graphs.begin()->get()->dump()->getName());
result = IE_SET_METRIC(NETWORK_NAME, _graphs.begin()->begin()->second->dump()->getName());
} else if (name == METRIC_KEY(SUPPORTED_METRICS)) {
std::vector<std::string> metrics;
metrics.push_back(METRIC_KEY(NETWORK_NAME));
@@ -225,12 +235,12 @@ void MKLDNNExecNetwork::GetMetric(const std::string &name, Parameter &result, Re
result = IE_SET_METRIC(SUPPORTED_METRICS, metrics);
} else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
std::vector<std::string> configKeys;
for (auto && key : _graphs.begin()->get()->getProperty()._config) {
for (auto && key : _graphs.begin()->begin()->second->getProperty()._config) {
configKeys.push_back(key.first);
}
result = IE_SET_METRIC(SUPPORTED_CONFIG_KEYS, configKeys);
} else if (name == METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)) {
Config engConfig = _graphs.begin()->get()->getProperty();
Config engConfig = _graphs.begin()->begin()->second->getProperty();
auto option = engConfig._config.find(CONFIG_KEY(CPU_THROUGHPUT_STREAMS));
IE_ASSERT(option != engConfig._config.end());
auto streams = std::stoi(option->second);

View File

@@ -18,6 +18,9 @@
#include <unordered_map>
namespace MKLDNNPlugin {
typedef std::less<int> sorting_order;
typedef std::map<int, MKLDNNGraph::Ptr, sorting_order> SequenceGraphs;
typedef std::map<int, InferenceEngine::CNNNetwork, sorting_order> ReshapedCNNNetworks;
class MKLDNNExecNetwork: public InferenceEngine::ExecutableNetworkThreadSafeDefault {
public:
@@ -29,7 +32,7 @@ public:
void CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) override;
MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network, const Config &cfg,
MKLDNNExecNetwork(ReshapedCNNNetworks, const Config &cfg,
const MKLDNNExtensionManager::Ptr &extMgr, NumaNodesWeights &weightsSharing);
~MKLDNNExecNetwork() override = default;
@@ -44,13 +47,12 @@ public:
std::vector<InferenceEngine::IMemoryStateInternal::Ptr> QueryState() override;
InferenceEngine::ThreadLocal<MKLDNNGraph::Ptr> _graphs;
InferenceEngine::ThreadLocal<SequenceGraphs> _graphs;
protected:
friend class MKLDNNInferRequest;
MKLDNNExtensionManager::Ptr extensionManager;
std::vector<InferenceEngine::IMemoryStateInternal::Ptr> memoryStates;
InferenceEngine::details::CNNNetworkImplPtr _clonedNetwork;
std::mutex _cfgMutex;
Config _cfg;
std::atomic_int _numRequests = {0};

View File

@@ -758,9 +758,14 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
ext_blob->allocate();
}
if (ext_blob->byteSize() != intr_blob.GetSize())
if (config.dynamicSequence) {
if (ext_blob->byteSize() < intr_blob.GetSize())
THROW_IE_EXCEPTION << "Output blob size is less than network output size ("
<< ext_blob->size() << "<" << intr_blob.GetSize() / sizeof(float) << ").";
} else if (ext_blob->byteSize() != intr_blob.GetSize()) {
THROW_IE_EXCEPTION << "Output blob size is not equal network output size ("
<< ext_blob->size() << "!=" << intr_blob.GetSize()/sizeof(float) << ").";
<< ext_blob->size() << "!=" << intr_blob.GetSize() / sizeof(float) << ").";
}
void *ext_blob_ptr = ext_blob->buffer();
void *intr_blob_ptr = intr_blob.GetData();
@@ -776,6 +781,13 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
size_t size_to_copy = intr_blob.GetSize() * MB_to_process / MB;
ie_memcpy(ext_blob_ptr, ext_blob->byteSize(), intr_blob_ptr, size_to_copy);
if (config.dynamicSequence && ext_blob->size() > intr_blob.GetElementsCount()) {
if (ext_blob->getTensorDesc().getPrecision() != InferenceEngine::Precision::FP32)
THROW_IE_EXCEPTION << "Dynamic sequence is supported only for the fp32 outputs only!";
auto elements = intr_blob.GetElementsCount();
std::fill(static_cast<float*>(ext_blob_ptr) + elements,
static_cast<float*>(ext_blob_ptr) + ext_blob->size(), -std::numeric_limits<float>::max());
}
}
}

View File

@@ -24,7 +24,10 @@ MKLDNNPlugin::MKLDNNInferRequest::MKLDNNInferRequest(InferenceEngine::InputsData
if (execNetwork->_graphs.size() == 0)
THROW_IE_EXCEPTION << "No graph was found";
graph = execNetwork->_graphs.begin()->get();
const int seq = execNetwork->_graphs.begin()->size() > 1
? _networkInputs.cbegin()->second->getTensorDesc().getDims()[1]
: 0;
graph = execNetwork->_graphs.begin()->at(seq).get();
for (const auto& it : _networkInputs) {
InferenceEngine::Blob::Ptr blob;
MKLDNNInferRequest::GetBlob(it.first.c_str(), blob);
@@ -79,13 +82,28 @@ void copyToFloat(float* dst, const InferenceEngine::Blob* src) {
void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
using namespace openvino::itt;
const bool dyn_sequence = execNetwork->_graphs.local().size() > 1;
auto dims = _inputs.cbegin()->second->getTensorDesc().getDims();
OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, profilingTask);
graph = execNetwork->_graphs.local().get();
if (dyn_sequence) {
// graph per sequence
const int *ptr = _inputs.cbegin()->second->buffer().as<int *>();
auto sz = _inputs.cbegin()->second->size();
const int size_non_zero = std::distance(ptr,
std::find_if(ptr, ptr + sz, [](int x) { return x == 0; }));
const int actual_seq = execNetwork->_graphs.local().lower_bound(size_non_zero)->first;
// std::cout << "Last non-zero : " << size_non_zero << ", Actual Seq : " << actual_seq << std::endl;
graph = execNetwork->_graphs.local()[actual_seq].get();
dims[1] = actual_seq;
} else {
graph = execNetwork->_graphs.local().begin()->second.get();
}
{
execDataPreprocessing(_inputs);
changeDefaultPtr();
if (!dyn_sequence)
changeDefaultPtr();
// need to retain converted blobs until infer finish
std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
@@ -103,7 +121,16 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
pushInput<float>(input.first, input.second);
break;
case InferenceEngine::Precision::I32:
pushInput<int32_t>(input.first, input.second);
if (dyn_sequence) {
iconv = InferenceEngine::make_shared_blob<int32_t>({InferenceEngine::Precision::I32,
dims,
input.second->getTensorDesc().getLayout()},
input.second->buffer());
convertedInputs.push_back(iconv);
pushInput<int32_t>(input.first, iconv);
} else {
pushInput<int32_t>(input.first, input.second);
}
break;
case InferenceEngine::Precision::I8:
pushInput<int8_t>(input.first, input.second);

View File

@@ -158,28 +158,51 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::ICNNNetwork &network, const st
conf.batchLimit = static_cast<int>(network.getBatchSize());
}
std::shared_ptr<ICNNNetwork> clonedNetwork = cloneNetwork(network);
bool is_transformed = false;
if (clonedNetwork->getFunction()) {
Transformation(clonedNetwork);
is_transformed = true;
CNNNetwork localNetwork(cloneNetwork(network));
const InputsDataMap inputInfo = localNetwork.getInputsInfo();
ICNNNetwork::InputShapes shapes = localNetwork.getInputShapes();
ReshapedCNNNetworks reshapedNetworks;
int seq = 0;
if (conf.dynamicSequence) {
if (shapes.at(inputInfo.cbegin()->first).size() < 2)
THROW_IE_EXCEPTION << "Auto-reshaping of the network with no sequence (first input is scalar or channels-only)!";
seq = shapes.at(inputInfo.cbegin()->first)[1];
}
auto implNetwork = std::dynamic_pointer_cast<details::CNNNetworkImpl>(clonedNetwork);
if (implNetwork) {
// valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
ConstTransformer transformator(implNetwork.get());
transformator.fullTrim();
if (!is_transformed) {
NetPass::ConvertPrecision(*implNetwork, Precision::I64, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::U64, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::U32, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::FP16, Precision::FP32);
NetPass::ConvertPrecision(*implNetwork, Precision::BOOL, Precision::U8);
NetPass::ConvertPrecision(*implNetwork, Precision::U16, Precision::I32);
do {
CNNNetwork clonedNetwork(cloneNetwork(network));
if (conf.dynamicSequence) {
for (const InputsDataMap::value_type &item : inputInfo)
shapes[item.first][1] = seq;
// std::cout << "Reshaped network by sequence to " << seq << std::endl;
clonedNetwork.reshape(shapes);
}
}
bool is_transformed = false;
if (clonedNetwork.getFunction()) {
auto temp = clonedNetwork.operator ICNNNetwork::Ptr();
Transformation(temp);
clonedNetwork = CNNNetwork(temp);
is_transformed = true;
}
auto implNetwork = std::dynamic_pointer_cast<details::CNNNetworkImpl>(
clonedNetwork.operator ICNNNetwork::Ptr());
if (implNetwork) {
// valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
ConstTransformer transformator(implNetwork.get());
transformator.fullTrim();
if (!is_transformed) {
NetPass::ConvertPrecision(*implNetwork, Precision::I64, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::U64, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::U32, Precision::I32);
NetPass::ConvertPrecision(*implNetwork, Precision::FP16, Precision::FP32);
NetPass::ConvertPrecision(*implNetwork, Precision::BOOL, Precision::U8);
NetPass::ConvertPrecision(*implNetwork, Precision::U16, Precision::I32);
}
}
reshapedNetworks[seq] = clonedNetwork;
seq -= conf.dynamicSequenceStep;
} while (conf.dynamicSequence && seq >= conf.dynamicSequence);
return std::make_shared<MKLDNNExecNetwork>(*clonedNetwork, conf, extensionManager, weightsSharing);
return std::make_shared<MKLDNNExecNetwork>(reshapedNetworks, conf, extensionManager, weightsSharing);
}
void Engine::SetConfig(const std::map<std::string, std::string> &config) {

View File

@@ -122,8 +122,13 @@ void MKLDNNGemmNode::initSupportedPrimitiveDescriptors() {
auto inPrec0 = getCnnLayer()->insData[0].lock()->getPrecision();
auto inPrec1 = getCnnLayer()->insData[1].lock()->getPrecision();
if ((inPrec0 != Precision::U8 && inPrec0 != Precision::I8) || inPrec1 != Precision::I8 || isThreeInputs) {
inPrec0 = Precision::FP32;
inPrec1 = Precision::FP32;
if (inPrec0 == Precision::BF16 || inPrec1 == Precision::BF16) {
inPrec0 = Precision::BF16;
inPrec1 = Precision::BF16;
} else {
inPrec0 = Precision::FP32;
inPrec1 = Precision::FP32;
}
}
auto inputDataType0 = MKLDNNExtensionUtils::IEPrecisionToDataType(inPrec0);
@@ -192,6 +197,11 @@ inline void process_gemm(char transa, char transb, int M, int N, int K, float al
mkldnn_sgemm(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}
// BF16 overload of process_gemm: dispatches to mkl-dnn's bf16*bf16 -> fp32 GEMM.
// A and B carry bfloat16 values in uint16_t storage; C is accumulated in fp32
// (matches the Precision::BF16 case that calls process_data<uint16_t, uint16_t>).
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint16_t *A, int lda,
const uint16_t *B, int ldb, float beta, float *C, int ldc) {
mkldnn_gemm_bf16bf16f32(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint8_t *A, int lda,
const int8_t *B, int ldb, float beta, float *C, int ldc) {
const int32_t co = 0;
@@ -288,6 +298,9 @@ void MKLDNNGemmNode::execute(mkldnn::stream strm) {
case Precision::FP32:
process_data<float, float>();
break;
case Precision::BF16:
process_data<uint16_t, uint16_t>();
break;
case Precision::I8:
process_data<int8_t, int8_t>();
break;

View File

@@ -16,7 +16,7 @@ using namespace mkldnn;
class MKLDNNTestExecNetwork: public MKLDNNPlugin::MKLDNNExecNetwork {
public:
MKLDNNPlugin::MKLDNNGraph& getGraph() {
return *(_graphs.begin()->get());
return *(_graphs.begin()->begin()->second);
}
};

View File

@@ -1198,7 +1198,7 @@ TEST_F(MKLDNNGraphStructureTests, TestOutputAfterInplacePlusConcat) {
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -1714,7 +1714,7 @@ TEST_F(MKLDNNGraphStructureTests, TestResnetPart) {
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -1864,7 +1864,7 @@ TEST_F(MKLDNNGraphStructureTests, TestConcatAfterConcat) {
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -2045,7 +2045,7 @@ TEST_F(MKLDNNGraphStructureTests, Test2ConcatFromConcat) {
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -2377,7 +2377,7 @@ TEST_F(MKLDNNGraphStructureTests, TestLoadTopologyWithConstLayer) {
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);
@@ -2525,7 +2525,7 @@ TEST_F(MKLDNNGraphStructureTests, TestLoadTopologyWithEltwiseBeforeConcat) {
InferenceEngine::CNNNetwork network;
ASSERT_NO_THROW(network = core.ReadNetwork(model, weights_ptr));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork(network, {}, {}, cache));
MKLDNNPlugin::MKLDNNExecNetwork::Ptr execNetwork(new MKLDNNPlugin::MKLDNNExecNetwork({{0, network}}, {}, {}, cache));
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
InferenceEngine::OutputsDataMap _networkOutputs = network.getOutputsInfo();
execNetwork->setNetworkInputs(_networkInputs);

View File

@@ -76,6 +76,7 @@ JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_para
auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, kd);
jit.AddConstant(MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes));
jit.AddConstant(MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.bIMADSupport));
return jit;
}

View File

@@ -63,12 +63,24 @@ KERNEL(softmax_items_class_optimized)(__global INPUT0_TYPE* input, __global OUTP
ACCUMULATOR_TYPE denominator = 0.0;
for (uint cls = 0; cls < FULL_ITERATIONS_NUM; cls++)
{
// This is a temporary solution for unresolved problem when ocl kernels compilation step doesn't produce actual binaries
// for current kernel but driver doesn't report any errors (JIRA CVS-32211)
#if HAS_DRIVER_PROBLEMS
data[cls] = data[cls] == max_value ? 1.0 : native_exp(data[cls] - max_value);
#else
data[cls] = native_exp(data[cls] - max_value);
#endif
denominator += data[cls];
}
if(simd_lane < LEFTOVERS)
{
// This is a temporary solution for unresolved problem when ocl kernels compilation step doesn't produce actual binaries
// for current kernel but driver doesn't report any errors (JIRA CVS-32211)
#if HAS_DRIVER_PROBLEMS
data[DATA_PER_WORKITEM-1] = data[DATA_PER_WORKITEM-1] == max_value ? 1.0 : native_exp(data[DATA_PER_WORKITEM-1] - max_value);
#else
data[DATA_PER_WORKITEM-1] = native_exp(data[DATA_PER_WORKITEM-1] - max_value);
#endif
denominator += data[DATA_PER_WORKITEM-1];
}

View File

@@ -119,6 +119,7 @@ extensions/front/create_tensor_nodes.py
extensions/front/disable_weights_quantize_value_propagation.py
extensions/front/div.py
extensions/front/eltwise_n.py
extensions/front/EmbeddingBagFuse.py
extensions/front/ExpandDimsToUnsqueeze.py
extensions/front/FillToBroadcast.py
extensions/front/flatten_to_reshape.py
@@ -286,6 +287,7 @@ extensions/front/onnx/priorbox_ext.py
extensions/front/onnx/priorgridgenerator_ext.py
extensions/front/onnx/proposal_ext.py
extensions/front/onnx/quantize_dequantize_linear.py
extensions/front/onnx/quantize_dequantize_redundant.py
extensions/front/onnx/quantize_ext.py
extensions/front/onnx/quantize_linear_ext.py
extensions/front/onnx/quantize_linear_resolver.py

View File

@@ -0,0 +1,91 @@
"""
Copyright (C) 2020 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from extensions.ops.embedding_bag import EmbeddingBagOffsetsSum
from mo.front.common.replacement import FrontReplacementSubgraph
from mo.graph.graph import Graph, rename_nodes
class EmbeddingBagFuse(FrontReplacementSubgraph):
    # Fuses the per-bag expansion of an embedding lookup (offsets -> Gather/Unsqueeze
    # slice bounds -> Slice of indices -> Gather from shared weights -> ReduceSum ->
    # Unsqueeze -> Concat) into a single EmbeddingBagOffsetsSum operation.
    enabled = True

    def run_after(self):
        # These transforms normalize ExpandDims/AttributedGather into the plain
        # Unsqueeze/Gather ops this pattern matches on, so they must run first.
        from extensions.front.ExpandDimsToUnsqueeze import ExpandDimsToUnsqueeze
        from extensions.front.AttributedGatherNormalizer import AttributedGatherNormalizer
        return [ExpandDimsToUnsqueeze, AttributedGatherNormalizer]

    def pattern(self):
        # Matches one representative bag-branch of the expanded sub-graph:
        # 'concat_before' carries the per-bag offsets; two Gather/Unsqueeze pairs
        # produce the start/end fed into 'slice1' (ports 1 and 2); the sliced
        # indices drive a Gather from the 'weights' Const, the bag is reduced with
        # ReduceSum, and all bags are stitched back together by 'concat_after'.
        return dict(
            nodes=[
                ('weights', dict(op='Const')),
                ('concat_before', dict(op='Concat')),
                ('gather_before1_1', dict(op='Gather')),
                ('unsqueeze_before1_1', dict(op='Unsqueeze')),
                ('gather_before2_1', dict(op='Gather')),
                ('unsqueeze_before2_1', dict(op='Unsqueeze')),
                ('slice1', dict(op='Slice')),
                ('gather_after1', dict(op='Gather')),
                ('reduce1', dict(op='ReduceSum')),
                ('unsqueeze_after1', dict(op='Unsqueeze')),
                ('concat_after', dict(op='Concat')),
            ],
            edges=[
                ('concat_before', 'gather_before1_1'),
                ('concat_before', 'gather_before2_1'),
                ('gather_before1_1', 'unsqueeze_before1_1'),
                ('gather_before2_1', 'unsqueeze_before2_1'),
                ('unsqueeze_before1_1', 'slice1', {'out': 0, 'in': 1}),
                ('unsqueeze_before2_1', 'slice1', {'out': 0, 'in': 2}),
                ('weights', 'gather_after1', {'out': 0, 'in': 0}),
                ('slice1', 'gather_after1', {'out': 0, 'in': 1}),
                ('gather_after1', 'reduce1'),
                ('reduce1', 'unsqueeze_after1'),
                ('unsqueeze_after1', 'concat_after'),
            ])

    def replace_sub_graph(self, graph: Graph, match: dict):
        """Replaces the whole matched expansion (all bag-branches, not just the
        matched one) with a single EmbeddingBagOffsetsSum node and removes the
        now-dead sub-graph reachable from 'concat_before'."""
        concat_before = match['concat_before']
        gather_after1 = match['gather_after1']
        slice1 = match['slice1']
        concat_after = match['concat_after']
        # weights tensor shared by every bag's Gather; port 2 of Gather is the axis
        weights_node = gather_after1.in_port(0).get_source().node
        gather_after_axis = gather_after1.in_port(2).get_source().node.soft_get('value')
        for dst_port in weights_node.out_port(0).get_destinations():
            node = dst_port.node
            if node.op == 'Gather':
                # validate that all Gathers have same axis
                # NOTE(review): returning here leaves previously visited Gather
                # ports already disconnected — TODO confirm the matcher guarantees
                # uniform axes before any disconnect happens
                if node.in_port(2).get_source().node.soft_get('value') != gather_after_axis:
                    return
                dst_port.disconnect()
        # indices tensor shared by every bag's Slice; port 3 of Slice is the axis
        indices_node = slice1.in_port(0).get_source().node
        slice_axis = slice1.in_port(3).get_source().node.soft_get('value')
        for dst_port in indices_node.out_port(0).get_destinations():
            node = dst_port.node
            if node.op == 'Slice':
                # validate that all Slices have same axis
                if node.in_port(3).get_source().node.soft_get('value') != slice_axis:
                    return
                dst_port.disconnect()
        # wire weights (port 0), indices (port 1) and offsets (port 2) into the
        # fused op, and let it drive everything 'concat_after' used to feed
        emb_bag = EmbeddingBagOffsetsSum(graph, {}).create_node()
        weights_node.out_port(0).connect(emb_bag.in_port(0))
        indices_node.out_port(0).connect(emb_bag.in_port(1))
        concat_before.in_port(0).get_connection().set_destination(emb_bag.in_port(2))
        concat_after.out_port(0).get_connection().set_source(emb_bag.out_port(0))
        # keep the original output name on the fused node for framework mapping
        concat_name = concat_after.soft_get('name', concat_after.id)
        rename_nodes([(concat_after, concat_name + '/TBD'), (emb_bag, concat_name)])
        # remove this sub-graph since a lot of matchings will be obsolete
        graph.remove_nodes_from(graph.dfs(concat_before.id, set()))

View File

@@ -0,0 +1,171 @@
"""
Copyright (C) 2018-2020 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import logging as log
from mo.front.common.replacement import FrontReplacementSubgraph
from mo.front.subgraph_matcher import SubgraphMatch
from mo.graph.graph import Graph
class QuantizeDequantizeRedundant2(FrontReplacementSubgraph):
    """
    Fuses duplicated QuantizeLinear and DequantizeLinear nodes
    (redundancy in the official NV's int8 MLPerf BERT model).

    Covers the case when two parallel Q->DQ chains hang off the same Add and use
    identical constant scale and zero-point values: consumers of the duplicated
    chain are re-routed to the surviving DequantizeLinear and the duplicate is
    removed from the graph.
    """
    enabled = True

    def run_before(self):
        # must fire before the Q->DQ pairs are folded by the generic resolver
        from extensions.front.onnx.quantize_dequantize_linear import QuantizeDequantizeLinear
        return [QuantizeDequantizeLinear]

    def pattern(self):
        return dict(
            nodes=[
                ('inp', dict(op='Add')),
                ('quantize0', dict(op='QuantizeLinear')),
                ('dequantize0', dict(op='DequantizeLinear')),
                ('quantize1', dict(op='QuantizeLinear')),
                ('dequantize1', dict(op='DequantizeLinear')),
            ],
            edges=[
                ('inp', 'quantize0', {'in': 0}),
                ('inp', 'quantize1', {'in': 0}),
                ('quantize0', 'dequantize0', {'in': 0}),
                ('quantize1', 'dequantize1', {'in': 0}),
            ]
        )

    def replace_sub_graph(self, graph: Graph, match: [dict, SubgraphMatch]):
        """Removes the 'quantize1'->'dequantize1' chain when its scale/zero-point
        constants equal those of the 'quantize0'->'dequantize0' chain."""
        q0 = match['quantize0']
        q1 = match['quantize1']
        # QuantizeLinear inputs: port 1 is scale, port 2 is zero-point
        q0_scale = q0.in_port(1).get_source().node
        q0_zerop = q0.in_port(2).get_source().node
        q1_scale = q1.in_port(1).get_source().node
        q1_zerop = q1.in_port(2).get_source().node
        inp_port = q0.in_port(0).get_source()
        name = inp_port.node.soft_get('name', inp_port.node.id)
        # only constant as for zero_point/scale supported
        if q0_scale.soft_get('type') == 'Const' and q1_scale.soft_get('type') == 'Const' and \
                q0_zerop.soft_get('type') == 'Const' and q1_zerop.soft_get('type') == 'Const':
            # only patterns with same scale/zero_point values for Q and DQ are supported
            if q0_scale.value == q1_scale.value and q0_zerop.value == q1_zerop.value:
                log.debug('Redundant 2Q-DQ pattern after {}'.format(name))
                # re-route every consumer of the duplicate chain to the survivor
                for dest in match['dequantize1'].out_port(0).get_destinations():
                    dest.disconnect()
                    dest.connect(match['dequantize0'].out_port(0))
                graph.remove_nodes_from([match['quantize1'].id, match['dequantize1'].id])
            else:
                # bug fix: the original message had no '{}' placeholder, so the
                # node name passed to format() was silently dropped
                log.error('QuantizeLinears in the fan-out of {} have different scale or zero-point '
                          'values, cannot be removed!'.format(name))
class QuantizeDequantizeRedundant4(FrontReplacementSubgraph):
    """
    Fuses duplicated QuantizeLinear and DequantizeLinear nodes
    (redundancy in the official NV's int8 MLPerf BERT model).

    Covers the case when four parallel Q->DQ chains hang off the same Add and use
    identical constant scale and zero-point values: consumers of the three
    duplicated chains are re-routed to the surviving DequantizeLinear and the
    duplicates are removed from the graph.
    """
    enabled = True

    def run_before(self):
        # run the 4-way case first so the 2-way pattern does not partially
        # consume it
        return [QuantizeDequantizeRedundant2]

    def pattern(self):
        return dict(
            nodes=[
                ('inp', dict(op='Add')),
                ('quantize0', dict(op='QuantizeLinear')),
                ('dequantize0', dict(op='DequantizeLinear')),
                ('quantize1', dict(op='QuantizeLinear')),
                ('dequantize1', dict(op='DequantizeLinear')),
                ('quantize2', dict(op='QuantizeLinear')),
                ('dequantize2', dict(op='DequantizeLinear')),
                ('quantize3', dict(op='QuantizeLinear')),
                ('dequantize3', dict(op='DequantizeLinear')),
            ],
            edges=[
                ('inp', 'quantize0', {'in': 0}),
                ('inp', 'quantize1', {'in': 0}),
                ('inp', 'quantize2', {'in': 0}),
                ('inp', 'quantize3', {'in': 0}),
                ('quantize0', 'dequantize0', {'in': 0}),
                ('quantize1', 'dequantize1', {'in': 0}),
                ('quantize2', 'dequantize2', {'in': 0}),
                ('quantize3', 'dequantize3', {'in': 0}),
            ]
        )

    def replace_sub_graph(self, graph: Graph, match: [dict, SubgraphMatch]):
        """Removes the 'quantize1..3'->'dequantize1..3' chains when their
        scale/zero-point constants equal those of 'quantize0'->'dequantize0'.

        Bug fix vs. the original: the constant check tested q2_zerop and q3_zerop
        twice and never tested q2_scale/q3_scale at all (copy-paste error); all
        four scales and zero-points are now verified.
        """
        quantizes = [match['quantize{}'.format(i)] for i in range(4)]
        # QuantizeLinear inputs: port 1 is scale, port 2 is zero-point
        scales = [q.in_port(1).get_source().node for q in quantizes]
        zerops = [q.in_port(2).get_source().node for q in quantizes]
        inp_port = quantizes[0].in_port(0).get_source()
        name = inp_port.node.soft_get('name', inp_port.node.id)
        # only constant as for zero_point/scale supported
        if any(n.soft_get('type') != 'Const' for n in scales + zerops):
            return
        # only patterns with same scale/zero_point values for Q and DQ are supported
        if all(scales[0].value == s.value and zerops[0].value == z.value
               for s, z in zip(scales[1:], zerops[1:])):
            log.debug('Redundant 4Q-DQ pattern after {}'.format(name))
            for i in range(1, 4):
                dequantize = match['dequantize{}'.format(i)]
                # re-route every consumer of the duplicate chain to the survivor
                for dest in dequantize.out_port(0).get_destinations():
                    dest.disconnect()
                    dest.connect(match['dequantize0'].out_port(0))
                graph.remove_nodes_from([match['quantize{}'.format(i)].id, dequantize.id])
        else:
            # bug fix: the original message had no '{}' placeholder, so the
            # node name passed to format() was silently dropped
            log.error('QuantizeLinears in the fan-out of {} have different scale or zero-point '
                      'values, cannot be removed!'.format(name))