From 5c17c7e0a0c3687f9709d7d5023a8b0a17d50447 Mon Sep 17 00:00:00 2001
From: Paul Youngsoo Ahn
Date: Wed, 9 Nov 2022 10:34:40 +0900
Subject: [PATCH] [GPU] Fix multistream issue for dynamic shape (#13433)

- Separate kernels_cache::add_kernel from the factory (choose_impl)
- Reset kernels_cache._kernels after kernels_cache.compile()
- Create a cldnn unit test case to check multi-stream processing
---
 .../include/intel_gpu/graph/network.hpp       | 18 +++++
 .../include/intel_gpu/graph/program.hpp       |  7 --
 .../graph/graph_optimizer/compile_graph.cpp   |  6 +-
 .../graph_optimizer/post_input_reorder.cpp    |  4 +
 .../graph_optimizer/post_optimize_weights.cpp |  9 ++-
 .../remove_redundant_reorders.cpp             |  7 +-
 .../src/graph/impls/ocl/primitive_base.hpp    | 18 +++--
 .../src/graph/include/primitive_inst.h        |  3 +
 src/plugins/intel_gpu/src/graph/network.cpp   |  8 ++
 .../intel_gpu/src/graph/primitive_inst.cpp    | 26 +++---
 src/plugins/intel_gpu/src/graph/program.cpp   |  4 -
 .../intel_gpu/src/runtime/kernels_cache.cpp   | 64 ++++++++++++---
 .../intel_gpu/src/runtime/kernels_cache.hpp   |  2 +
 .../test_cases/multiple_streams_gpu_test.cpp  | 79 +++++++++++++++++++
 14 files changed, 209 insertions(+), 46 deletions(-)
 create mode 100644 src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
index 42092db3e58..805db2b576b 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -11,6 +11,7 @@
 #include "intel_gpu/runtime/engine.hpp"
 #include "intel_gpu/runtime/event.hpp"
 #include "intel_gpu/runtime/stream.hpp"
+#include "intel_gpu/runtime/lru_cache.hpp"

 #include
 #include
@@ -216,6 +217,15 @@ public:
     /// Returns memory state @p variable_id of stateful network
     VariableState& get_variable_memory(const std::string &variable_id);

+    /// Returns kernels_cache
+    kernels_cache& get_kernels_cache() const { return *_kernels_cache; }
+
+    /// Returns implementations_cache
+    ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
+
+    /// Returns in_mem_kernels_cache
+    KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
+
 private:
     using output_chains_map = std::map<primitive_id, std::vector<std::shared_ptr<primitive_inst>>>;
     uint32_t net_id = 0;
@@ -249,5 +259,13 @@ private:
     void check_names();
     void add_default_output_chains();
     output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);
+
+    std::unique_ptr<kernels_cache> _kernels_cache;
+    // Moved from cldnn::program to cldnn::network to fix a multi-stream (multi-thread) issue.
+    std::unique_ptr<ImplementationsCache> _impls_cache;
+    std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
+    // TODO: the initial version uses unlimited caches. Adjust the capacities once the dynamic flow works on a wide set of models.
+    const size_t _impls_cache_capacity = 0;
+    const size_t _in_mem_kernels_cache_capacity = 0;
 };
 }  // namespace cldnn
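The heart of the fix is visible in this header: every cldnn::network (one per stream) now owns its own kernels_cache, ImplementationsCache, and in-memory KernelsCache, where previously all streams shared the single program-level caches and raced on them during dynamic-shape compilation. Below is a minimal sketch of the ownership pattern, using simplified stand-ins rather than the real cldnn classes (a capacity of 0 means "unbounded", matching the TODO above):

    #include <memory>
    #include <string>
    #include <unordered_map>

    struct impl_t {};  // stand-in for a compiled primitive implementation

    // Stand-in for cldnn's LruCache; eviction here is a placeholder.
    class simple_cache {
    public:
        explicit simple_cache(size_t capacity) : _capacity(capacity) {}
        bool has(const std::string& key) const { return _map.count(key) != 0; }
        std::shared_ptr<impl_t> get(const std::string& key) const { return _map.at(key); }
        void add(const std::string& key, std::shared_ptr<impl_t> value) {
            if (_capacity != 0 && _map.size() >= _capacity)
                _map.erase(_map.begin());  // naive eviction; the real cache evicts least-recently-used
            _map.emplace(key, std::move(value));
        }
    private:
        size_t _capacity;
        std::unordered_map<std::string, std::shared_ptr<impl_t>> _map;
    };

    // One instance per stream, so cache lookups and insertions during
    // dynamic-shape inference never race with another stream.
    struct per_stream_network {
        simple_cache impls_cache{0};           // layout key -> implementation
        simple_cache in_mem_kernels_cache{0};  // layout key -> cached entry (a reorder kernel in the real code)
    };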
diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
index be10119bc6b..0c65e6ca846 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
@@ -247,8 +247,6 @@ public:
     void load_tuning_cache();
     std::shared_ptr<kernel_selector::TuningCache> get_tuning_cache() const { return tuning_cache; }

-    ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
-    KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
     // returns {-1, -1} if it failed to estimate by allocating given batch size
     std::pair<int64_t, int64_t> get_estimated_device_mem_usage();
@@ -261,11 +259,6 @@ private:
     stream::ptr _stream;
     // TODO: Consider moving it to engine
     std::unique_ptr<kernels_cache> _kernels_cache;
-    std::unique_ptr<ImplementationsCache> _impls_cache;
-    std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
-    // TODO: initial version use unlimited caches. Need to adjust it once dynamic flow works on wide set of models.
-    const size_t _impls_cache_capacity = 0;
-    const size_t _in_mem_kernels_cache_capacity = 0;
     build_options options;
     std::list<program_node*> inputs;
     std::vector<program_node*> outputs;
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp
index b5c6f224a75..af001ea65c3 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp
@@ -34,9 +34,13 @@ void compile_graph::run(program& p) {
     for (size_t idx = 0; idx < proc_order.size(); idx++) {
         auto& node = *(std::next(proc_order.begin(), idx));
         if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty()) && !node->is_dynamic()) {
-            tasks.push_back([node, &exception] {
+            tasks.push_back([node, &p, &exception] {
                 try {
                     node->selected_impl = node->type()->choose_impl(*node);
+                    if (node->selected_impl) {
+                        auto kernel_ids = p.get_kernels_cache().add_kernels_source(node->selected_impl->get_kernels_source());
+                        node->selected_impl->set_kernel_ids(kernel_ids);
+                    }
                 } catch(...) {
                     exception = std::current_exception();
                 }
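The change above establishes the pattern used throughout this patch: choose_impl() no longer registers kernel code with a cache as a side effect; instead the caller collects the implementation's kernel sources and registers them with whichever kernels_cache it owns (the program's cache here at build time, the network's cache at runtime). A compilable sketch of the idea, with illustrative stand-in types rather than the real cldnn API:

    #include <memory>
    #include <string>
    #include <vector>

    // Stand-ins for cldnn::kernel_string, cldnn::primitive_impl and cldnn::kernels_cache.
    struct source { std::string entry_point; std::string code; };

    struct impl {
        std::vector<std::shared_ptr<source>> sources;  // produced by the factory
        std::vector<std::string> kernel_ids;           // assigned by the cache owner
    };

    struct cache {
        size_t next_id = 0;
        std::vector<std::string> add_kernels_source(const std::vector<std::shared_ptr<source>>& srcs) {
            std::vector<std::string> ids;
            ids.reserve(srcs.size());
            for (const auto& s : srcs)
                ids.push_back(s->entry_point + "_" + std::to_string(next_id++));  // unique per cache
            return ids;  // the real cache also queues the code for batched compilation
        }
    };

    // The caller, not the factory, decides which cache the kernels land in.
    void register_impl(impl& i, cache& c) {
        i.kernel_ids = c.add_kernels_source(i.sources);
    }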
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp
index a646ac57ea2..ff95aa1ad86 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp
@@ -66,6 +66,10 @@ void post_input_reorder::run(program& p) {
             reorder.get_output_layout(false);
             node->set_output_layout(previous_layout, false);
             reorder.set_selected_impl(reorder.type()->choose_impl(reorder));
+            if (auto impl = reorder.get_selected_impl()) {
+                auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
+                impl->set_kernel_ids(kernel_ids);
+            }
         }
     }
 }
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
index 346318f74ad..6eca63349f7 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
@@ -53,8 +53,13 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {

         // Don't run impl selection to avoid double compilation of reorder kernels
         // in main program and internal program for constant propagation
-        if (!g_node.is_constant())
-            g_node.selected_impl = g_node.type()->choose_impl(g_node);
+        if (!g_node.is_constant()) {
+            g_node.set_selected_impl(g_node.type()->choose_impl(g_node));
+            if (auto impl = g_node.get_selected_impl()) {
+                auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
+                impl->set_kernel_ids(kernel_ids);
+            }
+        }
     }
 }
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
index c90a282c8f1..126787b91ca 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
@@ -36,8 +36,11 @@ void remove_redundant_reorders::run(program& p) {
             return;

         node.set_unique_id();
-        auto new_impl = node.type()->choose_impl(node);
-        node.set_selected_impl(std::move(new_impl));
+        node.set_selected_impl(node.type()->choose_impl(node));
+        if (auto impl = node.get_selected_impl()) {
+            auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
+            impl->set_kernel_ids(kernel_ids);
+        }
     };

     // Fuse reorders into primitives
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
index 19d7213ec58..370aa55e04c 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -50,12 +50,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
         _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
         _kernel_data.weightsReorderParams.cpuKernel = nullptr;
         _kernel_data.weightsReorderParams.clKernel = nullptr;
-
-        _kernel_ids.reserve(kd.kernels.size());
-        // Add selected kernels to kernels_cache for the following compilation and save output ids
-        for (size_t i = 0; i < kd.kernels.size(); ++i) {
-            _kernel_ids.emplace_back(arg.get_program().add_kernel(kd.kernels[i].code.kernelString));
-        }
     }

     bool is_cpu() const override { return false; }
@@ -198,6 +192,18 @@ protected:
         bool group_events = (all_events.size() > 1);
         return aggregate_events(all_events, stream, group_events);
     }
+
+    void set_kernel_ids(std::vector<kernel_id> kernel_ids) override {
+        _kernel_ids = kernel_ids;
+    }
+
+    std::vector<std::shared_ptr<kernel_string>> get_kernels_source() override {
+        std::vector<std::shared_ptr<kernel_string>> kernel_strings;
+        for (size_t i = 0; i < _kernel_data.kernels.size(); ++i) {
+            kernel_strings.push_back(_kernel_data.kernels[i].code.kernelString);
+        }
+        return kernel_strings;
+    }
 };

 }  // namespace ocl
diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
index d10d76a942c..6e0d9c9c054 100644
--- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -54,6 +54,9 @@ struct primitive_impl {
     virtual std::vector<kernel_id> get_kernel_ids() { return {}; }
+    virtual std::vector<std::shared_ptr<kernel_string>> get_kernels_source() { return {}; }
+    virtual void set_kernels(std::vector<kernel::ptr>) {}
+    virtual void set_kernel_ids(std::vector<kernel_id> kernel_ids) {}

     // If this flag is set as false, the memory allocated for this primitive is not allowed to be reused
     bool can_reuse_memory = true;
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
index 54785b67d20..665ebcfc18a 100644
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -31,6 +31,7 @@
 #include "kernel_selector_helper.h"
 #include "program_helpers.h"
 #include "runtime/cldnn_itt.hpp"
+#include "kernels_cache.hpp"

 #include
 #include
@@ -293,6 +294,13 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
     build_exec_order();
     validate_primitives();
     add_default_output_chains();
+
+    if (is_dynamic()) {
+        _kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(program->get_engine(), program->get_id(),
+                                                        kernel_selector::KernelBase::get_db().get_batch_header_str()));
+        _impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
+        _in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
+    }
 }

 network::network(engine& engine,
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 9de819e7a81..7487cc75ef4 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -286,17 +286,19 @@ void primitive_inst::update_impl() {
     };

     auto layout_key = get_layout_key();
-    auto& cache = _network.get_program()->get_implementations_cache();
+    auto& cache = get_network().get_implementations_cache();
     if (cache.has(layout_key)) {
         _impl = cache.get(layout_key)->clone();
         GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
     } else {
-        auto lru = cache.get_lru_element();
         _impl = _node->type()->choose_impl(*_node, *_impl_params);
-        _network.get_program()->compile();
-        _impl->init_kernels(_network.get_program()->get_kernels_cache());
+        auto& kernels_cache = get_network().get_kernels_cache();
+        auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source());
+        _impl->set_kernel_ids(kernel_ids);
+        kernels_cache.compile();
+        _impl->init_kernels(kernels_cache);
         cache.add(layout_key, _impl->clone());
-        _network.get_program()->get_kernels_cache().reset();
+        kernels_cache.reset();
     }

     reset_shape_change();
@@ -560,8 +562,6 @@ event::ptr primitive_inst::update_weights() {
     auto original_weights_memory = dep_memory_ptr(weights_idx);
     auto original_layout = original_weights_memory->get_layout();
     layout expected_layout = from_weights_tensor(weights_params.dest);
-
-    auto& program = _node->get_program();
     auto& engine = _network.get_engine();

     auto get_layout_key = [&]() -> std::string {
@@ -574,7 +574,7 @@ event::ptr primitive_inst::update_weights() {
     cldnn::kernel::ptr kernel = nullptr;
     auto layout_key = get_layout_key();
     if (layout_key != "") {
-        auto& cache = program.get_in_mem_kernels_cache();
+        auto& cache = get_network().get_in_mem_kernels_cache();
         if (cache.has(layout_key)) {
             GPU_DEBUG_IF(debug_config->verbose >= 4) {
                 GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout << "\nto " << expected_layout << std::endl;
@@ -585,14 +585,16 @@ event::ptr primitive_inst::update_weights() {
             GPU_DEBUG_IF(debug_config->verbose >= 4) {
                 GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout << "\nto " << expected_layout << std::endl;
             }
-            auto _kernel_id = program.add_kernel(weights_params.clKernel->code.kernelString);
-            program.compile();
-            kernel = program.get_kernel(_kernel_id);
+            auto& kernels_cache = get_network().get_kernels_cache();
+            auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false);
+            kernels_cache.compile();
+            kernel = kernels_cache.get_kernel(kernel_id);
             cache.add(layout_key, kernel);
+            kernels_cache.reset();
         }
     }

-    auto& stream = _network.get_stream();
+    auto& stream = get_network().get_stream();
     bool can_reuse = _impl_params->reordered_weights != nullptr && _impl_params->reordered_weights->size() <= expected_layout.bytes_count();
     if (can_reuse) {
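primitive_inst::update_impl() above is the runtime half of the change: when the shape changes, it consults the network-local implementations cache and compiles only if the layout key has not been seen before. A minimal sketch of that lookup-or-compile flow, assuming simplified stand-in types (the compile_new callback stands for the choose_impl / add_kernels_source / compile / init_kernels sequence in the real code):

    #include <functional>
    #include <memory>
    #include <string>
    #include <unordered_map>

    struct impl_t {
        std::shared_ptr<impl_t> clone() const { return std::make_shared<impl_t>(*this); }
    };

    using impls_cache_t = std::unordered_map<std::string, std::shared_ptr<impl_t>>;

    // Mirrors update_impl(): hit -> clone the cached impl,
    // miss -> compile once, cache a clone, return the fresh impl.
    std::shared_ptr<impl_t> get_impl_for_shape(impls_cache_t& cache,                // per-network, so no races
                                               const std::string& layout_key,      // encodes the concrete shapes
                                               const std::function<std::shared_ptr<impl_t>()>& compile_new) {
        auto it = cache.find(layout_key);
        if (it != cache.end())
            return it->second->clone();     // cache hit: each instance gets its own copy
        auto fresh = compile_new();         // the expensive path, taken once per new layout key
        cache.emplace(layout_key, fresh->clone());
        return fresh;                       // kernels_cache.reset() then drops the source bookkeeping
    }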
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index a31d07e51d3..05031a0f3dc 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -114,8 +114,6 @@ program::program(engine& engine_ref,
     prepare_nodes(topology);
     _kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
                                                     kernel_selector::KernelBase::get_db().get_batch_header_str()));
-    _impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
-    _in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
     program_node::reset_unique_id();
     if (no_optimizations) {
         init_graph();
@@ -137,8 +135,6 @@ program::program(engine& engine_ref,
     set_options();
     _kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
                                                     kernel_selector::KernelBase::get_db().get_batch_header_str()));
-    _impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
-    _in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
     pm = std::unique_ptr<pass_manager>(new pass_manager(*this));
     prepare_nodes(nodes);
     build_program(is_internal);
diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
index 98e05c7083e..62220631ddf 100644
--- a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
+++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
@@ -148,18 +148,8 @@ kernels_cache::kernels_cache(engine& engine, uint32_t prog_id, const std::vector<std::string>& batch_header_str)
 kernel_id kernels_cache::set_kernel_source(
     const std::shared_ptr<kernel_string>& kernel_string,
     bool dump_custom_program) {
-    std::lock_guard<std::mutex> lock(_mutex);
-    // we need unique id in order to avoid conflict across topologies.
-    const auto kernel_num = _kernels.size() + (_kernel_idx++);
-    kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
-
-    auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
-
-    assert(_kernels.find(id) == _kernels.end());
-    if (res.second) {
-        _pending_compilation = true;
-    }
-    return id;
+    auto kernel_ids = add_kernels_source({kernel_string}, dump_custom_program);
+    return kernel_ids[0];
 }

 static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
@@ -384,4 +374,54 @@ void kernels_cache::reset() {
     _pending_compilation = false;
 }

+std::vector<kernel_id> kernels_cache::add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program) {
+    std::vector<kernel_id> kernel_ids;
+    kernel_ids.reserve(kernel_sources.size());
+    for (size_t i = 0; i < kernel_sources.size(); ++i) {
+        std::lock_guard<std::mutex> lock(_mutex);
+        auto kernel_string = kernel_sources[i];
+        // We need a unique id in order to avoid conflicts across topologies.
+        const auto kernel_num = _kernels.size() + (_kernel_idx++);
+        kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
+
+        auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
+
+        assert(_kernels.find(id) == _kernels.end());
+        if (res.second) {
+            _pending_compilation = true;
+        }
+        kernel_ids.emplace_back(id);
+    }
+    return kernel_ids;
+}
+
+void kernels_cache::compile() {
+    OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll");
+
+    std::unique_ptr<ocl::ocl_engine> _build_engine = nullptr;
+    if (_engine.type() == engine_types::ocl) {
+        _build_engine = std::unique_ptr<ocl::ocl_engine>(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl,
+                                                         _engine.configuration(), _engine.get_task_executor()));
+    }
+
+    // create batches
+    std::vector<batch_program> batches;
+    get_program_source(_kernels_code, &batches);
+
+    // build batches
+    for (size_t idx = 0; idx < batches.size(); idx++) {
+        build_batch(*_build_engine, batches[idx]);
+    }
+
+    _kernels_code.clear();
+    _pending_compilation = false;
+#if defined(__unix__) && !defined(__ANDROID__)
+    // NOTE: On Linux, without malloc_trim, the memory used by compilation is not returned to the system even though it has been freed
+    // (at least 500 MB is retained when we perform parallel compilation).
+    // Freeing the memory manually with malloc_trim saves a significant amount of memory.
+    // Also, this does not happen on Windows.
+    // So malloc_trim is added for the Linux build until we figure out a better solution.
+    malloc_trim(0);
+#endif
+}
 }  // namespace cldnn
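Together, add_kernels_source(), compile() and the existing reset() form a self-contained compile cycle for a single implementation. A sketch of the calling convention this patch establishes; the helper below is hypothetical glue, while the member functions it calls are the ones added or used above:

    // Hypothetical glue; Impl and Cache stand for cldnn::primitive_impl and
    // cldnn::kernels_cache, whose member functions are used as in the patch.
    template <typename Impl, typename Cache>
    void compile_with(Impl& impl, Cache& cache) {
        auto kernel_ids = cache.add_kernels_source(impl.get_kernels_source());
        impl.set_kernel_ids(kernel_ids);  // impl remembers which ids belong to it
        cache.compile();                  // builds only the pending batch (and trims the heap on Linux)
        impl.init_kernels(cache);         // resolve the ids to compiled kernels
        cache.reset();                    // reset _kernels so the next cycle starts clean
    }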
diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp
index 8404b49b0ac..d3fc6ac4b9b 100644
--- a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp
+++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp
@@ -101,6 +101,8 @@ public:
     void remove_kernel(kernel_id id) {
         _kernels.erase(id);
     }
+    std::vector<kernel_id> add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program = false);
+    void compile();
 };

 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp
new file mode 100644
index 00000000000..7bea1dc0cf9
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp
@@ -0,0 +1,79 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "test_utils.h"
+
+#include <intel_gpu/primitives/input_layout.hpp>
+#include <intel_gpu/primitives/data.hpp>
+#include <intel_gpu/primitives/eltwise.hpp>
+#include <intel_gpu/primitives/fully_connected.hpp>
+#include <intel_gpu/primitives/shape_of.hpp>
+
+#include <thread>
+#include <threading/ie_cpu_streams_executor.hpp>
+
+using namespace cldnn;
+using namespace ::tests;
+
+
+TEST(multistream_gpu, basic) {
+    const int num_streams = 2;
+    auto config = InferenceEngine::CPUStreamsExecutor::Config();
+    config._streams = num_streams;
+    auto task_executor = std::make_shared<InferenceEngine::CPUStreamsExecutor>(config);
+    auto& engine = get_test_engine();
+
+    build_options bo;
+    bo.set_option(build_option::allow_new_shape_infer(true));
+
+    auto input1_dyn_layout = layout{ ov::PartialShape::dynamic(3), data_types::f16, format::bfyx };
+    auto input2_dyn_layout = layout{ ov::PartialShape::dynamic(3), data_types::f16, format::bfyx };
+    auto weights = engine.allocate_memory({ {512, 512}, data_types::f32, format::bfyx });
+
+    topology topology;
+    topology.add(input_layout("input1", input1_dyn_layout));
+    topology.add(input_layout("input2", input2_dyn_layout));
+    topology.add(data("weights", weights));
+    topology.add(eltwise("eltwise", "input1", "input2", eltwise_mode::sum));
+    topology.add(fully_connected("fc", "eltwise", "weights"));
+    topology.add(shape_of("shape_of", "fc", 3, data_types::i32));
+
+    auto prog_ptr = program::build_program(engine, topology, bo);
+    std::vector<network::ptr> networks;
+    for (size_t i = 0; i < num_streams; i++) {
+        networks.push_back(network::allocate_network(engine, prog_ptr));
+    }
+
+    std::vector<InferenceEngine::Task> tasks;
+    for (size_t i = 0; i < num_streams; i++) {
+        tasks.push_back([&networks, i, &engine] {
+            auto net = networks[i];
+            std::vector<int> various_size = {32, 128, 16, 64};
+            for (size_t iter = 0; iter < 8; iter++) {
+                int len = various_size[iter % various_size.size()];
+                auto input1_mem = engine.allocate_memory({ ov::PartialShape{1, len, 512}, data_types::f16, format::bfyx });
+                auto input2_mem = engine.allocate_memory({ ov::PartialShape{1, len, 512}, data_types::f16, format::bfyx });
+                net->set_input_data("input1", input1_mem);
+                net->set_input_data("input2", input2_mem);
+
+                auto outputs = net->execute();
+
+                auto output = outputs.at("shape_of").get_memory();
+                cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
+
+                std::vector<int32_t> expected_results = {1, len, 512};
+
+                for (size_t out_idx = 0; out_idx < expected_results.size(); ++out_idx) {
+                    EXPECT_TRUE(are_equal(expected_results[out_idx], output_ptr[out_idx]));
+                }
+            }
+        });
+    }
+
+    task_executor->runAndWait(tasks);
+    tasks.clear();
+    networks.clear();
+}
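A note on the test design: both networks are allocated from a single shared program, and each executor task cycles through four input lengths, so the two streams repeatedly trigger shape-specialized compilation of the same primitives at the same time. Before this patch those compilations raced on the program-wide caches; with the per-network caches introduced here, each stream compiles independently, and the shape_of output check confirms that both streams still compute correct results for every shape.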