From 6f754052cfac6c47f907e56d1ea789e8fe58a414 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Thu, 21 Oct 2021 10:45:25 +0300 Subject: [PATCH] [GPU] Extended remote context to accept user queue (#6235) * [GPU] extended remote context to accept user queues for each stream * [GPU] OV2.0 API for queue sharing. Removed deviceName arg for context creation --- .../supported_plugins/GPU_RemoteBlob_API.md | 83 +++++--- docs/snippets/CMakeLists.txt | 3 +- docs/snippets/GPU_RemoteBlob_API3.cpp | 76 ++++++++ .../cldnn_async_infer_request.cpp | 30 ++- .../cldnn_engine/cldnn_async_infer_request.h | 3 + .../cldnn_engine/cldnn_executable_network.cpp | 4 + .../src/cldnn_engine/cldnn_graph.cpp | 19 +- .../src/cldnn_engine/cldnn_graph.h | 2 + .../src/cldnn_engine/cldnn_infer_request.cpp | 3 +- .../src/cldnn_engine/cldnn_infer_request.h | 4 + .../src/cldnn_engine/cldnn_remote_context.cpp | 4 + .../src/cldnn_engine/cldnn_remote_context.h | 2 + .../include/ie/gpu/gpu_context_api_ocl.hpp | 19 ++ .../include/ie/gpu/gpu_params.hpp | 5 + .../include/openvino/runtime/gpu/dx.hpp | 6 +- .../include/openvino/runtime/gpu/ocl.hpp | 23 ++- .../include/openvino/runtime/gpu/va.hpp | 6 +- .../cldnn_remote_blob_tests.cpp | 180 ++++++++++++++++++ .../cldnn_remote_tensor_tests.cpp | 2 +- .../clDNN/api/cldnn/graph/network.hpp | 5 +- .../clDNN/api/cldnn/runtime/engine.hpp | 3 + .../clDNN/runtime/ocl/ocl_engine.cpp | 4 + .../clDNN/runtime/ocl/ocl_engine.hpp | 1 + .../clDNN/runtime/ocl/ocl_stream.cpp | 30 ++- .../clDNN/runtime/ocl/ocl_stream.hpp | 1 + .../thirdparty/clDNN/src/network.cpp | 5 +- 26 files changed, 467 insertions(+), 56 deletions(-) create mode 100644 docs/snippets/GPU_RemoteBlob_API3.cpp diff --git a/docs/IE_DG/supported_plugins/GPU_RemoteBlob_API.md b/docs/IE_DG/supported_plugins/GPU_RemoteBlob_API.md index c24faa3541f..9431ae7fd59 100644 --- a/docs/IE_DG/supported_plugins/GPU_RemoteBlob_API.md +++ b/docs/IE_DG/supported_plugins/GPU_RemoteBlob_API.md @@ -1,79 +1,101 @@ Remote Blob API of GPU Plugin {#openvino_docs_IE_DG_supported_plugins_GPU_RemoteBlob_API} ================================ -The GPU plugin implementation of the `RemoteContext` and `RemoteBlob` interfaces supports GPU -pipeline developers who need video memory sharing and interoperability with existing native APIs +The GPU plugin implementation of the `RemoteContext` and `RemoteBlob` interfaces supports GPU +pipeline developers who need video memory sharing and interoperability with existing native APIs such as OpenCL\*, Microsoft DirectX\*, or VAAPI\*. -Using these interfaces allows to avoid any memory copy overhead when plugging the OpenVINO™ inference -into an existing GPU pipeline. It also enables OpenCL kernels participating in the pipeline to become +Using these interfaces allows to avoid any memory copy overhead when plugging the OpenVINO™ inference +into an existing GPU pipeline. It also enables OpenCL kernels participating in the pipeline to become native buffer consumers or producers of the OpenVINO™ inference. -Since the GPU plugin works on top of the clDNN library, the functionality above is also implemented +Since the GPU plugin works on top of the clDNN library, the functionality above is also implemented using OpenCL and its sharing extensions provided by Intel®. There are two interoperability scenarios that are supported for the Remote Blob API: -* GPU plugin context and memory objects can be constructed from low-level device, display, or memory -handles and used to create the OpenVINO™ `ExecutableNetwork` or `Blob` class. 
+* GPU plugin context and memory objects can be constructed from low-level device, display, or memory +handles and used to create the OpenVINO™ `ExecutableNetwork` or `Blob` class. * OpenCL context or buffer handles can be obtained from existing GPU plugin objects, and used in OpenCL processing. Class and function declarations for the API are defined in the following files: -* Windows\*: `gpu/gpu_context_api_ocl.hpp` and `gpu/gpu_context_api_dx.hpp` +* Windows\*: `gpu/gpu_context_api_ocl.hpp` and `gpu/gpu_context_api_dx.hpp` * Linux\*: `gpu/gpu_context_api_ocl.hpp` and `gpu/gpu_context_api_va.hpp` -The most common way to enable the interaction of your application with the Remote Blob API is to use user-side utility classes -and functions that consume or produce native handles directly. +The most common way to enable the interaction of your application with the Remote Blob API is to use user-side utility classes +and functions that consume or produce native handles directly. ## Execution Context User-Side Wrappers GPU plugin classes that implement the `RemoteContext` interface are responsible for context sharing. -Obtaining a pointer to a context object is the first step of sharing pipeline objects. -The context object of the GPU plugin directly wraps OpenCL context, setting a scope for sharing +Obtaining a pointer to a context object is the first step of sharing pipeline objects. +The context object of the GPU plugin directly wraps OpenCL context, setting a scope for sharing `ExecutableNetwork` and `RemoteBlob` objects. -To create such objects within user context, explicitly provide the context to the plugin using the -`make_shared_context()` overloaded function. Depending on the platform, the function accepts the -`cl_context` handle, the pointer to the `ID3D11Device` interface, or the `VADisplay` handle, and +To create such objects within user context, explicitly provide the context to the plugin using the +`make_shared_context()` overloaded function. Depending on the platform, the function accepts the +`cl_context` handle, the pointer to the `ID3D11Device` interface, or the `VADisplay` handle, and returns a smart pointer to the `RemoteContext` plugin object. If you do not provide any user context, the plugin uses its default internal context. The plugin attempts to use the same internal context object as long as plugin options are kept the same. -Therefore, all ExecutableNetwork objects created during this time share the same context. +Therefore, all ExecutableNetwork objects created during this time share the same context. Once the plugin options are changed, the internal context is replaced by the new one. -To request the current default context of the plugin, call the `GetDefaultContext()` method of the core engine. +To request the current default context of the plugin, call the `GetDefaultContext()` method of the core engine. To request the internal context of the given `ExecutableNetwork`, use the `GetContext()` method. ## Shared Blob User-Side Wrappers -The classes that implement the `RemoteBlob` interface are both wrappers for native API -memory handles (which can be obtained from them at any time) and act just like regular OpenVINO™ +The classes that implement the `RemoteBlob` interface are both wrappers for native API +memory handles (which can be obtained from them at any time) and act just like regular OpenVINO™ `Blob` objects. 
-Once you obtain the context, you can use it to compile a new `ExecutableNetwork` or create `RemoteBlob`
+Once you obtain the context, you can use it to compile a new `ExecutableNetwork` or create `RemoteBlob`
objects.
-For network compilation, use a dedicated flavor of `LoadNetwork()`, which accepts the context as an
+For network compilation, use a dedicated flavor of `LoadNetwork()`, which accepts the context as an
additional parameter.

-To create a shared blob from a native memory handle, use `make_shared_blob()` overloaded functions
+To create a shared blob from a native memory handle, use `make_shared_blob()` overloaded functions
that can accept the `cl::Buffer`, `cl::Image2D`, `cl_mem` handles, and either `ID3D11Buffer`,
-`ID3D11Texture2D` pointers or the `VASurfaceID` handle.
-All `make_shared_blob()` flavors return a smart pointer to the `Blob` object, which can be directly
+`ID3D11Texture2D` pointers or the `VASurfaceID` handle.
+All `make_shared_blob()` flavors return a smart pointer to the `Blob` object, which can be directly
passed to the `SetBlob() `method of an inference request object.

## Direct NV12 video surface input

-To support the direct consumption of a hardware video decoder output, plugin accepts two-plane video
-surfaces as arguments for the `make_shared_blob_nv12()` function, which creates an `NV12Blob` object
+To support the direct consumption of a hardware video decoder output, plugin accepts two-plane video
+surfaces as arguments for the `make_shared_blob_nv12()` function, which creates an `NV12Blob` object
and returns a smart pointer to it, which is cast to `Blob::Ptr`.

-To ensure that the plugin generates the correct execution graph for the NV12 dual-plane input, set
+To ensure that the plugin generates the correct execution graph for the NV12 dual-plane input, set
the `CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS` plugin configuration flag to `PluginConfigParams::YES`.

+## Context & queue sharing
+
+The GPU plugin supports creation of a shared context from a `cl_command_queue` handle. In that case,
+the OpenCL context handle is extracted from the given queue via the OpenCL™ API, and the queue itself is used inside
+the plugin for further execution of inference primitives. Sharing the queue changes the behavior of the `StartAsync()`
+method: it guarantees that submission of inference primitives into the given queue has finished before
+control is returned to the calling thread.
+
+This sharing mechanism allows pipeline synchronization to be done on the application side and avoids blocking the host
+thread while waiting for inference to complete. The following pseudocode illustrates this flow:
+
+@snippet snippets/GPU_RemoteBlob_API3.cpp part0
+
+### Limitations
+
+ - Some primitives in the GPU plugin may block the host thread while waiting for previous primitives before adding their kernels
+ to the command queue. In such cases the `StartAsync()` call takes much longer to return control to the calling thread,
+ as it internally waits for partial or full network completion.
+ Examples of such operations: Loop, TensorIterator, DetectionOutput, NonMaxSuppression.
+ - Synchronization of pre-/post-processing jobs and the inference pipeline inside the shared queue is the user's responsibility.
+ - Throughput mode is not available when queue sharing is used, i.e. only a single stream can be used for each executable network.
+
## Low-Level Methods and Their Parameter Description

-The high-level wrappers above bring a direct dependency on native APIs to the user program.
-If you want to avoid the dependency, you still can directly use the `CreateContext()`,
+If you want to avoid the dependency, you still can directly use the `CreateContext()`,
`CreateBlob()`, and `getParams()` methods.
-On this level, native handles are re-interpreted as void pointers and all arguments are passed
+On this level, native handles are re-interpreted as void pointers and all arguments are passed
using `std::map` containers that are filled with `std::string, InferenceEngine::Parameter` pairs.
Two types of map entries are possible: descriptor and container. The first map entry is a
descriptor, which sets the expected structure and possible parameter values of the map.
@@ -84,6 +106,7 @@ descriptor, which sets the expected structure and possible parameter values of
|----------------|---------------------------------------------------------------------|
| `CONTEXT_TYPE` | Describes the type of the shared context in a map. Can be `OCL` (for pure OpenCL context) or `VA_SHARED` (for context shared with a video decoding device). |
| `OCL_CONTEXT` | Contains the OpenCL context handle. |
+| `OCL_QUEUE` | Contains the OpenCL queue handle if queue sharing is needed. |
| `VA_DEVICE` | Contains the native video decoding device handle. Can be `VADisplay` or `ID3D11Device` (a pointer). |
| `SHARED_MEM_TYPE` | Describes the type of the shared memory buffer in a map. Can be `OCL_BUFFER` (clBuffer), `OCL_IMAGE2D` (clImage2D), `VA_SURFACE()`, or `DX_BUFFER`. |
| `MEM_HANDLE` | Contains the OpenCL memory handle. |
diff --git a/docs/snippets/CMakeLists.txt b/docs/snippets/CMakeLists.txt
index 9edc3e4f327..9bc067478eb 100644
--- a/docs/snippets/CMakeLists.txt
+++ b/docs/snippets/CMakeLists.txt
@@ -11,7 +11,8 @@ file(GLOB SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
 if(NOT CLDNN__IOCL_ICD_INCDIRS OR TRUE)
     list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API0.cpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API1.cpp"
-        "${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API2.cpp")
+        "${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API2.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API3.cpp")
 endif()

 # remove OpenCV related sources
diff --git a/docs/snippets/GPU_RemoteBlob_API3.cpp b/docs/snippets/GPU_RemoteBlob_API3.cpp
new file mode 100644
index 00000000000..6e374425285
--- /dev/null
+++ b/docs/snippets/GPU_RemoteBlob_API3.cpp
@@ -0,0 +1,76 @@
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+
+#include <inference_engine.hpp>
+#include <gpu/gpu_context_api_ocl.hpp>
+#include <CL/cl2.hpp>
+
+
+int main() {
+using namespace InferenceEngine;
+//! [part0]
+
+
+// ...
+
+
+// initialize the core and read the network
+InferenceEngine::Core ie;
+auto net = ie.ReadNetwork("network.xml");
+
+// initialize OpenCL context and create queue
+cl::Context ctx = get_my_OpenCL_context();
+cl::CommandQueue queue = get_my_OpenCL_queue();
+
+// share the queue with GPU plugin and compile ExecutableNetwork
+auto remote_context = gpu::make_shared_context(ie, "GPU", queue.get());
+auto exec_net_shared = ie.LoadNetwork(net, remote_context);
+
+// create the OpenCL buffers within the context
+cl::Buffer shared_in_buffer(ctx, CL_MEM_READ_WRITE, image_size * num_channels, NULL, &err);
+cl::Buffer shared_out_buffer(ctx, CL_MEM_READ_WRITE, image_size * num_channels, NULL, &err);
+// wrap in and out buffers into RemoteBlob and set them to infer request
+auto shared_in_blob = gpu::make_shared_blob(input_info->getTensorDesc(), remote_context, shared_in_buffer);
+auto shared_out_blob = gpu::make_shared_blob(out_data->getTensorDesc(), remote_context, shared_out_buffer);
+auto infer_request = exec_net_shared.CreateInferRequest();
+infer_request.SetBlob(input_name, shared_in_blob);
+infer_request.SetBlob(output_name, shared_out_blob);
+
+// ...
+// execute user kernel
+cl::Kernel kernel_preproc(program, kernel_name_preproc.c_str());
+kernel_preproc.setArg(0, shared_in_buffer);
+queue.enqueueNDRangeKernel(kernel_preproc,
+                           cl::NDRange(0),
+                           cl::NDRange(image_size),
+                           cl::NDRange(1),
+                           nullptr, // wait events *
+                           &profileEvent);
+// Blocking clFinish() call is not required, but this barrier is added to the queue to guarantee that user kernel is finished
+// before any inference primitive is started
+queue.enqueueBarrierWithWaitList(nullptr, nullptr);
+// ...
+
+// pass results to the inference
+// since the remote context is created with queue sharing, StartAsync() guarantees that scheduling is finished
+infer_request.StartAsync();
+
+// execute some postprocessing kernel.
+// infer_request.Wait() is not called, synchronization between inference and post-processing is done via
+// enqueueBarrierWithWaitList call.
+cl::Kernel kernel_postproc(program, kernel_name_postproc.c_str());
+kernel_postproc.setArg(0, shared_out_buffer);
+queue.enqueueBarrierWithWaitList(nullptr, nullptr);
+queue.enqueueNDRangeKernel(kernel_postproc,
+                           cl::NDRange(0),
+                           cl::NDRange(image_size),
+                           cl::NDRange(1),
+                           nullptr, // wait events *
+                           &profileEvent);
+
+// Wait for pipeline completion
+queue.finish();
+//! 
[part0] + +return 0; +} diff --git a/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp index 41bb9bf1576..9e69ddeb0c8 100644 --- a/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp @@ -13,12 +13,14 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ : AsyncInferRequestThreadSafeDefault(inferRequest, taskExecutor, callbackExecutor), _inferRequest(inferRequest), _waitExecutor(waitExecutor) { _pipeline = {}; - _pipeline.push_back({taskExecutor, - [this] { - OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::PreprocessingAndStartPipeline"); - _inferRequest->preprocess(); - _inferRequest->enqueue(); - } }); + if (!_inferRequest->use_external_queue()) { + _pipeline.push_back({taskExecutor, + [this] { + OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::PreprocessingAndStartPipeline"); + _inferRequest->preprocess(); + _inferRequest->enqueue(); + } }); + } _pipeline.push_back({_waitExecutor, [this] { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline"); @@ -26,6 +28,22 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ }}); } +void CLDNNPlugin::CLDNNAsyncInferRequest::Infer_ThreadUnsafe() { + if (_inferRequest->use_external_queue()) { + _inferRequest->preprocess(); + _inferRequest->enqueue(); + } + Parent::Infer_ThreadUnsafe(); +} + +void CLDNNPlugin::CLDNNAsyncInferRequest::StartAsync_ThreadUnsafe() { + if (_inferRequest->use_external_queue()) { + _inferRequest->preprocess(); + _inferRequest->enqueue(); + } + Parent::StartAsync_ThreadUnsafe(); +} + CLDNNPlugin::CLDNNAsyncInferRequest::~CLDNNAsyncInferRequest() { StopAndWait(); } diff --git a/inference-engine/src/cldnn_engine/cldnn_async_infer_request.h b/inference-engine/src/cldnn_engine/cldnn_async_infer_request.h index afed98b5f86..d9d90d1db47 100644 --- a/inference-engine/src/cldnn_engine/cldnn_async_infer_request.h +++ b/inference-engine/src/cldnn_engine/cldnn_async_infer_request.h @@ -21,6 +21,9 @@ public: ~CLDNNAsyncInferRequest(); + void Infer_ThreadUnsafe() override; + void StartAsync_ThreadUnsafe() override; + private: CLDNNInferRequest::Ptr _inferRequest; InferenceEngine::ITaskExecutor::Ptr _waitExecutor; diff --git a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp index 79f951dd2e1..7a2157292f7 100644 --- a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp @@ -89,6 +89,10 @@ IInferRequestInternal::Ptr CLDNNExecNetwork::CreateInferRequestImpl(const std::v } if (m_config.useProfiling) ptr->EnableProfiling(); + + if (m_graphs.front()->use_external_queue()) { + ptr->enable_external_queue(); + } ptr->SetGraph(m_graphs.front()); return ptr; diff --git a/inference-engine/src/cldnn_engine/cldnn_graph.cpp b/inference-engine/src/cldnn_engine/cldnn_graph.cpp index 2f78f6f7d00..0727747030f 100644 --- a/inference-engine/src/cldnn_engine/cldnn_graph.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_graph.cpp @@ -94,9 +94,26 @@ void CLDNNGraph::Build() { } } +bool CLDNNGraph::use_external_queue() const { + auto impl = getContextImpl(m_context); + return impl->GetExternalQueue() != nullptr; +} + std::shared_ptr CLDNNGraph::BuildNetwork(std::shared_ptr program) { 
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::BuildNetwork"); - auto network = std::make_shared(program, m_stream_id); + std::shared_ptr network = nullptr; + + auto impl = getContextImpl(m_context); + auto externalQueue = impl->GetExternalQueue(); + if (externalQueue) { + if (m_config.throughput_streams != 1) + IE_THROW(ParameterMismatch) << "Throughput streams can't be used with shared queue!\n"; + auto &engine = m_program->GetEngine(); + network = std::make_shared(program, engine.create_stream(externalQueue), m_stream_id); + } else { + network = std::make_shared(program, m_stream_id); + } + if (!m_config.graph_dumps_dir.empty() && m_stream_id == 0) { static int net_id = 0; diff --git a/inference-engine/src/cldnn_engine/cldnn_graph.h b/inference-engine/src/cldnn_engine/cldnn_graph.h index ee0cf0befc1..d65d4183d90 100644 --- a/inference-engine/src/cldnn_engine/cldnn_graph.h +++ b/inference-engine/src/cldnn_engine/cldnn_graph.h @@ -71,6 +71,8 @@ public: m_cv.notify_one(); } + bool use_external_queue() const; + protected: uint32_t m_state; std::condition_variable m_cv; diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp index 6577fce8f01..1510562a384 100644 --- a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp @@ -452,7 +452,8 @@ CLDNNInferRequest::CLDNNInferRequest(InputsDataMap networkInputs, OutputsDataMap const CLDNNExecNetwork::Ptr& execNetwork) : IInferRequestInternal(networkInputs, networkOutputs) , m_useProfiling(false) - , m_useStreams(false) { + , m_useStreams(false) + , m_useExternalQueue(false) { IE_ASSERT(nullptr != execNetwork); streamExecutor = dynamic_cast(execNetwork->m_taskExecutor.get()); } diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.h b/inference-engine/src/cldnn_engine/cldnn_infer_request.h index c90836f65b0..210adc46be8 100644 --- a/inference-engine/src/cldnn_engine/cldnn_infer_request.h +++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.h @@ -57,6 +57,9 @@ public: void enqueue_dynamic(); void wait_dynamic(); + bool use_external_queue() const { return m_useExternalQueue; } + void enable_external_queue() { m_useExternalQueue = true; } + private: InferenceEngine::BlobMap _deviceOutputs; std::map inputsMap; @@ -64,6 +67,7 @@ private: bool m_useProfiling; bool m_useStreams; + bool m_useExternalQueue; std::shared_ptr m_graph; // dynamic batch stuff diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp index 11bfbb1e91d..dbfd41170c2 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp @@ -199,6 +199,7 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptrsecond.as(); } + + if (params.find(GPU_PARAM_KEY(OCL_QUEUE)) != params.end()) + m_external_queue = _ObjFromParamSimple(params, GPU_PARAM_KEY(OCL_QUEUE)); } // TODO: Parameterize this based on plugin config and compilation options diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.h b/inference-engine/src/cldnn_engine/cldnn_remote_context.h index f93193a1543..19c24540994 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.h +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.h @@ -224,6 +224,7 @@ public: std::shared_ptr GetEngine() const { return m_engine; } Config& GetConfig() { return m_config; } 
ContextType GetType() const { return m_type; } + InferenceEngine::gpu_handle_param GetExternalQueue() const { return m_external_queue; } const std::weak_ptr GetPlugin() const { return m_plugin; } void acquire_lock() { @@ -238,6 +239,7 @@ protected: // TODO: refactor to unique_ptr std::shared_ptr m_engine; InferenceEngine::gpu_handle_param m_va_display; + InferenceEngine::gpu_handle_param m_external_queue; Config m_config; ContextType m_type; diff --git a/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp b/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp index 0033bc48879..95682bbfc65 100644 --- a/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp +++ b/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp @@ -232,6 +232,25 @@ static inline RemoteContext::Ptr make_shared_context(Core& core, return core.CreateContext(deviceName, contextParams); } +/** + * @brief This function is used to obtain remote context object from user-supplied OpenCL context handle + * @param core A reference to Inference Engine Core object + * @param deviceName A name of device to create a remote context for + * @param queue An OpenCL queue to be used to create shared remote context. Queue will be reused inside the plugin. + * @note Only latency mode is supported for such context sharing case. + * @return A shared remote context instance + */ +static inline RemoteContext::Ptr make_shared_context(Core& core, std::string deviceName, cl_command_queue queue) { + cl_context ctx; + auto res = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); + if (res != CL_SUCCESS) + IE_THROW() << "Can't get context from given opencl queue"; + ParamMap contextParams = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)}, + {GPU_PARAM_KEY(OCL_CONTEXT), static_cast(ctx)}, + {GPU_PARAM_KEY(OCL_QUEUE), static_cast(queue)}}; + return core.CreateContext(deviceName, contextParams); +} + /** * @brief This function is used to create remote blob object within default GPU plugin OpenCL context * @param desc A tensor descriptor object representing remote blob configuration diff --git a/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp b/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp index e647c8e26ca..2ceff349214 100644 --- a/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp +++ b/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp @@ -68,6 +68,11 @@ DECLARE_GPU_PARAM_KEY(OCL_CONTEXT, gpu_handle_param); */ DECLARE_GPU_PARAM_KEY(TILE_ID, int); +/** + * @brief This key identifies OpenCL queue handle in a shared context + */ +DECLARE_GPU_PARAM_KEY(OCL_QUEUE, gpu_handle_param); + /** * @brief This key identifies video acceleration device/display handle * in a shared context or shared memory blob parameter map diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/dx.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/dx.hpp index 817afdd5234..b43f1a780ae 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/dx.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/dx.hpp @@ -94,6 +94,7 @@ public: */ class D3DContext : public ClContext { using RemoteContext::create_tensor; + static constexpr const char* device_name = "GPU"; public: /** @@ -117,16 +118,15 @@ public: /** * @brief Constructs D3DContext remote context object from ID3D11Device * @param core 
OpenVINO Runtime Core object instance - * @param deviceName A name of to create a remote context for * @param device A pointer to ID3D11Device to be used to create a remote context */ - D3DContext(Core& core, std::string deviceName, ID3D11Device* device) { + D3DContext(Core& core, ID3D11Device* device) { // clang-format off ParamMap context_params = { {GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(VA_SHARED)}, {GPU_PARAM_KEY(VA_DEVICE), static_cast(device)} }; - *this = core.create_context(deviceName, context_params); + *this = core.create_context(device_name, context_params); } /** diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl.hpp index 6ca2f74bc6a..24ccf2ae9af 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl.hpp @@ -124,6 +124,7 @@ public: */ class ClContext : public RemoteContext { using RemoteContext::create_tensor; + static constexpr const char* device_name = "GPU"; public: /** @@ -139,13 +140,29 @@ public: /** * @brief Constructs context object from user-supplied OpenCL context handle * @param core A reference to OpenVINO Runtime Core object - * @param deviceName A name of device to create a remote context for * @param ctx A OpenCL context to be used to create shared remote context */ - ClContext(Core& core, std::string deviceName, cl_context ctx) { + ClContext(Core& core, cl_context ctx) { ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)}, {GPU_PARAM_KEY(OCL_CONTEXT), static_cast(ctx)}}; - *this = core.create_context(deviceName, context_params); + *this = core.create_context(device_name, context_params); + } + + /** + * @brief Constructs context object from user-supplied OpenCL context handle + * @param core A reference to OpenVINO Runtime Core object + * @param queue An OpenCL queue to be used to create shared remote context. Queue will be reused inside the plugin. + * @note Only latency mode is supported for such context sharing case. 
+ */ + ClContext(Core& core, cl_command_queue queue) { + cl_context ctx; + auto res = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); + if (res != CL_SUCCESS) + IE_THROW() << "Can't get context from given opencl queue"; + ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)}, + {GPU_PARAM_KEY(OCL_CONTEXT), static_cast(ctx)}, + {GPU_PARAM_KEY(OCL_QUEUE), static_cast(queue)}}; + *this = core.create_context(device_name, context_params); } /** diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/va.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/va.hpp index 83542558138..fc82d0f5990 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/va.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/va.hpp @@ -68,6 +68,7 @@ public: */ class VAContext : public ClContext { using RemoteContext::create_tensor; + static constexpr const char* device_name = "GPU"; public: /** @@ -91,13 +92,12 @@ public: /** * @brief Constructs remote context object from VA display handle * @param core OpenVINO Runtime Core object - * @param deviceName A device name to create a remote context for * @param device A `VADisplay` to create remote context from */ - VAContext(Core& core, std::string deviceName, VADisplay device) { + VAContext(Core& core, VADisplay device) { ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(VA_SHARED)}, {GPU_PARAM_KEY(VA_DEVICE), static_cast(device)}}; - *this = core.create_context(deviceName, context_params); + *this = core.create_context(device_name, context_params); } /** diff --git a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp index f2a447b30db..968fa18d40f 100644 --- a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp +++ b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp @@ -123,6 +123,186 @@ TEST_F(RemoteBlob_Test, smoke_canInferOnUserContext) { } } +TEST_F(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) { +#if defined _WIN32 + GTEST_SKIP(); +#endif + auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat(); + CNNNetwork net(fn_ptr); + + net.getInputsInfo().begin()->second->setLayout(Layout::NCHW); + net.getInputsInfo().begin()->second->setPrecision(Precision::U8); + + auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc()); + + auto ie = PluginCache::get().ie(); + auto exec_net_regular = ie->LoadNetwork(net, CommonTestUtils::DEVICE_GPU); + + // regular inference + auto inf_req_regular = exec_net_regular.CreateInferRequest(); + auto fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc()); + inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData); + + inf_req_regular.Infer(); + auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first); + + // inference using remote blob + auto ocl_instance = std::make_shared(); + cl_int err; + + auto in_desc = net.getInputsInfo().begin()->second->getTensorDesc(); + auto out_desc = net.getOutputsInfo().begin()->second->getTensorDesc(); + auto in_dims = in_desc.getDims(); + auto out_dims = out_desc.getDims(); + size_t in_size = in_dims[1] * in_dims[2] * in_dims[3]; + size_t out_size = out_dims[1] * 
out_dims[2] * out_dims[3] * sizeof(float); + + // In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases) + // without calling thread blocks + auto remote_context = make_shared_context(*ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_queue.get()); + auto exec_net_shared = ie->LoadNetwork(net, remote_context); + auto inf_req_shared = exec_net_shared.CreateInferRequest(); + + // Allocate shared buffers for input and output data which will be set to infer request + cl::Buffer shared_input_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, in_size, NULL, &err); + cl::Buffer shared_output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err); + // Allocate output buffer where inference result will be put as a post-processing step + cl::Buffer output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err); + + // Wrap buffers above with IE blobs + Blob::Ptr shared_input_blob = make_shared_blob(in_desc, remote_context, shared_input_buffer); + Blob::Ptr shared_output_blob = make_shared_blob(out_desc, remote_context, shared_output_buffer); + Blob::Ptr output_blob = make_shared_blob(out_desc, remote_context, output_buffer); + // Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl + // TODO: Why do we need to call it explicitly? Consider doing it internally + output_blob->allocate(); + + // Pass shared blobs to infer request + inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_input_blob); + inf_req_shared.SetBlob(net.getOutputsInfo().begin()->first, shared_output_blob); + + // 1. Pre-processing. Enqueue non-blocking copy from host ptr to shared device input buffer and barrier to ensure that copy is finished before + // inference primitives starts execution + { + void *buffer = fakeImageData->buffer(); + ocl_instance->_queue.enqueueWriteBuffer(shared_input_buffer, false, 0, in_size, buffer); + ocl_instance->_queue.enqueueBarrierWithWaitList(nullptr, nullptr); + } + + // 2. Enqueue inference primitives. With shared queue this call ensures that all kernels are scheduled to the corresponding queue + // before giving the control back + inf_req_shared.StartAsync(); + + // 3. Post-processing. Enqueue copy from shared blob with inference result to another output blob + // Enqueue barrier with empty wait list is needed to ensure that previous kernels are finished before copying the data. It's needed here since we + // create OOO queue. + // Note: inf_req_shared.Wait() can be dropped in some cases, but if plugin-side post-processing is required, + // then the result may be incorrect without Wait(). + { + ocl_instance->_queue.enqueueBarrierWithWaitList(nullptr, nullptr); + ocl_instance->_queue.enqueueCopyBuffer(shared_output_buffer, output_buffer, 0, 0, output_blob->byteSize()); + } + + // 4. 
Wait for infer request and post-processing completion + ocl_instance->_queue.finish(); + + // compare results + { + ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32); + ASSERT_EQ(outputBlob_regular->size(), output_blob->size()); + auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32); + FuncTestUtils::compareBlobs(outputBlob_regular, output_blob, thr); + } +} + +TEST_F(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) { +#if defined _WIN32 + GTEST_SKIP(); +#endif + auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat(); + CNNNetwork net(fn_ptr); + + net.getInputsInfo().begin()->second->setLayout(Layout::NCHW); + net.getInputsInfo().begin()->second->setPrecision(Precision::U8); + + auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc()); + + auto ie = PluginCache::get().ie(); + auto exec_net_regular = ie->LoadNetwork(net, CommonTestUtils::DEVICE_GPU); + + // regular inference + auto inf_req_regular = exec_net_regular.CreateInferRequest(); + auto fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc()); + inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData); + + inf_req_regular.Infer(); + auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first); + + // inference using remote blob + auto ocl_instance = std::make_shared(); + ocl_instance->_queue = cl::CommandQueue(ocl_instance->_context, ocl_instance->_device); + cl_int err; + + auto in_desc = net.getInputsInfo().begin()->second->getTensorDesc(); + auto out_desc = net.getOutputsInfo().begin()->second->getTensorDesc(); + auto in_dims = in_desc.getDims(); + auto out_dims = out_desc.getDims(); + size_t in_size = in_dims[1] * in_dims[2] * in_dims[3]; + size_t out_size = out_dims[1] * out_dims[2] * out_dims[3] * sizeof(float); + + // In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases) + // without calling thread blocks + auto remote_context = make_shared_context(*ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_queue.get()); + auto exec_net_shared = ie->LoadNetwork(net, remote_context); + auto inf_req_shared = exec_net_shared.CreateInferRequest(); + + // Allocate shared buffers for input and output data which will be set to infer request + cl::Buffer shared_input_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, in_size, NULL, &err); + cl::Buffer shared_output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err); + // Allocate output buffer where inference result will be put as a post-processing step + cl::Buffer output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err); + + // Wrap buffers above with IE blobs + Blob::Ptr shared_input_blob = make_shared_blob(in_desc, remote_context, shared_input_buffer); + Blob::Ptr shared_output_blob = make_shared_blob(out_desc, remote_context, shared_output_buffer); + Blob::Ptr output_blob = make_shared_blob(out_desc, remote_context, output_buffer); + // Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl + // TODO: Why do we need to call it explicitly? 
Consider doing it internally + output_blob->allocate(); + + // Pass shared blobs to infer request + inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_input_blob); + inf_req_shared.SetBlob(net.getOutputsInfo().begin()->first, shared_output_blob); + + // 1. Pre-processing. Enqueue non-blocking copy from host ptr to shared device input buffer + { + void *buffer = fakeImageData->buffer(); + ocl_instance->_queue.enqueueWriteBuffer(shared_input_buffer, false, 0, in_size, buffer); + } + + // 2. Enqueue inference primitives. With shared queue this call ensures that all kernels are scheduled to the corresponding queue + // before giving the control back + inf_req_shared.StartAsync(); + + // 3. Post-processing. Enqueue copy from shared blob with inference result to another output blob + // Note: inf_req_shared.Wait() can be dropped in some cases, but if plugin-side post-processing is required, + // then the result may be incorrect without Wait(). + { + ocl_instance->_queue.enqueueCopyBuffer(shared_output_buffer, output_buffer, 0, 0, output_blob->byteSize()); + } + + // 4. Wait for infer request and post-processing completion + ocl_instance->_queue.finish(); + + // compare results + { + ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32); + ASSERT_EQ(outputBlob_regular->size(), output_blob->size()); + auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32); + FuncTestUtils::compareBlobs(outputBlob_regular, output_blob, thr); + } +} + class BatchedBlob_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface { void SetUp() override { num_batch = this->GetParam(); diff --git a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_tensor_tests.cpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_tensor_tests.cpp index 5532d6e2006..21bf8d8988c 100644 --- a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_tensor_tests.cpp +++ b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_tensor_tests.cpp @@ -113,7 +113,7 @@ TEST_F(OVRemoteTensor_Test, DISABLED_smoke_canInferOnUserContext) { // inference using remote tensor auto ocl_instance = std::make_shared(); - auto remote_context = ov::runtime::gpu::ClContext(ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_context.get()); + auto remote_context = ov::runtime::gpu::ClContext(ie, ocl_instance->_context.get()); auto exec_net_shared = ie.compile_model(function, remote_context); auto inf_req_shared = exec_net_shared.create_infer_request(); inf_req_shared.set_tensor(input->get_friendly_name(), fakeImageData); diff --git a/inference-engine/thirdparty/clDNN/api/cldnn/graph/network.hpp b/inference-engine/thirdparty/clDNN/api/cldnn/graph/network.hpp index 53b9cad0fea..bbc41b65f13 100644 --- a/inference-engine/thirdparty/clDNN/api/cldnn/graph/network.hpp +++ b/inference-engine/thirdparty/clDNN/api/cldnn/graph/network.hpp @@ -62,8 +62,9 @@ public: const build_options& options, bool is_internal); - network(program::ptr program, - uint16_t stream_id = 0); + network(program::ptr program, uint16_t stream_id = 0); + + network(program::ptr program, stream::ptr stream, uint16_t stream_id); ~network(); diff --git a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp index 6f9ebf75e41..c4db9e19387 100644 --- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp +++ 
b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp @@ -125,6 +125,9 @@ public: /// Create stream object for current engine virtual stream_ptr create_stream() const = 0; + /// Creates stream object from user handle + virtual stream_ptr create_stream(void *handle) const = 0; + /// Returns service stream which can be used during program build and optimizations virtual stream& get_program_stream() const = 0; diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp index 0cf2964819a..fa5aa273649 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp @@ -213,6 +213,10 @@ stream::ptr ocl_engine::create_stream() const { return std::make_shared(*this); } +stream::ptr ocl_engine::create_stream(void* handle) const { + return std::make_shared(*this, handle); +} + stream& ocl_engine::get_program_stream() const { return *_program_stream; } diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.hpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.hpp index 67995d65c7b..f91f3e13ff1 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.hpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.hpp @@ -40,6 +40,7 @@ public: bool extension_supported(std::string extension) const; stream_ptr create_stream() const override; + stream_ptr create_stream(void *handle) const override; stream& get_program_stream() const override; #ifdef ENABLE_ONEDNN_FOR_GPU diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_stream.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_stream.cpp index 0d279001ffd..6b6c5e02122 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_stream.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_stream.cpp @@ -253,9 +253,18 @@ void set_arguments_impl(ocl_kernel_type& kernel, } } } + +sync_methods get_expected_sync_method(const engine_configuration &config) { + return config.enable_profiling ? sync_methods::events : config.queue_type == queue_types::out_of_order ? sync_methods::barriers + : sync_methods::none; +} + } // namespace -ocl_stream::ocl_stream(const ocl_engine& engine) : stream(engine.configuration().queue_type), _engine(engine) { +ocl_stream::ocl_stream(const ocl_engine &engine) + : stream(engine.configuration().queue_type) + , _engine(engine) + , sync_method(get_expected_sync_method(engine.configuration())) { auto context = engine.get_cl_context(); auto device = engine.get_cl_device(); auto config = engine.configuration(); @@ -263,9 +272,6 @@ ocl_stream::ocl_stream(const ocl_engine& engine) : stream(engine.configuration() queue_builder.set_profiling(config.enable_profiling); queue_builder.set_out_of_order((config.queue_type == queue_types::out_of_order)); - sync_method = _engine.configuration().enable_profiling ? sync_methods::events : - config.queue_type == queue_types::out_of_order ? 
sync_methods::barriers : sync_methods::none; - if (sync_method == sync_methods::none && config.queue_type == queue_types::out_of_order) { throw std::runtime_error("[CLDNN] Unexpected sync method (none) is specified for out_of_order queue"); } @@ -288,6 +294,22 @@ ocl_stream::ocl_stream(const ocl_engine& engine) : stream(engine.configuration() #endif } +ocl_stream::ocl_stream(const ocl_engine &engine, void *handle) + : stream(engine.configuration().queue_type) + , _engine(engine) + , sync_method(get_expected_sync_method(engine.configuration())) { + auto casted_handle = static_cast(handle); + _command_queue = ocl_queue_type(casted_handle, true); + +#ifdef ENABLE_ONEDNN_FOR_GPU + auto config = engine.configuration(); + if (config.queue_type == queue_types::in_order) { + auto onednn_engine = engine.get_onednn_engine(); + _onednn_stream = std::make_shared(dnnl::ocl_interop::make_stream(engine.get_onednn_engine(), _command_queue.get())); + } +#endif +} + #ifdef ENABLE_ONEDNN_FOR_GPU dnnl::stream& ocl_stream::get_onednn_stream() { if (!_onednn_stream) diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_stream.hpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_stream.hpp index 1bfd294bbbc..dc354d3267a 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_stream.hpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_stream.hpp @@ -50,6 +50,7 @@ public: const ocl_queue_type& get_cl_queue() const { return _command_queue; } explicit ocl_stream(const ocl_engine& engine); + ocl_stream(const ocl_engine &engine, void *handle); ocl_stream(ocl_stream&& other) : stream(other._engine.configuration().queue_type) , _engine(other._engine) diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp index d7cf8870155..a4cf00c4225 100644 --- a/inference-engine/thirdparty/clDNN/src/network.cpp +++ b/inference-engine/thirdparty/clDNN/src/network.cpp @@ -234,7 +234,10 @@ network::network(engine& engine, : network(program::build_program(engine, nodes, options, is_internal), engine.create_stream(), is_internal) {} network::network(program::ptr program, uint16_t stream_id) - : network(program, program->get_engine().create_stream(), false, stream_id ==0) {} + : network(program, program->get_engine().create_stream(), false, stream_id == 0) {} + +network::network(program::ptr program, stream::ptr stream, uint16_t stream_id) + : network(program, stream, false, stream_id == 0) {} network::~network() { _memory_pool->clear_pool_for_network(net_id);
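
For reference, the sketch below shows how the queue-sharing constructor added to the OV 2.0 headers by this patch (`ov::runtime::gpu::ClContext(core, queue)`) might be used end to end. It is a minimal illustration and not part of the patch: `get_my_OpenCL_queue()`, the model path, and the exact `read_model()`/`start_async()` names are placeholders or assumptions based on the 2022.1-era `ov::runtime` API; only the `ClContext` constructor, `compile_model(model, context)`, and `create_infer_request()` are taken from the changes above.

```cpp
// Minimal usage sketch (not part of the patch) for the queue-sharing ClContext constructor.
// Assumptions: an application-provided in-order queue from get_my_OpenCL_queue() (placeholder),
// a model file "model.xml" (placeholder), and the 2022.1-era ov::runtime API surface.
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120

#include <CL/cl2.hpp>
#include <openvino/runtime/core.hpp>
#include <openvino/runtime/gpu/ocl.hpp>

cl::CommandQueue get_my_OpenCL_queue();  // placeholder: supplied by the application

int main() {
    ov::runtime::Core core;
    auto model = core.read_model("model.xml");

    // The application owns the queue; the plugin will submit its inference kernels into it.
    cl::CommandQueue queue = get_my_OpenCL_queue();

    // The constructor added by this patch extracts the cl_context from the queue via
    // clGetCommandQueueInfo and keeps the queue itself for execution.
    ov::runtime::gpu::ClContext remote_context(core, queue.get());

    // Throughput streams are not available with a shared queue (see the Limitations section).
    auto compiled_model = core.compile_model(model, remote_context);
    auto infer_request = compiled_model.create_infer_request();

    // ... enqueue user pre-processing kernels into `queue` here ...

    // With a shared queue, start_async() returns once all inference primitives have been
    // submitted into `queue`, without waiting for them to finish on the device.
    infer_request.start_async();

    // ... enqueue user post-processing kernels into `queue` here ...

    // Single host-side synchronization point for the whole pipeline.
    queue.finish();
    return 0;
}
```

The flow mirrors the IE-API snippet added in `docs/snippets/GPU_RemoteBlob_API3.cpp`: schedule pre-processing, submit inference into the shared queue, schedule post-processing, and synchronize the whole pipeline once with `queue.finish()`.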