[GPU] Extended remote context to accept user queue (#6235)

* [GPU] extended remote context to accept user queues for each stream

* [GPU] OV2.0 API for queue sharing. Removed deviceName arg for context creation
Vladimir Paramuzov 2021-10-21 10:45:25 +03:00 committed by GitHub
parent f8439eeed8
commit 6f754052cf
26 changed files with 467 additions and 56 deletions


@ -68,6 +68,28 @@ and returns a smart pointer to it, which is cast to `Blob::Ptr`.
To ensure that the plugin generates the correct execution graph for the NV12 dual-plane input, set
the `CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS` plugin configuration flag to `PluginConfigParams::YES`.
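For example, the flag can be passed through the `LoadNetwork` configuration map (a minimal sketch; the model path is a placeholder, and the key and value constants are the ones named above):

```cpp
using namespace InferenceEngine;
Core ie;
auto net = ie.ReadNetwork("model.xml");  // placeholder model path
// Ask the plugin to treat the NV12 input as two surfaces (Y and UV planes)
auto exec_net = ie.LoadNetwork(net, "GPU",
    {{CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, PluginConfigParams::YES}});
```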
## Context & queue sharing
GPU plugin supports creation of a shared context from a `cl_command_queue` handle. In that case,
the OpenCL context handle is extracted from the given queue via the OpenCL™ API, and the queue itself is used inside
the plugin for further execution of inference primitives. Sharing the queue changes the behavior of the `StartAsync()`
method: it guarantees that submission of inference primitives into the given queue is finished before
control is returned to the calling thread.
This sharing mechanism allows pipeline synchronization to be done on the application side and avoids blocking the host thread
while waiting for inference completion. Pseudocode may look as follows:
@snippet snippets/GPU_RemoteBlob_API3.cpp part0
### Limitations
- Some primitives in the GPU plugin may block the host thread while waiting for previous primitives before adding their kernels
to the command queue. In such cases the `StartAsync()` call takes much more time to return control to the calling thread,
as internally it waits for partial or full network completion.
Examples of such operations: Loop, TensorIterator, DetectionOutput, NonMaxSuppression
- Synchronization of pre/post-processing jobs and the inference pipeline inside the shared queue is the user's responsibility
- Throughput mode is not available when queue sharing is used, i.e. only a single stream can be used for each executable network (see the sketch below)
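For instance, the single-stream limitation shows up as a `LoadNetwork` failure when multiple streams are requested for a queue-shared context (a sketch continuing the pseudocode above; the `GPU_THROUGHPUT_STREAMS` key name is assumed from the public GPU plugin configuration):

```cpp
auto remote_context = gpu::make_shared_context(ie, "GPU", queue.get());
try {
    // Requesting more than one stream together with a shared queue is rejected by the plugin
    auto exec_net = ie.LoadNetwork(net, remote_context, {{"GPU_THROUGHPUT_STREAMS", "2"}});
} catch (const std::exception& ex) {
    std::cerr << ex.what() << std::endl;  // e.g. "Throughput streams can't be used with shared queue!"
}
```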
## Low-Level Methods and Their Parameter Description
The high-level wrappers above bring a direct dependency on native APIs to the user program.
@ -84,6 +106,7 @@ descriptor, which sets the expected structure and possible parameter values of
|----------------|---------------------------------------------------------------------|
| `CONTEXT_TYPE` | Describes the type of the shared context in a map. Can be `OCL` (for pure OpenCL context) or `VA_SHARED` (for context shared with a video decoding device). |
| `OCL_CONTEXT` | Contains the OpenCL context handle. |
| `OCL_QUEUE` | Contains the OpenCL queue handle if queue sharing is needed (see the example below). |
| `VA_DEVICE` | Contains the native video decoding device handle. Can be `VADisplay` or `ID3D11Device` (a pointer). |
| `SHARED_MEM_TYPE` | Describes the type of the shared memory buffer in a map. Can be `OCL_BUFFER` (clBuffer), `OCL_IMAGE2D` (clImage2D), `VA_SURFACE()`, or `DX_BUFFER`. |
| `MEM_HANDLE` | Contains the OpenCL memory handle. |
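For illustration, queue sharing through the low-level interface boils down to the following parameter map (a sketch mirroring what the `make_shared_context` helper added in this change does internally; `ie` and `queue` are assumed to exist):

```cpp
// Extract the OpenCL context from the user queue and describe the shared context
cl_context ctx;
auto err = clGetCommandQueueInfo(queue.get(), CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr);
if (err != CL_SUCCESS)
    IE_THROW() << "Can't get context from given OpenCL queue";
ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)},
                           {GPU_PARAM_KEY(OCL_CONTEXT), static_cast<gpu_handle_param>(ctx)},
                           {GPU_PARAM_KEY(OCL_QUEUE), static_cast<gpu_handle_param>(queue.get())}};
auto remote_context = ie.CreateContext("GPU", context_params);
```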


@ -11,7 +11,8 @@ file(GLOB SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
if(NOT CLDNN__IOCL_ICD_INCDIRS OR TRUE)
list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API0.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API1.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API2.cpp")
"${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API2.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API3.cpp")
endif()
# remove OpenCV related sources


@ -0,0 +1,76 @@
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <ie_core.hpp>
#include <CL/cl2.hpp>
#include <gpu/gpu_context_api_ocl.hpp>
int main() {
using namespace InferenceEngine;
//! [part0]
// ...
// initialize the core and read the network
InferenceEngine::Core ie;
auto net = ie.ReadNetwork("network.xml");
// initialize opencl context and create queue
cl::Context ctx = get_my_OpenCL_context();
cl::CommandQueue queue = get_my_OpenCL_queue();
// share the queue with GPU plugin and compile ExecutableNetwork
auto remote_context = gpu::make_shared_context(ie, "GPU", queue.get());
auto exec_net_shared = ie.LoadNetwork(net, remote_context);
// create the OpenCL buffers within the context
cl::Buffer shared_in_buffer(ctx, CL_MEM_READ_WRITE, image_size * num_channels, NULL, &err);
cl::Buffer shared_out_buffer(ctx, CL_MEM_READ_WRITE, image_size * num_channels, NULL, &err);
// wrap in and out buffers into RemoteBlob and set them to infer request
auto shared_in_blob = gpu::make_shared_blob(input_info->getTensorDesc(), remote_context, shared_in_buffer);
auto shared_out_blob = gpu::make_shared_blob(out_data->getTensorDesc(), remote_context, shared_out_buffer);
auto infer_request = exec_net_shared.CreateInferRequest();
infer_request.SetBlob(input_name, shared_in_blob);
infer_request.SetBlob(output_name, shared_out_blob);
// ...
// execute user kernel
cl::Kernel kernel_preproc(program, kernel_name_preproc.c_str());
kernel_preproc.setArg(0, shared_in_buffer);
queue.enqueueNDRangeKernel(kernel_preproc,
cl::NDRange(0),
cl::NDRange(image_size),
cl::NDRange(1),
nullptr, // wait events *
&profileEvent);
// Blocking clFinish() call is not required, but this barrier is added to the queue to guarantee that user kernel is finished
// before any inference primitive is started
queue.enqueueBarrierWithWaitList(nullptr, nullptr);
// ...
// pass results to the inference
// since the remote context is created with queue sharing, StartAsync() guarantees that scheduling is finished
infer_request.StartAsync();
// execute some postprocessing kernel.
// infer_request.Wait() is not called; synchronization between inference and post-processing is done via the
// enqueueBarrierWithWaitList call.
cl::Kernel kernel_postproc(program, kernel_name_postproc.c_str());
kernel_postproc.setArg(0, shared_out_buffer);
queue.enqueueBarrierWithWaitList(nullptr, nullptr);
queue.enqueueNDRangeKernel(kernel_postproc,
cl::NDRange(0),
cl::NDRange(image_size),
cl::NDRange(1),
nullptr, // wait events *
&profileEvent);
// Wait for pipeline completion
queue.finish();
//! [part0]
return 0;
}


@ -13,12 +13,14 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ
: AsyncInferRequestThreadSafeDefault(inferRequest, taskExecutor, callbackExecutor), _inferRequest(inferRequest), _waitExecutor(waitExecutor) {
_pipeline = {};
if (!_inferRequest->use_external_queue()) {
_pipeline.push_back({taskExecutor,
[this] {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::PreprocessingAndStartPipeline");
_inferRequest->preprocess();
_inferRequest->enqueue();
} });
}
_pipeline.push_back({_waitExecutor,
[this] {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
@ -26,6 +28,22 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ
}});
}
void CLDNNPlugin::CLDNNAsyncInferRequest::Infer_ThreadUnsafe() {
if (_inferRequest->use_external_queue()) {
_inferRequest->preprocess();
_inferRequest->enqueue();
}
Parent::Infer_ThreadUnsafe();
}
void CLDNNPlugin::CLDNNAsyncInferRequest::StartAsync_ThreadUnsafe() {
if (_inferRequest->use_external_queue()) {
_inferRequest->preprocess();
_inferRequest->enqueue();
}
Parent::StartAsync_ThreadUnsafe();
}
CLDNNPlugin::CLDNNAsyncInferRequest::~CLDNNAsyncInferRequest() {
StopAndWait();
}


@ -21,6 +21,9 @@ public:
~CLDNNAsyncInferRequest();
void Infer_ThreadUnsafe() override;
void StartAsync_ThreadUnsafe() override;
private:
CLDNNInferRequest::Ptr _inferRequest;
InferenceEngine::ITaskExecutor::Ptr _waitExecutor;


@ -89,6 +89,10 @@ IInferRequestInternal::Ptr CLDNNExecNetwork::CreateInferRequestImpl(const std::v
}
if (m_config.useProfiling)
ptr->EnableProfiling();
if (m_graphs.front()->use_external_queue()) {
ptr->enable_external_queue();
}
ptr->SetGraph(m_graphs.front());
return ptr;


@ -94,9 +94,26 @@ void CLDNNGraph::Build() {
}
}
bool CLDNNGraph::use_external_queue() const {
auto impl = getContextImpl(m_context);
return impl->GetExternalQueue() != nullptr;
}
std::shared_ptr<cldnn::network> CLDNNGraph::BuildNetwork(std::shared_ptr<cldnn::program> program) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::BuildNetwork");
auto network = std::make_shared<cldnn::network>(program, m_stream_id);
std::shared_ptr<cldnn::network> network = nullptr;
auto impl = getContextImpl(m_context);
auto externalQueue = impl->GetExternalQueue();
if (externalQueue) {
if (m_config.throughput_streams != 1)
IE_THROW(ParameterMismatch) << "Throughput streams can't be used with shared queue!\n";
auto &engine = m_program->GetEngine();
network = std::make_shared<cldnn::network>(program, engine.create_stream(externalQueue), m_stream_id);
} else {
network = std::make_shared<cldnn::network>(program, m_stream_id);
}
if (!m_config.graph_dumps_dir.empty() && m_stream_id == 0) {
static int net_id = 0;


@ -71,6 +71,8 @@ public:
m_cv.notify_one();
}
bool use_external_queue() const;
protected:
uint32_t m_state;
std::condition_variable m_cv;


@ -452,7 +452,8 @@ CLDNNInferRequest::CLDNNInferRequest(InputsDataMap networkInputs, OutputsDataMap
const CLDNNExecNetwork::Ptr& execNetwork)
: IInferRequestInternal(networkInputs, networkOutputs)
, m_useProfiling(false)
, m_useStreams(false) {
, m_useStreams(false)
, m_useExternalQueue(false) {
IE_ASSERT(nullptr != execNetwork);
streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(execNetwork->m_taskExecutor.get());
}


@ -57,6 +57,9 @@ public:
void enqueue_dynamic();
void wait_dynamic();
bool use_external_queue() const { return m_useExternalQueue; }
void enable_external_queue() { m_useExternalQueue = true; }
private:
InferenceEngine::BlobMap _deviceOutputs;
std::map<std::string, cldnn::primitive_id> inputsMap;
@ -64,6 +67,7 @@ private:
bool m_useProfiling;
bool m_useStreams;
bool m_useExternalQueue;
std::shared_ptr<CLDNNGraph> m_graph;
// dynamic batch stuff


@ -199,6 +199,7 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
m_plugin(plugin),
m_type(ContextType::OCL),
m_config(config),
m_external_queue(nullptr),
m_va_display(nullptr) {
lock.clear(std::memory_order_relaxed);
gpu_handle_param _context_id = nullptr;
@ -221,6 +222,9 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
if (tile_id_itr != params.end()) {
target_tile_id = tile_id_itr->second.as<int>();
}
if (params.find(GPU_PARAM_KEY(OCL_QUEUE)) != params.end())
m_external_queue = _ObjFromParamSimple<gpu_handle_param>(params, GPU_PARAM_KEY(OCL_QUEUE));
}
// TODO: Parameterize this based on plugin config and compilation options


@ -224,6 +224,7 @@ public:
std::shared_ptr<cldnn::engine> GetEngine() const { return m_engine; }
Config& GetConfig() { return m_config; }
ContextType GetType() const { return m_type; }
InferenceEngine::gpu_handle_param GetExternalQueue() const { return m_external_queue; }
const std::weak_ptr<InferenceEngine::IInferencePlugin> GetPlugin() const { return m_plugin; }
void acquire_lock() {
@ -238,6 +239,7 @@ protected:
// TODO: refactor to unique_ptr
std::shared_ptr<cldnn::engine> m_engine;
InferenceEngine::gpu_handle_param m_va_display;
InferenceEngine::gpu_handle_param m_external_queue;
Config m_config;
ContextType m_type;


@ -232,6 +232,25 @@ static inline RemoteContext::Ptr make_shared_context(Core& core,
return core.CreateContext(deviceName, contextParams);
}
/**
* @brief This function is used to obtain a remote context object from a user-supplied OpenCL command queue handle
* @param core A reference to Inference Engine Core object
* @param deviceName A name of device to create a remote context for
* @param queue An OpenCL queue to be used to create the shared remote context. The queue will be reused inside the plugin.
* @note Only latency mode is supported for such a context sharing case.
* @return A shared remote context instance
*/
static inline RemoteContext::Ptr make_shared_context(Core& core, std::string deviceName, cl_command_queue queue) {
cl_context ctx;
auto res = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr);
if (res != CL_SUCCESS)
IE_THROW() << "Can't get context from given opencl queue";
ParamMap contextParams = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)},
{GPU_PARAM_KEY(OCL_CONTEXT), static_cast<gpu_handle_param>(ctx)},
{GPU_PARAM_KEY(OCL_QUEUE), static_cast<gpu_handle_param>(queue)}};
return core.CreateContext(deviceName, contextParams);
}
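// A minimal usage sketch for this overload (illustrative names; get_my_OpenCL_queue is a hypothetical helper owned by the application):
//   cl::CommandQueue queue = get_my_OpenCL_queue();
//   auto remote_context = gpu::make_shared_context(ie, "GPU", queue.get());
//   auto exec_net_shared = ie.LoadNetwork(net, remote_context);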
/**
* @brief This function is used to create remote blob object within default GPU plugin OpenCL context
* @param desc A tensor descriptor object representing remote blob configuration


@ -68,6 +68,11 @@ DECLARE_GPU_PARAM_KEY(OCL_CONTEXT, gpu_handle_param);
*/
DECLARE_GPU_PARAM_KEY(TILE_ID, int);
/**
* @brief This key identifies OpenCL queue handle in a shared context
*/
DECLARE_GPU_PARAM_KEY(OCL_QUEUE, gpu_handle_param);
/**
* @brief This key identifies video acceleration device/display handle
* in a shared context or shared memory blob parameter map


@ -94,6 +94,7 @@ public:
*/
class D3DContext : public ClContext {
using RemoteContext::create_tensor;
static constexpr const char* device_name = "GPU";
public:
/**
@ -117,16 +118,15 @@ public:
/**
* @brief Constructs D3DContext remote context object from ID3D11Device
* @param core OpenVINO Runtime Core object instance
* @param deviceName A name of to create a remote context for
* @param device A pointer to ID3D11Device to be used to create a remote context
*/
D3DContext(Core& core, std::string deviceName, ID3D11Device* device) {
D3DContext(Core& core, ID3D11Device* device) {
// clang-format off
ParamMap context_params = {
{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(VA_SHARED)},
{GPU_PARAM_KEY(VA_DEVICE), static_cast<gpu_handle_param>(device)}
};
*this = core.create_context(deviceName, context_params);
*this = core.create_context(device_name, context_params);
}
/**


@ -124,6 +124,7 @@ public:
*/
class ClContext : public RemoteContext {
using RemoteContext::create_tensor;
static constexpr const char* device_name = "GPU";
public:
/**
@ -139,13 +140,29 @@ public:
/**
* @brief Constructs context object from user-supplied OpenCL context handle
* @param core A reference to OpenVINO Runtime Core object
* @param deviceName A name of device to create a remote context for
* @param ctx A OpenCL context to be used to create shared remote context
*/
ClContext(Core& core, std::string deviceName, cl_context ctx) {
ClContext(Core& core, cl_context ctx) {
ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)},
{GPU_PARAM_KEY(OCL_CONTEXT), static_cast<gpu_handle_param>(ctx)}};
*this = core.create_context(deviceName, context_params);
*this = core.create_context(device_name, context_params);
}
/**
* @brief Constructs context object from a user-supplied OpenCL command queue handle
* @param core A reference to OpenVINO Runtime Core object
* @param queue An OpenCL queue to be used to create the shared remote context. The queue will be reused inside the plugin.
* @note Only latency mode is supported for such a context sharing case.
*/
ClContext(Core& core, cl_command_queue queue) {
cl_context ctx;
auto res = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr);
if (res != CL_SUCCESS)
IE_THROW() << "Can't get context from given opencl queue";
ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)},
{GPU_PARAM_KEY(OCL_CONTEXT), static_cast<gpu_handle_param>(ctx)},
{GPU_PARAM_KEY(OCL_QUEUE), static_cast<gpu_handle_param>(queue)}};
*this = core.create_context(device_name, context_params);
}
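// A minimal usage sketch for the queue-based constructor (illustrative names; assumes core is an ov::runtime::Core
// and model was read beforehand, while get_my_OpenCL_queue is a hypothetical helper owned by the application):
//   cl::CommandQueue queue = get_my_OpenCL_queue();
//   auto remote_context = ov::runtime::gpu::ClContext(core, queue.get());
//   auto compiled_model = core.compile_model(model, remote_context);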
/**


@ -68,6 +68,7 @@ public:
*/
class VAContext : public ClContext {
using RemoteContext::create_tensor;
static constexpr const char* device_name = "GPU";
public:
/**
@ -91,13 +92,12 @@ public:
/**
* @brief Constructs remote context object from VA display handle
* @param core OpenVINO Runtime Core object
* @param deviceName A device name to create a remote context for
* @param device A `VADisplay` to create remote context from
*/
VAContext(Core& core, std::string deviceName, VADisplay device) {
VAContext(Core& core, VADisplay device) {
ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(VA_SHARED)},
{GPU_PARAM_KEY(VA_DEVICE), static_cast<gpu_handle_param>(device)}};
*this = core.create_context(deviceName, context_params);
*this = core.create_context(device_name, context_params);
}
/**


@ -123,6 +123,186 @@ TEST_F(RemoteBlob_Test, smoke_canInferOnUserContext) {
}
}
TEST_F(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
#if defined _WIN32
GTEST_SKIP();
#endif
auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
CNNNetwork net(fn_ptr);
net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
auto exec_net_regular = ie->LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
auto fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
inf_req_regular.Infer();
auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
// inference using remote blob
auto ocl_instance = std::make_shared<OpenCL>();
cl_int err;
auto in_desc = net.getInputsInfo().begin()->second->getTensorDesc();
auto out_desc = net.getOutputsInfo().begin()->second->getTensorDesc();
auto in_dims = in_desc.getDims();
auto out_dims = out_desc.getDims();
size_t in_size = in_dims[1] * in_dims[2] * in_dims[3];
size_t out_size = out_dims[1] * out_dims[2] * out_dims[3] * sizeof(float);
// In this scenario we create a shared OCL queue and run simple pre-processing and post-processing actions (buffer copies in both cases)
// without blocking the calling thread
auto remote_context = make_shared_context(*ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_queue.get());
auto exec_net_shared = ie->LoadNetwork(net, remote_context);
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
cl::Buffer shared_input_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, in_size, NULL, &err);
cl::Buffer shared_output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err);
// Allocate output buffer where inference result will be put as a post-processing step
cl::Buffer output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err);
// Wrap buffers above with IE blobs
Blob::Ptr shared_input_blob = make_shared_blob(in_desc, remote_context, shared_input_buffer);
Blob::Ptr shared_output_blob = make_shared_blob(out_desc, remote_context, shared_output_buffer);
Blob::Ptr output_blob = make_shared_blob(out_desc, remote_context, output_buffer);
// Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl
// TODO: Why do we need to call it explicitly? Consider doing it internally
output_blob->allocate();
// Pass shared blobs to infer request
inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_input_blob);
inf_req_shared.SetBlob(net.getOutputsInfo().begin()->first, shared_output_blob);
// 1. Pre-processing. Enqueue a non-blocking copy from host ptr to the shared device input buffer and a barrier to ensure that the copy is finished before
// inference primitives start execution
{
void *buffer = fakeImageData->buffer();
ocl_instance->_queue.enqueueWriteBuffer(shared_input_buffer, false, 0, in_size, buffer);
ocl_instance->_queue.enqueueBarrierWithWaitList(nullptr, nullptr);
}
// 2. Enqueue inference primitives. With shared queue this call ensures that all kernels are scheduled to the corresponding queue
// before giving the control back
inf_req_shared.StartAsync();
// 3. Post-processing. Enqueue copy from shared blob with inference result to another output blob
// Enqueue barrier with empty wait list is needed to ensure that previous kernels are finished before copying the data. It's needed here since we
// create OOO queue.
// Note: inf_req_shared.Wait() can be dropped in some cases, but if plugin-side post-processing is required,
// then the result may be incorrect without Wait().
{
ocl_instance->_queue.enqueueBarrierWithWaitList(nullptr, nullptr);
ocl_instance->_queue.enqueueCopyBuffer(shared_output_buffer, output_buffer, 0, 0, output_blob->byteSize());
}
// 4. Wait for infer request and post-processing completion
ocl_instance->_queue.finish();
// compare results
{
ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
ASSERT_EQ(outputBlob_regular->size(), output_blob->size());
auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
FuncTestUtils::compareBlobs(outputBlob_regular, output_blob, thr);
}
}
TEST_F(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
#if defined _WIN32
GTEST_SKIP();
#endif
auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
CNNNetwork net(fn_ptr);
net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
auto exec_net_regular = ie->LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
auto fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
inf_req_regular.Infer();
auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
// inference using remote blob
auto ocl_instance = std::make_shared<OpenCL>();
ocl_instance->_queue = cl::CommandQueue(ocl_instance->_context, ocl_instance->_device);
cl_int err;
auto in_desc = net.getInputsInfo().begin()->second->getTensorDesc();
auto out_desc = net.getOutputsInfo().begin()->second->getTensorDesc();
auto in_dims = in_desc.getDims();
auto out_dims = out_desc.getDims();
size_t in_size = in_dims[1] * in_dims[2] * in_dims[3];
size_t out_size = out_dims[1] * out_dims[2] * out_dims[3] * sizeof(float);
// In this scenario we create a shared OCL queue and run simple pre-processing and post-processing actions (buffer copies in both cases)
// without blocking the calling thread
auto remote_context = make_shared_context(*ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_queue.get());
auto exec_net_shared = ie->LoadNetwork(net, remote_context);
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
cl::Buffer shared_input_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, in_size, NULL, &err);
cl::Buffer shared_output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err);
// Allocate output buffer where inference result will be put as a post-processing step
cl::Buffer output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err);
// Wrap buffers above with IE blobs
Blob::Ptr shared_input_blob = make_shared_blob(in_desc, remote_context, shared_input_buffer);
Blob::Ptr shared_output_blob = make_shared_blob(out_desc, remote_context, shared_output_buffer);
Blob::Ptr output_blob = make_shared_blob(out_desc, remote_context, output_buffer);
// Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl
// TODO: Why do we need to call it explicitly? Consider doing it internally
output_blob->allocate();
// Pass shared blobs to infer request
inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_input_blob);
inf_req_shared.SetBlob(net.getOutputsInfo().begin()->first, shared_output_blob);
// 1. Pre-processing. Enqueue non-blocking copy from host ptr to shared device input buffer
{
void *buffer = fakeImageData->buffer();
ocl_instance->_queue.enqueueWriteBuffer(shared_input_buffer, false, 0, in_size, buffer);
}
// 2. Enqueue inference primitives. With shared queue this call ensures that all kernels are scheduled to the corresponding queue
// before giving the control back
inf_req_shared.StartAsync();
// 3. Post-processing. Enqueue copy from shared blob with inference result to another output blob
// Note: inf_req_shared.Wait() can be dropped in some cases, but if plugin-side post-processing is required,
// then the result may be incorrect without Wait().
{
ocl_instance->_queue.enqueueCopyBuffer(shared_output_buffer, output_buffer, 0, 0, output_blob->byteSize());
}
// 4. Wait for infer request and post-processing completion
ocl_instance->_queue.finish();
// compare results
{
ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
ASSERT_EQ(outputBlob_regular->size(), output_blob->size());
auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
FuncTestUtils::compareBlobs(outputBlob_regular, output_blob, thr);
}
}
class BatchedBlob_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<size_t> {
void SetUp() override {
num_batch = this->GetParam();


@ -113,7 +113,7 @@ TEST_F(OVRemoteTensor_Test, DISABLED_smoke_canInferOnUserContext) {
// inference using remote tensor
auto ocl_instance = std::make_shared<OpenCL>();
auto remote_context = ov::runtime::gpu::ClContext(ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_context.get());
auto remote_context = ov::runtime::gpu::ClContext(ie, ocl_instance->_context.get());
auto exec_net_shared = ie.compile_model(function, remote_context);
auto inf_req_shared = exec_net_shared.create_infer_request();
inf_req_shared.set_tensor(input->get_friendly_name(), fakeImageData);


@ -62,8 +62,9 @@ public:
const build_options& options,
bool is_internal);
network(program::ptr program,
uint16_t stream_id = 0);
network(program::ptr program, uint16_t stream_id = 0);
network(program::ptr program, stream::ptr stream, uint16_t stream_id);
~network();


@ -125,6 +125,9 @@ public:
/// Create stream object for current engine
virtual stream_ptr create_stream() const = 0;
/// Creates stream object from user handle
virtual stream_ptr create_stream(void *handle) const = 0;
/// Returns service stream which can be used during program build and optimizations
virtual stream& get_program_stream() const = 0;


@ -213,6 +213,10 @@ stream::ptr ocl_engine::create_stream() const {
return std::make_shared<ocl_stream>(*this);
}
stream::ptr ocl_engine::create_stream(void* handle) const {
return std::make_shared<ocl_stream>(*this, handle);
}
stream& ocl_engine::get_program_stream() const {
return *_program_stream;
}


@ -40,6 +40,7 @@ public:
bool extension_supported(std::string extension) const;
stream_ptr create_stream() const override;
stream_ptr create_stream(void *handle) const override;
stream& get_program_stream() const override;
#ifdef ENABLE_ONEDNN_FOR_GPU


@ -253,9 +253,18 @@ void set_arguments_impl(ocl_kernel_type& kernel,
}
}
}
sync_methods get_expected_sync_method(const engine_configuration &config) {
return config.enable_profiling ? sync_methods::events : config.queue_type == queue_types::out_of_order ? sync_methods::barriers
: sync_methods::none;
}
} // namespace
ocl_stream::ocl_stream(const ocl_engine& engine) : stream(engine.configuration().queue_type), _engine(engine) {
ocl_stream::ocl_stream(const ocl_engine &engine)
: stream(engine.configuration().queue_type)
, _engine(engine)
, sync_method(get_expected_sync_method(engine.configuration())) {
auto context = engine.get_cl_context();
auto device = engine.get_cl_device();
auto config = engine.configuration();
@ -263,9 +272,6 @@ ocl_stream::ocl_stream(const ocl_engine& engine) : stream(engine.configuration()
queue_builder.set_profiling(config.enable_profiling);
queue_builder.set_out_of_order((config.queue_type == queue_types::out_of_order));
sync_method = _engine.configuration().enable_profiling ? sync_methods::events :
config.queue_type == queue_types::out_of_order ? sync_methods::barriers : sync_methods::none;
if (sync_method == sync_methods::none && config.queue_type == queue_types::out_of_order) {
throw std::runtime_error("[CLDNN] Unexpected sync method (none) is specified for out_of_order queue");
}
@ -288,6 +294,22 @@ ocl_stream::ocl_stream(const ocl_engine& engine) : stream(engine.configuration()
#endif
}
ocl_stream::ocl_stream(const ocl_engine &engine, void *handle)
: stream(engine.configuration().queue_type)
, _engine(engine)
, sync_method(get_expected_sync_method(engine.configuration())) {
auto casted_handle = static_cast<cl_command_queue>(handle);
_command_queue = ocl_queue_type(casted_handle, true);
#ifdef ENABLE_ONEDNN_FOR_GPU
auto config = engine.configuration();
if (config.queue_type == queue_types::in_order) {
auto onednn_engine = engine.get_onednn_engine();
_onednn_stream = std::make_shared<dnnl::stream>(dnnl::ocl_interop::make_stream(engine.get_onednn_engine(), _command_queue.get()));
}
#endif
}
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::stream& ocl_stream::get_onednn_stream() {
if (!_onednn_stream)


@ -50,6 +50,7 @@ public:
const ocl_queue_type& get_cl_queue() const { return _command_queue; }
explicit ocl_stream(const ocl_engine& engine);
ocl_stream(const ocl_engine &engine, void *handle);
ocl_stream(ocl_stream&& other)
: stream(other._engine.configuration().queue_type)
, _engine(other._engine)


@ -236,6 +236,9 @@ network::network(engine& engine,
network::network(program::ptr program, uint16_t stream_id)
: network(program, program->get_engine().create_stream(), false, stream_id == 0) {}
network::network(program::ptr program, stream::ptr stream, uint16_t stream_id)
: network(program, stream, false, stream_id == 0) {}
network::~network() {
_memory_pool->clear_pool_for_network(net_id);
}