From 6addc0d535e9ab0b591dcefa2f771997a477ba4f Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Tue, 23 Nov 2021 20:26:44 +0300 Subject: [PATCH] [GPU] USM sharing and host blob creation in gpu remote context (#8657) --- .../src/cldnn_engine/cldnn_infer_request.cpp | 22 ++- .../src/cldnn_engine/cldnn_remote_context.cpp | 55 +++++- .../src/cldnn_engine/cldnn_remote_context.h | 111 ++++++++++- .../include/ie/gpu/gpu_context_api_ocl.hpp | 43 ++++ .../include/ie/gpu/gpu_params.hpp | 13 +- .../include/ie/ie_remote_context.hpp | 10 +- .../include/openvino/runtime/gpu/ocl/dx.hpp | 5 +- .../include/openvino/runtime/gpu/ocl/ocl.hpp | 76 +++++++- .../include/openvino/runtime/gpu/ocl/va.hpp | 5 +- .../openvino/runtime/remote_context.hpp | 10 + .../src/cpp/ie_remote_context.cpp | 9 + .../src/ie_remote_context.cpp | 22 +++ .../cldnn_remote_blob_tests.cpp | 55 ++++++ .../gpu_remote_tensor_tests.cpp | 184 +++++++++++++++++- .../remote_blob_tests/remote_blob_helpers.hpp | 124 ++++++++++++ .../clDNN/api/cldnn/runtime/engine.hpp | 3 + .../thirdparty/clDNN/runtime/engine.cpp | 11 ++ .../clDNN/runtime/ocl/ocl_engine.cpp | 3 + .../thirdparty/clDNN/runtime/ocl/ocl_ext.hpp | 30 ++- .../clDNN/runtime/ocl/ocl_memory.cpp | 20 ++ .../clDNN/runtime/ocl/ocl_memory.hpp | 3 + 21 files changed, 776 insertions(+), 38 deletions(-) create mode 100644 inference-engine/src/inference_engine/src/ie_remote_context.cpp diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp index c98b06ce4cf..77336141f00 100644 --- a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp @@ -966,9 +966,25 @@ void CLDNNInferRequest::prepare_output(const cldnn::primitive_id& outputName, Bl } InferenceEngine::Blob::Ptr CLDNNInferRequest::create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout) { - auto blobPtr = std::make_shared(m_graph->GetContext(), m_graph->GetNetwork()->get_stream(), desc, layout); - getBlobImpl(blobPtr.get())->allocate(); - return blobPtr; + if (m_graph->GetEngine()->use_unified_shared_memory()) { + auto blobPtr = std::make_shared(m_graph->GetContext(), + m_graph->GetNetwork()->get_stream(), + desc, + layout, + nullptr, + 0, + 0, + CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL); + getBlobImpl(blobPtr.get())->allocate(); + return blobPtr; + } else { + auto blobPtr = std::make_shared(m_graph->GetContext(), + m_graph->GetNetwork()->get_stream(), + desc, + layout); + getBlobImpl(blobPtr.get())->allocate(); + return blobPtr; + } } } // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp index f043b6e4a89..12ac301a256 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp @@ -38,6 +38,24 @@ ParamMap CLDNNRemoteBlobImpl::getParams() const { { GPU_PARAM_KEY(OCL_CONTEXT), params.context }, { GPU_PARAM_KEY(MEM_HANDLE), params.mem } }; + case BT_USM_SHARED: + return{ + { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER) }, + { GPU_PARAM_KEY(OCL_CONTEXT), params.context }, + { GPU_PARAM_KEY(MEM_HANDLE), params.mem } + }; + case BT_USM_HOST_INTERNAL: + return{ + { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER) }, + { GPU_PARAM_KEY(OCL_CONTEXT), params.context }, + { GPU_PARAM_KEY(MEM_HANDLE), params.mem } + }; + case BT_USM_DEVICE_INTERNAL: + 
return{ + { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER) }, + { GPU_PARAM_KEY(OCL_CONTEXT), params.context }, + { GPU_PARAM_KEY(MEM_HANDLE), params.mem } + }; #ifdef _WIN32 case BT_DX_BUF_SHARED: return{ @@ -81,7 +99,7 @@ bool CLDNNRemoteBlobImpl::is_locked() const noexcept { return lockedHolder != nullptr; } -void CLDNNRemoteBlobImpl::allocate() noexcept { +void CLDNNRemoteBlobImpl::allocate() { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNRemoteBlobImpl::Allocate"); assert(m_memObject == nullptr); @@ -91,13 +109,25 @@ void CLDNNRemoteBlobImpl::allocate() noexcept { switch (m_mem_type) { case BlobType::BT_BUF_INTERNAL: { - m_memObject = eng->allocate_memory(m_layout); + m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::cl_mem); + break; + } + case BlobType::BT_USM_HOST_INTERNAL: { + m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_host); + break; + } + case BlobType::BT_USM_DEVICE_INTERNAL: { + m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_device); break; } case BlobType::BT_BUF_SHARED: { m_memObject = eng->share_buffer(m_layout, m_mem); break; } + case BlobType::BT_USM_SHARED: { + m_memObject = eng->share_usm(m_layout, m_mem); + break; + } #ifdef _WIN32 case BlobType::BT_SURF_SHARED: { m_memObject = eng->share_surface(m_layout, m_mem, m_plane); @@ -139,6 +169,9 @@ std::shared_ptr CLDNNRemoteBlobImpl::getContext() const noexcept } void CLDNNRemoteBlobImpl::lock() const { + if (!is_allocated()) { + IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated"; + } lockedHolder = std::unique_ptr>(new cldnn::mem_lock(m_memObject, m_stream)); auto ptr = lockedHolder->data(); _handle = reinterpret_cast(ptr); @@ -295,15 +328,17 @@ std::string CLDNNExecutionContextImpl::getDeviceName() const noexcept { auto engine_type = cldnn::engine_types::ocl; auto runtime_type = cldnn::runtime_types::ocl; - // Use actual runtime and engine types - cldnn::device_query device_query(engine_type, runtime_type); - auto all_devices = device_query.get_available_devices(); - auto current_device = m_engine->get_device(); + try { + // Use actual runtime and engine types + cldnn::device_query device_query(engine_type, runtime_type); + auto all_devices = device_query.get_available_devices(); + auto current_device = m_engine->get_device(); - for (auto& kv : all_devices) { - if (current_device->is_same(kv.second)) - return devName + "." + kv.first; - } + for (auto& kv : all_devices) { + if (current_device->is_same(kv.second)) + return devName + "." + kv.first; + } + } catch (...) { } if (!m_config.device_id.empty()) devName += "." 
+ m_config.device_id; diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.h b/inference-engine/src/cldnn_engine/cldnn_remote_context.h index 19c24540994..f5e179db39b 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.h +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "cldnn_config.h" #include "cldnn_common_utils.h" @@ -37,6 +38,9 @@ public: BT_EMPTY, BT_BUF_INTERNAL, BT_BUF_SHARED, + BT_USM_SHARED, + BT_USM_HOST_INTERNAL, + BT_USM_DEVICE_INTERNAL, BT_IMG_SHARED, BT_SURF_SHARED, BT_DX_BUF_SHARED, @@ -50,7 +54,7 @@ public: uint32_t plane = 0, BlobType mem_type = BT_BUF_INTERNAL); - void allocate() noexcept; + void allocate(); bool deallocate() noexcept; InferenceEngine::ParamMap getParams() const; std::string getDeviceName() const noexcept; @@ -106,7 +110,11 @@ public: : _impl(context, stream, layout, mem, surf, plane, mem_type) , TpublicAPI(desc) {} - void allocate() noexcept override { _impl.allocate(); } + void allocate() noexcept override { + try { + _impl.allocate(); + } catch (...) {} + } bool deallocate() noexcept override { return _impl.deallocate(); } InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); } std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); } @@ -125,6 +133,7 @@ protected: }; using CLDNNRemoteCLbuffer = typedCLDNNRemoteBlob; +using CLDNNRemoteUSMbuffer = typedCLDNNRemoteBlob; using CLDNNRemoteCLImage2D = typedCLDNNRemoteBlob; #ifdef _WIN32 using CLDNNRemoteD3DBuffer = typedCLDNNRemoteBlob; @@ -157,6 +166,10 @@ inline CLDNNRemoteBlobImpl* getBlobImpl(InferenceEngine::gpu::ClBlob* blobPtr) { auto ptr = blobPtr->as(); if (ptr) return ptr->getImpl(); } + { + auto ptr = blobPtr->as(); + if (ptr) return ptr->getImpl(); + } return nullptr; } @@ -204,6 +217,58 @@ public: bool free(void* handle) noexcept override { return true; } }; +class USMHostAllocator : public InferenceEngine::IAllocator { +protected: + InferenceEngine::gpu::USMBlob::Ptr _usm_host_blob = nullptr; + InferenceEngine::gpu::ClContext* _context = nullptr; + +public: + using Ptr = std::shared_ptr; + + USMHostAllocator(InferenceEngine::gpu::ClContext* context) : _context(context) { } + /** + * @brief Maps handle to heap memory accessible by any memory manipulation routines. + * @return Generic pointer to memory + */ + void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override { + if (!_usm_host_blob) + return nullptr; + return _usm_host_blob->get(); + }; + + /** + * @brief Unmaps memory by handle with multiple sequential mappings of the same handle. + * The multiple sequential mappings of the same handle are suppose to get the same + * result while there isn't a ref counter supported. + */ + void unlock(void* handle) noexcept override {} + + /** + * @brief Allocates memory + * @param size The size in bytes to allocate + * @return Handle to the allocated resource + */ + void* alloc(size_t size) noexcept override { + auto td = InferenceEngine::TensorDesc(InferenceEngine::Precision::U8, InferenceEngine::SizeVector{size}, InferenceEngine::Layout::C); + InferenceEngine::ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}}; + _usm_host_blob = std::dynamic_pointer_cast(_context->CreateBlob(td, params)); + _usm_host_blob->allocate(); + return _usm_host_blob->get(); + } + + /** + * @brief Releases handle and all associated memory resources which invalidates the handle. 
+ * @return false if handle cannot be released, otherwise - true. + */ + bool free(void* handle) noexcept override { + try { + _usm_host_blob = nullptr; + } catch(...) { } + return true; + } +}; + + class CLDNNExecutionContextImpl : public InferenceEngine::gpu::details::param_map_obj_getter { public: enum ContextType { @@ -335,6 +400,9 @@ class typedCLDNNExecutionContext : public TpublicContextAPI { case CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED: ret = std::make_shared(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type); break; + case CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED: + ret = std::make_shared(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type); + break; case CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED: layout.format = ImageFormatFromLayout(tensorDesc.getLayout()); ret = std::make_shared(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type); @@ -368,6 +436,21 @@ class typedCLDNNExecutionContext : public TpublicContextAPI { CLDNNRemoteBlobImpl::BlobType::BT_BUF_INTERNAL); } + InferenceEngine::RemoteBlob::Ptr create_usm(const InferenceEngine::TensorDesc& tensorDesc, CLDNNRemoteBlobImpl::BlobType alloc_type) { + cldnn::layout layout(DataTypeFromPrecision(tensorDesc.getPrecision()), + FormatFromLayout(tensorDesc.getLayout()), + CldnnTensorFromIEDims(tensorDesc.getDims())); + auto smart_this = std::dynamic_pointer_cast(this->shared_from_this()); + auto& stream = _impl.GetEngine()->get_program_stream(); + + return std::make_shared(smart_this, + stream, + tensorDesc, + layout, + nullptr, 0, 0, + alloc_type); + } + void check_if_shared() { if (GetType() != CLDNNExecutionContextImpl::ContextType::DEV_SHARED) IE_THROW() << "Shared context is required to to share this type of memory"; @@ -382,9 +465,16 @@ public: const Config& config = {}) : _impl(plugin, params, config) {} - InferenceEngine::ParamMap getParams() const noexcept override { return _impl.getParams(); } + InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); } std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); } + InferenceEngine::MemoryBlob::Ptr CreateHostBlob(const InferenceEngine::TensorDesc& tensorDesc) override { + if (_impl.GetEngine()->use_unified_shared_memory()) + return std::dynamic_pointer_cast(make_blob_with_precision(tensorDesc, std::make_shared(this))); + else + return std::dynamic_pointer_cast(make_blob_with_precision(tensorDesc)); + } + InferenceEngine::RemoteBlob::Ptr CreateBlob(const InferenceEngine::TensorDesc& tensorDesc, const InferenceEngine::ParamMap& params = {}) override { using namespace InferenceEngine; using InferenceEngine::gpu::details::param_map_obj_getter; @@ -395,9 +485,21 @@ public: // user will supply shared object handle std::string memTypeStr = param_map_obj_getter::_StrFromParams(params, GPU_PARAM_KEY(SHARED_MEM_TYPE)); + bool is_usm = memTypeStr == GPU_PARAM_VALUE(USM_HOST_BUFFER) || + memTypeStr == GPU_PARAM_VALUE(USM_DEVICE_BUFFER) || + memTypeStr == GPU_PARAM_VALUE(USM_USER_BUFFER); + + if (is_usm && !_impl.GetEngine()->use_unified_shared_memory()) { + IE_THROW(NotAllocated) << "Can't create USM tensor as USM is not supported (or manually disabled) on current device"; + } + if (GPU_PARAM_VALUE(VA_SURFACE) == memTypeStr) { check_if_shared(); return reuse_surf(tensorDesc, params); + } else if (GPU_PARAM_VALUE(USM_HOST_BUFFER) == memTypeStr) { + return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL); + } else if (GPU_PARAM_VALUE(USM_DEVICE_BUFFER) == memTypeStr) { + 
return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL); } else { CLDNNRemoteBlobImpl::BlobType blob_type; cldnn::shared_handle mem = nullptr; @@ -405,6 +507,9 @@ public: if (GPU_PARAM_VALUE(OCL_BUFFER) == memTypeStr) { blob_type = CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED; mem = param_map_obj_getter::_ObjFromParamSimple(params, GPU_PARAM_KEY(MEM_HANDLE)); + } else if (GPU_PARAM_VALUE(USM_USER_BUFFER) == memTypeStr) { + blob_type = CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED; + mem = param_map_obj_getter::_ObjFromParamSimple(params, GPU_PARAM_KEY(MEM_HANDLE)); } else if (GPU_PARAM_VALUE(OCL_IMAGE2D) == memTypeStr) { blob_type = CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED; mem = param_map_obj_getter::_ObjFromParamSimple(params, GPU_PARAM_KEY(MEM_HANDLE)); diff --git a/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp b/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp index 95682bbfc65..352246ed834 100644 --- a/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp +++ b/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp @@ -131,6 +131,49 @@ public: } }; +/** + * @brief This class represents an abstraction for GPU plugin remote blob + * which can be shared with user-supplied USM pointer. + * The plugin object derived from this class can be obtained with CreateBlob() call. + * @note User can obtain USM pointer from this class. + */ +class USMBlob : public ClBlob, public details::param_map_obj_getter { +public: + /** + * @brief A smart pointer to the ClBufferBlob object + */ + using Ptr = std::shared_ptr; + + /** + * @brief Creates a ClBufferBlob object with the specified dimensions and layout. + * @param tensorDesc Tensor description + */ + explicit USMBlob(const TensorDesc& tensorDesc) : ClBlob(tensorDesc) {} + + /** + * @brief Returns the underlying OpenCL memory object handle. + * @return underlying OpenCL memory object handle + */ + void* get() { + const auto& params = getParams(); + auto itrType = params.find(GPU_PARAM_KEY(SHARED_MEM_TYPE)); + if (itrType == params.end()) + IE_THROW() << "Parameter of type " << GPU_PARAM_KEY(SHARED_MEM_TYPE) << " not found"; + + auto mem_type = itrType->second.as(); + if (mem_type != GPU_PARAM_VALUE(USM_USER_BUFFER) && mem_type != GPU_PARAM_VALUE(USM_HOST_BUFFER) && + mem_type != GPU_PARAM_VALUE(USM_DEVICE_BUFFER)) + IE_THROW() << "Unexpected USM blob type: " << mem_type; + + auto itrHandle = params.find(GPU_PARAM_KEY(MEM_HANDLE)); + if (itrHandle == params.end()) { + IE_THROW() << "No parameter " << GPU_PARAM_KEY(MEM_HANDLE) << " found"; + } + + return itrHandle->second.as(); + } +}; + /** * @brief This class represents an abstraction for GPU plugin remote blob * which can be shared with user-supplied OpenCL 2D Image. 
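Editor's note: the hunks above add USM blob types to CLDNNExecutionContextImpl::CreateBlob and expose them to users through the new InferenceEngine::gpu::USMBlob wrapper. A minimal sketch of the intended usage with the InferenceEngine 1.0 API follows; the helper name, input name, and tensor description are assumptions for illustration, not part of the patch, and USM availability on the device is assumed.

#include <cstring>
#include <memory>
#include <string>

#include <ie_core.hpp>
#include <gpu/gpu_context_api_ocl.hpp>

// Sketch: allocate a USM host blob through the GPU remote context and use it as a network input.
void infer_with_usm_host_blob(InferenceEngine::ExecutableNetwork& exec_net,
                              const std::string& input_name,
                              const InferenceEngine::TensorDesc& desc) {
    auto ctx = exec_net.GetContext();
    InferenceEngine::ParamMap params = {
        {GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
    auto usm_blob =
        std::dynamic_pointer_cast<InferenceEngine::gpu::USMBlob>(ctx->CreateBlob(desc, params));
    usm_blob->allocate();
    // get() exposes the raw USM pointer, which is directly writable from host code.
    std::memset(usm_blob->get(), 0, usm_blob->byteSize());

    auto req = exec_net.CreateInferRequest();
    req.SetBlob(input_name, usm_blob);
    req.Infer();
}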
diff --git a/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp b/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp index d7e36c95ac5..36f8014ed63 100644 --- a/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp +++ b/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp @@ -98,7 +98,18 @@ DECLARE_GPU_PARAM_VALUE(OCL_BUFFER); * @brief Shared OpenCL 2D image blob */ DECLARE_GPU_PARAM_VALUE(OCL_IMAGE2D); - +/** + * @brief Shared USM pointer allocated by user + */ +DECLARE_GPU_PARAM_VALUE(USM_USER_BUFFER); +/** + * @brief Shared USM pointer type with host allocation type allocated by plugin + */ +DECLARE_GPU_PARAM_VALUE(USM_HOST_BUFFER); +/** + * @brief Shared USM pointer type with device allocation type allocated by plugin + */ +DECLARE_GPU_PARAM_VALUE(USM_DEVICE_BUFFER); /** * @brief Shared video decoder surface or D3D 2D texture blob */ diff --git a/inference-engine/src/inference_engine/include/ie/ie_remote_context.hpp b/inference-engine/src/inference_engine/include/ie/ie_remote_context.hpp index 0fee1c86808..31ec2d7f6a8 100644 --- a/inference-engine/src/inference_engine/include/ie/ie_remote_context.hpp +++ b/inference-engine/src/inference_engine/include/ie/ie_remote_context.hpp @@ -23,7 +23,7 @@ namespace InferenceEngine { * Such context represents a scope on the device within which executable * networks and remote memory blobs can exist, function and exchange data. */ -class RemoteContext : public std::enable_shared_from_this { +class INFERENCE_ENGINE_API_CLASS(RemoteContext) : public std::enable_shared_from_this { public: /** * @brief A smart pointer to the RemoteContext object @@ -110,6 +110,14 @@ public: */ virtual RemoteBlob::Ptr CreateBlob(const TensorDesc& tensorDesc, const ParamMap& params = {}) = 0; + /** + * @brief Allocates host accessible memory blob friendly for the device in current context + * Returns a pointer to the object which implements MemoryBlob interface. + * @param tensorDesc Defines the layout and dims of the blob + * @return A pointer to host accessible MemoryBlob object + */ + virtual MemoryBlob::Ptr CreateHostBlob(const TensorDesc& tensorDesc); + /** * @brief Returns a map of device-specific parameters required for low-level * operations with underlying object. diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/dx.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/dx.hpp index 8da0821145a..e94ab133c6c 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/dx.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/dx.hpp @@ -102,9 +102,10 @@ public: * @note User can also obtain OpenCL context handle from this class. 
*/ class D3DContext : public ClContext { - using RemoteContext::create_tensor; - public: + // Needed to make create_tensor overloads from base class visible for user + using ClContext::create_tensor; + /** * @brief Checks that type defined runtime paramters are presented in remote object * @param remote_context remote context to check diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/ocl.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/ocl.hpp index d205b7f0548..4477c87873d 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/ocl.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/ocl.hpp @@ -117,6 +117,36 @@ public: } }; +/** + * @brief This class represents an abstraction for GPU plugin remote tensor + * which can be shared with user-supplied USM device pointer. + * The plugin object derived from this class can be obtained with ClContext::create_tensor() call. + * @note User can obtain USM pointer from this class. + */ +class USMTensor : public RemoteTensor { +public: + /** + * @brief Checks that type defined runtime paramters are presented in remote object + * @param tensor a tensor to check + */ + static void type_check(const Tensor& tensor) { + RemoteTensor::type_check(tensor, + {{GPU_PARAM_KEY(MEM_HANDLE), {}}, + {GPU_PARAM_KEY(SHARED_MEM_TYPE), + {GPU_PARAM_VALUE(USM_USER_BUFFER), + GPU_PARAM_VALUE(USM_HOST_BUFFER), + GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}}}); + } + + /** + * @brief Returns the underlying USM pointer. + * @return underlying USM pointer + */ + void* get() { + return static_cast(get_params().at(GPU_PARAM_KEY(MEM_HANDLE)).as()); + } +}; + /** * @brief This class represents an abstraction for GPU plugin remote context * which is shared with OpenCL context object. 
@@ -125,14 +155,14 @@ public: */ class ClContext : public RemoteContext { protected: - using RemoteContext::create_tensor; - /** * @brief GPU device name */ static constexpr const char* device_name = "GPU"; public: + // Needed to make create_tensor overloads from base class visible for user + using RemoteContext::create_tensor; /** * @brief Checks that type defined runtime paramters are presented in remote object * @param remote_context remote context to check @@ -220,7 +250,7 @@ public: * @brief This function is used to obtain remote tensor object from user-supplied cl_mem object * @param type Tensor element type * @param shape Tensor shape - * @param buffer A cl_mem object wrapped by a remote tensor + * @param buffer A cl_mem object that should be wrapped by a remote tensor * @return A remote tensor instance */ ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl_mem buffer) { @@ -233,7 +263,7 @@ public: * @brief This function is used to obtain remote tensor object from user-supplied cl::Buffer object * @param type Tensor element type * @param shape Tensor shape - * @param buffer A cl::Buffer object wrapped by a remote tensor + * @param buffer A cl::Buffer object that should be wrapped by a remote tensor * @return A remote tensor instance */ ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl::Buffer& buffer) { @@ -244,7 +274,7 @@ public: * @brief This function is used to obtain remote tensor object from user-supplied cl::Image2D object * @param type Tensor element type * @param shape Tensor shape - * @param image A cl::Image2D object wrapped by a remote tensor + * @param image A cl::Image2D object that should be wrapped by a remote tensor * @return A remote tensor instance */ ClImage2DTensor create_tensor(const element::Type type, const Shape& shape, const cl::Image2D& image) { @@ -252,7 +282,43 @@ public: {GPU_PARAM_KEY(MEM_HANDLE), static_cast(image.get())}}; return create_tensor(type, shape, params); } + + /** + * @brief This function is used to obtain remote tensor object from user-supplied USM pointer + * @param type Tensor element type + * @param shape Tensor shape + * @param usm_ptr A USM pointer that should be wrapped by a remote tensor + * @return A remote tensor instance + */ + USMTensor create_tensor(const element::Type type, const Shape& shape, void* usm_ptr) { + ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER)}, + {GPU_PARAM_KEY(MEM_HANDLE), static_cast(usm_ptr)}}; + return create_tensor(type, shape, params); + } + + /** + * @brief This function is used to allocate USM tensor with host allocation type + * @param type Tensor element type + * @param shape Tensor shape + * @return A remote tensor instance + */ + USMTensor create_usm_host_tensor(const element::Type type, const Shape& shape) { + ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}}; + return create_tensor(type, shape, params); + } + + /** + * @brief This function is used to allocate USM tensor with device allocation type + * @param type Tensor element type + * @param shape Tensor shape + * @return A remote tensor instance + */ + USMTensor create_usm_device_tensor(const element::Type type, const Shape& shape) { + ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}}; + return create_tensor(type, shape, params); + } }; + } // namespace ocl } // namespace gpu } // namespace runtime diff --git 
a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/va.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/va.hpp index 91f6c037f69..45e8611077c 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/va.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/va.hpp @@ -72,9 +72,10 @@ public: * @note User can also obtain OpenCL context handle from this class. */ class VAContext : public ClContext { - using RemoteContext::create_tensor; - public: + // Needed to make create_tensor overloads from base class visible for user + using ClContext::create_tensor; + /** * @brief Checks that type defined runtime paramters are presented in remote object * @param remote_context remote context to check diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/remote_context.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/remote_context.hpp index 73831801d11..73b27110bb3 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/remote_context.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/remote_context.hpp @@ -136,6 +136,16 @@ public: * @return A map of name/parameter elements. */ ParamMap get_params() const; + + /** + * @brief This function is used to create host tensor object friendly for the device in current context + * For example, GPU context may allocate USM host memory (if corresponding extension is available) + * which could be more efficient than regular host memory. + * @param type Tensor element type + * @param shape Tensor shape + * @return A Tensor instance with device friendly memory + */ + Tensor create_host_tensor(const element::Type type, const Shape& shape); }; } // namespace runtime diff --git a/inference-engine/src/inference_engine/src/cpp/ie_remote_context.cpp b/inference-engine/src/inference_engine/src/cpp/ie_remote_context.cpp index cf5974dd22e..1c3942bfc10 100644 --- a/inference-engine/src/inference_engine/src/cpp/ie_remote_context.cpp +++ b/inference-engine/src/inference_engine/src/cpp/ie_remote_context.cpp @@ -69,6 +69,15 @@ RemoteTensor RemoteContext::create_tensor(const element::Type& element_type, }); } +Tensor RemoteContext::create_host_tensor(const element::Type element_type, const Shape& shape) { + OV_REMOTE_CONTEXT_STATEMENT({ + auto blob = _impl->CreateHostBlob( + {ie::details::convertPrecision(element_type), shape, ie::TensorDesc::getLayoutByRank(shape.size())}); + blob->allocate(); + return {_so, blob}; + }); +} + ie::ParamMap RemoteContext::get_params() const { OV_REMOTE_CONTEXT_STATEMENT(return _impl->getParams()); } diff --git a/inference-engine/src/inference_engine/src/ie_remote_context.cpp b/inference-engine/src/inference_engine/src/ie_remote_context.cpp new file mode 100644 index 00000000000..fbebf9fe83d --- /dev/null +++ b/inference-engine/src/inference_engine/src/ie_remote_context.cpp @@ -0,0 +1,22 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ie_remote_context.hpp" + +#include +#include + +#include "blob_factory.hpp" + +namespace InferenceEngine { + +MemoryBlob::Ptr RemoteContext::CreateHostBlob(const TensorDesc& tensorDesc) { + auto blob = std::dynamic_pointer_cast(make_blob_with_precision(tensorDesc)); + if (!blob) + IE_THROW(NotAllocated) << "Failed to create host blob in remote context for " << getDeviceName() << " device"; + + return blob; +} + +} // namespace InferenceEngine diff --git 
a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp index 968fa18d40f..95aecd6b357 100644 --- a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp +++ b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp @@ -84,6 +84,61 @@ TEST_F(RemoteBlob_Test, smoke_canInputUserBlob) { } } + +TEST_F(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) { +#if defined(ANDROID) + GTEST_SKIP(); +#endif + CNNNetwork net(fn_ptr); + + net.getInputsInfo().begin()->second->setLayout(Layout::NCHW); + net.getInputsInfo().begin()->second->setPrecision(Precision::U8); + + // TODO: Issue: investigate issue with IECore + auto ie = InferenceEngine::Core(); + auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU); + + // regular inference + auto inf_req_regular = exec_net.CreateInferRequest(); + InferenceEngine::Blob::Ptr fakeImageData = FuncTestUtils::createAndFillBlob( + net.getInputsInfo().begin()->second->getTensorDesc()); + inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData); + + inf_req_regular.Infer(); + auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first); + + // inference using remote blob + auto inf_req_shared = exec_net.CreateInferRequest(); + auto cldnn_context = exec_net.GetContext(); + cl_context ctx = std::dynamic_pointer_cast(cldnn_context)->get(); + auto ocl_instance = std::make_shared(ctx); + + auto dims = net.getInputsInfo().begin()->second->getTensorDesc().getDims(); + size_t imSize = dims[1] * dims[2] * dims[3]; + + Blob::Ptr shared_blob = make_shared_blob(net.getInputsInfo().begin()->second->getTensorDesc(), cldnn_context); + shared_blob->allocate(); + { + cl::Buffer shared_buffer = *shared_blob->as(); + void *buffer = fakeImageData->buffer(); + ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer); + } + + inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_blob); + + inf_req_shared.Infer(); + auto outputBlob_shared = inf_req_shared.GetBlob(net.getOutputsInfo().begin()->first); + + // compare results + { + ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32); + ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size()); + auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32); + FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr); + } +} + + TEST_F(RemoteBlob_Test, smoke_canInferOnUserContext) { auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat(); CNNNetwork net(fn_ptr); diff --git a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp index b593c29183d..97bf9c5512d 100644 --- a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp +++ b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp @@ -30,7 +30,46 @@ protected: } }; -TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) { +enum class RemoteTensorSharingType { + USER_CL_TENSOR = 0, + PLUGIN_CL_TENSOR = 1, + USER_USM_HOST_TENSOR = 2, + USER_USM_DEVICE_TENSOR = 3, + PLUGIN_USM_HOST_TENSOR = 4, + PLUGIN_USM_DEVICE_TENSOR = 5, + PLUGIN_HOST_TENSOR = 6 +}; + +std::ostream& operator<<(std::ostream& stream, 
RemoteTensorSharingType sharing_type) { + switch (sharing_type) { + case RemoteTensorSharingType::USER_CL_TENSOR: stream << "USER_CL_TENSOR"; break; + case RemoteTensorSharingType::PLUGIN_CL_TENSOR: stream << "PLUGIN_CL_TENSOR"; break; + case RemoteTensorSharingType::USER_USM_HOST_TENSOR: stream << "USER_USM_HOST_TENSOR"; break; + case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: stream << "USER_USM_DEVICE_TENSOR"; break; + case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: stream << "PLUGIN_USM_HOST_TENSOR"; break; + case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: stream << "PLUGIN_USM_DEVICE_TENSOR"; break; + case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: stream << "PLUGIN_HOST_TENSOR"; break; + } + + return stream; +} + +class OVRemoteTensorInputBlob_Test : public OVRemoteTensor_Test, public testing::WithParamInterface { +public: + void SetUp() override { + fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat(); + } + + static std::string getTestCaseName(testing::TestParamInfo obj) { + RemoteTensorSharingType sharing_type = obj.param; + + std::ostringstream result; + result << sharing_type; + return result.str(); + } +}; + +TEST_P(OVRemoteTensorInputBlob_Test, smoke_canInputRemoteTensor) { #if defined(ANDROID) GTEST_SKIP(); #endif @@ -45,6 +84,8 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) { auto exec_net = ie.compile_model(function, CommonTestUtils::DEVICE_GPU); + RemoteTensorSharingType sharing_type = GetParam(); + // regular inference auto inf_req_regular = exec_net.create_infer_request(); auto input = function->get_parameters().at(0); @@ -65,16 +106,129 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) { auto imSize = ov::shape_size(input->get_shape()); - cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err); - { - void* buffer = fakeImageData.data(); - ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer); + switch (sharing_type) { + case RemoteTensorSharingType::USER_CL_TENSOR: { + cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err); + { + void* buffer = fakeImageData.data(); + ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer); + } + + auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer); + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + break; + } + case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: { + if (!ocl_instance->supports_usm()) + GTEST_SKIP(); + + void* shared_buffer = ocl_instance->allocate_usm_device_buffer(imSize); + { + void* buffer = fakeImageData.data(); + err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr); + if (err != CL_SUCCESS) + FAIL() << "Failed to copy data from host buffer to USM device"; + } + + auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer); + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + ocl_instance->free_mem(shared_buffer); + + break; + } + case RemoteTensorSharingType::USER_USM_HOST_TENSOR: { + if (!ocl_instance->supports_usm()) + GTEST_SKIP(); + + void* shared_buffer = ocl_instance->allocate_usm_host_buffer(imSize); + { + void* buffer = fakeImageData.data(); + std::memcpy(shared_buffer, buffer, imSize); + } + + auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer); + 
inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + ocl_instance->free_mem(shared_buffer); + + break; + } + case RemoteTensorSharingType::PLUGIN_CL_TENSOR: { + auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape()); + ASSERT_TRUE(cldnn_tensor.is()); + auto cl_tensor = cldnn_tensor.as(); + { + cl::Buffer shared_buffer = cl_tensor; + void* buffer = fakeImageData.data(); + ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer); + } + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + break; + } + case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: { + if (!ocl_instance->supports_usm()) + GTEST_SKIP(); + + auto cldnn_tensor = cldnn_context.create_usm_host_tensor(input->get_element_type(), input->get_shape()); + ASSERT_TRUE(cldnn_tensor.is()); + { + auto cl_tensor = cldnn_tensor.as(); + void* shared_buffer = cl_tensor.get(); + ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL); + void* buffer = fakeImageData.data(); + std::memcpy(shared_buffer, buffer, imSize); + } + + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + break; + } + case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: { + if (!ocl_instance->supports_usm()) + GTEST_SKIP(); + + auto cldnn_tensor = cldnn_context.create_usm_device_tensor(input->get_element_type(), input->get_shape()); + ASSERT_TRUE(cldnn_tensor.is()); + { + auto cl_tensor = cldnn_tensor.as(); + void* shared_buffer = cl_tensor.get(); + ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_DEVICE_INTEL); + void* buffer = fakeImageData.data(); + err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr); + if (err != CL_SUCCESS) + FAIL() << "Failed to copy data from host buffer to USM device"; + } + + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + break; + } + case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: { + auto cldnn_tensor = cldnn_context.create_host_tensor(input->get_element_type(), input->get_shape()); + { + ASSERT_NO_THROW(cldnn_tensor.data()); + void* shared_buffer = cldnn_tensor.data(); + if (ocl_instance->supports_usm()) + ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL); + void* buffer = fakeImageData.data(); + std::memcpy(shared_buffer, buffer, imSize); + } + + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + break; + } } - auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer); - inf_req_shared.set_tensor(input, cldnn_tensor); - - inf_req_shared.infer(); auto output_tensor_shared = inf_req_shared.get_tensor(output); // compare results @@ -88,6 +242,18 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) { } } +INSTANTIATE_TEST_SUITE_P( + smoke_GPU, + OVRemoteTensorInputBlob_Test, + ::testing::ValuesIn(std::vector{RemoteTensorSharingType::USER_CL_TENSOR, + RemoteTensorSharingType::PLUGIN_CL_TENSOR, + RemoteTensorSharingType::USER_USM_HOST_TENSOR, + RemoteTensorSharingType::USER_USM_DEVICE_TENSOR, + RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR, + RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR, + RemoteTensorSharingType::PLUGIN_HOST_TENSOR}), + OVRemoteTensorInputBlob_Test::getTestCaseName); + TEST_F(OVRemoteTensor_Test, smoke_canInferOnUserContext) { auto ie = ov::runtime::Core(); diff --git 
a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/remote_blob_helpers.hpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/remote_blob_helpers.hpp index 5704797917e..0ff3ec4aeff 100644 --- a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/remote_blob_helpers.hpp +++ b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/remote_blob_helpers.hpp @@ -18,10 +18,57 @@ #endif #include +namespace { +template +T load_entrypoint(const cl_platform_id platform, const std::string name) { +#if defined(__GNUC__) && __GNUC__ < 5 +// OCL spec says: +// "The function clGetExtensionFunctionAddressForPlatform returns the address of the extension function named by funcname for a given platform. +// The pointer returned should be cast to a function pointer type matching the extension function's definition defined in the appropriate extension +// specification and header file." +// So the pointer-to-object to pointer-to-function cast below is supposed to be valid, thus we suppress warning from old GCC versions. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + T p = reinterpret_cast(clGetExtensionFunctionAddressForPlatform(platform, name.c_str())); +#if defined(__GNUC__) && __GNUC__ < 5 +#pragma GCC diagnostic pop +#endif + if (!p) { + throw std::runtime_error("clGetExtensionFunctionAddressForPlatform(" + name + ") returned NULL."); + } + return p; +} + +template +T try_load_entrypoint(const cl_platform_id platform, const std::string name) { + try { + return load_entrypoint(platform, name); + } catch (...) { + return nullptr; + } +} +} // namespace + struct OpenCL { cl::Context _context; cl::Device _device; cl::CommandQueue _queue; + cl_platform_id _platform; + + clHostMemAllocINTEL_fn _host_mem_alloc_fn = nullptr; + clMemFreeINTEL_fn _mem_free_fn = nullptr; + clDeviceMemAllocINTEL_fn _device_mem_alloc_fn = nullptr; + clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr; + clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr; + + void init_extension_functions(cl_platform_id platform) { + _host_mem_alloc_fn = try_load_entrypoint(platform, "clHostMemAllocINTEL"); + _device_mem_alloc_fn = try_load_entrypoint(platform, "clDeviceMemAllocINTEL"); + _mem_free_fn = try_load_entrypoint(platform, "clMemFreeINTEL"); + _enqueue_memcpy_fn = try_load_entrypoint(platform, "clEnqueueMemcpyINTEL"); + _get_mem_alloc_info_fn = try_load_entrypoint(platform, "clGetMemAllocInfoINTEL"); + } explicit OpenCL(std::shared_ptr> media_api_context_properties = nullptr) { // get Intel iGPU OCL device, create context and queue @@ -42,12 +89,15 @@ struct OpenCL { if (refVendorID == d.getInfo()) { _device = d; _context = cl::Context(_device); + _platform = id; break; } } } cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; _queue = cl::CommandQueue(_context, _device, props); + + init_extension_functions(_platform); } } @@ -56,7 +106,81 @@ struct OpenCL { _context = cl::Context(context, true); _device = cl::Device(_context.getInfo()[0].get(), true); + cl_int error = clGetDeviceInfo(_device.get(), CL_DEVICE_PLATFORM, sizeof(_platform), &_platform, nullptr); + if (error) { + throw std::runtime_error("OpenCL helper failed to retrieve CL_DEVICE_PLATFORM: " + std::to_string(error)); + } + cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; _queue = cl::CommandQueue(_context, _device, props); + + init_extension_functions(_platform); + } + + bool supports_usm() const { + return _host_mem_alloc_fn != nullptr && 
+ _device_mem_alloc_fn != nullptr && + _mem_free_fn != nullptr && + _enqueue_memcpy_fn != nullptr && + _get_mem_alloc_info_fn != nullptr; + } + + void* allocate_usm_host_buffer(size_t size) const { + cl_int err_code_ret; + if (!_device_mem_alloc_fn) + throw std::runtime_error("[GPU] clHostMemAllocINTEL is nullptr"); + auto ret_ptr = _host_mem_alloc_fn(_context.get(), nullptr, size, 0, &err_code_ret); + if (err_code_ret != CL_SUCCESS) + throw std::runtime_error("OpenCL helper failed to allocate USM host memory"); + return ret_ptr; + } + + void* allocate_usm_device_buffer(size_t size) const { + cl_int err_code_ret; + if (!_device_mem_alloc_fn) + throw std::runtime_error("[GPU] clDeviceMemAllocINTEL is nullptr"); + auto ret_ptr = _device_mem_alloc_fn(_context.get(), _device.get(), nullptr, size, 0, &err_code_ret); + if (err_code_ret != CL_SUCCESS) + throw std::runtime_error("OpenCL helper failed to allocate USM device memory"); + return ret_ptr; + } + + void free_mem(void* usm_ptr) { + if (!_mem_free_fn) + throw std::runtime_error("[GPU] clMemFreeINTEL is nullptr"); + + _mem_free_fn(_context.get(), usm_ptr); + } + + cl_int memcpy(const cl::CommandQueue& cpp_queue, void *dst_ptr, const void *src_ptr, + size_t bytes_count, bool blocking = true, const std::vector* wait_list = nullptr, cl::Event* ret_event = nullptr) const { + if (!_enqueue_memcpy_fn) + throw std::runtime_error("[GPU] clEnqueueMemcpyINTEL is nullptr"); + cl_event tmp; + cl_int err = _enqueue_memcpy_fn( + cpp_queue.get(), + static_cast(blocking), + dst_ptr, + src_ptr, + bytes_count, + wait_list == nullptr ? 0 : static_cast(wait_list->size()), + wait_list == nullptr ? nullptr : reinterpret_cast(&wait_list->front()), + ret_event == nullptr ? nullptr : &tmp); + + if (ret_event != nullptr && err == CL_SUCCESS) + *ret_event = tmp; + + return err; + } + + cl_unified_shared_memory_type_intel get_allocation_type(const void* usm_ptr) const { + if (!_get_mem_alloc_info_fn) { + throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr"); + } + + cl_unified_shared_memory_type_intel ret_val; + size_t ret_val_size; + _get_mem_alloc_info_fn(_context.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size); + return ret_val; } }; diff --git a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp index 8114009dd9a..acc57d689ea 100644 --- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp +++ b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp @@ -62,6 +62,9 @@ public: /// Create shared memory object using user-supplied memory buffer @p buf using specified @p layout memory_ptr share_buffer(const layout& layout, shared_handle buf); + /// Create shared memory object using user-supplied USM pointer @p usm_ptr using specified @p layout + memory_ptr share_usm(const layout& layout, shared_handle usm_ptr); + /// Create shared memory object using user-supplied 2D image @p img using specified @p layout memory_ptr share_image(const layout& layout, shared_handle img); diff --git a/inference-engine/thirdparty/clDNN/runtime/engine.cpp b/inference-engine/thirdparty/clDNN/runtime/engine.cpp index 7e40a4ebf52..df15924c1c3 100644 --- a/inference-engine/thirdparty/clDNN/runtime/engine.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/engine.cpp @@ -92,6 +92,17 @@ memory_ptr engine::share_buffer(const layout& layout, shared_handle buf) { return reinterpret_handle(layout, params); } +memory_ptr 
engine::share_usm(const layout& layout, shared_handle usm_ptr) { + shared_mem_params params = { shared_mem_type::shared_mem_usm, nullptr, nullptr, usm_ptr, +#ifdef _WIN32 + nullptr, +#else + 0, +#endif + 0 }; + return reinterpret_handle(layout, params); +} + memory::ptr engine::share_image(const layout& layout, shared_handle img) { shared_mem_params params = { shared_mem_type::shared_mem_image, nullptr, nullptr, img, #ifdef _WIN32 diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp index 5b67c37e8ff..a0b2774ef23 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp @@ -168,6 +168,9 @@ memory::ptr ocl_engine::reinterpret_handle(const layout& new_layout, shared_mem_ } else if (params.mem_type == shared_mem_type::shared_mem_buffer) { cl::Buffer buf(static_cast(params.mem), true); return std::make_shared(this, new_layout, buf); + } else if (params.mem_type == shared_mem_type::shared_mem_usm) { + cl::UsmMemory usm_buffer(get_usm_helper(), params.mem); + return std::make_shared(this, new_layout, usm_buffer); } else { throw std::runtime_error("unknown shared object fromat or type"); } diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp index a8535913603..c6a96460404 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp @@ -524,6 +524,7 @@ public: _enqueue_memcpy_fn = try_load_entrypoint(_ctx.get(), "clEnqueueMemcpyINTEL"); _enqueue_mem_fill_fn = try_load_entrypoint(_ctx.get(), "clEnqueueMemFillINTEL"); _enqueue_memset_fn = try_load_entrypoint(_ctx.get(), "clEnqueueMemsetINTEL"); + _get_mem_alloc_info_fn = try_load_entrypoint(_ctx.get(), "clGetMemAllocInfoINTEL"); } } @@ -621,6 +622,17 @@ public: return err; } + cl_unified_shared_memory_type_intel get_usm_allocation_type(const void* usm_ptr) const { + if (!_get_mem_alloc_info_fn) { + throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr"); + } + + cl_unified_shared_memory_type_intel ret_val; + size_t ret_val_size; + _get_mem_alloc_info_fn(_ctx.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size); + return ret_val; + } + private: cl::Context _ctx; cl::Device _device; @@ -632,6 +644,7 @@ private: clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr; clEnqueueMemFillINTEL_fn _enqueue_mem_fill_fn = nullptr; clEnqueueMemsetINTEL_fn _enqueue_memset_fn = nullptr; + clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr; }; /* @@ -640,11 +653,16 @@ private: */ class UsmHolder { public: - UsmHolder(const cl::UsmHelper& usmHelper, void* ptr) : _usmHelper(usmHelper), _ptr(ptr) { } + UsmHolder(const cl::UsmHelper& usmHelper, void* ptr, bool shared_memory = false) + : _usmHelper(usmHelper) + , _ptr(ptr) + , _shared_memory(shared_memory) { } + void* ptr() { return _ptr; } ~UsmHolder() { try { - _usmHelper.free_mem(_ptr); + if (!_shared_memory) + _usmHelper.free_mem(_ptr); } catch (...) { // Exception may happen only when clMemFreeINTEL function is unavailable, thus can't free memory properly } @@ -652,6 +670,7 @@ public: private: const cl::UsmHelper& _usmHelper; void* _ptr; + bool _shared_memory = false; }; /* USM base class. Different usm types should derive from this class. 
@@ -659,6 +678,13 @@ private: class UsmMemory { public: explicit UsmMemory(const cl::UsmHelper& usmHelper) : _usmHelper(usmHelper) { } + UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr) + : _usmHelper(usmHelper) + , _usm_pointer(std::make_shared(_usmHelper, usm_ptr, true)) { + if (!usm_ptr) { + throw std::runtime_error("[GPU] Can't share null usm pointer"); + } + } // Get methods returns original pointer allocated by openCL. void* get() const { return _usm_pointer->ptr(); } diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp index 27e9331a74f..3bd44357aa4 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp @@ -279,6 +279,12 @@ gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemo , _buffer(buffer) { } +gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& buffer) + : lockable_gpu_mem() + , memory(engine, new_layout, detect_allocation_type(engine, buffer), true) + , _buffer(buffer) { +} + gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type) : lockable_gpu_mem() , memory(engine, layout, type, false) @@ -393,6 +399,20 @@ shared_mem_params gpu_usm::get_internal_params() const { }; } +allocation_type gpu_usm::detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer) { + auto cl_alloc_type = engine->get_usm_helper().get_usm_allocation_type(buffer.get()); + + allocation_type res = allocation_type::unknown; + switch (cl_alloc_type) { + case CL_MEM_TYPE_DEVICE_INTEL: res = allocation_type::usm_device; break; + case CL_MEM_TYPE_HOST_INTEL: res = allocation_type::usm_host; break; + case CL_MEM_TYPE_SHARED_INTEL: res = allocation_type::usm_shared; break; + default: throw std::runtime_error("[GPU] Unsupported USM alloc type: " + std::to_string(cl_alloc_type)); + } + + return res; +} + std::vector ocl_surfaces_lock::get_handles(std::vector mem) const { std::vector res; for (auto& m : mem) { diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp index 1ef23a81963..fa89bf42eb2 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp @@ -100,6 +100,7 @@ private: struct gpu_usm : public lockable_gpu_mem, public memory { gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer, allocation_type type); + gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer); gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type); void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override; @@ -120,6 +121,8 @@ struct gpu_usm : public lockable_gpu_mem, public memory { protected: cl::UsmMemory _buffer; + + static allocation_type detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer); }; struct ocl_surfaces_lock : public surfaces_lock {