[GPU] USM sharing and host blob creation in gpu remote context (#8657)
parent c49620bb6a
commit 6addc0d535
@@ -966,9 +966,25 @@ void CLDNNInferRequest::prepare_output(const cldnn::primitive_id& outputName, Bl
}

InferenceEngine::Blob::Ptr CLDNNInferRequest::create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout) {
-    auto blobPtr = std::make_shared<CLDNNRemoteCLbuffer>(m_graph->GetContext(), m_graph->GetNetwork()->get_stream(), desc, layout);
-    getBlobImpl(blobPtr.get())->allocate();
-    return blobPtr;
+    if (m_graph->GetEngine()->use_unified_shared_memory()) {
+        auto blobPtr = std::make_shared<CLDNNRemoteUSMbuffer>(m_graph->GetContext(),
+                                                              m_graph->GetNetwork()->get_stream(),
+                                                              desc,
+                                                              layout,
+                                                              nullptr,
+                                                              0,
+                                                              0,
+                                                              CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
+        getBlobImpl(blobPtr.get())->allocate();
+        return blobPtr;
+    } else {
+        auto blobPtr = std::make_shared<CLDNNRemoteCLbuffer>(m_graph->GetContext(),
+                                                             m_graph->GetNetwork()->get_stream(),
+                                                             desc,
+                                                             layout);
+        getBlobImpl(blobPtr.get())->allocate();
+        return blobPtr;
+    }
}

} // namespace CLDNNPlugin
@@ -38,6 +38,24 @@ ParamMap CLDNNRemoteBlobImpl::getParams() const {
            { GPU_PARAM_KEY(OCL_CONTEXT), params.context },
            { GPU_PARAM_KEY(MEM_HANDLE), params.mem }
        };
+    case BT_USM_SHARED:
+        return{
+            { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER) },
+            { GPU_PARAM_KEY(OCL_CONTEXT), params.context },
+            { GPU_PARAM_KEY(MEM_HANDLE), params.mem }
+        };
+    case BT_USM_HOST_INTERNAL:
+        return{
+            { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER) },
+            { GPU_PARAM_KEY(OCL_CONTEXT), params.context },
+            { GPU_PARAM_KEY(MEM_HANDLE), params.mem }
+        };
+    case BT_USM_DEVICE_INTERNAL:
+        return{
+            { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER) },
+            { GPU_PARAM_KEY(OCL_CONTEXT), params.context },
+            { GPU_PARAM_KEY(MEM_HANDLE), params.mem }
+        };
#ifdef _WIN32
    case BT_DX_BUF_SHARED:
        return{

@@ -81,7 +99,7 @@ bool CLDNNRemoteBlobImpl::is_locked() const noexcept {
    return lockedHolder != nullptr;
}

-void CLDNNRemoteBlobImpl::allocate() noexcept {
+void CLDNNRemoteBlobImpl::allocate() {
    OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNRemoteBlobImpl::Allocate");
    assert(m_memObject == nullptr);

@@ -91,13 +109,25 @@ void CLDNNRemoteBlobImpl::allocate() noexcept {

    switch (m_mem_type) {
    case BlobType::BT_BUF_INTERNAL: {
-        m_memObject = eng->allocate_memory(m_layout);
+        m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::cl_mem);
        break;
    }
+    case BlobType::BT_USM_HOST_INTERNAL: {
+        m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_host);
+        break;
+    }
+    case BlobType::BT_USM_DEVICE_INTERNAL: {
+        m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_device);
+        break;
+    }
    case BlobType::BT_BUF_SHARED: {
        m_memObject = eng->share_buffer(m_layout, m_mem);
        break;
    }
+    case BlobType::BT_USM_SHARED: {
+        m_memObject = eng->share_usm(m_layout, m_mem);
+        break;
+    }
#ifdef _WIN32
    case BlobType::BT_SURF_SHARED: {
        m_memObject = eng->share_surface(m_layout, m_mem, m_plane);

@@ -139,6 +169,9 @@ std::shared_ptr<RemoteContext> CLDNNRemoteBlobImpl::getContext() const noexcept
}

void CLDNNRemoteBlobImpl::lock() const {
+    if (!is_allocated()) {
+        IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated";
+    }
    lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
    auto ptr = lockedHolder->data();
    _handle = reinterpret_cast<void*>(ptr);

@@ -295,15 +328,17 @@ std::string CLDNNExecutionContextImpl::getDeviceName() const noexcept {

    auto engine_type = cldnn::engine_types::ocl;
    auto runtime_type = cldnn::runtime_types::ocl;
-    // Use actual runtime and engine types
-    cldnn::device_query device_query(engine_type, runtime_type);
-    auto all_devices = device_query.get_available_devices();
-    auto current_device = m_engine->get_device();
+    try {
+        // Use actual runtime and engine types
+        cldnn::device_query device_query(engine_type, runtime_type);
+        auto all_devices = device_query.get_available_devices();
+        auto current_device = m_engine->get_device();

-    for (auto& kv : all_devices) {
-        if (current_device->is_same(kv.second))
-            return devName + "." + kv.first;
-    }
+        for (auto& kv : all_devices) {
+            if (current_device->is_same(kv.second))
+                return devName + "." + kv.first;
+        }
+    } catch (...) { }

    if (!m_config.device_id.empty())
        devName += "." + m_config.device_id;
@@ -8,6 +8,7 @@
#include <cldnn/runtime/engine.hpp>
#include <ie_parameter.hpp>
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
+#include <blob_factory.hpp>
#include <ie_remote_context.hpp>
#include "cldnn_config.h"
#include "cldnn_common_utils.h"

@@ -37,6 +38,9 @@ public:
        BT_EMPTY,
        BT_BUF_INTERNAL,
        BT_BUF_SHARED,
+        BT_USM_SHARED,
+        BT_USM_HOST_INTERNAL,
+        BT_USM_DEVICE_INTERNAL,
        BT_IMG_SHARED,
        BT_SURF_SHARED,
        BT_DX_BUF_SHARED,

@@ -50,7 +54,7 @@ public:
                        uint32_t plane = 0,
                        BlobType mem_type = BT_BUF_INTERNAL);

-    void allocate() noexcept;
+    void allocate();
    bool deallocate() noexcept;
    InferenceEngine::ParamMap getParams() const;
    std::string getDeviceName() const noexcept;

@@ -106,7 +110,11 @@ public:
        : _impl(context, stream, layout, mem, surf, plane, mem_type)
        , TpublicAPI(desc) {}

-    void allocate() noexcept override { _impl.allocate(); }
+    void allocate() noexcept override {
+        try {
+            _impl.allocate();
+        } catch (...) {}
+    }
    bool deallocate() noexcept override { return _impl.deallocate(); }
    InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); }
    std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); }

@@ -125,6 +133,7 @@ protected:
};

using CLDNNRemoteCLbuffer = typedCLDNNRemoteBlob<InferenceEngine::gpu::ClBufferBlob>;
+using CLDNNRemoteUSMbuffer = typedCLDNNRemoteBlob<InferenceEngine::gpu::USMBlob>;
using CLDNNRemoteCLImage2D = typedCLDNNRemoteBlob<InferenceEngine::gpu::ClImage2DBlob>;
#ifdef _WIN32
using CLDNNRemoteD3DBuffer = typedCLDNNRemoteBlob<InferenceEngine::gpu::D3DBufferBlob>;

@@ -157,6 +166,10 @@ inline CLDNNRemoteBlobImpl* getBlobImpl(InferenceEngine::gpu::ClBlob* blobPtr) {
        auto ptr = blobPtr->as<CLDNNRemoteCLImage2D>();
        if (ptr) return ptr->getImpl();
    }
+    {
+        auto ptr = blobPtr->as<CLDNNRemoteUSMbuffer>();
+        if (ptr) return ptr->getImpl();
+    }
    return nullptr;
}
@@ -204,6 +217,58 @@ public:
    bool free(void* handle) noexcept override { return true; }
};

+class USMHostAllocator : public InferenceEngine::IAllocator {
+protected:
+    InferenceEngine::gpu::USMBlob::Ptr _usm_host_blob = nullptr;
+    InferenceEngine::gpu::ClContext* _context = nullptr;
+
+public:
+    using Ptr = std::shared_ptr<USMHostAllocator>;
+
+    USMHostAllocator(InferenceEngine::gpu::ClContext* context) : _context(context) { }
+    /**
+    * @brief Maps handle to heap memory accessible by any memory manipulation routines.
+    * @return Generic pointer to memory
+    */
+    void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override {
+        if (!_usm_host_blob)
+            return nullptr;
+        return _usm_host_blob->get();
+    };
+
+    /**
+    * @brief Unmaps memory by handle with multiple sequential mappings of the same handle.
+    * The multiple sequential mappings of the same handle are suppose to get the same
+    * result while there isn't a ref counter supported.
+    */
+    void unlock(void* handle) noexcept override {}
+
+    /**
+    * @brief Allocates memory
+    * @param size The size in bytes to allocate
+    * @return Handle to the allocated resource
+    */
+    void* alloc(size_t size) noexcept override {
+        auto td = InferenceEngine::TensorDesc(InferenceEngine::Precision::U8, InferenceEngine::SizeVector{size}, InferenceEngine::Layout::C);
+        InferenceEngine::ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
+        _usm_host_blob = std::dynamic_pointer_cast<InferenceEngine::gpu::USMBlob>(_context->CreateBlob(td, params));
+        _usm_host_blob->allocate();
+        return _usm_host_blob->get();
+    }
+
+    /**
+    * @brief Releases handle and all associated memory resources which invalidates the handle.
+    * @return false if handle cannot be released, otherwise - true.
+    */
+    bool free(void* handle) noexcept override {
+        try {
+            _usm_host_blob = nullptr;
+        } catch(...) { }
+        return true;
+    }
+};
+

class CLDNNExecutionContextImpl : public InferenceEngine::gpu::details::param_map_obj_getter {
public:
    enum ContextType {
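Usage sketch (not part of the diff): the allocator above is meant to be handed to the blob factory so that an ordinary MemoryBlob ends up backed by USM host memory, which is exactly what the CreateHostBlob() override later in this change does. The helper name and the arguments 'ctx' and 'desc' below are assumptions for illustration only.

// Hypothetical helper built on top of USMHostAllocator and blob_factory.hpp.
InferenceEngine::MemoryBlob::Ptr make_usm_host_blob(InferenceEngine::gpu::ClContext* ctx,
                                                    const InferenceEngine::TensorDesc& desc) {
    auto allocator = std::make_shared<USMHostAllocator>(ctx);            // allocator defined above
    auto blob = std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(
        make_blob_with_precision(desc, allocator));                      // factory from blob_factory.hpp
    if (blob)
        blob->allocate();                                                // triggers USMHostAllocator::alloc()
    return blob;
}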
@@ -335,6 +400,9 @@ class typedCLDNNExecutionContext : public TpublicContextAPI {
        case CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED:
            ret = std::make_shared<CLDNNRemoteCLbuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
            break;
+        case CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED:
+            ret = std::make_shared<CLDNNRemoteUSMbuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
+            break;
        case CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED:
            layout.format = ImageFormatFromLayout(tensorDesc.getLayout());
            ret = std::make_shared<CLDNNRemoteCLImage2D>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);

@@ -368,6 +436,21 @@ class typedCLDNNExecutionContext : public TpublicContextAPI {
                                                    CLDNNRemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
    }

+    InferenceEngine::RemoteBlob::Ptr create_usm(const InferenceEngine::TensorDesc& tensorDesc, CLDNNRemoteBlobImpl::BlobType alloc_type) {
+        cldnn::layout layout(DataTypeFromPrecision(tensorDesc.getPrecision()),
+                             FormatFromLayout(tensorDesc.getLayout()),
+                             CldnnTensorFromIEDims(tensorDesc.getDims()));
+        auto smart_this = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(this->shared_from_this());
+        auto& stream = _impl.GetEngine()->get_program_stream();
+
+        return std::make_shared<CLDNNRemoteUSMbuffer>(smart_this,
+                                                      stream,
+                                                      tensorDesc,
+                                                      layout,
+                                                      nullptr, 0, 0,
+                                                      alloc_type);
+    }
+
    void check_if_shared() {
        if (GetType() != CLDNNExecutionContextImpl::ContextType::DEV_SHARED)
            IE_THROW() << "Shared context is required to to share this type of memory";

@@ -382,9 +465,16 @@ public:
                               const Config& config = {})
        : _impl(plugin, params, config) {}

-    InferenceEngine::ParamMap getParams() const noexcept override { return _impl.getParams(); }
+    InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); }
    std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); }

+    InferenceEngine::MemoryBlob::Ptr CreateHostBlob(const InferenceEngine::TensorDesc& tensorDesc) override {
+        if (_impl.GetEngine()->use_unified_shared_memory())
+            return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(tensorDesc, std::make_shared<USMHostAllocator>(this)));
+        else
+            return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(tensorDesc));
+    }
+
    InferenceEngine::RemoteBlob::Ptr CreateBlob(const InferenceEngine::TensorDesc& tensorDesc, const InferenceEngine::ParamMap& params = {}) override {
        using namespace InferenceEngine;
        using InferenceEngine::gpu::details::param_map_obj_getter;

@@ -395,9 +485,21 @@ public:
            // user will supply shared object handle
            std::string memTypeStr = param_map_obj_getter::_StrFromParams(params, GPU_PARAM_KEY(SHARED_MEM_TYPE));

+            bool is_usm = memTypeStr == GPU_PARAM_VALUE(USM_HOST_BUFFER) ||
+                          memTypeStr == GPU_PARAM_VALUE(USM_DEVICE_BUFFER) ||
+                          memTypeStr == GPU_PARAM_VALUE(USM_USER_BUFFER);
+
+            if (is_usm && !_impl.GetEngine()->use_unified_shared_memory()) {
+                IE_THROW(NotAllocated) << "Can't create USM tensor as USM is not supported (or manually disabled) on current device";
+            }
+
            if (GPU_PARAM_VALUE(VA_SURFACE) == memTypeStr) {
                check_if_shared();
                return reuse_surf(tensorDesc, params);
+            } else if (GPU_PARAM_VALUE(USM_HOST_BUFFER) == memTypeStr) {
+                return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
+            } else if (GPU_PARAM_VALUE(USM_DEVICE_BUFFER) == memTypeStr) {
+                return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
            } else {
                CLDNNRemoteBlobImpl::BlobType blob_type;
                cldnn::shared_handle mem = nullptr;

@@ -405,6 +507,9 @@ public:
                if (GPU_PARAM_VALUE(OCL_BUFFER) == memTypeStr) {
                    blob_type = CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED;
                    mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
+                } else if (GPU_PARAM_VALUE(USM_USER_BUFFER) == memTypeStr) {
+                    blob_type = CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED;
+                    mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
                } else if (GPU_PARAM_VALUE(OCL_IMAGE2D) == memTypeStr) {
                    blob_type = CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED;
                    mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
@@ -131,6 +131,49 @@ public:
    }
};

+/**
+ * @brief This class represents an abstraction for GPU plugin remote blob
+ * which can be shared with user-supplied USM pointer.
+ * The plugin object derived from this class can be obtained with CreateBlob() call.
+ * @note User can obtain USM pointer from this class.
+ */
+class USMBlob : public ClBlob, public details::param_map_obj_getter {
+public:
+    /**
+     * @brief A smart pointer to the ClBufferBlob object
+     */
+    using Ptr = std::shared_ptr<USMBlob>;
+
+    /**
+     * @brief Creates a ClBufferBlob object with the specified dimensions and layout.
+     * @param tensorDesc Tensor description
+     */
+    explicit USMBlob(const TensorDesc& tensorDesc) : ClBlob(tensorDesc) {}
+
+    /**
+     * @brief Returns the underlying OpenCL memory object handle.
+     * @return underlying OpenCL memory object handle
+     */
+    void* get() {
+        const auto& params = getParams();
+        auto itrType = params.find(GPU_PARAM_KEY(SHARED_MEM_TYPE));
+        if (itrType == params.end())
+            IE_THROW() << "Parameter of type " << GPU_PARAM_KEY(SHARED_MEM_TYPE) << " not found";
+
+        auto mem_type = itrType->second.as<std::string>();
+        if (mem_type != GPU_PARAM_VALUE(USM_USER_BUFFER) && mem_type != GPU_PARAM_VALUE(USM_HOST_BUFFER) &&
+            mem_type != GPU_PARAM_VALUE(USM_DEVICE_BUFFER))
+            IE_THROW() << "Unexpected USM blob type: " << mem_type;
+
+        auto itrHandle = params.find(GPU_PARAM_KEY(MEM_HANDLE));
+        if (itrHandle == params.end()) {
+            IE_THROW() << "No parameter " << GPU_PARAM_KEY(MEM_HANDLE) << " found";
+        }
+
+        return itrHandle->second.as<gpu_handle_param>();
+    }
+};
+
/**
 * @brief This class represents an abstraction for GPU plugin remote blob
 * which can be shared with user-supplied OpenCL 2D Image.
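Usage sketch (not part of the diff): a remote blob created with one of the USM memory types can be downcast to USMBlob to reach the raw USM pointer. 'context' is assumed to be an InferenceEngine::gpu::ClContext::Ptr obtained from ExecutableNetwork::GetContext(), and 'desc' a valid TensorDesc.

// Ask the plugin for a USM host allocation and read back its pointer.
InferenceEngine::ParamMap params = {
    {GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
auto remote_blob = context->CreateBlob(desc, params);
remote_blob->allocate();
if (auto usm_blob = std::dynamic_pointer_cast<InferenceEngine::gpu::USMBlob>(remote_blob)) {
    void* usm_ptr = usm_blob->get();   // host-accessible USM pointer
    // ... fill usm_ptr with input data ...
}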
@@ -98,7 +98,18 @@ DECLARE_GPU_PARAM_VALUE(OCL_BUFFER);
 * @brief Shared OpenCL 2D image blob
 */
DECLARE_GPU_PARAM_VALUE(OCL_IMAGE2D);

+/**
+* @brief Shared USM pointer allocated by user
+*/
+DECLARE_GPU_PARAM_VALUE(USM_USER_BUFFER);
+/**
+* @brief Shared USM pointer type with host allocation type allocated by plugin
+*/
+DECLARE_GPU_PARAM_VALUE(USM_HOST_BUFFER);
+/**
+* @brief Shared USM pointer type with device allocation type allocated by plugin
+*/
+DECLARE_GPU_PARAM_VALUE(USM_DEVICE_BUFFER);
/**
* @brief Shared video decoder surface or D3D 2D texture blob
*/
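Sketch of the parameter maps these new values enable (not part of the diff; 'usm_ptr' stands for a USM pointer the application allocated itself):

// Share an existing user USM allocation with the plugin.
InferenceEngine::ParamMap user_usm = {
    {GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER)},
    {GPU_PARAM_KEY(MEM_HANDLE), static_cast<InferenceEngine::gpu_handle_param>(usm_ptr)}};
// Ask the plugin to allocate USM memory itself; no MEM_HANDLE is needed.
InferenceEngine::ParamMap plugin_usm_host   = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
InferenceEngine::ParamMap plugin_usm_device = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}};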
@@ -23,7 +23,7 @@ namespace InferenceEngine {
 * Such context represents a scope on the device within which executable
 * networks and remote memory blobs can exist, function and exchange data.
 */
-class RemoteContext : public std::enable_shared_from_this<RemoteContext> {
+class INFERENCE_ENGINE_API_CLASS(RemoteContext) : public std::enable_shared_from_this<RemoteContext> {
public:
    /**
     * @brief A smart pointer to the RemoteContext object

@@ -110,6 +110,14 @@ public:
     */
    virtual RemoteBlob::Ptr CreateBlob(const TensorDesc& tensorDesc, const ParamMap& params = {}) = 0;

+    /**
+     * @brief Allocates host accessible memory blob friendly for the device in current context
+     * Returns a pointer to the object which implements MemoryBlob interface.
+     * @param tensorDesc Defines the layout and dims of the blob
+     * @return A pointer to host accessible MemoryBlob object
+     */
+    virtual MemoryBlob::Ptr CreateHostBlob(const TensorDesc& tensorDesc);
+
    /**
     * @brief Returns a map of device-specific parameters required for low-level
     * operations with underlying object.
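Usage sketch (not part of the diff): the base implementation falls back to a regular host blob, while the GPU context override shown earlier returns a USM-host-backed blob when the device supports it. 'exec_net', 'infer_request', 'desc' and the input name below are assumed to be provided by the application.

auto remote_ctx = exec_net.GetContext();            // InferenceEngine::RemoteContext::Ptr
auto host_blob = remote_ctx->CreateHostBlob(desc);  // device-friendly host memory
host_blob->allocate();
infer_request.SetBlob("input", host_blob);          // "input" is a placeholder name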
@@ -102,9 +102,10 @@ public:
 * @note User can also obtain OpenCL context handle from this class.
 */
class D3DContext : public ClContext {
-    using RemoteContext::create_tensor;
-
public:
+    // Needed to make create_tensor overloads from base class visible for user
+    using ClContext::create_tensor;
+
    /**
     * @brief Checks that type defined runtime paramters are presented in remote object
     * @param remote_context remote context to check

@@ -117,6 +117,36 @@ public:
    }
};

+/**
+ * @brief This class represents an abstraction for GPU plugin remote tensor
+ * which can be shared with user-supplied USM device pointer.
+ * The plugin object derived from this class can be obtained with ClContext::create_tensor() call.
+ * @note User can obtain USM pointer from this class.
+ */
+class USMTensor : public RemoteTensor {
+public:
+    /**
+     * @brief Checks that type defined runtime paramters are presented in remote object
+     * @param tensor a tensor to check
+     */
+    static void type_check(const Tensor& tensor) {
+        RemoteTensor::type_check(tensor,
+                                 {{GPU_PARAM_KEY(MEM_HANDLE), {}},
+                                  {GPU_PARAM_KEY(SHARED_MEM_TYPE),
+                                   {GPU_PARAM_VALUE(USM_USER_BUFFER),
+                                    GPU_PARAM_VALUE(USM_HOST_BUFFER),
+                                    GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}}});
+    }
+
+    /**
+     * @brief Returns the underlying USM pointer.
+     * @return underlying USM pointer
+     */
+    void* get() {
+        return static_cast<void*>(get_params().at(GPU_PARAM_KEY(MEM_HANDLE)).as<void*>());
+    }
+};
+
/**
 * @brief This class represents an abstraction for GPU plugin remote context
 * which is shared with OpenCL context object.

@@ -125,14 +155,14 @@ public:
 */
class ClContext : public RemoteContext {
protected:
-    using RemoteContext::create_tensor;
-
    /**
     * @brief GPU device name
     */
    static constexpr const char* device_name = "GPU";

public:
+    // Needed to make create_tensor overloads from base class visible for user
+    using RemoteContext::create_tensor;
    /**
     * @brief Checks that type defined runtime paramters are presented in remote object
     * @param remote_context remote context to check

@@ -220,7 +250,7 @@ public:
     * @brief This function is used to obtain remote tensor object from user-supplied cl_mem object
     * @param type Tensor element type
     * @param shape Tensor shape
-    * @param buffer A cl_mem object wrapped by a remote tensor
+    * @param buffer A cl_mem object that should be wrapped by a remote tensor
     * @return A remote tensor instance
     */
    ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl_mem buffer) {

@@ -233,7 +263,7 @@ public:
     * @brief This function is used to obtain remote tensor object from user-supplied cl::Buffer object
     * @param type Tensor element type
     * @param shape Tensor shape
-    * @param buffer A cl::Buffer object wrapped by a remote tensor
+    * @param buffer A cl::Buffer object that should be wrapped by a remote tensor
     * @return A remote tensor instance
     */
    ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl::Buffer& buffer) {

@@ -244,7 +274,7 @@ public:
     * @brief This function is used to obtain remote tensor object from user-supplied cl::Image2D object
     * @param type Tensor element type
     * @param shape Tensor shape
-    * @param image A cl::Image2D object wrapped by a remote tensor
+    * @param image A cl::Image2D object that should be wrapped by a remote tensor
     * @return A remote tensor instance
     */
    ClImage2DTensor create_tensor(const element::Type type, const Shape& shape, const cl::Image2D& image) {
@@ -252,7 +282,43 @@ public:
                           {GPU_PARAM_KEY(MEM_HANDLE), static_cast<gpu_handle_param>(image.get())}};
        return create_tensor(type, shape, params);
    }

+    /**
+     * @brief This function is used to obtain remote tensor object from user-supplied USM pointer
+     * @param type Tensor element type
+     * @param shape Tensor shape
+     * @param usm_ptr A USM pointer that should be wrapped by a remote tensor
+     * @return A remote tensor instance
+     */
+    USMTensor create_tensor(const element::Type type, const Shape& shape, void* usm_ptr) {
+        ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER)},
+                           {GPU_PARAM_KEY(MEM_HANDLE), static_cast<gpu_handle_param>(usm_ptr)}};
+        return create_tensor(type, shape, params);
+    }
+
+    /**
+     * @brief This function is used to allocate USM tensor with host allocation type
+     * @param type Tensor element type
+     * @param shape Tensor shape
+     * @return A remote tensor instance
+     */
+    USMTensor create_usm_host_tensor(const element::Type type, const Shape& shape) {
+        ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
+        return create_tensor(type, shape, params);
+    }
+
+    /**
+     * @brief This function is used to allocate USM tensor with device allocation type
+     * @param type Tensor element type
+     * @param shape Tensor shape
+     * @return A remote tensor instance
+     */
+    USMTensor create_usm_device_tensor(const element::Type type, const Shape& shape) {
+        ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}};
+        return create_tensor(type, shape, params);
+    }
};

} // namespace ocl
} // namespace gpu
} // namespace runtime
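Usage sketch (not part of the diff): the three new entry points cover sharing a user USM pointer and letting the plugin allocate USM host or device memory. The include path and every argument name below are assumptions for this sketch only.

#include <cstring>
#include <openvino/runtime/gpu/ocl/ocl.hpp>   // assumed header location for ov::runtime::gpu::ocl

// All arguments are assumed to be prepared by the caller.
void fill_usm_tensors(ov::runtime::gpu::ocl::ClContext& context,
                      const ov::element::Type type, const ov::Shape& shape,
                      void* user_usm_ptr, const void* src, size_t byte_size) {
    // 1. Wrap a USM pointer the application allocated itself (USM_USER_BUFFER underneath).
    auto user_tensor = context.create_tensor(type, shape, user_usm_ptr);

    // 2. Let the plugin allocate USM host memory; it is directly writable from the CPU.
    auto host_tensor = context.create_usm_host_tensor(type, shape);
    std::memcpy(host_tensor.get(), src, byte_size);

    // 3. Let the plugin allocate USM device memory; fill it with clEnqueueMemcpyINTEL or a kernel.
    auto device_tensor = context.create_usm_device_tensor(type, shape);
    (void)user_tensor;
    (void)device_tensor;
}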
@@ -72,9 +72,10 @@ public:
 * @note User can also obtain OpenCL context handle from this class.
 */
class VAContext : public ClContext {
-    using RemoteContext::create_tensor;
-
public:
+    // Needed to make create_tensor overloads from base class visible for user
+    using ClContext::create_tensor;
+
    /**
     * @brief Checks that type defined runtime paramters are presented in remote object
     * @param remote_context remote context to check

@@ -136,6 +136,16 @@ public:
     * @return A map of name/parameter elements.
     */
    ParamMap get_params() const;

+    /**
+     * @brief This function is used to create host tensor object friendly for the device in current context
+     * For example, GPU context may allocate USM host memory (if corresponding extension is available)
+     * which could be more efficient than regular host memory.
+     * @param type Tensor element type
+     * @param shape Tensor shape
+     * @return A Tensor instance with device friendly memory
+     */
+    Tensor create_host_tensor(const element::Type type, const Shape& shape);
+
};

} // namespace runtime
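Usage sketch (not part of the diff), mirroring the PLUGIN_HOST_TENSOR test case further below; 'context', 'request', 'input' and 'src' are assumed to be prepared by the caller:

#include <cstring>

void run_with_host_tensor(ov::runtime::RemoteContext& context,
                          ov::runtime::InferRequest& request,
                          const std::shared_ptr<ov::op::v0::Parameter>& input,
                          const void* src) {
    auto tensor = context.create_host_tensor(input->get_element_type(), input->get_shape());
    std::memcpy(tensor.data(), src, tensor.get_byte_size());   // plain host-visible memory
    request.set_tensor(input, tensor);
    request.infer();
}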
@@ -69,6 +69,15 @@ RemoteTensor RemoteContext::create_tensor(const element::Type& element_type,
    });
}

+Tensor RemoteContext::create_host_tensor(const element::Type element_type, const Shape& shape) {
+    OV_REMOTE_CONTEXT_STATEMENT({
+        auto blob = _impl->CreateHostBlob(
+            {ie::details::convertPrecision(element_type), shape, ie::TensorDesc::getLayoutByRank(shape.size())});
+        blob->allocate();
+        return {_so, blob};
+    });
+}
+
ie::ParamMap RemoteContext::get_params() const {
    OV_REMOTE_CONTEXT_STATEMENT(return _impl->getParams());
}

@@ -0,0 +1,22 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ie_remote_context.hpp"
+
+#include <memory>
+#include <string>
+
+#include "blob_factory.hpp"
+
+namespace InferenceEngine {
+
+MemoryBlob::Ptr RemoteContext::CreateHostBlob(const TensorDesc& tensorDesc) {
+    auto blob = std::dynamic_pointer_cast<MemoryBlob>(make_blob_with_precision(tensorDesc));
+    if (!blob)
+        IE_THROW(NotAllocated) << "Failed to create host blob in remote context for " << getDeviceName() << " device";
+
+    return blob;
+}
+
+}  // namespace InferenceEngine
@@ -84,6 +84,61 @@ TEST_F(RemoteBlob_Test, smoke_canInputUserBlob) {
    }
}

+
+TEST_F(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) {
+#if defined(ANDROID)
+    GTEST_SKIP();
+#endif
+    CNNNetwork net(fn_ptr);
+
+    net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+    net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
+
+    // TODO: Issue: investigate issue with IECore
+    auto ie = InferenceEngine::Core();
+    auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
+
+    // regular inference
+    auto inf_req_regular = exec_net.CreateInferRequest();
+    InferenceEngine::Blob::Ptr fakeImageData = FuncTestUtils::createAndFillBlob(
+            net.getInputsInfo().begin()->second->getTensorDesc());
+    inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
+
+    inf_req_regular.Infer();
+    auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
+
+    // inference using remote blob
+    auto inf_req_shared = exec_net.CreateInferRequest();
+    auto cldnn_context = exec_net.GetContext();
+    cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
+    auto ocl_instance = std::make_shared<OpenCL>(ctx);
+
+    auto dims = net.getInputsInfo().begin()->second->getTensorDesc().getDims();
+    size_t imSize = dims[1] * dims[2] * dims[3];
+
+    Blob::Ptr shared_blob = make_shared_blob(net.getInputsInfo().begin()->second->getTensorDesc(), cldnn_context);
+    shared_blob->allocate();
+    {
+        cl::Buffer shared_buffer = *shared_blob->as<gpu::ClBufferBlob>();
+        void *buffer = fakeImageData->buffer();
+        ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
+    }
+
+    inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_blob);
+
+    inf_req_shared.Infer();
+    auto outputBlob_shared = inf_req_shared.GetBlob(net.getOutputsInfo().begin()->first);
+
+    // compare results
+    {
+        ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
+        ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size());
+        auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
+        FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr);
+    }
+}
+
+
TEST_F(RemoteBlob_Test, smoke_canInferOnUserContext) {
    auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
    CNNNetwork net(fn_ptr);
@@ -30,7 +30,46 @@ protected:
    }
};

-TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {
+enum class RemoteTensorSharingType {
+    USER_CL_TENSOR = 0,
+    PLUGIN_CL_TENSOR = 1,
+    USER_USM_HOST_TENSOR = 2,
+    USER_USM_DEVICE_TENSOR = 3,
+    PLUGIN_USM_HOST_TENSOR = 4,
+    PLUGIN_USM_DEVICE_TENSOR = 5,
+    PLUGIN_HOST_TENSOR = 6
+};
+
+std::ostream& operator<<(std::ostream& stream, RemoteTensorSharingType sharing_type) {
+    switch (sharing_type) {
+    case RemoteTensorSharingType::USER_CL_TENSOR: stream << "USER_CL_TENSOR"; break;
+    case RemoteTensorSharingType::PLUGIN_CL_TENSOR: stream << "PLUGIN_CL_TENSOR"; break;
+    case RemoteTensorSharingType::USER_USM_HOST_TENSOR: stream << "USER_USM_HOST_TENSOR"; break;
+    case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: stream << "USER_USM_DEVICE_TENSOR"; break;
+    case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: stream << "PLUGIN_USM_HOST_TENSOR"; break;
+    case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: stream << "PLUGIN_USM_DEVICE_TENSOR"; break;
+    case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: stream << "PLUGIN_HOST_TENSOR"; break;
+    }
+
+    return stream;
+}
+
+class OVRemoteTensorInputBlob_Test : public OVRemoteTensor_Test, public testing::WithParamInterface<RemoteTensorSharingType> {
+public:
+    void SetUp() override {
+        fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
+    }
+
+    static std::string getTestCaseName(testing::TestParamInfo<RemoteTensorSharingType> obj) {
+        RemoteTensorSharingType sharing_type = obj.param;
+
+        std::ostringstream result;
+        result << sharing_type;
+        return result.str();
+    }
+};
+
+TEST_P(OVRemoteTensorInputBlob_Test, smoke_canInputRemoteTensor) {
#if defined(ANDROID)
    GTEST_SKIP();
#endif

@@ -45,6 +84,8 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {

    auto exec_net = ie.compile_model(function, CommonTestUtils::DEVICE_GPU);

+    RemoteTensorSharingType sharing_type = GetParam();
+
    // regular inference
    auto inf_req_regular = exec_net.create_infer_request();
    auto input = function->get_parameters().at(0);

@@ -65,16 +106,129 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {

    auto imSize = ov::shape_size(input->get_shape());

-    cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
-    {
-        void* buffer = fakeImageData.data();
-        ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
-    }
+    switch (sharing_type) {
+        case RemoteTensorSharingType::USER_CL_TENSOR: {
+            cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
+            {
+                void* buffer = fakeImageData.data();
+                ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
+            }
+
+            auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            break;
+        }
+        case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            void* shared_buffer = ocl_instance->allocate_usm_device_buffer(imSize);
+            {
+                void* buffer = fakeImageData.data();
+                err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr);
+                if (err != CL_SUCCESS)
+                    FAIL() << "Failed to copy data from host buffer to USM device";
+            }
+
+            auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            ocl_instance->free_mem(shared_buffer);
+
+            break;
+        }
+        case RemoteTensorSharingType::USER_USM_HOST_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            void* shared_buffer = ocl_instance->allocate_usm_host_buffer(imSize);
+            {
+                void* buffer = fakeImageData.data();
+                std::memcpy(shared_buffer, buffer, imSize);
+            }
+
+            auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            ocl_instance->free_mem(shared_buffer);
+
+            break;
+        }
+        case RemoteTensorSharingType::PLUGIN_CL_TENSOR: {
+            auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape());
+            ASSERT_TRUE(cldnn_tensor.is<ov::runtime::gpu::ocl::ClBufferTensor>());
+            auto cl_tensor = cldnn_tensor.as<ov::runtime::gpu::ocl::ClBufferTensor>();
+            {
+                cl::Buffer shared_buffer = cl_tensor;
+                void* buffer = fakeImageData.data();
+                ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
+            }
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+            break;
+        }
+        case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            auto cldnn_tensor = cldnn_context.create_usm_host_tensor(input->get_element_type(), input->get_shape());
+            ASSERT_TRUE(cldnn_tensor.is<ov::runtime::gpu::ocl::USMTensor>());
+            {
+                auto cl_tensor = cldnn_tensor.as<ov::runtime::gpu::ocl::USMTensor>();
+                void* shared_buffer = cl_tensor.get();
+                ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL);
+                void* buffer = fakeImageData.data();
+                std::memcpy(shared_buffer, buffer, imSize);
+            }
+
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            break;
+        }
+        case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            auto cldnn_tensor = cldnn_context.create_usm_device_tensor(input->get_element_type(), input->get_shape());
+            ASSERT_TRUE(cldnn_tensor.is<ov::runtime::gpu::ocl::USMTensor>());
+            {
+                auto cl_tensor = cldnn_tensor.as<ov::runtime::gpu::ocl::USMTensor>();
+                void* shared_buffer = cl_tensor.get();
+                ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_DEVICE_INTEL);
+                void* buffer = fakeImageData.data();
+                err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr);
+                if (err != CL_SUCCESS)
+                    FAIL() << "Failed to copy data from host buffer to USM device";
+            }
+
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            break;
+        }
+        case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: {
+            auto cldnn_tensor = cldnn_context.create_host_tensor(input->get_element_type(), input->get_shape());
+            {
+                ASSERT_NO_THROW(cldnn_tensor.data());
+                void* shared_buffer = cldnn_tensor.data();
+                if (ocl_instance->supports_usm())
+                    ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL);
+                void* buffer = fakeImageData.data();
+                std::memcpy(shared_buffer, buffer, imSize);
+            }
+
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            break;
+        }
+    }
+
-    auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
-    inf_req_shared.set_tensor(input, cldnn_tensor);
-
-    inf_req_shared.infer();
    auto output_tensor_shared = inf_req_shared.get_tensor(output);

    // compare results
@@ -88,6 +242,18 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {
    }
}

+INSTANTIATE_TEST_SUITE_P(
+    smoke_GPU,
+    OVRemoteTensorInputBlob_Test,
+    ::testing::ValuesIn(std::vector<RemoteTensorSharingType>{RemoteTensorSharingType::USER_CL_TENSOR,
+                                                             RemoteTensorSharingType::PLUGIN_CL_TENSOR,
+                                                             RemoteTensorSharingType::USER_USM_HOST_TENSOR,
+                                                             RemoteTensorSharingType::USER_USM_DEVICE_TENSOR,
+                                                             RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR,
+                                                             RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR,
+                                                             RemoteTensorSharingType::PLUGIN_HOST_TENSOR}),
+    OVRemoteTensorInputBlob_Test::getTestCaseName);
+
TEST_F(OVRemoteTensor_Test, smoke_canInferOnUserContext) {
    auto ie = ov::runtime::Core();

@@ -18,10 +18,57 @@
#endif
#include <gpu/gpu_context_api_ocl.hpp>

+namespace {
+template <typename T>
+T load_entrypoint(const cl_platform_id platform, const std::string name) {
+#if defined(__GNUC__) && __GNUC__ < 5
+// OCL spec says:
+// "The function clGetExtensionFunctionAddressForPlatform returns the address of the extension function named by funcname for a given platform.
+// The pointer returned should be cast to a function pointer type matching the extension function's definition defined in the appropriate extension
+// specification and header file."
+// So the pointer-to-object to pointer-to-function cast below is supposed to be valid, thus we suppress warning from old GCC versions.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+    T p = reinterpret_cast<T>(clGetExtensionFunctionAddressForPlatform(platform, name.c_str()));
+#if defined(__GNUC__) && __GNUC__ < 5
+#pragma GCC diagnostic pop
+#endif
+    if (!p) {
+        throw std::runtime_error("clGetExtensionFunctionAddressForPlatform(" + name + ") returned NULL.");
+    }
+    return p;
+}
+
+template <typename T>
+T try_load_entrypoint(const cl_platform_id platform, const std::string name) {
+    try {
+        return load_entrypoint<T>(platform, name);
+    } catch (...) {
+        return nullptr;
+    }
+}
+}  // namespace
+
struct OpenCL {
    cl::Context _context;
    cl::Device _device;
    cl::CommandQueue _queue;
+    cl_platform_id _platform;
+
+    clHostMemAllocINTEL_fn _host_mem_alloc_fn = nullptr;
+    clMemFreeINTEL_fn _mem_free_fn = nullptr;
+    clDeviceMemAllocINTEL_fn _device_mem_alloc_fn = nullptr;
+    clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr;
+    clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr;
+
+    void init_extension_functions(cl_platform_id platform) {
+        _host_mem_alloc_fn = try_load_entrypoint<clHostMemAllocINTEL_fn>(platform, "clHostMemAllocINTEL");
+        _device_mem_alloc_fn = try_load_entrypoint<clDeviceMemAllocINTEL_fn>(platform, "clDeviceMemAllocINTEL");
+        _mem_free_fn = try_load_entrypoint<clMemFreeINTEL_fn>(platform, "clMemFreeINTEL");
+        _enqueue_memcpy_fn = try_load_entrypoint<clEnqueueMemcpyINTEL_fn>(platform, "clEnqueueMemcpyINTEL");
+        _get_mem_alloc_info_fn = try_load_entrypoint<clGetMemAllocInfoINTEL_fn>(platform, "clGetMemAllocInfoINTEL");
+    }

    explicit OpenCL(std::shared_ptr<std::vector<cl_context_properties>> media_api_context_properties = nullptr) {
        // get Intel iGPU OCL device, create context and queue

@@ -42,12 +89,15 @@ struct OpenCL {
                if (refVendorID == d.getInfo<CL_DEVICE_VENDOR_ID>()) {
                    _device = d;
                    _context = cl::Context(_device);
+                    _platform = id;
                    break;
                }
            }
        }
        cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
        _queue = cl::CommandQueue(_context, _device, props);
+
+        init_extension_functions(_platform);
    }
    }

@@ -56,7 +106,81 @@ struct OpenCL {
        _context = cl::Context(context, true);
        _device = cl::Device(_context.getInfo<CL_CONTEXT_DEVICES>()[0].get(), true);

+        cl_int error = clGetDeviceInfo(_device.get(), CL_DEVICE_PLATFORM, sizeof(_platform), &_platform, nullptr);
+        if (error) {
+            throw std::runtime_error("OpenCL helper failed to retrieve CL_DEVICE_PLATFORM: " + std::to_string(error));
+        }
+
        cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
        _queue = cl::CommandQueue(_context, _device, props);
+
+        init_extension_functions(_platform);
    }

+    bool supports_usm() const {
+        return _host_mem_alloc_fn != nullptr &&
+               _device_mem_alloc_fn != nullptr &&
+               _mem_free_fn != nullptr &&
+               _enqueue_memcpy_fn != nullptr &&
+               _get_mem_alloc_info_fn != nullptr;
+    }
+
+    void* allocate_usm_host_buffer(size_t size) const {
+        cl_int err_code_ret;
+        if (!_device_mem_alloc_fn)
+            throw std::runtime_error("[GPU] clHostMemAllocINTEL is nullptr");
+        auto ret_ptr = _host_mem_alloc_fn(_context.get(), nullptr, size, 0, &err_code_ret);
+        if (err_code_ret != CL_SUCCESS)
+            throw std::runtime_error("OpenCL helper failed to allocate USM host memory");
+        return ret_ptr;
+    }
+
+    void* allocate_usm_device_buffer(size_t size) const {
+        cl_int err_code_ret;
+        if (!_device_mem_alloc_fn)
+            throw std::runtime_error("[GPU] clDeviceMemAllocINTEL is nullptr");
+        auto ret_ptr = _device_mem_alloc_fn(_context.get(), _device.get(), nullptr, size, 0, &err_code_ret);
+        if (err_code_ret != CL_SUCCESS)
+            throw std::runtime_error("OpenCL helper failed to allocate USM device memory");
+        return ret_ptr;
+    }
+
+    void free_mem(void* usm_ptr) {
+        if (!_mem_free_fn)
+            throw std::runtime_error("[GPU] clMemFreeINTEL is nullptr");
+
+        _mem_free_fn(_context.get(), usm_ptr);
+    }
+
+    cl_int memcpy(const cl::CommandQueue& cpp_queue, void *dst_ptr, const void *src_ptr,
+                  size_t bytes_count, bool blocking = true, const std::vector<cl::Event>* wait_list = nullptr, cl::Event* ret_event = nullptr) const {
+        if (!_enqueue_memcpy_fn)
+            throw std::runtime_error("[GPU] clEnqueueMemcpyINTEL is nullptr");
+        cl_event tmp;
+        cl_int err = _enqueue_memcpy_fn(
+            cpp_queue.get(),
+            static_cast<cl_bool>(blocking),
+            dst_ptr,
+            src_ptr,
+            bytes_count,
+            wait_list == nullptr ? 0 : static_cast<cl_uint>(wait_list->size()),
+            wait_list == nullptr ? nullptr : reinterpret_cast<const cl_event*>(&wait_list->front()),
+            ret_event == nullptr ? nullptr : &tmp);
+
+        if (ret_event != nullptr && err == CL_SUCCESS)
+            *ret_event = tmp;
+
+        return err;
+    }
+
+    cl_unified_shared_memory_type_intel get_allocation_type(const void* usm_ptr) const {
+        if (!_get_mem_alloc_info_fn) {
+            throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr");
+        }
+
+        cl_unified_shared_memory_type_intel ret_val;
+        size_t ret_val_size;
+        _get_mem_alloc_info_fn(_context.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size);
+        return ret_val;
+    }
};
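Condensed usage sketch of the helper above (not part of the diff); 'ocl' is an OpenCL instance created from the plugin's cl_context, as in the tests, and 'src'/'size' are assumed inputs:

#include <cstring>

// Round-trip some bytes through USM allocations using the extension wrappers declared above.
void copy_through_usm(const std::shared_ptr<OpenCL>& ocl, const void* src, size_t size) {
    if (!ocl->supports_usm())
        return;                                              // USM extension entry points unavailable
    void* usm_host = ocl->allocate_usm_host_buffer(size);    // clHostMemAllocINTEL under the hood
    std::memcpy(usm_host, src, size);                        // USM host memory is CPU-visible
    void* usm_dev = ocl->allocate_usm_device_buffer(size);   // clDeviceMemAllocINTEL
    cl_int err = ocl->memcpy(ocl->_queue, usm_dev, usm_host, size, true, nullptr, nullptr);
    (void)err;                                               // the tests FAIL() on non-CL_SUCCESS
    ocl->free_mem(usm_dev);
    ocl->free_mem(usm_host);
}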
@@ -62,6 +62,9 @@ public:
    /// Create shared memory object using user-supplied memory buffer @p buf using specified @p layout
    memory_ptr share_buffer(const layout& layout, shared_handle buf);

+    /// Create shared memory object using user-supplied USM pointer @p usm_ptr using specified @p layout
+    memory_ptr share_usm(const layout& layout, shared_handle usm_ptr);
+
    /// Create shared memory object using user-supplied 2D image @p img using specified @p layout
    memory_ptr share_image(const layout& layout, shared_handle img);

@@ -92,6 +92,17 @@ memory_ptr engine::share_buffer(const layout& layout, shared_handle buf) {
    return reinterpret_handle(layout, params);
}

+memory_ptr engine::share_usm(const layout& layout, shared_handle usm_ptr) {
+    shared_mem_params params = { shared_mem_type::shared_mem_usm, nullptr, nullptr, usm_ptr,
+#ifdef _WIN32
+        nullptr,
+#else
+        0,
+#endif
+        0 };
+    return reinterpret_handle(layout, params);
+}
+
memory::ptr engine::share_image(const layout& layout, shared_handle img) {
    shared_mem_params params = { shared_mem_type::shared_mem_image, nullptr, nullptr, img,
#ifdef _WIN32

@@ -168,6 +168,9 @@ memory::ptr ocl_engine::reinterpret_handle(const layout& new_layout, shared_mem_
    } else if (params.mem_type == shared_mem_type::shared_mem_buffer) {
        cl::Buffer buf(static_cast<cl_mem>(params.mem), true);
        return std::make_shared<ocl::gpu_buffer>(this, new_layout, buf);
+    } else if (params.mem_type == shared_mem_type::shared_mem_usm) {
+        cl::UsmMemory usm_buffer(get_usm_helper(), params.mem);
+        return std::make_shared<ocl::gpu_usm>(this, new_layout, usm_buffer);
    } else {
        throw std::runtime_error("unknown shared object fromat or type");
    }

@@ -524,6 +524,7 @@ public:
            _enqueue_memcpy_fn = try_load_entrypoint<clEnqueueMemcpyINTEL_fn>(_ctx.get(), "clEnqueueMemcpyINTEL");
            _enqueue_mem_fill_fn = try_load_entrypoint<clEnqueueMemFillINTEL_fn>(_ctx.get(), "clEnqueueMemFillINTEL");
            _enqueue_memset_fn = try_load_entrypoint<clEnqueueMemsetINTEL_fn>(_ctx.get(), "clEnqueueMemsetINTEL");
+            _get_mem_alloc_info_fn = try_load_entrypoint<clGetMemAllocInfoINTEL_fn>(_ctx.get(), "clGetMemAllocInfoINTEL");
        }
    }

@@ -621,6 +622,17 @@ public:
        return err;
    }

+    cl_unified_shared_memory_type_intel get_usm_allocation_type(const void* usm_ptr) const {
+        if (!_get_mem_alloc_info_fn) {
+            throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr");
+        }
+
+        cl_unified_shared_memory_type_intel ret_val;
+        size_t ret_val_size;
+        _get_mem_alloc_info_fn(_ctx.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size);
+        return ret_val;
+    }
+
private:
    cl::Context _ctx;
    cl::Device _device;

@@ -632,6 +644,7 @@ private:
    clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr;
    clEnqueueMemFillINTEL_fn _enqueue_mem_fill_fn = nullptr;
    clEnqueueMemsetINTEL_fn _enqueue_memset_fn = nullptr;
+    clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr;
};

/*

@@ -640,11 +653,16 @@ private:
*/
class UsmHolder {
public:
-    UsmHolder(const cl::UsmHelper& usmHelper, void* ptr) : _usmHelper(usmHelper), _ptr(ptr) { }
+    UsmHolder(const cl::UsmHelper& usmHelper, void* ptr, bool shared_memory = false)
+        : _usmHelper(usmHelper)
+        , _ptr(ptr)
+        , _shared_memory(shared_memory) { }
+
    void* ptr() { return _ptr; }
    ~UsmHolder() {
        try {
-            _usmHelper.free_mem(_ptr);
+            if (!_shared_memory)
+                _usmHelper.free_mem(_ptr);
        } catch (...) {
            // Exception may happen only when clMemFreeINTEL function is unavailable, thus can't free memory properly
        }

@@ -652,6 +670,7 @@ public:
private:
    const cl::UsmHelper& _usmHelper;
    void* _ptr;
+    bool _shared_memory = false;
};
/*
USM base class. Different usm types should derive from this class.

@@ -659,6 +678,13 @@ private:
class UsmMemory {
public:
    explicit UsmMemory(const cl::UsmHelper& usmHelper) : _usmHelper(usmHelper) { }
+    UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr)
+        : _usmHelper(usmHelper)
+        , _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, usm_ptr, true)) {
+        if (!usm_ptr) {
+            throw std::runtime_error("[GPU] Can't share null usm pointer");
+        }
+    }

    // Get methods returns original pointer allocated by openCL.
    void* get() const { return _usm_pointer->ptr(); }
@@ -279,6 +279,12 @@ gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemo
    , _buffer(buffer) {
}

+gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& buffer)
+    : lockable_gpu_mem()
+    , memory(engine, new_layout, detect_allocation_type(engine, buffer), true)
+    , _buffer(buffer) {
+}
+
gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type)
    : lockable_gpu_mem()
    , memory(engine, layout, type, false)

@@ -393,6 +399,20 @@ shared_mem_params gpu_usm::get_internal_params() const {
    };
}

+allocation_type gpu_usm::detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer) {
+    auto cl_alloc_type = engine->get_usm_helper().get_usm_allocation_type(buffer.get());
+
+    allocation_type res = allocation_type::unknown;
+    switch (cl_alloc_type) {
+        case CL_MEM_TYPE_DEVICE_INTEL: res = allocation_type::usm_device; break;
+        case CL_MEM_TYPE_HOST_INTEL: res = allocation_type::usm_host; break;
+        case CL_MEM_TYPE_SHARED_INTEL: res = allocation_type::usm_shared; break;
+        default: throw std::runtime_error("[GPU] Unsupported USM alloc type: " + std::to_string(cl_alloc_type));
+    }
+
+    return res;
+}
+
std::vector<cl_mem> ocl_surfaces_lock::get_handles(std::vector<memory::ptr> mem) const {
    std::vector<cl_mem> res;
    for (auto& m : mem) {

@@ -100,6 +100,7 @@ private:

struct gpu_usm : public lockable_gpu_mem, public memory {
    gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer, allocation_type type);
+    gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer);
    gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type);

    void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override;

@@ -120,6 +121,8 @@ struct gpu_usm : public lockable_gpu_mem, public memory {

protected:
    cl::UsmMemory _buffer;
+
+    static allocation_type detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer);
};

struct ocl_surfaces_lock : public surfaces_lock {