[GPU] USM sharing and host blob creation in gpu remote context (#8657)

Vladimir Paramuzov 2021-11-23 20:26:44 +03:00 committed by GitHub
parent c49620bb6a
commit 6addc0d535
21 changed files with 776 additions and 38 deletions


@ -966,9 +966,25 @@ void CLDNNInferRequest::prepare_output(const cldnn::primitive_id& outputName, Bl
}
InferenceEngine::Blob::Ptr CLDNNInferRequest::create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout) {
auto blobPtr = std::make_shared<CLDNNRemoteCLbuffer>(m_graph->GetContext(), m_graph->GetNetwork()->get_stream(), desc, layout);
getBlobImpl(blobPtr.get())->allocate();
return blobPtr;
if (m_graph->GetEngine()->use_unified_shared_memory()) {
auto blobPtr = std::make_shared<CLDNNRemoteUSMbuffer>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
desc,
layout,
nullptr,
0,
0,
CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
getBlobImpl(blobPtr.get())->allocate();
return blobPtr;
} else {
auto blobPtr = std::make_shared<CLDNNRemoteCLbuffer>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
desc,
layout);
getBlobImpl(blobPtr.get())->allocate();
return blobPtr;
}
}
} // namespace CLDNNPlugin


@ -38,6 +38,24 @@ ParamMap CLDNNRemoteBlobImpl::getParams() const {
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BT_USM_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BT_USM_HOST_INTERNAL:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BT_USM_DEVICE_INTERNAL:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
#ifdef _WIN32
case BT_DX_BUF_SHARED:
return{
@ -81,7 +99,7 @@ bool CLDNNRemoteBlobImpl::is_locked() const noexcept {
return lockedHolder != nullptr;
}
void CLDNNRemoteBlobImpl::allocate() noexcept {
void CLDNNRemoteBlobImpl::allocate() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNRemoteBlobImpl::Allocate");
assert(m_memObject == nullptr);
@ -91,13 +109,25 @@ void CLDNNRemoteBlobImpl::allocate() noexcept {
switch (m_mem_type) {
case BlobType::BT_BUF_INTERNAL: {
m_memObject = eng->allocate_memory(m_layout);
m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::cl_mem);
break;
}
case BlobType::BT_USM_HOST_INTERNAL: {
m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_host);
break;
}
case BlobType::BT_USM_DEVICE_INTERNAL: {
m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_device);
break;
}
case BlobType::BT_BUF_SHARED: {
m_memObject = eng->share_buffer(m_layout, m_mem);
break;
}
case BlobType::BT_USM_SHARED: {
m_memObject = eng->share_usm(m_layout, m_mem);
break;
}
#ifdef _WIN32
case BlobType::BT_SURF_SHARED: {
m_memObject = eng->share_surface(m_layout, m_mem, m_plane);
@ -139,6 +169,9 @@ std::shared_ptr<RemoteContext> CLDNNRemoteBlobImpl::getContext() const noexcept
}
void CLDNNRemoteBlobImpl::lock() const {
if (!is_allocated()) {
IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated";
}
lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
auto ptr = lockedHolder->data();
_handle = reinterpret_cast<void*>(ptr);
@ -295,15 +328,17 @@ std::string CLDNNExecutionContextImpl::getDeviceName() const noexcept {
auto engine_type = cldnn::engine_types::ocl;
auto runtime_type = cldnn::runtime_types::ocl;
// Use actual runtime and engine types
cldnn::device_query device_query(engine_type, runtime_type);
auto all_devices = device_query.get_available_devices();
auto current_device = m_engine->get_device();
try {
// Use actual runtime and engine types
cldnn::device_query device_query(engine_type, runtime_type);
auto all_devices = device_query.get_available_devices();
auto current_device = m_engine->get_device();
for (auto& kv : all_devices) {
if (current_device->is_same(kv.second))
return devName + "." + kv.first;
}
for (auto& kv : all_devices) {
if (current_device->is_same(kv.second))
return devName + "." + kv.first;
}
} catch (...) { }
if (!m_config.device_id.empty())
devName += "." + m_config.device_id;


@ -8,6 +8,7 @@
#include <cldnn/runtime/engine.hpp>
#include <ie_parameter.hpp>
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
#include <blob_factory.hpp>
#include <ie_remote_context.hpp>
#include "cldnn_config.h"
#include "cldnn_common_utils.h"
@ -37,6 +38,9 @@ public:
BT_EMPTY,
BT_BUF_INTERNAL,
BT_BUF_SHARED,
BT_USM_SHARED,
BT_USM_HOST_INTERNAL,
BT_USM_DEVICE_INTERNAL,
BT_IMG_SHARED,
BT_SURF_SHARED,
BT_DX_BUF_SHARED,
@ -50,7 +54,7 @@ public:
uint32_t plane = 0,
BlobType mem_type = BT_BUF_INTERNAL);
void allocate() noexcept;
void allocate();
bool deallocate() noexcept;
InferenceEngine::ParamMap getParams() const;
std::string getDeviceName() const noexcept;
@ -106,7 +110,11 @@ public:
: _impl(context, stream, layout, mem, surf, plane, mem_type)
, TpublicAPI(desc) {}
void allocate() noexcept override { _impl.allocate(); }
void allocate() noexcept override {
try {
_impl.allocate();
} catch (...) {}
}
bool deallocate() noexcept override { return _impl.deallocate(); }
InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); }
std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); }
@ -125,6 +133,7 @@ protected:
};
using CLDNNRemoteCLbuffer = typedCLDNNRemoteBlob<InferenceEngine::gpu::ClBufferBlob>;
using CLDNNRemoteUSMbuffer = typedCLDNNRemoteBlob<InferenceEngine::gpu::USMBlob>;
using CLDNNRemoteCLImage2D = typedCLDNNRemoteBlob<InferenceEngine::gpu::ClImage2DBlob>;
#ifdef _WIN32
using CLDNNRemoteD3DBuffer = typedCLDNNRemoteBlob<InferenceEngine::gpu::D3DBufferBlob>;
@ -157,6 +166,10 @@ inline CLDNNRemoteBlobImpl* getBlobImpl(InferenceEngine::gpu::ClBlob* blobPtr) {
auto ptr = blobPtr->as<CLDNNRemoteCLImage2D>();
if (ptr) return ptr->getImpl();
}
{
auto ptr = blobPtr->as<CLDNNRemoteUSMbuffer>();
if (ptr) return ptr->getImpl();
}
return nullptr;
}
@ -204,6 +217,58 @@ public:
bool free(void* handle) noexcept override { return true; }
};
class USMHostAllocator : public InferenceEngine::IAllocator {
protected:
InferenceEngine::gpu::USMBlob::Ptr _usm_host_blob = nullptr;
InferenceEngine::gpu::ClContext* _context = nullptr;
public:
using Ptr = std::shared_ptr<USMHostAllocator>;
USMHostAllocator(InferenceEngine::gpu::ClContext* context) : _context(context) { }
/**
* @brief Maps handle to heap memory accessible by any memory manipulation routines.
* @return Generic pointer to memory
*/
void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override {
if (!_usm_host_blob)
return nullptr;
return _usm_host_blob->get();
};
/**
* @brief Unmaps memory by handle.
* Multiple sequential mappings of the same handle are expected to return the same
* result, since reference counting is not supported.
*/
void unlock(void* handle) noexcept override {}
/**
* @brief Allocates memory
* @param size The size in bytes to allocate
* @return Handle to the allocated resource
*/
void* alloc(size_t size) noexcept override {
auto td = InferenceEngine::TensorDesc(InferenceEngine::Precision::U8, InferenceEngine::SizeVector{size}, InferenceEngine::Layout::C);
InferenceEngine::ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
_usm_host_blob = std::dynamic_pointer_cast<InferenceEngine::gpu::USMBlob>(_context->CreateBlob(td, params));
_usm_host_blob->allocate();
return _usm_host_blob->get();
}
/**
* @brief Releases handle and all associated memory resources which invalidates the handle.
* @return false if handle cannot be released, otherwise - true.
*/
bool free(void* handle) noexcept override {
try {
_usm_host_blob = nullptr;
} catch(...) { }
return true;
}
};
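// Illustrative sketch, not part of this commit's sources: the allocator above is meant to be
// passed to make_blob_with_precision() so that an ordinary host Blob ends up backed by USM host
// memory. `cl_ctx` and `tensorDesc` below are assumed to be a gpu::ClContext* and a TensorDesc
// supplied by the caller.
//
//   auto allocator = std::make_shared<USMHostAllocator>(cl_ctx);
//   auto blob = make_blob_with_precision(tensorDesc, allocator);  // regular Blob API, USM host storage
//   blob->allocate();                                             // expected to call USMHostAllocator::alloc() internally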
class CLDNNExecutionContextImpl : public InferenceEngine::gpu::details::param_map_obj_getter {
public:
enum ContextType {
@ -335,6 +400,9 @@ class typedCLDNNExecutionContext : public TpublicContextAPI {
case CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED:
ret = std::make_shared<CLDNNRemoteCLbuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
break;
case CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED:
ret = std::make_shared<CLDNNRemoteUSMbuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
break;
case CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED:
layout.format = ImageFormatFromLayout(tensorDesc.getLayout());
ret = std::make_shared<CLDNNRemoteCLImage2D>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
@ -368,6 +436,21 @@ class typedCLDNNExecutionContext : public TpublicContextAPI {
CLDNNRemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
}
InferenceEngine::RemoteBlob::Ptr create_usm(const InferenceEngine::TensorDesc& tensorDesc, CLDNNRemoteBlobImpl::BlobType alloc_type) {
cldnn::layout layout(DataTypeFromPrecision(tensorDesc.getPrecision()),
FormatFromLayout(tensorDesc.getLayout()),
CldnnTensorFromIEDims(tensorDesc.getDims()));
auto smart_this = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(this->shared_from_this());
auto& stream = _impl.GetEngine()->get_program_stream();
return std::make_shared<CLDNNRemoteUSMbuffer>(smart_this,
stream,
tensorDesc,
layout,
nullptr, 0, 0,
alloc_type);
}
void check_if_shared() {
if (GetType() != CLDNNExecutionContextImpl::ContextType::DEV_SHARED)
IE_THROW() << "Shared context is required to to share this type of memory";
@ -382,9 +465,16 @@ public:
const Config& config = {})
: _impl(plugin, params, config) {}
InferenceEngine::ParamMap getParams() const noexcept override { return _impl.getParams(); }
InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); }
std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); }
InferenceEngine::MemoryBlob::Ptr CreateHostBlob(const InferenceEngine::TensorDesc& tensorDesc) override {
if (_impl.GetEngine()->use_unified_shared_memory())
return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(tensorDesc, std::make_shared<USMHostAllocator>(this)));
else
return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(tensorDesc));
}
InferenceEngine::RemoteBlob::Ptr CreateBlob(const InferenceEngine::TensorDesc& tensorDesc, const InferenceEngine::ParamMap& params = {}) override {
using namespace InferenceEngine;
using InferenceEngine::gpu::details::param_map_obj_getter;
@ -395,9 +485,21 @@ public:
// user will supply shared object handle
std::string memTypeStr = param_map_obj_getter::_StrFromParams(params, GPU_PARAM_KEY(SHARED_MEM_TYPE));
bool is_usm = memTypeStr == GPU_PARAM_VALUE(USM_HOST_BUFFER) ||
memTypeStr == GPU_PARAM_VALUE(USM_DEVICE_BUFFER) ||
memTypeStr == GPU_PARAM_VALUE(USM_USER_BUFFER);
if (is_usm && !_impl.GetEngine()->use_unified_shared_memory()) {
IE_THROW(NotAllocated) << "Can't create USM tensor as USM is not supported (or manually disabled) on current device";
}
if (GPU_PARAM_VALUE(VA_SURFACE) == memTypeStr) {
check_if_shared();
return reuse_surf(tensorDesc, params);
} else if (GPU_PARAM_VALUE(USM_HOST_BUFFER) == memTypeStr) {
return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
} else if (GPU_PARAM_VALUE(USM_DEVICE_BUFFER) == memTypeStr) {
return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
} else {
CLDNNRemoteBlobImpl::BlobType blob_type;
cldnn::shared_handle mem = nullptr;
@ -405,6 +507,9 @@ public:
if (GPU_PARAM_VALUE(OCL_BUFFER) == memTypeStr) {
blob_type = CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED;
mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
} else if (GPU_PARAM_VALUE(USM_USER_BUFFER) == memTypeStr) {
blob_type = CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED;
mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
} else if (GPU_PARAM_VALUE(OCL_IMAGE2D) == memTypeStr) {
blob_type = CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED;
mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));


@ -131,6 +131,49 @@ public:
}
};
/**
* @brief This class represents an abstraction for GPU plugin remote blob
* which can be shared with user-supplied USM pointer.
* The plugin object derived from this class can be obtained with CreateBlob() call.
* @note User can obtain USM pointer from this class.
*/
class USMBlob : public ClBlob, public details::param_map_obj_getter {
public:
/**
* @brief A smart pointer to the USMBlob object
*/
using Ptr = std::shared_ptr<USMBlob>;
/**
* @brief Creates a USMBlob object with the specified dimensions and layout.
* @param tensorDesc Tensor description
*/
explicit USMBlob(const TensorDesc& tensorDesc) : ClBlob(tensorDesc) {}
/**
* @brief Returns the underlying USM pointer.
* @return underlying USM pointer
*/
void* get() {
const auto& params = getParams();
auto itrType = params.find(GPU_PARAM_KEY(SHARED_MEM_TYPE));
if (itrType == params.end())
IE_THROW() << "Parameter of type " << GPU_PARAM_KEY(SHARED_MEM_TYPE) << " not found";
auto mem_type = itrType->second.as<std::string>();
if (mem_type != GPU_PARAM_VALUE(USM_USER_BUFFER) && mem_type != GPU_PARAM_VALUE(USM_HOST_BUFFER) &&
mem_type != GPU_PARAM_VALUE(USM_DEVICE_BUFFER))
IE_THROW() << "Unexpected USM blob type: " << mem_type;
auto itrHandle = params.find(GPU_PARAM_KEY(MEM_HANDLE));
if (itrHandle == params.end()) {
IE_THROW() << "No parameter " << GPU_PARAM_KEY(MEM_HANDLE) << " found";
}
return itrHandle->second.as<gpu_handle_param>();
}
};
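// Illustrative usage sketch, not part of this header (assumptions: `ctx` is a RemoteContext::Ptr
// obtained from ExecutableNetwork::GetContext() and `desc` is the TensorDesc of the target blob):
//
//   ParamMap blob_params = { { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER) } };
//   auto usm_blob = std::dynamic_pointer_cast<USMBlob>(ctx->CreateBlob(desc, blob_params));
//   usm_blob->allocate();
//   void* host_ptr = usm_blob->get();  // USM host pointer that is directly writable on the CPU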
/**
* @brief This class represents an abstraction for GPU plugin remote blob
* which can be shared with user-supplied OpenCL 2D Image.


@ -98,7 +98,18 @@ DECLARE_GPU_PARAM_VALUE(OCL_BUFFER);
* @brief Shared OpenCL 2D image blob
*/
DECLARE_GPU_PARAM_VALUE(OCL_IMAGE2D);
/**
* @brief Shared USM pointer allocated by user
*/
DECLARE_GPU_PARAM_VALUE(USM_USER_BUFFER);
/**
* @brief Shared USM pointer with host allocation type, allocated by the plugin
*/
DECLARE_GPU_PARAM_VALUE(USM_HOST_BUFFER);
/**
* @brief Shared USM pointer with device allocation type, allocated by the plugin
*/
DECLARE_GPU_PARAM_VALUE(USM_DEVICE_BUFFER);
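// Illustrative sketch, not part of this header: the values above are passed through the ParamMap
// of RemoteContext::CreateBlob(). `ctx`, `desc` and `usm_ptr` below are assumed caller-provided
// (a GPU remote context, a TensorDesc and a user-allocated USM pointer, respectively).
//
//   ParamMap params = {
//       { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER) },
//       { GPU_PARAM_KEY(MEM_HANDLE), usm_ptr }
//   };
//   auto blob = ctx->CreateBlob(desc, params);  // wraps usm_ptr without copying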
/**
* @brief Shared video decoder surface or D3D 2D texture blob
*/


@ -23,7 +23,7 @@ namespace InferenceEngine {
* Such context represents a scope on the device within which executable
* networks and remote memory blobs can exist, function and exchange data.
*/
class RemoteContext : public std::enable_shared_from_this<RemoteContext> {
class INFERENCE_ENGINE_API_CLASS(RemoteContext) : public std::enable_shared_from_this<RemoteContext> {
public:
/**
* @brief A smart pointer to the RemoteContext object
@ -110,6 +110,14 @@ public:
*/
virtual RemoteBlob::Ptr CreateBlob(const TensorDesc& tensorDesc, const ParamMap& params = {}) = 0;
/**
* @brief Allocates a host-accessible memory blob that is friendly to the device in the current context.
* Returns a pointer to an object which implements the MemoryBlob interface.
* @param tensorDesc Defines the layout and dims of the blob
* @return A pointer to host accessible MemoryBlob object
*/
virtual MemoryBlob::Ptr CreateHostBlob(const TensorDesc& tensorDesc);
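// Illustrative sketch, not part of this header (assumptions: `context` is a RemoteContext::Ptr
// and `desc` is a TensorDesc):
//
//   auto host_blob = context->CreateHostBlob(desc);
//   host_blob->allocate();  // a GPU context may back this blob with USM host memory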
/**
* @brief Returns a map of device-specific parameters required for low-level
* operations with underlying object.


@ -102,9 +102,10 @@ public:
* @note User can also obtain OpenCL context handle from this class.
*/
class D3DContext : public ClContext {
using RemoteContext::create_tensor;
public:
// Needed to make create_tensor overloads from base class visible for user
using ClContext::create_tensor;
/**
* @brief Checks that type-defined runtime parameters are present in the remote object
* @param remote_context remote context to check


@ -117,6 +117,36 @@ public:
}
};
/**
* @brief This class represents an abstraction for GPU plugin remote tensor
* which can be shared with a user-supplied USM pointer.
* The plugin object derived from this class can be obtained with a ClContext::create_tensor() call.
* @note User can obtain USM pointer from this class.
*/
class USMTensor : public RemoteTensor {
public:
/**
* @brief Checks that type-defined runtime parameters are present in the remote object
* @param tensor a tensor to check
*/
static void type_check(const Tensor& tensor) {
RemoteTensor::type_check(tensor,
{{GPU_PARAM_KEY(MEM_HANDLE), {}},
{GPU_PARAM_KEY(SHARED_MEM_TYPE),
{GPU_PARAM_VALUE(USM_USER_BUFFER),
GPU_PARAM_VALUE(USM_HOST_BUFFER),
GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}}});
}
/**
* @brief Returns the underlying USM pointer.
* @return underlying USM pointer
*/
void* get() {
return static_cast<void*>(get_params().at(GPU_PARAM_KEY(MEM_HANDLE)).as<void*>());
}
};
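// Illustrative sketch, not part of this header (assumptions: `context` is a ClContext and
// `usm_ptr` points to user-allocated USM memory large enough for the tensor):
//
//   auto tensor = context.create_tensor(type, shape, usm_ptr);  // wrapped as USM_USER_BUFFER
//   void* raw = tensor.get();                                   // returns the same USM pointer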
/**
* @brief This class represents an abstraction for GPU plugin remote context
* which is shared with OpenCL context object.
@ -125,14 +155,14 @@ public:
*/
class ClContext : public RemoteContext {
protected:
using RemoteContext::create_tensor;
/**
* @brief GPU device name
*/
static constexpr const char* device_name = "GPU";
public:
// Needed to make create_tensor overloads from base class visible for user
using RemoteContext::create_tensor;
/**
* @brief Checks that type-defined runtime parameters are present in the remote object
* @param remote_context remote context to check
@ -220,7 +250,7 @@ public:
* @brief This function is used to obtain remote tensor object from user-supplied cl_mem object
* @param type Tensor element type
* @param shape Tensor shape
* @param buffer A cl_mem object wrapped by a remote tensor
* @param buffer A cl_mem object that should be wrapped by a remote tensor
* @return A remote tensor instance
*/
ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl_mem buffer) {
@ -233,7 +263,7 @@ public:
* @brief This function is used to obtain remote tensor object from user-supplied cl::Buffer object
* @param type Tensor element type
* @param shape Tensor shape
* @param buffer A cl::Buffer object wrapped by a remote tensor
* @param buffer A cl::Buffer object that should be wrapped by a remote tensor
* @return A remote tensor instance
*/
ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl::Buffer& buffer) {
@ -244,7 +274,7 @@ public:
* @brief This function is used to obtain remote tensor object from user-supplied cl::Image2D object
* @param type Tensor element type
* @param shape Tensor shape
* @param image A cl::Image2D object wrapped by a remote tensor
* @param image A cl::Image2D object that should be wrapped by a remote tensor
* @return A remote tensor instance
*/
ClImage2DTensor create_tensor(const element::Type type, const Shape& shape, const cl::Image2D& image) {
@ -252,7 +282,43 @@ public:
{GPU_PARAM_KEY(MEM_HANDLE), static_cast<gpu_handle_param>(image.get())}};
return create_tensor(type, shape, params);
}
/**
* @brief This function is used to obtain remote tensor object from user-supplied USM pointer
* @param type Tensor element type
* @param shape Tensor shape
* @param usm_ptr A USM pointer that should be wrapped by a remote tensor
* @return A remote tensor instance
*/
USMTensor create_tensor(const element::Type type, const Shape& shape, void* usm_ptr) {
ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER)},
{GPU_PARAM_KEY(MEM_HANDLE), static_cast<gpu_handle_param>(usm_ptr)}};
return create_tensor(type, shape, params);
}
/**
* @brief This function is used to allocate USM tensor with host allocation type
* @param type Tensor element type
* @param shape Tensor shape
* @return A remote tensor instance
*/
USMTensor create_usm_host_tensor(const element::Type type, const Shape& shape) {
ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
return create_tensor(type, shape, params);
}
/**
* @brief This function is used to allocate USM tensor with device allocation type
* @param type Tensor element type
* @param shape Tensor shape
* @return A remote tensor instance
*/
USMTensor create_usm_device_tensor(const element::Type type, const Shape& shape) {
ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}};
return create_tensor(type, shape, params);
}
};
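// Illustrative sketch, not part of this header (assumptions: `context` is a ClContext of the
// target device and `src_data`/`byte_size` describe user data to upload):
//
//   auto usm_host = context.create_usm_host_tensor(type, shape);    // plugin-allocated USM host memory
//   std::memcpy(usm_host.get(), src_data, byte_size);               // CPU-visible, no map/unmap needed
//   auto usm_dev = context.create_usm_device_tensor(type, shape);   // plugin-allocated USM device memory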
} // namespace ocl
} // namespace gpu
} // namespace runtime


@ -72,9 +72,10 @@ public:
* @note User can also obtain OpenCL context handle from this class.
*/
class VAContext : public ClContext {
using RemoteContext::create_tensor;
public:
// Needed to make create_tensor overloads from base class visible for user
using ClContext::create_tensor;
/**
* @brief Checks that type-defined runtime parameters are present in the remote object
* @param remote_context remote context to check


@ -136,6 +136,16 @@ public:
* @return A map of name/parameter elements.
*/
ParamMap get_params() const;
/**
* @brief This function is used to create a host tensor object that is friendly to the device in the current context.
* For example, a GPU context may allocate USM host memory (if the corresponding extension is available),
* which can be accessed more efficiently than regular host memory.
* @param type Tensor element type
* @param shape Tensor shape
* @return A Tensor instance with device friendly memory
*/
Tensor create_host_tensor(const element::Type type, const Shape& shape);
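// Illustrative sketch, not part of this header (assumptions: `context` is a RemoteContext created
// by the GPU plugin and `type`/`shape` describe the desired tensor):
//
//   auto host_tensor = context.create_host_tensor(type, shape);
//   // host_tensor.data() is host-accessible memory; the GPU plugin may back it with USM host memory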
};
} // namespace runtime


@ -69,6 +69,15 @@ RemoteTensor RemoteContext::create_tensor(const element::Type& element_type,
});
}
Tensor RemoteContext::create_host_tensor(const element::Type element_type, const Shape& shape) {
OV_REMOTE_CONTEXT_STATEMENT({
auto blob = _impl->CreateHostBlob(
{ie::details::convertPrecision(element_type), shape, ie::TensorDesc::getLayoutByRank(shape.size())});
blob->allocate();
return {_so, blob};
});
}
ie::ParamMap RemoteContext::get_params() const {
OV_REMOTE_CONTEXT_STATEMENT(return _impl->getParams());
}


@ -0,0 +1,22 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "ie_remote_context.hpp"
#include <memory>
#include <string>
#include "blob_factory.hpp"
namespace InferenceEngine {
MemoryBlob::Ptr RemoteContext::CreateHostBlob(const TensorDesc& tensorDesc) {
auto blob = std::dynamic_pointer_cast<MemoryBlob>(make_blob_with_precision(tensorDesc));
if (!blob)
IE_THROW(NotAllocated) << "Failed to create host blob in remote context for " << getDeviceName() << " device";
return blob;
}
} // namespace InferenceEngine


@ -84,6 +84,61 @@ TEST_F(RemoteBlob_Test, smoke_canInputUserBlob) {
}
}
TEST_F(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) {
#if defined(ANDROID)
GTEST_SKIP();
#endif
CNNNetwork net(fn_ptr);
net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
// TODO: Issue: investigate issue with IECore
auto ie = InferenceEngine::Core();
auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
// regular inference
auto inf_req_regular = exec_net.CreateInferRequest();
InferenceEngine::Blob::Ptr fakeImageData = FuncTestUtils::createAndFillBlob(
net.getInputsInfo().begin()->second->getTensorDesc());
inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
inf_req_regular.Infer();
auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
// inference using remote blob
auto inf_req_shared = exec_net.CreateInferRequest();
auto cldnn_context = exec_net.GetContext();
cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
auto ocl_instance = std::make_shared<OpenCL>(ctx);
auto dims = net.getInputsInfo().begin()->second->getTensorDesc().getDims();
size_t imSize = dims[1] * dims[2] * dims[3];
Blob::Ptr shared_blob = make_shared_blob(net.getInputsInfo().begin()->second->getTensorDesc(), cldnn_context);
shared_blob->allocate();
{
cl::Buffer shared_buffer = *shared_blob->as<gpu::ClBufferBlob>();
void *buffer = fakeImageData->buffer();
ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
}
inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_blob);
inf_req_shared.Infer();
auto outputBlob_shared = inf_req_shared.GetBlob(net.getOutputsInfo().begin()->first);
// compare results
{
ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size());
auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr);
}
}
TEST_F(RemoteBlob_Test, smoke_canInferOnUserContext) {
auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
CNNNetwork net(fn_ptr);


@ -30,7 +30,46 @@ protected:
}
};
TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {
enum class RemoteTensorSharingType {
USER_CL_TENSOR = 0,
PLUGIN_CL_TENSOR = 1,
USER_USM_HOST_TENSOR = 2,
USER_USM_DEVICE_TENSOR = 3,
PLUGIN_USM_HOST_TENSOR = 4,
PLUGIN_USM_DEVICE_TENSOR = 5,
PLUGIN_HOST_TENSOR = 6
};
std::ostream& operator<<(std::ostream& stream, RemoteTensorSharingType sharing_type) {
switch (sharing_type) {
case RemoteTensorSharingType::USER_CL_TENSOR: stream << "USER_CL_TENSOR"; break;
case RemoteTensorSharingType::PLUGIN_CL_TENSOR: stream << "PLUGIN_CL_TENSOR"; break;
case RemoteTensorSharingType::USER_USM_HOST_TENSOR: stream << "USER_USM_HOST_TENSOR"; break;
case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: stream << "USER_USM_DEVICE_TENSOR"; break;
case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: stream << "PLUGIN_USM_HOST_TENSOR"; break;
case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: stream << "PLUGIN_USM_DEVICE_TENSOR"; break;
case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: stream << "PLUGIN_HOST_TENSOR"; break;
}
return stream;
}
class OVRemoteTensorInputBlob_Test : public OVRemoteTensor_Test, public testing::WithParamInterface<RemoteTensorSharingType> {
public:
void SetUp() override {
fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
}
static std::string getTestCaseName(testing::TestParamInfo<RemoteTensorSharingType> obj) {
RemoteTensorSharingType sharing_type = obj.param;
std::ostringstream result;
result << sharing_type;
return result.str();
}
};
TEST_P(OVRemoteTensorInputBlob_Test, smoke_canInputRemoteTensor) {
#if defined(ANDROID)
GTEST_SKIP();
#endif
@ -45,6 +84,8 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {
auto exec_net = ie.compile_model(function, CommonTestUtils::DEVICE_GPU);
RemoteTensorSharingType sharing_type = GetParam();
// regular inference
auto inf_req_regular = exec_net.create_infer_request();
auto input = function->get_parameters().at(0);
@ -65,16 +106,129 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {
auto imSize = ov::shape_size(input->get_shape());
cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
{
void* buffer = fakeImageData.data();
ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
switch (sharing_type) {
case RemoteTensorSharingType::USER_CL_TENSOR: {
cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
{
void* buffer = fakeImageData.data();
ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
}
auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
inf_req_shared.set_tensor(input, cldnn_tensor);
inf_req_shared.infer();
break;
}
case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: {
if (!ocl_instance->supports_usm())
GTEST_SKIP();
void* shared_buffer = ocl_instance->allocate_usm_device_buffer(imSize);
{
void* buffer = fakeImageData.data();
err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr);
if (err != CL_SUCCESS)
FAIL() << "Failed to copy data from host buffer to USM device";
}
auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
inf_req_shared.set_tensor(input, cldnn_tensor);
inf_req_shared.infer();
ocl_instance->free_mem(shared_buffer);
break;
}
case RemoteTensorSharingType::USER_USM_HOST_TENSOR: {
if (!ocl_instance->supports_usm())
GTEST_SKIP();
void* shared_buffer = ocl_instance->allocate_usm_host_buffer(imSize);
{
void* buffer = fakeImageData.data();
std::memcpy(shared_buffer, buffer, imSize);
}
auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
inf_req_shared.set_tensor(input, cldnn_tensor);
inf_req_shared.infer();
ocl_instance->free_mem(shared_buffer);
break;
}
case RemoteTensorSharingType::PLUGIN_CL_TENSOR: {
auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape());
ASSERT_TRUE(cldnn_tensor.is<ov::runtime::gpu::ocl::ClBufferTensor>());
auto cl_tensor = cldnn_tensor.as<ov::runtime::gpu::ocl::ClBufferTensor>();
{
cl::Buffer shared_buffer = cl_tensor;
void* buffer = fakeImageData.data();
ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
}
inf_req_shared.set_tensor(input, cldnn_tensor);
inf_req_shared.infer();
break;
}
case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: {
if (!ocl_instance->supports_usm())
GTEST_SKIP();
auto cldnn_tensor = cldnn_context.create_usm_host_tensor(input->get_element_type(), input->get_shape());
ASSERT_TRUE(cldnn_tensor.is<ov::runtime::gpu::ocl::USMTensor>());
{
auto cl_tensor = cldnn_tensor.as<ov::runtime::gpu::ocl::USMTensor>();
void* shared_buffer = cl_tensor.get();
ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL);
void* buffer = fakeImageData.data();
std::memcpy(shared_buffer, buffer, imSize);
}
inf_req_shared.set_tensor(input, cldnn_tensor);
inf_req_shared.infer();
break;
}
case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: {
if (!ocl_instance->supports_usm())
GTEST_SKIP();
auto cldnn_tensor = cldnn_context.create_usm_device_tensor(input->get_element_type(), input->get_shape());
ASSERT_TRUE(cldnn_tensor.is<ov::runtime::gpu::ocl::USMTensor>());
{
auto cl_tensor = cldnn_tensor.as<ov::runtime::gpu::ocl::USMTensor>();
void* shared_buffer = cl_tensor.get();
ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_DEVICE_INTEL);
void* buffer = fakeImageData.data();
err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr);
if (err != CL_SUCCESS)
FAIL() << "Failed to copy data from host buffer to USM device";
}
inf_req_shared.set_tensor(input, cldnn_tensor);
inf_req_shared.infer();
break;
}
case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: {
auto cldnn_tensor = cldnn_context.create_host_tensor(input->get_element_type(), input->get_shape());
{
ASSERT_NO_THROW(cldnn_tensor.data());
void* shared_buffer = cldnn_tensor.data();
if (ocl_instance->supports_usm())
ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL);
void* buffer = fakeImageData.data();
std::memcpy(shared_buffer, buffer, imSize);
}
inf_req_shared.set_tensor(input, cldnn_tensor);
inf_req_shared.infer();
break;
}
}
auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
inf_req_shared.set_tensor(input, cldnn_tensor);
inf_req_shared.infer();
auto output_tensor_shared = inf_req_shared.get_tensor(output);
// compare results
@ -88,6 +242,18 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {
}
}
INSTANTIATE_TEST_SUITE_P(
smoke_GPU,
OVRemoteTensorInputBlob_Test,
::testing::ValuesIn(std::vector<RemoteTensorSharingType>{RemoteTensorSharingType::USER_CL_TENSOR,
RemoteTensorSharingType::PLUGIN_CL_TENSOR,
RemoteTensorSharingType::USER_USM_HOST_TENSOR,
RemoteTensorSharingType::USER_USM_DEVICE_TENSOR,
RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR,
RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR,
RemoteTensorSharingType::PLUGIN_HOST_TENSOR}),
OVRemoteTensorInputBlob_Test::getTestCaseName);
TEST_F(OVRemoteTensor_Test, smoke_canInferOnUserContext) {
auto ie = ov::runtime::Core();


@ -18,10 +18,57 @@
#endif
#include <gpu/gpu_context_api_ocl.hpp>
namespace {
template <typename T>
T load_entrypoint(const cl_platform_id platform, const std::string name) {
#if defined(__GNUC__) && __GNUC__ < 5
// OCL spec says:
// "The function clGetExtensionFunctionAddressForPlatform returns the address of the extension function named by funcname for a given platform.
// The pointer returned should be cast to a function pointer type matching the extension function's definition defined in the appropriate extension
// specification and header file."
// So the pointer-to-object to pointer-to-function cast below is supposed to be valid, thus we suppress warning from old GCC versions.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
T p = reinterpret_cast<T>(clGetExtensionFunctionAddressForPlatform(platform, name.c_str()));
#if defined(__GNUC__) && __GNUC__ < 5
#pragma GCC diagnostic pop
#endif
if (!p) {
throw std::runtime_error("clGetExtensionFunctionAddressForPlatform(" + name + ") returned NULL.");
}
return p;
}
template <typename T>
T try_load_entrypoint(const cl_platform_id platform, const std::string name) {
try {
return load_entrypoint<T>(platform, name);
} catch (...) {
return nullptr;
}
}
} // namespace
struct OpenCL {
cl::Context _context;
cl::Device _device;
cl::CommandQueue _queue;
cl_platform_id _platform;
clHostMemAllocINTEL_fn _host_mem_alloc_fn = nullptr;
clMemFreeINTEL_fn _mem_free_fn = nullptr;
clDeviceMemAllocINTEL_fn _device_mem_alloc_fn = nullptr;
clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr;
clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr;
void init_extension_functions(cl_platform_id platform) {
_host_mem_alloc_fn = try_load_entrypoint<clHostMemAllocINTEL_fn>(platform, "clHostMemAllocINTEL");
_device_mem_alloc_fn = try_load_entrypoint<clDeviceMemAllocINTEL_fn>(platform, "clDeviceMemAllocINTEL");
_mem_free_fn = try_load_entrypoint<clMemFreeINTEL_fn>(platform, "clMemFreeINTEL");
_enqueue_memcpy_fn = try_load_entrypoint<clEnqueueMemcpyINTEL_fn>(platform, "clEnqueueMemcpyINTEL");
_get_mem_alloc_info_fn = try_load_entrypoint<clGetMemAllocInfoINTEL_fn>(platform, "clGetMemAllocInfoINTEL");
}
explicit OpenCL(std::shared_ptr<std::vector<cl_context_properties>> media_api_context_properties = nullptr) {
// get Intel iGPU OCL device, create context and queue
@ -42,12 +89,15 @@ struct OpenCL {
if (refVendorID == d.getInfo<CL_DEVICE_VENDOR_ID>()) {
_device = d;
_context = cl::Context(_device);
_platform = id;
break;
}
}
}
cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
_queue = cl::CommandQueue(_context, _device, props);
init_extension_functions(_platform);
}
}
@ -56,7 +106,81 @@ struct OpenCL {
_context = cl::Context(context, true);
_device = cl::Device(_context.getInfo<CL_CONTEXT_DEVICES>()[0].get(), true);
cl_int error = clGetDeviceInfo(_device.get(), CL_DEVICE_PLATFORM, sizeof(_platform), &_platform, nullptr);
if (error) {
throw std::runtime_error("OpenCL helper failed to retrieve CL_DEVICE_PLATFORM: " + std::to_string(error));
}
cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
_queue = cl::CommandQueue(_context, _device, props);
init_extension_functions(_platform);
}
bool supports_usm() const {
return _host_mem_alloc_fn != nullptr &&
_device_mem_alloc_fn != nullptr &&
_mem_free_fn != nullptr &&
_enqueue_memcpy_fn != nullptr &&
_get_mem_alloc_info_fn != nullptr;
}
void* allocate_usm_host_buffer(size_t size) const {
cl_int err_code_ret;
if (!_host_mem_alloc_fn)
throw std::runtime_error("[GPU] clHostMemAllocINTEL is nullptr");
auto ret_ptr = _host_mem_alloc_fn(_context.get(), nullptr, size, 0, &err_code_ret);
if (err_code_ret != CL_SUCCESS)
throw std::runtime_error("OpenCL helper failed to allocate USM host memory");
return ret_ptr;
}
void* allocate_usm_device_buffer(size_t size) const {
cl_int err_code_ret;
if (!_device_mem_alloc_fn)
throw std::runtime_error("[GPU] clDeviceMemAllocINTEL is nullptr");
auto ret_ptr = _device_mem_alloc_fn(_context.get(), _device.get(), nullptr, size, 0, &err_code_ret);
if (err_code_ret != CL_SUCCESS)
throw std::runtime_error("OpenCL helper failed to allocate USM device memory");
return ret_ptr;
}
void free_mem(void* usm_ptr) {
if (!_mem_free_fn)
throw std::runtime_error("[GPU] clMemFreeINTEL is nullptr");
_mem_free_fn(_context.get(), usm_ptr);
}
cl_int memcpy(const cl::CommandQueue& cpp_queue, void *dst_ptr, const void *src_ptr,
size_t bytes_count, bool blocking = true, const std::vector<cl::Event>* wait_list = nullptr, cl::Event* ret_event = nullptr) const {
if (!_enqueue_memcpy_fn)
throw std::runtime_error("[GPU] clEnqueueMemcpyINTEL is nullptr");
cl_event tmp;
cl_int err = _enqueue_memcpy_fn(
cpp_queue.get(),
static_cast<cl_bool>(blocking),
dst_ptr,
src_ptr,
bytes_count,
wait_list == nullptr ? 0 : static_cast<cl_uint>(wait_list->size()),
wait_list == nullptr ? nullptr : reinterpret_cast<const cl_event*>(&wait_list->front()),
ret_event == nullptr ? nullptr : &tmp);
if (ret_event != nullptr && err == CL_SUCCESS)
*ret_event = tmp;
return err;
}
cl_unified_shared_memory_type_intel get_allocation_type(const void* usm_ptr) const {
if (!_get_mem_alloc_info_fn) {
throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr");
}
cl_unified_shared_memory_type_intel ret_val;
size_t ret_val_size;
_get_mem_alloc_info_fn(_context.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size);
return ret_val;
}
};


@ -62,6 +62,9 @@ public:
/// Create shared memory object using user-supplied memory buffer @p buf using specified @p layout
memory_ptr share_buffer(const layout& layout, shared_handle buf);
/// Create shared memory object using user-supplied USM pointer @p usm_ptr using specified @p layout
memory_ptr share_usm(const layout& layout, shared_handle usm_ptr);
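// Illustrative sketch, not part of this header (assumptions: `engine` is an engine instance,
// `l` is a cldnn::layout describing the buffer and `usm_ptr` was allocated as USM memory):
//
//   memory_ptr mem = engine->share_usm(l, usm_ptr);  // wraps usm_ptr; its allocation type is detected via clGetMemAllocInfoINTEL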
/// Create shared memory object using user-supplied 2D image @p img using specified @p layout
memory_ptr share_image(const layout& layout, shared_handle img);


@ -92,6 +92,17 @@ memory_ptr engine::share_buffer(const layout& layout, shared_handle buf) {
return reinterpret_handle(layout, params);
}
memory_ptr engine::share_usm(const layout& layout, shared_handle usm_ptr) {
shared_mem_params params = { shared_mem_type::shared_mem_usm, nullptr, nullptr, usm_ptr,
#ifdef _WIN32
nullptr,
#else
0,
#endif
0 };
return reinterpret_handle(layout, params);
}
memory::ptr engine::share_image(const layout& layout, shared_handle img) {
shared_mem_params params = { shared_mem_type::shared_mem_image, nullptr, nullptr, img,
#ifdef _WIN32


@ -168,6 +168,9 @@ memory::ptr ocl_engine::reinterpret_handle(const layout& new_layout, shared_mem_
} else if (params.mem_type == shared_mem_type::shared_mem_buffer) {
cl::Buffer buf(static_cast<cl_mem>(params.mem), true);
return std::make_shared<ocl::gpu_buffer>(this, new_layout, buf);
} else if (params.mem_type == shared_mem_type::shared_mem_usm) {
cl::UsmMemory usm_buffer(get_usm_helper(), params.mem);
return std::make_shared<ocl::gpu_usm>(this, new_layout, usm_buffer);
} else {
throw std::runtime_error("unknown shared object fromat or type");
}


@ -524,6 +524,7 @@ public:
_enqueue_memcpy_fn = try_load_entrypoint<clEnqueueMemcpyINTEL_fn>(_ctx.get(), "clEnqueueMemcpyINTEL");
_enqueue_mem_fill_fn = try_load_entrypoint<clEnqueueMemFillINTEL_fn>(_ctx.get(), "clEnqueueMemFillINTEL");
_enqueue_memset_fn = try_load_entrypoint<clEnqueueMemsetINTEL_fn>(_ctx.get(), "clEnqueueMemsetINTEL");
_get_mem_alloc_info_fn = try_load_entrypoint<clGetMemAllocInfoINTEL_fn>(_ctx.get(), "clGetMemAllocInfoINTEL");
}
}
@ -621,6 +622,17 @@ public:
return err;
}
cl_unified_shared_memory_type_intel get_usm_allocation_type(const void* usm_ptr) const {
if (!_get_mem_alloc_info_fn) {
throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr");
}
cl_unified_shared_memory_type_intel ret_val;
size_t ret_val_size;
_get_mem_alloc_info_fn(_ctx.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size);
return ret_val;
}
private:
cl::Context _ctx;
cl::Device _device;
@ -632,6 +644,7 @@ private:
clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr;
clEnqueueMemFillINTEL_fn _enqueue_mem_fill_fn = nullptr;
clEnqueueMemsetINTEL_fn _enqueue_memset_fn = nullptr;
clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr;
};
/*
@ -640,11 +653,16 @@ private:
*/
class UsmHolder {
public:
UsmHolder(const cl::UsmHelper& usmHelper, void* ptr) : _usmHelper(usmHelper), _ptr(ptr) { }
UsmHolder(const cl::UsmHelper& usmHelper, void* ptr, bool shared_memory = false)
: _usmHelper(usmHelper)
, _ptr(ptr)
, _shared_memory(shared_memory) { }
void* ptr() { return _ptr; }
~UsmHolder() {
try {
_usmHelper.free_mem(_ptr);
if (!_shared_memory)
_usmHelper.free_mem(_ptr);
} catch (...) {
// Exception may happen only when the clMemFreeINTEL function is unavailable, so the memory can't be freed properly
}
@ -652,6 +670,7 @@ public:
private:
const cl::UsmHelper& _usmHelper;
void* _ptr;
bool _shared_memory = false;
};
/*
USM base class. Different usm types should derive from this class.
@ -659,6 +678,13 @@ private:
class UsmMemory {
public:
explicit UsmMemory(const cl::UsmHelper& usmHelper) : _usmHelper(usmHelper) { }
UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr)
: _usmHelper(usmHelper)
, _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, usm_ptr, true)) {
if (!usm_ptr) {
throw std::runtime_error("[GPU] Can't share null usm pointer");
}
}
// Get methods returns original pointer allocated by openCL.
void* get() const { return _usm_pointer->ptr(); }


@ -279,6 +279,12 @@ gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemo
, _buffer(buffer) {
}
gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& buffer)
: lockable_gpu_mem()
, memory(engine, new_layout, detect_allocation_type(engine, buffer), true)
, _buffer(buffer) {
}
gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type)
: lockable_gpu_mem()
, memory(engine, layout, type, false)
@ -393,6 +399,20 @@ shared_mem_params gpu_usm::get_internal_params() const {
};
}
allocation_type gpu_usm::detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer) {
auto cl_alloc_type = engine->get_usm_helper().get_usm_allocation_type(buffer.get());
allocation_type res = allocation_type::unknown;
switch (cl_alloc_type) {
case CL_MEM_TYPE_DEVICE_INTEL: res = allocation_type::usm_device; break;
case CL_MEM_TYPE_HOST_INTEL: res = allocation_type::usm_host; break;
case CL_MEM_TYPE_SHARED_INTEL: res = allocation_type::usm_shared; break;
default: throw std::runtime_error("[GPU] Unsupported USM alloc type: " + std::to_string(cl_alloc_type));
}
return res;
}
std::vector<cl_mem> ocl_surfaces_lock::get_handles(std::vector<memory::ptr> mem) const {
std::vector<cl_mem> res;
for (auto& m : mem) {


@ -100,6 +100,7 @@ private:
struct gpu_usm : public lockable_gpu_mem, public memory {
gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer, allocation_type type);
gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer);
gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type);
void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override;
@ -120,6 +121,8 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
protected:
cl::UsmMemory _buffer;
static allocation_type detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer);
};
struct ocl_surfaces_lock : public surfaces_lock {