[GPU] USM sharing and host blob creation in gpu remote context (#8657)
parent c49620bb6a
commit 6addc0d535
@@ -966,9 +966,25 @@ void CLDNNInferRequest::prepare_output(const cldnn::primitive_id& outputName, Bl
}

InferenceEngine::Blob::Ptr CLDNNInferRequest::create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout) {
-    auto blobPtr = std::make_shared<CLDNNRemoteCLbuffer>(m_graph->GetContext(), m_graph->GetNetwork()->get_stream(), desc, layout);
-    getBlobImpl(blobPtr.get())->allocate();
-    return blobPtr;
+    if (m_graph->GetEngine()->use_unified_shared_memory()) {
+        auto blobPtr = std::make_shared<CLDNNRemoteUSMbuffer>(m_graph->GetContext(),
+                                                              m_graph->GetNetwork()->get_stream(),
+                                                              desc,
+                                                              layout,
+                                                              nullptr,
+                                                              0,
+                                                              0,
+                                                              CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
+        getBlobImpl(blobPtr.get())->allocate();
+        return blobPtr;
+    } else {
+        auto blobPtr = std::make_shared<CLDNNRemoteCLbuffer>(m_graph->GetContext(),
+                                                             m_graph->GetNetwork()->get_stream(),
+                                                             desc,
+                                                             layout);
+        getBlobImpl(blobPtr.get())->allocate();
+        return blobPtr;
+    }
}

} // namespace CLDNNPlugin
@@ -38,6 +38,24 @@ ParamMap CLDNNRemoteBlobImpl::getParams() const {
            { GPU_PARAM_KEY(OCL_CONTEXT), params.context },
            { GPU_PARAM_KEY(MEM_HANDLE), params.mem }
        };
+    case BT_USM_SHARED:
+        return{
+            { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER) },
+            { GPU_PARAM_KEY(OCL_CONTEXT), params.context },
+            { GPU_PARAM_KEY(MEM_HANDLE), params.mem }
+        };
+    case BT_USM_HOST_INTERNAL:
+        return{
+            { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER) },
+            { GPU_PARAM_KEY(OCL_CONTEXT), params.context },
+            { GPU_PARAM_KEY(MEM_HANDLE), params.mem }
+        };
+    case BT_USM_DEVICE_INTERNAL:
+        return{
+            { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER) },
+            { GPU_PARAM_KEY(OCL_CONTEXT), params.context },
+            { GPU_PARAM_KEY(MEM_HANDLE), params.mem }
+        };
#ifdef _WIN32
    case BT_DX_BUF_SHARED:
        return{

@@ -81,7 +99,7 @@ bool CLDNNRemoteBlobImpl::is_locked() const noexcept {
    return lockedHolder != nullptr;
}

-void CLDNNRemoteBlobImpl::allocate() noexcept {
+void CLDNNRemoteBlobImpl::allocate() {
    OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNRemoteBlobImpl::Allocate");
    assert(m_memObject == nullptr);

@@ -91,13 +109,25 @@ void CLDNNRemoteBlobImpl::allocate() noexcept {

    switch (m_mem_type) {
    case BlobType::BT_BUF_INTERNAL: {
-        m_memObject = eng->allocate_memory(m_layout);
+        m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::cl_mem);
        break;
    }
+    case BlobType::BT_USM_HOST_INTERNAL: {
+        m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_host);
+        break;
+    }
+    case BlobType::BT_USM_DEVICE_INTERNAL: {
+        m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_device);
+        break;
+    }
    case BlobType::BT_BUF_SHARED: {
        m_memObject = eng->share_buffer(m_layout, m_mem);
        break;
    }
+    case BlobType::BT_USM_SHARED: {
+        m_memObject = eng->share_usm(m_layout, m_mem);
+        break;
+    }
#ifdef _WIN32
    case BlobType::BT_SURF_SHARED: {
        m_memObject = eng->share_surface(m_layout, m_mem, m_plane);

@@ -139,6 +169,9 @@ std::shared_ptr<RemoteContext> CLDNNRemoteBlobImpl::getContext() const noexcept
}

void CLDNNRemoteBlobImpl::lock() const {
+    if (!is_allocated()) {
+        IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated";
+    }
    lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
    auto ptr = lockedHolder->data();
    _handle = reinterpret_cast<void*>(ptr);

@@ -295,15 +328,17 @@ std::string CLDNNExecutionContextImpl::getDeviceName() const noexcept {

    auto engine_type = cldnn::engine_types::ocl;
    auto runtime_type = cldnn::runtime_types::ocl;
-    // Use actual runtime and engine types
-    cldnn::device_query device_query(engine_type, runtime_type);
-    auto all_devices = device_query.get_available_devices();
-    auto current_device = m_engine->get_device();
+    try {
+        // Use actual runtime and engine types
+        cldnn::device_query device_query(engine_type, runtime_type);
+        auto all_devices = device_query.get_available_devices();
+        auto current_device = m_engine->get_device();

-    for (auto& kv : all_devices) {
-        if (current_device->is_same(kv.second))
-            return devName + "." + kv.first;
-    }
+        for (auto& kv : all_devices) {
+            if (current_device->is_same(kv.second))
+                return devName + "." + kv.first;
+        }
+    } catch (...) { }

    if (!m_config.device_id.empty())
        devName += "." + m_config.device_id;
@@ -8,6 +8,7 @@
#include <cldnn/runtime/engine.hpp>
#include <ie_parameter.hpp>
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
+#include <blob_factory.hpp>
#include <ie_remote_context.hpp>
#include "cldnn_config.h"
#include "cldnn_common_utils.h"

@@ -37,6 +38,9 @@ public:
        BT_EMPTY,
        BT_BUF_INTERNAL,
        BT_BUF_SHARED,
+        BT_USM_SHARED,
+        BT_USM_HOST_INTERNAL,
+        BT_USM_DEVICE_INTERNAL,
        BT_IMG_SHARED,
        BT_SURF_SHARED,
        BT_DX_BUF_SHARED,

@@ -50,7 +54,7 @@ public:
                        uint32_t plane = 0,
                        BlobType mem_type = BT_BUF_INTERNAL);

-    void allocate() noexcept;
+    void allocate();
    bool deallocate() noexcept;
    InferenceEngine::ParamMap getParams() const;
    std::string getDeviceName() const noexcept;

@@ -106,7 +110,11 @@ public:
        : _impl(context, stream, layout, mem, surf, plane, mem_type)
        , TpublicAPI(desc) {}

-    void allocate() noexcept override { _impl.allocate(); }
+    void allocate() noexcept override {
+        try {
+            _impl.allocate();
+        } catch (...) {}
+    }
    bool deallocate() noexcept override { return _impl.deallocate(); }
    InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); }
    std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); }

@@ -125,6 +133,7 @@ protected:
};

using CLDNNRemoteCLbuffer = typedCLDNNRemoteBlob<InferenceEngine::gpu::ClBufferBlob>;
+using CLDNNRemoteUSMbuffer = typedCLDNNRemoteBlob<InferenceEngine::gpu::USMBlob>;
using CLDNNRemoteCLImage2D = typedCLDNNRemoteBlob<InferenceEngine::gpu::ClImage2DBlob>;
#ifdef _WIN32
using CLDNNRemoteD3DBuffer = typedCLDNNRemoteBlob<InferenceEngine::gpu::D3DBufferBlob>;

@@ -157,6 +166,10 @@ inline CLDNNRemoteBlobImpl* getBlobImpl(InferenceEngine::gpu::ClBlob* blobPtr) {
        auto ptr = blobPtr->as<CLDNNRemoteCLImage2D>();
        if (ptr) return ptr->getImpl();
    }
+    {
+        auto ptr = blobPtr->as<CLDNNRemoteUSMbuffer>();
+        if (ptr) return ptr->getImpl();
+    }
    return nullptr;
}
@@ -204,6 +217,58 @@ public:
    bool free(void* handle) noexcept override { return true; }
};

+class USMHostAllocator : public InferenceEngine::IAllocator {
+protected:
+    InferenceEngine::gpu::USMBlob::Ptr _usm_host_blob = nullptr;
+    InferenceEngine::gpu::ClContext* _context = nullptr;
+
+public:
+    using Ptr = std::shared_ptr<USMHostAllocator>;
+
+    USMHostAllocator(InferenceEngine::gpu::ClContext* context) : _context(context) { }
+    /**
+    * @brief Maps handle to heap memory accessible by any memory manipulation routines.
+    * @return Generic pointer to memory
+    */
+    void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override {
+        if (!_usm_host_blob)
+            return nullptr;
+        return _usm_host_blob->get();
+    };
+
+    /**
+    * @brief Unmaps memory by handle with multiple sequential mappings of the same handle.
+    * The multiple sequential mappings of the same handle are suppose to get the same
+    * result while there isn't a ref counter supported.
+    */
+    void unlock(void* handle) noexcept override {}
+
+    /**
+    * @brief Allocates memory
+    * @param size The size in bytes to allocate
+    * @return Handle to the allocated resource
+    */
+    void* alloc(size_t size) noexcept override {
+        auto td = InferenceEngine::TensorDesc(InferenceEngine::Precision::U8, InferenceEngine::SizeVector{size}, InferenceEngine::Layout::C);
+        InferenceEngine::ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
+        _usm_host_blob = std::dynamic_pointer_cast<InferenceEngine::gpu::USMBlob>(_context->CreateBlob(td, params));
+        _usm_host_blob->allocate();
+        return _usm_host_blob->get();
+    }
+
+    /**
+    * @brief Releases handle and all associated memory resources which invalidates the handle.
+    * @return false if handle cannot be released, otherwise - true.
+    */
+    bool free(void* handle) noexcept override {
+        try {
+            _usm_host_blob = nullptr;
+        } catch(...) { }
+        return true;
+    }
+};
+

class CLDNNExecutionContextImpl : public InferenceEngine::gpu::details::param_map_obj_getter {
public:
    enum ContextType {
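Usage sketch (not part of the diff): the allocator above is meant to be handed to the blob factory so that an ordinary MemoryBlob ends up backed by USM host memory, which is exactly what the CreateHostBlob() override later in this change does. The helper name and the arguments 'ctx' and 'desc' below are assumptions for illustration only.

// Hypothetical helper built on top of USMHostAllocator and blob_factory.hpp.
InferenceEngine::MemoryBlob::Ptr make_usm_host_blob(InferenceEngine::gpu::ClContext* ctx,
                                                    const InferenceEngine::TensorDesc& desc) {
    auto allocator = std::make_shared<USMHostAllocator>(ctx);            // allocator defined above
    auto blob = std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(
        make_blob_with_precision(desc, allocator));                      // factory from blob_factory.hpp
    if (blob)
        blob->allocate();                                                // triggers USMHostAllocator::alloc()
    return blob;
}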
@@ -335,6 +400,9 @@ class typedCLDNNExecutionContext : public TpublicContextAPI {
        case CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED:
            ret = std::make_shared<CLDNNRemoteCLbuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
            break;
+        case CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED:
+            ret = std::make_shared<CLDNNRemoteUSMbuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
+            break;
        case CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED:
            layout.format = ImageFormatFromLayout(tensorDesc.getLayout());
            ret = std::make_shared<CLDNNRemoteCLImage2D>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);

@@ -368,6 +436,21 @@ class typedCLDNNExecutionContext : public TpublicContextAPI {
                                                    CLDNNRemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
    }

+    InferenceEngine::RemoteBlob::Ptr create_usm(const InferenceEngine::TensorDesc& tensorDesc, CLDNNRemoteBlobImpl::BlobType alloc_type) {
+        cldnn::layout layout(DataTypeFromPrecision(tensorDesc.getPrecision()),
+                             FormatFromLayout(tensorDesc.getLayout()),
+                             CldnnTensorFromIEDims(tensorDesc.getDims()));
+        auto smart_this = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(this->shared_from_this());
+        auto& stream = _impl.GetEngine()->get_program_stream();
+
+        return std::make_shared<CLDNNRemoteUSMbuffer>(smart_this,
+                                                      stream,
+                                                      tensorDesc,
+                                                      layout,
+                                                      nullptr, 0, 0,
+                                                      alloc_type);
+    }
+
    void check_if_shared() {
        if (GetType() != CLDNNExecutionContextImpl::ContextType::DEV_SHARED)
            IE_THROW() << "Shared context is required to to share this type of memory";

@@ -382,9 +465,16 @@ public:
                               const Config& config = {})
        : _impl(plugin, params, config) {}

-    InferenceEngine::ParamMap getParams() const noexcept override { return _impl.getParams(); }
+    InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); }
    std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); }

+    InferenceEngine::MemoryBlob::Ptr CreateHostBlob(const InferenceEngine::TensorDesc& tensorDesc) override {
+        if (_impl.GetEngine()->use_unified_shared_memory())
+            return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(tensorDesc, std::make_shared<USMHostAllocator>(this)));
+        else
+            return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(tensorDesc));
+    }
+
    InferenceEngine::RemoteBlob::Ptr CreateBlob(const InferenceEngine::TensorDesc& tensorDesc, const InferenceEngine::ParamMap& params = {}) override {
        using namespace InferenceEngine;
        using InferenceEngine::gpu::details::param_map_obj_getter;

@@ -395,9 +485,21 @@ public:
            // user will supply shared object handle
            std::string memTypeStr = param_map_obj_getter::_StrFromParams(params, GPU_PARAM_KEY(SHARED_MEM_TYPE));

+            bool is_usm = memTypeStr == GPU_PARAM_VALUE(USM_HOST_BUFFER) ||
+                          memTypeStr == GPU_PARAM_VALUE(USM_DEVICE_BUFFER) ||
+                          memTypeStr == GPU_PARAM_VALUE(USM_USER_BUFFER);
+
+            if (is_usm && !_impl.GetEngine()->use_unified_shared_memory()) {
+                IE_THROW(NotAllocated) << "Can't create USM tensor as USM is not supported (or manually disabled) on current device";
+            }
+
            if (GPU_PARAM_VALUE(VA_SURFACE) == memTypeStr) {
                check_if_shared();
                return reuse_surf(tensorDesc, params);
+            } else if (GPU_PARAM_VALUE(USM_HOST_BUFFER) == memTypeStr) {
+                return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
+            } else if (GPU_PARAM_VALUE(USM_DEVICE_BUFFER) == memTypeStr) {
+                return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
            } else {
                CLDNNRemoteBlobImpl::BlobType blob_type;
                cldnn::shared_handle mem = nullptr;

@@ -405,6 +507,9 @@ public:
                if (GPU_PARAM_VALUE(OCL_BUFFER) == memTypeStr) {
                    blob_type = CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED;
                    mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
+                } else if (GPU_PARAM_VALUE(USM_USER_BUFFER) == memTypeStr) {
+                    blob_type = CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED;
+                    mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
                } else if (GPU_PARAM_VALUE(OCL_IMAGE2D) == memTypeStr) {
                    blob_type = CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED;
                    mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
@@ -131,6 +131,49 @@ public:
    }
};

+/**
+ * @brief This class represents an abstraction for GPU plugin remote blob
+ * which can be shared with user-supplied USM pointer.
+ * The plugin object derived from this class can be obtained with CreateBlob() call.
+ * @note User can obtain USM pointer from this class.
+ */
+class USMBlob : public ClBlob, public details::param_map_obj_getter {
+public:
+    /**
+     * @brief A smart pointer to the ClBufferBlob object
+     */
+    using Ptr = std::shared_ptr<USMBlob>;
+
+    /**
+     * @brief Creates a ClBufferBlob object with the specified dimensions and layout.
+     * @param tensorDesc Tensor description
+     */
+    explicit USMBlob(const TensorDesc& tensorDesc) : ClBlob(tensorDesc) {}
+
+    /**
+     * @brief Returns the underlying OpenCL memory object handle.
+     * @return underlying OpenCL memory object handle
+     */
+    void* get() {
+        const auto& params = getParams();
+        auto itrType = params.find(GPU_PARAM_KEY(SHARED_MEM_TYPE));
+        if (itrType == params.end())
+            IE_THROW() << "Parameter of type " << GPU_PARAM_KEY(SHARED_MEM_TYPE) << " not found";
+
+        auto mem_type = itrType->second.as<std::string>();
+        if (mem_type != GPU_PARAM_VALUE(USM_USER_BUFFER) && mem_type != GPU_PARAM_VALUE(USM_HOST_BUFFER) &&
+            mem_type != GPU_PARAM_VALUE(USM_DEVICE_BUFFER))
+            IE_THROW() << "Unexpected USM blob type: " << mem_type;
+
+        auto itrHandle = params.find(GPU_PARAM_KEY(MEM_HANDLE));
+        if (itrHandle == params.end()) {
+            IE_THROW() << "No parameter " << GPU_PARAM_KEY(MEM_HANDLE) << " found";
+        }
+
+        return itrHandle->second.as<gpu_handle_param>();
+    }
+};
+
/**
 * @brief This class represents an abstraction for GPU plugin remote blob
 * which can be shared with user-supplied OpenCL 2D Image.
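Usage sketch (not part of the diff): a remote blob created with one of the USM memory types can be downcast to USMBlob to reach the raw USM pointer. 'context' is assumed to be an InferenceEngine::gpu::ClContext::Ptr obtained from ExecutableNetwork::GetContext(), and 'desc' a valid TensorDesc.

// Ask the plugin for a USM host allocation and read back its pointer.
InferenceEngine::ParamMap params = {
    {GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
auto remote_blob = context->CreateBlob(desc, params);
remote_blob->allocate();
if (auto usm_blob = std::dynamic_pointer_cast<InferenceEngine::gpu::USMBlob>(remote_blob)) {
    void* usm_ptr = usm_blob->get();   // host-accessible USM pointer
    // ... fill usm_ptr with input data ...
}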
@@ -98,7 +98,18 @@ DECLARE_GPU_PARAM_VALUE(OCL_BUFFER);
 * @brief Shared OpenCL 2D image blob
 */
DECLARE_GPU_PARAM_VALUE(OCL_IMAGE2D);

+/**
+* @brief Shared USM pointer allocated by user
+*/
+DECLARE_GPU_PARAM_VALUE(USM_USER_BUFFER);
+/**
+* @brief Shared USM pointer type with host allocation type allocated by plugin
+*/
+DECLARE_GPU_PARAM_VALUE(USM_HOST_BUFFER);
+/**
+* @brief Shared USM pointer type with device allocation type allocated by plugin
+*/
+DECLARE_GPU_PARAM_VALUE(USM_DEVICE_BUFFER);
/**
* @brief Shared video decoder surface or D3D 2D texture blob
*/
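Sketch of the parameter maps these new values enable (not part of the diff; 'usm_ptr' stands for a USM pointer the application allocated itself):

// Share an existing user USM allocation with the plugin.
InferenceEngine::ParamMap user_usm = {
    {GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER)},
    {GPU_PARAM_KEY(MEM_HANDLE), static_cast<InferenceEngine::gpu_handle_param>(usm_ptr)}};
// Ask the plugin to allocate USM memory itself; no MEM_HANDLE is needed.
InferenceEngine::ParamMap plugin_usm_host   = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
InferenceEngine::ParamMap plugin_usm_device = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}};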
@@ -23,7 +23,7 @@ namespace InferenceEngine {
 * Such context represents a scope on the device within which executable
 * networks and remote memory blobs can exist, function and exchange data.
 */
-class RemoteContext : public std::enable_shared_from_this<RemoteContext> {
+class INFERENCE_ENGINE_API_CLASS(RemoteContext) : public std::enable_shared_from_this<RemoteContext> {
public:
    /**
     * @brief A smart pointer to the RemoteContext object

@@ -110,6 +110,14 @@ public:
     */
    virtual RemoteBlob::Ptr CreateBlob(const TensorDesc& tensorDesc, const ParamMap& params = {}) = 0;

+    /**
+     * @brief Allocates host accessible memory blob friendly for the device in current context
+     * Returns a pointer to the object which implements MemoryBlob interface.
+     * @param tensorDesc Defines the layout and dims of the blob
+     * @return A pointer to host accessible MemoryBlob object
+     */
+    virtual MemoryBlob::Ptr CreateHostBlob(const TensorDesc& tensorDesc);
+
    /**
     * @brief Returns a map of device-specific parameters required for low-level
     * operations with underlying object.
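Usage sketch (not part of the diff): the base implementation falls back to a regular host blob, while the GPU context override shown earlier returns a USM-host-backed blob when the device supports it. 'exec_net', 'infer_request', 'desc' and the input name below are assumed to be provided by the application.

auto remote_ctx = exec_net.GetContext();            // InferenceEngine::RemoteContext::Ptr
auto host_blob = remote_ctx->CreateHostBlob(desc);  // device-friendly host memory
host_blob->allocate();
infer_request.SetBlob("input", host_blob);          // "input" is a placeholder name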
@@ -102,9 +102,10 @@ public:
 * @note User can also obtain OpenCL context handle from this class.
 */
class D3DContext : public ClContext {
-    using RemoteContext::create_tensor;
-
public:
+    // Needed to make create_tensor overloads from base class visible for user
+    using ClContext::create_tensor;
+
    /**
     * @brief Checks that type defined runtime paramters are presented in remote object
     * @param remote_context remote context to check

@@ -117,6 +117,36 @@ public:
    }
};

+/**
+ * @brief This class represents an abstraction for GPU plugin remote tensor
+ * which can be shared with user-supplied USM device pointer.
+ * The plugin object derived from this class can be obtained with ClContext::create_tensor() call.
+ * @note User can obtain USM pointer from this class.
+ */
+class USMTensor : public RemoteTensor {
+public:
+    /**
+     * @brief Checks that type defined runtime paramters are presented in remote object
+     * @param tensor a tensor to check
+     */
+    static void type_check(const Tensor& tensor) {
+        RemoteTensor::type_check(tensor,
+                                 {{GPU_PARAM_KEY(MEM_HANDLE), {}},
+                                  {GPU_PARAM_KEY(SHARED_MEM_TYPE),
+                                   {GPU_PARAM_VALUE(USM_USER_BUFFER),
+                                    GPU_PARAM_VALUE(USM_HOST_BUFFER),
+                                    GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}}});
+    }
+
+    /**
+     * @brief Returns the underlying USM pointer.
+     * @return underlying USM pointer
+     */
+    void* get() {
+        return static_cast<void*>(get_params().at(GPU_PARAM_KEY(MEM_HANDLE)).as<void*>());
+    }
+};
+
/**
 * @brief This class represents an abstraction for GPU plugin remote context
 * which is shared with OpenCL context object.

@@ -125,14 +155,14 @@ public:
 */
class ClContext : public RemoteContext {
protected:
-    using RemoteContext::create_tensor;
-
    /**
     * @brief GPU device name
     */
    static constexpr const char* device_name = "GPU";

public:
+    // Needed to make create_tensor overloads from base class visible for user
+    using RemoteContext::create_tensor;
    /**
     * @brief Checks that type defined runtime paramters are presented in remote object
     * @param remote_context remote context to check

@@ -220,7 +250,7 @@ public:
     * @brief This function is used to obtain remote tensor object from user-supplied cl_mem object
     * @param type Tensor element type
     * @param shape Tensor shape
-    * @param buffer A cl_mem object wrapped by a remote tensor
+    * @param buffer A cl_mem object that should be wrapped by a remote tensor
     * @return A remote tensor instance
     */
    ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl_mem buffer) {

@@ -233,7 +263,7 @@ public:
     * @brief This function is used to obtain remote tensor object from user-supplied cl::Buffer object
     * @param type Tensor element type
     * @param shape Tensor shape
-    * @param buffer A cl::Buffer object wrapped by a remote tensor
+    * @param buffer A cl::Buffer object that should be wrapped by a remote tensor
     * @return A remote tensor instance
     */
    ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl::Buffer& buffer) {

@@ -244,7 +274,7 @@ public:
     * @brief This function is used to obtain remote tensor object from user-supplied cl::Image2D object
     * @param type Tensor element type
     * @param shape Tensor shape
-    * @param image A cl::Image2D object wrapped by a remote tensor
+    * @param image A cl::Image2D object that should be wrapped by a remote tensor
     * @return A remote tensor instance
     */
    ClImage2DTensor create_tensor(const element::Type type, const Shape& shape, const cl::Image2D& image) {
@@ -252,7 +282,43 @@ public:
                           {GPU_PARAM_KEY(MEM_HANDLE), static_cast<gpu_handle_param>(image.get())}};
        return create_tensor(type, shape, params);
    }

+    /**
+     * @brief This function is used to obtain remote tensor object from user-supplied USM pointer
+     * @param type Tensor element type
+     * @param shape Tensor shape
+     * @param usm_ptr A USM pointer that should be wrapped by a remote tensor
+     * @return A remote tensor instance
+     */
+    USMTensor create_tensor(const element::Type type, const Shape& shape, void* usm_ptr) {
+        ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER)},
+                           {GPU_PARAM_KEY(MEM_HANDLE), static_cast<gpu_handle_param>(usm_ptr)}};
+        return create_tensor(type, shape, params);
+    }
+
+    /**
+     * @brief This function is used to allocate USM tensor with host allocation type
+     * @param type Tensor element type
+     * @param shape Tensor shape
+     * @return A remote tensor instance
+     */
+    USMTensor create_usm_host_tensor(const element::Type type, const Shape& shape) {
+        ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
+        return create_tensor(type, shape, params);
+    }
+
+    /**
+     * @brief This function is used to allocate USM tensor with device allocation type
+     * @param type Tensor element type
+     * @param shape Tensor shape
+     * @return A remote tensor instance
+     */
+    USMTensor create_usm_device_tensor(const element::Type type, const Shape& shape) {
+        ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}};
+        return create_tensor(type, shape, params);
+    }
};

} // namespace ocl
} // namespace gpu
} // namespace runtime
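Usage sketch (not part of the diff): the three new entry points cover sharing a user USM pointer and letting the plugin allocate USM host or device memory. The include path and every argument name below are assumptions for this sketch only.

#include <cstring>
#include <openvino/runtime/gpu/ocl/ocl.hpp>   // assumed header location for ov::runtime::gpu::ocl

// All arguments are assumed to be prepared by the caller.
void fill_usm_tensors(ov::runtime::gpu::ocl::ClContext& context,
                      const ov::element::Type type, const ov::Shape& shape,
                      void* user_usm_ptr, const void* src, size_t byte_size) {
    // 1. Wrap a USM pointer the application allocated itself (USM_USER_BUFFER underneath).
    auto user_tensor = context.create_tensor(type, shape, user_usm_ptr);

    // 2. Let the plugin allocate USM host memory; it is directly writable from the CPU.
    auto host_tensor = context.create_usm_host_tensor(type, shape);
    std::memcpy(host_tensor.get(), src, byte_size);

    // 3. Let the plugin allocate USM device memory; fill it with clEnqueueMemcpyINTEL or a kernel.
    auto device_tensor = context.create_usm_device_tensor(type, shape);
    (void)user_tensor;
    (void)device_tensor;
}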
@@ -72,9 +72,10 @@ public:
 * @note User can also obtain OpenCL context handle from this class.
 */
class VAContext : public ClContext {
-    using RemoteContext::create_tensor;
-
public:
+    // Needed to make create_tensor overloads from base class visible for user
+    using ClContext::create_tensor;
+
    /**
     * @brief Checks that type defined runtime paramters are presented in remote object
     * @param remote_context remote context to check

@@ -136,6 +136,16 @@ public:
     * @return A map of name/parameter elements.
     */
    ParamMap get_params() const;

+    /**
+     * @brief This function is used to create host tensor object friendly for the device in current context
+     * For example, GPU context may allocate USM host memory (if corresponding extension is available)
+     * which could be more efficient than regular host memory.
+     * @param type Tensor element type
+     * @param shape Tensor shape
+     * @return A Tensor instance with device friendly memory
+     */
+    Tensor create_host_tensor(const element::Type type, const Shape& shape);
+
};

} // namespace runtime
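Usage sketch (not part of the diff), mirroring the PLUGIN_HOST_TENSOR test case further below; 'context', 'request', 'input' and 'src' are assumed to be prepared by the caller:

#include <cstring>

void run_with_host_tensor(ov::runtime::RemoteContext& context,
                          ov::runtime::InferRequest& request,
                          const std::shared_ptr<ov::op::v0::Parameter>& input,
                          const void* src) {
    auto tensor = context.create_host_tensor(input->get_element_type(), input->get_shape());
    std::memcpy(tensor.data(), src, tensor.get_byte_size());   // plain host-visible memory
    request.set_tensor(input, tensor);
    request.infer();
}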
@@ -69,6 +69,15 @@ RemoteTensor RemoteContext::create_tensor(const element::Type& element_type,
    });
}

+Tensor RemoteContext::create_host_tensor(const element::Type element_type, const Shape& shape) {
+    OV_REMOTE_CONTEXT_STATEMENT({
+        auto blob = _impl->CreateHostBlob(
+            {ie::details::convertPrecision(element_type), shape, ie::TensorDesc::getLayoutByRank(shape.size())});
+        blob->allocate();
+        return {_so, blob};
+    });
+}
+
ie::ParamMap RemoteContext::get_params() const {
    OV_REMOTE_CONTEXT_STATEMENT(return _impl->getParams());
}

@@ -0,0 +1,22 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ie_remote_context.hpp"
+
+#include <memory>
+#include <string>
+
+#include "blob_factory.hpp"
+
+namespace InferenceEngine {
+
+MemoryBlob::Ptr RemoteContext::CreateHostBlob(const TensorDesc& tensorDesc) {
+    auto blob = std::dynamic_pointer_cast<MemoryBlob>(make_blob_with_precision(tensorDesc));
+    if (!blob)
+        IE_THROW(NotAllocated) << "Failed to create host blob in remote context for " << getDeviceName() << " device";
+
+    return blob;
+}
+
+}  // namespace InferenceEngine
@@ -84,6 +84,61 @@ TEST_F(RemoteBlob_Test, smoke_canInputUserBlob) {
    }
}

+
+TEST_F(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) {
+#if defined(ANDROID)
+    GTEST_SKIP();
+#endif
+    CNNNetwork net(fn_ptr);
+
+    net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+    net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
+
+    // TODO: Issue: investigate issue with IECore
+    auto ie = InferenceEngine::Core();
+    auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
+
+    // regular inference
+    auto inf_req_regular = exec_net.CreateInferRequest();
+    InferenceEngine::Blob::Ptr fakeImageData = FuncTestUtils::createAndFillBlob(
+            net.getInputsInfo().begin()->second->getTensorDesc());
+    inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
+
+    inf_req_regular.Infer();
+    auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
+
+    // inference using remote blob
+    auto inf_req_shared = exec_net.CreateInferRequest();
+    auto cldnn_context = exec_net.GetContext();
+    cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
+    auto ocl_instance = std::make_shared<OpenCL>(ctx);
+
+    auto dims = net.getInputsInfo().begin()->second->getTensorDesc().getDims();
+    size_t imSize = dims[1] * dims[2] * dims[3];
+
+    Blob::Ptr shared_blob = make_shared_blob(net.getInputsInfo().begin()->second->getTensorDesc(), cldnn_context);
+    shared_blob->allocate();
+    {
+        cl::Buffer shared_buffer = *shared_blob->as<gpu::ClBufferBlob>();
+        void *buffer = fakeImageData->buffer();
+        ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
+    }
+
+    inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_blob);
+
+    inf_req_shared.Infer();
+    auto outputBlob_shared = inf_req_shared.GetBlob(net.getOutputsInfo().begin()->first);
+
+    // compare results
+    {
+        ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
+        ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size());
+        auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
+        FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr);
+    }
+}
+
+
TEST_F(RemoteBlob_Test, smoke_canInferOnUserContext) {
    auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
    CNNNetwork net(fn_ptr);
@@ -30,7 +30,46 @@ protected:
    }
};

-TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {
+enum class RemoteTensorSharingType {
+    USER_CL_TENSOR = 0,
+    PLUGIN_CL_TENSOR = 1,
+    USER_USM_HOST_TENSOR = 2,
+    USER_USM_DEVICE_TENSOR = 3,
+    PLUGIN_USM_HOST_TENSOR = 4,
+    PLUGIN_USM_DEVICE_TENSOR = 5,
+    PLUGIN_HOST_TENSOR = 6
+};
+
+std::ostream& operator<<(std::ostream& stream, RemoteTensorSharingType sharing_type) {
+    switch (sharing_type) {
+    case RemoteTensorSharingType::USER_CL_TENSOR: stream << "USER_CL_TENSOR"; break;
+    case RemoteTensorSharingType::PLUGIN_CL_TENSOR: stream << "PLUGIN_CL_TENSOR"; break;
+    case RemoteTensorSharingType::USER_USM_HOST_TENSOR: stream << "USER_USM_HOST_TENSOR"; break;
+    case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: stream << "USER_USM_DEVICE_TENSOR"; break;
+    case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: stream << "PLUGIN_USM_HOST_TENSOR"; break;
+    case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: stream << "PLUGIN_USM_DEVICE_TENSOR"; break;
+    case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: stream << "PLUGIN_HOST_TENSOR"; break;
+    }
+
+    return stream;
+}
+
+class OVRemoteTensorInputBlob_Test : public OVRemoteTensor_Test, public testing::WithParamInterface<RemoteTensorSharingType> {
+public:
+    void SetUp() override {
+        fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
+    }
+
+    static std::string getTestCaseName(testing::TestParamInfo<RemoteTensorSharingType> obj) {
+        RemoteTensorSharingType sharing_type = obj.param;
+
+        std::ostringstream result;
+        result << sharing_type;
+        return result.str();
+    }
+};
+
+TEST_P(OVRemoteTensorInputBlob_Test, smoke_canInputRemoteTensor) {
#if defined(ANDROID)
    GTEST_SKIP();
#endif

@@ -45,6 +84,8 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {

    auto exec_net = ie.compile_model(function, CommonTestUtils::DEVICE_GPU);

+    RemoteTensorSharingType sharing_type = GetParam();
+
    // regular inference
    auto inf_req_regular = exec_net.create_infer_request();
    auto input = function->get_parameters().at(0);

@@ -65,16 +106,129 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {

    auto imSize = ov::shape_size(input->get_shape());

-    cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
-    {
-        void* buffer = fakeImageData.data();
-        ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
-    }
+    switch (sharing_type) {
+        case RemoteTensorSharingType::USER_CL_TENSOR: {
+            cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
+            {
+                void* buffer = fakeImageData.data();
+                ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
+            }
+
+            auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            break;
+        }
+        case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            void* shared_buffer = ocl_instance->allocate_usm_device_buffer(imSize);
+            {
+                void* buffer = fakeImageData.data();
+                err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr);
+                if (err != CL_SUCCESS)
+                    FAIL() << "Failed to copy data from host buffer to USM device";
+            }
+
+            auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            ocl_instance->free_mem(shared_buffer);
+
+            break;
+        }
+        case RemoteTensorSharingType::USER_USM_HOST_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            void* shared_buffer = ocl_instance->allocate_usm_host_buffer(imSize);
+            {
+                void* buffer = fakeImageData.data();
+                std::memcpy(shared_buffer, buffer, imSize);
+            }
+
+            auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            ocl_instance->free_mem(shared_buffer);
+
+            break;
+        }
+        case RemoteTensorSharingType::PLUGIN_CL_TENSOR: {
+            auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape());
+            ASSERT_TRUE(cldnn_tensor.is<ov::runtime::gpu::ocl::ClBufferTensor>());
+            auto cl_tensor = cldnn_tensor.as<ov::runtime::gpu::ocl::ClBufferTensor>();
+            {
+                cl::Buffer shared_buffer = cl_tensor;
+                void* buffer = fakeImageData.data();
+                ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
+            }
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+            break;
+        }
+        case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            auto cldnn_tensor = cldnn_context.create_usm_host_tensor(input->get_element_type(), input->get_shape());
+            ASSERT_TRUE(cldnn_tensor.is<ov::runtime::gpu::ocl::USMTensor>());
+            {
+                auto cl_tensor = cldnn_tensor.as<ov::runtime::gpu::ocl::USMTensor>();
+                void* shared_buffer = cl_tensor.get();
+                ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL);
+                void* buffer = fakeImageData.data();
+                std::memcpy(shared_buffer, buffer, imSize);
+            }
+
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            break;
+        }
+        case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            auto cldnn_tensor = cldnn_context.create_usm_device_tensor(input->get_element_type(), input->get_shape());
+            ASSERT_TRUE(cldnn_tensor.is<ov::runtime::gpu::ocl::USMTensor>());
+            {
+                auto cl_tensor = cldnn_tensor.as<ov::runtime::gpu::ocl::USMTensor>();
+                void* shared_buffer = cl_tensor.get();
+                ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_DEVICE_INTEL);
+                void* buffer = fakeImageData.data();
+                err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr);
+                if (err != CL_SUCCESS)
+                    FAIL() << "Failed to copy data from host buffer to USM device";
+            }
+
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            break;
+        }
+        case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: {
+            auto cldnn_tensor = cldnn_context.create_host_tensor(input->get_element_type(), input->get_shape());
+            {
+                ASSERT_NO_THROW(cldnn_tensor.data());
+                void* shared_buffer = cldnn_tensor.data();
+                if (ocl_instance->supports_usm())
+                    ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL);
+                void* buffer = fakeImageData.data();
+                std::memcpy(shared_buffer, buffer, imSize);
+            }
+
+            inf_req_shared.set_tensor(input, cldnn_tensor);
+            inf_req_shared.infer();
+
+            break;
+        }
+    }
+
-    auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer);
-    inf_req_shared.set_tensor(input, cldnn_tensor);
-
-    inf_req_shared.infer();
    auto output_tensor_shared = inf_req_shared.get_tensor(output);

    // compare results
@@ -88,6 +242,18 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) {
    }
}

+INSTANTIATE_TEST_SUITE_P(
+    smoke_GPU,
+    OVRemoteTensorInputBlob_Test,
+    ::testing::ValuesIn(std::vector<RemoteTensorSharingType>{RemoteTensorSharingType::USER_CL_TENSOR,
+                                                             RemoteTensorSharingType::PLUGIN_CL_TENSOR,
+                                                             RemoteTensorSharingType::USER_USM_HOST_TENSOR,
+                                                             RemoteTensorSharingType::USER_USM_DEVICE_TENSOR,
+                                                             RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR,
+                                                             RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR,
+                                                             RemoteTensorSharingType::PLUGIN_HOST_TENSOR}),
+    OVRemoteTensorInputBlob_Test::getTestCaseName);
+
TEST_F(OVRemoteTensor_Test, smoke_canInferOnUserContext) {
    auto ie = ov::runtime::Core();

@@ -18,10 +18,57 @@
#endif
#include <gpu/gpu_context_api_ocl.hpp>

+namespace {
+template <typename T>
+T load_entrypoint(const cl_platform_id platform, const std::string name) {
+#if defined(__GNUC__) && __GNUC__ < 5
+// OCL spec says:
+// "The function clGetExtensionFunctionAddressForPlatform returns the address of the extension function named by funcname for a given platform.
+// The pointer returned should be cast to a function pointer type matching the extension function's definition defined in the appropriate extension
+// specification and header file."
+// So the pointer-to-object to pointer-to-function cast below is supposed to be valid, thus we suppress warning from old GCC versions.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+    T p = reinterpret_cast<T>(clGetExtensionFunctionAddressForPlatform(platform, name.c_str()));
+#if defined(__GNUC__) && __GNUC__ < 5
+#pragma GCC diagnostic pop
+#endif
+    if (!p) {
+        throw std::runtime_error("clGetExtensionFunctionAddressForPlatform(" + name + ") returned NULL.");
+    }
+    return p;
+}
+
+template <typename T>
+T try_load_entrypoint(const cl_platform_id platform, const std::string name) {
+    try {
+        return load_entrypoint<T>(platform, name);
+    } catch (...) {
+        return nullptr;
+    }
+}
+}  // namespace
+
struct OpenCL {
    cl::Context _context;
    cl::Device _device;
    cl::CommandQueue _queue;
+    cl_platform_id _platform;
+
+    clHostMemAllocINTEL_fn _host_mem_alloc_fn = nullptr;
+    clMemFreeINTEL_fn _mem_free_fn = nullptr;
+    clDeviceMemAllocINTEL_fn _device_mem_alloc_fn = nullptr;
+    clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr;
+    clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr;
+
+    void init_extension_functions(cl_platform_id platform) {
+        _host_mem_alloc_fn = try_load_entrypoint<clHostMemAllocINTEL_fn>(platform, "clHostMemAllocINTEL");
+        _device_mem_alloc_fn = try_load_entrypoint<clDeviceMemAllocINTEL_fn>(platform, "clDeviceMemAllocINTEL");
+        _mem_free_fn = try_load_entrypoint<clMemFreeINTEL_fn>(platform, "clMemFreeINTEL");
+        _enqueue_memcpy_fn = try_load_entrypoint<clEnqueueMemcpyINTEL_fn>(platform, "clEnqueueMemcpyINTEL");
+        _get_mem_alloc_info_fn = try_load_entrypoint<clGetMemAllocInfoINTEL_fn>(platform, "clGetMemAllocInfoINTEL");
+    }

    explicit OpenCL(std::shared_ptr<std::vector<cl_context_properties>> media_api_context_properties = nullptr) {
        // get Intel iGPU OCL device, create context and queue

@@ -42,12 +89,15 @@ struct OpenCL {
                if (refVendorID == d.getInfo<CL_DEVICE_VENDOR_ID>()) {
                    _device = d;
                    _context = cl::Context(_device);
+                    _platform = id;
                    break;
                }
            }
        }
        cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
        _queue = cl::CommandQueue(_context, _device, props);
+
+        init_extension_functions(_platform);
    }
    }

@@ -56,7 +106,81 @@ struct OpenCL {
        _context = cl::Context(context, true);
        _device = cl::Device(_context.getInfo<CL_CONTEXT_DEVICES>()[0].get(), true);

+        cl_int error = clGetDeviceInfo(_device.get(), CL_DEVICE_PLATFORM, sizeof(_platform), &_platform, nullptr);
+        if (error) {
+            throw std::runtime_error("OpenCL helper failed to retrieve CL_DEVICE_PLATFORM: " + std::to_string(error));
+        }
+
        cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
        _queue = cl::CommandQueue(_context, _device, props);
+
+        init_extension_functions(_platform);
    }

+    bool supports_usm() const {
+        return _host_mem_alloc_fn != nullptr &&
+               _device_mem_alloc_fn != nullptr &&
+               _mem_free_fn != nullptr &&
+               _enqueue_memcpy_fn != nullptr &&
+               _get_mem_alloc_info_fn != nullptr;
+    }
+
+    void* allocate_usm_host_buffer(size_t size) const {
+        cl_int err_code_ret;
+        if (!_device_mem_alloc_fn)
+            throw std::runtime_error("[GPU] clHostMemAllocINTEL is nullptr");
+        auto ret_ptr = _host_mem_alloc_fn(_context.get(), nullptr, size, 0, &err_code_ret);
+        if (err_code_ret != CL_SUCCESS)
+            throw std::runtime_error("OpenCL helper failed to allocate USM host memory");
+        return ret_ptr;
+    }
+
+    void* allocate_usm_device_buffer(size_t size) const {
+        cl_int err_code_ret;
+        if (!_device_mem_alloc_fn)
+            throw std::runtime_error("[GPU] clDeviceMemAllocINTEL is nullptr");
+        auto ret_ptr = _device_mem_alloc_fn(_context.get(), _device.get(), nullptr, size, 0, &err_code_ret);
+        if (err_code_ret != CL_SUCCESS)
+            throw std::runtime_error("OpenCL helper failed to allocate USM device memory");
+        return ret_ptr;
+    }
+
+    void free_mem(void* usm_ptr) {
+        if (!_mem_free_fn)
+            throw std::runtime_error("[GPU] clMemFreeINTEL is nullptr");
+
+        _mem_free_fn(_context.get(), usm_ptr);
+    }
+
+    cl_int memcpy(const cl::CommandQueue& cpp_queue, void *dst_ptr, const void *src_ptr,
+                  size_t bytes_count, bool blocking = true, const std::vector<cl::Event>* wait_list = nullptr, cl::Event* ret_event = nullptr) const {
+        if (!_enqueue_memcpy_fn)
+            throw std::runtime_error("[GPU] clEnqueueMemcpyINTEL is nullptr");
+        cl_event tmp;
+        cl_int err = _enqueue_memcpy_fn(
+            cpp_queue.get(),
+            static_cast<cl_bool>(blocking),
+            dst_ptr,
+            src_ptr,
+            bytes_count,
+            wait_list == nullptr ? 0 : static_cast<cl_uint>(wait_list->size()),
+            wait_list == nullptr ? nullptr : reinterpret_cast<const cl_event*>(&wait_list->front()),
+            ret_event == nullptr ? nullptr : &tmp);
+
+        if (ret_event != nullptr && err == CL_SUCCESS)
+            *ret_event = tmp;
+
+        return err;
+    }
+
+    cl_unified_shared_memory_type_intel get_allocation_type(const void* usm_ptr) const {
+        if (!_get_mem_alloc_info_fn) {
+            throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr");
+        }
+
+        cl_unified_shared_memory_type_intel ret_val;
+        size_t ret_val_size;
+        _get_mem_alloc_info_fn(_context.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size);
+        return ret_val;
+    }
};
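Condensed usage sketch of the helper above (not part of the diff); 'ocl' is an OpenCL instance created from the plugin's cl_context, as in the tests, and 'src'/'size' are assumed inputs:

#include <cstring>

// Round-trip some bytes through USM allocations using the extension wrappers declared above.
void copy_through_usm(const std::shared_ptr<OpenCL>& ocl, const void* src, size_t size) {
    if (!ocl->supports_usm())
        return;                                              // USM extension entry points unavailable
    void* usm_host = ocl->allocate_usm_host_buffer(size);    // clHostMemAllocINTEL under the hood
    std::memcpy(usm_host, src, size);                        // USM host memory is CPU-visible
    void* usm_dev = ocl->allocate_usm_device_buffer(size);   // clDeviceMemAllocINTEL
    cl_int err = ocl->memcpy(ocl->_queue, usm_dev, usm_host, size, true, nullptr, nullptr);
    (void)err;                                               // the tests FAIL() on non-CL_SUCCESS
    ocl->free_mem(usm_dev);
    ocl->free_mem(usm_host);
}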
@@ -62,6 +62,9 @@ public:
    /// Create shared memory object using user-supplied memory buffer @p buf using specified @p layout
    memory_ptr share_buffer(const layout& layout, shared_handle buf);

+    /// Create shared memory object using user-supplied USM pointer @p usm_ptr using specified @p layout
+    memory_ptr share_usm(const layout& layout, shared_handle usm_ptr);
+
    /// Create shared memory object using user-supplied 2D image @p img using specified @p layout
    memory_ptr share_image(const layout& layout, shared_handle img);

@@ -92,6 +92,17 @@ memory_ptr engine::share_buffer(const layout& layout, shared_handle buf) {
    return reinterpret_handle(layout, params);
}

+memory_ptr engine::share_usm(const layout& layout, shared_handle usm_ptr) {
+    shared_mem_params params = { shared_mem_type::shared_mem_usm, nullptr, nullptr, usm_ptr,
+#ifdef _WIN32
+        nullptr,
+#else
+        0,
+#endif
+        0 };
+    return reinterpret_handle(layout, params);
+}
+
memory::ptr engine::share_image(const layout& layout, shared_handle img) {
    shared_mem_params params = { shared_mem_type::shared_mem_image, nullptr, nullptr, img,
#ifdef _WIN32

@@ -168,6 +168,9 @@ memory::ptr ocl_engine::reinterpret_handle(const layout& new_layout, shared_mem_
    } else if (params.mem_type == shared_mem_type::shared_mem_buffer) {
        cl::Buffer buf(static_cast<cl_mem>(params.mem), true);
        return std::make_shared<ocl::gpu_buffer>(this, new_layout, buf);
+    } else if (params.mem_type == shared_mem_type::shared_mem_usm) {
+        cl::UsmMemory usm_buffer(get_usm_helper(), params.mem);
+        return std::make_shared<ocl::gpu_usm>(this, new_layout, usm_buffer);
    } else {
        throw std::runtime_error("unknown shared object fromat or type");
    }

@@ -524,6 +524,7 @@ public:
            _enqueue_memcpy_fn = try_load_entrypoint<clEnqueueMemcpyINTEL_fn>(_ctx.get(), "clEnqueueMemcpyINTEL");
            _enqueue_mem_fill_fn = try_load_entrypoint<clEnqueueMemFillINTEL_fn>(_ctx.get(), "clEnqueueMemFillINTEL");
            _enqueue_memset_fn = try_load_entrypoint<clEnqueueMemsetINTEL_fn>(_ctx.get(), "clEnqueueMemsetINTEL");
+            _get_mem_alloc_info_fn = try_load_entrypoint<clGetMemAllocInfoINTEL_fn>(_ctx.get(), "clGetMemAllocInfoINTEL");
        }
    }

@@ -621,6 +622,17 @@ public:
        return err;
    }

+    cl_unified_shared_memory_type_intel get_usm_allocation_type(const void* usm_ptr) const {
+        if (!_get_mem_alloc_info_fn) {
+            throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr");
+        }
+
+        cl_unified_shared_memory_type_intel ret_val;
+        size_t ret_val_size;
+        _get_mem_alloc_info_fn(_ctx.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size);
+        return ret_val;
+    }
+
private:
    cl::Context _ctx;
    cl::Device _device;

@@ -632,6 +644,7 @@ private:
    clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr;
    clEnqueueMemFillINTEL_fn _enqueue_mem_fill_fn = nullptr;
    clEnqueueMemsetINTEL_fn _enqueue_memset_fn = nullptr;
+    clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr;
};

/*

@@ -640,11 +653,16 @@ private:
*/
class UsmHolder {
public:
-    UsmHolder(const cl::UsmHelper& usmHelper, void* ptr) : _usmHelper(usmHelper), _ptr(ptr) { }
+    UsmHolder(const cl::UsmHelper& usmHelper, void* ptr, bool shared_memory = false)
+        : _usmHelper(usmHelper)
+        , _ptr(ptr)
+        , _shared_memory(shared_memory) { }
+
    void* ptr() { return _ptr; }
    ~UsmHolder() {
        try {
-            _usmHelper.free_mem(_ptr);
+            if (!_shared_memory)
+                _usmHelper.free_mem(_ptr);
        } catch (...) {
            // Exception may happen only when clMemFreeINTEL function is unavailable, thus can't free memory properly
        }

@@ -652,6 +670,7 @@ public:
private:
    const cl::UsmHelper& _usmHelper;
    void* _ptr;
+    bool _shared_memory = false;
};
/*
USM base class. Different usm types should derive from this class.

@@ -659,6 +678,13 @@ private:
class UsmMemory {
public:
    explicit UsmMemory(const cl::UsmHelper& usmHelper) : _usmHelper(usmHelper) { }
+    UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr)
+        : _usmHelper(usmHelper)
+        , _usm_pointer(std::make_shared<UsmHolder>(_usmHelper, usm_ptr, true)) {
+        if (!usm_ptr) {
+            throw std::runtime_error("[GPU] Can't share null usm pointer");
+        }
+    }

    // Get methods returns original pointer allocated by openCL.
    void* get() const { return _usm_pointer->ptr(); }
@@ -279,6 +279,12 @@ gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemo
    , _buffer(buffer) {
}

+gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& buffer)
+    : lockable_gpu_mem()
+    , memory(engine, new_layout, detect_allocation_type(engine, buffer), true)
+    , _buffer(buffer) {
+}
+
gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type)
    : lockable_gpu_mem()
    , memory(engine, layout, type, false)

@@ -393,6 +399,20 @@ shared_mem_params gpu_usm::get_internal_params() const {
    };
}

+allocation_type gpu_usm::detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer) {
+    auto cl_alloc_type = engine->get_usm_helper().get_usm_allocation_type(buffer.get());
+
+    allocation_type res = allocation_type::unknown;
+    switch (cl_alloc_type) {
+        case CL_MEM_TYPE_DEVICE_INTEL: res = allocation_type::usm_device; break;
+        case CL_MEM_TYPE_HOST_INTEL: res = allocation_type::usm_host; break;
+        case CL_MEM_TYPE_SHARED_INTEL: res = allocation_type::usm_shared; break;
+        default: throw std::runtime_error("[GPU] Unsupported USM alloc type: " + std::to_string(cl_alloc_type));
+    }
+
+    return res;
+}
+
std::vector<cl_mem> ocl_surfaces_lock::get_handles(std::vector<memory::ptr> mem) const {
    std::vector<cl_mem> res;
    for (auto& m : mem) {

@@ -100,6 +100,7 @@ private:

struct gpu_usm : public lockable_gpu_mem, public memory {
    gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer, allocation_type type);
+    gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer);
    gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type);

    void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override;

@@ -120,6 +121,8 @@ struct gpu_usm : public lockable_gpu_mem, public memory {

protected:
    cl::UsmMemory _buffer;
+
+    static allocation_type detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer);
};

struct ocl_surfaces_lock : public surfaces_lock {