From 6addc0d535e9ab0b591dcefa2f771997a477ba4f Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Tue, 23 Nov 2021 20:26:44 +0300 Subject: [PATCH] [GPU] USM sharing and host blob creation in gpu remote context (#8657) --- .../src/cldnn_engine/cldnn_infer_request.cpp | 22 ++- .../src/cldnn_engine/cldnn_remote_context.cpp | 55 +++++- .../src/cldnn_engine/cldnn_remote_context.h | 111 ++++++++++- .../include/ie/gpu/gpu_context_api_ocl.hpp | 43 ++++ .../include/ie/gpu/gpu_params.hpp | 13 +- .../include/ie/ie_remote_context.hpp | 10 +- .../include/openvino/runtime/gpu/ocl/dx.hpp | 5 +- .../include/openvino/runtime/gpu/ocl/ocl.hpp | 76 +++++++- .../include/openvino/runtime/gpu/ocl/va.hpp | 5 +- .../openvino/runtime/remote_context.hpp | 10 + .../src/cpp/ie_remote_context.cpp | 9 + .../src/ie_remote_context.cpp | 22 +++ .../cldnn_remote_blob_tests.cpp | 55 ++++++ .../gpu_remote_tensor_tests.cpp | 184 +++++++++++++++++- .../remote_blob_tests/remote_blob_helpers.hpp | 124 ++++++++++++ .../clDNN/api/cldnn/runtime/engine.hpp | 3 + .../thirdparty/clDNN/runtime/engine.cpp | 11 ++ .../clDNN/runtime/ocl/ocl_engine.cpp | 3 + .../thirdparty/clDNN/runtime/ocl/ocl_ext.hpp | 30 ++- .../clDNN/runtime/ocl/ocl_memory.cpp | 20 ++ .../clDNN/runtime/ocl/ocl_memory.hpp | 3 + 21 files changed, 776 insertions(+), 38 deletions(-) create mode 100644 inference-engine/src/inference_engine/src/ie_remote_context.cpp diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp index c98b06ce4cf..77336141f00 100644 --- a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp @@ -966,9 +966,25 @@ void CLDNNInferRequest::prepare_output(const cldnn::primitive_id& outputName, Bl } InferenceEngine::Blob::Ptr CLDNNInferRequest::create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout) { - auto blobPtr = std::make_shared(m_graph->GetContext(), m_graph->GetNetwork()->get_stream(), desc, layout); - getBlobImpl(blobPtr.get())->allocate(); - return blobPtr; + if (m_graph->GetEngine()->use_unified_shared_memory()) { + auto blobPtr = std::make_shared(m_graph->GetContext(), + m_graph->GetNetwork()->get_stream(), + desc, + layout, + nullptr, + 0, + 0, + CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL); + getBlobImpl(blobPtr.get())->allocate(); + return blobPtr; + } else { + auto blobPtr = std::make_shared(m_graph->GetContext(), + m_graph->GetNetwork()->get_stream(), + desc, + layout); + getBlobImpl(blobPtr.get())->allocate(); + return blobPtr; + } } } // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp index f043b6e4a89..12ac301a256 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp @@ -38,6 +38,24 @@ ParamMap CLDNNRemoteBlobImpl::getParams() const { { GPU_PARAM_KEY(OCL_CONTEXT), params.context }, { GPU_PARAM_KEY(MEM_HANDLE), params.mem } }; + case BT_USM_SHARED: + return{ + { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER) }, + { GPU_PARAM_KEY(OCL_CONTEXT), params.context }, + { GPU_PARAM_KEY(MEM_HANDLE), params.mem } + }; + case BT_USM_HOST_INTERNAL: + return{ + { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER) }, + { GPU_PARAM_KEY(OCL_CONTEXT), params.context }, + { GPU_PARAM_KEY(MEM_HANDLE), params.mem } + }; + case BT_USM_DEVICE_INTERNAL: + 
return{ + { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER) }, + { GPU_PARAM_KEY(OCL_CONTEXT), params.context }, + { GPU_PARAM_KEY(MEM_HANDLE), params.mem } + }; #ifdef _WIN32 case BT_DX_BUF_SHARED: return{ @@ -81,7 +99,7 @@ bool CLDNNRemoteBlobImpl::is_locked() const noexcept { return lockedHolder != nullptr; } -void CLDNNRemoteBlobImpl::allocate() noexcept { +void CLDNNRemoteBlobImpl::allocate() { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNRemoteBlobImpl::Allocate"); assert(m_memObject == nullptr); @@ -91,13 +109,25 @@ void CLDNNRemoteBlobImpl::allocate() noexcept { switch (m_mem_type) { case BlobType::BT_BUF_INTERNAL: { - m_memObject = eng->allocate_memory(m_layout); + m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::cl_mem); + break; + } + case BlobType::BT_USM_HOST_INTERNAL: { + m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_host); + break; + } + case BlobType::BT_USM_DEVICE_INTERNAL: { + m_memObject = eng->allocate_memory(m_layout, cldnn::allocation_type::usm_device); break; } case BlobType::BT_BUF_SHARED: { m_memObject = eng->share_buffer(m_layout, m_mem); break; } + case BlobType::BT_USM_SHARED: { + m_memObject = eng->share_usm(m_layout, m_mem); + break; + } #ifdef _WIN32 case BlobType::BT_SURF_SHARED: { m_memObject = eng->share_surface(m_layout, m_mem, m_plane); @@ -139,6 +169,9 @@ std::shared_ptr CLDNNRemoteBlobImpl::getContext() const noexcept } void CLDNNRemoteBlobImpl::lock() const { + if (!is_allocated()) { + IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated"; + } lockedHolder = std::unique_ptr>(new cldnn::mem_lock(m_memObject, m_stream)); auto ptr = lockedHolder->data(); _handle = reinterpret_cast(ptr); @@ -295,15 +328,17 @@ std::string CLDNNExecutionContextImpl::getDeviceName() const noexcept { auto engine_type = cldnn::engine_types::ocl; auto runtime_type = cldnn::runtime_types::ocl; - // Use actual runtime and engine types - cldnn::device_query device_query(engine_type, runtime_type); - auto all_devices = device_query.get_available_devices(); - auto current_device = m_engine->get_device(); + try { + // Use actual runtime and engine types + cldnn::device_query device_query(engine_type, runtime_type); + auto all_devices = device_query.get_available_devices(); + auto current_device = m_engine->get_device(); - for (auto& kv : all_devices) { - if (current_device->is_same(kv.second)) - return devName + "." + kv.first; - } + for (auto& kv : all_devices) { + if (current_device->is_same(kv.second)) + return devName + "." + kv.first; + } + } catch (...) { } if (!m_config.device_id.empty()) devName += "." 
+ m_config.device_id; diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.h b/inference-engine/src/cldnn_engine/cldnn_remote_context.h index 19c24540994..f5e179db39b 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.h +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "cldnn_config.h" #include "cldnn_common_utils.h" @@ -37,6 +38,9 @@ public: BT_EMPTY, BT_BUF_INTERNAL, BT_BUF_SHARED, + BT_USM_SHARED, + BT_USM_HOST_INTERNAL, + BT_USM_DEVICE_INTERNAL, BT_IMG_SHARED, BT_SURF_SHARED, BT_DX_BUF_SHARED, @@ -50,7 +54,7 @@ public: uint32_t plane = 0, BlobType mem_type = BT_BUF_INTERNAL); - void allocate() noexcept; + void allocate(); bool deallocate() noexcept; InferenceEngine::ParamMap getParams() const; std::string getDeviceName() const noexcept; @@ -106,7 +110,11 @@ public: : _impl(context, stream, layout, mem, surf, plane, mem_type) , TpublicAPI(desc) {} - void allocate() noexcept override { _impl.allocate(); } + void allocate() noexcept override { + try { + _impl.allocate(); + } catch (...) {} + } bool deallocate() noexcept override { return _impl.deallocate(); } InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); } std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); } @@ -125,6 +133,7 @@ protected: }; using CLDNNRemoteCLbuffer = typedCLDNNRemoteBlob; +using CLDNNRemoteUSMbuffer = typedCLDNNRemoteBlob; using CLDNNRemoteCLImage2D = typedCLDNNRemoteBlob; #ifdef _WIN32 using CLDNNRemoteD3DBuffer = typedCLDNNRemoteBlob; @@ -157,6 +166,10 @@ inline CLDNNRemoteBlobImpl* getBlobImpl(InferenceEngine::gpu::ClBlob* blobPtr) { auto ptr = blobPtr->as(); if (ptr) return ptr->getImpl(); } + { + auto ptr = blobPtr->as(); + if (ptr) return ptr->getImpl(); + } return nullptr; } @@ -204,6 +217,58 @@ public: bool free(void* handle) noexcept override { return true; } }; +class USMHostAllocator : public InferenceEngine::IAllocator { +protected: + InferenceEngine::gpu::USMBlob::Ptr _usm_host_blob = nullptr; + InferenceEngine::gpu::ClContext* _context = nullptr; + +public: + using Ptr = std::shared_ptr; + + USMHostAllocator(InferenceEngine::gpu::ClContext* context) : _context(context) { } + /** + * @brief Maps handle to heap memory accessible by any memory manipulation routines. + * @return Generic pointer to memory + */ + void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override { + if (!_usm_host_blob) + return nullptr; + return _usm_host_blob->get(); + }; + + /** + * @brief Unmaps memory by handle with multiple sequential mappings of the same handle. + * The multiple sequential mappings of the same handle are suppose to get the same + * result while there isn't a ref counter supported. + */ + void unlock(void* handle) noexcept override {} + + /** + * @brief Allocates memory + * @param size The size in bytes to allocate + * @return Handle to the allocated resource + */ + void* alloc(size_t size) noexcept override { + auto td = InferenceEngine::TensorDesc(InferenceEngine::Precision::U8, InferenceEngine::SizeVector{size}, InferenceEngine::Layout::C); + InferenceEngine::ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}}; + _usm_host_blob = std::dynamic_pointer_cast(_context->CreateBlob(td, params)); + _usm_host_blob->allocate(); + return _usm_host_blob->get(); + } + + /** + * @brief Releases handle and all associated memory resources which invalidates the handle. 
+ * @return false if handle cannot be released, otherwise - true. + */ + bool free(void* handle) noexcept override { + try { + _usm_host_blob = nullptr; + } catch(...) { } + return true; + } +}; + + class CLDNNExecutionContextImpl : public InferenceEngine::gpu::details::param_map_obj_getter { public: enum ContextType { @@ -335,6 +400,9 @@ class typedCLDNNExecutionContext : public TpublicContextAPI { case CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED: ret = std::make_shared(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type); break; + case CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED: + ret = std::make_shared(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type); + break; case CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED: layout.format = ImageFormatFromLayout(tensorDesc.getLayout()); ret = std::make_shared(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type); @@ -368,6 +436,21 @@ class typedCLDNNExecutionContext : public TpublicContextAPI { CLDNNRemoteBlobImpl::BlobType::BT_BUF_INTERNAL); } + InferenceEngine::RemoteBlob::Ptr create_usm(const InferenceEngine::TensorDesc& tensorDesc, CLDNNRemoteBlobImpl::BlobType alloc_type) { + cldnn::layout layout(DataTypeFromPrecision(tensorDesc.getPrecision()), + FormatFromLayout(tensorDesc.getLayout()), + CldnnTensorFromIEDims(tensorDesc.getDims())); + auto smart_this = std::dynamic_pointer_cast(this->shared_from_this()); + auto& stream = _impl.GetEngine()->get_program_stream(); + + return std::make_shared(smart_this, + stream, + tensorDesc, + layout, + nullptr, 0, 0, + alloc_type); + } + void check_if_shared() { if (GetType() != CLDNNExecutionContextImpl::ContextType::DEV_SHARED) IE_THROW() << "Shared context is required to to share this type of memory"; @@ -382,9 +465,16 @@ public: const Config& config = {}) : _impl(plugin, params, config) {} - InferenceEngine::ParamMap getParams() const noexcept override { return _impl.getParams(); } + InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); } std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); } + InferenceEngine::MemoryBlob::Ptr CreateHostBlob(const InferenceEngine::TensorDesc& tensorDesc) override { + if (_impl.GetEngine()->use_unified_shared_memory()) + return std::dynamic_pointer_cast(make_blob_with_precision(tensorDesc, std::make_shared(this))); + else + return std::dynamic_pointer_cast(make_blob_with_precision(tensorDesc)); + } + InferenceEngine::RemoteBlob::Ptr CreateBlob(const InferenceEngine::TensorDesc& tensorDesc, const InferenceEngine::ParamMap& params = {}) override { using namespace InferenceEngine; using InferenceEngine::gpu::details::param_map_obj_getter; @@ -395,9 +485,21 @@ public: // user will supply shared object handle std::string memTypeStr = param_map_obj_getter::_StrFromParams(params, GPU_PARAM_KEY(SHARED_MEM_TYPE)); + bool is_usm = memTypeStr == GPU_PARAM_VALUE(USM_HOST_BUFFER) || + memTypeStr == GPU_PARAM_VALUE(USM_DEVICE_BUFFER) || + memTypeStr == GPU_PARAM_VALUE(USM_USER_BUFFER); + + if (is_usm && !_impl.GetEngine()->use_unified_shared_memory()) { + IE_THROW(NotAllocated) << "Can't create USM tensor as USM is not supported (or manually disabled) on current device"; + } + if (GPU_PARAM_VALUE(VA_SURFACE) == memTypeStr) { check_if_shared(); return reuse_surf(tensorDesc, params); + } else if (GPU_PARAM_VALUE(USM_HOST_BUFFER) == memTypeStr) { + return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL); + } else if (GPU_PARAM_VALUE(USM_DEVICE_BUFFER) == memTypeStr) { + 
return create_usm(tensorDesc, CLDNNRemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL); } else { CLDNNRemoteBlobImpl::BlobType blob_type; cldnn::shared_handle mem = nullptr; @@ -405,6 +507,9 @@ public: if (GPU_PARAM_VALUE(OCL_BUFFER) == memTypeStr) { blob_type = CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED; mem = param_map_obj_getter::_ObjFromParamSimple(params, GPU_PARAM_KEY(MEM_HANDLE)); + } else if (GPU_PARAM_VALUE(USM_USER_BUFFER) == memTypeStr) { + blob_type = CLDNNRemoteBlobImpl::BlobType::BT_USM_SHARED; + mem = param_map_obj_getter::_ObjFromParamSimple(params, GPU_PARAM_KEY(MEM_HANDLE)); } else if (GPU_PARAM_VALUE(OCL_IMAGE2D) == memTypeStr) { blob_type = CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED; mem = param_map_obj_getter::_ObjFromParamSimple(params, GPU_PARAM_KEY(MEM_HANDLE)); diff --git a/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp b/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp index 95682bbfc65..352246ed834 100644 --- a/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp +++ b/inference-engine/src/inference_engine/include/ie/gpu/gpu_context_api_ocl.hpp @@ -131,6 +131,49 @@ public: } }; +/** + * @brief This class represents an abstraction for GPU plugin remote blob + * which can be shared with user-supplied USM pointer. + * The plugin object derived from this class can be obtained with CreateBlob() call. + * @note User can obtain USM pointer from this class. + */ +class USMBlob : public ClBlob, public details::param_map_obj_getter { +public: + /** + * @brief A smart pointer to the ClBufferBlob object + */ + using Ptr = std::shared_ptr; + + /** + * @brief Creates a ClBufferBlob object with the specified dimensions and layout. + * @param tensorDesc Tensor description + */ + explicit USMBlob(const TensorDesc& tensorDesc) : ClBlob(tensorDesc) {} + + /** + * @brief Returns the underlying OpenCL memory object handle. + * @return underlying OpenCL memory object handle + */ + void* get() { + const auto& params = getParams(); + auto itrType = params.find(GPU_PARAM_KEY(SHARED_MEM_TYPE)); + if (itrType == params.end()) + IE_THROW() << "Parameter of type " << GPU_PARAM_KEY(SHARED_MEM_TYPE) << " not found"; + + auto mem_type = itrType->second.as(); + if (mem_type != GPU_PARAM_VALUE(USM_USER_BUFFER) && mem_type != GPU_PARAM_VALUE(USM_HOST_BUFFER) && + mem_type != GPU_PARAM_VALUE(USM_DEVICE_BUFFER)) + IE_THROW() << "Unexpected USM blob type: " << mem_type; + + auto itrHandle = params.find(GPU_PARAM_KEY(MEM_HANDLE)); + if (itrHandle == params.end()) { + IE_THROW() << "No parameter " << GPU_PARAM_KEY(MEM_HANDLE) << " found"; + } + + return itrHandle->second.as(); + } +}; + /** * @brief This class represents an abstraction for GPU plugin remote blob * which can be shared with user-supplied OpenCL 2D Image. 
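Editor's note: the hunks above add USM blob types to CLDNNExecutionContextImpl::CreateBlob and expose them to users through the new InferenceEngine::gpu::USMBlob wrapper. A minimal sketch of the intended usage with the InferenceEngine 1.0 API follows; the helper name, input name, and tensor description are assumptions for illustration, not part of the patch, and USM availability on the device is assumed.

#include <cstring>
#include <memory>
#include <string>

#include <ie_core.hpp>
#include <gpu/gpu_context_api_ocl.hpp>

// Sketch: allocate a USM host blob through the GPU remote context and use it as a network input.
void infer_with_usm_host_blob(InferenceEngine::ExecutableNetwork& exec_net,
                              const std::string& input_name,
                              const InferenceEngine::TensorDesc& desc) {
    auto ctx = exec_net.GetContext();
    InferenceEngine::ParamMap params = {
        {GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
    auto usm_blob =
        std::dynamic_pointer_cast<InferenceEngine::gpu::USMBlob>(ctx->CreateBlob(desc, params));
    usm_blob->allocate();
    // get() exposes the raw USM pointer, which is directly writable from host code.
    std::memset(usm_blob->get(), 0, usm_blob->byteSize());

    auto req = exec_net.CreateInferRequest();
    req.SetBlob(input_name, usm_blob);
    req.Infer();
}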
diff --git a/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp b/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp index d7e36c95ac5..36f8014ed63 100644 --- a/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp +++ b/inference-engine/src/inference_engine/include/ie/gpu/gpu_params.hpp @@ -98,7 +98,18 @@ DECLARE_GPU_PARAM_VALUE(OCL_BUFFER); * @brief Shared OpenCL 2D image blob */ DECLARE_GPU_PARAM_VALUE(OCL_IMAGE2D); - +/** + * @brief Shared USM pointer allocated by user + */ +DECLARE_GPU_PARAM_VALUE(USM_USER_BUFFER); +/** + * @brief Shared USM pointer type with host allocation type allocated by plugin + */ +DECLARE_GPU_PARAM_VALUE(USM_HOST_BUFFER); +/** + * @brief Shared USM pointer type with device allocation type allocated by plugin + */ +DECLARE_GPU_PARAM_VALUE(USM_DEVICE_BUFFER); /** * @brief Shared video decoder surface or D3D 2D texture blob */ diff --git a/inference-engine/src/inference_engine/include/ie/ie_remote_context.hpp b/inference-engine/src/inference_engine/include/ie/ie_remote_context.hpp index 0fee1c86808..31ec2d7f6a8 100644 --- a/inference-engine/src/inference_engine/include/ie/ie_remote_context.hpp +++ b/inference-engine/src/inference_engine/include/ie/ie_remote_context.hpp @@ -23,7 +23,7 @@ namespace InferenceEngine { * Such context represents a scope on the device within which executable * networks and remote memory blobs can exist, function and exchange data. */ -class RemoteContext : public std::enable_shared_from_this { +class INFERENCE_ENGINE_API_CLASS(RemoteContext) : public std::enable_shared_from_this { public: /** * @brief A smart pointer to the RemoteContext object @@ -110,6 +110,14 @@ public: */ virtual RemoteBlob::Ptr CreateBlob(const TensorDesc& tensorDesc, const ParamMap& params = {}) = 0; + /** + * @brief Allocates host accessible memory blob friendly for the device in current context + * Returns a pointer to the object which implements MemoryBlob interface. + * @param tensorDesc Defines the layout and dims of the blob + * @return A pointer to host accessible MemoryBlob object + */ + virtual MemoryBlob::Ptr CreateHostBlob(const TensorDesc& tensorDesc); + /** * @brief Returns a map of device-specific parameters required for low-level * operations with underlying object. diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/dx.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/dx.hpp index 8da0821145a..e94ab133c6c 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/dx.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/dx.hpp @@ -102,9 +102,10 @@ public: * @note User can also obtain OpenCL context handle from this class. 
*/ class D3DContext : public ClContext { - using RemoteContext::create_tensor; - public: + // Needed to make create_tensor overloads from base class visible for user + using ClContext::create_tensor; + /** * @brief Checks that type defined runtime paramters are presented in remote object * @param remote_context remote context to check diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/ocl.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/ocl.hpp index d205b7f0548..4477c87873d 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/ocl.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/ocl.hpp @@ -117,6 +117,36 @@ public: } }; +/** + * @brief This class represents an abstraction for GPU plugin remote tensor + * which can be shared with user-supplied USM device pointer. + * The plugin object derived from this class can be obtained with ClContext::create_tensor() call. + * @note User can obtain USM pointer from this class. + */ +class USMTensor : public RemoteTensor { +public: + /** + * @brief Checks that type defined runtime paramters are presented in remote object + * @param tensor a tensor to check + */ + static void type_check(const Tensor& tensor) { + RemoteTensor::type_check(tensor, + {{GPU_PARAM_KEY(MEM_HANDLE), {}}, + {GPU_PARAM_KEY(SHARED_MEM_TYPE), + {GPU_PARAM_VALUE(USM_USER_BUFFER), + GPU_PARAM_VALUE(USM_HOST_BUFFER), + GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}}}); + } + + /** + * @brief Returns the underlying USM pointer. + * @return underlying USM pointer + */ + void* get() { + return static_cast(get_params().at(GPU_PARAM_KEY(MEM_HANDLE)).as()); + } +}; + /** * @brief This class represents an abstraction for GPU plugin remote context * which is shared with OpenCL context object. 
@@ -125,14 +155,14 @@ public: */ class ClContext : public RemoteContext { protected: - using RemoteContext::create_tensor; - /** * @brief GPU device name */ static constexpr const char* device_name = "GPU"; public: + // Needed to make create_tensor overloads from base class visible for user + using RemoteContext::create_tensor; /** * @brief Checks that type defined runtime paramters are presented in remote object * @param remote_context remote context to check @@ -220,7 +250,7 @@ public: * @brief This function is used to obtain remote tensor object from user-supplied cl_mem object * @param type Tensor element type * @param shape Tensor shape - * @param buffer A cl_mem object wrapped by a remote tensor + * @param buffer A cl_mem object that should be wrapped by a remote tensor * @return A remote tensor instance */ ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl_mem buffer) { @@ -233,7 +263,7 @@ public: * @brief This function is used to obtain remote tensor object from user-supplied cl::Buffer object * @param type Tensor element type * @param shape Tensor shape - * @param buffer A cl::Buffer object wrapped by a remote tensor + * @param buffer A cl::Buffer object that should be wrapped by a remote tensor * @return A remote tensor instance */ ClBufferTensor create_tensor(const element::Type type, const Shape& shape, const cl::Buffer& buffer) { @@ -244,7 +274,7 @@ public: * @brief This function is used to obtain remote tensor object from user-supplied cl::Image2D object * @param type Tensor element type * @param shape Tensor shape - * @param image A cl::Image2D object wrapped by a remote tensor + * @param image A cl::Image2D object that should be wrapped by a remote tensor * @return A remote tensor instance */ ClImage2DTensor create_tensor(const element::Type type, const Shape& shape, const cl::Image2D& image) { @@ -252,7 +282,43 @@ public: {GPU_PARAM_KEY(MEM_HANDLE), static_cast(image.get())}}; return create_tensor(type, shape, params); } + + /** + * @brief This function is used to obtain remote tensor object from user-supplied USM pointer + * @param type Tensor element type + * @param shape Tensor shape + * @param usm_ptr A USM pointer that should be wrapped by a remote tensor + * @return A remote tensor instance + */ + USMTensor create_tensor(const element::Type type, const Shape& shape, void* usm_ptr) { + ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER)}, + {GPU_PARAM_KEY(MEM_HANDLE), static_cast(usm_ptr)}}; + return create_tensor(type, shape, params); + } + + /** + * @brief This function is used to allocate USM tensor with host allocation type + * @param type Tensor element type + * @param shape Tensor shape + * @return A remote tensor instance + */ + USMTensor create_usm_host_tensor(const element::Type type, const Shape& shape) { + ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}}; + return create_tensor(type, shape, params); + } + + /** + * @brief This function is used to allocate USM tensor with device allocation type + * @param type Tensor element type + * @param shape Tensor shape + * @return A remote tensor instance + */ + USMTensor create_usm_device_tensor(const element::Type type, const Shape& shape) { + ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER)}}; + return create_tensor(type, shape, params); + } }; + } // namespace ocl } // namespace gpu } // namespace runtime diff --git 
a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/va.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/va.hpp index 91f6c037f69..45e8611077c 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/va.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/gpu/ocl/va.hpp @@ -72,9 +72,10 @@ public: * @note User can also obtain OpenCL context handle from this class. */ class VAContext : public ClContext { - using RemoteContext::create_tensor; - public: + // Needed to make create_tensor overloads from base class visible for user + using ClContext::create_tensor; + /** * @brief Checks that type defined runtime paramters are presented in remote object * @param remote_context remote context to check diff --git a/inference-engine/src/inference_engine/include/openvino/runtime/remote_context.hpp b/inference-engine/src/inference_engine/include/openvino/runtime/remote_context.hpp index 73831801d11..73b27110bb3 100644 --- a/inference-engine/src/inference_engine/include/openvino/runtime/remote_context.hpp +++ b/inference-engine/src/inference_engine/include/openvino/runtime/remote_context.hpp @@ -136,6 +136,16 @@ public: * @return A map of name/parameter elements. */ ParamMap get_params() const; + + /** + * @brief This function is used to create host tensor object friendly for the device in current context + * For example, GPU context may allocate USM host memory (if corresponding extension is available) + * which could be more efficient than regular host memory. + * @param type Tensor element type + * @param shape Tensor shape + * @return A Tensor instance with device friendly memory + */ + Tensor create_host_tensor(const element::Type type, const Shape& shape); }; } // namespace runtime diff --git a/inference-engine/src/inference_engine/src/cpp/ie_remote_context.cpp b/inference-engine/src/inference_engine/src/cpp/ie_remote_context.cpp index cf5974dd22e..1c3942bfc10 100644 --- a/inference-engine/src/inference_engine/src/cpp/ie_remote_context.cpp +++ b/inference-engine/src/inference_engine/src/cpp/ie_remote_context.cpp @@ -69,6 +69,15 @@ RemoteTensor RemoteContext::create_tensor(const element::Type& element_type, }); } +Tensor RemoteContext::create_host_tensor(const element::Type element_type, const Shape& shape) { + OV_REMOTE_CONTEXT_STATEMENT({ + auto blob = _impl->CreateHostBlob( + {ie::details::convertPrecision(element_type), shape, ie::TensorDesc::getLayoutByRank(shape.size())}); + blob->allocate(); + return {_so, blob}; + }); +} + ie::ParamMap RemoteContext::get_params() const { OV_REMOTE_CONTEXT_STATEMENT(return _impl->getParams()); } diff --git a/inference-engine/src/inference_engine/src/ie_remote_context.cpp b/inference-engine/src/inference_engine/src/ie_remote_context.cpp new file mode 100644 index 00000000000..fbebf9fe83d --- /dev/null +++ b/inference-engine/src/inference_engine/src/ie_remote_context.cpp @@ -0,0 +1,22 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ie_remote_context.hpp" + +#include +#include + +#include "blob_factory.hpp" + +namespace InferenceEngine { + +MemoryBlob::Ptr RemoteContext::CreateHostBlob(const TensorDesc& tensorDesc) { + auto blob = std::dynamic_pointer_cast(make_blob_with_precision(tensorDesc)); + if (!blob) + IE_THROW(NotAllocated) << "Failed to create host blob in remote context for " << getDeviceName() << " device"; + + return blob; +} + +} // namespace InferenceEngine diff --git 
a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp index 968fa18d40f..95aecd6b357 100644 --- a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp +++ b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp @@ -84,6 +84,61 @@ TEST_F(RemoteBlob_Test, smoke_canInputUserBlob) { } } + +TEST_F(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) { +#if defined(ANDROID) + GTEST_SKIP(); +#endif + CNNNetwork net(fn_ptr); + + net.getInputsInfo().begin()->second->setLayout(Layout::NCHW); + net.getInputsInfo().begin()->second->setPrecision(Precision::U8); + + // TODO: Issue: investigate issue with IECore + auto ie = InferenceEngine::Core(); + auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU); + + // regular inference + auto inf_req_regular = exec_net.CreateInferRequest(); + InferenceEngine::Blob::Ptr fakeImageData = FuncTestUtils::createAndFillBlob( + net.getInputsInfo().begin()->second->getTensorDesc()); + inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData); + + inf_req_regular.Infer(); + auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first); + + // inference using remote blob + auto inf_req_shared = exec_net.CreateInferRequest(); + auto cldnn_context = exec_net.GetContext(); + cl_context ctx = std::dynamic_pointer_cast(cldnn_context)->get(); + auto ocl_instance = std::make_shared(ctx); + + auto dims = net.getInputsInfo().begin()->second->getTensorDesc().getDims(); + size_t imSize = dims[1] * dims[2] * dims[3]; + + Blob::Ptr shared_blob = make_shared_blob(net.getInputsInfo().begin()->second->getTensorDesc(), cldnn_context); + shared_blob->allocate(); + { + cl::Buffer shared_buffer = *shared_blob->as(); + void *buffer = fakeImageData->buffer(); + ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer); + } + + inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_blob); + + inf_req_shared.Infer(); + auto outputBlob_shared = inf_req_shared.GetBlob(net.getOutputsInfo().begin()->first); + + // compare results + { + ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32); + ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size()); + auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32); + FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr); + } +} + + TEST_F(RemoteBlob_Test, smoke_canInferOnUserContext) { auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat(); CNNNetwork net(fn_ptr); diff --git a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp index b593c29183d..97bf9c5512d 100644 --- a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp +++ b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp @@ -30,7 +30,46 @@ protected: } }; -TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) { +enum class RemoteTensorSharingType { + USER_CL_TENSOR = 0, + PLUGIN_CL_TENSOR = 1, + USER_USM_HOST_TENSOR = 2, + USER_USM_DEVICE_TENSOR = 3, + PLUGIN_USM_HOST_TENSOR = 4, + PLUGIN_USM_DEVICE_TENSOR = 5, + PLUGIN_HOST_TENSOR = 6 +}; + +std::ostream& operator<<(std::ostream& stream, 
RemoteTensorSharingType sharing_type) { + switch (sharing_type) { + case RemoteTensorSharingType::USER_CL_TENSOR: stream << "USER_CL_TENSOR"; break; + case RemoteTensorSharingType::PLUGIN_CL_TENSOR: stream << "PLUGIN_CL_TENSOR"; break; + case RemoteTensorSharingType::USER_USM_HOST_TENSOR: stream << "USER_USM_HOST_TENSOR"; break; + case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: stream << "USER_USM_DEVICE_TENSOR"; break; + case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: stream << "PLUGIN_USM_HOST_TENSOR"; break; + case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: stream << "PLUGIN_USM_DEVICE_TENSOR"; break; + case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: stream << "PLUGIN_HOST_TENSOR"; break; + } + + return stream; +} + +class OVRemoteTensorInputBlob_Test : public OVRemoteTensor_Test, public testing::WithParamInterface { +public: + void SetUp() override { + fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat(); + } + + static std::string getTestCaseName(testing::TestParamInfo obj) { + RemoteTensorSharingType sharing_type = obj.param; + + std::ostringstream result; + result << sharing_type; + return result.str(); + } +}; + +TEST_P(OVRemoteTensorInputBlob_Test, smoke_canInputRemoteTensor) { #if defined(ANDROID) GTEST_SKIP(); #endif @@ -45,6 +84,8 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) { auto exec_net = ie.compile_model(function, CommonTestUtils::DEVICE_GPU); + RemoteTensorSharingType sharing_type = GetParam(); + // regular inference auto inf_req_regular = exec_net.create_infer_request(); auto input = function->get_parameters().at(0); @@ -65,16 +106,129 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) { auto imSize = ov::shape_size(input->get_shape()); - cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err); - { - void* buffer = fakeImageData.data(); - ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer); + switch (sharing_type) { + case RemoteTensorSharingType::USER_CL_TENSOR: { + cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err); + { + void* buffer = fakeImageData.data(); + ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer); + } + + auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer); + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + break; + } + case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: { + if (!ocl_instance->supports_usm()) + GTEST_SKIP(); + + void* shared_buffer = ocl_instance->allocate_usm_device_buffer(imSize); + { + void* buffer = fakeImageData.data(); + err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr); + if (err != CL_SUCCESS) + FAIL() << "Failed to copy data from host buffer to USM device"; + } + + auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer); + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + ocl_instance->free_mem(shared_buffer); + + break; + } + case RemoteTensorSharingType::USER_USM_HOST_TENSOR: { + if (!ocl_instance->supports_usm()) + GTEST_SKIP(); + + void* shared_buffer = ocl_instance->allocate_usm_host_buffer(imSize); + { + void* buffer = fakeImageData.data(); + std::memcpy(shared_buffer, buffer, imSize); + } + + auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer); + 
inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + ocl_instance->free_mem(shared_buffer); + + break; + } + case RemoteTensorSharingType::PLUGIN_CL_TENSOR: { + auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape()); + ASSERT_TRUE(cldnn_tensor.is()); + auto cl_tensor = cldnn_tensor.as(); + { + cl::Buffer shared_buffer = cl_tensor; + void* buffer = fakeImageData.data(); + ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer); + } + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + break; + } + case RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR: { + if (!ocl_instance->supports_usm()) + GTEST_SKIP(); + + auto cldnn_tensor = cldnn_context.create_usm_host_tensor(input->get_element_type(), input->get_shape()); + ASSERT_TRUE(cldnn_tensor.is()); + { + auto cl_tensor = cldnn_tensor.as(); + void* shared_buffer = cl_tensor.get(); + ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL); + void* buffer = fakeImageData.data(); + std::memcpy(shared_buffer, buffer, imSize); + } + + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + break; + } + case RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR: { + if (!ocl_instance->supports_usm()) + GTEST_SKIP(); + + auto cldnn_tensor = cldnn_context.create_usm_device_tensor(input->get_element_type(), input->get_shape()); + ASSERT_TRUE(cldnn_tensor.is()); + { + auto cl_tensor = cldnn_tensor.as(); + void* shared_buffer = cl_tensor.get(); + ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_DEVICE_INTEL); + void* buffer = fakeImageData.data(); + err = ocl_instance->memcpy(ocl_instance->_queue, shared_buffer, buffer, imSize, true, nullptr, nullptr); + if (err != CL_SUCCESS) + FAIL() << "Failed to copy data from host buffer to USM device"; + } + + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + break; + } + case RemoteTensorSharingType::PLUGIN_HOST_TENSOR: { + auto cldnn_tensor = cldnn_context.create_host_tensor(input->get_element_type(), input->get_shape()); + { + ASSERT_NO_THROW(cldnn_tensor.data()); + void* shared_buffer = cldnn_tensor.data(); + if (ocl_instance->supports_usm()) + ASSERT_EQ(ocl_instance->get_allocation_type(shared_buffer), CL_MEM_TYPE_HOST_INTEL); + void* buffer = fakeImageData.data(); + std::memcpy(shared_buffer, buffer, imSize); + } + + inf_req_shared.set_tensor(input, cldnn_tensor); + inf_req_shared.infer(); + + break; + } } - auto cldnn_tensor = cldnn_context.create_tensor(input->get_element_type(), input->get_shape(), shared_buffer); - inf_req_shared.set_tensor(input, cldnn_tensor); - - inf_req_shared.infer(); auto output_tensor_shared = inf_req_shared.get_tensor(output); // compare results @@ -88,6 +242,18 @@ TEST_F(OVRemoteTensor_Test, smoke_canInputUserTensor) { } } +INSTANTIATE_TEST_SUITE_P( + smoke_GPU, + OVRemoteTensorInputBlob_Test, + ::testing::ValuesIn(std::vector{RemoteTensorSharingType::USER_CL_TENSOR, + RemoteTensorSharingType::PLUGIN_CL_TENSOR, + RemoteTensorSharingType::USER_USM_HOST_TENSOR, + RemoteTensorSharingType::USER_USM_DEVICE_TENSOR, + RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR, + RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR, + RemoteTensorSharingType::PLUGIN_HOST_TENSOR}), + OVRemoteTensorInputBlob_Test::getTestCaseName); + TEST_F(OVRemoteTensor_Test, smoke_canInferOnUserContext) { auto ie = ov::runtime::Core(); diff --git 
a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/remote_blob_helpers.hpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/remote_blob_helpers.hpp index 5704797917e..0ff3ec4aeff 100644 --- a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/remote_blob_helpers.hpp +++ b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/remote_blob_helpers.hpp @@ -18,10 +18,57 @@ #endif #include +namespace { +template +T load_entrypoint(const cl_platform_id platform, const std::string name) { +#if defined(__GNUC__) && __GNUC__ < 5 +// OCL spec says: +// "The function clGetExtensionFunctionAddressForPlatform returns the address of the extension function named by funcname for a given platform. +// The pointer returned should be cast to a function pointer type matching the extension function's definition defined in the appropriate extension +// specification and header file." +// So the pointer-to-object to pointer-to-function cast below is supposed to be valid, thus we suppress warning from old GCC versions. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + T p = reinterpret_cast(clGetExtensionFunctionAddressForPlatform(platform, name.c_str())); +#if defined(__GNUC__) && __GNUC__ < 5 +#pragma GCC diagnostic pop +#endif + if (!p) { + throw std::runtime_error("clGetExtensionFunctionAddressForPlatform(" + name + ") returned NULL."); + } + return p; +} + +template +T try_load_entrypoint(const cl_platform_id platform, const std::string name) { + try { + return load_entrypoint(platform, name); + } catch (...) { + return nullptr; + } +} +} // namespace + struct OpenCL { cl::Context _context; cl::Device _device; cl::CommandQueue _queue; + cl_platform_id _platform; + + clHostMemAllocINTEL_fn _host_mem_alloc_fn = nullptr; + clMemFreeINTEL_fn _mem_free_fn = nullptr; + clDeviceMemAllocINTEL_fn _device_mem_alloc_fn = nullptr; + clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr; + clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr; + + void init_extension_functions(cl_platform_id platform) { + _host_mem_alloc_fn = try_load_entrypoint(platform, "clHostMemAllocINTEL"); + _device_mem_alloc_fn = try_load_entrypoint(platform, "clDeviceMemAllocINTEL"); + _mem_free_fn = try_load_entrypoint(platform, "clMemFreeINTEL"); + _enqueue_memcpy_fn = try_load_entrypoint(platform, "clEnqueueMemcpyINTEL"); + _get_mem_alloc_info_fn = try_load_entrypoint(platform, "clGetMemAllocInfoINTEL"); + } explicit OpenCL(std::shared_ptr> media_api_context_properties = nullptr) { // get Intel iGPU OCL device, create context and queue @@ -42,12 +89,15 @@ struct OpenCL { if (refVendorID == d.getInfo()) { _device = d; _context = cl::Context(_device); + _platform = id; break; } } } cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; _queue = cl::CommandQueue(_context, _device, props); + + init_extension_functions(_platform); } } @@ -56,7 +106,81 @@ struct OpenCL { _context = cl::Context(context, true); _device = cl::Device(_context.getInfo()[0].get(), true); + cl_int error = clGetDeviceInfo(_device.get(), CL_DEVICE_PLATFORM, sizeof(_platform), &_platform, nullptr); + if (error) { + throw std::runtime_error("OpenCL helper failed to retrieve CL_DEVICE_PLATFORM: " + std::to_string(error)); + } + cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; _queue = cl::CommandQueue(_context, _device, props); + + init_extension_functions(_platform); + } + + bool supports_usm() const { + return _host_mem_alloc_fn != nullptr && 
+ _device_mem_alloc_fn != nullptr && + _mem_free_fn != nullptr && + _enqueue_memcpy_fn != nullptr && + _get_mem_alloc_info_fn != nullptr; + } + + void* allocate_usm_host_buffer(size_t size) const { + cl_int err_code_ret; + if (!_device_mem_alloc_fn) + throw std::runtime_error("[GPU] clHostMemAllocINTEL is nullptr"); + auto ret_ptr = _host_mem_alloc_fn(_context.get(), nullptr, size, 0, &err_code_ret); + if (err_code_ret != CL_SUCCESS) + throw std::runtime_error("OpenCL helper failed to allocate USM host memory"); + return ret_ptr; + } + + void* allocate_usm_device_buffer(size_t size) const { + cl_int err_code_ret; + if (!_device_mem_alloc_fn) + throw std::runtime_error("[GPU] clDeviceMemAllocINTEL is nullptr"); + auto ret_ptr = _device_mem_alloc_fn(_context.get(), _device.get(), nullptr, size, 0, &err_code_ret); + if (err_code_ret != CL_SUCCESS) + throw std::runtime_error("OpenCL helper failed to allocate USM device memory"); + return ret_ptr; + } + + void free_mem(void* usm_ptr) { + if (!_mem_free_fn) + throw std::runtime_error("[GPU] clMemFreeINTEL is nullptr"); + + _mem_free_fn(_context.get(), usm_ptr); + } + + cl_int memcpy(const cl::CommandQueue& cpp_queue, void *dst_ptr, const void *src_ptr, + size_t bytes_count, bool blocking = true, const std::vector* wait_list = nullptr, cl::Event* ret_event = nullptr) const { + if (!_enqueue_memcpy_fn) + throw std::runtime_error("[GPU] clEnqueueMemcpyINTEL is nullptr"); + cl_event tmp; + cl_int err = _enqueue_memcpy_fn( + cpp_queue.get(), + static_cast(blocking), + dst_ptr, + src_ptr, + bytes_count, + wait_list == nullptr ? 0 : static_cast(wait_list->size()), + wait_list == nullptr ? nullptr : reinterpret_cast(&wait_list->front()), + ret_event == nullptr ? nullptr : &tmp); + + if (ret_event != nullptr && err == CL_SUCCESS) + *ret_event = tmp; + + return err; + } + + cl_unified_shared_memory_type_intel get_allocation_type(const void* usm_ptr) const { + if (!_get_mem_alloc_info_fn) { + throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr"); + } + + cl_unified_shared_memory_type_intel ret_val; + size_t ret_val_size; + _get_mem_alloc_info_fn(_context.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size); + return ret_val; } }; diff --git a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp index 8114009dd9a..acc57d689ea 100644 --- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp +++ b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp @@ -62,6 +62,9 @@ public: /// Create shared memory object using user-supplied memory buffer @p buf using specified @p layout memory_ptr share_buffer(const layout& layout, shared_handle buf); + /// Create shared memory object using user-supplied USM pointer @p usm_ptr using specified @p layout + memory_ptr share_usm(const layout& layout, shared_handle usm_ptr); + /// Create shared memory object using user-supplied 2D image @p img using specified @p layout memory_ptr share_image(const layout& layout, shared_handle img); diff --git a/inference-engine/thirdparty/clDNN/runtime/engine.cpp b/inference-engine/thirdparty/clDNN/runtime/engine.cpp index 7e40a4ebf52..df15924c1c3 100644 --- a/inference-engine/thirdparty/clDNN/runtime/engine.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/engine.cpp @@ -92,6 +92,17 @@ memory_ptr engine::share_buffer(const layout& layout, shared_handle buf) { return reinterpret_handle(layout, params); } +memory_ptr 
engine::share_usm(const layout& layout, shared_handle usm_ptr) { + shared_mem_params params = { shared_mem_type::shared_mem_usm, nullptr, nullptr, usm_ptr, +#ifdef _WIN32 + nullptr, +#else + 0, +#endif + 0 }; + return reinterpret_handle(layout, params); +} + memory::ptr engine::share_image(const layout& layout, shared_handle img) { shared_mem_params params = { shared_mem_type::shared_mem_image, nullptr, nullptr, img, #ifdef _WIN32 diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp index 5b67c37e8ff..a0b2774ef23 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp @@ -168,6 +168,9 @@ memory::ptr ocl_engine::reinterpret_handle(const layout& new_layout, shared_mem_ } else if (params.mem_type == shared_mem_type::shared_mem_buffer) { cl::Buffer buf(static_cast(params.mem), true); return std::make_shared(this, new_layout, buf); + } else if (params.mem_type == shared_mem_type::shared_mem_usm) { + cl::UsmMemory usm_buffer(get_usm_helper(), params.mem); + return std::make_shared(this, new_layout, usm_buffer); } else { throw std::runtime_error("unknown shared object fromat or type"); } diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp index a8535913603..c6a96460404 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp @@ -524,6 +524,7 @@ public: _enqueue_memcpy_fn = try_load_entrypoint(_ctx.get(), "clEnqueueMemcpyINTEL"); _enqueue_mem_fill_fn = try_load_entrypoint(_ctx.get(), "clEnqueueMemFillINTEL"); _enqueue_memset_fn = try_load_entrypoint(_ctx.get(), "clEnqueueMemsetINTEL"); + _get_mem_alloc_info_fn = try_load_entrypoint(_ctx.get(), "clGetMemAllocInfoINTEL"); } } @@ -621,6 +622,17 @@ public: return err; } + cl_unified_shared_memory_type_intel get_usm_allocation_type(const void* usm_ptr) const { + if (!_get_mem_alloc_info_fn) { + throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr"); + } + + cl_unified_shared_memory_type_intel ret_val; + size_t ret_val_size; + _get_mem_alloc_info_fn(_ctx.get(), usm_ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_unified_shared_memory_type_intel), &ret_val, &ret_val_size); + return ret_val; + } + private: cl::Context _ctx; cl::Device _device; @@ -632,6 +644,7 @@ private: clEnqueueMemcpyINTEL_fn _enqueue_memcpy_fn = nullptr; clEnqueueMemFillINTEL_fn _enqueue_mem_fill_fn = nullptr; clEnqueueMemsetINTEL_fn _enqueue_memset_fn = nullptr; + clGetMemAllocInfoINTEL_fn _get_mem_alloc_info_fn = nullptr; }; /* @@ -640,11 +653,16 @@ private: */ class UsmHolder { public: - UsmHolder(const cl::UsmHelper& usmHelper, void* ptr) : _usmHelper(usmHelper), _ptr(ptr) { } + UsmHolder(const cl::UsmHelper& usmHelper, void* ptr, bool shared_memory = false) + : _usmHelper(usmHelper) + , _ptr(ptr) + , _shared_memory(shared_memory) { } + void* ptr() { return _ptr; } ~UsmHolder() { try { - _usmHelper.free_mem(_ptr); + if (!_shared_memory) + _usmHelper.free_mem(_ptr); } catch (...) { // Exception may happen only when clMemFreeINTEL function is unavailable, thus can't free memory properly } @@ -652,6 +670,7 @@ public: private: const cl::UsmHelper& _usmHelper; void* _ptr; + bool _shared_memory = false; }; /* USM base class. Different usm types should derive from this class. 
@@ -659,6 +678,13 @@ private: class UsmMemory { public: explicit UsmMemory(const cl::UsmHelper& usmHelper) : _usmHelper(usmHelper) { } + UsmMemory(const cl::UsmHelper& usmHelper, void* usm_ptr) + : _usmHelper(usmHelper) + , _usm_pointer(std::make_shared(_usmHelper, usm_ptr, true)) { + if (!usm_ptr) { + throw std::runtime_error("[GPU] Can't share null usm pointer"); + } + } // Get methods returns original pointer allocated by openCL. void* get() const { return _usm_pointer->ptr(); } diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp index 27e9331a74f..3bd44357aa4 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp @@ -279,6 +279,12 @@ gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemo , _buffer(buffer) { } +gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& buffer) + : lockable_gpu_mem() + , memory(engine, new_layout, detect_allocation_type(engine, buffer), true) + , _buffer(buffer) { +} + gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type) : lockable_gpu_mem() , memory(engine, layout, type, false) @@ -393,6 +399,20 @@ shared_mem_params gpu_usm::get_internal_params() const { }; } +allocation_type gpu_usm::detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer) { + auto cl_alloc_type = engine->get_usm_helper().get_usm_allocation_type(buffer.get()); + + allocation_type res = allocation_type::unknown; + switch (cl_alloc_type) { + case CL_MEM_TYPE_DEVICE_INTEL: res = allocation_type::usm_device; break; + case CL_MEM_TYPE_HOST_INTEL: res = allocation_type::usm_host; break; + case CL_MEM_TYPE_SHARED_INTEL: res = allocation_type::usm_shared; break; + default: throw std::runtime_error("[GPU] Unsupported USM alloc type: " + std::to_string(cl_alloc_type)); + } + + return res; +} + std::vector ocl_surfaces_lock::get_handles(std::vector mem) const { std::vector res; for (auto& m : mem) { diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp index 1ef23a81963..fa89bf42eb2 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp @@ -100,6 +100,7 @@ private: struct gpu_usm : public lockable_gpu_mem, public memory { gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer, allocation_type type); + gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer); gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type); void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override; @@ -120,6 +121,8 @@ struct gpu_usm : public lockable_gpu_mem, public memory { protected: cl::UsmMemory _buffer; + + static allocation_type detect_allocation_type(ocl_engine* engine, const cl::UsmMemory& buffer); }; struct ocl_surfaces_lock : public surfaces_lock {