From f68dd9041336cdbb29f65cf4525fca7e7832e326 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Thu, 13 Jan 2022 10:52:59 +0300
Subject: [PATCH] [GPU] Get rid of PreAllocator usages to allow setShape (#9576)

---
 .../intel_gpu/plugin/infer_request.hpp        |   5 +-
 .../intel_gpu/src/plugin/infer_request.cpp    | 116 ++++++------------
 .../intel_gpu/src/plugin/remote_context.cpp   |  15 +++
 .../intel_gpu/src/runtime/ocl/ocl_engine.cpp  |  12 ++
 .../intel_gpu/src/runtime/ocl/ocl_ext.hpp     |  11 ++
 .../plugin/gpu/behavior/infer_request.cpp     |  40 ++++++
 .../gpu_remote_tensor_tests.cpp               |  51 ++++++++
 7 files changed, 172 insertions(+), 78 deletions(-)
 create mode 100644 src/tests/functional/plugin/gpu/behavior/infer_request.cpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp
index 3602a1e3f52..98d031bea40 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp
@@ -89,7 +89,8 @@ private:
                        std::vector<cldnn::event::ptr>& dependencies);
     void prepare_output(const cldnn::primitive_id& outputName, InferenceEngine::Blob::Ptr& outputBlob);
 
-    InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc, uint8_t* mem_ptr = nullptr);
+    InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc,
+                                                std::shared_ptr<InferenceEngine::IAllocator> alloc = nullptr);
     InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout);
 
     void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr);
@@ -97,7 +98,7 @@ private:
                          const cldnn::layout& inputLayout,
                          const InferenceEngine::Blob &inputBlob,
                          buf_info* bi = nullptr);
-    InferenceEngine::Blob::Ptr host_blob_from_device_blob(const InferenceEngine::Blob::Ptr blobPtr);
+    InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem);
     void allocate_inputs();
     void allocate_outputs();
     void allocate_inputs_dynamic();
diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
index a746ce9b50e..a972e117fef 100644
--- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
@@ -317,7 +317,6 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
             // during pre-processing
             if (_inputs[name]->is<gpu::ClBlob>()) {
                 Blob::Ptr inputHostBlob = create_host_blob(desc);
-                inputHostBlob->allocate();
                 _inputs[name] = inputHostBlob;
             }
             _preProcData[name] = CreatePreprocDataHelper();
@@ -787,59 +786,27 @@ void InferRequest::setup_stream_graph() {
     m_graph = streamGraphs[streamID];
 }
 
-Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, uint8_t* mem_ptr) {
+Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, std::shared_ptr<InferenceEngine::IAllocator> alloc) {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::create_host_blob");
-    const Precision& p = desc.getPrecision();
+    auto blob = make_blob_with_precision(desc, alloc ? alloc : CreateDefaultAllocator());
+    blob->allocate();
+    return blob;
+}
 
-    switch (p) {
-    case Precision::FP32:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<float>(desc, reinterpret_cast<float*>(mem_ptr));
-        else
-            return make_shared_blob<float>(desc);
-    case Precision::FP16:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<uint16_t>(desc, reinterpret_cast<uint16_t*>(mem_ptr));
-        else
-            return make_shared_blob<uint16_t>(desc);
-    case Precision::I16:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<int16_t>(desc, reinterpret_cast<int16_t*>(mem_ptr));
-        else
-            return make_shared_blob<int16_t>(desc);
-    case Precision::U16:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<uint16_t>(desc, reinterpret_cast<uint16_t*>(mem_ptr));
-        else
-            return make_shared_blob<uint16_t>(desc);
-    case Precision::I32:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<int32_t>(desc, reinterpret_cast<int32_t*>(mem_ptr));
-        else
-            return make_shared_blob<int32_t>(desc);
-    case Precision::I64:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<int64_t>(desc, reinterpret_cast<int64_t*>(mem_ptr));
-        else
-            return make_shared_blob<int64_t>(desc);
-    case Precision::I8:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<int8_t>(desc, reinterpret_cast<int8_t*>(mem_ptr));
-        else
-            return make_shared_blob<int8_t>(desc);
-    case Precision::U8:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<uint8_t>(desc, reinterpret_cast<uint8_t*>(mem_ptr));
-        else
-            return make_shared_blob<uint8_t>(desc);
-    case Precision::BOOL:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<uint8_t>(desc, reinterpret_cast<uint8_t*>(mem_ptr));
-        else
-            return make_shared_blob<uint8_t>(desc);
-    default:
-        IE_THROW(NotImplemented) << "The plugin does not support " << p.name() << " blob precision";
-    }
+Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
+    auto blob = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
+                                                  m_graph->GetNetwork()->get_stream(),
+                                                  desc,
+                                                  layout,
+                                                  usm_host_mem,
+                                                  0,
+                                                  0,
+                                                  RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
+    if (!blob)
+        IE_THROW(NotAllocated) << "Failed to allocate shared host <-> device blob";
+    blob->allocate();
+
+    return blob;
 }
 
 void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi) {
@@ -907,21 +874,6 @@ void InferRequest::copy_input_data(std::shared_ptr<cldnn::network> network,
     }
 }
 
-Blob::Ptr InferRequest::host_blob_from_device_blob(Blob::Ptr blobPtr) {
-    uint8_t* bufferMem = nullptr;
-    auto clblobPtr = std::dynamic_pointer_cast<gpu::ClBlob>(blobPtr);
-    if (clblobPtr) {
-        const auto memPtr = getBlobImpl(clblobPtr.get())->getMemory();
-        if (memPtr->get_allocation_type() == cldnn::allocation_type::usm_host) {
-            bufferMem = reinterpret_cast<uint8_t*>(memPtr->get_internal_params().mem);
-        }
-    }
-    Blob::Ptr hostBlob = create_host_blob(blobPtr->getTensorDesc(), bufferMem);
-    hostBlob->allocate();
-
-    return hostBlob;
-}
-
 void InferRequest::allocate_inputs() {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_inputs");
     auto inputLayouts = m_graph->GetInputLayouts();
@@ -964,12 +916,18 @@ void InferRequest::allocate_inputs() {
             auto blobPtr = create_device_blob(desc_fp32, litr->second);
             _deviceInputs[name] = blobPtr;
             Blob::Ptr inputBlob = create_host_blob(desc);
-            inputBlob->allocate();
             _inputs[name] = inputBlob;
         } else {
-            auto blobPtr = create_device_blob(desc, litr->second);
-            _deviceInputs[name] = blobPtr;
-            _inputs[name] = host_blob_from_device_blob(blobPtr);
+            if (m_graph->GetEngine()->use_unified_shared_memory()) {
+                // For USM case we create host blob using custom USM host allocator
+                // and then create shared device blob on top of this buffer
+                auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
+                _inputs[name] = host_blob;
+                _deviceInputs[name] = create_shared_device_blob(desc, litr->second, host_blob->buffer().as<void*>());
+            } else {
+                _inputs[name] = create_host_blob(desc);
+                _deviceInputs[name] = create_device_blob(desc, litr->second);
+            }
         }
     }
 }
@@ -996,7 +954,6 @@ void InferRequest::allocate_inputs_dynamic() {
             fp32inputBlob->allocate();
             _inputs[input.first + fp32_suffix] = fp32inputBlob;
         }
-        inputBlob->allocate();
         _inputs[input.first] = inputBlob;
     }
 }
@@ -1013,10 +970,18 @@ void InferRequest::allocate_outputs() {
         GPU_DEBUG_IF(debug_config->verbose >= 2) {
             GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
         }
-        auto blobPtr = create_device_blob(desc, output_layout);
-        _deviceOutputs[no.first] = blobPtr;
-        _outputs[no.first] = host_blob_from_device_blob(blobPtr);
+        outputsMap[no.first] = outputID;
+        if (m_graph->GetEngine()->use_unified_shared_memory()) {
+            // For USM case we create host blob using custom USM host allocator
+            // and then create shared device blob on top of this buffer
+            auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
+            _outputs[no.first] = host_blob;
+            _deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
+        } else {
+            _outputs[no.first] = create_host_blob(desc);
+            _deviceOutputs[no.first] = create_device_blob(desc, output_layout);
+        }
     }
 }
@@ -1036,7 +1001,6 @@ void InferRequest::allocate_outputs_dynamic() {
         }
 
         Blob::Ptr outputBlob = create_host_blob(desc);
-        outputBlob->allocate();
         _outputs[no.first] = outputBlob;
         outputsMap[no.first] = outputID;
     }
diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
index ea0f54fe238..201e3e50e05 100644
--- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp
+++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
@@ -26,6 +26,21 @@ RemoteBlobImpl::RemoteBlobImpl(ClContext::Ptr context,
         BlobType mem_type) :
     m_context(context), m_stream(stream), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane),
     _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedHolder(nullptr) {
+    auto _impl = getContextImpl(m_context.lock());
+    auto eng = _impl->GetEngine();
+
+    // Verify shared buffer/usm memory and ensure that requested byte size is not greater than allocated one
+    switch (m_mem_type) {
+    case BlobType::BT_BUF_SHARED: {
+        eng->share_buffer(m_layout, m_mem);
+        break;
+    }
+    case BlobType::BT_USM_SHARED: {
+        eng->share_usm(m_layout, m_mem);
+        break;
+    }
+    default: break;
+    }
 }
 
 ParamMap RemoteBlobImpl::getParams() const {
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
index a0b2774ef23..8e2d83d78de 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -167,9 +167,21 @@ memory::ptr ocl_engine::reinterpret_handle(const layout& new_layout, shared_mem_params params) {
 #endif
     } else if (params.mem_type == shared_mem_type::shared_mem_buffer) {
         cl::Buffer buf(static_cast<cl_mem>(params.mem), true);
+        auto actual_mem_size = buf.getInfo<CL_MEM_SIZE>();
+        auto requested_mem_size = new_layout.bytes_count();
+        if (actual_mem_size < requested_mem_size) {
+            throw std::runtime_error("[GPU] shared buffer has smaller size (" + std::to_string(actual_mem_size) +
+                                     ") than specified layout (" + std::to_string(requested_mem_size) + ")");
+        }
         return std::make_shared<gpu_buffer>(this, new_layout, buf);
     } else if (params.mem_type == shared_mem_type::shared_mem_usm) {
         cl::UsmMemory usm_buffer(get_usm_helper(), params.mem);
+        auto actual_mem_size = get_usm_helper().get_usm_allocation_size(usm_buffer.get());
+        auto requested_mem_size = new_layout.bytes_count();
+        if (actual_mem_size < requested_mem_size) {
+            throw std::runtime_error("[GPU] shared USM buffer has smaller size (" + std::to_string(actual_mem_size) +
+                                     ") than specified layout (" + std::to_string(requested_mem_size) + ")");
+        }
         return std::make_shared<gpu_usm>(this, new_layout, usm_buffer);
     } else {
         throw std::runtime_error("unknown shared object fromat or type");
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
index 512bc5a4204..1e1a23ed7ab 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
@@ -633,6 +633,17 @@ public:
         return ret_val;
     }
 
+    size_t get_usm_allocation_size(const void* usm_ptr) const {
+        if (!_get_mem_alloc_info_fn) {
+            throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr");
+        }
+
+        size_t ret_val;
+        size_t ret_val_size;
+        _get_mem_alloc_info_fn(_ctx.get(), usm_ptr, CL_MEM_ALLOC_SIZE_INTEL, sizeof(size_t), &ret_val, &ret_val_size);
+        return ret_val;
+    }
+
 private:
     cl::Context _ctx;
     cl::Device _device;
diff --git a/src/tests/functional/plugin/gpu/behavior/infer_request.cpp b/src/tests/functional/plugin/gpu/behavior/infer_request.cpp
new file mode 100644
index 00000000000..aa1e7b925df
--- /dev/null
+++ b/src/tests/functional/plugin/gpu/behavior/infer_request.cpp
@@ -0,0 +1,40 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include
+#include
+#include
+#include
+
+#include "openvino/runtime/core.hpp"
+
+#include
+#include "ngraph_functions/subgraph_builders.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+#include "openvino/core/preprocess/pre_post_process.hpp"
+#include "transformations/utils/utils.hpp"
+
+using namespace ::testing;
+
+TEST(TensorTest, smoke_canSetShapeForPreallocatedTensor) {
+    auto ie = ov::runtime::Core();
+    using namespace ov::preprocess;
+    auto p = PrePostProcessor(ngraph::builder::subgraph::makeSplitMultiConvConcat());
+    p.input().tensor().set_element_type(ov::element::i8);
+    p.input().preprocess().convert_element_type(ov::element::f32);
+
+    auto function = p.build();
+    auto exec_net = ie.compile_model(function, CommonTestUtils::DEVICE_GPU);
+    auto inf_req = exec_net.create_infer_request();
+
+    // Check set_shape call for pre-allocated input/output tensors
+    auto input_tensor = inf_req.get_input_tensor(0);
+    ASSERT_NO_THROW(input_tensor.set_shape({1, 4, 20, 20}));
+    ASSERT_NO_THROW(input_tensor.set_shape({1, 3, 20, 20}));
+    ASSERT_NO_THROW(input_tensor.set_shape({2, 3, 20, 20}));
+    auto output_tensor = inf_req.get_output_tensor(0);
+    ASSERT_NO_THROW(output_tensor.set_shape({1, 10, 12, 12}));
+    ASSERT_NO_THROW(output_tensor.set_shape({1, 10, 10, 10}));
+    ASSERT_NO_THROW(output_tensor.set_shape({2, 10, 20, 20}));
+}
diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp
index cf3fc5772e8..90b460cd4db 100644
--- a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp
+++ b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp
@@ -87,6 +87,57 @@ public:
     }
 };
 
+TEST_P(OVRemoteTensorInputBlob_Test, smoke_cantCreateBlobWithInvalidSize) {
+    RemoteTensorSharingType sharing_type;
+    bool with_auto_batching;
+    std::tie(sharing_type, with_auto_batching) = GetParam();
+    if (with_auto_batching)
+        GTEST_SKIP();
+
+    if (sharing_type == RemoteTensorSharingType::PLUGIN_CL_TENSOR ||
+        sharing_type == RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR ||
+        sharing_type == RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR ||
+        sharing_type == RemoteTensorSharingType::PLUGIN_HOST_TENSOR)
+        GTEST_SKIP();
+
+    auto ie = ov::runtime::Core();
+    auto cldnn_context = ie.get_default_context(deviceName).as<ov::runtime::intel_gpu::ocl::ClContext>();
+    cl_context ctx = cldnn_context;
+    auto ocl_instance = std::make_shared<OpenCL>(ctx);
+    cl_int err;
+
+    ov::Shape invalid_shape = {1, 20, 30, 40};
+
+    auto imSize = ov::shape_size(ov::Shape({1, 2, 3, 4}));
+
+    switch (sharing_type) {
+        case RemoteTensorSharingType::USER_CL_TENSOR: {
+            cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
+            ASSERT_ANY_THROW(cldnn_context.create_tensor(ov::element::i8, invalid_shape, shared_buffer));
+            break;
+        }
+        case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            void* shared_buffer = ocl_instance->allocate_usm_device_buffer(imSize);
+            ASSERT_ANY_THROW(cldnn_context.create_tensor(ov::element::i8, invalid_shape, shared_buffer));
+            ocl_instance->free_mem(shared_buffer);
+            break;
+        }
+        case RemoteTensorSharingType::USER_USM_HOST_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            void* shared_buffer = ocl_instance->allocate_usm_host_buffer(imSize);
+            ASSERT_ANY_THROW(cldnn_context.create_tensor(ov::element::i8, invalid_shape, shared_buffer));
+            ocl_instance->free_mem(shared_buffer);
+            break;
+        }
+        default: break;
+    }
+}
+
 TEST_P(OVRemoteTensorInputBlob_Test, smoke_canInputRemoteTensor) {
 #if defined(ANDROID)
     GTEST_SKIP();