[GPU] Get rid of PreAllocator usages to allow setShape (#9576)
parent e329baef04
commit f68dd90413
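In short: the plugin no longer wraps host input/output blobs around pre-allocated pointers via PreAllocator. Host blobs are now created through an InferenceEngine::IAllocator (a USMHostAllocator when the engine supports unified shared memory, the default allocator otherwise), and the shared device blob is built on top of that buffer. Because each blob owns its allocator instead of a fixed external pointer, set_shape() on the tensors pre-created by the plugin can re-allocate memory rather than being rejected. A minimal usage sketch of what this enables, mirroring the new functional test added at the end of this diff (not part of the commit; assumes an OpenVINO build of this vintage with the GPU plugin available, and the model path is a placeholder):

#include "openvino/runtime/core.hpp"

int main() {
    ov::runtime::Core core;
    auto model = core.read_model("model.xml");        // placeholder path; any IR model works
    auto compiled = core.compile_model(model, "GPU");
    auto request = compiled.create_infer_request();

    // These tensors are pre-allocated by the GPU plugin; after this change
    // their shape can be updated, which re-allocates the underlying buffers.
    auto input = request.get_input_tensor(0);
    input.set_shape({2, 3, 20, 20});
    auto output = request.get_output_tensor(0);
    output.set_shape({2, 10, 20, 20});
    return 0;
}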
@@ -89,7 +89,8 @@ private:
                          std::vector<cldnn::event::ptr>& dependencies);
    void prepare_output(const cldnn::primitive_id& outputName, InferenceEngine::Blob::Ptr& outputBlob);

    InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc, uint8_t* mem_ptr = nullptr);
    InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc,
                                                std::shared_ptr<InferenceEngine::IAllocator> alloc = nullptr);
    InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout);

    void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr);
@@ -97,7 +98,7 @@ private:
                         const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob,
                         buf_info* bi = nullptr);

    InferenceEngine::Blob::Ptr host_blob_from_device_blob(const InferenceEngine::Blob::Ptr blobPtr);
    InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem);
    void allocate_inputs();
    void allocate_outputs();
    void allocate_inputs_dynamic();
@@ -317,7 +317,6 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
            // during pre-processing
            if (_inputs[name]->is<gpu::ClBlob>()) {
                Blob::Ptr inputHostBlob = create_host_blob(desc);
                inputHostBlob->allocate();
                _inputs[name] = inputHostBlob;
            }
            _preProcData[name] = CreatePreprocDataHelper();
@@ -787,59 +786,27 @@ void InferRequest::setup_stream_graph() {
    m_graph = streamGraphs[streamID];
}

Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, uint8_t* mem_ptr) {
Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, std::shared_ptr<InferenceEngine::IAllocator> alloc) {
    OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::create_host_blob");
    const Precision& p = desc.getPrecision();
    auto blob = make_blob_with_precision(desc, alloc ? alloc : CreateDefaultAllocator());
    blob->allocate();
    return blob;
}

    switch (p) {
    case Precision::FP32:
        if (mem_ptr != nullptr)
            return make_shared_blob<float>(desc, reinterpret_cast<float*>(mem_ptr));
        else
            return make_shared_blob<float>(desc);
    case Precision::FP16:
        if (mem_ptr != nullptr)
            return make_shared_blob<uint16_t>(desc, reinterpret_cast<uint16_t*>(mem_ptr));
        else
            return make_shared_blob<uint16_t>(desc);
    case Precision::I16:
        if (mem_ptr != nullptr)
            return make_shared_blob<int16_t>(desc, reinterpret_cast<int16_t*>(mem_ptr));
        else
            return make_shared_blob<int16_t>(desc);
    case Precision::U16:
        if (mem_ptr != nullptr)
            return make_shared_blob<uint16_t>(desc, reinterpret_cast<uint16_t*>(mem_ptr));
        else
            return make_shared_blob<uint16_t>(desc);
    case Precision::I32:
        if (mem_ptr != nullptr)
            return make_shared_blob<int32_t>(desc, reinterpret_cast<int32_t*>(mem_ptr));
        else
            return make_shared_blob<int32_t>(desc);
    case Precision::I64:
        if (mem_ptr != nullptr)
            return make_shared_blob<int64_t>(desc, reinterpret_cast<int64_t*>(mem_ptr));
        else
            return make_shared_blob<int64_t>(desc);
    case Precision::I8:
        if (mem_ptr != nullptr)
            return make_shared_blob<int8_t>(desc, reinterpret_cast<int8_t*>(mem_ptr));
        else
            return make_shared_blob<int8_t>(desc);
    case Precision::U8:
        if (mem_ptr != nullptr)
            return make_shared_blob<uint8_t>(desc, reinterpret_cast<uint8_t*>(mem_ptr));
        else
            return make_shared_blob<uint8_t>(desc);
    case Precision::BOOL:
        if (mem_ptr != nullptr)
            return make_shared_blob<uint8_t>(desc, reinterpret_cast<uint8_t*>(mem_ptr));
        else
            return make_shared_blob<uint8_t>(desc);
    default:
        IE_THROW(NotImplemented) << "The plugin does not support " << p.name() << " blob precision";
    }
Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
    auto blob = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
                                                  m_graph->GetNetwork()->get_stream(),
                                                  desc,
                                                  layout,
                                                  usm_host_mem,
                                                  0,
                                                  0,
                                                  RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
    if (!blob)
        IE_THROW(NotAllocated) << "Failed to allocate shared host <-> device blob";
    blob->allocate();

    return blob;
}

void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi) {
@@ -907,21 +874,6 @@ void InferRequest::copy_input_data(std::shared_ptr<cldnn::network> network,
    }
}

Blob::Ptr InferRequest::host_blob_from_device_blob(Blob::Ptr blobPtr) {
    uint8_t* bufferMem = nullptr;
    auto clblobPtr = std::dynamic_pointer_cast<InferenceEngine::gpu::ClBlob>(blobPtr);
    if (clblobPtr) {
        const auto memPtr = getBlobImpl(clblobPtr.get())->getMemory();
        if (memPtr->get_allocation_type() == cldnn::allocation_type::usm_host) {
            bufferMem = reinterpret_cast<uint8_t*>(memPtr->get_internal_params().mem);
        }
    }
    Blob::Ptr hostBlob = create_host_blob(blobPtr->getTensorDesc(), bufferMem);
    hostBlob->allocate();

    return hostBlob;
}

void InferRequest::allocate_inputs() {
    OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_inputs");
    auto inputLayouts = m_graph->GetInputLayouts();
@@ -964,12 +916,18 @@ void InferRequest::allocate_inputs() {
            auto blobPtr = create_device_blob(desc_fp32, litr->second);
            _deviceInputs[name] = blobPtr;
            Blob::Ptr inputBlob = create_host_blob(desc);
            inputBlob->allocate();
            _inputs[name] = inputBlob;
        } else {
            auto blobPtr = create_device_blob(desc, litr->second);
            _deviceInputs[name] = blobPtr;
            _inputs[name] = host_blob_from_device_blob(blobPtr);
            if (m_graph->GetEngine()->use_unified_shared_memory()) {
                // For USM case we create host blob using custom USM host allocator
                // and then create shared device blob on top of this buffer
                auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
                _inputs[name] = host_blob;
                _deviceInputs[name] = create_shared_device_blob(desc, litr->second, host_blob->buffer().as<void*>());
            } else {
                _inputs[name] = create_host_blob(desc);
                _deviceInputs[name] = create_device_blob(desc, litr->second);
            }
        }
    }
}
@@ -996,7 +954,6 @@ void InferRequest::allocate_inputs_dynamic() {
            fp32inputBlob->allocate();
            _inputs[input.first + fp32_suffix] = fp32inputBlob;
        }
        inputBlob->allocate();
        _inputs[input.first] = inputBlob;
    }
}
@@ -1013,10 +970,18 @@ void InferRequest::allocate_outputs() {
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
        }
        auto blobPtr = create_device_blob(desc, output_layout);
        _deviceOutputs[no.first] = blobPtr;
        _outputs[no.first] = host_blob_from_device_blob(blobPtr);

        outputsMap[no.first] = outputID;
        if (m_graph->GetEngine()->use_unified_shared_memory()) {
            // For USM case we create host blob using custom USM host allocator
            // and then create shared device blob on top of this buffer
            auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
            _outputs[no.first] = host_blob;
            _deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
        } else {
            _outputs[no.first] = create_host_blob(desc);
            _deviceOutputs[no.first] = create_device_blob(desc, output_layout);
        }
    }
}
@@ -1036,7 +1001,6 @@ void InferRequest::allocate_outputs_dynamic() {
        }

        Blob::Ptr outputBlob = create_host_blob(desc);
        outputBlob->allocate();
        _outputs[no.first] = outputBlob;
        outputsMap[no.first] = outputID;
    }
@@ -26,6 +26,21 @@ RemoteBlobImpl::RemoteBlobImpl(ClContext::Ptr context,
                               BlobType mem_type) :
    m_context(context), m_stream(stream), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane),
    _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedHolder(nullptr) {
    auto _impl = getContextImpl(m_context.lock());
    auto eng = _impl->GetEngine();

    // Verify shared buffer/usm memory and ensure that requested byte size is not greater than allocated one
    switch (m_mem_type) {
    case BlobType::BT_BUF_SHARED: {
        eng->share_buffer(m_layout, m_mem);
        break;
    }
    case BlobType::BT_USM_SHARED: {
        eng->share_usm(m_layout, m_mem);
        break;
    }
    default: break;
    }
}

ParamMap RemoteBlobImpl::getParams() const {
@@ -167,9 +167,21 @@ memory::ptr ocl_engine::reinterpret_handle(const layout& new_layout, shared_mem_
#endif
    } else if (params.mem_type == shared_mem_type::shared_mem_buffer) {
        cl::Buffer buf(static_cast<cl_mem>(params.mem), true);
        auto actual_mem_size = buf.getInfo<CL_MEM_SIZE>();
        auto requested_mem_size = new_layout.bytes_count();
        if (actual_mem_size < requested_mem_size) {
            throw std::runtime_error("[GPU] shared buffer has smaller size (" + std::to_string(actual_mem_size) +
                                     ") than specified layout (" + std::to_string(requested_mem_size) + ")");
        }
        return std::make_shared<ocl::gpu_buffer>(this, new_layout, buf);
    } else if (params.mem_type == shared_mem_type::shared_mem_usm) {
        cl::UsmMemory usm_buffer(get_usm_helper(), params.mem);
        auto actual_mem_size = get_usm_helper().get_usm_allocation_size(usm_buffer.get());
        auto requested_mem_size = new_layout.bytes_count();
        if (actual_mem_size < requested_mem_size) {
            throw std::runtime_error("[GPU] shared USM buffer has smaller size (" + std::to_string(actual_mem_size) +
                                     ") than specified layout (" + std::to_string(requested_mem_size) + ")");
        }
        return std::make_shared<ocl::gpu_usm>(this, new_layout, usm_buffer);
    } else {
        throw std::runtime_error("unknown shared object fromat or type");
@@ -633,6 +633,17 @@ public:
        return ret_val;
    }

    size_t get_usm_allocation_size(const void* usm_ptr) const {
        if (!_get_mem_alloc_info_fn) {
            throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr");
        }

        size_t ret_val;
        size_t ret_val_size;
        _get_mem_alloc_info_fn(_ctx.get(), usm_ptr, CL_MEM_ALLOC_SIZE_INTEL, sizeof(size_t), &ret_val, &ret_val_size);
        return ret_val;
    }

private:
    cl::Context _ctx;
    cl::Device _device;
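The new get_usm_allocation_size() helper queries CL_MEM_ALLOC_SIZE_INTEL through the clGetMemAllocInfoINTEL entry point of the cl_intel_unified_shared_memory extension; _get_mem_alloc_info_fn is the extension function pointer held by the helper. A standalone sketch of the same query against a raw OpenCL context, not part of the commit, assuming OpenCL headers that ship the cl_intel_unified_shared_memory definitions and a platform that exposes the extension:

#include <CL/cl.h>
#include <CL/cl_ext.h>   // cl_mem_info_intel, CL_MEM_ALLOC_SIZE_INTEL (recent Khronos headers)
#include <stdexcept>

// Function-pointer type for the extension entry point (named locally to avoid
// clashing with a typedef the headers may already provide).
typedef cl_int (CL_API_CALL *GetMemAllocInfoINTEL_fn)(
    cl_context, const void*, cl_mem_info_intel, size_t, void*, size_t*);

size_t usm_allocation_size(cl_platform_id platform, cl_context ctx, const void* usm_ptr) {
    auto fn = reinterpret_cast<GetMemAllocInfoINTEL_fn>(
        clGetExtensionFunctionAddressForPlatform(platform, "clGetMemAllocInfoINTEL"));
    if (!fn)
        throw std::runtime_error("clGetMemAllocInfoINTEL is not available");

    size_t size = 0;
    size_t ret_size = 0;
    cl_int err = fn(ctx, usm_ptr, CL_MEM_ALLOC_SIZE_INTEL, sizeof(size), &size, &ret_size);
    if (err != CL_SUCCESS)
        throw std::runtime_error("clGetMemAllocInfoINTEL failed");
    return size;   // size of the USM allocation that usm_ptr belongs to
}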
src/tests/functional/plugin/gpu/behavior/infer_request.cpp (new file, 40 lines)
@@ -0,0 +1,40 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <string>
#include <utility>
#include <vector>
#include <memory>

#include "openvino/runtime/core.hpp"

#include <common_test_utils/test_common.hpp>
#include "ngraph_functions/subgraph_builders.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "openvino/core/preprocess/pre_post_process.hpp"
#include "transformations/utils/utils.hpp"

using namespace ::testing;

TEST(TensorTest, smoke_canSetShapeForPreallocatedTensor) {
    auto ie = ov::runtime::Core();
    using namespace ov::preprocess;
    auto p = PrePostProcessor(ngraph::builder::subgraph::makeSplitMultiConvConcat());
    p.input().tensor().set_element_type(ov::element::i8);
    p.input().preprocess().convert_element_type(ov::element::f32);

    auto function = p.build();
    auto exec_net = ie.compile_model(function, CommonTestUtils::DEVICE_GPU);
    auto inf_req = exec_net.create_infer_request();

    // Check set_shape call for pre-allocated input/output tensors
    auto input_tensor = inf_req.get_input_tensor(0);
    ASSERT_NO_THROW(input_tensor.set_shape({1, 4, 20, 20}));
    ASSERT_NO_THROW(input_tensor.set_shape({1, 3, 20, 20}));
    ASSERT_NO_THROW(input_tensor.set_shape({2, 3, 20, 20}));
    auto output_tensor = inf_req.get_output_tensor(0);
    ASSERT_NO_THROW(output_tensor.set_shape({1, 10, 12, 12}));
    ASSERT_NO_THROW(output_tensor.set_shape({1, 10, 10, 10}));
    ASSERT_NO_THROW(output_tensor.set_shape({2, 10, 20, 20}));
}
@@ -87,6 +87,57 @@ public:
    }
};

TEST_P(OVRemoteTensorInputBlob_Test, smoke_cantCreateBlobWithInvalidSize) {
    RemoteTensorSharingType sharing_type;
    bool with_auto_batching;
    std::tie(sharing_type, with_auto_batching) = GetParam();
    if (with_auto_batching)
        GTEST_SKIP();

    if (sharing_type == RemoteTensorSharingType::PLUGIN_CL_TENSOR ||
        sharing_type == RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR ||
        sharing_type == RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR ||
        sharing_type == RemoteTensorSharingType::PLUGIN_HOST_TENSOR)
        GTEST_SKIP();

    auto ie = ov::runtime::Core();
    auto cldnn_context = ie.get_default_context(deviceName).as<ov::runtime::intel_gpu::ocl::ClContext>();
    cl_context ctx = cldnn_context;
    auto ocl_instance = std::make_shared<OpenCL>(ctx);
    cl_int err;

    ov::Shape invalid_shape = {1, 20, 30, 40};

    auto imSize = ov::shape_size(ov::Shape({1, 2, 3, 4}));

    switch (sharing_type) {
        case RemoteTensorSharingType::USER_CL_TENSOR: {
            cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
            ASSERT_ANY_THROW(cldnn_context.create_tensor(ov::element::i8, invalid_shape, shared_buffer));
            break;
        }
        case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: {
            if (!ocl_instance->supports_usm())
                GTEST_SKIP();

            void* shared_buffer = ocl_instance->allocate_usm_device_buffer(imSize);
            ASSERT_ANY_THROW(cldnn_context.create_tensor(ov::element::i8, invalid_shape, shared_buffer));
            ocl_instance->free_mem(shared_buffer);
            break;
        }
        case RemoteTensorSharingType::USER_USM_HOST_TENSOR: {
            if (!ocl_instance->supports_usm())
                GTEST_SKIP();

            void* shared_buffer = ocl_instance->allocate_usm_host_buffer(imSize);
            ASSERT_ANY_THROW(cldnn_context.create_tensor(ov::element::i8, invalid_shape, shared_buffer));
            ocl_instance->free_mem(shared_buffer);
            break;
        }
        default: break;
    }
}

TEST_P(OVRemoteTensorInputBlob_Test, smoke_canInputRemoteTensor) {
#if defined(ANDROID)
    GTEST_SKIP();