[GPU] Fix lockable memory allocation (#14134)

* [GPU] Fix lockable memory allocation

* Apply review comments

* [GPU] Replace usm handle sharing with independent blob creation

* Prohibit all other remote blob types

* [GPU] Fix for nullptr impl
This commit is contained in:
Sergey Shlyapnikov
2022-11-25 12:07:47 +04:00
committed by GitHub
parent 823ea7c68a
commit 637f1cdf8f
5 changed files with 120 additions and 35 deletions

View File

@@ -185,6 +185,7 @@ public:
void validate_primitives();
void set_arguments();
// Implementation specific calls
bool is_cpu_impl(const primitive_id& id) const;
std::shared_ptr<primitive_inst> get_primitive(const primitive_id& id);
std::shared_ptr<const primitive_inst> get_primitive(const primitive_id& id) const;
std::string get_primitive_info(const primitive_id& id) const;
@@ -260,8 +261,8 @@ private:
void allocate_primitive_instance(program_node const& node);
void transfer_memory_to_device(std::shared_ptr<primitive_inst> instance, program_node const& node);
void add_to_exec_order(const primitive_id& id);
std::shared_ptr<primitive_inst> find_in_internal_networks(const primitive_id& id);
std::shared_ptr<primitive_inst> find_primitive(const primitive_id& id);
std::shared_ptr<primitive_inst> find_in_internal_networks(const primitive_id& id) const;
std::shared_ptr<primitive_inst> find_primitive(const primitive_id& id) const;
void check_names();
void add_default_output_chains();
output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);

View File

@@ -59,6 +59,7 @@ public:
private:
// This blob is used for outputs processing if output data type conversion or padding handling is needed
InferenceEngine::Blob::Ptr intermediate_output_blob = nullptr;
InferenceEngine::BlobMap users_blobs_matching;
InferenceEngine::BlobMap _deviceOutputs;
std::map<std::string, cldnn::primitive_id> inputsMap;
std::map<std::string, cldnn::primitive_id> outputsMap;
@@ -76,7 +77,8 @@ private:
std::vector<cldnn::event::ptr>& dependencies);
void prepare_output(const cldnn::primitive_id& outputName, InferenceEngine::Blob::Ptr& outputBlob);
void allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_mems, InferenceEngine::Blob::Ptr& user_blob,
const cldnn::primitive_id& blob_name, const cldnn::layout& layout);
const cldnn::primitive_id& blob_name, const cldnn::layout& layout,
const bool need_lockable_mem = false);
InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc, bool is_dynamic);
InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc);
@@ -85,6 +87,10 @@ private:
void copy_input_data(std::shared_ptr<cldnn::network> network, const cldnn::primitive_id& inputName,
const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob);
template<typename RemoteBlobType, typename = typename std::enable_if<std::is_same<RemoteBlobType, RemoteCLbuffer>::value ||
std::is_same<RemoteBlobType, RemoteUSMbuffer>::value>::type>
InferenceEngine::Blob::Ptr create_remote_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout,
const RemoteBlobImpl::BlobType mem_type, void* mem_ptr = nullptr);
InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem);
void allocate_inputs();
void allocate_outputs();

View File

@@ -693,7 +693,7 @@ void cldnn::network::check_names() {
}
}
std::shared_ptr<primitive_inst> cldnn::network::find_primitive(const primitive_id& id) {
std::shared_ptr<primitive_inst> cldnn::network::find_primitive(const primitive_id& id) const {
std::shared_ptr<primitive_inst> ret;
if (_primitives.find(id) != _primitives.end())
@@ -702,7 +702,7 @@ std::shared_ptr<primitive_inst> cldnn::network::find_primitive(const primitive_i
return find_in_internal_networks(id);
}
std::shared_ptr<primitive_inst> cldnn::network::find_in_internal_networks(const primitive_id& id) {
std::shared_ptr<primitive_inst> cldnn::network::find_in_internal_networks(const primitive_id& id) const {
std::shared_ptr<primitive_inst> ret;
for (auto const& prim : _primitives) {
@@ -724,6 +724,15 @@ std::string network::get_primitive_info(const primitive_id& id) const {
return node.type()->to_string(node);
}
// Returns true if the implementation chosen for the given primitive runs on CPU.
// If the implementation has not been selected yet (nullptr impl), the primitive is
// conservatively reported as a CPU impl, so callers (e.g. output allocation) fall
// back to lockable (usm_host) memory which is valid for either case.
// Throws via OPENVINO_ASSERT when the topology has no primitive with this id.
bool network::is_cpu_impl(const primitive_id& id) const {
    auto prim_inst = find_primitive(id);
    // Note: trailing space added so the two concatenated message literals do not
    // fuse into "topologydoesn't".
    OPENVINO_ASSERT(prim_inst, "[GPU] Can't get implementation type, since topology ",
                    "doesn't contain primitive with requested id: ", id);
    return prim_inst->get_impl() ? prim_inst->get_impl()->is_cpu() : true;
}
std::string network::get_implementation_info(const primitive_id& id) const {
return _program->get_implementation_info(id);
}

View File

@@ -450,11 +450,7 @@ void InferRequest::enqueue() {
FormatFromTensorDesc(blobsDesc),
tensor_from_dims(blobsDesc.getDims()));
auto mergedBlobs = std::make_shared<RemoteCLbuffer>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
blobsDesc,
layout);
mergedBlobs->allocate();
auto mergedBlobs = create_remote_blob<RemoteCLbuffer>(blobsDesc, layout, RemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
dst = mergedBlobs->buffer().as<uint8_t*>();
_inputs[name] = mergedBlobs;
@@ -602,21 +598,33 @@ Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, bool is_dynamic
return blob;
}
Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
auto blob = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
template<typename RemoteBlobType, typename>
InferenceEngine::Blob::Ptr InferRequest::create_remote_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout,
const RemoteBlobImpl::BlobType mem_type, void* mem_ptr) {
auto blob = std::make_shared<RemoteBlobType>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
desc,
layout,
usm_host_mem,
mem_ptr,
0,
0,
RemoteBlobImpl::BlobType::BT_USM_SHARED);
if (!blob)
IE_THROW(NotAllocated) << "Failed to allocate shared host <-> device blob";
mem_type);
OPENVINO_ASSERT(blob, "[GPU] Failed to allocate remote blob");
blob->allocate();
return blob;
}
template InferenceEngine::Blob::Ptr InferRequest::create_remote_blob<RemoteCLbuffer>(const InferenceEngine::TensorDesc&, const cldnn::layout&,
const RemoteBlobImpl::BlobType, void*);
template InferenceEngine::Blob::Ptr InferRequest::create_remote_blob<RemoteUSMbuffer>(const InferenceEngine::TensorDesc&, const cldnn::layout&,
const RemoteBlobImpl::BlobType, void*);
// Creates a device blob of type BT_USM_SHARED on top of an existing usm_host
// allocation (usm_host_mem), so host and device operate on the same memory
// without explicit copies. Asserts if blob creation fails.
Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
auto blob = create_remote_blob<RemoteUSMbuffer>(desc, layout, RemoteBlobImpl::BlobType::BT_USM_SHARED, usm_host_mem);
OPENVINO_ASSERT(blob, "[GPU] Failed to allocate shared host <-> device blob");
return blob;
}
void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::copy_output_data");
auto is_convert_needed = [](const Precision& prc) {
@@ -836,7 +844,7 @@ std::map<std::string, InferenceEngineProfileInfo> InferRequest::GetPerformanceCo
}
void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_mems, InferenceEngine::Blob::Ptr& user_blob,
const cldnn::primitive_id& blob_name, const cldnn::layout& layout) {
const cldnn::primitive_id& blob_name, const cldnn::layout& layout, bool need_lockable_mem) {
const auto input_ptr = static_cast<const void*>(user_blob->cbuffer());
const auto alloc_type = m_graph->GetEngine()->detect_usm_allocation_type(input_ptr);
const auto is_usm_host = alloc_type == cldnn::allocation_type::usm_host;
@@ -855,7 +863,15 @@ void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_m
// so we don't need to allocate new memory
can_skip_allocation |= same_host_mem(impl_mem, src_ptr);
// Or if blob has any type except usm_host - in that case explicit copy will be performed anyway
can_skip_allocation |= impl_mem->get_allocation_type() != cldnn::allocation_type::usm_host;
// Or if blob has usm_host type and lockable memory is expected by impl
can_skip_allocation |= need_lockable_mem ? impl_mem->get_allocation_type() == cldnn::allocation_type::usm_host
: impl_mem->get_allocation_type() != cldnn::allocation_type::usm_host;
// In case of lockable memory we need to keep the device's usm_host memory buffer
// in sync with the user's blob, to avoid incorrect behaviour if the user calls
// set_blob() with the following sequence (usm_host, system_host, usm_host, system_host...)
if (need_lockable_mem)
can_skip_allocation &= users_blobs_matching.find(blob_name) != users_blobs_matching.end()
&& users_blobs_matching[blob_name] == user_blob;
}
if (!can_skip_allocation) {
@@ -863,9 +879,13 @@ void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_m
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
device_mems[blob_name] = create_shared_device_blob(user_blob->getTensorDesc(), layout, user_blob->buffer().as<void*>());
} else if (need_lockable_mem) {
device_mems[blob_name] =
create_remote_blob<RemoteUSMbuffer>(user_blob->getTensorDesc(), layout, RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
} else {
device_mems[blob_name] = create_device_blob(user_blob->getTensorDesc());
}
users_blobs_matching[blob_name] = user_blob;
}
}
@@ -980,7 +1000,8 @@ void InferRequest::prepare_output(const cldnn::primitive_id& outputName, Blob::P
const bool is_dev_input = remote_ptr != nullptr;
if (is_static && can_use_usm && !is_dev_input) {
allocate_dev_mem_if_needed(_deviceOutputs, outputBlob, outputName, output_layout);
auto is_cpu_impl = m_graph->GetNetwork()->is_cpu_impl(output_id);
allocate_dev_mem_if_needed(_deviceOutputs, outputBlob, outputName, output_layout, is_cpu_impl);
}
OPENVINO_ASSERT(!is_static || _deviceOutputs.find(outputName) != _deviceOutputs.end(),
@@ -1010,23 +1031,9 @@ InferenceEngine::Blob::Ptr InferRequest::create_device_blob(const InferenceEngin
auto l = cldnn::layout(shape, dt, format);
if (m_graph->GetEngine()->use_unified_shared_memory()) {
auto blobPtr = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
desc,
l,
nullptr,
0,
0,
RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
getBlobImpl(blobPtr.get())->allocate();
return blobPtr;
return create_remote_blob<RemoteUSMbuffer>(desc, l, RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
} else {
auto blobPtr = std::make_shared<RemoteCLbuffer>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
desc,
l);
getBlobImpl(blobPtr.get())->allocate();
return blobPtr;
return create_remote_blob<RemoteCLbuffer>(desc, l, RemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
}
}

View File

@@ -189,6 +189,68 @@ TEST(canSwapTensorsBetweenInferRequests, inputs) {
}
}
// Checks that a user-provided output tensor (backed by the plugin's usm_host
// memory of another infer request) keeps its data intact after it is swapped
// for a system-host tensor and infer() is executed again.
TEST(smoke_InferRequestDeviceMemoryAllocation, usmHostIsNotChanged) {
auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
auto ie = ov::Core();
auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
ov::InferRequest infer_request2 = compiled_model.create_infer_request();
auto input_tensor1 = infer_request1.get_input_tensor();
FuncTestUtils::fill_tensor(input_tensor1, 20, 0, 1, 0);
// output_tensor1 is a plain (system) host tensor; output_tensor2 is owned by
// infer request #2 and therefore backed by plugin-allocated memory.
auto output_tensor1 = FuncTestUtils::create_and_fill_tensor(compiled_model.output().get_element_type(), compiled_model.output().get_shape());
auto output_tensor2 = infer_request2.get_output_tensor();
// Use tensor from infer request #2 as an output for infer request #1
infer_request1.set_output_tensor(output_tensor2);
ASSERT_NO_THROW(infer_request1.infer());
// Modify the tensor and save its contents as the reference values
FuncTestUtils::fill_tensor(output_tensor2);
std::vector<float> ref_values;
ref_values.resize(output_tensor2.get_byte_size());
std::memcpy(ref_values.data(), output_tensor2.data(), output_tensor2.get_byte_size());
// Perform second infer() call with a system host memory tensor
infer_request1.set_output_tensor(output_tensor1);
ASSERT_NO_THROW(infer_request1.infer());
// Expect that output_tensor2 will not change its data after the infer() call
auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
FuncTestUtils::compareRawBuffers(ref_values.data(),
output_tensor2.data<float>(),
ref_values.size(),
ov::shape_size(output_tensor2.get_shape()),
thr);
}
// Checks that infer() succeeds when the output tensor is switched from a
// plugin-owned tensor (from another infer request) to a system host tensor
// between two consecutive infer() calls.
TEST(smoke_InferRequestDeviceMemoryAllocation, canSetSystemHostTensor) {
auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
auto ie = ov::Core();
auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
ov::InferRequest infer_request2 = compiled_model.create_infer_request();
auto input_tensor1 = infer_request1.get_input_tensor();
FuncTestUtils::fill_tensor(input_tensor1, 20, 0, 1, 0);
auto output_tensor1 = FuncTestUtils::create_and_fill_tensor(compiled_model.output().get_element_type(), compiled_model.output().get_shape());
auto output_tensor2 = infer_request2.get_output_tensor();
// First run: output goes to the tensor owned by infer request #2
infer_request1.set_output_tensor(output_tensor2);
ASSERT_NO_THROW(infer_request1.infer());
FuncTestUtils::fill_tensor(input_tensor1, 10, 0, 1, 1);
// Second run: output switched to a system host tensor — must not throw
infer_request1.set_output_tensor(output_tensor1);
ASSERT_NO_THROW(infer_request1.infer());
}
TEST(canSwapTensorsBetweenInferRequests, outputs) {
std::vector<std::vector<uint8_t>> ref;
std::vector<ov::Tensor> input_tensors;