[GPU] Fix lockable memory allocation (#14134)
* [GPU] Fix lockable memory allocation * Apply review comments * [GPU] Replace usm handle sharing with independent blob creation * Prohibit all other remote blob types * [GPU] Fix for nullptr impl
This commit is contained in:
committed by
GitHub
parent
823ea7c68a
commit
637f1cdf8f
@@ -185,6 +185,7 @@ public:
|
||||
void validate_primitives();
|
||||
void set_arguments();
|
||||
// Implementation specific calls
|
||||
bool is_cpu_impl(const primitive_id& id) const;
|
||||
std::shared_ptr<primitive_inst> get_primitive(const primitive_id& id);
|
||||
std::shared_ptr<const primitive_inst> get_primitive(const primitive_id& id) const;
|
||||
std::string get_primitive_info(const primitive_id& id) const;
|
||||
@@ -260,8 +261,8 @@ private:
|
||||
void allocate_primitive_instance(program_node const& node);
|
||||
void transfer_memory_to_device(std::shared_ptr<primitive_inst> instance, program_node const& node);
|
||||
void add_to_exec_order(const primitive_id& id);
|
||||
std::shared_ptr<primitive_inst> find_in_internal_networks(const primitive_id& id);
|
||||
std::shared_ptr<primitive_inst> find_primitive(const primitive_id& id);
|
||||
std::shared_ptr<primitive_inst> find_in_internal_networks(const primitive_id& id) const;
|
||||
std::shared_ptr<primitive_inst> find_primitive(const primitive_id& id) const;
|
||||
void check_names();
|
||||
void add_default_output_chains();
|
||||
output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);
|
||||
|
||||
@@ -59,6 +59,7 @@ public:
|
||||
private:
|
||||
// This blob is used for outputs processing if output data type conversion or padding handling is needed
|
||||
InferenceEngine::Blob::Ptr intermediate_output_blob = nullptr;
|
||||
InferenceEngine::BlobMap users_blobs_matching;
|
||||
InferenceEngine::BlobMap _deviceOutputs;
|
||||
std::map<std::string, cldnn::primitive_id> inputsMap;
|
||||
std::map<std::string, cldnn::primitive_id> outputsMap;
|
||||
@@ -76,7 +77,8 @@ private:
|
||||
std::vector<cldnn::event::ptr>& dependencies);
|
||||
void prepare_output(const cldnn::primitive_id& outputName, InferenceEngine::Blob::Ptr& outputBlob);
|
||||
void allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_mems, InferenceEngine::Blob::Ptr& user_blob,
|
||||
const cldnn::primitive_id& blob_name, const cldnn::layout& layout);
|
||||
const cldnn::primitive_id& blob_name, const cldnn::layout& layout,
|
||||
const bool need_lockable_mem = false);
|
||||
|
||||
InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc, bool is_dynamic);
|
||||
InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc);
|
||||
@@ -85,6 +87,10 @@ private:
|
||||
void copy_input_data(std::shared_ptr<cldnn::network> network, const cldnn::primitive_id& inputName,
|
||||
const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob);
|
||||
|
||||
template<typename RemoteBlobType, typename = typename std::enable_if<std::is_same<RemoteBlobType, RemoteCLbuffer>::value ||
|
||||
std::is_same<RemoteBlobType, RemoteUSMbuffer>::value>::type>
|
||||
InferenceEngine::Blob::Ptr create_remote_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout,
|
||||
const RemoteBlobImpl::BlobType mem_type, void* mem_ptr = nullptr);
|
||||
InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem);
|
||||
void allocate_inputs();
|
||||
void allocate_outputs();
|
||||
|
||||
@@ -693,7 +693,7 @@ void cldnn::network::check_names() {
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<primitive_inst> cldnn::network::find_primitive(const primitive_id& id) {
|
||||
std::shared_ptr<primitive_inst> cldnn::network::find_primitive(const primitive_id& id) const {
|
||||
std::shared_ptr<primitive_inst> ret;
|
||||
|
||||
if (_primitives.find(id) != _primitives.end())
|
||||
@@ -702,7 +702,7 @@ std::shared_ptr<primitive_inst> cldnn::network::find_primitive(const primitive_i
|
||||
return find_in_internal_networks(id);
|
||||
}
|
||||
|
||||
std::shared_ptr<primitive_inst> cldnn::network::find_in_internal_networks(const primitive_id& id) {
|
||||
std::shared_ptr<primitive_inst> cldnn::network::find_in_internal_networks(const primitive_id& id) const {
|
||||
std::shared_ptr<primitive_inst> ret;
|
||||
|
||||
for (auto const& prim : _primitives) {
|
||||
@@ -724,6 +724,15 @@ std::string network::get_primitive_info(const primitive_id& id) const {
|
||||
return node.type()->to_string(node);
|
||||
}
|
||||
|
||||
bool network::is_cpu_impl(const primitive_id& id) const {
|
||||
auto prim_inst = find_primitive(id);
|
||||
|
||||
OPENVINO_ASSERT(prim_inst, "[GPU] Can't get implementation type, since topology",
|
||||
"doesn't contain primitive with requested id: ", id);
|
||||
|
||||
return prim_inst->get_impl() ? prim_inst->get_impl()->is_cpu() : true;
|
||||
}
|
||||
|
||||
// Returns the implementation info string for primitive `id`; the query is forwarded to the owning program.
std::string network::get_implementation_info(const primitive_id& id) const {
|
||||
return _program->get_implementation_info(id);
|
||||
}
|
||||
|
||||
@@ -450,11 +450,7 @@ void InferRequest::enqueue() {
|
||||
FormatFromTensorDesc(blobsDesc),
|
||||
tensor_from_dims(blobsDesc.getDims()));
|
||||
|
||||
auto mergedBlobs = std::make_shared<RemoteCLbuffer>(m_graph->GetContext(),
|
||||
m_graph->GetNetwork()->get_stream(),
|
||||
blobsDesc,
|
||||
layout);
|
||||
mergedBlobs->allocate();
|
||||
auto mergedBlobs = create_remote_blob<RemoteCLbuffer>(blobsDesc, layout, RemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
|
||||
dst = mergedBlobs->buffer().as<uint8_t*>();
|
||||
|
||||
_inputs[name] = mergedBlobs;
|
||||
@@ -602,21 +598,33 @@ Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, bool is_dynamic
|
||||
return blob;
|
||||
}
|
||||
|
||||
Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
|
||||
auto blob = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
|
||||
template<typename RemoteBlobType, typename>
|
||||
InferenceEngine::Blob::Ptr InferRequest::create_remote_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout,
|
||||
const RemoteBlobImpl::BlobType mem_type, void* mem_ptr) {
|
||||
auto blob = std::make_shared<RemoteBlobType>(m_graph->GetContext(),
|
||||
m_graph->GetNetwork()->get_stream(),
|
||||
desc,
|
||||
layout,
|
||||
usm_host_mem,
|
||||
mem_ptr,
|
||||
0,
|
||||
0,
|
||||
RemoteBlobImpl::BlobType::BT_USM_SHARED);
|
||||
if (!blob)
|
||||
IE_THROW(NotAllocated) << "Failed to allocate shared host <-> device blob";
|
||||
mem_type);
|
||||
OPENVINO_ASSERT(blob, "[GPU] Failed to allocate remote blob");
|
||||
blob->allocate();
|
||||
return blob;
|
||||
}
|
||||
|
||||
// Explicit instantiations of create_remote_blob for the two remote blob types
// it is called with here: RemoteCLbuffer and RemoteUSMbuffer.
template InferenceEngine::Blob::Ptr InferRequest::create_remote_blob<RemoteCLbuffer>(const InferenceEngine::TensorDesc&, const cldnn::layout&,
|
||||
const RemoteBlobImpl::BlobType, void*);
|
||||
template InferenceEngine::Blob::Ptr InferRequest::create_remote_blob<RemoteUSMbuffer>(const InferenceEngine::TensorDesc&, const cldnn::layout&,
|
||||
const RemoteBlobImpl::BlobType, void*);
|
||||
|
||||
// Creates a BT_USM_SHARED remote USM blob for the given descriptor and layout,
// passing `usm_host_mem` through to create_remote_blob as the memory pointer.
// Asserts if no blob was produced.
Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
    const auto shared_type = RemoteBlobImpl::BlobType::BT_USM_SHARED;
    auto shared_blob = create_remote_blob<RemoteUSMbuffer>(desc, layout, shared_type, usm_host_mem);

    OPENVINO_ASSERT(shared_blob, "[GPU] Failed to allocate shared host <-> device blob");

    return shared_blob;
}
|
||||
|
||||
void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst) {
|
||||
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::copy_output_data");
|
||||
auto is_convert_needed = [](const Precision& prc) {
|
||||
@@ -836,7 +844,7 @@ std::map<std::string, InferenceEngineProfileInfo> InferRequest::GetPerformanceCo
|
||||
}
|
||||
|
||||
void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_mems, InferenceEngine::Blob::Ptr& user_blob,
|
||||
const cldnn::primitive_id& blob_name, const cldnn::layout& layout) {
|
||||
const cldnn::primitive_id& blob_name, const cldnn::layout& layout, bool need_lockable_mem) {
|
||||
const auto input_ptr = static_cast<const void*>(user_blob->cbuffer());
|
||||
const auto alloc_type = m_graph->GetEngine()->detect_usm_allocation_type(input_ptr);
|
||||
const auto is_usm_host = alloc_type == cldnn::allocation_type::usm_host;
|
||||
@@ -855,7 +863,15 @@ void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_m
|
||||
// so we don't need to allocate new memory
|
||||
can_skip_allocation |= same_host_mem(impl_mem, src_ptr);
|
||||
// Or if blob has any type except usm_host - in that case explicit copy will be performed anyway
|
||||
can_skip_allocation |= impl_mem->get_allocation_type() != cldnn::allocation_type::usm_host;
|
||||
// Or if blob has usm_host type and lockable memory is expected by impl
|
||||
can_skip_allocation |= need_lockable_mem ? impl_mem->get_allocation_type() == cldnn::allocation_type::usm_host
|
||||
: impl_mem->get_allocation_type() != cldnn::allocation_type::usm_host;
|
||||
// In case of lockable memory we need to keep the device's usm_host memory buffer up to date with
|
||||
// the user's blob to avoid incorrect behaviour if the user calls set_blob() with
|
||||
// the following sequence (usm_host, system_host, usm_host, system_host...)
|
||||
if (need_lockable_mem)
|
||||
can_skip_allocation &= users_blobs_matching.find(blob_name) != users_blobs_matching.end()
|
||||
&& users_blobs_matching[blob_name] == user_blob;
|
||||
}
|
||||
|
||||
if (!can_skip_allocation) {
|
||||
@@ -863,9 +879,13 @@ void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_m
|
||||
// For USM case we create host blob using custom USM host allocator
|
||||
// and then create shared device blob on top of this buffer
|
||||
device_mems[blob_name] = create_shared_device_blob(user_blob->getTensorDesc(), layout, user_blob->buffer().as<void*>());
|
||||
} else if (need_lockable_mem) {
|
||||
device_mems[blob_name] =
|
||||
create_remote_blob<RemoteUSMbuffer>(user_blob->getTensorDesc(), layout, RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
|
||||
} else {
|
||||
device_mems[blob_name] = create_device_blob(user_blob->getTensorDesc());
|
||||
}
|
||||
users_blobs_matching[blob_name] = user_blob;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -980,7 +1000,8 @@ void InferRequest::prepare_output(const cldnn::primitive_id& outputName, Blob::P
|
||||
const bool is_dev_input = remote_ptr != nullptr;
|
||||
|
||||
if (is_static && can_use_usm && !is_dev_input) {
|
||||
allocate_dev_mem_if_needed(_deviceOutputs, outputBlob, outputName, output_layout);
|
||||
auto is_cpu_impl = m_graph->GetNetwork()->is_cpu_impl(output_id);
|
||||
allocate_dev_mem_if_needed(_deviceOutputs, outputBlob, outputName, output_layout, is_cpu_impl);
|
||||
}
|
||||
|
||||
OPENVINO_ASSERT(!is_static || _deviceOutputs.find(outputName) != _deviceOutputs.end(),
|
||||
@@ -1010,23 +1031,9 @@ InferenceEngine::Blob::Ptr InferRequest::create_device_blob(const InferenceEngin
|
||||
auto l = cldnn::layout(shape, dt, format);
|
||||
|
||||
if (m_graph->GetEngine()->use_unified_shared_memory()) {
|
||||
auto blobPtr = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
|
||||
m_graph->GetNetwork()->get_stream(),
|
||||
desc,
|
||||
l,
|
||||
nullptr,
|
||||
0,
|
||||
0,
|
||||
RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
|
||||
getBlobImpl(blobPtr.get())->allocate();
|
||||
return blobPtr;
|
||||
return create_remote_blob<RemoteUSMbuffer>(desc, l, RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
|
||||
} else {
|
||||
auto blobPtr = std::make_shared<RemoteCLbuffer>(m_graph->GetContext(),
|
||||
m_graph->GetNetwork()->get_stream(),
|
||||
desc,
|
||||
l);
|
||||
getBlobImpl(blobPtr.get())->allocate();
|
||||
return blobPtr;
|
||||
return create_remote_blob<RemoteCLbuffer>(desc, l, RemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -189,6 +189,68 @@ TEST(canSwapTensorsBetweenInferRequests, inputs) {
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that an output tensor borrowed from another infer request (presumably usm_host-backed,
// per the test name) keeps its contents after the request is switched to a different output tensor
// and inferred again.
TEST(smoke_InferRequestDeviceMemoryAllocation, usmHostIsNotChanged) {
    auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);

    auto ie = ov::Core();
    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);

    ov::InferRequest infer_request1 = compiled_model.create_infer_request();
    ov::InferRequest infer_request2 = compiled_model.create_infer_request();

    auto input_tensor1 = infer_request1.get_input_tensor();
    FuncTestUtils::fill_tensor(input_tensor1, 20, 0, 1, 0);

    auto output_tensor1 = FuncTestUtils::create_and_fill_tensor(compiled_model.output().get_element_type(), compiled_model.output().get_shape());
    auto output_tensor2 = infer_request2.get_output_tensor();

    // Use tensor from infer request #2 as an output for infer request #1
    infer_request1.set_output_tensor(output_tensor2);
    ASSERT_NO_THROW(infer_request1.infer());

    // Modify tensor somehow and save as a reference values
    FuncTestUtils::fill_tensor(output_tensor2);

    // Size the reference buffer in elements, not bytes: a byte-sized vector<float> would be
    // over-allocated and its size() would no longer match the tensor's element count.
    std::vector<float> ref_values(output_tensor2.get_size());
    std::memcpy(ref_values.data(), output_tensor2.data(), output_tensor2.get_byte_size());

    // Perform second infer() call with a system host memory tensor
    infer_request1.set_output_tensor(output_tensor1);
    ASSERT_NO_THROW(infer_request1.infer());

    // Expect that output_tensor2 will not change its data after infer() call
    auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
    FuncTestUtils::compareRawBuffers(ref_values.data(),
                                     output_tensor2.data<float>(),
                                     ref_values.size(),
                                     ov::shape_size(output_tensor2.get_shape()),
                                     thr);
}
|
||||
|
||||
// Checks that an infer request can first run with an output tensor taken from another
// infer request and then run again with a separately created output tensor, without throwing.
TEST(smoke_InferRequestDeviceMemoryAllocation, canSetSystemHostTensor) {
    auto model = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);

    auto core = ov::Core();
    auto compiled = core.compile_model(model, CommonTestUtils::DEVICE_GPU);

    ov::InferRequest request = compiled.create_infer_request();
    ov::InferRequest donor_request = compiled.create_infer_request();

    auto input = request.get_input_tensor();
    FuncTestUtils::fill_tensor(input, 20, 0, 1, 0);

    auto created_output = FuncTestUtils::create_and_fill_tensor(compiled.output().get_element_type(), compiled.output().get_shape());
    auto donor_output = donor_request.get_output_tensor();

    // First run: write into the tensor owned by the other infer request
    request.set_output_tensor(donor_output);
    ASSERT_NO_THROW(request.infer());

    // Second run: refill the input and switch to the separately created tensor
    FuncTestUtils::fill_tensor(input, 10, 0, 1, 1);
    request.set_output_tensor(created_output);
    ASSERT_NO_THROW(request.infer());
}
|
||||
|
||||
TEST(canSwapTensorsBetweenInferRequests, outputs) {
|
||||
std::vector<std::vector<uint8_t>> ref;
|
||||
std::vector<ov::Tensor> input_tensors;
|
||||
|
||||
Reference in New Issue
Block a user