[GPU] Fix lockable memory allocation (#14134)

* [GPU] Fix lockable memory allocation

* Apply review comments

* [GPU] Replace usm handle sharing with independent blob creation

* Prohibit all other remote blob types

* [GPU] Fix for nullptr impl
This commit is contained in:
Sergey Shlyapnikov
2022-11-25 12:07:47 +04:00
committed by GitHub
parent 823ea7c68a
commit 637f1cdf8f
5 changed files with 120 additions and 35 deletions

View File

@@ -185,6 +185,7 @@ public:
void validate_primitives();
void set_arguments();
// Implementation specific calls
bool is_cpu_impl(const primitive_id& id) const;
std::shared_ptr<primitive_inst> get_primitive(const primitive_id& id);
std::shared_ptr<const primitive_inst> get_primitive(const primitive_id& id) const;
std::string get_primitive_info(const primitive_id& id) const;
@@ -260,8 +261,8 @@ private:
void allocate_primitive_instance(program_node const& node);
void transfer_memory_to_device(std::shared_ptr<primitive_inst> instance, program_node const& node);
void add_to_exec_order(const primitive_id& id);
std::shared_ptr<primitive_inst> find_in_internal_networks(const primitive_id& id);
std::shared_ptr<primitive_inst> find_primitive(const primitive_id& id);
std::shared_ptr<primitive_inst> find_in_internal_networks(const primitive_id& id) const;
std::shared_ptr<primitive_inst> find_primitive(const primitive_id& id) const;
void check_names();
void add_default_output_chains();
output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);

View File

@@ -59,6 +59,7 @@ public:
private:
// This blob is used for outputs processing if output data type conversion or padding handling is needed
InferenceEngine::Blob::Ptr intermediate_output_blob = nullptr;
InferenceEngine::BlobMap users_blobs_matching;
InferenceEngine::BlobMap _deviceOutputs;
std::map<std::string, cldnn::primitive_id> inputsMap;
std::map<std::string, cldnn::primitive_id> outputsMap;
@@ -76,7 +77,8 @@ private:
std::vector<cldnn::event::ptr>& dependencies);
void prepare_output(const cldnn::primitive_id& outputName, InferenceEngine::Blob::Ptr& outputBlob);
void allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_mems, InferenceEngine::Blob::Ptr& user_blob,
const cldnn::primitive_id& blob_name, const cldnn::layout& layout);
const cldnn::primitive_id& blob_name, const cldnn::layout& layout,
const bool need_lockable_mem = false);
InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc, bool is_dynamic);
InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc);
@@ -85,6 +87,10 @@ private:
void copy_input_data(std::shared_ptr<cldnn::network> network, const cldnn::primitive_id& inputName,
const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob);
template<typename RemoteBlobType, typename = typename std::enable_if<std::is_same<RemoteBlobType, RemoteCLbuffer>::value ||
std::is_same<RemoteBlobType, RemoteUSMbuffer>::value>::type>
InferenceEngine::Blob::Ptr create_remote_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout,
const RemoteBlobImpl::BlobType mem_type, void* mem_ptr = nullptr);
InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem);
void allocate_inputs();
void allocate_outputs();

View File

@@ -693,7 +693,7 @@ void cldnn::network::check_names() {
}
}
std::shared_ptr<primitive_inst> cldnn::network::find_primitive(const primitive_id& id) {
std::shared_ptr<primitive_inst> cldnn::network::find_primitive(const primitive_id& id) const {
std::shared_ptr<primitive_inst> ret;
if (_primitives.find(id) != _primitives.end())
@@ -702,7 +702,7 @@ std::shared_ptr<primitive_inst> cldnn::network::find_primitive(const primitive_i
return find_in_internal_networks(id);
}
std::shared_ptr<primitive_inst> cldnn::network::find_in_internal_networks(const primitive_id& id) {
std::shared_ptr<primitive_inst> cldnn::network::find_in_internal_networks(const primitive_id& id) const {
std::shared_ptr<primitive_inst> ret;
for (auto const& prim : _primitives) {
@@ -724,6 +724,15 @@ std::string network::get_primitive_info(const primitive_id& id) const {
return node.type()->to_string(node);
}
// Returns true if the implementation chosen for the given primitive runs on CPU.
// If the implementation has not been selected yet (nullptr impl), the primitive is
// conservatively reported as a CPU impl, so callers (e.g. output allocation) fall
// back to lockable (usm_host) memory which is valid for either case.
// Throws via OPENVINO_ASSERT when the topology has no primitive with this id.
bool network::is_cpu_impl(const primitive_id& id) const {
    auto prim_inst = find_primitive(id);
    // Note: trailing space added so the two concatenated message literals do not
    // fuse into "topologydoesn't".
    OPENVINO_ASSERT(prim_inst, "[GPU] Can't get implementation type, since topology ",
                    "doesn't contain primitive with requested id: ", id);
    return prim_inst->get_impl() ? prim_inst->get_impl()->is_cpu() : true;
}
std::string network::get_implementation_info(const primitive_id& id) const {
return _program->get_implementation_info(id);
}

View File

@@ -450,11 +450,7 @@ void InferRequest::enqueue() {
FormatFromTensorDesc(blobsDesc),
tensor_from_dims(blobsDesc.getDims()));
auto mergedBlobs = std::make_shared<RemoteCLbuffer>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
blobsDesc,
layout);
mergedBlobs->allocate();
auto mergedBlobs = create_remote_blob<RemoteCLbuffer>(blobsDesc, layout, RemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
dst = mergedBlobs->buffer().as<uint8_t*>();
_inputs[name] = mergedBlobs;
@@ -602,21 +598,33 @@ Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, bool is_dynamic
return blob;
}
Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
auto blob = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
template<typename RemoteBlobType, typename>
InferenceEngine::Blob::Ptr InferRequest::create_remote_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout,
const RemoteBlobImpl::BlobType mem_type, void* mem_ptr) {
auto blob = std::make_shared<RemoteBlobType>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
desc,
layout,
usm_host_mem,
mem_ptr,
0,
0,
RemoteBlobImpl::BlobType::BT_USM_SHARED);
if (!blob)
IE_THROW(NotAllocated) << "Failed to allocate shared host <-> device blob";
mem_type);
OPENVINO_ASSERT(blob, "[GPU] Failed to allocate remote blob");
blob->allocate();
return blob;
}
template InferenceEngine::Blob::Ptr InferRequest::create_remote_blob<RemoteCLbuffer>(const InferenceEngine::TensorDesc&, const cldnn::layout&,
const RemoteBlobImpl::BlobType, void*);
template InferenceEngine::Blob::Ptr InferRequest::create_remote_blob<RemoteUSMbuffer>(const InferenceEngine::TensorDesc&, const cldnn::layout&,
const RemoteBlobImpl::BlobType, void*);
// Creates a device blob of type BT_USM_SHARED on top of an existing usm_host
// allocation (usm_host_mem), so host and device operate on the same memory
// without explicit copies. Asserts if blob creation fails.
Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
auto blob = create_remote_blob<RemoteUSMbuffer>(desc, layout, RemoteBlobImpl::BlobType::BT_USM_SHARED, usm_host_mem);
OPENVINO_ASSERT(blob, "[GPU] Failed to allocate shared host <-> device blob");
return blob;
}
void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::copy_output_data");
auto is_convert_needed = [](const Precision& prc) {
@@ -836,7 +844,7 @@ std::map<std::string, InferenceEngineProfileInfo> InferRequest::GetPerformanceCo
}
void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_mems, InferenceEngine::Blob::Ptr& user_blob,
const cldnn::primitive_id& blob_name, const cldnn::layout& layout) {
const cldnn::primitive_id& blob_name, const cldnn::layout& layout, bool need_lockable_mem) {
const auto input_ptr = static_cast<const void*>(user_blob->cbuffer());
const auto alloc_type = m_graph->GetEngine()->detect_usm_allocation_type(input_ptr);
const auto is_usm_host = alloc_type == cldnn::allocation_type::usm_host;
@@ -855,7 +863,15 @@ void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_m
// so we don't need to allocate new memory
can_skip_allocation |= same_host_mem(impl_mem, src_ptr);
// Or if blob has any type except usm_host - in that case explicit copy will be performed anyway
can_skip_allocation |= impl_mem->get_allocation_type() != cldnn::allocation_type::usm_host;
// Or if blob has usm_host type and lockable memory is expected by impl
can_skip_allocation |= need_lockable_mem ? impl_mem->get_allocation_type() == cldnn::allocation_type::usm_host
: impl_mem->get_allocation_type() != cldnn::allocation_type::usm_host;
// In case of lockable memory we need to keep the device's usm_host memory buffer
// in sync with the user's blob, to avoid incorrect behaviour if the user calls
// set_blob() with the following sequence (usm_host, system_host, usm_host, system_host...)
if (need_lockable_mem)
can_skip_allocation &= users_blobs_matching.find(blob_name) != users_blobs_matching.end()
&& users_blobs_matching[blob_name] == user_blob;
}
if (!can_skip_allocation) {
@@ -863,9 +879,13 @@ void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_m
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
device_mems[blob_name] = create_shared_device_blob(user_blob->getTensorDesc(), layout, user_blob->buffer().as<void*>());
} else if (need_lockable_mem) {
device_mems[blob_name] =
create_remote_blob<RemoteUSMbuffer>(user_blob->getTensorDesc(), layout, RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
} else {
device_mems[blob_name] = create_device_blob(user_blob->getTensorDesc());
}
users_blobs_matching[blob_name] = user_blob;
}
}
@@ -980,7 +1000,8 @@ void InferRequest::prepare_output(const cldnn::primitive_id& outputName, Blob::P
const bool is_dev_input = remote_ptr != nullptr;
if (is_static && can_use_usm && !is_dev_input) {
allocate_dev_mem_if_needed(_deviceOutputs, outputBlob, outputName, output_layout);
auto is_cpu_impl = m_graph->GetNetwork()->is_cpu_impl(output_id);
allocate_dev_mem_if_needed(_deviceOutputs, outputBlob, outputName, output_layout, is_cpu_impl);
}
OPENVINO_ASSERT(!is_static || _deviceOutputs.find(outputName) != _deviceOutputs.end(),
@@ -1010,23 +1031,9 @@ InferenceEngine::Blob::Ptr InferRequest::create_device_blob(const InferenceEngin
auto l = cldnn::layout(shape, dt, format);
if (m_graph->GetEngine()->use_unified_shared_memory()) {
auto blobPtr = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
desc,
l,
nullptr,
0,
0,
RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
getBlobImpl(blobPtr.get())->allocate();
return blobPtr;
return create_remote_blob<RemoteUSMbuffer>(desc, l, RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
} else {
auto blobPtr = std::make_shared<RemoteCLbuffer>(m_graph->GetContext(),
m_graph->GetNetwork()->get_stream(),
desc,
l);
getBlobImpl(blobPtr.get())->allocate();
return blobPtr;
return create_remote_blob<RemoteCLbuffer>(desc, l, RemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
}
}

View File

@@ -189,6 +189,68 @@ TEST(canSwapTensorsBetweenInferRequests, inputs) {
}
}
// Checks that a user-provided output tensor (backed by the plugin's usm_host
// memory of another infer request) keeps its data intact after it is swapped
// for a system-host tensor and infer() is executed again.
TEST(smoke_InferRequestDeviceMemoryAllocation, usmHostIsNotChanged) {
auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
auto ie = ov::Core();
auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
ov::InferRequest infer_request2 = compiled_model.create_infer_request();
auto input_tensor1 = infer_request1.get_input_tensor();
FuncTestUtils::fill_tensor(input_tensor1, 20, 0, 1, 0);
// output_tensor1 is a plain (system) host tensor; output_tensor2 is owned by
// infer request #2 and therefore backed by plugin-allocated memory.
auto output_tensor1 = FuncTestUtils::create_and_fill_tensor(compiled_model.output().get_element_type(), compiled_model.output().get_shape());
auto output_tensor2 = infer_request2.get_output_tensor();
// Use tensor from infer request #2 as an output for infer request #1
infer_request1.set_output_tensor(output_tensor2);
ASSERT_NO_THROW(infer_request1.infer());
// Modify the tensor and save its contents as the reference values
FuncTestUtils::fill_tensor(output_tensor2);
std::vector<float> ref_values;
ref_values.resize(output_tensor2.get_byte_size());
std::memcpy(ref_values.data(), output_tensor2.data(), output_tensor2.get_byte_size());
// Perform second infer() call with a system host memory tensor
infer_request1.set_output_tensor(output_tensor1);
ASSERT_NO_THROW(infer_request1.infer());
// Expect that output_tensor2 will not change its data after the infer() call
auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
FuncTestUtils::compareRawBuffers(ref_values.data(),
output_tensor2.data<float>(),
ref_values.size(),
ov::shape_size(output_tensor2.get_shape()),
thr);
}
// Checks that infer() succeeds when the output tensor is switched from a
// plugin-owned tensor (from another infer request) to a system host tensor
// between two consecutive infer() calls.
TEST(smoke_InferRequestDeviceMemoryAllocation, canSetSystemHostTensor) {
auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
auto ie = ov::Core();
auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
ov::InferRequest infer_request2 = compiled_model.create_infer_request();
auto input_tensor1 = infer_request1.get_input_tensor();
FuncTestUtils::fill_tensor(input_tensor1, 20, 0, 1, 0);
auto output_tensor1 = FuncTestUtils::create_and_fill_tensor(compiled_model.output().get_element_type(), compiled_model.output().get_shape());
auto output_tensor2 = infer_request2.get_output_tensor();
// First run: output goes to the tensor owned by infer request #2
infer_request1.set_output_tensor(output_tensor2);
ASSERT_NO_THROW(infer_request1.infer());
FuncTestUtils::fill_tensor(input_tensor1, 10, 0, 1, 1);
// Second run: output switched to a system host tensor — must not throw
infer_request1.set_output_tensor(output_tensor1);
ASSERT_NO_THROW(infer_request1.infer());
}
TEST(canSwapTensorsBetweenInferRequests, outputs) {
std::vector<std::vector<uint8_t>> ref;
std::vector<ov::Tensor> input_tensors;