From f68dd9041336cdbb29f65cf4525fca7e7832e326 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Thu, 13 Jan 2022 10:52:59 +0300
Subject: [PATCH] [GPU] Get rid of PreAllocator usages to allow setShape (#9576)

---
 .../intel_gpu/plugin/infer_request.hpp        |   5 +-
 .../intel_gpu/src/plugin/infer_request.cpp    | 116 ++++++------------
 .../intel_gpu/src/plugin/remote_context.cpp   |  15 +++
 .../intel_gpu/src/runtime/ocl/ocl_engine.cpp  |  12 ++
 .../intel_gpu/src/runtime/ocl/ocl_ext.hpp     |  11 ++
 .../plugin/gpu/behavior/infer_request.cpp     |  40 ++++++
 .../gpu_remote_tensor_tests.cpp               |  51 ++++++++
 7 files changed, 172 insertions(+), 78 deletions(-)
 create mode 100644 src/tests/functional/plugin/gpu/behavior/infer_request.cpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp
index 3602a1e3f52..98d031bea40 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request.hpp
@@ -89,7 +89,8 @@ private:
                        std::vector<cldnn::event::ptr>& dependencies);
     void prepare_output(const cldnn::primitive_id& outputName, InferenceEngine::Blob::Ptr& outputBlob);
 
-    InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc, uint8_t* mem_ptr = nullptr);
+    InferenceEngine::Blob::Ptr create_host_blob(const InferenceEngine::TensorDesc& desc,
+                                                std::shared_ptr<InferenceEngine::IAllocator> alloc = nullptr);
     InferenceEngine::Blob::Ptr create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout);
 
     void copy_output_data(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr);
@@ -97,7 +98,7 @@ private:
                          const cldnn::layout& inputLayout,
                          const InferenceEngine::Blob &inputBlob,
                          buf_info* bi = nullptr);
-    InferenceEngine::Blob::Ptr host_blob_from_device_blob(const InferenceEngine::Blob::Ptr blobPtr);
+    InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem);
     void allocate_inputs();
     void allocate_outputs();
     void allocate_inputs_dynamic();
diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
index a746ce9b50e..a972e117fef 100644
--- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp
@@ -317,7 +317,6 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
             // during pre-processing
             if (_inputs[name]->is<gpu::ClBlob>()) {
                 Blob::Ptr inputHostBlob = create_host_blob(desc);
-                inputHostBlob->allocate();
                 _inputs[name] = inputHostBlob;
             }
             _preProcData[name] = CreatePreprocDataHelper();
@@ -787,59 +786,27 @@ void InferRequest::setup_stream_graph() {
     m_graph = streamGraphs[streamID];
 }
 
-Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, uint8_t* mem_ptr) {
+Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, std::shared_ptr<InferenceEngine::IAllocator> alloc) {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::create_host_blob");
-    const Precision& p = desc.getPrecision();
+    auto blob = make_blob_with_precision(desc, alloc ? alloc : CreateDefaultAllocator());
+    blob->allocate();
+    return blob;
+}
 
-    switch (p) {
-    case Precision::FP32:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<float>(desc, reinterpret_cast<float*>(mem_ptr));
-        else
-            return make_shared_blob<float>(desc);
-    case Precision::FP16:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<uint16_t>(desc, reinterpret_cast<uint16_t*>(mem_ptr));
-        else
-            return make_shared_blob<uint16_t>(desc);
-    case Precision::I16:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<int16_t>(desc, reinterpret_cast<int16_t*>(mem_ptr));
-        else
-            return make_shared_blob<int16_t>(desc);
-    case Precision::U16:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<uint16_t>(desc, reinterpret_cast<uint16_t*>(mem_ptr));
-        else
-            return make_shared_blob<uint16_t>(desc);
-    case Precision::I32:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<int32_t>(desc, reinterpret_cast<int32_t*>(mem_ptr));
-        else
-            return make_shared_blob<int32_t>(desc);
-    case Precision::I64:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<int64_t>(desc, reinterpret_cast<int64_t*>(mem_ptr));
-        else
-            return make_shared_blob<int64_t>(desc);
-    case Precision::I8:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<int8_t>(desc, reinterpret_cast<int8_t*>(mem_ptr));
-        else
-            return make_shared_blob<int8_t>(desc);
-    case Precision::U8:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<uint8_t>(desc, reinterpret_cast<uint8_t*>(mem_ptr));
-        else
-            return make_shared_blob<uint8_t>(desc);
-    case Precision::BOOL:
-        if (mem_ptr != nullptr)
-            return make_shared_blob<uint8_t>(desc, reinterpret_cast<uint8_t*>(mem_ptr));
-        else
-            return make_shared_blob<uint8_t>(desc);
-    default:
-        IE_THROW(NotImplemented) << "The plugin does not support " << p.name() << " blob precision";
-    }
+Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
+    auto blob = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
+                                                  m_graph->GetNetwork()->get_stream(),
+                                                  desc,
+                                                  layout,
+                                                  usm_host_mem,
+                                                  0,
+                                                  0,
+                                                  RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
+    if (!blob)
+        IE_THROW(NotAllocated) << "Failed to allocate shared host <-> device blob";
+    blob->allocate();
+
+    return blob;
 }
 
 void InferRequest::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi) {
@@ -907,21 +874,6 @@ void InferRequest::copy_input_data(std::shared_ptr<cldnn::network> network,
     }
 }
 
-Blob::Ptr InferRequest::host_blob_from_device_blob(Blob::Ptr blobPtr) {
-    uint8_t* bufferMem = nullptr;
-    auto clblobPtr = std::dynamic_pointer_cast<gpu::ClBlob>(blobPtr);
-    if (clblobPtr) {
-        const auto memPtr = getBlobImpl(clblobPtr.get())->getMemory();
-        if (memPtr->get_allocation_type() == cldnn::allocation_type::usm_host) {
-            bufferMem = reinterpret_cast<uint8_t*>(memPtr->get_internal_params().mem);
-        }
-    }
-    Blob::Ptr hostBlob = create_host_blob(blobPtr->getTensorDesc(), bufferMem);
-    hostBlob->allocate();
-
-    return hostBlob;
-}
-
 void InferRequest::allocate_inputs() {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_inputs");
     auto inputLayouts = m_graph->GetInputLayouts();
@@ -964,12 +916,18 @@ void InferRequest::allocate_inputs() {
             auto blobPtr = create_device_blob(desc_fp32, litr->second);
             _deviceInputs[name] = blobPtr;
             Blob::Ptr inputBlob = create_host_blob(desc);
-            inputBlob->allocate();
             _inputs[name] = inputBlob;
         } else {
-            auto blobPtr = create_device_blob(desc, litr->second);
-            _deviceInputs[name] = blobPtr;
-            _inputs[name] = host_blob_from_device_blob(blobPtr);
+            if (m_graph->GetEngine()->use_unified_shared_memory()) {
+                // For USM case we create host blob using custom USM host allocator
+                // and then create shared device blob on top of this buffer
+                auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
+                _inputs[name] = host_blob;
+                _deviceInputs[name] = create_shared_device_blob(desc, litr->second, host_blob->buffer().as<void*>());
+            } else {
+                _inputs[name] = create_host_blob(desc);
+                _deviceInputs[name] = create_device_blob(desc, litr->second);
+            }
         }
     }
 }
@@ -996,7 +954,6 @@ void InferRequest::allocate_inputs_dynamic() {
             fp32inputBlob->allocate();
             _inputs[input.first + fp32_suffix] = fp32inputBlob;
         }
-        inputBlob->allocate();
         _inputs[input.first] = inputBlob;
     }
 }
@@ -1013,10 +970,18 @@ void InferRequest::allocate_outputs() {
         GPU_DEBUG_IF(debug_config->verbose >= 2) {
             GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
         }
-        auto blobPtr = create_device_blob(desc, output_layout);
-        _deviceOutputs[no.first] = blobPtr;
-        _outputs[no.first] = host_blob_from_device_blob(blobPtr);
+        outputsMap[no.first] = outputID;
+        if (m_graph->GetEngine()->use_unified_shared_memory()) {
+            // For USM case we create host blob using custom USM host allocator
+            // and then create shared device blob on top of this buffer
+            auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
+            _outputs[no.first] = host_blob;
+            _deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
+        } else {
+            _outputs[no.first] = create_host_blob(desc);
+            _deviceOutputs[no.first] = create_device_blob(desc, output_layout);
+        }
     }
 }
@@ -1036,7 +1001,6 @@ void InferRequest::allocate_outputs_dynamic() {
         }
 
         Blob::Ptr outputBlob = create_host_blob(desc);
-        outputBlob->allocate();
         _outputs[no.first] = outputBlob;
         outputsMap[no.first] = outputID;
     }
diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
index ea0f54fe238..201e3e50e05 100644
--- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp
+++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
@@ -26,6 +26,21 @@ RemoteBlobImpl::RemoteBlobImpl(ClContext::Ptr context,
         BlobType mem_type) :
     m_context(context), m_stream(stream), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane),
     _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedHolder(nullptr) {
+    auto _impl = getContextImpl(m_context.lock());
+    auto eng = _impl->GetEngine();
+
+    // Verify shared buffer/usm memory and ensure that requested byte size is not greater than allocated one
+    switch (m_mem_type) {
+    case BlobType::BT_BUF_SHARED: {
+        eng->share_buffer(m_layout, m_mem);
+        break;
+    }
+    case BlobType::BT_USM_SHARED: {
+        eng->share_usm(m_layout, m_mem);
+        break;
+    }
+    default: break;
+    }
 }
 
 ParamMap RemoteBlobImpl::getParams() const {
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
index a0b2774ef23..8e2d83d78de 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -167,9 +167,21 @@ memory::ptr ocl_engine::reinterpret_handle(const layout& new_layout, shared_mem_params params) {
 #endif
     } else if (params.mem_type == shared_mem_type::shared_mem_buffer) {
         cl::Buffer buf(static_cast<cl_mem>(params.mem), true);
+        auto actual_mem_size = buf.getInfo<CL_MEM_SIZE>();
+        auto requested_mem_size = new_layout.bytes_count();
+        if (actual_mem_size < requested_mem_size) {
+            throw std::runtime_error("[GPU] shared buffer has smaller size (" + std::to_string(actual_mem_size) +
+                                     ") than specified layout (" + std::to_string(requested_mem_size) + ")");
+        }
         return std::make_shared<gpu_buffer>(this, new_layout, buf);
     } else if (params.mem_type == shared_mem_type::shared_mem_usm) {
         cl::UsmMemory usm_buffer(get_usm_helper(), params.mem);
+        auto actual_mem_size = get_usm_helper().get_usm_allocation_size(usm_buffer.get());
+        auto requested_mem_size = new_layout.bytes_count();
+        if (actual_mem_size < requested_mem_size) {
+            throw std::runtime_error("[GPU] shared USM buffer has smaller size (" + std::to_string(actual_mem_size) +
+                                     ") than specified layout (" + std::to_string(requested_mem_size) + ")");
+        }
         return std::make_shared<gpu_usm>(this, new_layout, usm_buffer);
     } else {
         throw std::runtime_error("unknown shared object fromat or type");
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
index 512bc5a4204..1e1a23ed7ab 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp
@@ -633,6 +633,17 @@ public:
         return ret_val;
     }
 
+    size_t get_usm_allocation_size(const void* usm_ptr) const {
+        if (!_get_mem_alloc_info_fn) {
+            throw std::runtime_error("[GPU] clGetMemAllocInfoINTEL is nullptr");
+        }
+
+        size_t ret_val;
+        size_t ret_val_size;
+        _get_mem_alloc_info_fn(_ctx.get(), usm_ptr, CL_MEM_ALLOC_SIZE_INTEL, sizeof(size_t), &ret_val, &ret_val_size);
+        return ret_val;
+    }
+
 private:
     cl::Context _ctx;
     cl::Device _device;
diff --git a/src/tests/functional/plugin/gpu/behavior/infer_request.cpp b/src/tests/functional/plugin/gpu/behavior/infer_request.cpp
new file mode 100644
index 00000000000..aa1e7b925df
--- /dev/null
+++ b/src/tests/functional/plugin/gpu/behavior/infer_request.cpp
@@ -0,0 +1,40 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include
+#include
+#include
+#include
+
+#include "openvino/runtime/core.hpp"
+
+#include
+#include "ngraph_functions/subgraph_builders.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+#include "openvino/core/preprocess/pre_post_process.hpp"
+#include "transformations/utils/utils.hpp"
+
+using namespace ::testing;
+
+TEST(TensorTest, smoke_canSetShapeForPreallocatedTensor) {
+    auto ie = ov::runtime::Core();
+    using namespace ov::preprocess;
+    auto p = PrePostProcessor(ngraph::builder::subgraph::makeSplitMultiConvConcat());
+    p.input().tensor().set_element_type(ov::element::i8);
+    p.input().preprocess().convert_element_type(ov::element::f32);
+
+    auto function = p.build();
+    auto exec_net = ie.compile_model(function, CommonTestUtils::DEVICE_GPU);
+    auto inf_req = exec_net.create_infer_request();
+
+    // Check set_shape call for pre-allocated input/output tensors
+    auto input_tensor = inf_req.get_input_tensor(0);
+    ASSERT_NO_THROW(input_tensor.set_shape({1, 4, 20, 20}));
+    ASSERT_NO_THROW(input_tensor.set_shape({1, 3, 20, 20}));
+    ASSERT_NO_THROW(input_tensor.set_shape({2, 3, 20, 20}));
+    auto output_tensor = inf_req.get_output_tensor(0);
+    ASSERT_NO_THROW(output_tensor.set_shape({1, 10, 12, 12}));
+    ASSERT_NO_THROW(output_tensor.set_shape({1, 10, 10, 10}));
+    ASSERT_NO_THROW(output_tensor.set_shape({2, 10, 20, 20}));
+}
diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp
index cf3fc5772e8..90b460cd4db 100644
--- a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp
+++ b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp
@@ -87,6 +87,57 @@ public:
     }
 };
 
+TEST_P(OVRemoteTensorInputBlob_Test, smoke_cantCreateBlobWithInvalidSize) {
+    RemoteTensorSharingType sharing_type;
+    bool with_auto_batching;
+    std::tie(sharing_type, with_auto_batching) = GetParam();
+    if (with_auto_batching)
+        GTEST_SKIP();
+
+    if (sharing_type == RemoteTensorSharingType::PLUGIN_CL_TENSOR ||
+        sharing_type == RemoteTensorSharingType::PLUGIN_USM_HOST_TENSOR ||
+        sharing_type == RemoteTensorSharingType::PLUGIN_USM_DEVICE_TENSOR ||
+        sharing_type == RemoteTensorSharingType::PLUGIN_HOST_TENSOR)
+        GTEST_SKIP();
+
+    auto ie = ov::runtime::Core();
+    auto cldnn_context = ie.get_default_context(deviceName).as<ov::runtime::intel_gpu::ocl::ClContext>();
+    cl_context ctx = cldnn_context;
+    auto ocl_instance = std::make_shared<OpenCL>(ctx);
+    cl_int err;
+
+    ov::Shape invalid_shape = {1, 20, 30, 40};
+
+    auto imSize = ov::shape_size(ov::Shape({1, 2, 3, 4}));
+
+    switch (sharing_type) {
+        case RemoteTensorSharingType::USER_CL_TENSOR: {
+            cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
+            ASSERT_ANY_THROW(cldnn_context.create_tensor(ov::element::i8, invalid_shape, shared_buffer));
+            break;
+        }
+        case RemoteTensorSharingType::USER_USM_DEVICE_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            void* shared_buffer = ocl_instance->allocate_usm_device_buffer(imSize);
+            ASSERT_ANY_THROW(cldnn_context.create_tensor(ov::element::i8, invalid_shape, shared_buffer));
+            ocl_instance->free_mem(shared_buffer);
+            break;
+        }
+        case RemoteTensorSharingType::USER_USM_HOST_TENSOR: {
+            if (!ocl_instance->supports_usm())
+                GTEST_SKIP();
+
+            void* shared_buffer = ocl_instance->allocate_usm_host_buffer(imSize);
+            ASSERT_ANY_THROW(cldnn_context.create_tensor(ov::element::i8, invalid_shape, shared_buffer));
+            ocl_instance->free_mem(shared_buffer);
+            break;
+        }
+        default: break;
+    }
+}
+
 TEST_P(OVRemoteTensorInputBlob_Test, smoke_canInputRemoteTensor) {
 #if defined(ANDROID)
     GTEST_SKIP();