From 6c6aa8fa957677bea8db9129c5ee8fef35753fee Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Tue, 1 Mar 2022 15:15:04 +0300
Subject: [PATCH] [GPU] Fix RemoteBlob lock() and ulock() behaviour in case of
 multiple threads (#10685)

* [GPU] Fix RemoteBlob lock() and ulock() behaviour in case of multiple
  threads and add tests
---
 .../intel_gpu/plugin/remote_context.hpp       |  2 +
 .../intel_gpu/src/plugin/remote_context.cpp   | 20 +++++--
 .../cldnn_remote_blob_tests.cpp               | 59 +++++++++++++++++++
 3 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
index c8334220e17..5ddefc3dc70 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
@@ -89,6 +89,8 @@ protected:
 
     cldnn::memory::ptr m_memObject;
 
+    mutable std::mutex lockedMutex;
+    mutable size_t lockedCounter;
     mutable std::unique_ptr<cldnn::mem_lock<uint8_t>> lockedHolder;
     mutable void* _handle;
     mutable std::shared_ptr<InferenceEngine::IAllocator> _allocator;
diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
index 0720ec68eb4..1eaf01aea0b 100644
--- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp
+++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
@@ -25,7 +25,7 @@ RemoteBlobImpl::RemoteBlobImpl(ClContext::Ptr context,
     uint32_t plane,
     BlobType mem_type) :
     m_context(context), m_stream(stream), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane),
-    _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedHolder(nullptr) {
+    _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedCounter(0), lockedHolder(nullptr) {
     auto _impl = getContextImpl(m_context.lock());
     auto eng = _impl->GetEngine();
 
@@ -189,14 +189,22 @@ void RemoteBlobImpl::lock() const {
     if (!is_allocated()) {
         IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated";
     }
-    lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
-    auto ptr = lockedHolder->data();
-    _handle = reinterpret_cast<void*>(ptr);
-    m_allocator.regLockedBlob(_handle, this);
+
+    std::lock_guard<std::mutex> locker(lockedMutex);
+    if (lockedCounter == 0) {
+        lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
+        auto ptr = lockedHolder->data();
+        _handle = reinterpret_cast<void*>(ptr);
+        m_allocator.regLockedBlob(_handle, this);
+    }
+    lockedCounter++;
 }
 
 void RemoteBlobImpl::unlock() const {
-    lockedHolder.reset();
+    std::lock_guard<std::mutex> locker(lockedMutex);
+    lockedCounter--;
+    if (lockedCounter == 0)
+        lockedHolder.reset();
 }
 
 LockedMemory<void> RemoteBlobImpl::buffer() noexcept {
diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
index c3f30667002..0cab1150cf9 100644
--- a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
+++ b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
@@ -6,6 +6,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <thread>
 
 #include <ie_compound_blob.h>
 
@@ -98,6 +99,64 @@ TEST_P(RemoteBlob_Test, smoke_canInputUserBlob) {
     }
 }
 
+TEST_P(RemoteBlob_Test, smoke_canUseRemoteBlobSimultaneously) {
+#if defined(ANDROID)
+    GTEST_SKIP();
+#endif
+    const int batch = 2;
+    const int channels = 3;
+    const int height = 512;
+    const int width = 512;
+    const size_t img_size = batch * channels * height * width;
+    cl_int err;
+
+    const InferenceEngine::TensorDesc tensor_desc{InferenceEngine::Precision::U8,
+                                                  {batch, channels, height, width},
+                                                  InferenceEngine::Layout::NHWC};
+
+    InferenceEngine::Blob::Ptr ref_blob = FuncTestUtils::createAndFillBlob(tensor_desc);
+
+    auto ie = PluginCache::get().ie();
+    auto ocl_instance = std::make_shared<OpenCL>();
+    ocl_instance->_queue = cl::CommandQueue(ocl_instance->_context, ocl_instance->_device);
+
+    // Allocate OpenCL buffer for data
+    cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, img_size, NULL, &err);
+
+    // Create shared context
+    auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
+
+    // Wrap buffer above with IE blob
+    Blob::Ptr shared_blob = make_shared_blob(tensor_desc, remote_context, shared_buffer);
+    // Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl
+    // TODO: Why do we need to call it explicitly? Consider doing it internally
+    shared_blob->allocate();
+
+    // Copy data from ordinary blob to OpenCL buffer
+    {
+        void* buffer = ref_blob->buffer();
+        ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, img_size, buffer);
+    }
+
+    // Lock remote buffer in multiple threads and compare data with ordinary one
+    const int threads_num = 8;
+    std::vector<std::thread> threads;
+    for (int i = 0; i < threads_num; i++) {
+        threads.emplace_back(std::thread{[&] {
+            auto ref_blob_buf = ref_blob->cbuffer();
+            auto ref_blob_ptr = ref_blob_buf.as<const uint8_t*>();
+            auto remote_blob_buf = shared_blob->cbuffer();
+            auto remote_blob_ptr = remote_blob_buf.as<const uint8_t*>();
+            ASSERT_EQ(ref_blob->size(), shared_blob->size());
+            for (size_t j = 0; j < ref_blob->size(); j++) {
+                ASSERT_EQ(ref_blob_ptr[j], remote_blob_ptr[j]);
+            }
+        }});
+    }
+
+    for (auto& t : threads)
+        t.join();
+}
 
 TEST_P(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) {
 #if defined(ANDROID)