From 6c6aa8fa957677bea8db9129c5ee8fef35753fee Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Tue, 1 Mar 2022 15:15:04 +0300
Subject: [PATCH] [GPU] Fix RemoteBlob lock() and ulock() behaviour in case of
 multiple threads (#10685)

* [GPU] Fix RemoteBlob lock() and ulock() behaviour in case of multiple
  threads and add tests
---
 .../intel_gpu/plugin/remote_context.hpp       |  2 +
 .../intel_gpu/src/plugin/remote_context.cpp   | 20 +++++--
 .../cldnn_remote_blob_tests.cpp               | 59 +++++++++++++++++++
 3 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
index c8334220e17..5ddefc3dc70 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/remote_context.hpp
@@ -89,6 +89,8 @@ protected:
 
     cldnn::memory::ptr m_memObject;
 
+    mutable std::mutex lockedMutex;
+    mutable size_t lockedCounter;
     mutable std::unique_ptr<cldnn::mem_lock<uint8_t>> lockedHolder;
     mutable void* _handle;
     mutable std::shared_ptr<InferenceEngine::IAllocator> _allocator;
diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
index 0720ec68eb4..1eaf01aea0b 100644
--- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp
+++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp
@@ -25,7 +25,7 @@ RemoteBlobImpl::RemoteBlobImpl(ClContext::Ptr context,
     uint32_t plane,
     BlobType mem_type) :
     m_context(context), m_stream(stream), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane),
-    _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedHolder(nullptr) {
+    _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedCounter(0), lockedHolder(nullptr) {
     auto _impl = getContextImpl(m_context.lock());
     auto eng = _impl->GetEngine();
 
@@ -189,14 +189,22 @@ void RemoteBlobImpl::lock() const {
     if (!is_allocated()) {
         IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated";
     }
-    lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
-    auto ptr = lockedHolder->data();
-    _handle = reinterpret_cast<void*>(ptr);
-    m_allocator.regLockedBlob(_handle, this);
+
+    std::lock_guard<std::mutex> locker(lockedMutex);
+    if (lockedCounter == 0) {
+        lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
+        auto ptr = lockedHolder->data();
+        _handle = reinterpret_cast<void*>(ptr);
+        m_allocator.regLockedBlob(_handle, this);
+    }
+    lockedCounter++;
 }
 
 void RemoteBlobImpl::unlock() const {
-    lockedHolder.reset();
+    std::lock_guard<std::mutex> locker(lockedMutex);
+    lockedCounter--;
+    if (lockedCounter == 0)
+        lockedHolder.reset();
 }
 
 LockedMemory<void> RemoteBlobImpl::buffer() noexcept {
diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
index c3f30667002..0cab1150cf9 100644
--- a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
+++ b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
@@ -6,6 +6,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <thread>
 
 #include <ie_compound_blob.h>
 
@@ -98,6 +99,64 @@ TEST_P(RemoteBlob_Test, smoke_canInputUserBlob) {
     }
 }
 
+TEST_P(RemoteBlob_Test, smoke_canUseRemoteBlobSimultaneously) {
+#if defined(ANDROID)
+    GTEST_SKIP();
+#endif
+    const int batch = 2;
+    const int channels = 3;
+    const int height = 512;
+    const int width = 512;
+    const size_t img_size = batch * channels * height * width;
+    cl_int err;
+
+    const InferenceEngine::TensorDesc tensor_desc{InferenceEngine::Precision::U8,
+                                                  {batch, channels, height, width},
+                                                  InferenceEngine::Layout::NHWC};
+
+    InferenceEngine::Blob::Ptr ref_blob = FuncTestUtils::createAndFillBlob(tensor_desc);
+
+    auto ie = PluginCache::get().ie();
+    auto ocl_instance = std::make_shared<OpenCL>();
+    ocl_instance->_queue = cl::CommandQueue(ocl_instance->_context, ocl_instance->_device);
+
+    // Allocate OpenCL buffer for data
+    cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, img_size, NULL, &err);
+
+    // Create shared context
+    auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
+
+    // Wrap buffer above with IE blob
+    Blob::Ptr shared_blob = make_shared_blob(tensor_desc, remote_context, shared_buffer);
+    // Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl
+    // TODO: Why do we need to call it explicitly? Consider doing it internally
+    shared_blob->allocate();
+
+    // Copy data from ordinary blob to OpenCL buffer
+    {
+        void* buffer = ref_blob->buffer();
+        ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, img_size, buffer);
+    }
+
+    // Lock remote buffer in multiple threads and compare data with ordinary one
+    const int threads_num = 8;
+    std::vector<std::thread> threads;
+    for (int i = 0; i < threads_num; i++) {
+        threads.emplace_back(std::thread{[&] {
+            auto ref_blob_buf = ref_blob->cbuffer();
+            auto ref_blob_ptr = ref_blob_buf.as<const uint8_t*>();
+            auto remote_blob_buf = shared_blob->cbuffer();
+            auto remote_blob_ptr = remote_blob_buf.as<const uint8_t*>();
+            ASSERT_EQ(ref_blob->size(), shared_blob->size());
+            for (size_t j = 0; j < ref_blob->size(); j++) {
+                ASSERT_EQ(ref_blob_ptr[j], remote_blob_ptr[j]);
+            }
+        }});
+    }
+
+    for (auto& t : threads)
+        t.join();
+}
 
 TEST_P(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) {
 #if defined(ANDROID)