[GPU] Fix RemoteBlob lock() and unlock() behaviour in case of multiple threads (#10685)

* [GPU] Fix RemoteBlob lock() and unlock() behaviour in case of multiple threads and add tests
This commit is contained in:
Sergey Shlyapnikov 2022-03-01 15:15:04 +03:00 committed by GitHub
parent 1d469a2b87
commit 6c6aa8fa95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 75 additions and 6 deletions

View File

@ -89,6 +89,8 @@ protected:
cldnn::memory::ptr m_memObject;
mutable std::mutex lockedMutex;
mutable size_t lockedCounter;
mutable std::unique_ptr<cldnn::mem_lock<uint8_t>> lockedHolder;
mutable void* _handle;
mutable std::shared_ptr<InferenceEngine::IAllocator> _allocator;

View File

@ -25,7 +25,7 @@ RemoteBlobImpl::RemoteBlobImpl(ClContext::Ptr context,
uint32_t plane,
BlobType mem_type) :
m_context(context), m_stream(stream), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane),
_handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedHolder(nullptr) {
_handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedCounter(0), lockedHolder(nullptr) {
auto _impl = getContextImpl(m_context.lock());
auto eng = _impl->GetEngine();
@ -189,14 +189,22 @@ void RemoteBlobImpl::lock() const {
if (!is_allocated()) {
IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated";
}
lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
auto ptr = lockedHolder->data();
_handle = reinterpret_cast<void*>(ptr);
m_allocator.regLockedBlob(_handle, this);
std::lock_guard<std::mutex> locker(lockedMutex);
if (lockedCounter == 0) {
lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
auto ptr = lockedHolder->data();
_handle = reinterpret_cast<void*>(ptr);
m_allocator.regLockedBlob(_handle, this);
}
lockedCounter++;
}
// Decrements the lock counter under lockedMutex and releases the mapped
// memory (lockedHolder) only when the last concurrent locker unlocks.
// NOTE(review): the previous code reset lockedHolder unconditionally
// before acquiring the mutex, which unmapped memory still in use by
// other threads — the reset must happen only when lockedCounter hits 0.
void RemoteBlobImpl::unlock() const {
    std::lock_guard<std::mutex> locker(lockedMutex);
    lockedCounter--;
    if (lockedCounter == 0)
        lockedHolder.reset();
}
LockedMemory<void> RemoteBlobImpl::buffer() noexcept {

View File

@ -6,6 +6,7 @@
#include <utility>
#include <vector>
#include <memory>
#include <thread>
#include <ie_compound_blob.h>
@ -98,6 +99,64 @@ TEST_P(RemoteBlob_Test, smoke_canInputUserBlob) {
}
}
// Regression test for concurrent lock()/unlock() on a remote (GPU) blob:
// several threads simultaneously map the same shared OpenCL buffer via
// cbuffer() and each compares its full contents against a reference host
// blob. Before the lock-counter fix, concurrent lock/unlock could unmap
// memory still in use by another thread.
TEST_P(RemoteBlob_Test, smoke_canUseRemoteBlobSimultaneously) {
#if defined(ANDROID)
GTEST_SKIP();
#endif
const int batch = 2;
const int channels = 3;
const int height = 512;
const int width = 512;
// Total byte count for a U8 NHWC tensor of the shape above.
const size_t img_size = batch * channels * height * width;
cl_int err;
const InferenceEngine::TensorDesc tensor_desc{InferenceEngine::Precision::U8,
{batch, channels, height, width},
InferenceEngine::Layout::NHWC};
// Host-side reference blob with randomized content to compare against.
InferenceEngine::Blob::Ptr ref_blob = FuncTestUtils::createAndFillBlob(tensor_desc);
auto ie = PluginCache::get().ie();
auto ocl_instance = std::make_shared<OpenCL>();
ocl_instance->_queue = cl::CommandQueue(ocl_instance->_context, ocl_instance->_device);
// Allocate OpenCL buffer for data
cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, img_size, NULL, &err);
// Create shared context
auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
// Wrap buffer above with IE blob
Blob::Ptr shared_blob = make_shared_blob(tensor_desc, remote_context, shared_buffer);
// Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl
// TODO: Why do we need to call it explicitly? Consider doing it internally
shared_blob->allocate();
// Copy data from ordinary blob to OpenCL buffer
{
void* buffer = ref_blob->buffer();
// Blocking write (true) so the buffer is fully populated before the threads start.
ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, img_size, buffer);
}
// Lock remote buffer in multiple threads and compare data with ordinary one
const int threads_num = 8;
std::vector<std::thread> threads;
for (int i = 0; i < threads_num; i++) {
// Capture by reference is safe: all threads are joined before this scope exits.
threads.emplace_back(std::thread{[&] {
// cbuffer() locks the remote blob (maps it to host memory) for the
// lifetime of the LockedMemory object — each thread holds its own lock.
auto ref_blob_buf = ref_blob->cbuffer();
auto ref_blob_ptr = ref_blob_buf.as<const char*>();
auto remote_blob_buf = shared_blob->cbuffer();
auto remote_blob_ptr = remote_blob_buf.as<const char*>();
ASSERT_EQ(ref_blob->size(), shared_blob->size());
// Byte-wise comparison: any premature unmap by a sibling thread would
// surface here as a mismatch or a crash.
for (size_t j = 0; j < ref_blob->size(); j++) {
ASSERT_EQ(ref_blob_ptr[j], remote_blob_ptr[j]);
}
}});
}
for (auto& t : threads)
t.join();
}
TEST_P(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) {
#if defined(ANDROID)