[GPU] Fix RemoteBlob lock() and unlock() behaviour in case of multiple threads (#10685)
* [GPU] Fix RemoteBlob lock() and unlock() behaviour in case of multiple threads and add tests
parent 1d469a2b87
commit 6c6aa8fa95
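Summary of the change: previously lock() unconditionally created a new cldnn::mem_lock holder and unlock() unconditionally reset it, so with several threads sharing one remote blob a second lock() could remap the buffer while the first thread was still using it, and any single unlock() destroyed the mapping for everyone. The diff below serializes both paths with lockedMutex and reference-counts the lock via lockedCounter: the mapping is created on the first lock() and released only when the last unlock() drops the counter back to zero.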
@@ -89,6 +89,8 @@ protected:
     cldnn::memory::ptr m_memObject;
 
+    mutable std::mutex lockedMutex;
+    mutable size_t lockedCounter;
     mutable std::unique_ptr<cldnn::mem_lock<uint8_t>> lockedHolder;
     mutable void* _handle;
     mutable std::shared_ptr<InferenceEngine::IAllocator> _allocator;
 
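Note that the new members are mutable for the same reason as the existing lockedHolder and _handle: lock() and unlock() are const member functions (see the hunk below), so the synchronization state they update cannot be a plain non-mutable field.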
@@ -25,7 +25,7 @@ RemoteBlobImpl::RemoteBlobImpl(ClContext::Ptr context,
                                uint32_t plane,
                                BlobType mem_type) :
     m_context(context), m_stream(stream), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane),
-    _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedHolder(nullptr) {
+    _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedCounter(0), lockedHolder(nullptr) {
     auto _impl = getContextImpl(m_context.lock());
     auto eng = _impl->GetEngine();
 
@@ -189,14 +189,22 @@ void RemoteBlobImpl::lock() const {
     if (!is_allocated()) {
        IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated";
     }
-    lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
-    auto ptr = lockedHolder->data();
-    _handle = reinterpret_cast<void*>(ptr);
-    m_allocator.regLockedBlob(_handle, this);
+
+    std::lock_guard<std::mutex> locker(lockedMutex);
+    if (lockedCounter == 0) {
+        lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
+        auto ptr = lockedHolder->data();
+        _handle = reinterpret_cast<void*>(ptr);
+        m_allocator.regLockedBlob(_handle, this);
+    }
+    lockedCounter++;
 }
 
 void RemoteBlobImpl::unlock() const {
-    lockedHolder.reset();
+    std::lock_guard<std::mutex> locker(lockedMutex);
+    lockedCounter--;
+    if (lockedCounter == 0)
+        lockedHolder.reset();
 }
 
 LockedMemory<void> RemoteBlobImpl::buffer() noexcept {
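The fix is a textbook reference-counted guard: the first lock() materializes the expensive mapping, the last unlock() tears it down, and the mutex makes the counter transitions atomic. Below is a minimal self-contained sketch of the same pattern, assuming nothing from the plugin; RefCountedMapping and its plain heap buffer are illustrative stand-ins for RemoteBlobImpl and cldnn::mem_lock.

#include <cassert>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

// Illustrative model of the fixed lock()/unlock() logic: the mapping is
// created on the first lock and torn down on the last unlock, so no thread
// can invalidate a view that another thread still holds.
class RefCountedMapping {
public:
    const char* lock() const {
        std::lock_guard<std::mutex> locker(m_mutex);
        if (m_counter == 0)
            m_mapping.reset(new char[64]());  // stand-in for cldnn::mem_lock
        m_counter++;
        return m_mapping.get();
    }

    void unlock() const {
        std::lock_guard<std::mutex> locker(m_mutex);
        m_counter--;
        if (m_counter == 0)
            m_mapping.reset();
    }

private:
    mutable std::mutex m_mutex;                 // plays the role of lockedMutex
    mutable size_t m_counter = 0;               // plays the role of lockedCounter
    mutable std::unique_ptr<char[]> m_mapping;  // plays the role of lockedHolder
};

int main() {
    RefCountedMapping blob;
    std::vector<std::thread> threads;
    for (int i = 0; i < 8; i++) {
        threads.emplace_back([&] {
            const char* p = blob.lock();
            assert(p[0] == 0);  // mapping stays valid while any thread holds it
            blob.unlock();
        });
    }
    for (auto& t : threads)
        t.join();
    return 0;
}

As in the plugin code, the state members are mutable so that lock()/unlock() can stay const. Note that the pairing contract on callers is unchanged: an unlock() without a matching lock() would now underflow the counter rather than merely resetting the holder.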
@@ -6,6 +6,7 @@
 #include <utility>
 #include <vector>
 #include <memory>
+#include <thread>
 
 #include <ie_compound_blob.h>
 
@@ -98,6 +99,64 @@ TEST_P(RemoteBlob_Test, smoke_canInputUserBlob) {
     }
 }
 
+TEST_P(RemoteBlob_Test, smoke_canUseRemoteBlobSimultaneously) {
+#if defined(ANDROID)
+    GTEST_SKIP();
+#endif
+    const int batch = 2;
+    const int channels = 3;
+    const int height = 512;
+    const int width = 512;
+    const size_t img_size = batch * channels * height * width;
+    cl_int err;
+
+    const InferenceEngine::TensorDesc tensor_desc{InferenceEngine::Precision::U8,
+                                                  {batch, channels, height, width},
+                                                  InferenceEngine::Layout::NHWC};
+
+    InferenceEngine::Blob::Ptr ref_blob = FuncTestUtils::createAndFillBlob(tensor_desc);
+
+    auto ie = PluginCache::get().ie();
+    auto ocl_instance = std::make_shared<OpenCL>();
+    ocl_instance->_queue = cl::CommandQueue(ocl_instance->_context, ocl_instance->_device);
+
+    // Allocate OpenCL buffer for data
+    cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, img_size, NULL, &err);
+
+    // Create shared context
+    auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
+
+    // Wrap buffer above with IE blob
+    Blob::Ptr shared_blob = make_shared_blob(tensor_desc, remote_context, shared_buffer);
+    // Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl
+    // TODO: Why do we need to call it explicitly? Consider doing it internally
+    shared_blob->allocate();
+
+    // Copy data from ordinary blob to OpenCL buffer
+    {
+        void* buffer = ref_blob->buffer();
+        ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, img_size, buffer);
+    }
+
+    // Lock remote buffer in multiple threads and compare data with ordinary one
+    const int threads_num = 8;
+    std::vector<std::thread> threads;
+    for (int i = 0; i < threads_num; i++) {
+        threads.emplace_back(std::thread{[&] {
+            auto ref_blob_buf = ref_blob->cbuffer();
+            auto ref_blob_ptr = ref_blob_buf.as<const char*>();
+            auto remote_blob_buf = shared_blob->cbuffer();
+            auto remote_blob_ptr = remote_blob_buf.as<const char*>();
+            ASSERT_EQ(ref_blob->size(), shared_blob->size());
+            for (size_t j = 0; j < ref_blob->size(); j++) {
+                ASSERT_EQ(ref_blob_ptr[j], remote_blob_ptr[j]);
+            }
+        }});
+    }
+
+    for (auto& t : threads)
+        t.join();
+}
+
 TEST_P(RemoteBlob_Test, smoke_canInputPluginRemoteBlob) {
 #if defined(ANDROID)