From bed0adf5ef2d84d57c8c06e2740a578829b3a044 Mon Sep 17 00:00:00 2001 From: Maxim Shevtsov Date: Mon, 28 Feb 2022 15:04:03 +0300 Subject: [PATCH] creating remote ocl buffer/tensor per request, to avoid simultaneous locking of the same ocl buffer when auto-batching is used (#10607) --- samples/cpp/benchmark_app/main.cpp | 7 ++- .../benchmark_app/remote_tensors_filling.cpp | 63 ++++++++++--------- .../benchmark_app/remote_tensors_filling.hpp | 3 +- 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/samples/cpp/benchmark_app/main.cpp b/samples/cpp/benchmark_app/main.cpp index b4613fac299..3518ffc5eee 100644 --- a/samples/cpp/benchmark_app/main.cpp +++ b/samples/cpp/benchmark_app/main.cpp @@ -789,8 +789,11 @@ int main(int argc, char* argv[]) { std::map inputsData; if (isFlagSetInCommandLine("use_device_mem")) { if (device_name.find("GPU") == 0) { - inputsData = - ::gpu::get_remote_input_tensors(inputFiles, app_inputs_info, compiledModel, clInputsBuffer); + inputsData = ::gpu::get_remote_input_tensors(inputFiles, + app_inputs_info, + compiledModel, + clInputsBuffer, + inferRequestsQueue.requests.size()); useGpuMem = true; } else if (device_name.find("CPU") == 0) { if (newInputType) { diff --git a/samples/cpp/benchmark_app/remote_tensors_filling.cpp b/samples/cpp/benchmark_app/remote_tensors_filling.cpp index 0f2065c2979..40bc581d153 100644 --- a/samples/cpp/benchmark_app/remote_tensors_filling.cpp +++ b/samples/cpp/benchmark_app/remote_tensors_filling.cpp @@ -69,7 +69,8 @@ std::map get_remote_input_tensors( const std::map>& inputFiles, const std::vector& app_inputs_info, const ov::CompiledModel& compiledModel, - std::vector& clBuffer) { + std::vector& clBuffer, + size_t num_requests) { #ifdef HAVE_DEVICE_MEM_SUPPORT slog::info << "Device memory will be used for input and output blobs" << slog::endl; if (inputFiles.size()) { @@ -82,43 +83,45 @@ std::map get_remote_input_tensors( auto& oclContext = static_cast(context); auto oclInstance = 
std::make_shared(oclContext.get()); - for (auto& inputs_info : app_inputs_info) { - for (auto& input : inputs_info) { - // Fill random - slog::info << "Prepare remote blob for input '" << input.first << "' with random values (" - << std::string((input.second.is_image() ? "image" : "some binary data")) << " is expected)" - << slog::endl; + for (int i = 0; i < num_requests; i++) { + for (auto& inputs_info : app_inputs_info) { + for (auto& input : inputs_info) { + // Fill random + slog::info << "Prepare remote blob for input '" << input.first << "' with random values (" + << std::string((input.second.is_image() ? "image" : "some binary data")) << " is expected)" + << slog::endl; - // Creating and filling shared buffers - cl_int err; - auto elementsNum = std::accumulate(begin(input.second.dataShape), - end(input.second.dataShape), - 1, - std::multiplies()); - auto inputSize = elementsNum * input.second.type.bitwidth() / 8; + // Creating and filling shared buffers + cl_int err; + auto elementsNum = std::accumulate(begin(input.second.dataShape), + end(input.second.dataShape), + 1, + std::multiplies()); + auto inputSize = elementsNum * input.second.type.bitwidth() / 8; - clBuffer.push_back( - cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err)); + clBuffer.push_back( + cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err)); - void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(clBuffer.back(), - CL_TRUE, - CL_MEM_READ_WRITE, - 0, - (cl::size_type)inputSize); + void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(clBuffer.back(), + CL_TRUE, + CL_MEM_READ_WRITE, + 0, + (cl::size_type)inputSize); - auto tensor = oclContext.create_tensor(input.second.type, input.second.dataShape, clBuffer.back().get()); - remoteTensors[input.first].push_back(tensor); + auto tensor = + oclContext.create_tensor(input.second.type, input.second.dataShape, clBuffer.back().get()); + 
remoteTensors[input.first].push_back(tensor); - if (inputFiles.empty()) { - // Filling in random data - fill_buffer(mappedPtr, elementsNum, input.second.type); - } else { - // TODO: add filling with real image data + if (inputFiles.empty()) { + // Filling in random data + fill_buffer(mappedPtr, elementsNum, input.second.type); + } else { + // TODO: add filling with real image data + } + oclInstance->_queue.enqueueUnmapMemObject(clBuffer.back(), mappedPtr); } - oclInstance->_queue.enqueueUnmapMemObject(clBuffer.back(), mappedPtr); } } - return remoteTensors; #else IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked"; diff --git a/samples/cpp/benchmark_app/remote_tensors_filling.hpp b/samples/cpp/benchmark_app/remote_tensors_filling.hpp index 7cb919f565f..4e8555b844e 100644 --- a/samples/cpp/benchmark_app/remote_tensors_filling.hpp +++ b/samples/cpp/benchmark_app/remote_tensors_filling.hpp @@ -61,7 +61,8 @@ std::map get_remote_input_tensors( const std::map>& inputFiles, const std::vector& app_inputs_info, const ov::CompiledModel& compiledModel, - std::vector& clBuffer); + std::vector& clBuffer, + size_t num_requests); std::map get_remote_output_tensors(const ov::CompiledModel& compiledModel, std::map& clBuffer);