creating remote ocl buffer/tensor per request, to avoid simulteneous locking of the same ocl buffer when auto-batching is used (#10607)
This commit is contained in:
@@ -789,8 +789,11 @@ int main(int argc, char* argv[]) {
|
||||
std::map<std::string, ov::TensorVector> inputsData;
|
||||
if (isFlagSetInCommandLine("use_device_mem")) {
|
||||
if (device_name.find("GPU") == 0) {
|
||||
inputsData =
|
||||
::gpu::get_remote_input_tensors(inputFiles, app_inputs_info, compiledModel, clInputsBuffer);
|
||||
inputsData = ::gpu::get_remote_input_tensors(inputFiles,
|
||||
app_inputs_info,
|
||||
compiledModel,
|
||||
clInputsBuffer,
|
||||
inferRequestsQueue.requests.size());
|
||||
useGpuMem = true;
|
||||
} else if (device_name.find("CPU") == 0) {
|
||||
if (newInputType) {
|
||||
|
||||
@@ -69,7 +69,8 @@ std::map<std::string, ov::TensorVector> get_remote_input_tensors(
|
||||
const std::map<std::string, std::vector<std::string>>& inputFiles,
|
||||
const std::vector<benchmark_app::InputsInfo>& app_inputs_info,
|
||||
const ov::CompiledModel& compiledModel,
|
||||
std::vector<BufferType>& clBuffer) {
|
||||
std::vector<BufferType>& clBuffer,
|
||||
size_t num_requests) {
|
||||
#ifdef HAVE_DEVICE_MEM_SUPPORT
|
||||
slog::info << "Device memory will be used for input and output blobs" << slog::endl;
|
||||
if (inputFiles.size()) {
|
||||
@@ -82,43 +83,45 @@ std::map<std::string, ov::TensorVector> get_remote_input_tensors(
|
||||
auto& oclContext = static_cast<ov::intel_gpu::ocl::ClContext&>(context);
|
||||
auto oclInstance = std::make_shared<gpu::OpenCL>(oclContext.get());
|
||||
|
||||
for (auto& inputs_info : app_inputs_info) {
|
||||
for (auto& input : inputs_info) {
|
||||
// Fill random
|
||||
slog::info << "Prepare remote blob for input '" << input.first << "' with random values ("
|
||||
<< std::string((input.second.is_image() ? "image" : "some binary data")) << " is expected)"
|
||||
<< slog::endl;
|
||||
for (int i = 0; i < num_requests; i++) {
|
||||
for (auto& inputs_info : app_inputs_info) {
|
||||
for (auto& input : inputs_info) {
|
||||
// Fill random
|
||||
slog::info << "Prepare remote blob for input '" << input.first << "' with random values ("
|
||||
<< std::string((input.second.is_image() ? "image" : "some binary data")) << " is expected)"
|
||||
<< slog::endl;
|
||||
|
||||
// Creating and filling shared buffers
|
||||
cl_int err;
|
||||
auto elementsNum = std::accumulate(begin(input.second.dataShape),
|
||||
end(input.second.dataShape),
|
||||
1,
|
||||
std::multiplies<size_t>());
|
||||
auto inputSize = elementsNum * input.second.type.bitwidth() / 8;
|
||||
// Creating and filling shared buffers
|
||||
cl_int err;
|
||||
auto elementsNum = std::accumulate(begin(input.second.dataShape),
|
||||
end(input.second.dataShape),
|
||||
1,
|
||||
std::multiplies<size_t>());
|
||||
auto inputSize = elementsNum * input.second.type.bitwidth() / 8;
|
||||
|
||||
clBuffer.push_back(
|
||||
cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err));
|
||||
clBuffer.push_back(
|
||||
cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err));
|
||||
|
||||
void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(clBuffer.back(),
|
||||
CL_TRUE,
|
||||
CL_MEM_READ_WRITE,
|
||||
0,
|
||||
(cl::size_type)inputSize);
|
||||
void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(clBuffer.back(),
|
||||
CL_TRUE,
|
||||
CL_MEM_READ_WRITE,
|
||||
0,
|
||||
(cl::size_type)inputSize);
|
||||
|
||||
auto tensor = oclContext.create_tensor(input.second.type, input.second.dataShape, clBuffer.back().get());
|
||||
remoteTensors[input.first].push_back(tensor);
|
||||
auto tensor =
|
||||
oclContext.create_tensor(input.second.type, input.second.dataShape, clBuffer.back().get());
|
||||
remoteTensors[input.first].push_back(tensor);
|
||||
|
||||
if (inputFiles.empty()) {
|
||||
// Filling in random data
|
||||
fill_buffer(mappedPtr, elementsNum, input.second.type);
|
||||
} else {
|
||||
// TODO: add filling with real image data
|
||||
if (inputFiles.empty()) {
|
||||
// Filling in random data
|
||||
fill_buffer(mappedPtr, elementsNum, input.second.type);
|
||||
} else {
|
||||
// TODO: add filling with real image data
|
||||
}
|
||||
oclInstance->_queue.enqueueUnmapMemObject(clBuffer.back(), mappedPtr);
|
||||
}
|
||||
oclInstance->_queue.enqueueUnmapMemObject(clBuffer.back(), mappedPtr);
|
||||
}
|
||||
}
|
||||
|
||||
return remoteTensors;
|
||||
#else
|
||||
IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked";
|
||||
|
||||
@@ -61,7 +61,8 @@ std::map<std::string, ov::TensorVector> get_remote_input_tensors(
|
||||
const std::map<std::string, std::vector<std::string>>& inputFiles,
|
||||
const std::vector<benchmark_app::InputsInfo>& app_inputs_info,
|
||||
const ov::CompiledModel& compiledModel,
|
||||
std::vector<BufferType>& clBuffer);
|
||||
std::vector<BufferType>& clBuffer,
|
||||
size_t num_requests);
|
||||
|
||||
std::map<std::string, ov::Tensor> get_remote_output_tensors(const ov::CompiledModel& compiledModel,
|
||||
std::map<std::string, ::gpu::BufferType>& clBuffer);
|
||||
|
||||
Reference in New Issue
Block a user