openvino/samples/cpp/benchmark_app/remote_blobs_filling.cpp
Fedor Zharinov e9874ec1d4 Dynamic reshapes (#7788)
* Merged and compiling

* Fix for dynamic shape type

* review fixes

* renamed blob shape to tensor shape, small improvements

* fix code style

* added parsing of multiple shapes

* store latency per group, add isIdleRequestAvailable() to Infer Queue

* added cached random inputs

* redesigned pipeline, added new metrics (avg, max, min), added metrics per group

* fixed code style

* small improvements

* modified tensor parameters parsing

* modified -i parameter parsing: added possibility to specify input names

* implemented image caching

* added cached blob creation

* added -pcseq flag, modified batch filling, changed fps formula

* improvements

* code formatting

* code formatting2

* apply suggestions from review

* replaced Buffer class with InferenceEngine Blobs

* use batch size in blobs filling

* added shared blob allocator to handle blob's data

* fixed warnings & code style

* allocate blobs

* fix for networks with image info input

* added comments & fixed codestyle

* clear data in free() in SharedBlobAllocator

* remove unnecessary check

* Delimiter is changed to ::

* stylefix

* added layout from string function, small improvements

* modified parsing to enable : in input parameters

* small fixes

* small fixes

* added missed blob allocation, fixes

* [TEST] added support for remote blobs

* fix remote blobs

* new inputs/files output format

* removed vectors resize which caused bugs

* made cl::Buffer type under ifdef, fix inputs filling

* changed batch() function to not throw exceptions

* removed unused var

* fix code style

* replace empty name in input files with name from net input

* restored old behaviour for static models

* fix code style

* fix warning - made const iterator

* fix warning - remove reference in loop variable

* added random and image_info input types to -i, fix problem with layout

* replaced batch() with getBatchSize() in main

* fix layout, shape, tensor shape parameters parsing

* upd help messages for input, tensor shape and pcseq command

* added buffer for cl output blobs, small fixes

Signed-off-by: ivikhrev <ivan.vikhrev@intel.com>

* added legacy mode

* restore setBlob

* code style formatting

* move collecting latency for groups under flag

* removed not applicable layouts

* added hint to error message when wrong input name in -tensor_shape was specified

* added new metrics to statistics report

* Apply suggestions from code review

* fix binary blobs filling when layout is CN

* apply suggestions

* moved file in the right place after rebase

* improved -pcseq output

* updated args and readme

* removed TEMPLATE plugin registration

* fix -shape arg description

* enable providing several -i args as input

* renamed legacy_mode to inference_only and made it default for static models, renamed tensor_shape to data_shape

* upd readme

* use getBlob() in inference only mode

* fix old input type for static case

* fix typo

* upd readme

* move log about benchmark mode to the measuring performance step

* added class for latency metrics

* upd readme, fix typos, renamed funcs

* fix warning and upd parsing to avoid error with : in file paths

* fix error on CentOS: error: use of deleted function ‘std::basic_stringstream<char>::basic_stringstream(const std::basic_stringstream<char>&)’

* added check for key in inputs

* renamed input to inputs

* adjust batch size for binary blobs

* replaced warning with exception when defining benchmark mode

* align measurement cycle with master

Co-authored-by: ivikhrev <ivan.vikhrev@intel.com>
2021-12-17 12:20:43 +03:00

// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <cstdint>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <numeric>
#include <random>
#include <string>
#include <utility>
#include <vector>
// clang-format off
#include <samples/slog.hpp>
#include "remote_blobs_filling.hpp"
// clang-format on
namespace gpu {
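// Picks the matching random distribution for T: uniform_real_distribution for
// floating-point types, uniform_int_distribution for integral types.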
template <typename T>
using uniformDistribution = typename std::conditional<
    std::is_floating_point<T>::value,
    std::uniform_real_distribution<T>,
    typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;
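// Fills inputBuffer with elementsNum random values. Values are generated as T2 and
// cast to the element type T; T2 is a wider type when T itself is not a valid
// distribution parameter (e.g. 8-bit integers).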
template <typename T, typename T2>
void fillBufferRandom(void* inputBuffer,
                      size_t elementsNum,
                      T rand_min = std::numeric_limits<uint8_t>::min(),
                      T rand_max = std::numeric_limits<uint8_t>::max()) {
    std::mt19937 gen(0);
    uniformDistribution<T2> distribution(rand_min, rand_max);
    auto inputBufferData = static_cast<T*>(inputBuffer);
    for (size_t i = 0; i < elementsNum; i++) {
        inputBufferData[i] = static_cast<T>(distribution(gen));
    }
}
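// Dispatches to fillBufferRandom<> according to the requested precision.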
void fillBuffer(void* inputBuffer, size_t elementsNum, InferenceEngine::Precision precision) {
    if (precision == InferenceEngine::Precision::FP32) {
        fillBufferRandom<float, float>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::FP16) {
        fillBufferRandom<short, short>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::I32) {
        fillBufferRandom<int32_t, int32_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::I64) {
        fillBufferRandom<int64_t, int64_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::U8) {
        // uniform_int_distribution<uint8_t> is not allowed by the C++17 standard
        // (or by VS2017/19), so generate as uint32_t and narrow on store
        fillBufferRandom<uint8_t, uint32_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::I8) {
        // uniform_int_distribution<int8_t> is not allowed by the C++17 standard
        // (or by VS2017/19), so generate as int32_t and narrow on store
        fillBufferRandom<int8_t, int32_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::U16) {
        fillBufferRandom<uint16_t, uint16_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::I16) {
        fillBufferRandom<int16_t, int16_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::BOOL) {
        fillBufferRandom<uint8_t, uint32_t>(inputBuffer, elementsNum, 0, 1);
    } else {
        IE_THROW() << "Requested precision is not supported";
    }
}
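// Returns the size in bytes of a single element of the given precision.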
size_t getBytesPerElement(InferenceEngine::Precision precision) {
    switch (precision) {
    case InferenceEngine::Precision::FP32:
        return 4;
    case InferenceEngine::Precision::FP16:
        return 2;
    case InferenceEngine::Precision::I32:
        return 4;
    case InferenceEngine::Precision::I64:
        return 8;
    case InferenceEngine::Precision::U8:
        return 1;
    case InferenceEngine::Precision::I8:
        return 1;
    case InferenceEngine::Precision::U16:
        return 2;
    case InferenceEngine::Precision::I16:
        return 2;
    case InferenceEngine::Precision::BOOL:
        return 1;
    default:
        IE_THROW() << "Requested precision is not supported";
    }
}
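// Allocates an OpenCL buffer per input for every entry of app_inputs_info, fills it
// with random data on the device, and wraps it into a remote blob sharing that buffer.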
std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> getRemoteInputBlobs(
    const std::map<std::string, std::vector<std::string>>& inputFiles,
    const std::vector<benchmark_app::InputsInfo>& app_inputs_info,
    const InferenceEngine::ExecutableNetwork& exeNetwork,
    std::vector<BufferType>& clBuffer) {
#ifdef HAVE_DEVICE_MEM_SUPPORT
    slog::info << "Device memory will be used for input and output blobs" << slog::endl;
    if (!inputFiles.empty()) {
        slog::warn << "Device memory supports only random data at this moment, input images will be ignored"
                   << slog::endl;
    }

    std::map<std::string, std::vector<InferenceEngine::Blob::Ptr>> remoteBlobs;
    auto context = exeNetwork.GetContext();
    auto oclContext = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(context)->get();
    auto oclInstance = std::make_shared<OpenCL>(oclContext);

    auto setShared = [&](const std::string& name, const InferenceEngine::TensorDesc& desc, bool fillRandom = false) {
        cl_int err;
        auto inputDims = desc.getDims();
        auto elementsNum = std::accumulate(begin(inputDims), end(inputDims), (size_t)1, std::multiplies<size_t>());
        auto inputSize = elementsNum * getBytesPerElement(desc.getPrecision());

        clBuffer.push_back(cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err));

        if (fillRandom) {
            void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(clBuffer.back(),
                                                                   CL_TRUE,
                                                                   CL_MEM_READ_WRITE,
                                                                   0,
                                                                   (cl::size_type)inputSize);
            fillBuffer(mappedPtr, elementsNum, desc.getPrecision());
            oclInstance->_queue.enqueueUnmapMemObject(clBuffer.back(), mappedPtr);
        }

        auto blob = InferenceEngine::gpu::make_shared_blob(desc, context, clBuffer.back());
        remoteBlobs[name].push_back(blob);
    };

    for (auto& inputs_info : app_inputs_info) {
        for (auto& input : inputs_info) {
            // Fill random
            slog::info << "Prepare remote blob for input '" << input.first << "' with random values ("
                       << std::string((input.second.isImage() ? "image" : "some binary data")) << " is expected)"
                       << slog::endl;
            setShared(input.first,
                      InferenceEngine::TensorDesc(input.second.precision,
                                                  input.second.dataShape,
                                                  getLayoutFromString(input.second.layout)),
                      true);
        }
    }

    return remoteBlobs;
#else
    IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked";
#endif
}
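// Creates remote output blobs backed by OpenCL buffers. A buffer already present in
// clBuffer is reused while its size still matches the output tensor and reallocated
// otherwise (outputs may resize between iterations with dynamic shapes).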
std::map<std::string, InferenceEngine::Blob::Ptr> getRemoteOutputBlobs(
    const InferenceEngine::ExecutableNetwork& exeNetwork,
    std::map<std::string, ::gpu::BufferType>& clBuffer) {
#ifdef HAVE_DEVICE_MEM_SUPPORT
    std::map<std::string, InferenceEngine::Blob::Ptr> outputBlobs;
    for (auto& output : exeNetwork.GetOutputsInfo()) {
        cl_int err;
        auto context = exeNetwork.GetContext();
        auto oclContext = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(context)->get();
        auto oclInstance = std::make_shared<OpenCL>(oclContext);

        auto desc = output.second->getTensorDesc();
        auto outputDims = desc.getDims();
        auto elementsNum = std::accumulate(begin(outputDims), end(outputDims), (size_t)1, std::multiplies<size_t>());
        auto outputSize = elementsNum * getBytesPerElement(desc.getPrecision());

        cl::size_type bufferSize = 0;
        if (clBuffer.find(output.first) == clBuffer.end()) {
            clBuffer[output.first] =
                cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)outputSize, NULL, &err);
        } else {
            auto& buff = clBuffer[output.first];
            buff.getInfo(CL_MEM_SIZE, &bufferSize);
            if (outputSize != bufferSize) {
                buff = cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)outputSize, NULL, &err);
            }
        }

        outputBlobs[output.first] = InferenceEngine::gpu::make_shared_blob(desc, context, clBuffer[output.first]);
    }
    return outputBlobs;
#else
    IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked";
#endif
}
} // namespace gpu
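
// A minimal usage sketch (the exact call sites live in benchmark_app's main.cpp;
// the request/blob wiring below is an assumption shown for illustration only):
//
//   std::vector<gpu::BufferType> clInputBuffers;
//   auto inputBlobs = gpu::getRemoteInputBlobs(inputFiles, app_inputs_info, exeNetwork, clInputBuffers);
//
//   std::map<std::string, gpu::BufferType> clOutputBuffers;
//   auto outputBlobs = gpu::getRemoteOutputBlobs(exeNetwork, clOutputBuffers);
//
//   auto request = exeNetwork.CreateInferRequest();
//   for (auto& item : inputBlobs)
//       request.SetBlob(item.first, item.second.front());
//   for (auto& item : outputBlobs)
//       request.SetBlob(item.first, item.second);
//   request.Infer();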