Add use_device_mem option to benchmark_app (#7433)

Sergey Shlyapnikov 2021-09-17 11:04:50 +03:00 committed by GitHub
parent 8690e14a5b
commit 1f85d4230d
7 changed files with 313 additions and 3 deletions


@@ -2,11 +2,46 @@
# SPDX-License-Identifier: Apache-2.0
#
set(TARGET_NAME "benchmark_app")
file (GLOB SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
file (GLOB HDR ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
ie_add_sample(NAME benchmark_app
ie_add_sample(NAME ${TARGET_NAME}
SOURCES ${SRC}
HEADERS ${HDR}
DEPENDENCIES format_reader ie_samples_utils
OPENCV_DEPENDENCIES core)
find_package(OpenCL)
find_path(OpenCL_HPP_INCLUDE_DIR
NAMES
CL/cl2.hpp OpenCL/cl2.hpp
HINTS
${opencl_root_hints}
ENV "PROGRAMFILES(X86)"
ENV AMDAPPSDKROOT
ENV INTELOCLSDKROOT
ENV NVSDKCOMPUTE_ROOT
ENV CUDA_PATH
ENV ATISTREAMSDKROOT
ENV OCL_ROOT
PATH_SUFFIXES
include
OpenCL/common/inc
"AMD APP/include")
if(OPENCL_HEADERS_DIR)
# Use OpenCL CPP headers from sources if present
set(OpenCL_HEADERS ${OPENCL_HEADERS_DIR})
elseif(OpenCL_HPP_INCLUDE_DIR)
# Append OpenCL CPP headers to C headers and use both
set(OpenCL_HEADERS ${OpenCL_INCLUDE_DIR} ${OpenCL_HPP_INCLUDE_DIR})
endif()
if(OpenCL_FOUND AND OpenCL_HEADERS)
target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL)
target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_HEADERS})
target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_GPU_DEVICE_MEM_SUPPORT)
endif()


@@ -4,6 +4,10 @@
#pragma once
#if defined(HAVE_GPU_DEVICE_MEM_SUPPORT)
# define HAVE_DEVICE_MEM_SUPPORT
#endif
#include <gflags/gflags.h>
#include <iostream>
@@ -132,6 +136,12 @@ static const char progress_message[] =
// @brief message for performance counters option
static const char pc_message[] = "Optional. Report performance counters.";
#ifdef HAVE_DEVICE_MEM_SUPPORT
// @brief message for switching memory allocation type option
static const char use_device_mem_message[] =
"Optional. Switch between host and device memory allocation for input and output buffers.";
#endif
#ifdef USE_OPENCV
// @brief message for load config option
static const char load_config_message[] =
@@ -266,6 +276,11 @@ DEFINE_bool(progress, false, progress_message);
/// @brief Define flag for showing performance counters <br>
DEFINE_bool(pc, false, pc_message);
#ifdef HAVE_DEVICE_MEM_SUPPORT
/// @brief Define flag for switching between host and device memory allocation for input and output buffers
DEFINE_bool(use_device_mem, false, use_device_mem_message);
#endif
#ifdef USE_OPENCV
/// @brief Define flag for loading configuration file <br>
DEFINE_string(load_config, "", load_config_message);
@@ -339,6 +354,9 @@ static void showUsage() {
std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
std::cout << " -enforcebf16=<true/false> " << enforce_bf16_message << std::endl;
std::cout << " -pin \"YES\"/\"HYBRID_AWARE\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl;
#ifdef HAVE_DEVICE_MEM_SUPPORT
std::cout << " -use_device_mem " << use_device_mem_message << std::endl;
#endif
std::cout << std::endl << " Statistics dumping options:" << std::endl;
std::cout << " -report_type \"<type>\" " << report_type_message << std::endl;
std::cout << " -report_folder " << report_folder_message << std::endl;


@@ -65,6 +65,10 @@ public:
return _request.GetBlob(name);
}
void setBlob(const std::string& name, const InferenceEngine::Blob::Ptr& data) {
_request.SetBlob(name, data);
}
double getExecutionTimeInMilliseconds() const {
auto execTime = std::chrono::duration_cast<ns>(_endTime - _startTime);
return static_cast<double>(execTime.count()) * 0.000001;


@@ -21,6 +21,7 @@
#include "infer_request_wrap.hpp"
#include "inputs_filling.hpp"
#include "progress_bar.hpp"
#include "remote_blobs_filling.hpp"
#include "statistics_report.hpp"
#include "utils.hpp"
@@ -592,7 +593,16 @@ int main(int argc, char* argv[]) {
next_step();
InferRequestsQueue inferRequestsQueue(exeNetwork, nireq);
fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
if (isFlagSetInCommandLine("use_device_mem")) {
if (device_name.find("GPU") == 0)
::gpu::fillRemoteBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests, exeNetwork);
else if (device_name.find("CPU") == 0)
fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
else
IE_THROW() << "Requested device doesn't support `use_device_mem` option.";
} else {
fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
}
// ----------------- 10. Measuring performance
// ------------------------------------------------------------------


@@ -0,0 +1,140 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "remote_blobs_filling.hpp"
#include <memory>
#include <string>
#include <utility>
#include <vector>
namespace gpu {
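// Picks std::uniform_real_distribution for floating-point element types,
// std::uniform_int_distribution for integral ones (and void otherwise).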
template <typename T>
using uniformDistribution = typename std::conditional<
std::is_floating_point<T>::value,
std::uniform_real_distribution<T>,
typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;
template <typename T, typename T2>
void fillBufferRandom(void* inputBuffer,
size_t elementsNum,
T rand_min = std::numeric_limits<uint8_t>::min(),
T rand_max = std::numeric_limits<uint8_t>::max()) {
std::mt19937 gen(0);
uniformDistribution<T2> distribution(rand_min, rand_max);
auto inputBufferData = static_cast<T*>(inputBuffer);
for (size_t i = 0; i < elementsNum; i++) {
inputBufferData[i] = static_cast<T>(distribution(gen));
}
}
void fillBuffer(void* inputBuffer, size_t elementsNum, InferenceEngine::Precision precision) {
if (precision == InferenceEngine::Precision::FP32) {
fillBufferRandom<float, float>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::FP16) {
fillBufferRandom<short, short>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::I32) {
fillBufferRandom<int32_t, int32_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::I64) {
fillBufferRandom<int64_t, int64_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::U8) {
// uniform_int_distribution<uint8_t> is not allowed in the C++17
// standard and vs2017/19
fillBufferRandom<uint8_t, uint32_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::I8) {
// uniform_int_distribution<int8_t> is not allowed in the C++17 standard
// and vs2017/19
fillBufferRandom<int8_t, int32_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::U16) {
fillBufferRandom<uint16_t, uint16_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::I16) {
fillBufferRandom<int16_t, int16_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::BOOL) {
fillBufferRandom<uint8_t, uint32_t>(inputBuffer, elementsNum, 0, 1);
} else {
IE_THROW() << "Requested precision is not supported";
}
}
size_t getBytesPerElement(InferenceEngine::Precision precision) {
switch (precision) {
case InferenceEngine::Precision::FP32:
return 4;
case InferenceEngine::Precision::FP16:
return 2;
case InferenceEngine::Precision::I32:
return 4;
case InferenceEngine::Precision::I64:
return 8;
case InferenceEngine::Precision::U8:
return 1;
case InferenceEngine::Precision::I8:
return 1;
case InferenceEngine::Precision::U16:
return 2;
case InferenceEngine::Precision::I16:
return 2;
case InferenceEngine::Precision::BOOL:
return 1;
default:
IE_THROW() << "Requested precision is not supported";
}
}
void fillRemoteBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests,
const InferenceEngine::ExecutableNetwork& exeNetwork) {
#ifdef HAVE_DEVICE_MEM_SUPPORT
slog::info << "Device memory will be used for input and output blobs" << slog::endl;
if (inputFiles.size()) {
slog::warn << "Device memory supports only random data at this moment, input images will be ignored"
<< slog::endl;
}
auto context = exeNetwork.GetContext();
auto oclContext = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(context)->get();
auto oclInstance = std::make_shared<OpenCL>(oclContext);
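// For every blob: allocate a device-side cl::Buffer of the matching size, optionally fill it with
// random data through a temporarily mapped host pointer, wrap it into a remote blob and set it on the request.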
auto setShared = [&](size_t requestId,
const std::string name,
const InferenceEngine::TensorDesc& desc,
bool fillRandom = false) {
cl_int err;
auto inputDims = desc.getDims();
auto elementsNum = std::accumulate(begin(inputDims), end(inputDims), 1, std::multiplies<size_t>());
auto inputSize = elementsNum * getBytesPerElement(desc.getPrecision());
cl::Buffer sharedBuffer =
cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err);
if (fillRandom) {
void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(sharedBuffer,
CL_TRUE,
CL_MAP_WRITE,  // map flags (the buffer is filled below), not the CL_MEM_* allocation flags
0,
(cl::size_type)inputSize);
fillBuffer(mappedPtr, elementsNum, desc.getPrecision());
oclInstance->_queue.enqueueUnmapMemObject(sharedBuffer, mappedPtr);
}
InferenceEngine::Blob::Ptr sharedBlob = InferenceEngine::gpu::make_shared_blob(desc, context, sharedBuffer);
requests.at(requestId)->setBlob(name, sharedBlob);
};
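// Allocate device-side blobs for every input and output of each request;
// inputs are filled with random data, outputs stay uninitialized device buffers.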
for (size_t requestId = 0; requestId < requests.size(); requestId++) {
for (auto& item : exeNetwork.GetInputsInfo())
setShared(requestId, item.first, item.second->getTensorDesc(), true);
for (auto& item : exeNetwork.GetOutputsInfo())
setShared(requestId, item.first, item.second->getTensorDesc());
}
#else
IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked";
#endif
}
} // namespace gpu
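Output blobs created this way also live in device memory. For reference, a minimal sketch (not part of the patch; oclInstance, sharedBuffer and bufferSize stand for the objects created in setShared above) of reading such a buffer back to the host with the same cl2.hpp calls used for filling inputs:

// Illustration only: map the device buffer for reading, inspect it, then unmap.
cl_int mapErr = CL_SUCCESS;
void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(sharedBuffer,
                                                       CL_TRUE,  // blocking map
                                                       CL_MAP_READ,
                                                       0,
                                                       (cl::size_type)bufferSize,
                                                       nullptr,
                                                       nullptr,
                                                       &mapErr);
// ... read results through mappedPtr ...
oclInstance->_queue.enqueueUnmapMemObject(sharedBuffer, mappedPtr);
oclInstance->_queue.finish();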


@@ -0,0 +1,64 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#if defined(HAVE_GPU_DEVICE_MEM_SUPPORT)
# define HAVE_DEVICE_MEM_SUPPORT
# include <gpu/gpu_context_api_ocl.hpp>
#endif
#include <inference_engine.hpp>
#include "infer_request_wrap.hpp"
#include "utils.hpp"
namespace gpu {
#ifdef HAVE_DEVICE_MEM_SUPPORT
struct OpenCL {
cl::Context _context;
cl::Device _device;
cl::CommandQueue _queue;
explicit OpenCL(std::shared_ptr<std::vector<cl_context_properties>> media_api_context_properties = nullptr) {
// get Intel GPU OCL device, create context and queue
{
std::vector<cl::Device> devices;
std::vector<cl::Platform> platforms;
const unsigned int refVendorID = 0x8086;
cl::Platform::get(&platforms);
for (auto& p : platforms) {
p.getDevices(CL_DEVICE_TYPE_GPU, &devices);
for (auto& d : devices) {
if (refVendorID == d.getInfo<CL_DEVICE_VENDOR_ID>()) {
_device = d;
_context = cl::Context(_device);
break;
}
}
}
cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
_queue = cl::CommandQueue(_context, _device, props);
}
}
explicit OpenCL(cl_context context) {
// user-supplied context handle
_context = cl::Context(context, true);
_device = cl::Device(_context.getInfo<CL_CONTEXT_DEVICES>()[0].get(), true);
cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
_queue = cl::CommandQueue(_context, _device, props);
}
};
#endif
void fillRemoteBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests,
const InferenceEngine::ExecutableNetwork& exeNetwork);
} // namespace gpu


@@ -2,6 +2,40 @@
# SPDX-License-Identifier: Apache-2.0
#
if(NOT ENABLE_CLDNN)
return()
endif()
function(get_lib_name TARGET_NAME LIBRARY_NAME)
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(BUILD_SUFFIX ${IE_DEBUG_POSTFIX})
else()
set(BUILD_SUFFIX ${IE_RELEASE_POSTFIX})
endif()
if(WIN32)
set(LIB_SUFFIX "${BUILD_SUFFIX}${CMAKE_LINK_LIBRARY_SUFFIX}")
else()
set(LIB_SUFFIX "${BUILD_SUFFIX}${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()
set("${LIBRARY_NAME}" "${CMAKE_SHARED_MODULE_PREFIX}${TARGET_NAME}${LIB_SUFFIX}" PARENT_SCOPE)
endfunction()
function(get_lib_path OUTPUT_DIR FINAL_OUTPUT_DIR)
if(WIN32)
set(LIB_DIR "")
else()
set(LIB_DIR "lib")
endif()
if (NOT CMAKE_GENERATOR MATCHES "Ninja" AND NOT UNIX)
set(OUTPUT_DIR "${OUTPUT_DIR}/${CMAKE_BUILD_TYPE}")
endif()
set("${FINAL_OUTPUT_DIR}" "${OUTPUT_DIR}/${LIB_DIR}" PARENT_SCOPE)
endfunction()
set(OPENCL_ICD_LOADER_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cl_headers/" CACHE PATH "Path to OCL includes" FORCE)
set(OPENCL_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cl_headers/" "${CMAKE_CURRENT_SOURCE_DIR}/clhpp_headers/include/" CACHE PATH "Path to OCL (CL and CLHPP) includes" FORCE)
@@ -20,5 +54,10 @@ target_include_directories(OpenCL SYSTEM PUBLIC ${OPENCL_HEADERS_DIR})
# The following variables are needed to make find_package(OpenCL) work
set(OpenCL_VERSION_STRING "2.2" CACHE STRING "" FORCE)
set(OpenCL_INCLUDE_DIR "${OPENCL_ICD_LOADER_HEADERS_DIR}" CACHE PATH "" FORCE)
set(OpenCL_INCLUDE_DIR "${OPENCL_HEADERS_DIR}" CACHE PATH "" FORCE)
set(OPENCLROOT "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" CACHE PATH "" FORCE)
get_lib_path("${OUTPUT_ROOT}/${BIN_FOLDER}" OPENCL_LIB_DIR)
get_lib_name("OpenCL" OPENCL_LIB_NAME)
set(OpenCL_LIBRARY "${OPENCL_LIB_DIR}/${OPENCL_LIB_NAME}" CACHE PATH "" FORCE)