Add use_device_mem option to benchmark_app (#7433)

Parent: 8690e14a5b
Commit: 1f85d4230d
inference-engine/samples/benchmark_app/CMakeLists.txt
@@ -2,11 +2,46 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+set(TARGET_NAME "benchmark_app")
+
 file (GLOB SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 file (GLOB HDR ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
 
-ie_add_sample(NAME benchmark_app
+ie_add_sample(NAME ${TARGET_NAME}
               SOURCES ${SRC}
               HEADERS ${HDR}
               DEPENDENCIES format_reader ie_samples_utils
               OPENCV_DEPENDENCIES core)
+
+find_package(OpenCL)
+
+find_path(OpenCL_HPP_INCLUDE_DIR
+    NAMES
+        CL/cl2.hpp OpenCL/cl2.hpp
+    HINTS
+        ${opencl_root_hints}
+        ENV "PROGRAMFILES(X86)"
+        ENV AMDAPPSDKROOT
+        ENV INTELOCLSDKROOT
+        ENV NVSDKCOMPUTE_ROOT
+        ENV CUDA_PATH
+        ENV ATISTREAMSDKROOT
+        ENV OCL_ROOT
+    PATH_SUFFIXES
+        include
+        OpenCL/common/inc
+        "AMD APP/include")
+
+if(OPENCL_HEADERS_DIR)
+    # Use OpenCL CPP headers from sources if present
+    set(OpenCL_HEADERS ${OPENCL_HEADERS_DIR})
+elseif(OpenCL_HPP_INCLUDE_DIR)
+    # Append OpenCL CPP headers to C headers and use both
+    set(OpenCL_HEADERS ${OpenCL_INCLUDE_DIR} ${OpenCL_HPP_INCLUDE_DIR})
+endif()
+
+if(OpenCL_FOUND AND OpenCL_HEADERS)
+    target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL)
+    target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_HEADERS})
+    target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_GPU_DEVICE_MEM_SUPPORT)
+endif()
inference-engine/samples/benchmark_app/benchmark_app.hpp
@@ -4,6 +4,10 @@
 
 #pragma once
 
+#if defined(HAVE_GPU_DEVICE_MEM_SUPPORT)
+#    define HAVE_DEVICE_MEM_SUPPORT
+#endif
+
 #include <gflags/gflags.h>
 
 #include <iostream>
@@ -132,6 +136,12 @@ static const char progress_message[] =
 // @brief message for performance counters option
 static const char pc_message[] = "Optional. Report performance counters.";
 
+#ifdef HAVE_DEVICE_MEM_SUPPORT
+// @brief message for switching memory allocation type option
+static const char use_device_mem_message[] =
+    "Optional. Switch between host and device memory allocation for input and output buffers.";
+#endif
+
 #ifdef USE_OPENCV
 // @brief message for load config option
 static const char load_config_message[] =
@@ -266,6 +276,11 @@ DEFINE_bool(progress, false, progress_message);
 /// @brief Define flag for showing performance counters <br>
 DEFINE_bool(pc, false, pc_message);
 
+#ifdef HAVE_DEVICE_MEM_SUPPORT
+/// @brief Define flag for switching between host and device memory allocation for input and output buffers
+DEFINE_bool(use_device_mem, false, use_device_mem_message);
+#endif
+
 #ifdef USE_OPENCV
 /// @brief Define flag for loading configuration file <br>
 DEFINE_string(load_config, "", load_config_message);
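The new flag follows the sample's existing gflags pattern. As a stand-alone illustration (not part of the patch), a boolean defined this way becomes available as FLAGS_use_device_mem after parsing:

    #include <gflags/gflags.h>
    #include <iostream>

    DEFINE_bool(use_device_mem, false, "Switch between host and device memory allocation.");

    int main(int argc, char* argv[]) {
        // Same gflags entry point the sample uses for its own flags.
        gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
        std::cout << "use_device_mem = " << std::boolalpha << FLAGS_use_device_mem << std::endl;
        return 0;
    }

Invoking such a binary with -use_device_mem prints use_device_mem = true.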
@@ -339,6 +354,9 @@ static void showUsage() {
     std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
     std::cout << " -enforcebf16=<true/false> " << enforce_bf16_message << std::endl;
     std::cout << " -pin \"YES\"/\"HYBRID_AWARE\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl;
+#ifdef HAVE_DEVICE_MEM_SUPPORT
+    std::cout << " -use_device_mem " << use_device_mem_message << std::endl;
+#endif
     std::cout << std::endl << " Statistics dumping options:" << std::endl;
     std::cout << " -report_type \"<type>\" " << report_type_message << std::endl;
     std::cout << " -report_folder " << report_folder_message << std::endl;
inference-engine/samples/benchmark_app/infer_request_wrap.hpp
@@ -65,6 +65,10 @@ public:
         return _request.GetBlob(name);
     }
 
+    void setBlob(const std::string& name, const InferenceEngine::Blob::Ptr& data) {
+        _request.SetBlob(name, data);
+    }
+
     double getExecutionTimeInMilliseconds() const {
         auto execTime = std::chrono::duration_cast<ns>(_endTime - _startTime);
         return static_cast<double>(execTime.count()) * 0.000001;
inference-engine/samples/benchmark_app/main.cpp
@@ -21,6 +21,7 @@
 #include "infer_request_wrap.hpp"
 #include "inputs_filling.hpp"
 #include "progress_bar.hpp"
+#include "remote_blobs_filling.hpp"
 #include "statistics_report.hpp"
 #include "utils.hpp"
 
@@ -592,7 +593,16 @@ int main(int argc, char* argv[]) {
         next_step();
 
         InferRequestsQueue inferRequestsQueue(exeNetwork, nireq);
-        fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
+        if (isFlagSetInCommandLine("use_device_mem")) {
+            if (device_name.find("GPU") == 0)
+                ::gpu::fillRemoteBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests, exeNetwork);
+            else if (device_name.find("CPU") == 0)
+                fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
+            else
+                IE_THROW() << "Requested device doesn't support `use_device_mem` option.";
+        } else {
+            fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
+        }
 
         // ----------------- 10. Measuring performance
         // ------------------------------------------------------------------
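Note that the dispatch keys off whether the user passed -use_device_mem at all (isFlagSetInCommandLine) rather than the flag's current value. That helper lives in the sample's utils and is not shown in this diff; a hedged sketch of how such a check is typically written with gflags (the actual implementation may differ):

    #include <gflags/gflags.h>
    #include <string>

    static bool isFlagSetInCommandLine(const std::string& name) {
        gflags::CommandLineFlagInfo info;
        // is_default stays true until the flag is set explicitly on the command line
        return gflags::GetCommandLineFlagInfo(name.c_str(), &info) && !info.is_default;
    }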
inference-engine/samples/benchmark_app/remote_blobs_filling.cpp (new file, 140 lines)
@@ -0,0 +1,140 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "remote_blobs_filling.hpp"

#include <functional>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

namespace gpu {

template <typename T>
using uniformDistribution = typename std::conditional<
    std::is_floating_point<T>::value,
    std::uniform_real_distribution<T>,
    typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;

template <typename T, typename T2>
void fillBufferRandom(void* inputBuffer,
                      size_t elementsNum,
                      T rand_min = std::numeric_limits<uint8_t>::min(),
                      T rand_max = std::numeric_limits<uint8_t>::max()) {
    std::mt19937 gen(0);
    uniformDistribution<T2> distribution(rand_min, rand_max);
    auto inputBufferData = static_cast<T*>(inputBuffer);
    for (size_t i = 0; i < elementsNum; i++) {
        inputBufferData[i] = static_cast<T>(distribution(gen));
    }
}

void fillBuffer(void* inputBuffer, size_t elementsNum, InferenceEngine::Precision precision) {
    if (precision == InferenceEngine::Precision::FP32) {
        fillBufferRandom<float, float>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::FP16) {
        fillBufferRandom<short, short>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::I32) {
        fillBufferRandom<int32_t, int32_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::I64) {
        fillBufferRandom<int64_t, int64_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::U8) {
        // uniform_int_distribution<uint8_t> is not allowed in the C++17
        // standard and vs2017/19
        fillBufferRandom<uint8_t, uint32_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::I8) {
        // uniform_int_distribution<int8_t> is not allowed in the C++17 standard
        // and vs2017/19
        fillBufferRandom<int8_t, int32_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::U16) {
        fillBufferRandom<uint16_t, uint16_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::I16) {
        fillBufferRandom<int16_t, int16_t>(inputBuffer, elementsNum);
    } else if (precision == InferenceEngine::Precision::BOOL) {
        fillBufferRandom<uint8_t, uint32_t>(inputBuffer, elementsNum, 0, 1);
    } else {
        IE_THROW() << "Requested precision is not supported";
    }
}

size_t getBytesPerElement(InferenceEngine::Precision precision) {
    switch (precision) {
    case InferenceEngine::Precision::FP32:
        return 4;
    case InferenceEngine::Precision::FP16:
        return 2;
    case InferenceEngine::Precision::I32:
        return 4;
    case InferenceEngine::Precision::I64:
        return 8;
    case InferenceEngine::Precision::U8:
        return 1;
    case InferenceEngine::Precision::I8:
        return 1;
    case InferenceEngine::Precision::U16:
        return 2;
    case InferenceEngine::Precision::I16:
        return 2;
    case InferenceEngine::Precision::BOOL:
        return 1;
    default:
        IE_THROW() << "Requested precision is not supported";
    }
}

void fillRemoteBlobs(const std::vector<std::string>& inputFiles,
                     const size_t& batchSize,
                     benchmark_app::InputsInfo& app_inputs_info,
                     std::vector<InferReqWrap::Ptr> requests,
                     const InferenceEngine::ExecutableNetwork& exeNetwork) {
#ifdef HAVE_DEVICE_MEM_SUPPORT
    slog::info << "Device memory will be used for input and output blobs" << slog::endl;
    if (inputFiles.size()) {
        slog::warn << "Device memory supports only random data at this moment, input images will be ignored"
                   << slog::endl;
    }
    auto context = exeNetwork.GetContext();
    auto oclContext = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(context)->get();
    auto oclInstance = std::make_shared<OpenCL>(oclContext);

    auto setShared = [&](size_t requestId,
                         const std::string name,
                         const InferenceEngine::TensorDesc& desc,
                         bool fillRandom = false) {
        cl_int err;
        auto inputDims = desc.getDims();
        auto elementsNum = std::accumulate(begin(inputDims), end(inputDims), 1, std::multiplies<size_t>());
        auto inputSize = elementsNum * getBytesPerElement(desc.getPrecision());

        cl::Buffer sharedBuffer =
            cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err);

        if (fillRandom) {
            // Map the buffer for host writes, fill it with random data, unmap.
            void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(sharedBuffer,
                                                                   CL_TRUE,
                                                                   CL_MAP_WRITE,
                                                                   0,
                                                                   (cl::size_type)inputSize);
            fillBuffer(mappedPtr, elementsNum, desc.getPrecision());
            oclInstance->_queue.enqueueUnmapMemObject(sharedBuffer, mappedPtr);
        }

        InferenceEngine::Blob::Ptr sharedBlob = InferenceEngine::gpu::make_shared_blob(desc, context, sharedBuffer);

        requests.at(requestId)->setBlob(name, sharedBlob);
    };

    for (size_t requestId = 0; requestId < requests.size(); requestId++) {
        for (auto& item : exeNetwork.GetInputsInfo())
            setShared(requestId, item.first, item.second->getTensorDesc(), true);

        for (auto& item : exeNetwork.GetOutputsInfo())
            setShared(requestId, item.first, item.second->getTensorDesc());
    }
#else
    IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked";
#endif
}

}  // namespace gpu
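For reference, the allocate-map-fill-unmap sequence at the heart of fillRemoteBlobs(), condensed into a self-contained sketch (OpenCL C++ bindings only, no Inference Engine; first GPU of the first platform taken, error handling elided):

    #include <CL/cl2.hpp>

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    int main() {
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        std::vector<cl::Device> devices;
        platforms.at(0).getDevices(CL_DEVICE_TYPE_GPU, &devices);

        cl::Context context(devices.at(0));
        cl::CommandQueue queue(context, devices.at(0));

        const cl::size_type size = 1024;  // buffer size in bytes
        cl::Buffer buffer(context, CL_MEM_READ_WRITE, size);

        // Map the device buffer into host memory, write to it, then unmap so
        // the device owns the data again, the same pattern setShared() uses.
        void* ptr = queue.enqueueMapBuffer(buffer, CL_TRUE /*blocking*/, CL_MAP_WRITE, 0, size);
        std::fill_n(static_cast<uint8_t*>(ptr), size, 0);
        queue.enqueueUnmapMemObject(buffer, ptr);
        queue.finish();
        return 0;
    }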
inference-engine/samples/benchmark_app/remote_blobs_filling.hpp (new file, 64 lines)
@@ -0,0 +1,64 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#if defined(HAVE_GPU_DEVICE_MEM_SUPPORT)
#    define HAVE_DEVICE_MEM_SUPPORT
#    include <gpu/gpu_context_api_ocl.hpp>
#endif

#include <inference_engine.hpp>

#include "infer_request_wrap.hpp"
#include "utils.hpp"

namespace gpu {

#ifdef HAVE_DEVICE_MEM_SUPPORT
struct OpenCL {
    cl::Context _context;
    cl::Device _device;
    cl::CommandQueue _queue;

    explicit OpenCL(std::shared_ptr<std::vector<cl_context_properties>> media_api_context_properties = nullptr) {
        // get Intel GPU OCL device, create context and queue
        {
            std::vector<cl::Device> devices;
            std::vector<cl::Platform> platforms;
            const unsigned int refVendorID = 0x8086;  // Intel's PCI vendor ID

            cl::Platform::get(&platforms);
            for (auto& p : platforms) {
                p.getDevices(CL_DEVICE_TYPE_GPU, &devices);
                for (auto& d : devices) {
                    if (refVendorID == d.getInfo<CL_DEVICE_VENDOR_ID>()) {
                        _device = d;
                        _context = cl::Context(_device);
                        break;
                    }
                }
            }

            cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
            _queue = cl::CommandQueue(_context, _device, props);
        }
    }

    explicit OpenCL(cl_context context) {
        // user-supplied context handle
        _context = cl::Context(context, true);
        _device = cl::Device(_context.getInfo<CL_CONTEXT_DEVICES>()[0].get(), true);

        cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
        _queue = cl::CommandQueue(_context, _device, props);
    }
};
#endif

void fillRemoteBlobs(const std::vector<std::string>& inputFiles,
                     const size_t& batchSize,
                     benchmark_app::InputsInfo& app_inputs_info,
                     std::vector<InferReqWrap::Ptr> requests,
                     const InferenceEngine::ExecutableNetwork& exeNetwork);

}  // namespace gpu
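A hypothetical usage sketch of the helper above (assumes the header is on the include path and OpenCL support was compiled in): the default constructor discovers an Intel GPU and builds its own context and queue, while the cl_context overload wraps a handle that already exists, which is how fillRemoteBlobs() reuses the context owned by the compiled network:

    #include "remote_blobs_filling.hpp"

    int main() {
    #ifdef HAVE_DEVICE_MEM_SUPPORT
        gpu::OpenCL standalone;                      // finds an Intel GPU, creates context + queue
        cl_context raw = standalone._context.get();  // a raw handle, as a plugin might hand one out
        gpu::OpenCL wrapped(raw);                    // wraps the existing context instead of creating one
    #endif
        return 0;
    }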
thirdparty/ocl/CMakeLists.txt (vendored, 41 lines changed)
@@ -2,6 +2,40 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+if(NOT ENABLE_CLDNN)
+    return()
+endif()
+
+function(get_lib_name TARGET_NAME LIBRARY_NAME)
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+        set(BUILD_SUFFIX ${IE_DEBUG_POSTFIX})
+    else()
+        set(BUILD_SUFFIX ${IE_RELEASE_POSTFIX})
+    endif()
+
+    if(WIN32)
+        set(LIB_SUFFIX "${BUILD_SUFFIX}${CMAKE_LINK_LIBRARY_SUFFIX}")
+    else()
+        set(LIB_SUFFIX "${BUILD_SUFFIX}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    endif()
+
+    set("${LIBRARY_NAME}" "${CMAKE_SHARED_MODULE_PREFIX}${TARGET_NAME}${LIB_SUFFIX}" PARENT_SCOPE)
+endfunction()
+
+function(get_lib_path OUTPUT_DIR FINAL_OUTPUT_DIR)
+    if(WIN32)
+        set(LIB_DIR "")
+    else()
+        set(LIB_DIR "lib")
+    endif()
+
+    if (NOT CMAKE_GENERATOR MATCHES "Ninja" AND NOT UNIX)
+        set(OUTPUT_DIR "${OUTPUT_DIR}/${CMAKE_BUILD_TYPE}")
+    endif()
+
+    set("${FINAL_OUTPUT_DIR}" "${OUTPUT_DIR}/${LIB_DIR}" PARENT_SCOPE)
+endfunction()
+
 set(OPENCL_ICD_LOADER_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cl_headers/" CACHE PATH "Path to OCL includes" FORCE)
 
 set(OPENCL_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cl_headers/" "${CMAKE_CURRENT_SOURCE_DIR}/clhpp_headers/include/" CACHE PATH "Path to OCL (CL and CLHPP) includes" FORCE)
@@ -20,5 +54,10 @@ target_include_directories(OpenCL SYSTEM PUBLIC ${OPENCL_HEADERS_DIR})
 
 # The following variables are needed to make find_package(OpenCL) work
 set(OpenCL_VERSION_STRING "2.2" CACHE STRING "" FORCE)
-set(OpenCL_INCLUDE_DIR "${OPENCL_ICD_LOADER_HEADERS_DIR}" CACHE PATH "" FORCE)
+set(OpenCL_INCLUDE_DIR "${OPENCL_HEADERS_DIR}" CACHE PATH "" FORCE)
 set(OPENCLROOT "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" CACHE PATH "" FORCE)
+
+get_lib_path("${OUTPUT_ROOT}/${BIN_FOLDER}" OPENCL_LIB_DIR)
+get_lib_name("OpenCL" OPENCL_LIB_NAME)
+
+set(OpenCL_LIBRARY "${OPENCL_LIB_DIR}/${OPENCL_LIB_NAME}" CACHE PATH "" FORCE)
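As a worked example under assumed values (hypothetical: a Debug build on Linux with IE_DEBUG_POSTFIX set to "_d"), get_lib_name("OpenCL" OPENCL_LIB_NAME) composes ${CMAKE_SHARED_MODULE_PREFIX} + "OpenCL" + "_d" + ${CMAKE_SHARED_LIBRARY_SUFFIX}, yielding libOpenCL_d.so, and get_lib_path() appends "lib" on non-Windows hosts, so OpenCL_LIBRARY would resolve to something like ${OUTPUT_ROOT}/${BIN_FOLDER}/lib/libOpenCL_d.so. This lets find_package(OpenCL) in the sample pick up the vendored ICD loader instead of a system one.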