[IE CLDNN] Build time optimization for OCL kernels (#4725)

Build OCL kernel batches of different buckets in parallel

Co-authored-by: Donghyeon Jeong <donghyeon.jeong@intel.com>
Taylor Yeonbok Lee 2021-03-31 16:22:19 +09:00 committed by GitHub
parent d674eebd52
commit b58c648d2d
19 changed files with 340 additions and 180 deletions


@ -72,6 +72,11 @@ DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS);
*/
DECLARE_CLDNN_CONFIG_KEY(NV12_TWO_INPUTS);
/**
* @brief This key sets the maximum number of host threads that can be used by the GPU plugin during model loading.
* Default value is the maximum number of threads available in the environment.
*/
DECLARE_CLDNN_CONFIG_KEY(MAX_NUM_THREADS);
} // namespace CLDNNConfigParams
} // namespace InferenceEngine
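
For reference, a minimal sketch (assuming the 2021-era InferenceEngine C++ API and the usual cldnn_config.hpp header location) of how a caller could cap the number of compilation threads with the new key; the model path and the value "4" are illustrative only:

    #include <ie_core.hpp>
    #include <cldnn/cldnn_config.hpp>   // assumed header providing InferenceEngine::CLDNNConfigParams

    int main() {
        InferenceEngine::Core ie;
        auto network = ie.ReadNetwork("model.xml");   // hypothetical model path
        // Limit OCL kernel compilation during LoadNetwork to 4 host threads.
        auto exec_net = ie.LoadNetwork(network, "GPU",
            {{InferenceEngine::CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS, "4"}});
        return 0;
    }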


@ -48,6 +48,7 @@ DEFINE_string(i, "", image_message);
/// @brief Define parameter for set model file <br>
/// It is a required parameter
DEFINE_string(m, "", model_message);
DEFINE_string(m2, "", model_message);
/// @brief device the target device to infer on <br>
DEFINE_string(d, "CPU", target_device_message);


@ -40,6 +40,8 @@ target_include_directories(${TARGET_NAME} PRIVATE
set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})
set_ie_threading_interface_for(clDNN_lib)
# Failed because of OpenCL
# ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})


@ -11,6 +11,7 @@
#include "ie_api.h"
#include "file_utils.h"
#include "cldnn_itt.h"
#include <thread>
#ifdef _WIN32
# include <direct.h>
@ -221,6 +222,20 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
} else {
IE_THROW(NotFound) << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS) == 0) {
int max_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
try {
int val_i = std::stoi(val);
if (val_i <= 0 || val_i > max_threads) {
n_threads = max_threads;
} else {
n_threads = val_i;
}
} catch (const std::exception&) {
IE_THROW() << "Wrong value for property key " << CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS << ": " << val
<< "\nSpecify the number of threads use for build as an integer."
<< "\nOut of range value will be set as a default value, maximum concurrent threads.";
}
} else {
IE_THROW(NotFound) << "Unsupported property key by plugin: " << key;
}
@ -306,5 +321,6 @@ void Config::adjustKeyMapValues() {
key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams);
key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id;
key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = "";
key_config_map[CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS] = std::to_string(n_threads);
}
} // namespace CLDNNPlugin
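
The value handling above can be summarized by a small standalone helper; this is only an illustration of the clamping logic, not plugin code:

    #include <algorithm>
    #include <string>
    #include <thread>

    // Mirrors the KEY_CLDNN_MAX_NUM_THREADS handling: non-positive or
    // out-of-range values fall back to the hardware thread count.
    static int resolve_max_num_threads(const std::string& val) {
        const int max_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
        const int val_i = std::stoi(val);   // throws for non-numeric input, which the plugin reports as an error
        return (val_i <= 0 || val_i > max_threads) ? max_threads : val_i;
    }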


@ -31,7 +31,8 @@ struct Config {
graph_dumps_dir(""),
sources_dumps_dir(""),
device_id(""),
kernels_cache_dir("") {
kernels_cache_dir(""),
n_threads(std::max(static_cast<unsigned int>(1), std::thread::hardware_concurrency())) {
adjustKeyMapValues();
}
@ -56,6 +57,7 @@ struct Config {
std::string sources_dumps_dir;
std::string device_id;
std::string kernels_cache_dir;
size_t n_threads;
std::map<std::string, std::string> key_config_map;
};


@ -494,7 +494,8 @@ ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(const InferenceEn
context_config.tuningConfig.mode == current_config.tuningConfig.mode &&
context_config.tuningConfig.cache_file_path == current_config.tuningConfig.cache_file_path &&
context_config.kernels_cache_dir == current_config.kernels_cache_dir &&
context_config.device_id == current_config.device_id;
context_config.device_id == current_config.device_id &&
context_config.n_threads == current_config.n_threads;
};
{


@ -267,7 +267,8 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
m_config.queueThrottle,
m_config.memory_pool_on,
m_config.throughput_streams,
m_config.kernels_cache_dir));
m_config.kernels_cache_dir,
m_config.n_threads));
}
}


@ -4,7 +4,7 @@
#include <vector>
#include "low_precision_transformations/output_layers_handling_in_transformations.hpp"
#include "low_precision_transformations/output_layers_handling_in_transformations_for_concat.hpp"
#include "common_test_utils/test_constants.hpp"
using namespace LayerTestsDefinitions;
@ -19,11 +19,11 @@ const std::vector<LayerTransformation::Params> trasformationParamValues = {
LayerTestsUtils::LayerTransformationParamsFactory::createParams()
};
INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformations,
INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformationsForConcat,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::ValuesIn(trasformationParamValues)),
OutputLayersHandlingInTransformations::getTestCaseName);
OutputLayersHandlingInTransformationsForConcat::getTestCaseName);
} // namespace


@ -4,7 +4,7 @@
#include <vector>
#include "low_precision_transformations/output_layers_handling_in_transformations.hpp"
#include "low_precision_transformations/output_layers_handling_in_transformations_for_concat_multi_channel.hpp"
#include "common_test_utils/test_constants.hpp"
using namespace LayerTestsDefinitions;
@ -19,11 +19,11 @@ const std::vector<LayerTransformation::Params> trasformationParamValues = {
LayerTestsUtils::LayerTransformationParamsFactory::createParams()
};
INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformations,
INSTANTIATE_TEST_CASE_P(DISABLED_smoke_LPT, OutputLayersHandlingInTransformationsForConcatMultiChannel,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::ValuesIn(trasformationParamValues)),
OutputLayersHandlingInTransformations::getTestCaseName);
OutputLayersHandlingInTransformationsForConcatMultiChannel::getTestCaseName);
} // namespace


@ -39,7 +39,7 @@ if (ENABLE_CLDNN)
else()
set(CLDNN__ARCHITECTURE_TARGET "Linux64" CACHE STRING "" FORCE)
endif()
set(CLDNN_THREADING "${THREADING}" CACHE STRING "" FORCE)
add_subdirectory(clDNN)
# disable CLDNN docs build


@ -59,6 +59,14 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
# ======================================================================================================
# ====================================== HELPER CONSTANT VARIABLES =====================================
# ======================================================================================================
# ======================================================================================================
if("${CLDNN_THREADING}" MATCHES "SEQ")
add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_SEQ)
elseif("${CLDNN_THREADING}" MATCHES "TBB")
add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_TBB)
else()
add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_THREADPOOL)
endif()
# Path which points to main directory of project.
set(CLDNN__MAIN_DIR "${CMAKE_CURRENT_SOURCE_DIR}")


@ -123,6 +123,7 @@
#include <memory>
#include <string>
#include <type_traits>
#include <thread>
namespace cldnn {


@ -10,6 +10,7 @@
#include <stdexcept>
#include <vector>
#include <map>
#include <algorithm>
namespace cldnn {
@ -61,6 +62,7 @@ struct engine_configuration {
///< (switched off for older drivers than NEO).
uint16_t n_streams; ///< Number of queues executed in parallel
const std::string kernels_cache_path; ///< Path to compiled kernels cache
uint16_t n_threads; ///< Number of host threads used to build OCL kernels
const std::string tuning_cache_path; ///< Path to tuning kernel cache
/// @brief Constructs engine configuration with specified options.
@ -83,6 +85,7 @@ struct engine_configuration {
bool memory_pool = true,
uint16_t n_streams = 1,
const std::string& kernels_cache_path = "",
uint16_t n_threads = std::max(static_cast<uint16_t>(std::thread::hardware_concurrency()), static_cast<uint16_t>(1)),
const std::string& tuning_cache_path = "cache.json")
: enable_profiling(profiling)
, meaningful_kernels_names(decorate_kernel_names)
@ -97,6 +100,7 @@ struct engine_configuration {
, enable_memory_pool(memory_pool)
, n_streams(n_streams)
, kernels_cache_path(kernels_cache_path)
, n_threads(n_threads)
, tuning_cache_path(tuning_cache_path) {
if (n_streams == 0) {
throw std::invalid_argument("Invalid streams count set in engine config");


@ -83,6 +83,7 @@ gpu_toolkit_config convert_configuration(const engine_configuration conf) {
result.queues_num = conf.n_streams;
result.kernels_cache_path = conf.kernels_cache_path;
result.tuning_cache_path = conf.tuning_cache_path;
result.n_threads = conf.n_threads;
return result;
}


@ -4,6 +4,7 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "configuration.h"
#include <algorithm>
namespace cldnn {
namespace gpu {
@ -22,6 +23,7 @@ configuration::configuration()
throttle_mode(throttle_mode_types::disabled),
queues_num(0),
tuning_cache_path("cache.json"),
kernels_cache_path("") {}
kernels_cache_path(""),
n_threads(std::max(static_cast<uint16_t>(std::thread::hardware_concurrency()), static_cast<uint16_t>(1))) {}
} // namespace gpu
} // namespace cldnn


@ -31,6 +31,7 @@ struct configuration {
uint16_t queues_num;
std::string tuning_cache_path;
std::string kernels_cache_path;
uint16_t n_threads;
};
} // namespace gpu
} // namespace cldnn


@ -13,9 +13,17 @@
#include <string>
#include <memory>
#include <utility>
#include "kernel_selector_helper.h"
#include "cldnn_itt.h"
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
#include <thread>
#include <future>
#include <queue>
#include <condition_variable>
#endif
#ifndef ENABLE_UNICODE_PATH_SUPPORT
# ifdef _WIN32
@ -36,8 +44,10 @@
#include <Windows.h>
#endif
#if (CLDNN_THREADING != CLDNN_THREADING_SEQ)
#define DEFAULT_NUM_THREADS 2
#endif
namespace {
std::mutex cacheAccessMutex;
#ifdef ENABLE_UNICODE_PATH_SUPPORT
@ -84,7 +94,6 @@ static std::vector<unsigned char> loadBinaryFromFile(std::string path) {
return {};
}
static void saveBinaryToFile(std::string path, const std::vector<unsigned char> buffer) {
std::lock_guard<std::mutex> lock(cacheAccessMutex);
#if defined(ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
@ -190,9 +199,10 @@ size_t kernels_cache::get_max_kernels_per_batch() const {
return 10;
}
kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code& kernels_source_code) const {
void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::vector<kernels_cache::batch_program>* all_batches) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll::GetProgramSource");
sorted_code scode;
std::map<std::string, std::vector<batch_program>> program_buckets;
for (const auto& code : kernels_source_code) {
std::string full_code = code.kernel_strings->jit + code.kernel_strings->str;
@ -213,7 +223,7 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code&
std::string key = options;
if (batch_compilation == false) {
key += " __PROGRAM__" + std::to_string(scode.size());
key += " __PROGRAM__" + std::to_string(program_buckets.size());
}
if (dump_custom_program) {
@ -223,48 +233,63 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code&
if (one_time_kernel) {
key += " __ONE_TIME__";
}
auto& current_bucket = scode[key];
current_bucket.dump_custom_program = dump_custom_program;
current_bucket.one_time = one_time_kernel;
if (current_bucket.source.empty()) {
current_bucket.options = options;
auto& current_bucket = program_buckets[key];
if (current_bucket.empty()) { // new bucket
const auto& bucket_id = program_buckets.size() - 1;
current_bucket.push_back(batch_program());
current_bucket.back().bucket_id = static_cast<int32_t>(bucket_id);
current_bucket.back().batch_id = 0;
current_bucket.back().options = options;
}
// Create new kernels bucket when the limit is reached
if ((current_bucket.kernels_counter % get_max_kernels_per_batch()) == 0) {
current_bucket.source.push_back({});
// Create new kernels batch when the limit is reached
if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()) {
const auto& batch_id = current_bucket.size();
current_bucket.push_back(batch_program());
current_bucket.back().bucket_id = static_cast<int32_t>(program_buckets.size());
current_bucket.back().batch_id = static_cast<int32_t>(batch_id);
current_bucket.back().options = options;
}
current_bucket.entry_point_to_id[entry_point] = code.id;
auto& current_batch = current_bucket.back();
current_batch.dump_custom_program = dump_custom_program;
current_batch.one_time = one_time_kernel;
current_batch.entry_point_to_id[entry_point] = code.id;
assert(org_source_code.size() == 1);
current_bucket.source.back().push_back(std::move(org_source_code.front()));
current_bucket.kernels_counter++;
current_batch.source.push_back(std::move(org_source_code.front()));
current_batch.kernels_counter++;
}
// Compute hash value for each bucket
// Compute hash value for each batch
// Hash calculation might require additional optimizations, but currently execution time of this part is much smaller than loading
// of the precompiled binaries or get_undef_jit calls
// Hash is computed for string that contains compilation options + driver version +
// full source code (jit + template + undef sections) of all kernels in the bucket
for (auto& c : scode) {
program_code& code = c.second;
// full source code (jit + template + undef sections) of all kernels in the batches
for (auto& c : program_buckets) {
auto options = c.first;
for (size_t i = 0; i < code.source.size(); i++) {
auto& batches = c.second;
for (auto& b : batches) {
std::string full_code = options + " " + _context.get_device_info().driver_version;
for (auto& ss : code.source[i])
for (auto& ss : b.source)
full_code += ss;
code.hash_values.push_back(std::hash<std::string>()(full_code));
b.hash_value = std::hash<std::string>()(full_code);
all_batches->push_back(b);
}
}
return scode;
}
kernels_cache::kernels_cache(gpu_toolkit& context, uint32_t prog_id) : _context(context), _prog_id(prog_id) {}
kernels_cache::kernels_cache(gpu_toolkit& context, uint32_t prog_id) : _context(context), _prog_id(prog_id) {
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
int n_threads = _context.get_configuration().n_threads;
arena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena());
arena->initialize(n_threads);
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
int n_threads = _context.get_configuration().n_threads;
pool = std::unique_ptr<thread_pool>(new thread_pool(n_threads));
#endif
}
kernels_cache::kernel_id kernels_cache::set_kernel_source(
const std::shared_ptr<kernel_selector::kernel_string>& kernel_string,
@ -301,149 +326,160 @@ static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
return program.getInfo<CL_PROGRAM_BINARIES>().front();
}
kernels_cache::kernels_map kernels_cache::build_program(const program_code& program_source) const {
void kernels_cache::build_batch(const batch_program& batch) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram");
static uint32_t current_file_index = 0;
bool dump_sources = !_context.get_configuration().ocl_sources_dumps_dir.empty() || program_source.dump_custom_program;
bool dump_sources = !_context.get_configuration().ocl_sources_dumps_dir.empty() || batch.dump_custom_program;
std::string dump_file_name = "";
std::string err_log; // accumulated build log from all program's parts (only contains messages from parts which
std::string current_dump_file_name = "";
if (dump_sources) {
dump_file_name = _context.get_configuration().ocl_sources_dumps_dir;
if (!dump_file_name.empty() && dump_file_name.back() != '/')
dump_file_name += '/';
current_dump_file_name = _context.get_configuration().ocl_sources_dumps_dir;
if (!current_dump_file_name.empty() && current_dump_file_name.back() != '/')
current_dump_file_name += '/';
dump_file_name += "clDNN_program_" + std::to_string(current_file_index++) + "_part_";
current_dump_file_name += "clDNN_program_" + std::to_string(batch.bucket_id) + "_part_" + std::to_string(batch.batch_id) + ".cl";
}
std::ofstream dump_file;
if (dump_sources) {
dump_file.open(current_dump_file_name);
if (dump_file.good()) {
for (auto& s : batch.source)
dump_file << s;
}
}
std::string cached_bin_name = get_cache_path() + std::to_string(batch.hash_value) + ".cl_cache";
cl::Program::Binaries precompiled_kernels = {};
if (is_cache_enabled()) {
// Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket
// If read is successful, then remove kernels from compilation bucket
auto bin = loadBinaryFromFile(cached_bin_name);
if (!bin.empty()) {
precompiled_kernels.push_back(bin);
}
}
try {
kernels_map kmap;
std::string err_log; // accumulated build log from all program's parts (only contains messages from parts which
// failed to compile)
cl::vector<cl::Kernel> kernels;
// Run compilation
if (precompiled_kernels.empty()) {
cl::Program program(_context.context(), batch.source);
{
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram::RunCompilation");
program.build(_context.device(), batch.options.c_str());
}
if (dump_sources && dump_file.good()) {
dump_file << "\n/* Build Log:\n";
for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
dump_file << p.second << "\n";
dump_file << "*/\n";
}
program.createKernels(&kernels);
uint32_t part_idx = 0;
for (size_t i = 0; i < program_source.source.size(); i++) {
auto sources_bucket_to_compile = program_source.source[i];
const auto& hash_value = program_source.hash_values[i];
std::string cached_bin_name = get_cache_path() + std::to_string(hash_value) + ".cl_cache";
cl::Program::Binaries precompiled_kernels = {};
if (is_cache_enabled()) {
// Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket
// If read is successful, then remove kernels from compilation bucket
auto bin = loadBinaryFromFile(cached_bin_name);
if (!bin.empty()) {
precompiled_kernels.push_back(bin);
}
// If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache
// Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited
// Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer
// compile time.
saveBinaryToFile(cached_bin_name, getProgramBinaries(program));
}
auto current_dump_file_name = dump_file_name + std::to_string(part_idx++) + ".cl";
std::ofstream dump_file;
if (dump_sources) {
dump_file.open(current_dump_file_name);
if (dump_file.good()) {
for (auto& s : sources_bucket_to_compile)
dump_file << s;
}
}
try {
cl::vector<cl::Kernel> kernels;
// Run compilation
if (precompiled_kernels.empty()) {
cl::Program program(_context.context(), sources_bucket_to_compile);
{
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram::RunCompilation");
program.build(_context.device(), program_source.options.c_str());
}
if (dump_sources && dump_file.good()) {
dump_file << "\n/* Build Log:\n";
for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
dump_file << p.second << "\n";
dump_file << "*/\n";
}
program.createKernels(&kernels);
if (is_cache_enabled()) {
// If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache
// Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited
// Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer
// compile time.
saveBinaryToFile(cached_bin_name, getProgramBinaries(program));
} else {
cl::Program program(_context.context(), {_context.device()}, precompiled_kernels);
program.build(_context.device(), batch.options.c_str());
program.createKernels(&kernels);
}
{
std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
for (auto& k : kernels) {
const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
const auto& k_id = batch.entry_point_to_id.find(entry_point);
const auto& k_type = kernel_type(k, _context.get_device_info().supports_usm);
if (k_id != batch.entry_point_to_id.end()) {
const auto& kmap = std::make_pair(k_id->second, k_type);
if (batch.one_time) {
_one_time_kernels.insert(kmap);
} else {
_kernels.insert(kmap);
}
} else {
cl::Program program(_context.context(), {_context.device()}, precompiled_kernels);
program.build(_context.device(), program_source.options.c_str());
program.createKernels(&kernels);
throw std::runtime_error("Could not find entry point");
}
for (auto& k : kernels) {
auto kernel_name = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
kmap.emplace(kernel_name, kernels_cache::kernel_type(k, _context.get_device_info().supports_usm));
}
} catch (const cl::BuildError& err) {
if (dump_sources && dump_file.good())
dump_file << "\n/* Build Log:\n";
for (auto& p : err.getBuildLog()) {
if (dump_sources && dump_file.good())
dump_file << p.second << "\n";
err_log += p.second + '\n';
}
if (dump_sources && dump_file.good())
dump_file << "*/\n";
}
}
} catch (const cl::BuildError& err) {
if (dump_sources && dump_file.good())
dump_file << "\n/* Build Log:\n";
if (!err_log.empty()) {
throw std::runtime_error("Program build failed. You may enable OCL source dump to see the error log.\n");
for (auto& p : err.getBuildLog()) {
if (dump_sources && dump_file.good())
dump_file << p.second << "\n";
err_log += p.second + '\n';
}
return kmap;
} catch (const cl::Error& err) {
throw ocl_error(err);
if (dump_sources && dump_file.good())
dump_file << "*/\n";
}
if (!err_log.empty()) {
throw std::runtime_error("Program build failed. You may enable OCL source dump to see the error log.\n");
}
}
kernels_cache::kernel_type kernels_cache::get_kernel(kernel_id id, bool one_time_kernel) {
build_all();
if (one_time_kernel) {
return _one_time_kernels.at(id);
} else {
return _kernels.at(id);
}
kernels_cache::kernel_type kernels_cache::get_kernel(kernel_id id, bool one_time_kernel) const {
if (_pending_compilation)
throw std::runtime_error("Kernel cache is not compiled, call build_all() first!");
const auto& kernels = one_time_kernel ? _one_time_kernels : _kernels;
auto res = kernels.find(id);
if (kernels.end() == res)
throw std::runtime_error("Kernel " + id + " not found in the kernel cache!");
return res->second;
}
void kernels_cache::build_all() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll");
if (!_pending_compilation)
return;
std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
auto sorted_program_code = get_program_source(_kernels_code);
_one_time_kernels.clear();
for (auto& program : sorted_program_code) {
auto kernels = build_program(program.second);
for (auto& k : kernels) {
const auto& entry_point = k.first;
const auto& k_id = program.second.entry_point_to_id[entry_point];
if (program.second.one_time) {
_one_time_kernels[k_id] = k.second;
} else {
_kernels[k_id] = k.second;
}
}
std::vector<batch_program> batches;
{
std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
get_program_source(_kernels_code, &batches);
_one_time_kernels.clear();
}
_kernels_code.clear();
_pending_compilation = false;
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
arena->execute([this, &batches] {
tbb::parallel_for(tbb::blocked_range<size_t>(0, batches.size()), [this, &batches](const tbb::blocked_range<size_t>& r) {
for (auto i = r.begin(); i != r.end(); ++i) {
build_batch(batches[i]);
}
});
});
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
std::vector<std::future<void>> builds;
for (size_t i = 0; i < batches.size(); ++i) {
builds.push_back(pool->enqueue([this, &batches, i] () {
build_batch(batches[i]);
}));
}
std::for_each(builds.begin(), builds.end(), [] (std::future<void>& f) { f.wait(); });
#else
// no parallel build
for (const auto& batch : batches) {
build_batch(batch);
}
#endif
{
std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
_kernels_code.clear();
_pending_compilation = false;
}
}
void kernels_cache::reset() {
@ -452,6 +488,5 @@ void kernels_cache::reset() {
_kernels_code.clear();
_pending_compilation = false;
}
} // namespace gpu
} // namespace cldnn
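
The dispatch pattern that build_all() uses can be illustrated with a standard-library-only sketch; the actual code runs the same loop inside a tbb::task_arena or on the custom thread_pool added in kernels_cache.h (below), so std::async here is a simplification:

    #include <future>
    #include <vector>

    struct batch {};   // stands in for kernels_cache::batch_program
    void build_batch(const batch&) { /* compile one OCL program; body omitted */ }

    void build_all_batches(const std::vector<batch>& batches) {
        std::vector<std::future<void>> builds;
        builds.reserve(batches.size());
        for (const auto& b : batches)
            builds.push_back(std::async(std::launch::async, [&b]() { build_batch(b); }));
        for (auto& f : builds)
            f.wait();   // block until every batch has finished compiling
    }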


@ -13,6 +13,19 @@
#include <unordered_set>
#include <kernel_selector_common.h>
#define CLDNN_THREADING_SEQ 0
#define CLDNN_THREADING_TBB 1
#define CLDNN_THREADING_THREADPOOL 2
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
#include <tbb/task_arena.h>
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
#include <queue>
#include <future>
#include <functional>
#include <condition_variable>
#endif
namespace cl {
class Kernel;
class KernelIntel;
@ -26,14 +39,76 @@ namespace cldnn {
namespace gpu {
class gpu_toolkit;
#if (CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
class thread_pool {
public:
thread_pool(size_t num_threads) : _stop_pool(false) {
_workers.reserve(num_threads);
for (size_t i = 0; i < num_threads; ++i) {
_workers.emplace_back(std::thread(&thread_pool::worker_thread, this));
}
}
~thread_pool() {
{
std::lock_guard<std::mutex> lock(_q_m);
_stop_pool = true;
}
this->wait_all();
}
template <class F, class... Args>
std::future<typename std::result_of<F(Args...)>::type> enqueue(F&& f, Args&&... args) {
if (_stop_pool) {
throw std::runtime_error("Thread pool is stoped");
}
using return_type = typename std::result_of<F(Args...)>::type;
auto task = std::make_shared<std::packaged_task<return_type()>> (std::bind(std::forward<F>(f), std::forward<Args>(args)...));
std::future<return_type> result = task->get_future();
{
std::lock_guard<std::mutex> lock(_q_m);
_tasks.push([task]() {(*task)();});
}
_cv.notify_one();
return result;
}
void wait_all() {
_cv.notify_all();
for (auto& w : _workers) {
w.join();
}
}
private:
std::vector<std::thread> _workers;
std::queue<std::function<void()>> _tasks;
std::condition_variable _cv;
std::mutex _q_m;
bool _stop_pool;
void worker_thread() {
while (true) {
std::unique_lock<std::mutex> lock(this->_q_m);
_cv.wait(lock, [this]() { return (!this->_tasks.empty()) || (_stop_pool); });
if ((_stop_pool) && (this->_tasks.empty())) return;
auto task = std::move(_tasks.front());
this->_tasks.pop();
lock.unlock();
task();
}
}
};
#endif
class kernels_cache {
public:
using source_code = std::vector<std::string>;
struct program_code {
std::vector<source_code> source;
std::vector<size_t> hash_values;
struct batch_program {
int32_t bucket_id = 0;
int32_t batch_id = 0;
source_code source;
size_t hash_value;
uint32_t kernels_counter = 0;
std::string options;
bool dump_custom_program = false;
@ -69,7 +144,6 @@ public:
typedef std::string kernel_id;
typedef cl::KernelIntel kernel_type;
using sorted_code = std::map<std::string, program_code>;
using kernels_map = std::map<std::string, kernel_type>;
using kernels_code = std::unordered_set<kernel_code, hash_kernel_code>;
@ -77,13 +151,19 @@ private:
gpu_toolkit& _context;
kernels_code _kernels_code;
std::atomic<bool> _pending_compilation{false};
std::map<std::string, kernel_type> _kernels;
std::map<std::string, kernel_type> _one_time_kernels; // These kernels are intended to be executed only once (can
std::map<const std::string, const kernel_type> _kernels;
std::map<const std::string, const kernel_type> _one_time_kernels; // These kernels are intended to be executed only once (can
// be removed later from the cache).
uint32_t _prog_id;
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
std::unique_ptr<tbb::task_arena> arena;
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
std::unique_ptr<thread_pool> pool;
#endif
sorted_code get_program_source(const kernels_code& kernels_source_code) const;
kernels_map build_program(const program_code& pcode) const;
void get_program_source(const kernels_code& kernels_source_code, std::vector<batch_program>*) const;
void build_batch(const batch_program& batch);
std::string get_cache_path() const;
bool is_cache_enabled() const;
@ -94,7 +174,7 @@ public:
kernel_id set_kernel_source(const std::shared_ptr<kernel_selector::kernel_string>& kernel_string,
bool dump_custom_program,
bool one_time_kernel);
kernel_type get_kernel(kernel_id id, bool one_time_kernel);
kernel_type get_kernel(kernel_id id, bool one_time_kernel) const;
gpu_toolkit& get_context() { return _context; }
// forces compilation of all pending kernels/programs
void build_all();
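
A minimal usage sketch for the thread_pool helper above (assuming a CLDNN_THREADING_THREADPOOL build), mirroring how build_all() drives it:

    cldnn::gpu::thread_pool pool(4);   // 4 worker threads
    std::vector<std::future<void>> results;
    for (int i = 0; i < 8; ++i) {
        results.push_back(pool.enqueue([i]() {
            (void)i;   // placeholder for build_batch(batches[i])
        }));
    }
    for (auto& f : results)
        f.wait();   // all tasks finished; the pool joins its workers on destruction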


@ -3,37 +3,37 @@
<models>
<!--Models with FP32 precision-->
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="740214" vmpeak="805110" vmrss="129308" vmhwm="129308" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="922147" vmpeak="922147" vmrss="587522" vmhwm="587522" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="2709506" vmpeak="2794703" vmrss="1342104" vmhwm="1342104" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="1007890" vmpeak="1007890" vmrss="138652" vmhwm="138652" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="1006439" vmpeak="1091636" vmrss="587241" vmhwm="587241" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="2709501" vmpeak="2794698" vmrss="1291404" vmhwm="1291404" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="691589" vmpeak="922864" vmrss="31054" vmhwm="31054" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="626194" vmpeak="626194" vmrss="290695" vmhwm="290695" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="626194" vmpeak="626194" vmrss="403228" vmhwm="403228" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="958240" vmpeak="1043437" vmrss="31366" vmhwm="31366" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="708734" vmpeak="793930" vmrss="287877" vmhwm="287877" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="854417" vmpeak="939614" vmrss="402339" vmhwm="402339" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="1046988" vmpeak="1179042" vmrss="307990" vmhwm="439457" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="1267775" vmpeak="1279647" vmrss="932672" vmhwm="944626" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="2969241" vmpeak="2969241" vmrss="1506492" vmhwm="1506492" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="1321819" vmpeak="1321819" vmrss="374207" vmhwm="439748" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="1356565" vmpeak="1441762" vmrss="941418" vmhwm="947060" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="2605324" vmpeak="26900521" vmrss="1549958" vmhwm="1549958" />
<model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="2133814" vmpeak="2836412" vmrss="1438049" vmhwm="2140533" />
<model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="2801422" vmpeak="3915366" vmrss="2465065" vmhwm="3578811" />
<model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="2401380" vmpeak="2836412" vmrss="1469832" vmhwm="2140377" />
<model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="2892432" vmpeak="3939166" vmrss="2472017" vmhwm="3602924" />
<!--Models with FP16 precision-->
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="1057487" vmpeak="1085224" vmrss="109694" vmhwm="137295" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="760942" vmpeak="760942" vmrss="418298" vmhwm="418298" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2667537" vmpeak="2752734" vmrss="1304919" vmhwm="1304919" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="1058844" vmpeak="1085224" vmrss="123016" vmhwm="136682" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="845348" vmpeak="930545" vmrss="417445" vmhwm="417445" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="2347389" vmpeak="2432586" vmrss="1290504" vmhwm="1290504" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="955427" vmpeak="955806" vmrss="27700" vmhwm="27700" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="680862" vmpeak="680862" vmrss="331858" vmhwm="331858" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2123113" vmpeak="2208310" vmrss="453814" vmhwm="453814" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="955827" vmpeak="955827" vmrss="27222" vmhwm="27222" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="766053" vmpeak="851250" vmrss="331458" vmhwm="331458" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="1760990" vmpeak="1760990" vmrss="454173" vmhwm="454173" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="1372961" vmpeak="1505639" vmrss="369969" vmhwm="501649" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="866543" vmpeak="866543" vmrss="523967" vmhwm="523967" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2746588" vmpeak="2831784" vmrss="1296328" vmhwm="1296328" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="1381265" vmpeak="1505472" vmrss="437039" vmhwm="500630" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="951584" vmpeak="1036781" vmrss="528060" vmhwm="528060" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="2380580" vmpeak="2465777" vmrss="1326369" vmhwm="1326369" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="2748220" vmpeak="3450818" vmrss="1783704" vmhwm="2486161" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="1397463" vmpeak="1994402" vmrss="1049625" vmhwm="1629414" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2181312" vmpeak="2582752" vmrss="1060712" vmhwm="1629414" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="2749458" vmpeak="3450818" vmrss="1816765" vmhwm="2486525" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="1482655" vmpeak="1998812" vmrss="1049692" vmhwm="1630782" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="2910814" vmpeak="3347489" vmrss="1371380" vmhwm="1717102" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
</models>
</attributes>