[IE CLDNN] Build time optimization for OCL kernels (#4725)
Build OCL kernel batches of different buckets in parallel.

Co-authored-by: Donghyeon Jeong <donghyeon.jeong@intel.com>
parent: d674eebd52
commit: b58c648d2d
@@ -72,6 +72,11 @@ DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS);
 */
 DECLARE_CLDNN_CONFIG_KEY(NV12_TWO_INPUTS);
 
+/**
+* @brief This key sets the max number of host threads that can be used by GPU plugin on model loading.
+* Default value is the maximum number of threads available in the environment.
+*/
+DECLARE_CLDNN_CONFIG_KEY(MAX_NUM_THREADS);
 
 }  // namespace CLDNNConfigParams
 }  // namespace InferenceEngine
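For context, a minimal sketch of how an application would opt into the new key (assuming the usual InferenceEngine::Core flow and the 2021.x header layout; "model.xml" is a placeholder path):

#include <map>
#include <string>
#include <inference_engine.hpp>
#include <cldnn/cldnn_config.hpp>

int main() {
    InferenceEngine::Core core;
    auto network = core.ReadNetwork("model.xml");  // placeholder model path
    // Cap kernel compilation at 4 host threads; values outside
    // [1, hardware_concurrency] fall back to the default (all threads).
    std::map<std::string, std::string> config = {
        {InferenceEngine::CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS, "4"}};
    auto exec_net = core.LoadNetwork(network, "GPU", config);
    return 0;
}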
@@ -48,6 +48,7 @@ DEFINE_string(i, "", image_message);
 
 /// @brief Define parameter for set model file <br>
 /// It is a required parameter
 DEFINE_string(m, "", model_message);
+DEFINE_string(m2, "", model_message);
 
 /// @brief device the target device to infer on <br>
 DEFINE_string(d, "CPU", target_device_message);
@@ -40,6 +40,8 @@ target_include_directories(${TARGET_NAME} PRIVATE
 
 set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})
 
+set_ie_threading_interface_for(clDNN_lib)
+
 # Failed because of OpenCL
 # ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
@@ -11,6 +11,7 @@
 #include "ie_api.h"
 #include "file_utils.h"
 #include "cldnn_itt.h"
+#include <thread>
 
 #ifdef _WIN32
 # include <direct.h>
@@ -221,6 +222,20 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
         } else {
             IE_THROW(NotFound) << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val;
         }
+    } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS) == 0) {
+        int max_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
+        try {
+            int val_i = std::stoi(val);
+            if (val_i <= 0 || val_i > max_threads) {
+                n_threads = max_threads;
+            } else {
+                n_threads = val_i;
+            }
+        } catch (const std::exception&) {
+            IE_THROW() << "Wrong value for property key " << CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS << ": " << val
+                       << "\nSpecify the number of threads to use for build as an integer."
+                       << "\nOut-of-range values are reset to the default, the maximum number of concurrent threads.";
+        }
     } else {
         IE_THROW(NotFound) << "Unsupported property key by plugin: " << key;
     }
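The value handling above reduces to a simple rule; a standalone sketch (hypothetical helper name, same logic) for reference:

#include <algorithm>
#include <stdexcept>
#include <string>
#include <thread>

// Non-numeric input throws; values outside [1, hardware_concurrency]
// fall back to the maximum available threads.
int resolve_build_threads(const std::string& val) {
    const int max_threads =
        std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
    const int v = std::stoi(val);  // throws std::invalid_argument on junk
    return (v <= 0 || v > max_threads) ? max_threads : v;
}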
@@ -306,5 +321,6 @@ void Config::adjustKeyMapValues() {
     key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams);
     key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id;
     key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = "";
+    key_config_map[CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS] = std::to_string(n_threads);
 }
 }  // namespace CLDNNPlugin
@@ -31,7 +31,8 @@ struct Config {
         graph_dumps_dir(""),
         sources_dumps_dir(""),
         device_id(""),
-        kernels_cache_dir("") {
+        kernels_cache_dir(""),
+        n_threads(std::max(static_cast<unsigned int>(1), std::thread::hardware_concurrency())) {
         adjustKeyMapValues();
     }
@@ -56,6 +57,7 @@ struct Config {
     std::string sources_dumps_dir;
     std::string device_id;
     std::string kernels_cache_dir;
+    size_t n_threads;
 
     std::map<std::string, std::string> key_config_map;
 };
@@ -494,7 +494,8 @@ ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(const InferenceEn
                context_config.tuningConfig.mode == current_config.tuningConfig.mode &&
                context_config.tuningConfig.cache_file_path == current_config.tuningConfig.cache_file_path &&
                context_config.kernels_cache_dir == current_config.kernels_cache_dir &&
-               context_config.device_id == current_config.device_id;
+               context_config.device_id == current_config.device_id &&
+               context_config.n_threads == current_config.n_threads;
     };
 
     {
@@ -267,7 +267,8 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
                         m_config.queueThrottle,
                         m_config.memory_pool_on,
                         m_config.throughput_streams,
-                        m_config.kernels_cache_dir));
+                        m_config.kernels_cache_dir,
+                        m_config.n_threads));
     }
 }
@@ -4,7 +4,7 @@
 
 #include <vector>
 
-#include "low_precision_transformations/output_layers_handling_in_transformations.hpp"
+#include "low_precision_transformations/output_layers_handling_in_transformations_for_concat.hpp"
 #include "common_test_utils/test_constants.hpp"
 
 using namespace LayerTestsDefinitions;
@@ -19,11 +19,11 @@ const std::vector<LayerTransformation::Params> trasformationParamValues = {
     LayerTestsUtils::LayerTransformationParamsFactory::createParams()
 };
 
-INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformations,
+INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformationsForConcat,
     ::testing::Combine(
         ::testing::ValuesIn(netPrecisions),
         ::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })),
         ::testing::Values(CommonTestUtils::DEVICE_GPU),
         ::testing::ValuesIn(trasformationParamValues)),
-    OutputLayersHandlingInTransformations::getTestCaseName);
+    OutputLayersHandlingInTransformationsForConcat::getTestCaseName);
 }  // namespace
@@ -4,7 +4,7 @@
 
 #include <vector>
 
-#include "low_precision_transformations/output_layers_handling_in_transformations.hpp"
+#include "low_precision_transformations/output_layers_handling_in_transformations_for_concat_multi_channel.hpp"
 #include "common_test_utils/test_constants.hpp"
 
 using namespace LayerTestsDefinitions;
@@ -19,11 +19,11 @@ const std::vector<LayerTransformation::Params> trasformationParamValues = {
     LayerTestsUtils::LayerTransformationParamsFactory::createParams()
 };
 
-INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformations,
+INSTANTIATE_TEST_CASE_P(DISABLED_smoke_LPT, OutputLayersHandlingInTransformationsForConcatMultiChannel,
     ::testing::Combine(
         ::testing::ValuesIn(netPrecisions),
         ::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })),
         ::testing::Values(CommonTestUtils::DEVICE_GPU),
         ::testing::ValuesIn(trasformationParamValues)),
-    OutputLayersHandlingInTransformations::getTestCaseName);
+    OutputLayersHandlingInTransformationsForConcatMultiChannel::getTestCaseName);
 }  // namespace
inference-engine/thirdparty/CMakeLists.txt (vendored)
@@ -39,7 +39,7 @@ if (ENABLE_CLDNN)
     else()
         set(CLDNN__ARCHITECTURE_TARGET "Linux64" CACHE STRING "" FORCE)
     endif()
-
+    set(CLDNN_THREADING "${THREADING}" CACHE STRING "" FORCE)
     add_subdirectory(clDNN)
 
 # disable CLDNN docs build
@@ -59,6 +59,14 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 # ======================================================================================================
 # ====================================== HELPER CONSTANT VARIABLES =====================================
 # ======================================================================================================
+# ======================================================================================================
+if("${CLDNN_THREADING}" MATCHES "SEQ")
+    add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_SEQ)
+elseif("${CLDNN_THREADING}" MATCHES "TBB")
+    add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_TBB)
+else()
+    add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_THREADPOOL)
+endif()
 
 # Path which points to main directory of project.
 set(CLDNN__MAIN_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
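For orientation, the CLDNN_THREADING value chosen here selects one of three compilation paths at compile time. A minimal sketch of the dispatch (the numeric defines mirror those added to kernels_cache.h later in this diff):

#define CLDNN_THREADING_SEQ 0
#define CLDNN_THREADING_TBB 1
#define CLDNN_THREADING_THREADPOOL 2

#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
// batches are built with tbb::parallel_for inside a dedicated tbb::task_arena
#elif (CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
// batches are enqueued into the custom thread_pool and awaited via futures
#else
// sequential fallback: batches are built one by one
#endif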
@@ -123,6 +123,7 @@
 #include <memory>
 #include <string>
 #include <type_traits>
+#include <thread>
 
 namespace cldnn {
@@ -10,6 +10,7 @@
 #include <stdexcept>
 #include <vector>
 #include <map>
+#include <algorithm>
 
 namespace cldnn {
@@ -61,6 +62,7 @@ struct engine_configuration {
                          ///< (switched off for older drivers then NEO).
     uint16_t n_streams;  ///< Number of queues executed in parallel
     const std::string kernels_cache_path;  ///< Path to compiled kernels cache
+    uint16_t n_threads;  ///< Number of threads
     const std::string tuning_cache_path;  ///< Path to tuning kernel cache
 
     /// @brief Constructs engine configuration with specified options.
@@ -83,6 +85,7 @@ struct engine_configuration {
         bool memory_pool = true,
         uint16_t n_streams = 1,
         const std::string& kernels_cache_path = "",
+        uint16_t n_threads = std::max(static_cast<uint16_t>(std::thread::hardware_concurrency()), static_cast<uint16_t>(1)),
         const std::string& tuning_cache_path = "cache.json")
         : enable_profiling(profiling)
         , meaningful_kernels_names(decorate_kernel_names)
@@ -97,6 +100,7 @@ struct engine_configuration {
         , enable_memory_pool(memory_pool)
         , n_streams(n_streams)
         , kernels_cache_path(kernels_cache_path)
+        , n_threads(n_threads)
         , tuning_cache_path(tuning_cache_path) {
         if (n_streams == 0) {
             throw std::invalid_argument("Invalid streams count set in engine config");
@@ -83,6 +83,7 @@ gpu_toolkit_config convert_configuration(const engine_configuration conf) {
     result.queues_num = conf.n_streams;
     result.kernels_cache_path = conf.kernels_cache_path;
     result.tuning_cache_path = conf.tuning_cache_path;
+    result.n_threads = conf.n_threads;
     return result;
 }
@@ -4,6 +4,7 @@
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 #include "configuration.h"
+#include <algorithm>
 
 namespace cldnn {
 namespace gpu {
@@ -22,6 +23,7 @@ configuration::configuration()
       throttle_mode(throttle_mode_types::disabled),
       queues_num(0),
       tuning_cache_path("cache.json"),
-      kernels_cache_path("") {}
+      kernels_cache_path(""),
+      n_threads(std::max(static_cast<uint16_t>(std::thread::hardware_concurrency()), static_cast<uint16_t>(1))) {}
 }  // namespace gpu
 }  // namespace cldnn
@@ -31,6 +31,7 @@ struct configuration {
     uint16_t queues_num;
     std::string tuning_cache_path;
    std::string kernels_cache_path;
+    uint16_t n_threads;
 };
 }  // namespace gpu
 }  // namespace cldnn
@@ -13,9 +13,17 @@
 #include <string>
 #include <memory>
 #include <utility>
 
 #include "kernel_selector_helper.h"
 #include "cldnn_itt.h"
+#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range.h>
+#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
+#include <thread>
+#include <future>
+#include <queue>
+#include <condition_variable>
+#endif
 
 #ifndef ENABLE_UNICODE_PATH_SUPPORT
 # ifdef _WIN32
@@ -36,8 +44,10 @@
 #include <Windows.h>
 #endif
 
+#if (CLDNN_THREADING != CLDNN_THREADING_SEQ)
+#define DEFAULT_NUM_THREADS 2
+#endif
 namespace {
 
 std::mutex cacheAccessMutex;
 
 #ifdef ENABLE_UNICODE_PATH_SUPPORT
@@ -84,7 +94,6 @@ static std::vector<unsigned char> loadBinaryFromFile(std::string path) {
 
     return {};
 }
-
 static void saveBinaryToFile(std::string path, const std::vector<unsigned char> buffer) {
     std::lock_guard<std::mutex> lock(cacheAccessMutex);
 #if defined(ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
|
||||
return 10;
|
||||
}
|
||||
|
||||
kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code& kernels_source_code) const {
|
||||
|
||||
void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::vector<kernels_cache::batch_program>* all_batches) const {
|
||||
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll::GetProgramSource");
|
||||
sorted_code scode;
|
||||
std::map<std::string, std::vector<batch_program>> program_buckets;
|
||||
|
||||
for (const auto& code : kernels_source_code) {
|
||||
std::string full_code = code.kernel_strings->jit + code.kernel_strings->str;
|
||||
@ -213,7 +223,7 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code&
|
||||
std::string key = options;
|
||||
|
||||
if (batch_compilation == false) {
|
||||
key += " __PROGRAM__" + std::to_string(scode.size());
|
||||
key += " __PROGRAM__" + std::to_string(program_buckets.size());
|
||||
}
|
||||
|
||||
if (dump_custom_program) {
|
||||
@@ -223,48 +233,63 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code&
         if (one_time_kernel) {
             key += " __ONE_TIME__";
         }
 
-        auto& current_bucket = scode[key];
-        current_bucket.dump_custom_program = dump_custom_program;
-        current_bucket.one_time = one_time_kernel;
-
-        if (current_bucket.source.empty()) {
-            current_bucket.options = options;
+        auto& current_bucket = program_buckets[key];
+        if (current_bucket.empty()) {  // new bucket
+            const auto& bucket_id = program_buckets.size() - 1;
+            current_bucket.push_back(batch_program());
+            current_bucket.back().bucket_id = static_cast<int32_t>(bucket_id);
+            current_bucket.back().batch_id = 0;
+            current_bucket.back().options = options;
         }
 
-        // Create new kernels bucket when the limit is reached
-        if ((current_bucket.kernels_counter % get_max_kernels_per_batch()) == 0) {
-            current_bucket.source.push_back({});
+        // Create new kernels batch when the limit is reached
+        if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()) {
+            const auto& batch_id = current_bucket.size();
+            current_bucket.push_back(batch_program());
+            current_bucket.back().bucket_id = static_cast<int32_t>(program_buckets.size());
+            current_bucket.back().batch_id = static_cast<int32_t>(batch_id);
+            current_bucket.back().options = options;
         }
 
-        current_bucket.entry_point_to_id[entry_point] = code.id;
+        auto& current_batch = current_bucket.back();
+        current_batch.dump_custom_program = dump_custom_program;
+        current_batch.one_time = one_time_kernel;
+        current_batch.entry_point_to_id[entry_point] = code.id;
 
         assert(org_source_code.size() == 1);
 
-        current_bucket.source.back().push_back(std::move(org_source_code.front()));
-
-        current_bucket.kernels_counter++;
+        current_batch.source.push_back(std::move(org_source_code.front()));
+        current_batch.kernels_counter++;
     }
 
-    // Compute hash value for each bucket
+    // Compute hash value for each batch
     // Hash calculation might require additional optimizations, but currently execution time of this part is much smaller than loading
     // of the precompiled binaries or get_undef_jit calls
     // Hash is computed for string that contains compilation options + driver version +
-    // full source code (jit + template + undef sections) of all kernels in the bucket
-    for (auto& c : scode) {
-        program_code& code = c.second;
+    // full source code (jit + template + undef sections) of all kernels in the batches
+    for (auto& c : program_buckets) {
         auto options = c.first;
-        for (size_t i = 0; i < code.source.size(); i++) {
+        auto& batches = c.second;
+        for (auto& b : batches) {
             std::string full_code = options + " " + _context.get_device_info().driver_version;
-            for (auto& ss : code.source[i])
+            for (auto& ss : b.source)
                 full_code += ss;
-            code.hash_values.push_back(std::hash<std::string>()(full_code));
+            b.hash_value = std::hash<std::string>()(full_code);
+            all_batches->push_back(b);
         }
     }
-
-    return scode;
 }
 
-kernels_cache::kernels_cache(gpu_toolkit& context, uint32_t prog_id) : _context(context), _prog_id(prog_id) {}
+kernels_cache::kernels_cache(gpu_toolkit& context, uint32_t prog_id) : _context(context), _prog_id(prog_id) {
+#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
+    int n_threads = _context.get_configuration().n_threads;
+    arena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena());
+    arena->initialize(n_threads);
+#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
+    int n_threads = _context.get_configuration().n_threads;
+    pool = std::unique_ptr<thread_pool>(new thread_pool(n_threads));
+#endif
+}
 
 kernels_cache::kernel_id kernels_cache::set_kernel_source(
     const std::shared_ptr<kernel_selector::kernel_string>& kernel_string,
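To summarize the restructuring above: kernels sharing one build-option string land in one bucket, each bucket is cut into batches of at most get_max_kernels_per_batch() kernels, and every batch is hashed and compiled independently, which is what makes the parallel build below possible. A standalone sketch of the grouping rule (hypothetical helper, simplified from the real code):

#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

// One bucket per distinct option string, each bucket cut into batches of
// at most max_per_batch kernel sources; the flattened batch list is what
// gets compiled, potentially in parallel.
std::vector<std::vector<std::string>> batch_kernels(
        const std::vector<std::pair<std::string, std::string>>& kernels,  // (options, source)
        std::size_t max_per_batch) {
    std::map<std::string, std::vector<std::vector<std::string>>> buckets;
    for (const auto& k : kernels) {
        auto& batches = buckets[k.first];
        if (batches.empty() || batches.back().size() >= max_per_batch)
            batches.emplace_back();  // open a new batch in this bucket
        batches.back().push_back(k.second);
    }
    std::vector<std::vector<std::string>> all_batches;  // flattened, like the all_batches out-parameter above
    for (auto& b : buckets)
        for (auto& batch : b.second)
            all_batches.push_back(std::move(batch));
    return all_batches;
}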
@@ -301,149 +326,160 @@ static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
     return program.getInfo<CL_PROGRAM_BINARIES>().front();
 }
 
-kernels_cache::kernels_map kernels_cache::build_program(const program_code& program_source) const {
+void kernels_cache::build_batch(const batch_program& batch) {
     OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram");
-    static uint32_t current_file_index = 0;
 
-    bool dump_sources = !_context.get_configuration().ocl_sources_dumps_dir.empty() || program_source.dump_custom_program;
+    bool dump_sources = !_context.get_configuration().ocl_sources_dumps_dir.empty() || batch.dump_custom_program;
 
-    std::string dump_file_name = "";
+    std::string err_log;  // accumulated build log from all program's parts (only contains messages from parts which
+                          // failed to compile)
 
+    std::string current_dump_file_name = "";
     if (dump_sources) {
-        dump_file_name = _context.get_configuration().ocl_sources_dumps_dir;
-        if (!dump_file_name.empty() && dump_file_name.back() != '/')
-            dump_file_name += '/';
+        current_dump_file_name = _context.get_configuration().ocl_sources_dumps_dir;
+        if (!current_dump_file_name.empty() && current_dump_file_name.back() != '/')
+            current_dump_file_name += '/';
 
-        dump_file_name += "clDNN_program_" + std::to_string(current_file_index++) + "_part_";
+        current_dump_file_name += "clDNN_program_" + std::to_string(batch.bucket_id) + "_part_" + std::to_string(batch.batch_id) + ".cl";
     }
 
+    std::ofstream dump_file;
+    if (dump_sources) {
+        dump_file.open(current_dump_file_name);
+        if (dump_file.good()) {
+            for (auto& s : batch.source)
+                dump_file << s;
+        }
+    }
+
+    std::string cached_bin_name = get_cache_path() + std::to_string(batch.hash_value) + ".cl_cache";
+    cl::Program::Binaries precompiled_kernels = {};
+
+    if (is_cache_enabled()) {
+        // Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket
+        // If read is successful, then remove kernels from compilation bucket
+        auto bin = loadBinaryFromFile(cached_bin_name);
+        if (!bin.empty()) {
+            precompiled_kernels.push_back(bin);
+        }
+    }
     try {
-        kernels_map kmap;
-        std::string err_log;  // accumulated build log from all program's parts (only contains messages from parts which
-                              // failed to compile)
+        cl::vector<cl::Kernel> kernels;
 
-        uint32_t part_idx = 0;
-        for (size_t i = 0; i < program_source.source.size(); i++) {
-            auto sources_bucket_to_compile = program_source.source[i];
-            const auto& hash_value = program_source.hash_values[i];
-            std::string cached_bin_name = get_cache_path() + std::to_string(hash_value) + ".cl_cache";
-            cl::Program::Binaries precompiled_kernels = {};
-            if (is_cache_enabled()) {
-                // Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket
-                // If read is successful, then remove kernels from compilation bucket
-                auto bin = loadBinaryFromFile(cached_bin_name);
-                if (!bin.empty()) {
-                    precompiled_kernels.push_back(bin);
-                }
-            }
-            auto current_dump_file_name = dump_file_name + std::to_string(part_idx++) + ".cl";
-            std::ofstream dump_file;
-
-            if (dump_sources) {
-                dump_file.open(current_dump_file_name);
-
-                if (dump_file.good()) {
-                    for (auto& s : sources_bucket_to_compile)
-                        dump_file << s;
-                }
-            }
-
-            try {
-                cl::vector<cl::Kernel> kernels;
-                // Run compilation
-                if (precompiled_kernels.empty()) {
-                    cl::Program program(_context.context(), sources_bucket_to_compile);
-                    {
-                        OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram::RunCompilation");
-                        program.build(_context.device(), program_source.options.c_str());
-                    }
-
-                    if (dump_sources && dump_file.good()) {
-                        dump_file << "\n/* Build Log:\n";
-                        for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
-                            dump_file << p.second << "\n";
-
-                        dump_file << "*/\n";
-                    }
-
-                    program.createKernels(&kernels);
-                    if (is_cache_enabled()) {
-                        // If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache
-                        // Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited
-                        // Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer
-                        // compile time.
-                        saveBinaryToFile(cached_bin_name, getProgramBinaries(program));
-                    }
-                } else {
-                    cl::Program program(_context.context(), {_context.device()}, precompiled_kernels);
-                    program.build(_context.device(), program_source.options.c_str());
-                    program.createKernels(&kernels);
-                }
-
-                for (auto& k : kernels) {
-                    auto kernel_name = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
-                    kmap.emplace(kernel_name, kernels_cache::kernel_type(k, _context.get_device_info().supports_usm));
-                }
-            } catch (const cl::BuildError& err) {
-                if (dump_sources && dump_file.good())
-                    dump_file << "\n/* Build Log:\n";
-
-                for (auto& p : err.getBuildLog()) {
-                    if (dump_sources && dump_file.good())
-                        dump_file << p.second << "\n";
-
-                    err_log += p.second + '\n';
-                }
-
-                if (dump_sources && dump_file.good())
-                    dump_file << "*/\n";
-            }
-        }
+        // Run compilation
+        if (precompiled_kernels.empty()) {
+            cl::Program program(_context.context(), batch.source);
+            {
+                OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram::RunCompilation");
+                program.build(_context.device(), batch.options.c_str());
+            }
 
+            if (dump_sources && dump_file.good()) {
+                dump_file << "\n/* Build Log:\n";
+                for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
+                    dump_file << p.second << "\n";
+
+                dump_file << "*/\n";
+            }
+
+            program.createKernels(&kernels);
+            if (is_cache_enabled()) {
+                // If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache
+                // Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited
+                // Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer
+                // compile time.
+                saveBinaryToFile(cached_bin_name, getProgramBinaries(program));
+            }
+        } else {
+            cl::Program program(_context.context(), {_context.device()}, precompiled_kernels);
+            program.build(_context.device(), batch.options.c_str());
+            program.createKernels(&kernels);
+        }
+        {
+            std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
+            for (auto& k : kernels) {
+                const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
+                const auto& k_id = batch.entry_point_to_id.find(entry_point);
+                const auto& k_type = kernel_type(k, _context.get_device_info().supports_usm);
+                if (k_id != batch.entry_point_to_id.end()) {
+                    const auto& kmap = std::make_pair(k_id->second, k_type);
+                    if (batch.one_time) {
+                        _one_time_kernels.insert(kmap);
+                    } else {
+                        _kernels.insert(kmap);
+                    }
+                } else {
+                    throw std::runtime_error("Could not find entry point");
+                }
+            }
+        }
+    } catch (const cl::BuildError& err) {
+        if (dump_sources && dump_file.good())
+            dump_file << "\n/* Build Log:\n";
 
-        if (!err_log.empty()) {
-            throw std::runtime_error("Program build failed. You may enable OCL source dump to see the error log.\n");
+        for (auto& p : err.getBuildLog()) {
+            if (dump_sources && dump_file.good())
+                dump_file << p.second << "\n";
+            err_log += p.second + '\n';
         }
 
-        return kmap;
-    } catch (const cl::Error& err) {
-        throw ocl_error(err);
+        if (dump_sources && dump_file.good())
+            dump_file << "*/\n";
+    }
+    if (!err_log.empty()) {
+        throw std::runtime_error("Program build failed. You may enable OCL source dump to see the error log.\n");
     }
 }
 
-kernels_cache::kernel_type kernels_cache::get_kernel(kernel_id id, bool one_time_kernel) {
-    build_all();
-    if (one_time_kernel) {
-        return _one_time_kernels.at(id);
-    } else {
-        return _kernels.at(id);
-    }
+kernels_cache::kernel_type kernels_cache::get_kernel(kernel_id id, bool one_time_kernel) const {
+    if (_pending_compilation)
+        throw std::runtime_error("Kernel cache is not compiled, call build_all() first!");
+
+    const auto& kernels = one_time_kernel ? _one_time_kernels : _kernels;
+    auto res = kernels.find(id);
+    if (kernels.end() == res)
+        throw std::runtime_error("Kernel " + id + " not found in the kernel cache!");
+    return res->second;
 }
 
 void kernels_cache::build_all() {
     OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll");
     if (!_pending_compilation)
         return;
 
-    std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
-
-    auto sorted_program_code = get_program_source(_kernels_code);
-
-    _one_time_kernels.clear();
-    for (auto& program : sorted_program_code) {
-        auto kernels = build_program(program.second);
-
-        for (auto& k : kernels) {
-            const auto& entry_point = k.first;
-            const auto& k_id = program.second.entry_point_to_id[entry_point];
-            if (program.second.one_time) {
-                _one_time_kernels[k_id] = k.second;
-            } else {
-                _kernels[k_id] = k.second;
-            }
-        }
+    std::vector<batch_program> batches;
+    {
+        std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
+        get_program_source(_kernels_code, &batches);
+        _one_time_kernels.clear();
     }
 
-    _kernels_code.clear();
-    _pending_compilation = false;
+#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
+    arena->execute([this, &batches] {
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, batches.size()), [this, &batches](const tbb::blocked_range<size_t>& r) {
+            for (auto i = r.begin(); i != r.end(); ++i) {
+                build_batch(batches[i]);
+            }
+        });
+    });
+#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
+    std::vector<std::future<void>> builds;
+    for (size_t i = 0; i < batches.size(); ++i) {
+        builds.push_back(pool->enqueue([this, &batches, i] () {
+            build_batch(batches[i]);
+        }));
+    }
+    std::for_each(builds.begin(), builds.end(), [] (std::future<void>& f) { f.wait(); });
+#else
+    // no parallel build
+    for (const auto& batch : batches) {
+        build_batch(batch);
+    }
+#endif
+
+    {
+        std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
+        _kernels_code.clear();
+        _pending_compilation = false;
+    }
 }
 
 void kernels_cache::reset() {
@@ -452,6 +488,5 @@ void kernels_cache::reset() {
     _kernels_code.clear();
     _pending_compilation = false;
 }
-
 }  // namespace gpu
 }  // namespace cldnn
@@ -13,6 +13,19 @@
 #include <unordered_set>
 #include <kernel_selector_common.h>
 
+#define CLDNN_THREADING_SEQ 0
+#define CLDNN_THREADING_TBB 1
+#define CLDNN_THREADING_THREADPOOL 2
+
+#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
+#include <tbb/task_arena.h>
+#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
+#include <queue>
+#include <future>
+#include <functional>
+#include <condition_variable>
+#endif
+
 namespace cl {
 class Kernel;
 class KernelIntel;
@@ -26,14 +39,76 @@ namespace cldnn {
 namespace gpu {
 
 class gpu_toolkit;
+#if (CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
+class thread_pool {
+public:
+    thread_pool(size_t num_threads) : _stop_pool(false) {
+        _workers.reserve(num_threads);
+        for (size_t i = 0; i < num_threads; ++i) {
+            _workers.emplace_back(std::thread(&thread_pool::worker_thread, this));
+        }
+    }
+
+    ~thread_pool() {
+        {
+            std::lock_guard<std::mutex> lock(_q_m);
+            _stop_pool = true;
+        }
+        this->wait_all();
+    }
+
+    template <class F, class... Args>
+    std::future<typename std::result_of<F(Args...)>::type> enqueue(F&& f, Args&&... args) {
+        if (_stop_pool) {
+            throw std::runtime_error("Thread pool is stopped");
+        }
+
+        using return_type = typename std::result_of<F(Args...)>::type;
+        auto task = std::make_shared<std::packaged_task<return_type()>> (std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+        std::future<return_type> result = task->get_future();
+        {
+            std::lock_guard<std::mutex> lock(_q_m);
+            _tasks.push([task]() {(*task)();});
+        }
+        _cv.notify_one();
+        return result;
+    }
+
+    void wait_all() {
+        _cv.notify_all();
+        for (auto& w : _workers) {
+            w.join();
+        }
+    }
+
+private:
+    std::vector<std::thread> _workers;
+    std::queue<std::function<void()>> _tasks;
+    std::condition_variable _cv;
+    std::mutex _q_m;
+    bool _stop_pool;
+
+    void worker_thread() {
+        while (true) {
+            std::unique_lock<std::mutex> lock(this->_q_m);
+            _cv.wait(lock, [this]() { return (!this->_tasks.empty()) || (_stop_pool); });
+            if ((_stop_pool) && (this->_tasks.empty())) return;
+            auto task = std::move(_tasks.front());
+            this->_tasks.pop();
+            lock.unlock();
+            task();
+        }
+    }
+};
+#endif
 class kernels_cache {
 public:
     using source_code = std::vector<std::string>;
 
-    struct program_code {
-        std::vector<source_code> source;
-        std::vector<size_t> hash_values;
+    struct batch_program {
+        int32_t bucket_id = 0;
+        int32_t batch_id = 0;
+        source_code source;
+        size_t hash_value;
+        uint32_t kernels_counter = 0;
         std::string options;
         bool dump_custom_program = false;
@@ -69,7 +144,6 @@ public:
 
     typedef std::string kernel_id;
     typedef cl::KernelIntel kernel_type;
-    using sorted_code = std::map<std::string, program_code>;
     using kernels_map = std::map<std::string, kernel_type>;
     using kernels_code = std::unordered_set<kernel_code, hash_kernel_code>;
@@ -77,13 +151,19 @@ private:
     gpu_toolkit& _context;
     kernels_code _kernels_code;
     std::atomic<bool> _pending_compilation{false};
-    std::map<std::string, kernel_type> _kernels;
-    std::map<std::string, kernel_type> _one_time_kernels;  // These kernels are intended to be executed only once (can
+    std::map<const std::string, const kernel_type> _kernels;
+    std::map<const std::string, const kernel_type> _one_time_kernels;  // These kernels are intended to be executed only once (can
                                                            // be removed later from the cache).
     uint32_t _prog_id;
+#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
+    std::unique_ptr<tbb::task_arena> arena;
+#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
+    std::unique_ptr<thread_pool> pool;
+#endif
 
-    sorted_code get_program_source(const kernels_code& kernels_source_code) const;
-    kernels_map build_program(const program_code& pcode) const;
+    void get_program_source(const kernels_code& kernels_source_code, std::vector<batch_program>*) const;
+    void build_batch(const batch_program& batch);
 
     std::string get_cache_path() const;
     bool is_cache_enabled() const;
@@ -94,7 +174,7 @@ public:
     kernel_id set_kernel_source(const std::shared_ptr<kernel_selector::kernel_string>& kernel_string,
                                 bool dump_custom_program,
                                 bool one_time_kernel);
-    kernel_type get_kernel(kernel_id id, bool one_time_kernel);
+    kernel_type get_kernel(kernel_id id, bool one_time_kernel) const;
     gpu_toolkit& get_context() { return _context; }
     // forces compilation of all pending kernels/programs
     void build_all();
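A quick usage sketch of the thread_pool added above (standalone apart from the class itself, which is assumed to be in scope):

#include <future>
#include <iostream>
#include <vector>

int main() {
    thread_pool pool(4);                      // four worker threads
    std::vector<std::future<int>> results;
    for (int i = 0; i < 8; ++i)
        results.push_back(pool.enqueue([i] { return i * i; }));
    for (auto& f : results)
        std::cout << f.get() << ' ';          // blocks until each task finishes
    return 0;
}

This mirrors how build_all() uses the pool: one enqueue per kernel batch, then a wait on every returned future before publishing the compiled kernels.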
@@ -3,37 +3,37 @@
 <models>
     <!--Models with FP32 precision-->
     <model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="740214" vmpeak="805110" vmrss="129308" vmhwm="129308" />
-    <model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="922147" vmpeak="922147" vmrss="587522" vmhwm="587522" />
+    <model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="2709506" vmpeak="2794703" vmrss="1342104" vmhwm="1342104" />
     <model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="1007890" vmpeak="1007890" vmrss="138652" vmhwm="138652" />
-    <model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="1006439" vmpeak="1091636" vmrss="587241" vmhwm="587241" />
+    <model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="2709501" vmpeak="2794698" vmrss="1291404" vmhwm="1291404" />
     <model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="691589" vmpeak="922864" vmrss="31054" vmhwm="31054" />
-    <model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="626194" vmpeak="626194" vmrss="290695" vmhwm="290695" />
+    <model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="626194" vmpeak="626194" vmrss="403228" vmhwm="403228" />
     <model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="958240" vmpeak="1043437" vmrss="31366" vmhwm="31366" />
-    <model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="708734" vmpeak="793930" vmrss="287877" vmhwm="287877" />
+    <model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="854417" vmpeak="939614" vmrss="402339" vmhwm="402339" />
     <model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="1046988" vmpeak="1179042" vmrss="307990" vmhwm="439457" />
-    <model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="1267775" vmpeak="1279647" vmrss="932672" vmhwm="944626" />
+    <model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="2969241" vmpeak="2969241" vmrss="1506492" vmhwm="1506492" />
     <model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="1321819" vmpeak="1321819" vmrss="374207" vmhwm="439748" />
-    <model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="1356565" vmpeak="1441762" vmrss="941418" vmhwm="947060" />
+    <model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="2605324" vmpeak="26900521" vmrss="1549958" vmhwm="1549958" />
     <model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="2133814" vmpeak="2836412" vmrss="1438049" vmhwm="2140533" />
     <model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="2801422" vmpeak="3915366" vmrss="2465065" vmhwm="3578811" />
     <model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="2401380" vmpeak="2836412" vmrss="1469832" vmhwm="2140377" />
     <model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="2892432" vmpeak="3939166" vmrss="2472017" vmhwm="3602924" />
     <!--Models with FP16 precision-->
     <model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="1057487" vmpeak="1085224" vmrss="109694" vmhwm="137295" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
-    <model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="760942" vmpeak="760942" vmrss="418298" vmhwm="418298" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
+    <model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2667537" vmpeak="2752734" vmrss="1304919" vmhwm="1304919" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
     <model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="1058844" vmpeak="1085224" vmrss="123016" vmhwm="136682" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
-    <model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="845348" vmpeak="930545" vmrss="417445" vmhwm="417445" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
+    <model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="2347389" vmpeak="2432586" vmrss="1290504" vmhwm="1290504" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
     <model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="955427" vmpeak="955806" vmrss="27700" vmhwm="27700" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
-    <model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="680862" vmpeak="680862" vmrss="331858" vmhwm="331858" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
+    <model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2123113" vmpeak="2208310" vmrss="453814" vmhwm="453814" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
     <model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="955827" vmpeak="955827" vmrss="27222" vmhwm="27222" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
-    <model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="766053" vmpeak="851250" vmrss="331458" vmhwm="331458" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
+    <model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="1760990" vmpeak="1760990" vmrss="454173" vmhwm="454173" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
     <model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="1372961" vmpeak="1505639" vmrss="369969" vmhwm="501649" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
-    <model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="866543" vmpeak="866543" vmrss="523967" vmhwm="523967" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
+    <model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2746588" vmpeak="2831784" vmrss="1296328" vmhwm="1296328" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
     <model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="1381265" vmpeak="1505472" vmrss="437039" vmhwm="500630" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
-    <model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="951584" vmpeak="1036781" vmrss="528060" vmhwm="528060" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
+    <model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="2380580" vmpeak="2465777" vmrss="1326369" vmhwm="1326369" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
     <model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="2748220" vmpeak="3450818" vmrss="1783704" vmhwm="2486161" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
-    <model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="1397463" vmpeak="1994402" vmrss="1049625" vmhwm="1629414" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
+    <model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2181312" vmpeak="2582752" vmrss="1060712" vmhwm="1629414" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
     <model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="2749458" vmpeak="3450818" vmrss="1816765" vmhwm="2486525" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
-    <model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="1482655" vmpeak="1998812" vmrss="1049692" vmhwm="1630782" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
+    <model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="2910814" vmpeak="3347489" vmrss="1371380" vmhwm="1717102" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
 </models>
 </attributes>
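Note on the memcheck reference update: the GPU values grow substantially relative to the previous baselines, presumably because compiling several kernel batches concurrently raises peak host memory during model loading; per the inline comments, the FP16 rows were re-derived from commit af63cb78ee5cbd66bac0d0980db61cb11b5d9995 with a 1.3x margin.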