[GPU] Add debug config for disabled async compilation (#18535)
This commit is contained in:
parent
cba84fd763
commit
38913f2184
@ -117,6 +117,7 @@ public:
|
||||
int serialize_compile; // Serialize creating primitives and compiling kernels
|
||||
std::vector<std::string> forced_impl_types; // Force implementation type either ocl or onednn
|
||||
int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels
|
||||
int disable_async_compilation; // Disable async compilation
|
||||
std::set<int64_t> dump_iteration; // Dump n-th execution of network.
|
||||
static const debug_configuration *get_instance();
|
||||
bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
|
||||
|
@ -100,7 +100,13 @@ struct primitive_type_base : primitive_type {
|
||||
|
||||
cldnn::layout calc_output_layout(const cldnn::program_node& node, const kernel_impl_params& impl_param) const override {
|
||||
OPENVINO_ASSERT(node.type() == this, "[GPU] primitive_type_base::calc_output_layout: primitive type mismatch");
|
||||
return typed_primitive_inst<PType>::calc_output_layout(node, impl_param);
|
||||
for (auto& t : impl_param.input_layouts) {
|
||||
GPU_DEBUG_TRACE_DETAIL << impl_param.desc->id << " input tensor: " << t.to_short_string() << std::endl;
|
||||
}
|
||||
auto res = typed_primitive_inst<PType>::calc_output_layout(node, impl_param);
|
||||
|
||||
GPU_DEBUG_TRACE_DETAIL << impl_param.desc->id << " output tensor: " << res.to_short_string() << std::endl;
|
||||
return res;
|
||||
}
|
||||
|
||||
std::vector<cldnn::layout> calc_output_layouts(const cldnn::program_node& node, const kernel_impl_params& impl_param) const override {
|
||||
|
@ -541,26 +541,35 @@ bool primitive_inst::update_impl() {
|
||||
}
|
||||
if (!cached_impl) {
|
||||
if (_dynamic_impl) {
|
||||
auto& compilation_context = get_network().get_program()->get_compilation_context();
|
||||
compilation_context.push_task(updated_params_no_dyn_pad.hash(), [this, &compilation_context, updated_params_no_dyn_pad]() {
|
||||
if (compilation_context.is_stopped())
|
||||
return;
|
||||
auto _program = get_network().get_program();
|
||||
auto& cache = _program->get_implementations_cache();
|
||||
{
|
||||
// Check existence in the cache one more time as several iterations of model execution could happen and multiple compilation
|
||||
// tasks created for same shapes
|
||||
if (cache.has(updated_params_no_dyn_pad))
|
||||
auto use_async_compilation = [&]() {
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_IF(debug_config->disable_async_compilation) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
if (use_async_compilation()) {
|
||||
auto& compilation_context = get_network().get_program()->get_compilation_context();
|
||||
compilation_context.push_task(updated_params_no_dyn_pad.hash(), [this, &compilation_context, updated_params_no_dyn_pad]() {
|
||||
if (compilation_context.is_stopped())
|
||||
return;
|
||||
}
|
||||
auto _program = get_network().get_program();
|
||||
auto& cache = _program->get_implementations_cache();
|
||||
{
|
||||
// Check existence in the cache one more time as several iterations of model execution could happen and multiple compilation
|
||||
// tasks created for same shapes
|
||||
if (cache.has(updated_params_no_dyn_pad))
|
||||
return;
|
||||
}
|
||||
|
||||
auto impl = _node->type()->choose_impl(*_node, updated_params_no_dyn_pad);
|
||||
if (!can_be_optimized()) {
|
||||
auto kernels = _program->get_kernels_cache().compile(updated_params_no_dyn_pad, impl->get_kernels_source());
|
||||
impl->set_kernels(kernels);
|
||||
cache.add(updated_params_no_dyn_pad, impl->clone());
|
||||
}
|
||||
});
|
||||
auto impl = _node->type()->choose_impl(*_node, updated_params_no_dyn_pad);
|
||||
if (!can_be_optimized()) {
|
||||
auto kernels = _program->get_kernels_cache().compile(updated_params_no_dyn_pad, impl->get_kernels_source());
|
||||
impl->set_kernels(kernels);
|
||||
cache.add(updated_params_no_dyn_pad, impl->clone());
|
||||
}
|
||||
});
|
||||
}
|
||||
if (!can_be_optimized()) {
|
||||
_impl = _dynamic_impl->clone();
|
||||
auto new_impl_params = _impl->canonicalize_shapes(*_impl_params);
|
||||
|
@ -131,6 +131,7 @@ static void print_help_messages() {
|
||||
" For example fc:onednn gemm:onednn reduce:ocl do:cpu"
|
||||
" For primitives fc, gemm, do, reduce, concat are supported. Separated by space.");
|
||||
message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");
|
||||
message_list.emplace_back("OV_GPU_DisableAsyncCompilation", "Disable async compilation");
|
||||
message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space.");
|
||||
message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. Expects 4 values separated by space in"
|
||||
"the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), "
|
||||
@ -171,7 +172,8 @@ debug_configuration::debug_configuration()
|
||||
, dump_layers_raw(0)
|
||||
, base_batch_for_memory_estimation(-1)
|
||||
, serialize_compile(0)
|
||||
, max_kernels_per_batch(0) {
|
||||
, max_kernels_per_batch(0)
|
||||
, disable_async_compilation(0) {
|
||||
#ifdef GPU_DEBUG_CONFIG
|
||||
get_gpu_debug_env_var("Help", help);
|
||||
get_common_debug_env_var("Verbose", verbose);
|
||||
@ -199,6 +201,7 @@ debug_configuration::debug_configuration()
|
||||
std::string forced_impl_types_str;
|
||||
get_gpu_debug_env_var("ForceImplTypes", forced_impl_types_str);
|
||||
get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch);
|
||||
get_gpu_debug_env_var("DisableAsyncCompilation", disable_async_compilation);
|
||||
std::string dump_iteration_str;
|
||||
get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
|
||||
std::string mem_preallocation_params_str;
|
||||
|
Loading…
Reference in New Issue
Block a user