[GPU] Add debug config for disabled async compilation (#18535)

Andrew Kwangwoong Park 2023-07-14 15:42:03 +09:00 committed by GitHub
parent cba84fd763
commit 38913f2184
4 changed files with 39 additions and 20 deletions


@@ -117,6 +117,7 @@ public:
     int serialize_compile;                      // Serialize creating primitives and compiling kernels
     std::vector<std::string> forced_impl_types; // Force implementation type either ocl or onednn
     int max_kernels_per_batch;                  // Maximum number of kernels in a batch during compiling kernels
+    int disable_async_compilation;              // Disable async compilation
     std::set<int64_t> dump_iteration;           // Dump n-th execution of network.
     static const debug_configuration *get_instance();
     bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;


@@ -100,7 +100,13 @@ struct primitive_type_base : primitive_type {
     cldnn::layout calc_output_layout(const cldnn::program_node& node, const kernel_impl_params& impl_param) const override {
         OPENVINO_ASSERT(node.type() == this, "[GPU] primitive_type_base::calc_output_layout: primitive type mismatch");
-        return typed_primitive_inst<PType>::calc_output_layout(node, impl_param);
+        for (auto& t : impl_param.input_layouts) {
+            GPU_DEBUG_TRACE_DETAIL << impl_param.desc->id << " input tensor: " << t.to_short_string() << std::endl;
+        }
+        auto res = typed_primitive_inst<PType>::calc_output_layout(node, impl_param);
+        GPU_DEBUG_TRACE_DETAIL << impl_param.desc->id << " output tensor: " << res.to_short_string() << std::endl;
+        return res;
     }

     std::vector<cldnn::layout> calc_output_layouts(const cldnn::program_node& node, const kernel_impl_params& impl_param) const override {
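The GPU_DEBUG_TRACE_DETAIL statements added above are the plugin's verbosity-gated trace mechanism: they produce output only when detailed debug logging is enabled. As a rough, self-contained sketch of that kind of gated trace stream (not the plugin's actual macro; debug_config_sketch, MY_VERBOSE, and TRACE_DETAIL are illustrative names):

// Minimal sketch of a verbosity-gated trace stream; all names here are
// hypothetical and do not reproduce the OpenVINO GPU plugin's implementation.
#include <cstdlib>
#include <iostream>

struct debug_config_sketch {
    int verbose = 0;
    debug_config_sketch() {
        if (const char* v = std::getenv("MY_VERBOSE"))  // hypothetical env var
            verbose = std::atoi(v);
    }
    static const debug_config_sketch* get_instance() {
        static debug_config_sketch cfg;  // environment is read once
        return &cfg;
    }
};

// Stream operands are evaluated only when the verbosity threshold is met,
// so the trace costs almost nothing when logging is off.
#define TRACE_DETAIL \
    if (debug_config_sketch::get_instance()->verbose < 4) { } else std::cout

int main() {
    TRACE_DETAIL << "input tensor: " << 42 << std::endl;  // prints only with MY_VERBOSE >= 4
    return 0;
}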


@@ -541,26 +541,35 @@ bool primitive_inst::update_impl() {
         }
         if (!cached_impl) {
             if (_dynamic_impl) {
-                auto& compilation_context = get_network().get_program()->get_compilation_context();
-                compilation_context.push_task(updated_params_no_dyn_pad.hash(), [this, &compilation_context, updated_params_no_dyn_pad]() {
-                    if (compilation_context.is_stopped())
-                        return;
-                    auto _program = get_network().get_program();
-                    auto& cache = _program->get_implementations_cache();
-                    {
-                        // Check existense in the cache one more time as several iterations of model execution could happens and multiple compilation
-                        // tasks created for same shapes
-                        if (cache.has(updated_params_no_dyn_pad))
-                            return;
-                    }
-                    auto impl = _node->type()->choose_impl(*_node, updated_params_no_dyn_pad);
-                    if (!can_be_optimized()) {
-                        auto kernels = _program->get_kernels_cache().compile(updated_params_no_dyn_pad, impl->get_kernels_source());
-                        impl->set_kernels(kernels);
-                        cache.add(updated_params_no_dyn_pad, impl->clone());
-                    }
-                });
+                auto use_async_compilation = [&]() {
+                    GPU_DEBUG_GET_INSTANCE(debug_config);
+                    GPU_DEBUG_IF(debug_config->disable_async_compilation) {
+                        return false;
+                    }
+                    return true;
+                };
+                if (use_async_compilation()) {
+                    auto& compilation_context = get_network().get_program()->get_compilation_context();
+                    compilation_context.push_task(updated_params_no_dyn_pad.hash(), [this, &compilation_context, updated_params_no_dyn_pad]() {
+                        if (compilation_context.is_stopped())
+                            return;
+                        auto _program = get_network().get_program();
+                        auto& cache = _program->get_implementations_cache();
+                        {
+                            // Check existense in the cache one more time as several iterations of model execution could happens and multiple compilation
+                            // tasks created for same shapes
+                            if (cache.has(updated_params_no_dyn_pad))
+                                return;
+                        }
+                        auto impl = _node->type()->choose_impl(*_node, updated_params_no_dyn_pad);
+                        if (!can_be_optimized()) {
+                            auto kernels = _program->get_kernels_cache().compile(updated_params_no_dyn_pad, impl->get_kernels_source());
+                            impl->set_kernels(kernels);
+                            cache.add(updated_params_no_dyn_pad, impl->clone());
+                        }
+                    });
+                }
                 if (!can_be_optimized()) {
                     _impl = _dynamic_impl->clone();
                     auto new_impl_params = _impl->canonicalize_shapes(*_impl_params);
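With this change, update_impl() asks a small use_async_compilation() helper whether to queue the kernel compilation task on the compilation context; when the new disable_async_compilation debug flag is set, the task is simply not queued and execution continues with the shape-agnostic dynamic implementation, presumably to make runs more deterministic while debugging. A standalone sketch of the same gating idea, using a hypothetical MY_DISABLE_ASYNC variable and std::async in place of the plugin's compilation context:

// Sketch of gating background work on a debug flag; MY_DISABLE_ASYNC and
// compile_kernels() are illustrative names, not OpenVINO APIs.
#include <cstdlib>
#include <future>
#include <iostream>

static bool async_enabled() {
    // Any non-zero value of the (hypothetical) variable means "disable async".
    const char* v = std::getenv("MY_DISABLE_ASYNC");
    return !(v && std::atoi(v) != 0);
}

static int compile_kernels() {
    // Stand-in for an expensive kernel compilation step.
    return 42;
}

int main() {
    std::future<int> pending;
    if (async_enabled()) {
        // Normal path: kick compilation off in the background and keep going.
        pending = std::async(std::launch::async, compile_kernels);
        std::cout << "compiling in the background\n";
    } else {
        // Debug path: in this sketch the work is done inline, so behaviour is
        // deterministic and easy to step through.
        std::cout << "async disabled, compiled inline: " << compile_kernels() << "\n";
    }
    if (pending.valid())
        std::cout << "async result: " << pending.get() << "\n";
    return 0;
}

Note that the plugin itself does not compile inline on the disabled path; as the hunk above shows, it only skips pushing the task and keeps using the dynamic implementation.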


@@ -131,6 +131,7 @@ static void print_help_messages() {
                                   " For example fc:onednn gemm:onednn reduce:ocl do:cpu"
                                   " For primitives fc, gemm, do, reduce, concat are supported. Separated by space.");
     message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");
+    message_list.emplace_back("OV_GPU_DisableAsyncCompilation", "Disable async compilation");
     message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space.");
     message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. Expects 4 values separated by space in"
                               "the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), "
@@ -171,7 +172,8 @@ debug_configuration::debug_configuration()
     , dump_layers_raw(0)
     , base_batch_for_memory_estimation(-1)
     , serialize_compile(0)
-    , max_kernels_per_batch(0) {
+    , max_kernels_per_batch(0)
+    , disable_async_compilation(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_gpu_debug_env_var("Help", help);
     get_common_debug_env_var("Verbose", verbose);
@@ -199,6 +201,7 @@ debug_configuration::debug_configuration()
     std::string forced_impl_types_str;
     get_gpu_debug_env_var("ForceImplTypes", forced_impl_types_str);
     get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch);
+    get_gpu_debug_env_var("DisableAsyncCompilation", disable_async_compilation);
     std::string dump_iteration_str;
     get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
     std::string mem_preallocation_params_str;
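Like the other options in this constructor, the new knob only takes effect in builds where GPU_DEBUG_CONFIG is defined, and it is driven by the OV_GPU_DisableAsyncCompilation environment variable announced in the help text above. Exporting the variable in the shell before launching the application is the usual way to use it; the snippet below is a hedged sketch of setting it programmatically from a standard OpenVINO C++ application (POSIX setenv; the model path is a placeholder):

// Sketch: set the debug variable before the GPU plugin reads its configuration.
// Assumes a debug-enabled (GPU_DEBUG_CONFIG) build of the plugin; the OpenVINO
// calls below are the standard ov::Core API.
#include <cstdlib>
#include <openvino/openvino.hpp>

int main() {
    // Must happen before the plugin is first used: the debug configuration is a
    // singleton populated from the environment in its constructor.
    setenv("OV_GPU_DisableAsyncCompilation", "1", /*overwrite=*/1);  // POSIX only

    ov::Core core;
    auto model = core.read_model("model.xml");        // placeholder path
    auto compiled = core.compile_model(model, "GPU");
    auto request = compiled.create_infer_request();
    request.infer();  // with the flag set, async compilation tasks are not queued
    return 0;
}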