diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
index 4b845b0637b..280cbe4556f 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@@ -78,26 +78,26 @@ private:
 public:
     static const char *prefix;
-    int help; // Print help messages
-    int verbose; // Verbose execution
-    int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitimive
-    int disable_usm; // Disable usm usage
-    int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
-    int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
-    std::string dump_profiling_data; // Enables dump of extended performance profiling to specified dir
-    std::string dump_graphs; // Dump optimized graph
-    std::string dump_sources; // Dump opencl sources
-    std::string dump_layers_path; // Enable dumping intermediate buffers and set the dest path
-    std::vector<std::string> dump_layers; // Dump intermediate buffers of specified layers only
-    std::string dry_run_path; // Dry run and serialize execution graph into the specified path
-    int dump_layers_dst_only; // Dump only output of layers
-    int dump_layers_result; // Dump result layers
-    int dump_layers_limit_batch; // Limit the size of batch to dump
-    int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
-    std::vector<std::string> after_proc; // Start inference after the listed processes
-    int serialize_compile; // Serialize creating primitives and compiling kernels
-    std::string forced_impl_type; // Force implementation type either ocl or onednn
-    int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels
+    int help; // Print help messages
+    int verbose; // Verbose execution
+    int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitive
+    int disable_usm; // Disable usm usage
+    int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
+    int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
+    std::string dump_profiling_data; // Enables dump of extended performance profiling to specified dir
+    std::string dump_graphs; // Dump optimized graph
+    std::string dump_sources; // Dump opencl sources
+    std::string dump_layers_path; // Enable dumping intermediate buffers and set the dest path
+    std::vector<std::string> dump_layers; // Dump intermediate buffers of specified layers only
+    std::string dry_run_path; // Dry run and serialize execution graph into the specified path
+    int dump_layers_dst_only; // Dump only output of layers
+    int dump_layers_result; // Dump result layers
+    int dump_layers_limit_batch; // Limit the size of batch to dump
+    int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
+    std::vector<std::string> after_proc; // Start inference after the listed processes
+    int serialize_compile; // Serialize creating primitives and compiling kernels
+    std::vector<std::string> forced_impl_types; // Force implementation type either ocl or onednn
+    int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels
     static const debug_configuration *get_instance();
     bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
 };
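Note: the header change above replaces the single forced_impl_type string with a forced_impl_types list, so more than one primitive type (or individual layer) can be forced per run through OV_GPU_ForceImplTypes. As a rough illustration of how such a list can be consulted, here is a small standalone sketch; the helper name, the local impl_types enum, and the "prefix:impl" token format are assumptions for this example only (the patch's real lookup lives in layout_optimizer::get_forced_impl_type_by_config further down):

    #include <string>
    #include <vector>

    enum class impl_types { any, ocl, onednn, cpu };  // simplified stand-in for the plugin enum

    // Hypothetical helper: return the forced impl for a token prefix such as "fc" or "gemm".
    impl_types lookup_forced_impl(const std::vector<std::string>& forced_impl_types,
                                  const std::string& prefix) {
        for (const auto& entry : forced_impl_types) {          // entries look like "fc:onednn"
            if (entry.compare(0, prefix.size() + 1, prefix + ":") != 0)
                continue;                                      // prefix does not match this entry
            const std::string impl = entry.substr(prefix.size() + 1);
            if (impl == "ocl")    return impl_types::ocl;
            if (impl == "onednn") return impl_types::onednn;
            if (impl == "cpu")    return impl_types::cpu;
        }
        return impl_types::any;                                // nothing forced for this prefix
    }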
diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp
index 397768b6698..f5e8422f6be 100644
--- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp
+++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp
@@ -38,7 +38,10 @@ bool is_batch_after_spatial(const std::string order) {
     return false;
 }
 
-format::type get_preferred_format(const kernel_impl_params& impl_param) {
+format::type get_preferred_format(fully_connected_node const& node, const kernel_impl_params& impl_param) {
+    if (node.get_preferred_impl_type() == impl_types::onednn)
+        return format::bfyx;
+
     auto input_layout = impl_param.get_input_layout();
 
     // for 3d output we have to chose bfyx format
@@ -125,13 +128,13 @@ layout fully_connected_inst::calc_output_layout(fully_connected_node const& node
     if (desc->input_size == 3) {
         output_size = tensor(input_layout.batch(), input_layout.feature(), 1, weights_layout.batch());
     }
 
-    format output_format = get_preferred_format(impl_param);
+    format output_format = get_preferred_format(node, impl_param);
 
     return layout(output_type, output_format, output_size);
 }
 
 template <typename ShapeType>
-std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_node const& /*node*/, const kernel_impl_params& impl_param) {
+std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_node const& node, const kernel_impl_params& impl_param) {
     auto desc = impl_param.typed_desc<fully_connected>();
     auto input_layout = impl_param.get_input_layout();
     auto weights_layout = *impl_param.weights_layout;
@@ -155,7 +158,7 @@ std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_no
 
     bool is_static = input_layout.is_static() && weights_layout.is_static();
 
-    format::type output_format = is_static ? get_preferred_format(impl_param) :
+    format::type output_format = is_static ? get_preferred_format(node, impl_param) :
                                              input_layout.format.value;
 
     return { layout{output_shapes[0], output_type, output_format} };
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_onednn_optimization_attributes.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_onednn_optimization_attributes.cpp
index be358f7e940..4466f97118d 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_onednn_optimization_attributes.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_onednn_optimization_attributes.cpp
@@ -16,22 +16,6 @@ void add_onednn_optimization_attributes::run(program& p) {
 #ifdef ENABLE_ONEDNN_FOR_GPU
     for (auto& node : p.get_processing_order()) {
         if (node->get_preferred_impl_type() == impl_types::onednn) {
-            if (node->is_type<fully_connected>()) {
-                auto fc_prim = node->as<fully_connected>().get_primitive();
-
-                // Reshape fused ops tensors for OneDNN FC if needed
-                if (fc_prim->input_size == 3) {
-                    for (auto& fused_prim : node->get_fused_primitives()) {
-                        if (fused_prim.is_type<eltwise>()) {
-                            auto& dependency = node->get_dependency(fused_prim.dep_start_idx);
-                            auto original_layout = dependency.get_output_layout();
-                            onednn::combine_bf_with_first_spatial_dim(original_layout);
-                            dependency.set_output_layout(original_layout, false);
-                        }
-                    }
-                }
-            }
-
             node->init_onednn_primitive_attributes();
         }
     }
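Note: the block deleted from add_onednn_optimization_attributes.cpp is not lost; the same 3D-FC handling reappears inside program_node::init_onednn_primitive_attributes (the set_binary_op lambda later in this patch). The apparent intent is that a 3D fully connected input [B, F, X] runs in oneDNN as a 2D problem, so fused-op operands must be viewed the same way. A minimal sketch of that collapse with plain integers; the exact behaviour of onednn::combine_bf_with_first_spatial_dim is assumed here, not taken from its source:

    #include <array>
    #include <cstdint>

    // Illustration only: merge batch and feature into one outer dimension and keep the
    // first spatial dimension as the inner one, i.e. [B, F, X] -> [B*F, X].
    std::array<int64_t, 2> collapse_bf_with_first_spatial(int64_t b, int64_t f, int64_t x) {
        return { b * f, x };
    }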
diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp
index bb4f691f8fa..62582f66693 100644
--- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp
@@ -96,7 +96,7 @@ protected:
     }
 
     static std::shared_ptr get_fully_connected_primitive_descriptor(const kernel_impl_params& impl_params,
-                                                                    cldnn::engine& engine, size_t input_size, bool has_bias,
+                                                                    cldnn::engine& engine, size_t prim_input_size, bool has_bias,
                                                                     const dnnl::primitive_attr& attr = dnnl::primitive_attr()) {
         auto input_layout = impl_params.get_input_layout(0);
         auto weights_layout = impl_params.get_input_layout(1);
@@ -105,6 +105,7 @@ protected:
         auto input_pshape = input_layout.get_partial_shape();
         auto weights_pshape = weights_layout.get_partial_shape();
 
+        size_t input_size = (prim_input_size > input_pshape.size()) ? input_pshape.size() : prim_input_size;
         int64_t feature = input_pshape[std::min(input_size, static_cast<size_t>(4)) - 1].get_length();
         if (input_size == 3) {
             feature = std::max({input_layout.spatial(0), input_layout.spatial(1), input_layout.spatial(2)});
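Note: the added clamp above keeps the input rank declared on the primitive from exceeding the rank of the actual partial shape, so the input_pshape indexing that follows stays in range. The same guard in isolation (names are illustrative only):

    #include <algorithm>
    #include <cstddef>

    // Clamp the primitive's declared rank to the runtime shape rank so that
    // input_pshape[input_size - 1] can never read past the end of the shape.
    size_t clamp_input_size(size_t prim_input_size, size_t shape_rank) {
        return std::min(prim_input_size, shape_rank);
    }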
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index 23860c53f29..348f9d282e9 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -865,33 +865,11 @@ static bool is_node_for_onednn(deconvolution_node const& node) {
 
 static bool is_node_for_onednn(fully_connected_node const& node) {
-    bool is_suitable_for_onednn = true;
-    auto out_layout = node.get_output_layout();
-    for (auto& fo : node.get_fused_primitives()) {
-        if (fo.is_type<eltwise>()) {
-            // FC checkings
-            auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
-            auto in_dt = in_layout.data_type;
-            auto out_dt = out_layout.data_type;
-            // if it is not eltwise sum and input is full tensor
-            if ((out_layout.count() == in_layout.count()) && in_dt != out_dt
-                && (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt))
-                && onednn_add_fusing_helpers::is_full_tensor(in_layout)) {
-                return false;
-            }
-
-            // WA: onednn sum/binary_add post-op are not supported due to perf drop.
-            auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(node, fo);
-            if (add_type == add_fusing_type::sum || add_type == add_fusing_type::binary_per_tensor || add_type == add_fusing_type::binary_per_oc) {
-                return false;
-            }
-        }
-    }
-
     auto fc_prim = node.get_primitive();
-    auto ps = out_layout.get_partial_shape();
-    int rank = ps.size();
+    auto ps = node.get_output_layout().get_partial_shape();
     int non_spatial_count = 2 + (fc_prim->input_size == 3 ? 1 : 0);
+    int rank = ps.size();
+
     // OneDnn doesn't support spatial dimensions for output
     for (int i = non_spatial_count; i < rank; i++) {
         if (ps[i].is_dynamic() || ps[i] != 1) {
@@ -899,7 +877,7 @@ static bool is_node_for_onednn(fully_connected_node const& node) {
         }
     }
 
-    return is_suitable_for_onednn;
+    return true;
 }
 
 // This function is needed to avoid performance regressions for the convolutions with byxf layout
@@ -1325,48 +1303,53 @@ bool layout_optimizer::are_layouts_suitable_for_onednn(program_node& node) {
 impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node) {
 #ifdef GPU_DEBUG_CONFIG
     GPU_DEBUG_GET_INSTANCE(debug_config);
-    GPU_DEBUG_IF(!debug_config->forced_impl_type.empty()) {
+    GPU_DEBUG_IF(!debug_config->forced_impl_types.empty()) {
         // Forcing impl type of one primitive
-        std::string forced_impl_type = debug_config->forced_impl_type;
-        if (node.is_type<fully_connected>()) {
-            if (forced_impl_type == "fc:ocl")
-                return impl_types::ocl;
-            else if (forced_impl_type == "fc:onednn")
-                return impl_types::onednn;
-        } else if (node.is_type<detection_output>()) {
-            if (forced_impl_type == "do:cpu")
-                return impl_types::cpu;
-            else if (forced_impl_type == "do:ocl")
-                return impl_types::ocl;
-        } else if (node.is_type<reduce>()) {
-            if (forced_impl_type == "reduce:ocl")
-                return impl_types::ocl;
-            else if (forced_impl_type == "reduce:onednn")
-                return impl_types::onednn;
-        } else if (node.is_type<concatenation>()) {
-            if (forced_impl_type == "concat:ocl")
-                return impl_types::ocl;
-            else if (forced_impl_type == "concat:onednn")
-                return impl_types::onednn;
-        }
+        for (const auto& forced_impl_type : debug_config->forced_impl_types) {
+            if (node.is_type<fully_connected>()) {
+                if (forced_impl_type == "fc:ocl")
+                    return impl_types::ocl;
+                else if (forced_impl_type == "fc:onednn")
+                    return impl_types::onednn;
+            } else if (node.is_type<gemm>()) {
+                if (forced_impl_type == "gemm:ocl")
+                    return impl_types::ocl;
+                else if (forced_impl_type == "gemm:onednn")
+                    return impl_types::onednn;
+            } else if (node.is_type<detection_output>()) {
+                if (forced_impl_type == "do:cpu")
+                    return impl_types::cpu;
+                else if (forced_impl_type == "do:ocl")
+                    return impl_types::ocl;
+            } else if (node.is_type<reduce>()) {
+                if (forced_impl_type == "reduce:ocl")
+                    return impl_types::ocl;
+                else if (forced_impl_type == "reduce:onednn")
+                    return impl_types::onednn;
+            } else if (node.is_type<concatenation>()) {
+                if (forced_impl_type == "concat:ocl")
+                    return impl_types::ocl;
+                else if (forced_impl_type == "concat:onednn")
+                    return impl_types::onednn;
+            }
+            // Forcing one layer
+            size_t found_type = forced_impl_type.rfind(":");
+            if (found_type != std::string::npos) {
+                impl_types preferred_type = impl_types::any;
+                auto impl_type = forced_impl_type.substr(found_type + 1);
+                if (impl_type == "ocl")
+                    preferred_type = impl_types::ocl;
+                else if (impl_type == "onednn")
+                    preferred_type = impl_types::onednn;
+                else if (impl_type == "cpu")
+                    preferred_type = impl_types::cpu;
-
-        // Forcing one layer
-        size_t found_type = forced_impl_type.rfind(":");
-        if (found_type != std::string::npos) {
-            impl_types preferred_type = impl_types::any;
-            auto impl_type = forced_impl_type.substr(found_type + 1);
-            if (impl_type == "ocl")
-                preferred_type = impl_types::ocl;
-            else if (impl_type == "onednn")
-                preferred_type = impl_types::onednn;
-            else if (impl_type == "cpu")
-                preferred_type = impl_types::cpu;
-
-            if (node.id() == forced_impl_type.substr(0, found_type)) {
-                GPU_DEBUG_LOG << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
-                              << forced_impl_type.substr(found_type + 1) << std::endl;
-                return preferred_type;
+                if (node.id() == forced_impl_type.substr(0, found_type)) {
+                    GPU_DEBUG_LOG << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
+                                  << forced_impl_type.substr(found_type + 1) << std::endl;
+                    return preferred_type;
+                }
             }
         }
     }
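Note: besides the primitive-level shortcuts (fc:, gemm:, do:, reduce:, concat:), each token may name one specific layer; the code above splits the token at the last ':' so the impl type always comes from the final segment. A standalone sketch of that split (the helper is hypothetical):

    #include <string>
    #include <utility>

    // Split "some_layer_name:onednn" at the last ':' into {layer name, impl type}.
    // A token without ':' yields an empty impl type.
    std::pair<std::string, std::string> split_forced_token(const std::string& token) {
        const size_t pos = token.rfind(':');
        if (pos == std::string::npos)
            return { token, std::string() };
        return { token.substr(0, pos), token.substr(pos + 1) };
    }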
@@ -1589,70 +1572,9 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
         if (node.is_type<fully_connected>()) {
             if (!is_node_for_onednn(node.as<fully_connected>()))
                 impl_candidate = impl_types::ocl;
-
-            // WA : Use cldnn FC due to perf drop of small batch size until onednn FC improve perf
-            if (node.get_output_layout().is_static() && node.get_output_layout().batch() < 32)
-                impl_candidate = impl_types::ocl;
         } else {
-            for (auto& fo : node.get_fused_primitives()) {
-                if (fo.is_type<eltwise>()) {
-                    // Gemm checkings
-                    // TODO: investigate why currently onednn gemm has some "sum" post-op restrictions
-                    // which don't correlate with fc checkings in the code above
-                    // Temprorary WA: disable onednn gemm with sum post-op inside
-                    if (fo.typed_desc<eltwise>()->mode == eltwise_mode::sum) {
-                        impl_candidate = impl_types::ocl;
-                        break;
-                    }
-                }
-            }
-
-            auto gemm_prim = node.as<gemm>().get_primitive();
             if (node.is_dynamic()) {
                 impl_candidate = impl_types::ocl;
-            } else {
-                auto has_input2 = gemm_prim->dependencies().size() == 3;
-                std::vector<layout> in_layouts { node.get_dependency(0).get_output_layout(), node.get_dependency(1).get_output_layout() };
-                if (has_input2) {
-                    in_layouts.emplace_back(node.get_dependency(2).get_output_layout());
-                }
-                auto out_l = node.get_output_layout();
-
-                in_layouts = gemm_inst::transform_input_layouts(gemm_prim, in_layouts, out_l);
-                out_l = gemm_inst::transform_output_layout(gemm_prim, in_layouts, out_l);
-
-                auto in0_l = in_layouts[0];
-                auto in1_l = in_layouts[1];
-
-                size_t in2_batched_size = 0;
-                if (has_input2) {
-                    auto in2_l = in_layouts[2];
-                    in2_batched_size = in2_l.count() / (in2_l.spatial(0) * in2_l.spatial(1));
-                }
-                size_t size_k = gemm_prim->transpose_input0 ? in0_l.spatial(1) : in0_l.spatial(0);
-
-                size_t in0_batched_size = in0_l.count() / (in0_l.spatial(0) * in0_l.spatial(1));
-                size_t in1_batched_size = in1_l.count() / (in1_l.spatial(0) * in1_l.spatial(1));
-                size_t out_batched_size = out_l.count() / (out_l.spatial(0) * out_l.spatial(1));
-
-                auto valid_input_batch = in0_batched_size != 1 && (in1_batched_size == in0_batched_size || in1_batched_size == 1);
-                auto valid_output_batch = in0_batched_size > in1_batched_size ? out_batched_size == in0_batched_size :
-                                                                                out_batched_size == in1_batched_size;
-                auto valid_extra_input_batch = has_input2 ? in2_batched_size == 1 || in2_batched_size == out_batched_size : true;
-                auto valid_scale_factor = gemm_prim->alpha == 1.f && (has_input2 ? gemm_prim->beta == 1.f : true);
-                auto unsupported_onednn_gemm = !valid_input_batch ||
-                                               !valid_output_batch ||
-                                               !valid_extra_input_batch ||
-                                               !valid_scale_factor;
-
-                bool is_u8_i8 = data_type_traits::is_i8_u8(in0_l.data_type) && data_type_traits::is_i8_u8(in1_l.data_type);
-                bool use_ops_cldnn_kernel = is_u8_i8 || (in0_l.spatial(0) % 16 == 0 && in0_l.spatial(1) % 16 == 0 &&
-                                                         in1_l.spatial(0) % 16 == 0 && in1_l.spatial(1) % 16 == 0);
-
-                // Gemm with k < 64 may be faster in cldnn unless ref impl is used
-                if ((size_k < 64 && use_ops_cldnn_kernel) || unsupported_onednn_gemm) {
-                    impl_candidate = impl_types::ocl;
-                }
             }
         }
diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp
index c3ef43a1c51..ff740172128 100644
--- a/src/plugins/intel_gpu/src/graph/program_node.cpp
+++ b/src/plugins/intel_gpu/src/graph/program_node.cpp
@@ -9,6 +9,8 @@
 #include "intel_gpu/runtime/debug_configuration.hpp"
 #ifdef ENABLE_ONEDNN_FOR_GPU
 #include "convolution_inst.h"
+#include "gemm_inst.h"
+#include "fully_connected_inst.h"
 #include "deconvolution_inst.h"
 #include "quantize_inst.h"
 #include "reorder_inst.h"
@@ -950,6 +952,28 @@ void program_node::init_onednn_primitive_attributes() {
             auto dep_idx = desc.dep_start_idx;
             auto in = get_dependency(dep_idx).get_output_layout();
 
+            auto set_binary_op = [&](dnnl::algorithm alg, onednn_post_op_type op_type) {
+                if (is_type<fully_connected>()) {
+                    const kernel_impl_params& impl_params = *get_kernel_impl_params();
+                    auto prim = impl_params.typed_desc<fully_connected>();
+                    if (prim->input_size == 3) {
+                        cldnn::onednn::combine_bf_with_first_spatial_dim(in);
+                    }
+                    post_ops.append_binary(alg, onednn::layout_to_memory_desc(in, dnnl::memory::format_tag::ab));
+                    update_onednn_post_op_list(op_type, dep_idx);
+                } else if (is_type<gemm>()) {
+                    size_t rank = cldnn::format::dimension(in.format);
+                    dnnl::memory::dims dims = onednn::convert_gemm_tensor(in.get_tensor(), rank, in.batch() > 1);
+                    dnnl::memory::data_type dt = onednn::convert_data_type(in.data_type);
+                    dnnl::memory::format_tag fmt = onednn::convert_gemm_data_format(dims);
+                    post_ops.append_binary(alg, dnnl::memory::desc(dims, dt, fmt));
+                    update_onednn_post_op_list(op_type, dep_idx);
+                } else {
+                    post_ops.append_binary(alg, onednn::layout_to_memory_desc(in));
+                    update_onednn_post_op_list(op_type, dep_idx);
+                }
+            };
+
             if (desc.typed_desc<eltwise>()->mode == eltwise_mode::sum) {
                 auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*this, cldnn_post_ops[idx]);
                 if (fusing_type == add_fusing_type::sum && num_sum_post_ops == 0) {
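Note: whichever branch of set_binary_op is taken, the end result is a regular oneDNN binary post-op attached through the primitive attributes. A minimal standalone sketch of that call sequence with a hard-coded 2D f32 operand (the shape and data type are placeholders, not values used by the patch):

    #include <oneapi/dnnl/dnnl.hpp>

    // Build primitive attributes carrying one binary_add post-op whose second
    // operand is a 2D tensor in "ab" layout, similar to the FC path above.
    dnnl::primitive_attr make_binary_add_attr(const dnnl::memory::dims& operand_dims) {
        dnnl::memory::desc operand_md(operand_dims,
                                      dnnl::memory::data_type::f32,
                                      dnnl::memory::format_tag::ab);
        dnnl::post_ops ops;
        ops.append_binary(dnnl::algorithm::binary_add, operand_md);

        dnnl::primitive_attr attr;
        attr.set_post_ops(ops);
        return attr;
    }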
@@ -961,18 +985,12 @@ void program_node::init_onednn_primitive_attributes() {
                     update_onednn_post_op_list(onednn_post_op_type::sum, dep_idx);
                     num_sum_post_ops++;
                 } else {
-                    dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in);
-                    post_ops.append_binary(dnnl::algorithm::binary_add, in_desc);
-                    update_onednn_post_op_list(onednn_post_op_type::binary_add, dep_idx);
+                    set_binary_op(dnnl::algorithm::binary_add, onednn_post_op_type::binary_add);
                 }
             } else if (desc.typed_desc<eltwise>()->mode == eltwise_mode::sub) {
-                dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in);
-                post_ops.append_binary(dnnl::algorithm::binary_sub, in_desc);
-                update_onednn_post_op_list(onednn_post_op_type::binary_sub, dep_idx);
+                set_binary_op(dnnl::algorithm::binary_sub, onednn_post_op_type::binary_sub);
             } else if (desc.typed_desc<eltwise>()->mode == eltwise_mode::prod) {
-                dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in, dnnl::memory::format_tag::ab, true);
-                post_ops.append_binary(dnnl::algorithm::binary_mul, in_desc);
-                update_onednn_post_op_list(onednn_post_op_type::binary_mul, dep_idx, dnnl::memory::format_tag::ab, true);
+                set_binary_op(dnnl::algorithm::binary_mul, onednn_post_op_type::binary_mul);
             } else {
                 std::stringstream error_msg;
                 error_msg << "Unsupported eltwise mode: " << static_cast<int>(desc.typed_desc<eltwise>()->mode) << ". ";
diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
index 3494a45ad7e..b92c67ce058 100644
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -122,9 +122,9 @@ static void print_help_messages() {
     message_list.emplace_back("OV_GPU_AfterProc", "Run inference after the specified process PIDs are finished, separated by space."
                               " Supported on only on linux.");
     message_list.emplace_back("OV_GPU_SerialCompile", "Serialize creating primitives and compiling kernels");
-    message_list.emplace_back("OV_GPU_ForceImplType", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
-                              " For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:onednn, reduce:ocl, concat:onednn,"
-                              " and concat:ocl are supported");
+    message_list.emplace_back("OV_GPU_ForceImplTypes", "Force implementation type of a target primitive or layer. [primitive or layer_name]:[impl_type]"
+                              " For example: fc:onednn gemm:onednn reduce:ocl do:cpu."
+                              " For primitives, fc, gemm, do, reduce, and concat are supported. Separate multiple entries with spaces.");
     message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");
 
     auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
@@ -158,7 +158,6 @@ debug_configuration::debug_configuration()
     , dump_layers_limit_batch(std::numeric_limits<int>::max())
     , base_batch_for_memory_estimation(-1)
     , serialize_compile(0)
-    , forced_impl_type(std::string())
     , max_kernels_per_batch(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_gpu_debug_env_var("Help", help);
@@ -181,7 +180,8 @@ debug_configuration::debug_configuration()
     std::string after_proc_str;
     get_gpu_debug_env_var("AfterProc", after_proc_str);
     get_gpu_debug_env_var("SerialCompile", serialize_compile);
-    get_gpu_debug_env_var("ForceImplType", forced_impl_type);
+    std::string forced_impl_types_str;
+    get_gpu_debug_env_var("ForceImplTypes", forced_impl_types_str);
     get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch);
 
     if (help > 0) {
@@ -198,6 +198,15 @@ debug_configuration::debug_configuration()
         }
     }
 
+    if (forced_impl_types_str.length() > 0) {
+        forced_impl_types_str = " " + forced_impl_types_str + " "; // Insert delimiter for easier parsing when used
+        std::stringstream ss(forced_impl_types_str);
+        std::string type;
+        while (ss >> type) {
+            forced_impl_types.push_back(type);
+        }
+    }
+
     if (after_proc_str.length() > 0) {
 #ifdef _WIN32
         GPU_DEBUG_COUT << "Warning: OV_GPU_AfterProc is supported only on linux" << std::endl;
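Note: the parsing added above simply whitespace-tokenizes the environment value, so OV_GPU_ForceImplTypes="fc:onednn gemm:ocl" produces two entries in forced_impl_types (the extra leading and trailing space added in the patch is a convenience and is not required by operator>>). The same tokenization in isolation:

    #include <sstream>
    #include <string>
    #include <vector>

    // Whitespace-split a value such as "fc:onednn gemm:ocl my_layer_name:cpu"
    // into individual force tokens, mirroring the loop in debug_configuration above.
    std::vector<std::string> split_force_tokens(const std::string& value) {
        std::vector<std::string> tokens;
        std::istringstream ss(value);
        std::string token;
        while (ss >> token)
            tokens.push_back(token);
        return tokens;
    }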