[GPU] Use onednn fc/gemm in dGPU. (#15143)
* [GPU] Fix the functional issue when using fc:onednn in the BERT model.
* The issue occurred when the input has 3 dims with an eltwise post-op.
* oneDNN FC supports only 2-dim output, so OV needs to update the output and the post-op as well.
* Fix the accuracy issue in b16 oneDNN FC: cldnn switches to the yxfb format at b16 for the optimized kernel, but this is not needed for oneDNN.
* Remove the workaround code that forced FC onto cldnn.
* Support the gemm primitive and multiple entries in ForceImplTypes.
* Rename the env variable to OV_GPU_ForceImplTypes.
* Do not change the eltwise post-op shape on the original node: it caused the accuracy issue when the node has multiple users.

Signed-off-by: hyunback <hyunback.kim@intel.com>
commit 9d8532e998
parent e9a208501b
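For reference, a minimal sketch of how the renamed debug knob could be exercised from an application, assuming a GPU plugin build with GPU_DEBUG_CONFIG enabled and a POSIX environment; the model path is a placeholder. The accepted space-separated entries (fc, gemm, do, reduce, concat paired with ocl, onednn, or cpu) come from the help message updated in this commit.

#include <cstdlib>
#include <openvino/openvino.hpp>

int main() {
    // Force oneDNN for fully connected nodes and OCL for gemm nodes (space-separated list).
    // Must be set before the GPU plugin is loaded; ignored in builds without GPU_DEBUG_CONFIG.
    setenv("OV_GPU_ForceImplTypes", "fc:onednn gemm:ocl", 1);

    ov::Core core;
    auto compiled = core.compile_model("model.xml", "GPU");  // placeholder model path
    return 0;
}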
@@ -78,26 +78,26 @@ private:
 
 public:
     static const char *prefix;
     int help;                                   // Print help messages
     int verbose;                                // Verbose execution
     int print_multi_kernel_perf;                // Print execution time of each kernel in multi-kernel primitimive
     int disable_usm;                            // Disable usm usage
     int disable_onednn;                         // Disable onednn for discrete GPU (no effect for integrated GPU)
     int disable_onednn_opt_post_ops;            // Disable onednn optimize post operators
     std::string dump_profiling_data;            // Enables dump of extended performance profiling to specified dir
     std::string dump_graphs;                    // Dump optimized graph
     std::string dump_sources;                   // Dump opencl sources
     std::string dump_layers_path;               // Enable dumping intermediate buffers and set the dest path
     std::vector<std::string> dump_layers;       // Dump intermediate buffers of specified layers only
     std::string dry_run_path;                   // Dry run and serialize execution graph into the specified path
     int dump_layers_dst_only;                   // Dump only output of layers
     int dump_layers_result;                     // Dump result layers
     int dump_layers_limit_batch;                // Limit the size of batch to dump
     int base_batch_for_memory_estimation;       // Base batch size to be used in memory estimation
     std::vector<std::string> after_proc;        // Start inference after the listed processes
     int serialize_compile;                      // Serialize creating primitives and compiling kernels
-    std::string forced_impl_type;               // Force implementation type either ocl or onednn
+    std::vector<std::string> forced_impl_types; // Force implementation type either ocl or onednn
     int max_kernels_per_batch;                  // Maximum number of kernels in a batch during compiling kernels
     static const debug_configuration *get_instance();
     bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
 };
@@ -38,7 +38,10 @@ bool is_batch_after_spatial(const std::string order) {
     return false;
 }
 
-format::type get_preferred_format(const kernel_impl_params& impl_param) {
+format::type get_preferred_format(fully_connected_node const& node, const kernel_impl_params& impl_param) {
+    if (node.get_preferred_impl_type() == impl_types::onednn)
+        return format::bfyx;
+
     auto input_layout = impl_param.get_input_layout();
 
     // for 3d output we have to chose bfyx format
@@ -125,13 +128,13 @@ layout fully_connected_inst::calc_output_layout(fully_connected_node const& node
     if (desc->input_size == 3) {
         output_size = tensor(input_layout.batch(), input_layout.feature(), 1, weights_layout.batch());
     }
-    format output_format = get_preferred_format(impl_param);
+    format output_format = get_preferred_format(node, impl_param);
 
     return layout(output_type, output_format, output_size);
 }
 
 template<typename ShapeType>
-std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_node const& /*node*/, const kernel_impl_params& impl_param) {
+std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_node const& node, const kernel_impl_params& impl_param) {
     auto desc = impl_param.typed_desc<fully_connected>();
     auto input_layout = impl_param.get_input_layout();
     auto weights_layout = *impl_param.weights_layout;
@@ -155,7 +158,7 @@ std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_no
 
     bool is_static = input_layout.is_static() && weights_layout.is_static();
 
-    format::type output_format = is_static ? get_preferred_format(impl_param) :
+    format::type output_format = is_static ? get_preferred_format(node, impl_param) :
                                              input_layout.format.value;
 
     return { layout{output_shapes[0], output_type, output_format} };
@@ -16,22 +16,6 @@ void add_onednn_optimization_attributes::run(program& p) {
 #ifdef ENABLE_ONEDNN_FOR_GPU
     for (auto& node : p.get_processing_order()) {
         if (node->get_preferred_impl_type() == impl_types::onednn) {
-            if (node->is_type<fully_connected>()) {
-                auto fc_prim = node->as<fully_connected>().get_primitive();
-
-                // Reshape fused ops tensors for OneDNN FC if needed
-                if (fc_prim->input_size == 3) {
-                    for (auto& fused_prim : node->get_fused_primitives()) {
-                        if (fused_prim.is_type<eltwise>()) {
-                            auto& dependency = node->get_dependency(fused_prim.dep_start_idx);
-                            auto original_layout = dependency.get_output_layout();
-                            onednn::combine_bf_with_first_spatial_dim(original_layout);
-                            dependency.set_output_layout(original_layout, false);
-                        }
-                    }
-                }
-            }
-
             node->init_onednn_primitive_attributes();
         }
     }
@@ -96,7 +96,7 @@ protected:
     }
 
     static std::shared_ptr<dnnl::inner_product_forward::primitive_desc> get_fully_connected_primitive_descriptor(const kernel_impl_params& impl_params,
-                                                                                  cldnn::engine& engine, size_t input_size, bool has_bias,
+                                                                                  cldnn::engine& engine, size_t prim_input_size, bool has_bias,
                                                                                   const dnnl::primitive_attr& attr = dnnl::primitive_attr()) {
         auto input_layout = impl_params.get_input_layout(0);
         auto weights_layout = impl_params.get_input_layout(1);
@@ -105,6 +105,7 @@ protected:
         auto input_pshape = input_layout.get_partial_shape();
         auto weights_pshape = weights_layout.get_partial_shape();
 
+        size_t input_size = (prim_input_size > input_pshape.size()) ? input_pshape.size() : prim_input_size;
         int64_t feature = input_pshape[std::min(input_size, static_cast<size_t>(4)) - 1].get_length();
         if (input_size == 3) {
             feature = std::max({input_layout.spatial(0), input_layout.spatial(1), input_layout.spatial(2)});
@@ -865,33 +865,11 @@ static bool is_node_for_onednn(deconvolution_node const& node) {
 
 
 static bool is_node_for_onednn(fully_connected_node const& node) {
-    bool is_suitable_for_onednn = true;
-    auto out_layout = node.get_output_layout();
-    for (auto& fo : node.get_fused_primitives()) {
-        if (fo.is_type<eltwise>()) {
-            // FC checkings
-            auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
-            auto in_dt = in_layout.data_type;
-            auto out_dt = out_layout.data_type;
-            // if it is not eltwise sum and input is full tensor
-            if ((out_layout.count() == in_layout.count()) && in_dt != out_dt
-                && (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt))
-                && onednn_add_fusing_helpers::is_full_tensor(in_layout)) {
-                return false;
-            }
-
-            // WA: onednn sum/binary_add post-op are not supported due to perf drop.
-            auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(node, fo);
-            if (add_type == add_fusing_type::sum || add_type == add_fusing_type::binary_per_tensor || add_type == add_fusing_type::binary_per_oc) {
-                return false;
-            }
-        }
-    }
-
     auto fc_prim = node.get_primitive();
-    auto ps = out_layout.get_partial_shape();
-    int rank = ps.size();
+    auto ps = node.get_output_layout().get_partial_shape();
     int non_spatial_count = 2 + (fc_prim->input_size == 3 ? 1 : 0);
+    int rank = ps.size();
 
     // OneDnn doesn't support spatial dimensions for output
     for (int i = non_spatial_count; i < rank; i++) {
         if (ps[i].is_dynamic() || ps[i] != 1) {
@@ -899,7 +877,7 @@ static bool is_node_for_onednn(fully_connected_node const& node) {
         }
     }
 
-    return is_suitable_for_onednn;
+    return true;
 }
 
 // This function is needed to avoid performance regressions for the convolutions with byxf layout
@@ -1325,48 +1303,53 @@ bool layout_optimizer::are_layouts_suitable_for_onednn(program_node& node) {
 impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node) {
 #ifdef GPU_DEBUG_CONFIG
     GPU_DEBUG_GET_INSTANCE(debug_config);
-    GPU_DEBUG_IF(!debug_config->forced_impl_type.empty()) {
+    GPU_DEBUG_IF(!debug_config->forced_impl_types.empty()) {
         // Forcing impl type of one primitive
-        std::string forced_impl_type = debug_config->forced_impl_type;
-        if (node.is_type<fully_connected>()) {
-            if (forced_impl_type == "fc:ocl")
-                return impl_types::ocl;
-            else if (forced_impl_type == "fc:onednn")
-                return impl_types::onednn;
-        } else if (node.is_type<detection_output>()) {
-            if (forced_impl_type == "do:cpu")
-                return impl_types::cpu;
-            else if (forced_impl_type == "do:ocl")
-                return impl_types::ocl;
-        } else if (node.is_type<reduce>()) {
-            if (forced_impl_type == "reduce:ocl")
-                return impl_types::ocl;
-            else if (forced_impl_type == "reduce:onednn")
-                return impl_types::onednn;
-        } else if (node.is_type<concatenation>()) {
-            if (forced_impl_type == "concat:ocl")
-                return impl_types::ocl;
-            else if (forced_impl_type == "concat:onednn")
-                return impl_types::onednn;
-        }
-
-        // Forcing one layer
-        size_t found_type = forced_impl_type.rfind(":");
-        if (found_type != std::string::npos) {
-            impl_types preferred_type = impl_types::any;
-            auto impl_type = forced_impl_type.substr(found_type + 1);
-            if (impl_type == "ocl")
-                preferred_type = impl_types::ocl;
-            else if (impl_type == "onednn")
-                preferred_type = impl_types::onednn;
-            else if (impl_type == "cpu")
-                preferred_type = impl_types::cpu;
-
-            if (node.id() == forced_impl_type.substr(0, found_type)) {
-                GPU_DEBUG_LOG << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
-                              << forced_impl_type.substr(found_type + 1) << std::endl;
-                return preferred_type;
+        for (const auto& forced_impl_type : debug_config->forced_impl_types) {
+            if (node.is_type<fully_connected>()) {
+                if (forced_impl_type == "fc:ocl")
+                    return impl_types::ocl;
+                else if (forced_impl_type == "fc:onednn")
+                    return impl_types::onednn;
+            } else if (node.is_type<gemm>()) {
+                if (forced_impl_type == "gemm:ocl")
+                    return impl_types::ocl;
+                else if (forced_impl_type == "gemm:onednn")
+                    return impl_types::onednn;
+            } else if (node.is_type<detection_output>()) {
+                if (forced_impl_type == "do:cpu")
+                    return impl_types::cpu;
+                else if (forced_impl_type == "do:ocl")
+                    return impl_types::ocl;
+            } else if (node.is_type<reduce>()) {
+                if (forced_impl_type == "reduce:ocl")
+                    return impl_types::ocl;
+                else if (forced_impl_type == "reduce:onednn")
+                    return impl_types::onednn;
+            } else if (node.is_type<concatenation>()) {
+                if (forced_impl_type == "concat:ocl")
+                    return impl_types::ocl;
+                else if (forced_impl_type == "concat:onednn")
+                    return impl_types::onednn;
+            }
+
+            // Forcing one layer
+            size_t found_type = forced_impl_type.rfind(":");
+            if (found_type != std::string::npos) {
+                impl_types preferred_type = impl_types::any;
+                auto impl_type = forced_impl_type.substr(found_type + 1);
+                if (impl_type == "ocl")
+                    preferred_type = impl_types::ocl;
+                else if (impl_type == "onednn")
+                    preferred_type = impl_types::onednn;
+                else if (impl_type == "cpu")
+                    preferred_type = impl_types::cpu;
+
+                if (node.id() == forced_impl_type.substr(0, found_type)) {
+                    GPU_DEBUG_LOG << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
+                                  << forced_impl_type.substr(found_type + 1) << std::endl;
+                    return preferred_type;
+                }
             }
         }
     }
@@ -1589,70 +1572,9 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
         if (node.is_type<fully_connected>()) {
             if (!is_node_for_onednn(node.as<fully_connected>()))
                 impl_candidate = impl_types::ocl;
-
-            // WA : Use cldnn FC due to perf drop of small batch size until onednn FC improve perf
-            if (node.get_output_layout().is_static() && node.get_output_layout().batch() < 32)
-                impl_candidate = impl_types::ocl;
         } else {
-            for (auto& fo : node.get_fused_primitives()) {
-                if (fo.is_type<eltwise>()) {
-                    // Gemm checkings
-                    // TODO: investigate why currently onednn gemm has some "sum" post-op restrictions
-                    // which don't correlate with fc checkings in the code above
-                    // Temprorary WA: disable onednn gemm with sum post-op inside
-                    if (fo.typed_desc<eltwise>()->mode == eltwise_mode::sum) {
-                        impl_candidate = impl_types::ocl;
-                        break;
-                    }
-                }
-            }
-
-            auto gemm_prim = node.as<gemm>().get_primitive();
             if (node.is_dynamic()) {
                 impl_candidate = impl_types::ocl;
-            } else {
-                auto has_input2 = gemm_prim->dependencies().size() == 3;
-                std::vector<layout> in_layouts { node.get_dependency(0).get_output_layout(), node.get_dependency(1).get_output_layout() };
-                if (has_input2) {
-                    in_layouts.emplace_back(node.get_dependency(2).get_output_layout());
-                }
-                auto out_l = node.get_output_layout();
-
-                in_layouts = gemm_inst::transform_input_layouts(gemm_prim, in_layouts, out_l);
-                out_l = gemm_inst::transform_output_layout(gemm_prim, in_layouts, out_l);
-
-                auto in0_l = in_layouts[0];
-                auto in1_l = in_layouts[1];
-
-                size_t in2_batched_size = 0;
-                if (has_input2) {
-                    auto in2_l = in_layouts[2];
-                    in2_batched_size = in2_l.count() / (in2_l.spatial(0) * in2_l.spatial(1));
-                }
-                size_t size_k = gemm_prim->transpose_input0 ? in0_l.spatial(1) : in0_l.spatial(0);
-
-                size_t in0_batched_size = in0_l.count() / (in0_l.spatial(0) * in0_l.spatial(1));
-                size_t in1_batched_size = in1_l.count() / (in1_l.spatial(0) * in1_l.spatial(1));
-                size_t out_batched_size = out_l.count() / (out_l.spatial(0) * out_l.spatial(1));
-
-                auto valid_input_batch = in0_batched_size != 1 && (in1_batched_size == in0_batched_size || in1_batched_size == 1);
-                auto valid_output_batch = in0_batched_size > in1_batched_size ? out_batched_size == in0_batched_size :
-                                                                                out_batched_size == in1_batched_size;
-                auto valid_extra_input_batch = has_input2 ? in2_batched_size == 1 || in2_batched_size == out_batched_size : true;
-                auto valid_scale_factor = gemm_prim->alpha == 1.f && (has_input2 ? gemm_prim->beta == 1.f : true);
-                auto unsupported_onednn_gemm = !valid_input_batch ||
-                                               !valid_output_batch ||
-                                               !valid_extra_input_batch ||
-                                               !valid_scale_factor;
-
-                bool is_u8_i8 = data_type_traits::is_i8_u8(in0_l.data_type) && data_type_traits::is_i8_u8(in1_l.data_type);
-                bool use_ops_cldnn_kernel = is_u8_i8 || (in0_l.spatial(0) % 16 == 0 && in0_l.spatial(1) % 16 == 0 &&
-                                                         in1_l.spatial(0) % 16 == 0 && in1_l.spatial(1) % 16 == 0);
-
-                // Gemm with k < 64 may be faster in cldnn unless ref impl is used
-                if ((size_k < 64 && use_ops_cldnn_kernel) || unsupported_onednn_gemm) {
-                    impl_candidate = impl_types::ocl;
-                }
             }
         }
 
@@ -9,6 +9,8 @@
 #include "intel_gpu/runtime/debug_configuration.hpp"
 #ifdef ENABLE_ONEDNN_FOR_GPU
 #include "convolution_inst.h"
+#include "gemm_inst.h"
+#include "fully_connected_inst.h"
 #include "deconvolution_inst.h"
 #include "quantize_inst.h"
 #include "reorder_inst.h"
@@ -950,6 +952,28 @@ void program_node::init_onednn_primitive_attributes() {
             auto dep_idx = desc.dep_start_idx;
             auto in = get_dependency(dep_idx).get_output_layout();
 
+            auto set_binary_op = [&](dnnl::algorithm alg, onednn_post_op_type op_type) {
+                if (is_type<fully_connected>()) {
+                    const kernel_impl_params& impl_params = *get_kernel_impl_params();
+                    auto prim = impl_params.typed_desc<fully_connected>();
+                    if (prim->input_size == 3) {
+                        cldnn::onednn::combine_bf_with_first_spatial_dim(in);
+                    }
+                    post_ops.append_binary(alg, onednn::layout_to_memory_desc(in, dnnl::memory::format_tag::ab));
+                    update_onednn_post_op_list(op_type, dep_idx);
+                } else if (is_type<gemm>()) {
+                    size_t rank = cldnn::format::dimension(in.format);
+                    dnnl::memory::dims dims = onednn::convert_gemm_tensor(in.get_tensor(), rank, in.batch() > 1);
+                    dnnl::memory::data_type dt = onednn::convert_data_type(in.data_type);
+                    dnnl::memory::format_tag fmt = onednn::convert_gemm_data_format(dims);
+                    post_ops.append_binary(alg, dnnl::memory::desc(dims, dt, fmt));
+                    update_onednn_post_op_list(op_type, dep_idx);
+                } else {
+                    post_ops.append_binary(alg, onednn::layout_to_memory_desc(in));
+                    update_onednn_post_op_list(op_type, dep_idx);
+                }
+            };
+
             if (desc.typed_desc<eltwise>()->mode == eltwise_mode::sum) {
                 auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*this, cldnn_post_ops[idx]);
                 if (fusing_type == add_fusing_type::sum && num_sum_post_ops == 0) {
@@ -961,18 +985,12 @@ void program_node::init_onednn_primitive_attributes() {
                     update_onednn_post_op_list(onednn_post_op_type::sum, dep_idx);
                     num_sum_post_ops++;
                 } else {
-                    dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in);
-                    post_ops.append_binary(dnnl::algorithm::binary_add, in_desc);
-                    update_onednn_post_op_list(onednn_post_op_type::binary_add, dep_idx);
+                    set_binary_op(dnnl::algorithm::binary_add, onednn_post_op_type::binary_add);
                 }
             } else if (desc.typed_desc<eltwise>()->mode == eltwise_mode::sub) {
-                dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in);
-                post_ops.append_binary(dnnl::algorithm::binary_sub, in_desc);
-                update_onednn_post_op_list(onednn_post_op_type::binary_sub, dep_idx);
+                set_binary_op(dnnl::algorithm::binary_sub, onednn_post_op_type::binary_sub);
             } else if (desc.typed_desc<eltwise>()->mode == eltwise_mode::prod) {
-                dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in, dnnl::memory::format_tag::ab, true);
-                post_ops.append_binary(dnnl::algorithm::binary_mul, in_desc);
-                update_onednn_post_op_list(onednn_post_op_type::binary_mul, dep_idx, dnnl::memory::format_tag::ab, true);
+                set_binary_op(dnnl::algorithm::binary_mul, onednn_post_op_type::binary_mul);
             } else {
                 std::stringstream error_msg;
                 error_msg << "Unsupported eltwise mode: " << static_cast<int>(desc.typed_desc<eltwise>()->mode) << ". ";
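For context, a rough illustrative sketch of the shape handling the fully connected branch above relies on. This is an assumption about what combine_bf_with_first_spatial_dim produces (a 2D view that folds batch and feature together so the post-op operand matches oneDNN's 2-dim FC output), not a copy of the actual helper:

#include <array>
#include <cstdint>

// Hypothetical illustration: oneDNN inner_product emits a 2D output, so a 3D FC
// post-op operand [batch, feature, x] is assumed to be viewed as [batch * feature, x]
// before being appended as a binary post-op with dnnl::memory::format_tag::ab.
std::array<int64_t, 2> fold_bf_for_onednn_fc(int64_t batch, int64_t feature, int64_t x) {
    return {batch * feature, x};
}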
@@ -122,9 +122,9 @@ static void print_help_messages() {
     message_list.emplace_back("OV_GPU_AfterProc", "Run inference after the specified process PIDs are finished, separated by space."
                               " Supported on only on linux.");
     message_list.emplace_back("OV_GPU_SerialCompile", "Serialize creating primitives and compiling kernels");
-    message_list.emplace_back("OV_GPU_ForceImplType", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
-                              " For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:onednn, reduce:ocl, concat:onednn,"
-                              " and concat:ocl are supported");
+    message_list.emplace_back("OV_GPU_ForceImplTypes", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
+                              " For example fc:onednn gemm:onednn reduce:ocl do:cpu"
+                              " For primitives fc, gemm, do, reduce, concat are supported. Separated by space.");
     message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");
 
     auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
@@ -158,7 +158,6 @@ debug_configuration::debug_configuration()
     , dump_layers_limit_batch(std::numeric_limits<int>::max())
     , base_batch_for_memory_estimation(-1)
     , serialize_compile(0)
-    , forced_impl_type(std::string())
     , max_kernels_per_batch(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_gpu_debug_env_var("Help", help);
@@ -181,7 +180,8 @@ debug_configuration::debug_configuration()
     std::string after_proc_str;
     get_gpu_debug_env_var("AfterProc", after_proc_str);
     get_gpu_debug_env_var("SerialCompile", serialize_compile);
-    get_gpu_debug_env_var("ForceImplType", forced_impl_type);
+    std::string forced_impl_types_str;
+    get_gpu_debug_env_var("ForceImplTypes", forced_impl_types_str);
     get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch);
 
     if (help > 0) {
@@ -198,6 +198,15 @@ debug_configuration::debug_configuration()
         }
     }
 
+    if (forced_impl_types_str.length() > 0) {
+        forced_impl_types_str = " " + forced_impl_types_str + " "; // Insert delimiter for easier parsing when used
+        std::stringstream ss(forced_impl_types_str);
+        std::string type;
+        while (ss >> type) {
+            forced_impl_types.push_back(type);
+        }
+    }
+
     if (after_proc_str.length() > 0) {
 #ifdef _WIN32
         GPU_DEBUG_COUT << "Warning: OV_GPU_AfterProc is supported only on linux" << std::endl;
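The parsing added above is plain whitespace tokenization. A self-contained sketch of the same idea, independent of the plugin code, shows how a value such as "fc:onednn gemm:ocl" splits into per-primitive entries:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
    // Same tokenization as the debug_configuration change: split on whitespace.
    std::string forced_impl_types_str = " fc:onednn gemm:ocl ";  // sample OV_GPU_ForceImplTypes value
    std::stringstream ss(forced_impl_types_str);
    std::vector<std::string> forced_impl_types;
    std::string type;
    while (ss >> type) {
        forced_impl_types.push_back(type);
    }
    for (const auto& t : forced_impl_types) {
        std::cout << t << std::endl;  // prints "fc:onednn" then "gemm:ocl"
    }
    return 0;
}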