[GPU] Rollback to cldnn from onednn and some fixes and improvements (#8761)

Ilya Znamenskiy 2021-12-02 11:28:38 +03:00 committed by GitHub
parent 9e3b9b8fbc
commit 607814828d
6 changed files with 146 additions and 70 deletions

View File

@@ -32,6 +32,9 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
k.EnableOutputLayout(DataLayout::bf);
k.EnableOutputLayout(DataLayout::fb);
k.EnableOutputLayout(DataLayout::bfyx);
@@ -41,6 +44,9 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
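
The layouts added here follow clDNN's blocked-format naming: in bs_fs_yx_bsv16_fsv16, batch and feature are the outer dimensions and each is additionally blocked in slices of 16 (bsv16, fsv16), so every (batch block, feature block, y, x) cell holds a 16x16 tile. A minimal sketch of the offset arithmetic implied by that naming (illustrative only; the function is hypothetical, not clDNN's actual tensor code):

#include <cstddef>

// Offset of element (b, f, y, x) in a bs_fs_yx_bsv16_fsv16 tensor with
// F features and a Y-by-X spatial plane. Batch and feature blocks are the
// outermost dimensions; the 16x16 tile is laid out batch-major inside.
std::size_t offset_bsv16_fsv16(std::size_t b, std::size_t f,
                               std::size_t y, std::size_t x,
                               std::size_t F, std::size_t Y, std::size_t X) {
    constexpr std::size_t BSV = 16, FSV = 16;
    const std::size_t f_blocks = (F + FSV - 1) / FSV;  // features rounded up to whole blocks
    const std::size_t cell = ((b / BSV) * f_blocks + f / FSV) * Y * X + y * X + x;
    return cell * BSV * FSV + (b % BSV) * FSV + (f % FSV);
}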

View File

@@ -8,9 +8,11 @@
#include "program_node.h"
#include "layout_optimizer.h"
#include "cldnn/graph/program.hpp"
#include "cldnn/runtime/debug_configuration.hpp"
#include "program_helpers.h"
#include "binary_convolution_inst.h"
#include "mvn_inst.h"
#include "to_string_utils.h"
#include <vector>
#include <memory>
@@ -18,29 +20,6 @@
#include <map>
#include <set>
#define CLDNN_REORDER_INPUTS_VERBOSE 0
// Prints overall statistics of performed selection, such as number of reorders required.
#define CLDNN_REORDER_INPUTS_VERBOSE_STATISTICS (CLDNN_REORDER_INPUTS_VERBOSE > 0)
// Prints special cases and work-arounds matched.
#define CLDNN_REORDER_INPUTS_VERBOSE_PATTERN_MATCH (CLDNN_REORDER_INPUTS_VERBOSE > 1)
// Prints full list of preferred formats for each node.
#define CLDNN_REORDER_INPUTS_VERBOSE_PREFERRED (CLDNN_REORDER_INPUTS_VERBOSE > 2)
// Prints full list of selected formats for each node.
#define CLDNN_REORDER_INPUTS_VERBOSE_FORMATS (CLDNN_REORDER_INPUTS_VERBOSE > 2)
#if CLDNN_REORDER_INPUTS_VERBOSE
#include "to_string_utils.h"
#include <iostream>
#define CLDNN_REORDER_INPUTS_LOG(x) std::cout << "[clDNN][reorder_inputs] " << x << std::endl
#endif
#if CLDNN_REORDER_INPUTS_VERBOSE_PATTERN_MATCH
#define CLDNN_REORDER_INPUTS_PATTERN_MATCH_LOG(desc, id) CLDNN_REORDER_INPUTS_LOG(id << " matched for pattern: " << desc)
#else
#define CLDNN_REORDER_INPUTS_PATTERN_MATCH_LOG(desc, id) do { } while (false)
#endif
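
The removed block above implemented compile-time verbosity switches; the rest of this diff replaces them with runtime checks against debug_config->verbose (overall statistics at verbose >= 1, per-node format listings at verbose >= 2). The replacement pattern, as it appears inside a pass body later in this diff:

#include "cldnn/runtime/debug_configuration.hpp"

// Runtime gating instead of the removed #if CLDNN_REORDER_INPUTS_VERBOSE_* blocks:
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
    GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Preferred formats:" << std::endl;
}
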
using namespace cldnn;
// TODO: remove friendship relation from program
@@ -52,7 +31,17 @@ void reorder_inputs::run(program& p) { run(p, _lo, _rf); }
namespace {
std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
GPU_DEBUG_GET_INSTANCE(debug_config);
std::map<program_node*, format::type> fmt_map;
#ifdef ENABLE_ONEDNN_FOR_GPU
size_t onednn_impls_counter = 0;
size_t all_impls_counter = 0;
const float onednn_min_threshold = 0.1f;
bool should_update_fmt_map = false;
// Count the oneDNN kernels and the total number of kernels in the network
for (auto n : p.get_processing_order()) {
if (!n->is_in_data_flow())
continue;
@@ -62,6 +51,51 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
fmt_map[n] = ex;
n->set_preferred_impl_type(impl);
if (impl == impl_types::onednn)
onednn_impls_counter++;
all_impls_counter++;
}
float onednn_usage_ratio = all_impls_counter ? static_cast<float>(onednn_impls_counter) / static_cast<float>(all_impls_counter) : 0.f;
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
GPU_DEBUG_COUT << "Onednn kernels number: " << onednn_impls_counter << " from " << all_impls_counter
<< " (" << onednn_usage_ratio * 100.f << "%)" << std::endl;
GPU_DEBUG_COUT << "Onednn usage threshold: " << onednn_min_threshold * 100.f << "%" << std::endl;
}
// Revert to the clDNN path when the share of oneDNN kernels in the whole network is extremely low:
// the gains from oneDNN would then be outweighed by unoptimized formats for the clDNN kernels, extra reorders, etc.
if (onednn_usage_ratio < onednn_min_threshold && lo.get_optimization_attributes().use_onednn_impls) {
should_update_fmt_map = true;
lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "The return to clDNN implementations" << std::endl;
}
}
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
}
#endif // ENABLE_ONEDNN_FOR_GPU
#ifdef ENABLE_ONEDNN_FOR_GPU
// Re-run format selection only when the rollback above disabled oneDNN impls;
// when oneDNN support is compiled out, this pass runs unconditionally.
if (should_update_fmt_map)
#endif
{
for (auto n : p.get_processing_order()) {
if (!n->is_in_data_flow())
continue;
auto ex = lo.get_preferred_format(*n);
auto impl = lo.get_preferred_impl_type(*n, ex);
fmt_map[n] = ex;
n->set_preferred_impl_type(impl);
}
}
return fmt_map;
}
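
Stripped of the graph traversal, the rollback decision above reduces to a single ratio test. A self-contained sketch (the helper name is hypothetical; the threshold is the one defined above):

#include <cstddef>

// Revert to clDNN when fewer than 10% of the data-flow nodes picked a oneDNN
// implementation: the benefit of a few oneDNN kernels would be outweighed by
// the unoptimized formats and extra reorders imposed on the clDNN majority.
bool should_rollback_to_cldnn(std::size_t onednn_impls, std::size_t all_impls) {
    constexpr float onednn_min_threshold = 0.1f;
    const float ratio = all_impls
        ? static_cast<float>(onednn_impls) / static_cast<float>(all_impls)
        : 0.f;
    return ratio < onednn_min_threshold;
}
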
@@ -408,34 +442,34 @@ void insert_reorders(program& p, const std::map<program_node*, format::type>& fm
} // namespace
void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) {
GPU_DEBUG_GET_INSTANCE(debug_config);
auto fmt_map = get_preferred_formats(p, lo);
#if CLDNN_REORDER_INPUTS_VERBOSE_PREFERRED
{
CLDNN_REORDER_INPUTS_LOG("Preferred formats:");
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Preferred formats:" << std::endl;
for (auto& node_fmt : fmt_map) {
if (node_fmt.second != format::any) {
CLDNN_REORDER_INPUTS_LOG(" " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second));
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
}
}
}
#endif
propagate_formats(p, fmt_map, lo);
minimize_local_reorders(p, fmt_map, lo);
#if CLDNN_REORDER_INPUTS_VERBOSE_FORMATS
{
CLDNN_REORDER_INPUTS_LOG("Selected formats:");
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Selected formats:" << std::endl;
for (auto node_ptr : p.get_processing_order()) {
if (fmt_map.count(node_ptr) == 0)
continue;
auto fmt = fmt_map.at(node_ptr);
CLDNN_REORDER_INPUTS_LOG(" " << node_ptr->id() << " " << fmt_to_str(fmt));
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
}
}
#endif
#if CLDNN_REORDER_INPUTS_VERBOSE_STATISTICS
{
GPU_DEBUG_IF(debug_config->verbose >= 1) {
reorder_cnt total_reorder_count = std::accumulate(
p.get_processing_order().begin(),
p.get_processing_order().end(),
@@ -447,8 +481,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
return reorder_cnt{ total.number + count.number, total.total_sizes + count.total_sizes };
});
// Divide the result by two, as the accumulation above counts each reorder from both sides
CLDNN_REORDER_INPUTS_LOG("Total number of reorders: " << total_reorder_count.number / 2);
CLDNN_REORDER_INPUTS_LOG("Total elements count of all reorders: " << total_reorder_count.total_sizes / 2);
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;
// Count number of reorders that will be fused
size_t nodes_with_fusing = 0;
@@ -464,9 +498,9 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
}
}
}
CLDNN_REORDER_INPUTS_LOG("Number of nodes with fused reorders: " << nodes_with_fusing);
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
}
#endif
insert_reorders(p, fmt_map, rf, lo);
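
The halving in the statistics above is needed because the accumulation visits every node and counts each reorder once at its producer and once at its consumer. A toy illustration with hypothetical values:

#include <cstddef>

struct reorder_cnt { std::size_t number; std::size_t total_sizes; };

int main() {
    // One reorder of 1024 elements between nodes A and B is accumulated twice,
    // once from each side, so halving recovers the true totals.
    reorder_cnt from_a{1, 1024};  // counted at A's output
    reorder_cnt from_b{1, 1024};  // counted again at B's input
    std::size_t reorders = (from_a.number + from_b.number) / 2;            // 1
    std::size_t elements = (from_a.total_sizes + from_b.total_sizes) / 2;  // 1024
    return (reorders == 1 && elements == 1024) ? 0 : 1;
}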

View File

@@ -154,38 +154,50 @@ namespace detail {
attach_convolution_impl::attach_convolution_impl() {
implementation_map<convolution>::add(impl_types::ocl, convolution_impl::create, {
std::make_tuple(data_types::f32, format::yxfb),
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),
std::make_tuple(data_types::f32, format::yxfb),
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),
std::make_tuple(data_types::f32, format::winograd_2x3_s1_data),
std::make_tuple(data_types::f16, format::winograd_2x3_s1_data),
std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv4),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::u8, format::byxf),
std::make_tuple(data_types::i8, format::byxf),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv4),
std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
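
Each attach_* block here registers the (data type, format) pairs one primitive supports with implementation_map; the deconvolution and MVN lists below follow the same pattern. In sketch form such a registry is a map from the pair to a kernel factory (simplified, hypothetical types, not clDNN's actual implementation_map API):

#include <functional>
#include <map>
#include <utility>

enum class data_type { f16, f32, i8, u8 };
enum class fmt { bfyx, yxfb, b_fs_yx_fsv16, bs_fs_zyx_bsv16_fsv16 };

using impl_key = std::pair<data_type, fmt>;
using impl_factory = std::function<void*()>;  // stand-in for the kernel creator

std::map<impl_key, impl_factory>& registry() {
    static std::map<impl_key, impl_factory> r;
    return r;
}

// Registration: one entry per supported (data type, format) combination.
void add_impl(data_type dt, fmt f, impl_factory make) {
    registry().emplace(impl_key{dt, f}, std::move(make));
}

// Lookup: a combination is usable only if it was registered.
bool is_supported(data_type dt, fmt f) {
    return registry().count(impl_key{dt, f}) != 0;
}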

View File

@@ -119,13 +119,17 @@ attach_deconvolution_impl::attach_deconvolution_impl() {
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::i8, format::bfyx),
@@ -134,10 +138,14 @@ attach_deconvolution_impl::attach_deconvolution_impl() {
std::make_tuple(data_types::u8, format::bfzyx),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
});

View File

@@ -80,6 +80,17 @@ attach_mvn_impl::attach_mvn_impl() {
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
// TODO: uncomment this code when fsv32 optimizations for MVN are implemented
/*std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),*/
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),

View File

@@ -867,7 +867,8 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
onednn_valid_post_ops = false;
break;
}
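
The new needs_onednn_sum_post_op(in_layout) term narrows this check: a float/int type mismatch on a fused eltwise dependency disqualifies oneDNN only when that eltwise would actually be lowered to a oneDNN sum post-op. Distilled into a standalone predicate (a hypothetical helper; the real check lives inline above and recurs twice more below):

// All four conditions must hold for the fused eltwise to block the oneDNN impl.
bool eltwise_blocks_onednn(bool same_element_count, bool in_is_float,
                           bool out_is_float, bool types_differ,
                           bool is_sum_post_op) {
    return same_element_count && (in_is_float || out_is_float) &&
           types_differ && is_sum_post_op;
}
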
@@ -890,6 +891,10 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
/* ***************************** OneDNN impls format selection part ****************************** */
bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0);
// TODO: uncomment this code when the corresponding fsv32 optimizations inside clDNN are implemented
// bool i8_u8_output = output_layout.data_type == data_types::u8 || output_layout.data_type == data_types::i8;
// bool is_first_conv = input_layout.size.feature[0] < 4;
if (i8_u8_input) {
if ((non_grouped || valid_grouped || valid_int8_dw) && onednn_valid_post_ops && is_2d) {
if (input_layout.size.batch[0] % 16 == 0) {
@@ -929,14 +934,21 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
} else if (input_layout.data_type == data_types::f16 &&
convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_layout, weights_layout, prim) &&
(output_layout.data_type == input_layout.data_type ||
!data_type_traits::is_floating_point(input_layout.data_type))) {
!data_type_traits::is_floating_point(input_layout.data_type)) && is_2d) {
expected_tensor = current_layout.size;
if (prim->groups == 1 || (output_layout.size.feature[0] % 16 == 0 && input_layout.size.feature[0] % 16 == 0)) {
expected_format = cldnn::format::bs_fs_yx_bsv32_fsv16;
} else {
expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
}
}
} // TODO: add this case when the corresponding fsv32 optimizations inside clDNN are implemented
//else if (input_layout.data_type == data_types::f32 && i8_u8_output && !is_first_conv && is_2d) {
// if (input_layout.size.batch[0] % 16 == 0) {
// expected_format = cldnn::format::bs_fs_yx_bsv32_fsv32;
// } else {
// expected_format = cldnn::format::b_fs_yx_fsv32;
// }
//}
} else {
/* *************************** Native impls format selection part ************************** */
if (i8_u8_input) {
@@ -1293,7 +1305,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
impl_candidate = impl_types::ocl;
break;
}
@@ -1352,7 +1365,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
impl_candidate = impl_types::ocl;
break;
}
@@ -1380,11 +1394,11 @@
format layout_optimizer::get_preferred_format(program_node& node) {
format expected = format::any;
auto output_layout = node.get_output_layout();
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
expected = _forcing_map.at(node.id()).first;
} else if (node.is_type<convolution>()) {
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
auto& conv_node = node.as<convolution>();
auto weights_layout = conv_node.weights(0).get_output_layout();
expected = get_expected_layout(output_layout, conv_node, weights_layout).format;
@@ -1400,21 +1414,12 @@ format layout_optimizer::get_preferred_format(program_node& node) {
auto& bconv_node = node.as<binary_convolution>();
auto weights_layout = bconv_node.weights(0).get_output_layout();
expected = get_expected_layout(output_layout, bconv_node, weights_layout).format;
} else if (node.is_type<pooling>() && _optimization_attributes.use_onednn_impls) {
auto in_layout = node.get_dependency(0).get_output_layout();
if (output_layout.size.batch[0] % 16 == 0 || output_layout.size.batch[0] == 8) {
if (!data_type_traits::is_floating_point(in_layout.data_type) && in_layout.data_type != output_layout.data_type) {
expected = format::b_fs_yx_fsv16;
}
}
} else if (node.is_type<detection_output>()) {
expected = get_expected_layout(
output_layout,
node.as<detection_output>(),
layout{ data_types::f32, format::bfyx, tensor{} }).format;
} else if (node.is_type<quantize>()) {
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
auto layout = node.get_output_layout();
if (layout.format.spatial_num() == 2 &&
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&