[GPU] Rollback to cldnn from onednn and some fixes and improvements (#8761)
This commit is contained in:
parent 9e3b9b8fbc
commit 607814828d
@@ -32,6 +32,9 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
k.EnableOutputLayout(DataLayout::bf);
k.EnableOutputLayout(DataLayout::fb);
k.EnableOutputLayout(DataLayout::bfyx);
@@ -41,6 +44,9 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();

@@ -8,9 +8,11 @@
#include "program_node.h"
#include "layout_optimizer.h"
#include "cldnn/graph/program.hpp"
#include "cldnn/runtime/debug_configuration.hpp"
#include "program_helpers.h"
#include "binary_convolution_inst.h"
#include "mvn_inst.h"
#include "to_string_utils.h"

#include <vector>
#include <memory>
@@ -18,29 +20,6 @@
#include <map>
#include <set>

#define CLDNN_REORDER_INPUTS_VERBOSE 0

// Prints overall statistics of performed selection, such as number of reorders required.
#define CLDNN_REORDER_INPUTS_VERBOSE_STATISTICS (CLDNN_REORDER_INPUTS_VERBOSE > 0)
// Prints special cases and work-arounds matched.
#define CLDNN_REORDER_INPUTS_VERBOSE_PATTERN_MATCH (CLDNN_REORDER_INPUTS_VERBOSE > 1)
// Prints full list of preferred formats for each node.
#define CLDNN_REORDER_INPUTS_VERBOSE_PREFERRED (CLDNN_REORDER_INPUTS_VERBOSE > 2)
// Prints full list of selected formats for each node.
#define CLDNN_REORDER_INPUTS_VERBOSE_FORMATS (CLDNN_REORDER_INPUTS_VERBOSE > 2)

#if CLDNN_REORDER_INPUTS_VERBOSE
#include "to_string_utils.h"
#include <iostream>
#define CLDNN_REORDER_INPUTS_LOG(x) std::cout << "[clDNN][reorder_inputs] " << x << std::endl
#endif

#if CLDNN_REORDER_INPUTS_VERBOSE_PATTERN_MATCH
#define CLDNN_REORDER_INPUTS_PATTERN_MATCH_LOG(desc, id) CLDNN_REORDER_INPUTS_LOG(id << " matched for pattern: " << desc)
#else
#define CLDNN_REORDER_INPUTS_PATTERN_MATCH_LOG(desc, id) do { } while (false)
#endif

using namespace cldnn;

// ToDo remove friendship relation from program
@@ -52,7 +31,17 @@ void reorder_inputs::run(program& p) { run(p, _lo, _rf); }
namespace {

std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
GPU_DEBUG_GET_INSTANCE(debug_config);

std::map<program_node*, format::type> fmt_map;

#ifdef ENABLE_ONEDNN_FOR_GPU
size_t onednn_impls_counter = 0;
size_t all_impls_counter = 0;
const float onednn_min_threshold = 0.1f;
bool should_update_fmt_map = false;

// Calculate onednn kernels number and all kernels number inside the network
for (auto n : p.get_processing_order()) {
if (!n->is_in_data_flow())
continue;
@@ -62,6 +51,51 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_o
fmt_map[n] = ex;

n->set_preferred_impl_type(impl);

if (impl == impl_types::onednn)
onednn_impls_counter++;

all_impls_counter++;
}

float onednn_usage_ratio = all_impls_counter ? static_cast<float>(onednn_impls_counter) / static_cast<float>(all_impls_counter) : 0.f;

GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
GPU_DEBUG_COUT << "Onednn kernels number: " << onednn_impls_counter << " from " << all_impls_counter
<< " (" << onednn_usage_ratio * 100.f << "%)" << std::endl;
GPU_DEBUG_COUT << "Onednn usage threshold: " << onednn_min_threshold * 100.f << "%" << std::endl;
}

// Reverted to cldnn way for cases when onednn kernels number inside the whole network is extremely low =>
// improvements from onednn usage less than losses due to unoptimized formats for cldnn kernels, extra reorders, etc.
if (onednn_usage_ratio < onednn_min_threshold && lo.get_optimization_attributes().use_onednn_impls) {
should_update_fmt_map = true;
lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "The return to clDNN implementations" << std::endl;
}
}

GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
}
#endif // ENABLE_ONEDNN_FOR_GPU

#ifdef ENABLE_ONEDNN_FOR_GPU
if (should_update_fmt_map)
#endif
{
for (auto n : p.get_processing_order()) {
if (!n->is_in_data_flow())
continue;

auto ex = lo.get_preferred_format(*n);
auto impl = lo.get_preferred_impl_type(*n, ex);
fmt_map[n] = ex;

n->set_preferred_impl_type(impl);
}
}
return fmt_map;
}
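
The hunk above contains the actual rollback heuristic: count how many data-flow nodes would pick a oneDNN implementation, and if their share stays below onednn_min_threshold (10%), clear the use_onednn_impls attribute and redo format selection with plain clDNN kernels. Below is a minimal standalone sketch of that decision rule, not part of the diff; the Impl enum, Node struct, and example network are hypothetical stand-ins for the clDNN program_node/impl_types machinery, not the real API.

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-ins for program_node / impl_types, only for illustration.
enum class Impl { ocl, onednn };
struct Node { bool in_data_flow; Impl preferred; };

// True when the share of oneDNN-preferring nodes is too small to pay off:
// the few oneDNN kernels would cost more in extra reorders and unoptimized
// clDNN formats than they gain.
bool should_fall_back_to_cldnn(const std::vector<Node>& nodes, float min_ratio = 0.1f) {
    std::size_t onednn_impls = 0, all_impls = 0;
    for (const auto& n : nodes) {
        if (!n.in_data_flow)
            continue;                    // skip nodes outside the data flow
        if (n.preferred == Impl::onednn)
            ++onednn_impls;
        ++all_impls;
    }
    const float ratio = all_impls ? static_cast<float>(onednn_impls) / static_cast<float>(all_impls) : 0.f;
    return ratio < min_ratio;
}

int main() {
    std::vector<Node> net(40, Node{true, Impl::ocl});
    net[0].preferred = net[1].preferred = Impl::onednn;      // 2 of 40 -> 5%, below the 10% threshold
    std::cout << std::boolalpha << should_fall_back_to_cldnn(net) << "\n";  // prints: true
}

Note how the diff guards the recomputation with should_update_fmt_map: when ENABLE_ONEDNN_FOR_GPU is defined, the second loop only runs if the fallback fired, while in builds without oneDNN the same braced block is the primary format-selection loop, which is why the if is wrapped in #ifdef rather than folded into the condition.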
@@ -408,34 +442,34 @@ void insert_reorders(program& p, const std::map<program_node*, format::type>& fm
} // namespace

void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) {
GPU_DEBUG_GET_INSTANCE(debug_config);

auto fmt_map = get_preferred_formats(p, lo);
#if CLDNN_REORDER_INPUTS_VERBOSE_PREFERRED
{
CLDNN_REORDER_INPUTS_LOG("Preferred formats:");

GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Preferred formats:" << std::endl;
for (auto& node_fmt : fmt_map) {
if (node_fmt.second != format::any) {
CLDNN_REORDER_INPUTS_LOG(" " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second));
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
}
}
}
#endif

propagate_formats(p, fmt_map, lo);
minimize_local_reorders(p, fmt_map, lo);

#if CLDNN_REORDER_INPUTS_VERBOSE_FORMATS
{
CLDNN_REORDER_INPUTS_LOG("Selected formats:");
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Selected formats:" << std::endl;
for (auto node_ptr : p.get_processing_order()) {
if (fmt_map.count(node_ptr) == 0)
continue;

auto fmt = fmt_map.at(node_ptr);
CLDNN_REORDER_INPUTS_LOG(" " << node_ptr->id() << " " << fmt_to_str(fmt));
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
}
}
#endif
#if CLDNN_REORDER_INPUTS_VERBOSE_STATISTICS
{

GPU_DEBUG_IF(debug_config->verbose >= 1) {
reorder_cnt total_reorder_count = std::accumulate(
p.get_processing_order().begin(),
p.get_processing_order().end(),
@@ -447,8 +481,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
return reorder_cnt{ total.number + count.number, total.total_sizes + count.total_sizes };
});
// Divide results by two as above function will each reorder from both sides
CLDNN_REORDER_INPUTS_LOG("Total number of reorders: " << total_reorder_count.number / 2);
CLDNN_REORDER_INPUTS_LOG("Total elements count of all reorders: " << total_reorder_count.total_sizes / 2);
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;

// Count number of reorders that will be fused
size_t nodes_with_fusing = 0;
@@ -464,9 +498,9 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
}
}
}
CLDNN_REORDER_INPUTS_LOG("Number of nodes with fused reorders: " << nodes_with_fusing);
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
}
#endif

insert_reorders(p, fmt_map, rf, lo);
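
These hunks also swap the compile-time CLDNN_REORDER_INPUTS_VERBOSE* macros for runtime checks on debug_config->verbose (level 1 for overall statistics, level 2 for per-node format dumps), so the reorder statistics no longer require a rebuild. A small sketch of that pattern, assuming a simplified config struct rather than the real cldnn::debug_configuration and GPU_DEBUG_* macros:

#include <cstddef>
#include <iostream>

// Hypothetical stand-in for cldnn::debug_configuration: verbosity becomes a
// runtime value (e.g. parsed from an environment variable) rather than a macro.
struct debug_config_stub {
    int verbose = 0;
};

void report_reorders(const debug_config_stub& cfg, std::size_t reorder_count, std::size_t fused_count) {
    if (cfg.verbose >= 1) {   // statistics level, mirrors debug_config->verbose >= 1 in the diff
        std::cout << "[clDNN][reorder_inputs] Total number of reorders: " << reorder_count << std::endl;
        std::cout << "[clDNN][reorder_inputs] Number of nodes with fused reorders: " << fused_count << std::endl;
    }
}

int main() {
    debug_config_stub cfg;
    cfg.verbose = 1;               // previously this meant editing CLDNN_REORDER_INPUTS_VERBOSE and recompiling
    report_reorders(cfg, 14, 3);   // prints both statistics lines
}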

@@ -154,38 +154,50 @@ namespace detail {

attach_convolution_impl::attach_convolution_impl() {
implementation_map<convolution>::add(impl_types::ocl, convolution_impl::create, {
std::make_tuple(data_types::f32, format::yxfb),
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),

std::make_tuple(data_types::f32, format::yxfb),
std::make_tuple(data_types::f16, format::yxfb),

std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),

std::make_tuple(data_types::f32, format::winograd_2x3_s1_data),
std::make_tuple(data_types::f16, format::winograd_2x3_s1_data),

std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),

std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv4),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::u8, format::byxf),
std::make_tuple(data_types::i8, format::byxf),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),

std::make_tuple(data_types::u8, format::b_fs_yx_fsv4),
std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),

std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),

std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),

std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),

std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),

std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),

@@ -119,13 +119,17 @@ attach_deconvolution_impl::attach_deconvolution_impl() {
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::i8, format::bfyx),
@@ -134,10 +138,14 @@ attach_deconvolution_impl::attach_deconvolution_impl() {
std::make_tuple(data_types::u8, format::bfzyx),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
});

@@ -80,6 +80,17 @@ attach_mvn_impl::attach_mvn_impl() {
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),

// TODO: uncomment this code when fsv32 optimizations for MVN will be implemented
/*std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),

std::make_tuple(data_types::f32, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),*/

std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),

@@ -867,7 +867,8 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
onednn_valid_post_ops = false;
break;
}
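
The same one-line tightening appears again in the two get_preferred_impl_type hunks further down: a fused eltwise with mismatched float/integer data types now disqualifies oneDNN only if it would actually be lowered as a oneDNN sum post-op. A simplified sketch of the resulting predicate; the data_type enum, the helper, and the boolean flag standing in for needs_onednn_sum_post_op are illustrative, not the clDNN API:

#include <cstdint>

enum class data_type { f16, f32, i8, u8 };

bool is_floating_point(data_type dt) {
    return dt == data_type::f16 || dt == data_type::f32;
}

// Mirrors the condition in the diff: the fused eltwise blocks the oneDNN path
// only when every clause holds, including the new sum-post-op requirement.
bool eltwise_blocks_onednn(std::int64_t in_count, std::int64_t out_count,
                           data_type in_dt, data_type out_dt,
                           bool needs_onednn_sum_post_op) {
    return in_count == out_count &&
           (is_floating_point(in_dt) || is_floating_point(out_dt)) &&
           in_dt != out_dt &&
           needs_onednn_sum_post_op;   // new clause: a plain eltwise post-op no longer forces the clDNN path
}

Under this check, an f16/i8 eltwise fused as a regular binary post-op keeps the oneDNN candidate, while the same dtype mix used as a sum post-op still falls back to the OCL implementation, as in the hunks below.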
@@ -890,6 +891,10 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,

/* ***************************** OneDNN impls format selection part ****************************** */
bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0);
// TODO: uncomment this code when corresponding fsv32 optimizations inside clDNN will be implemented
// bool i8_u8_output = output_layout.data_type == data_types::u8 || output_layout.data_type == data_types::i8;
// bool is_first_conv = input_layout.size.feature[0] < 4;

if (i8_u8_input) {
if ((non_grouped || valid_grouped || valid_int8_dw) && onednn_valid_post_ops && is_2d) {
if (input_layout.size.batch[0] % 16 == 0) {
@@ -929,14 +934,21 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
} else if (input_layout.data_type == data_types::f16 &&
convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_layout, weights_layout, prim) &&
(output_layout.data_type == input_layout.data_type ||
!data_type_traits::is_floating_point(input_layout.data_type))) {
!data_type_traits::is_floating_point(input_layout.data_type)) && is_2d) {
expected_tensor = current_layout.size;
if (prim->groups == 1 || (output_layout.size.feature[0] % 16 == 0 && input_layout.size.feature[0] % 16 == 0)) {
expected_format = cldnn::format::bs_fs_yx_bsv32_fsv16;
} else {
expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
}
}
} // TODO: add this case when corresponding fsv32 optimizations inside clDNN will be implemented
//else if (input_layout.data_type == data_types::f32 && i8_u8_output && !is_first_conv && is_2d) {
// if (input_layout.size.batch[0] % 16 == 0) {
// expected_format = cldnn::format::bs_fs_yx_bsv32_fsv32;
// } else {
// expected_format = cldnn::format::b_fs_yx_fsv32;
// }
//}
} else {
/* *************************** Native impls format selection part ************************** */
if (i8_u8_input) {
@@ -1293,7 +1305,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
impl_candidate = impl_types::ocl;
break;
}
@@ -1352,7 +1365,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
impl_candidate = impl_types::ocl;
break;
}
@@ -1380,11 +1394,11 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
format layout_optimizer::get_preferred_format(program_node& node) {
format expected = format::any;
auto output_layout = node.get_output_layout();
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;

if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
expected = _forcing_map.at(node.id()).first;
} else if (node.is_type<convolution>()) {
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
auto& conv_node = node.as<convolution>();
auto weights_layout = conv_node.weights(0).get_output_layout();
expected = get_expected_layout(output_layout, conv_node, weights_layout).format;
@@ -1400,21 +1414,12 @@ format layout_optimizer::get_preferred_format(program_node& node) {
auto& bconv_node = node.as<binary_convolution>();
auto weights_layout = bconv_node.weights(0).get_output_layout();
expected = get_expected_layout(output_layout, bconv_node, weights_layout).format;
} else if (node.is_type<pooling>() && _optimization_attributes.use_onednn_impls) {
auto in_layout = node.get_dependency(0).get_output_layout();

if (output_layout.size.batch[0] % 16 == 0 || output_layout.size.batch[0] == 8) {
if (!data_type_traits::is_floating_point(in_layout.data_type) && in_layout.data_type != output_layout.data_type) {
expected = format::b_fs_yx_fsv16;
}
}
} else if (node.is_type<detection_output>()) {
expected = get_expected_layout(
output_layout,
node.as<detection_output>(),
layout{ data_types::f32, format::bfyx, tensor{} }).format;
} else if (node.is_type<quantize>()) {
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
auto layout = node.get_output_layout();
if (layout.format.spatial_num() == 2 &&
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&