[GPU] Rollback to cldnn from onednn and some fixes and improvements (#8761)

commit 607814828d (parent 9e3b9b8fbc)
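In short:
- reorder_inputs now counts how many nodes would take a oneDNN implementation; when that share drops below a 10% threshold it clears use_onednn_impls and recomputes preferred formats with clDNN only
- the compile-time CLDNN_REORDER_INPUTS_VERBOSE logging macros are replaced by the runtime GPU_DEBUG verbose mechanism
- concatenation, convolution and deconvolution register additional blocked layouts (bs_fs_yx_bsv16_fsv16, bs_fs_yx_bsv32_fsv16, bs_fs_yx_bsv32_fsv32, b_fs_yx_fsv32); the fsv32 entries for MVN stay commented out until the kernels are optimized
- the oneDNN-to-OCL fallback for fused eltwise now triggers only when the eltwise would actually become a oneDNN sum post-op
- the oneDNN-specific pooling preferred-format special case is removed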
@@ -32,6 +32,9 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
     k.EnableOutputLayout(DataLayout::bf);
     k.EnableOutputLayout(DataLayout::fb);
     k.EnableOutputLayout(DataLayout::bfyx);
@@ -41,6 +44,9 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableBatching();
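For readers outside clDNN: b_fs_yx_fsv16 is a blocked layout that stores the feature axis in slices of 16 as the innermost dimension; the double-blocked bs_fs_yx_bsv16_fsv16 additionally blocks batch by 16. A minimal sketch of the offset arithmetic for the double-blocked case, assuming the conventional batch-outer/feature-inner block order and ignoring the padding and pitch handling behind EnableTensorOffset/EnableTensorPitches (offset_bsv16_fsv16 is a hypothetical helper, not a clDNN function):

#include <cstddef>

// Linear offset of element (b, f, y, x) in a bs_fs_yx_bsv16_fsv16 buffer
// with logical extents F, Y, X; feature is rounded up to whole 16-slices.
std::size_t offset_bsv16_fsv16(std::size_t b, std::size_t f,
                               std::size_t y, std::size_t x,
                               std::size_t F, std::size_t Y, std::size_t X) {
    const std::size_t BS = 16, FS = 16;              // bsv16 / fsv16 block sizes
    const std::size_t f_blocks = (F + FS - 1) / FS;  // number of feature slices
    // Outer order: [b/16][f/16][y][x]; inner order: [b%16][f%16].
    return ((((b / BS) * f_blocks + f / FS) * Y + y) * X + x) * BS * FS
           + (b % BS) * FS + (f % FS);
}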
@@ -8,9 +8,11 @@
 #include "program_node.h"
 #include "layout_optimizer.h"
 #include "cldnn/graph/program.hpp"
+#include "cldnn/runtime/debug_configuration.hpp"
 #include "program_helpers.h"
 #include "binary_convolution_inst.h"
 #include "mvn_inst.h"
+#include "to_string_utils.h"

 #include <vector>
 #include <memory>
@@ -18,29 +20,6 @@
 #include <map>
 #include <set>

-#define CLDNN_REORDER_INPUTS_VERBOSE 0
-
-// Prints overall statistics of performed selection, such as number of reorders required.
-#define CLDNN_REORDER_INPUTS_VERBOSE_STATISTICS (CLDNN_REORDER_INPUTS_VERBOSE > 0)
-// Prints special cases and work-arounds matched.
-#define CLDNN_REORDER_INPUTS_VERBOSE_PATTERN_MATCH (CLDNN_REORDER_INPUTS_VERBOSE > 1)
-// Prints full list of preferred formats for each node.
-#define CLDNN_REORDER_INPUTS_VERBOSE_PREFERRED (CLDNN_REORDER_INPUTS_VERBOSE > 2)
-// Prints full list of selected formats for each node.
-#define CLDNN_REORDER_INPUTS_VERBOSE_FORMATS (CLDNN_REORDER_INPUTS_VERBOSE > 2)
-
-#if CLDNN_REORDER_INPUTS_VERBOSE
-#include "to_string_utils.h"
-#include <iostream>
-#define CLDNN_REORDER_INPUTS_LOG(x) std::cout << "[clDNN][reorder_inputs] " << x << std::endl
-#endif
-
-#if CLDNN_REORDER_INPUTS_VERBOSE_PATTERN_MATCH
-#define CLDNN_REORDER_INPUTS_PATTERN_MATCH_LOG(desc, id) CLDNN_REORDER_INPUTS_LOG(id << " matched for pattern: " << desc)
-#else
-#define CLDNN_REORDER_INPUTS_PATTERN_MATCH_LOG(desc, id) do { } while (false)
-#endif
-
 using namespace cldnn;

 // ToDo remove friendship relation from program
@@ -52,7 +31,17 @@ void reorder_inputs::run(program& p) { run(p, _lo, _rf); }
 namespace {

 std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+
     std::map<program_node*, format::type> fmt_map;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    size_t onednn_impls_counter = 0;
+    size_t all_impls_counter = 0;
+    const float onednn_min_threshold = 0.1f;
+    bool should_update_fmt_map = false;
+
+    // Calculate onednn kernels number and all kernels number inside the network
     for (auto n : p.get_processing_order()) {
         if (!n->is_in_data_flow())
             continue;
@@ -62,6 +51,51 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_o
         fmt_map[n] = ex;

         n->set_preferred_impl_type(impl);
+
+        if (impl == impl_types::onednn)
+            onednn_impls_counter++;
+
+        all_impls_counter++;
+    }
+
+    float onednn_usage_ratio = all_impls_counter ? static_cast<float>(onednn_impls_counter) / static_cast<float>(all_impls_counter) : 0.f;
+
+    GPU_DEBUG_IF(debug_config->verbose >= 1) {
+        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
+        GPU_DEBUG_COUT << "Onednn kernels number: " << onednn_impls_counter << " from " << all_impls_counter
+                       << " (" << onednn_usage_ratio * 100.f << "%)" << std::endl;
+        GPU_DEBUG_COUT << "Onednn usage threshold: " << onednn_min_threshold * 100.f << "%" << std::endl;
+    }
+
+    // Reverted to cldnn way for cases when onednn kernels number inside the whole network is extremely low =>
+    // improvements from onednn usage less than losses due to unoptimized formats for cldnn kernels, extra reorders, etc.
+    if (onednn_usage_ratio < onednn_min_threshold && lo.get_optimization_attributes().use_onednn_impls) {
+        should_update_fmt_map = true;
+        lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0);
+        GPU_DEBUG_IF(debug_config->verbose >= 1) {
+            GPU_DEBUG_COUT << "The return to clDNN implementations" << std::endl;
+        }
+    }
+
+    GPU_DEBUG_IF(debug_config->verbose >= 1) {
+        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
+    }
+#endif // ENABLE_ONEDNN_FOR_GPU
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    if (should_update_fmt_map)
+#endif
+    {
+        for (auto n : p.get_processing_order()) {
+            if (!n->is_in_data_flow())
+                continue;
+
+            auto ex = lo.get_preferred_format(*n);
+            auto impl = lo.get_preferred_impl_type(*n, ex);
+            fmt_map[n] = ex;
+
+            n->set_preferred_impl_type(impl);
+        }
     }
     return fmt_map;
 }
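The core of the rollback above is a single ratio test; a minimal standalone sketch of it, with the program/layout_optimizer plumbing stripped away (should_rollback_to_cldnn is a hypothetical name):

#include <cstddef>

// True when so few nodes map to oneDNN that the extra reorders and
// clDNN-unfriendly formats it forces would outweigh its kernel-level gains.
bool should_rollback_to_cldnn(std::size_t onednn_impls, std::size_t all_impls) {
    const float onednn_min_threshold = 0.1f;  // same 10% cut-off as the patch
    const float ratio = all_impls
        ? static_cast<float>(onednn_impls) / static_cast<float>(all_impls)
        : 0.f;
    return ratio < onednn_min_threshold;
}

For example, a network with 200 data-flow nodes of which only 12 prefer oneDNN gives a ratio of 0.06 < 0.1, so the pass resets use_onednn_impls and reruns format selection with clDNN implementations only.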
@@ -408,34 +442,34 @@ void insert_reorders(program& p, const std::map<program_node*, format::type>& fm
 } // namespace

 void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+
     auto fmt_map = get_preferred_formats(p, lo);
-#if CLDNN_REORDER_INPUTS_VERBOSE_PREFERRED
-    {
-        CLDNN_REORDER_INPUTS_LOG("Preferred formats:");
+    GPU_DEBUG_IF(debug_config->verbose >= 2) {
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Preferred formats:" << std::endl;
         for (auto& node_fmt : fmt_map) {
             if (node_fmt.second != format::any) {
-                CLDNN_REORDER_INPUTS_LOG(" " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second));
+                GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
             }
         }
     }
-#endif
+
     propagate_formats(p, fmt_map, lo);
     minimize_local_reorders(p, fmt_map, lo);

-#if CLDNN_REORDER_INPUTS_VERBOSE_FORMATS
-    {
-        CLDNN_REORDER_INPUTS_LOG("Selected formats:");
+    GPU_DEBUG_IF(debug_config->verbose >= 2) {
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Selected formats:" << std::endl;
         for (auto node_ptr : p.get_processing_order()) {
             if (fmt_map.count(node_ptr) == 0)
                 continue;

             auto fmt = fmt_map.at(node_ptr);
-            CLDNN_REORDER_INPUTS_LOG(" " << node_ptr->id() << " " << fmt_to_str(fmt));
+            GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
         }
     }
-#endif
-#if CLDNN_REORDER_INPUTS_VERBOSE_STATISTICS
-    {
+
+    GPU_DEBUG_IF(debug_config->verbose >= 1) {
         reorder_cnt total_reorder_count = std::accumulate(
             p.get_processing_order().begin(),
             p.get_processing_order().end(),
@@ -447,8 +481,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
             return reorder_cnt{ total.number + count.number, total.total_sizes + count.total_sizes };
         });
         // Divide results by two as above function will each reorder from both sides
-        CLDNN_REORDER_INPUTS_LOG("Total number of reorders: " << total_reorder_count.number / 2);
-        CLDNN_REORDER_INPUTS_LOG("Total elements count of all reorders: " << total_reorder_count.total_sizes / 2);
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;

         // Count number of reorders that will be fused
         size_t nodes_with_fusing = 0;
@@ -464,9 +498,9 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                 }
             }
         }
-        CLDNN_REORDER_INPUTS_LOG("Number of nodes with fused reorders: " << nodes_with_fusing);
+        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
+        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
     }
-#endif

     insert_reorders(p, fmt_map, rf, lo);
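Net effect of the hunks above: the statistics that previously required rebuilding with CLDNN_REORDER_INPUTS_VERBOSE set are now available at runtime, printed when debug_config->verbose >= 1 (per-node preferred/selected format dumps at >= 2) via the debug_configuration header added earlier in this patch; the verbose level is presumably set through the GPU debug environment variable of this release (e.g. OV_GPU_Verbose).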
@@ -154,38 +154,50 @@ namespace detail {

 attach_convolution_impl::attach_convolution_impl() {
     implementation_map<convolution>::add(impl_types::ocl, convolution_impl::create, {
-        std::make_tuple(data_types::f32, format::yxfb),
-        std::make_tuple(data_types::f16, format::yxfb),
         std::make_tuple(data_types::f32, format::bfyx),
         std::make_tuple(data_types::f16, format::bfyx),
         std::make_tuple(data_types::i8, format::bfyx),
         std::make_tuple(data_types::u8, format::bfyx),
+
+        std::make_tuple(data_types::f32, format::yxfb),
+        std::make_tuple(data_types::f16, format::yxfb),
+
         std::make_tuple(data_types::f32, format::bfzyx),
         std::make_tuple(data_types::f16, format::bfzyx),
         std::make_tuple(data_types::i8, format::bfzyx),
         std::make_tuple(data_types::u8, format::bfzyx),
+
         std::make_tuple(data_types::f32, format::winograd_2x3_s1_data),
         std::make_tuple(data_types::f16, format::winograd_2x3_s1_data),
+
+        std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
+
         std::make_tuple(data_types::f32, format::byxf),
         std::make_tuple(data_types::f16, format::byxf),
-        std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
-        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
-        std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
-        std::make_tuple(data_types::u8, format::b_fs_yx_fsv4),
-        std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
-        std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
-        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
-        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::u8, format::byxf),
         std::make_tuple(data_types::i8, format::byxf),
-        std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
+
         std::make_tuple(data_types::u8, format::b_fs_yx_fsv4),
-        std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
+
+        std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
+
         std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
+
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
+
+        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),
+
         std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
@@ -119,13 +119,17 @@ attach_deconvolution_impl::attach_deconvolution_impl() {
         std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
         std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
+        std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
         std::make_tuple(data_types::f16, format::yxfb),
         std::make_tuple(data_types::f16, format::bfyx),
         std::make_tuple(data_types::f16, format::bfzyx),
         std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
         std::make_tuple(data_types::f32, format::byxf),
         std::make_tuple(data_types::f16, format::byxf),
         std::make_tuple(data_types::i8, format::bfyx),
@@ -134,10 +138,14 @@ attach_deconvolution_impl::attach_deconvolution_impl() {
         std::make_tuple(data_types::u8, format::bfzyx),
         std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
         std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
         std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
         std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
         std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
+        std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
+        std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
         std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
     });
@@ -80,6 +80,17 @@ attach_mvn_impl::attach_mvn_impl() {
         std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
         std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
+
+        // TODO: uncomment this code when fsv32 optimizations for MVN will be implemented
+        /*std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
+
+        std::make_tuple(data_types::f32, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::f16, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
+        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),*/
+
         std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
         std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
         std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
@ -867,7 +867,8 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
|
|||||||
auto in_dt = in_layout.data_type;
|
auto in_dt = in_layout.data_type;
|
||||||
auto out_dt = out_layout.data_type;
|
auto out_dt = out_layout.data_type;
|
||||||
if ((out_layout.count() == in_layout.count()) &&
|
if ((out_layout.count() == in_layout.count()) &&
|
||||||
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
|
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
|
||||||
|
fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
|
||||||
onednn_valid_post_ops = false;
|
onednn_valid_post_ops = false;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
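Note on the condition change (repeated in two matching hunks of get_preferred_impl_type below): previously any fused eltwise whose input and output data types differed, with a floating-point type on either side, invalidated oneDNN post-ops; the added needs_onednn_sum_post_op(in_layout) call restricts the fallback to fusions that would actually lower to a oneDNN sum post-op.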
@@ -890,6 +891,10 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,

         /* ***************************** OneDNN impls format selection part ****************************** */
         bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0);
+        // TODO: uncomment this code when corresponding fsv32 optimizations inside clDNN will be implemented
+        // bool i8_u8_output = output_layout.data_type == data_types::u8 || output_layout.data_type == data_types::i8;
+        // bool is_first_conv = input_layout.size.feature[0] < 4;
+
         if (i8_u8_input) {
             if ((non_grouped || valid_grouped || valid_int8_dw) && onednn_valid_post_ops && is_2d) {
                 if (input_layout.size.batch[0] % 16 == 0) {
@@ -929,14 +934,21 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
         } else if (input_layout.data_type == data_types::f16 &&
                    convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_layout, weights_layout, prim) &&
                    (output_layout.data_type == input_layout.data_type ||
-                    !data_type_traits::is_floating_point(input_layout.data_type))) {
+                    !data_type_traits::is_floating_point(input_layout.data_type)) && is_2d) {
             expected_tensor = current_layout.size;
             if (prim->groups == 1 || (output_layout.size.feature[0] % 16 == 0 && input_layout.size.feature[0] % 16 == 0)) {
                 expected_format = cldnn::format::bs_fs_yx_bsv32_fsv16;
             } else {
                 expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
             }
-        }
+        } // TODO: add this case when corresponding fsv32 optimizations inside clDNN will be implemented
+        //else if (input_layout.data_type == data_types::f32 && i8_u8_output && !is_first_conv && is_2d) {
+        //    if (input_layout.size.batch[0] % 16 == 0) {
+        //        expected_format = cldnn::format::bs_fs_yx_bsv32_fsv32;
+        //    } else {
+        //        expected_format = cldnn::format::b_fs_yx_fsv32;
+        //    }
+        //}
     } else {
         /* *************************** Native impls format selection part ************************** */
         if (i8_u8_input) {
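Two things happen in this hunk: the f16 bs_fs_yx_bsv32_fsv16 / bsv16_fsv16 path gains an is_2d guard, and the f32-input/int8-output fsv32 branch is kept only as commented-out code (together with the i8_u8_output / is_first_conv helpers a few hunks above) until clDNN gains the corresponding fsv32-optimized kernels.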
@@ -1293,7 +1305,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
                 auto in_dt = in_layout.data_type;
                 auto out_dt = out_layout.data_type;
                 if ((out_layout.count() == in_layout.count()) &&
-                    (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
+                    (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
+                    fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
                     impl_candidate = impl_types::ocl;
                     break;
                 }
@@ -1352,7 +1365,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
                 auto in_dt = in_layout.data_type;
                 auto out_dt = out_layout.data_type;
                 if ((out_layout.count() == in_layout.count()) &&
-                    (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
+                    (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
+                    fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
                     impl_candidate = impl_types::ocl;
                     break;
                 }
@@ -1380,11 +1394,11 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
 format layout_optimizer::get_preferred_format(program_node& node) {
     format expected = format::any;
     auto output_layout = node.get_output_layout();
+    bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
+
     if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
         expected = _forcing_map.at(node.id()).first;
     } else if (node.is_type<convolution>()) {
-        bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
         auto& conv_node = node.as<convolution>();
         auto weights_layout = conv_node.weights(0).get_output_layout();
         expected = get_expected_layout(output_layout, conv_node, weights_layout).format;
|
|||||||
auto& bconv_node = node.as<binary_convolution>();
|
auto& bconv_node = node.as<binary_convolution>();
|
||||||
auto weights_layout = bconv_node.weights(0).get_output_layout();
|
auto weights_layout = bconv_node.weights(0).get_output_layout();
|
||||||
expected = get_expected_layout(output_layout, bconv_node, weights_layout).format;
|
expected = get_expected_layout(output_layout, bconv_node, weights_layout).format;
|
||||||
} else if (node.is_type<pooling>() && _optimization_attributes.use_onednn_impls) {
|
|
||||||
auto in_layout = node.get_dependency(0).get_output_layout();
|
|
||||||
|
|
||||||
if (output_layout.size.batch[0] % 16 == 0 || output_layout.size.batch[0] == 8) {
|
|
||||||
if (!data_type_traits::is_floating_point(in_layout.data_type) && in_layout.data_type != output_layout.data_type) {
|
|
||||||
expected = format::b_fs_yx_fsv16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (node.is_type<detection_output>()) {
|
} else if (node.is_type<detection_output>()) {
|
||||||
expected = get_expected_layout(
|
expected = get_expected_layout(
|
||||||
output_layout,
|
output_layout,
|
||||||
node.as<detection_output>(),
|
node.as<detection_output>(),
|
||||||
layout{ data_types::f32, format::bfyx, tensor{} }).format;
|
layout{ data_types::f32, format::bfyx, tensor{} }).format;
|
||||||
} else if (node.is_type<quantize>()) {
|
} else if (node.is_type<quantize>()) {
|
||||||
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
|
|
||||||
auto layout = node.get_output_layout();
|
auto layout = node.get_output_layout();
|
||||||
if (layout.format.spatial_num() == 2 &&
|
if (layout.format.spatial_num() == 2 &&
|
||||||
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
|
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
|
||||||
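In get_preferred_format, use_onednn_impls is now read once at function scope (see the previous hunk) instead of per-branch, and the oneDNN-only pooling special case, which forced b_fs_yx_fsv16 for integer-input pooling with a differing output type when the batch was 8 or a multiple of 16, is dropped, consistent with the rollback.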