[GPU] Do not add paddings for OneDNN primitives (#8619)

* Fix review comments for Convolutions
* [GPU] Do not add paddings for OneDNN primitives
* Add unit test for correct padding between onednn and cldnn primitives

parent 90e10e369d
commit be786ee7af
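The heart of the change is the new `add_required_padding` lambda in `prepare_padding` below: onednn kernels neither read padded offsets nor write padded outputs, so the pass now skips padding preparation for onednn-implemented nodes and, when an OCL kernel that needs padded input consumes an onednn primitive's output, inserts an explicit reorder to carry that padding. A minimal standalone sketch of the decision, using simplified stand-in types (`impl_types`, `node_desc`) rather than the real clDNN classes:

```cpp
#include <iostream>

// Simplified stand-ins for cldnn's impl_types enum and program_node class;
// the names mirror the diff below, but these types are illustrative only.
enum class impl_types { ocl, onednn };

struct node_desc {
    impl_types impl;          // implementation chosen for this node
    bool needs_padded_input;  // true if the kernel requires a padded input layout
};

// Mirrors the condition in the add_required_padding lambda: a reorder is
// needed only when an onednn producer (which cannot emit padded outputs)
// feeds an OCL consumer that actually requires padding.
bool requires_padding_reorder(const node_desc& producer, const node_desc& consumer) {
    return producer.impl == impl_types::onednn &&
           consumer.impl == impl_types::ocl &&
           consumer.needs_padded_input;
}

int main() {
    node_desc conv1{impl_types::onednn, false};  // like "conv1" in the new unit test
    node_desc conv2{impl_types::ocl, true};      // like "conv2", forced to an OCL kernel
    std::cout << std::boolalpha
              << requires_padding_reorder(conv1, conv2) << '\n';  // prints: true
}
```

In the real pass the reorder is created with the producer's own output layout, and `apply_needed_padding` is then applied to the reorder node, so the padded buffer belongs to the reorder while the onednn primitive's output stays tightly packed.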
@@ -17,10 +17,30 @@ void prepare_padding::run(program& p) {
     if (output_size_handling_enabled) {
         // Prepare upper padding for primitives that support output_size parameter.
         for (const auto& node : p.get_processing_order()) {
+            if (node->get_dependencies().empty())
+                continue;
+
+            if (node->get_dependency(0).is_type<data>())
+                continue;
+
             // Padded offsets aren't supported by onednn kernels
             if (node->get_preferred_impl_type() == impl_types::onednn)
                 continue;
 
+            auto add_required_padding = [&p](program_node& node, padding& needed_padding) {
+                // Add extra reorder for cldnn primitive to handle required padding if needed
+                auto& input = node.get_dependency(0);
+                if (input.get_preferred_impl_type() == impl_types::onednn &&
+                    node.get_preferred_impl_type() == impl_types::ocl &&
+                    static_cast<bool>(needed_padding)) {
+                    auto new_reorder = std::make_shared<reorder>(node.id() + "_padding_reorder_for_" + input.id(), input.id(), input.get_output_layout());
+                    auto& new_reorder_node = p.get_or_create(new_reorder);
+                    p.add_intermediate(new_reorder_node, node, input);
+                }
+
+                p.apply_needed_padding(node, node.get_dependency(0), needed_padding);
+            };
+
             if (node->is_type<convolution>()) {
                 auto& prim_node = node->as<convolution>();
                 const auto& prim = prim_node.get_primitive();

@@ -36,10 +56,6 @@ void prepare_padding::run(program& p) {
                     format == format::b_fs_zyx_fsv32)
                     continue;
 
-                if (prim_node.input().is_type<data>()) {
-                    continue;
-                }
-
                 auto filter_size = prim_node.weights(0).get_output_layout().size;
 
                 auto needed_padding = calc_sliding_window_needed_input_padding(prim_node.input().get_output_layout(),

@@ -51,7 +67,7 @@ void prepare_padding::run(program& p) {
                                                                                false,
                                                                                1);
 
-                p.apply_needed_padding(prim_node, prim_node.input(), needed_padding);
+                add_required_padding(prim_node, needed_padding);
             } else if (node->is_type<deconvolution>()) {
                 auto& prim_node = node->as<deconvolution>();
                 const auto& prim = prim_node.get_primitive();

@@ -59,10 +75,6 @@ void prepare_padding::run(program& p) {
                 if (!prim->with_output_size)
                     continue;
 
-                if (prim_node.input().is_type<data>()) {
-                    continue;
-                }
-
                 auto filter_size = prim_node.weights(0).get_output_layout().size;
 
                 auto needed_padding = calc_sliding_window_needed_input_padding(prim_node.input().get_output_layout(),

@@ -74,7 +86,7 @@ void prepare_padding::run(program& p) {
                                                                                true,
                                                                                1);
 
-                p.apply_needed_padding(prim_node, prim_node.input(), needed_padding);
+                add_required_padding(prim_node, needed_padding);
             } else if (node->is_type<pooling>()) {
                 auto& prim_node = node->as<pooling>();
                 const auto& prim = prim_node.get_primitive();

@@ -82,10 +94,6 @@ void prepare_padding::run(program& p) {
                 if (!prim->with_output_size)
                     continue;
 
-                if (prim_node.input().is_type<data>()) {
-                    continue;
-                }
-
                 padding needed_padding;
                 // WA for this format. sliding window needs to be fixed --perf degradation for IncepctionV1 type models
                 if (node->get_output_layout().format == format::b_fs_yx_fsv16)

@@ -100,17 +108,13 @@ void prepare_padding::run(program& p) {
                 else
                     needed_padding = prim_node.input().get_output_layout().data_padding;
 
-                p.apply_needed_padding(prim_node, prim_node.input(), needed_padding);
+                add_required_padding(prim_node, needed_padding);
             } else if (node->is_type<binary_convolution>()) {
                 auto& prim_node = node->as<binary_convolution>();
 
-                if (prim_node.input().is_type<data>()) {
-                    continue;
-                }
-
                 auto needed_padding = prim_node.input().get_output_layout().data_padding;
 
-                p.apply_needed_padding(prim_node, prim_node.input(), needed_padding);
+                add_required_padding(prim_node, needed_padding);
             }
         }
     }

@@ -828,7 +828,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
         int ofm_per_group = output_layout.size.feature[0] / prim->groups;
         int ifm_per_group = input_layout.size.feature[0] / prim->groups;
         int compute_block = 32;
-        bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0);
         bool valid_int8_dw = is_dw && output_layout.size.batch[0] % 16 == 0;
         bool non_grouped = prim->groups == 1;
         bool is_2d = input_layout.format.spatial_num() == 2;

@@ -854,6 +853,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
 
         if (use_onednn_impls) {
             /* ***************************** OneDNN impls format selection part ****************************** */
+            bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0);
             if (i8_u8_input) {
                 if ((non_grouped || valid_grouped || valid_int8_dw) && onednn_valid_post_ops && is_2d) {
                     if (input_layout.size.batch[0] % 16 == 0) {

@@ -774,18 +774,18 @@ void program::add_intermediate(program_node& node,
 }
 
 void program::add_intermediate(std::shared_ptr<primitive> prim,
                                program_node& next,
                                size_t prev_idx,
                                bool connect_int_node_with_old_dep,
                                bool move_usrs_of_prev_to_node) {
     add_intermediate(get_or_create(prim), next, prev_idx, connect_int_node_with_old_dep, move_usrs_of_prev_to_node);
 }
 
 void program::add_intermediate(program_node& node,
                                program_node& next,
                                program_node& prev,
                                bool connect_int_node_with_old_dep,
                                bool move_usrs_of_prev_to_node) {
     bool node_found = false;
     size_t idx = 0;
     for (size_t i = 0; i < next.get_dependencies().size(); i++) {

@@ -9066,4 +9066,81 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
     }
 }
 
+TEST(convolution_gpu_onednn, padding_for_cldnn_kernel_after_onednn) {
+    auto& engine = get_onednn_test_engine();
+
+    int input_b = 1, input_f = 16, input_y = 3, input_x = 3;
+    int output_b = 1, output_f = 16, output_y = 6, output_x = 6;
+
+    auto input_size = tensor(input_b, input_f, input_x, input_y);
+    auto input_data = generate_random_4d<FLOAT16>(input_b, input_f, input_y, input_x, -1, 1);
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size });
+    set_values(input_mem, input_data_bfyx);
+
+    auto weights_size = tensor(16, 16, 1, 1, 1);
+    auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, 1, 1, -1, 1);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+    auto weights_mem = engine.allocate_memory({data_types::f16, format::bfyx, weights_size});
+    set_values(weights_mem, weights_data_bfyx);
+
+    auto input = input_layout("input", input_mem->get_layout());
+    auto weights = data("weights", weights_mem);
+    auto input_reorder = reorder("input_fsv", "input", {data_types::f16, format::b_fs_yx_fsv16, input_size});
+    auto conv1 = convolution("conv1", "input_fsv", { "weights" });
+    auto conv2 = convolution("conv2", "conv1", { "weights" }, {1, 1, 1, 1}, {0, 0, -1, -1}, {1, 1, 1, 1}, {output_b, output_f, output_x, output_x});
+    auto output_reorder = reorder("reorder", "conv2", {data_types::f32, format::bfyx, {output_b, output_f, output_x, output_x}});
+
+    topology topology_test(input, weights, input_reorder, conv1, conv2, output_reorder);
+    topology topology_ref(input, weights, input_reorder, conv1, conv2, output_reorder);
+
+    build_options options_test;
+    implementation_desc conv1_impl_test = { format::b_fs_yx_fsv16, "", impl_types::onednn };
+    implementation_desc conv2_impl_test = { format::b_fs_yx_fsv16, "convolution_gpu_bfyx_f16", impl_types::ocl };
+    options_test.set_option(build_option::force_implementations({ {"conv1", conv1_impl_test}, {"conv2", conv2_impl_test} }));
+    options_test.set_option(build_option::optimize_data(true));
+
+    build_options options_ref;
+    implementation_desc conv1_impl_ref = { format::bfyx, "", impl_types::ocl };
+    implementation_desc conv2_impl_ref = { format::bfyx, "", impl_types::ocl };
+    options_ref.set_option(build_option::force_implementations({ {"conv1", conv1_impl_ref}, {"conv2", conv2_impl_ref} }));
+    options_ref.set_option(build_option::optimize_data(true));
+
+    network network_test(engine, topology_test, options_test);
+    network network_ref(engine, topology_ref, options_ref);
+
+    network_test.set_input_data("input", input_mem);
+    network_ref.set_input_data("input", input_mem);
+
+    auto outputs_test = network_test.execute();
+    auto outputs_ref = network_ref.execute();
+
+    EXPECT_EQ(outputs_test.size(), size_t(1));
+    EXPECT_EQ(outputs_test.begin()->first, "reorder");
+    EXPECT_EQ(outputs_ref.size(), size_t(1));
+    EXPECT_EQ(outputs_ref.begin()->first, "reorder");
+
+    auto output_memory_test = outputs_test.at("reorder").get_memory();
+    auto output_layout_test = output_memory_test->get_layout();
+    cldnn::mem_lock<float> output_ptr_test(output_memory_test, get_test_stream());
+
+    auto output_memory_ref = outputs_ref.at("reorder").get_memory();
+    auto output_layout_ref = output_memory_ref->get_layout();
+    cldnn::mem_lock<float> output_ptr_ref(output_memory_ref, get_test_stream());
+
+    EXPECT_EQ(output_layout_test.size.spatial[0], output_x);
+    EXPECT_EQ(output_layout_test.size.spatial[1], output_y);
+    EXPECT_EQ(output_layout_test.size.feature[0], output_f);
+    EXPECT_EQ(output_layout_test.size.batch[0], output_b);
+
+    EXPECT_EQ(output_layout_ref.size.spatial[0], output_x);
+    EXPECT_EQ(output_layout_ref.size.spatial[1], output_y);
+    EXPECT_EQ(output_layout_ref.size.feature[0], output_f);
+    EXPECT_EQ(output_layout_ref.size.batch[0], output_b);
+
+    for (size_t i = 0; i < output_memory_ref->count(); i++) {
+        ASSERT_EQ(output_ptr_ref.data()[i], output_ptr_test.data()[i]);
+    }
+}
+
 #endif // ENABLE_ONEDNN_FOR_GPU