[GPU] Enable shuffle and fsv32 in implicit concat (#9888)

[GPU] Enable shuffle and fsv32 in implicit concat

* Support shuffle and fsv32 in implicit concat.
* Check feature depths of the input dependencies (every input except the last must be block-aligned).
* Select the oneDNN convolution implementation in get_preferred_impl_type() when the output format is blocked.

Signed-off-by: hyunback <hyunback.kim@intel.com>
hyunback kim 2022-02-18 09:40:14 +09:00 committed by GitHub
parent b6a75d7d91
commit 215db2dad8
4 changed files with 332 additions and 19 deletions
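
For context on why the patch cares about feature-depth alignment, here is a minimal, stand-alone sketch (none of these helper names exist in clDNN): with an implicit, in-place concatenation along the f-axis, each input is written directly into the shared output buffer at an f-offset equal to the sum of the preceding inputs' feature depths. In a blocked layout such as b_fs_yx_fsv16 or b_fs_yx_fsv32 that offset must land on a block boundary, which is why every input except the last needs a feature depth that is a multiple of the block size.

// Stand-alone sketch (hypothetical helpers, not clDNN API): check that every
// non-first input of an in-place concat along f starts on an f-block boundary.
#include <cstddef>
#include <vector>

// f-offset at which input `k` is written into the shared concat output buffer.
size_t f_offset_of_input(const std::vector<size_t>& feature_depths, size_t k) {
    size_t offset = 0;
    for (size_t i = 0; i < k; ++i)
        offset += feature_depths[i];
    return offset;
}

// True if every input after the first starts on a multiple of the block size
// (16 for *_fsv16, 32 for *_fsv32); the last input itself may be ragged.
bool implicit_concat_offsets_aligned(const std::vector<size_t>& feature_depths, size_t f_block) {
    for (size_t k = 1; k < feature_depths.size(); ++k)
        if (f_offset_of_input(feature_depths, k) % f_block != 0)
            return false;
    return true;
}

Under this rule, inputs {16, 8} are fine for fsv16 because the 8-channel input is last, while {8, 16} only works once the input order is shuffled — which is what the concat_input_order pass changed below enables.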


@@ -131,10 +131,12 @@ void concat_input_order::run(program& p) {
     bool no_fusing = !concat_node.has_fused_primitives() && concat_node.get_dependencies().size() == inputs_count;
     auto out_format = concat_node.get_output_layout().format;
-    bool correct_format = out_format == format::b_fs_yx_fsv16;
+    bool correct_format = (out_format == format::b_fs_yx_fsv16) || (out_format == format::b_fs_yx_fsv32);
     tensor::value_type alignment = 1;
     if (out_format == format::b_fs_yx_fsv16)
         alignment = 16;
+    else if (out_format == format::b_fs_yx_fsv32)
+        alignment = 32;
     bool single_format = true;
     std::vector<tensor::value_type> feature_sizes;
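
In other words, the pass now treats b_fs_yx_fsv32 the same way it already treated b_fs_yx_fsv16, only with a 32-wide f-block. As the surrounding context suggests, this alignment is then used to validate the collected feature_sizes before the concat inputs may be reordered. A condensed sketch of the selection (an assumed reading of the hunk, with hypothetical helper names):

// Sketch: the f-block size implied by the concat output format, and the
// per-input alignment test it feeds. 1 means "no blocking constraint".
#include <cstdint>

int32_t concat_f_alignment(bool is_fsv16, bool is_fsv32) {
    if (is_fsv16) return 16;
    if (is_fsv32) return 32;
    return 1;
}

bool feature_depth_aligned(int32_t feature_depth, int32_t alignment) {
    return feature_depth % alignment == 0;
}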


@@ -120,10 +120,13 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
     }
     // For in place concatenation input layouts and data types must match.
+    // Also, it checks whether data along f-axis is aligned properly for implicit concat.
+    // Otherwise, use explicit concat instead.
     auto output_format = node.get_output_layout().format;
     auto output_datatype = node.get_output_layout().data_type;
     auto concat_axis = node.get_primitive()->axis;
+    size_t idx = 0;
     for (auto& input : node.get_dependencies()) {
         if (input->is_type<reshape>())
             // reshapes should be optimized out.
@@ -134,24 +137,25 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
         if (output_format != l.format || output_datatype != l.data_type)
             return false;
         if (l.format.block_sizes().size() > 1)
             return false;
         // TODO: Below condition should be moved to program_node::supports_padding.
-        // This hovewer will need updating the algorithm as it may make cascade adjustment impossible in some cases.
-        // It hovewer would make normal optimizations possible in others, so this is a trade-off to be investigated.
-        if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
-            return false;
+        // This however will need updating the algorithm as it may make cascade adjustment impossible in some cases.
+        // It however would make normal optimizations possible in others, so this is a trade-off to be investigated.
+        if (idx != node.get_dependencies().size() - 1) {
+            if ((l.format == format::b_fs_yx_fsv16 || l.format == format::b_fs_zyx_fsv16) &&
+                (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
+                return false;
-        if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
-            return false;
+            if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
+                (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
+                return false;
-        if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
-            (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
-            return false;
         if (l.format == format::bs_fs_yx_bsv16_fsv16)
             return false;
-        if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f))
-            return false;
+            if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 4 || node.get_primitive()->axis != concatenation::along_f))
+                return false;
+        }
+        idx++;
     }
     auto lower_padd_in_axis = node.get_output_layout().data_padding.lower_size().raw[concat_axis];
@@ -159,7 +163,7 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
                 node.get_dependency(0).get_output_layout().data_padding.lower_size().raw[concat_axis]);
     // check if concatenation in place can be applied for inputs set
-    size_t idx = 0;
+    idx = 0;
     for (auto input : node.get_dependencies()) {
         // reverted condition - if any of this node's inputs is used by more than one primitive
         // and is not optimized concatenation then do not fuse buffers
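
The practical effect of the new idx bookkeeping, expressed as a stand-alone predicate rather than the real match() logic (an assumed reading of the hunk): only the non-last inputs of an in-place concat along f have to be block-aligned, because nothing is written after the last input, so its ragged tail cannot push a neighbour off a block boundary.

// Sketch (hypothetical helper, not cldnn code): per-input alignment rule for
// an in-place concat along the feature axis in a blocked layout.
#include <cstddef>
#include <vector>

bool in_place_concat_inputs_ok(const std::vector<size_t>& input_feature_depths,
                               size_t f_block /* 16 for *_fsv16, 32 for *_fsv32 */) {
    for (size_t idx = 0; idx < input_feature_depths.size(); ++idx) {
        bool is_last = (idx == input_feature_depths.size() - 1);
        // A misaligned middle input would shift every following input off a
        // block boundary, so the pass falls back to an explicit concat kernel.
        if (!is_last && input_feature_depths[idx] % f_block != 0)
            return false;
    }
    return true;
}

For fsv32 this accepts {32, 8} directly; the {8, 32} case in the tests further down presumably still ends up implicit because concat_input_order has already shuffled it into {32, 8} before this check runs.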


@@ -1494,10 +1494,12 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
         for (auto& dep : node.get_dependencies()) {
             if (dep->is_in_data_flow() && dep->get_preferred_impl_type() == impl_types::onednn) {
-                preferred_impl = impl_types::onednn;
-                break;
+                return impl_types::onednn;
             }
         }
+        if (format::is_blocked(node.get_output_layout().format)) {
+            return impl_types::onednn;
+        }
         // TODO: uncomment this code when onednn gemm implementations will have real perf improvements vs cldnn
     } else if (node.is_type<fully_connected>()/* || node.is_type<gemm>()*/) {
         if (!_optimization_attributes.use_onednn_impls)
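
Condensed, the convolution's implementation choice after this hunk reads roughly like the sketch below (an interpretation of the diff, not the full get_preferred_impl_type logic, which also honours use_onednn_impls and handles other node types):

// Sketch with hypothetical types: prefer oneDNN for a convolution when a
// data-flow dependency already resolved to oneDNN, or when its own output
// layout uses a blocked format such as b_fs_yx_fsv16 / b_fs_yx_fsv32.
enum class impl_kind { ocl, onednn };

impl_kind preferred_conv_impl(bool any_dep_is_onednn, bool output_format_is_blocked) {
    if (any_dep_is_onednn || output_format_is_blocked)
        return impl_kind::onednn;
    return impl_kind::ocl;
}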


@@ -784,6 +784,152 @@ INSTANTIATE_TEST_SUITE_P(smoke_low_precision,
),
concat_gpu::PrintToStringParamName);
template <typename Type>
struct concat_gpu_4d_implicit : public concat_gpu {
public:
cldnn::memory::ptr run_concat_network(std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input, format::type fmt, build_options options) {
auto data_type = type_to_data_type<Type>::value;
auto& engine = get_test_engine();
const size_t batch_num = testing::get<0>(GetParam());
const std::vector<size_t> in_features = testing::get<1>(GetParam());
const size_t input_y = testing::get<2>(GetParam());
const size_t input_x = testing::get<3>(GetParam());
size_t output_f = 0;
for (auto& f : in_features)
output_f += f;
topology topology;
std::vector<memory::ptr> in_memory;
std::vector<primitive_id> input_ids;
std::vector<primitive_id> pooling_ids;
for (size_t i = 0; i < in_features.size(); i++) {
auto size = tensor(static_cast<int32_t>(batch_num),
static_cast<int32_t>(in_features[i]),
static_cast<int32_t>(input_x),
static_cast<int32_t>(input_y));
auto data = input[i];
auto in_lay = layout(data_type, fmt, size);
auto data_flat = std::vector<Type>(in_lay.get_linear_size(), 0);
for (size_t bi = 0; bi < batch_num; ++bi) {
for (size_t fi = 0; fi < in_features[i]; ++fi) {
for (size_t yi = 0; yi < input_y; ++yi) {
for (size_t xi = 0; xi < input_x; ++xi) {
auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
auto in_offset = in_lay.get_linear_offset(coords);
data_flat[in_offset] = data[bi][fi][yi][xi];
}
}
}
}
auto in_mem = engine.allocate_memory(in_lay);
set_values(in_mem, data_flat);
in_memory.push_back(in_mem);
topology.add(input_layout("input" + std::to_string(i), in_lay));
topology.add(pooling("pool" + std::to_string(i), "input" + std::to_string(i), pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
input_ids.push_back("input" + std::to_string(i));
pooling_ids.push_back("pool" + std::to_string(i));
}
topology.add(concatenation("concat", pooling_ids, concatenation::concatenation_axis::along_f));
auto weights_lay = cldnn::layout(data_type, cldnn::format::bfyx, tensor(batch(output_f), feature(output_f)));
auto weights_mem = engine.allocate_memory(weights_lay);
weights_mem->fill(get_test_stream());
get_test_stream().finish();
{
cldnn::mem_lock<Type> weights_ptr(weights_mem, get_test_stream());
for (size_t fi = 0; fi < output_f; ++fi) {
auto coords = tensor(batch(fi), feature(fi), spatial(0, 0, 0, 0));
auto offset = weights_lay.get_linear_offset(coords);
weights_ptr[offset] = static_cast<Type>(1.f);
}
}
topology.add(data("weights" , weights_mem));
topology.add(convolution("conv", "concat", { "weights" }));
topology.add(pooling("pool_final", "conv", pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
topology.add(reorder("reorder", "pool_final", layout(data_type, format::bfyx, {(int32_t)batch_num, (int32_t)output_f, (int32_t)input_y, (int32_t)input_x})));
network concat_network(engine, topology, options);
for (size_t i = 0; i < in_features.size(); i++) {
concat_network.set_input_data(input_ids[i], in_memory[i]);
}
concat_network.execute();
for (auto i : concat_network.get_primitives_info()) {
// std::cout << " " << i.original_id << " " << i.kernel_id << std::endl;
if (i.original_id == "concat") {
if (options.get<build_option_type::optimize_data>()->enabled()) {
EXPECT_TRUE(i.kernel_id == "undef");
} else {
EXPECT_FALSE(i.kernel_id == "undef");
}
}
}
return concat_network.get_output("reorder").get_memory();
}
std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> generate_input() {
const size_t batch_num = testing::get<0>(GetParam());
const std::vector<size_t> in_features = testing::get<1>(GetParam());
const size_t input_y = testing::get<2>(GetParam());
const size_t input_x = testing::get<3>(GetParam());
std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input(in_features.size());
for (size_t i = 0; i < in_features.size(); ++i) {
input[i] = generate_random_4d<Type>(batch_num, in_features[i], input_y, input_x, -1, 1);
}
return input;
}
void test(format::type fmt) {
auto input = generate_input();
// implicit concat
build_options options1;
options1.set_option(build_option::optimize_data(true));
auto out_mem1 = run_concat_network(input, fmt, options1);
cldnn::mem_lock<Type> out_ptr1(out_mem1, get_test_stream());
// explicit concat
build_options options2;
options2.set_option(build_option::optimize_data(false));
auto out_mem2 = run_concat_network(input, fmt, options2);
cldnn::mem_lock<Type> out_ptr2(out_mem2, get_test_stream());
EXPECT_EQ(out_ptr1.size(), out_ptr2.size());
size_t diff_count = 0;
for (size_t i = 0; i < out_ptr1.size(); ++i) {
if (out_ptr1[i] != out_ptr2[i]) diff_count++;
}
EXPECT_EQ(diff_count, 0);
}
};
using concat_implicit_gpu_4d_f16 = concat_gpu_4d_implicit<FLOAT16>;
using concat_implicit_gpu_4d_i8 = concat_gpu_4d_implicit<int8_t>;
TEST_P(concat_implicit_gpu_4d_f16, input_order_opt_b_fs_yx_fsv16) {
ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv16));
}
INSTANTIATE_TEST_SUITE_P(smoke,
concat_implicit_gpu_4d_f16,
::testing::Values(
TestParamType_concat(1, { 16, 16 }, 2, 2),
TestParamType_concat(1, { 16, 8 }, 2, 2),
TestParamType_concat(1, { 8, 16 }, 2, 2)
),
concat_gpu::PrintToStringParamName);
TEST_P(concat_implicit_gpu_4d_i8, input_order_opt_b_fs_yx_fsv32) {
ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv32));
}
#ifdef ENABLE_ONEDNN_FOR_GPU
TEST(concat_gpu_onednn, basic_input_types) {
@@ -856,4 +1002,163 @@ TEST(concat_gpu_onednn, basic_input_types) {
EXPECT_EQ(output_vec[x], output_ptr[x]);
}
}
template <typename Type>
struct concat_gpu_4d_implicit_onednn : public concat_gpu {
public:
cldnn::memory::ptr run_concat_network(std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input, format::type fmt, build_options options) {
auto data_type = type_to_data_type<Type>::value;
auto& engine = get_onednn_test_engine();
const size_t batch_num = testing::get<0>(GetParam());
const std::vector<size_t> in_features = testing::get<1>(GetParam());
const size_t input_y = testing::get<2>(GetParam());
const size_t input_x = testing::get<3>(GetParam());
size_t output_f = 0;
for (auto& f : in_features)
output_f += f;
topology topology;
std::vector<memory::ptr> in_memory;
std::vector<primitive_id> input_ids;
std::vector<primitive_id> pooling_ids;
for (size_t i = 0; i < in_features.size(); i++) {
auto size = tensor(static_cast<int32_t>(batch_num),
static_cast<int32_t>(in_features[i]),
static_cast<int32_t>(input_x),
static_cast<int32_t>(input_y));
auto data = input[i];
auto in_lay = layout(data_type, fmt, size);
auto data_flat = std::vector<Type>(in_lay.get_linear_size(), 0);
for (size_t bi = 0; bi < batch_num; ++bi) {
for (size_t fi = 0; fi < in_features[i]; ++fi) {
for (size_t yi = 0; yi < input_y; ++yi) {
for (size_t xi = 0; xi < input_x; ++xi) {
auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
auto in_offset = in_lay.get_linear_offset(coords);
data_flat[in_offset] = data[bi][fi][yi][xi];
}
}
}
}
auto in_mem = engine.allocate_memory(in_lay);
set_values(in_mem, data_flat);
in_memory.push_back(in_mem);
topology.add(input_layout("input" + std::to_string(i), in_lay));
topology.add(pooling("pool" + std::to_string(i), "input" + std::to_string(i), pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
input_ids.push_back("input" + std::to_string(i));
pooling_ids.push_back("pool" + std::to_string(i));
}
topology.add(concatenation("concat", pooling_ids, concatenation::concatenation_axis::along_f));
auto weights_lay = cldnn::layout(data_type, cldnn::format::bfyx, tensor(batch(output_f), feature(output_f)));
auto weights_mem = engine.allocate_memory(weights_lay);
weights_mem->fill(get_test_stream());
get_test_stream().finish();
{
cldnn::mem_lock<Type> weights_ptr(weights_mem, get_test_stream());
for (size_t fi = 0; fi < output_f; ++fi) {
auto coords = tensor(batch(fi), feature(fi), spatial(0, 0, 0, 0));
auto offset = weights_lay.get_linear_offset(coords);
weights_ptr[offset] = static_cast<Type>(1.f);
}
}
topology.add(data("weights" , weights_mem));
topology.add(convolution("conv", "concat", { "weights" }));
topology.add(pooling("pool_final", "conv", pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
topology.add(reorder("reorder", "pool_final", layout(data_type, format::bfyx, {(int32_t)batch_num, (int32_t)output_f, (int32_t)input_y, (int32_t)input_x})));
network concat_network(engine, topology, options);
for (size_t i = 0; i < in_features.size(); i++) {
concat_network.set_input_data(input_ids[i], in_memory[i]);
}
concat_network.execute();
for (auto i : concat_network.get_primitives_info()) {
// std::cout << " " << i.original_id << " " << i.kernel_id << std::endl;
if (i.original_id == "concat") {
if (options.get<build_option_type::optimize_data>()->enabled()) {
EXPECT_TRUE(i.kernel_id == "undef");
} else {
EXPECT_FALSE(i.kernel_id == "undef");
}
}
}
return concat_network.get_output("reorder").get_memory();
}
std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> generate_input() {
const size_t batch_num = testing::get<0>(GetParam());
const std::vector<size_t> in_features = testing::get<1>(GetParam());
const size_t input_y = testing::get<2>(GetParam());
const size_t input_x = testing::get<3>(GetParam());
std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input(in_features.size());
for (size_t i = 0; i < in_features.size(); ++i) {
input[i] = generate_random_4d<Type>(batch_num, in_features[i], input_y, input_x, -1, 1);
}
return input;
}
void test(format::type fmt) {
auto input = generate_input();
// implicit concat
build_options options1;
options1.set_option(build_option::optimize_data(true));
implementation_desc impl = { fmt, std::string(""), impl_types::onednn };
options1.set_option(build_option::force_implementations({ {"conv", impl} }));
auto out_mem1 = run_concat_network(input, fmt, options1);
cldnn::mem_lock<Type> out_ptr1(out_mem1, get_test_stream());
// explicit concat
build_options options2;
options2.set_option(build_option::optimize_data(false));
auto out_mem2 = run_concat_network(input, fmt, options2);
cldnn::mem_lock<Type> out_ptr2(out_mem2, get_test_stream());
EXPECT_EQ(out_ptr1.size(), out_ptr2.size());
size_t diff_count = 0;
for (size_t i = 0; i < out_ptr1.size(); ++i) {
if (out_ptr1[i] != out_ptr2[i]) diff_count++;
}
EXPECT_EQ(diff_count, 0);
}
};
using concat_implicit_gpu_onednn_4d_f16 = concat_gpu_4d_implicit_onednn<FLOAT16>;
using concat_implicit_gpu_onednn_4d_i8 = concat_gpu_4d_implicit_onednn<int8_t>;
TEST_P(concat_implicit_gpu_onednn_4d_f16, input_order_opt_b_fs_yx_fsv16) {
ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv16));
}
INSTANTIATE_TEST_SUITE_P(smoke,
concat_implicit_gpu_onednn_4d_f16,
::testing::Values(
TestParamType_concat(1, { 16, 16 }, 2, 2),
TestParamType_concat(1, { 16, 8 }, 2, 2),
TestParamType_concat(1, { 8, 16 }, 2, 2)
),
concat_gpu::PrintToStringParamName);
TEST_P(concat_implicit_gpu_onednn_4d_i8, input_order_opt_b_fs_yx_fsv32) {
ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv32));
}
INSTANTIATE_TEST_SUITE_P(smoke,
concat_implicit_gpu_onednn_4d_i8,
::testing::Values(
TestParamType_concat(1, { 32, 32 }, 2, 2),
TestParamType_concat(1, { 32, 8 }, 2, 2),
TestParamType_concat(1, { 8, 32 }, 2, 2)
),
concat_gpu::PrintToStringParamName);
#endif