[GPU] Enable shuffle and fsv32 in implicit concat (#9888)

* Support shuffle for fsv32.
* Check feature depths in the first input dependency.
* Select the onednn convolution implementation when a block format is used in get_preferred_impl_type().

Signed-off-by: hyunback <hyunback.kim@intel.com>
commit 215db2dad8 (parent b6a75d7d91)
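The gist of the change, before the diffs: implicit (in-place) concatenation along the f-axis is now allowed for b_fs_yx_fsv32 in addition to b_fs_yx_fsv16, provided every input except the last one has a feature depth that is a multiple of the block size. A minimal standalone sketch of that rule, with illustrative names (can_use_implicit_concat is not a clDNN API):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

enum class fmt { b_fs_yx_fsv16, b_fs_yx_fsv32, other };

static std::size_t block_size_for(fmt f) {
    switch (f) {
    case fmt::b_fs_yx_fsv16: return 16;
    case fmt::b_fs_yx_fsv32: return 32;
    default:                 return 1;  // unblocked formats: no f-alignment constraint
    }
}

static bool can_use_implicit_concat(fmt f,
                                    const std::vector<std::size_t>& in_features,
                                    bool concat_along_f) {
    if (!concat_along_f)
        return false;
    const std::size_t block = block_size_for(f);
    // Every input except the last must fill whole f-blocks.
    for (std::size_t i = 0; i + 1 < in_features.size(); ++i)
        if (in_features[i] % block != 0)
            return false;
    return true;
}

int main() {
    std::cout << can_use_implicit_concat(fmt::b_fs_yx_fsv32, {32, 8}, true)   // 1: only the last input is unaligned
              << can_use_implicit_concat(fmt::b_fs_yx_fsv32, {8, 32}, true)   // 0: the first input ends mid-block
              << '\n';
}
```

The input-order pass touched in the first hunk can reorder concatenation inputs so that a case like {8, 32} still ends up satisfying this rule; the new input_order_opt_* tests below exercise exactly those shapes.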
@@ -131,10 +131,12 @@ void concat_input_order::run(program& p) {
    bool no_fusing = !concat_node.has_fused_primitives() && concat_node.get_dependencies().size() == inputs_count;

    auto out_format = concat_node.get_output_layout().format;
    bool correct_format = out_format == format::b_fs_yx_fsv16;
    bool correct_format = (out_format == format::b_fs_yx_fsv16) || (out_format == format::b_fs_yx_fsv32);
    tensor::value_type alignment = 1;
    if (out_format == format::b_fs_yx_fsv16)
        alignment = 16;
    else if (out_format == format::b_fs_yx_fsv32)
        alignment = 32;

    bool single_format = true;
    std::vector<tensor::value_type> feature_sizes;
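For context, a toy sketch of the reordering idea behind concat_input_order: put inputs whose feature depth is divisible by the block alignment first, so at most the trailing input is unaligned. This is a hypothetical standalone illustration; the real pass also has to update the concatenation's consumers (e.g. permute the following convolution's weights), which is omitted here.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

static bool reorder_for_alignment(std::vector<std::size_t>& features, std::size_t alignment) {
    // Stable-partition aligned inputs to the front; only trailing inputs
    // may be unaligned for in-place concatenation along f.
    std::stable_partition(features.begin(), features.end(),
                          [&](std::size_t f) { return f % alignment == 0; });
    // Valid if every input but the last is aligned after the reorder.
    for (std::size_t i = 0; i + 1 < features.size(); ++i)
        if (features[i] % alignment != 0)
            return false;
    return true;
}

int main() {
    std::vector<std::size_t> f = {8, 16};
    // With alignment 16, {8, 16} becomes {16, 8}, which satisfies the rule.
    std::cout << reorder_for_alignment(f, 16) << ' ' << f[0] << ',' << f[1] << '\n';  // 1 16,8
}
```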
@@ -120,10 +120,13 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
    }

    // For in place concatenation input layouts and data types must match.
    // Also, it checks whether data along f-axis is aligned properly for implicit concat.
    // Otherwise, use explicit concat instead.
    auto output_format = node.get_output_layout().format;
    auto output_datatype = node.get_output_layout().data_type;
    auto concat_axis = node.get_primitive()->axis;

    size_t idx = 0;
    for (auto& input : node.get_dependencies()) {
        if (input->is_type<reshape>())
            // reshapes should be optimized out.
@@ -134,32 +137,33 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
        if (output_format != l.format || output_datatype != l.data_type)
            return false;

        // TODO: Below condition should be moved to program_node::supports_padding.
        // This hovewer will need updating the algorithm as it may make cascade adjustment impossible in some cases.
        // It hovewer would make normal optimizations possible in others, so this is a trade-off to be investigated.
        if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
        if (l.format.block_sizes().size() > 1)
            return false;

        if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
        // TODO: Below condition should be moved to program_node::supports_padding.
        // This however will need updating the algorithm as it may make cascade adjustment impossible in some cases.
        // It however would make normal optimizations possible in others, so this is a trade-off to be investigated.
        if (idx != node.get_dependencies().size() - 1) {
            if ((l.format == format::b_fs_yx_fsv16 || l.format == format::b_fs_zyx_fsv16) &&
                (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
                return false;

            if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
                (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
                return false;

            if (l.format == format::bs_fs_yx_bsv16_fsv16)
                return false;

            if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f))
            if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 4 || node.get_primitive()->axis != concatenation::along_f))
                return false;
        }
        idx++;
    }

    auto lower_padd_in_axis = node.get_output_layout().data_padding.lower_size().raw[concat_axis];
    lower_padd_in_axis = std::max(lower_padd_in_axis,
                                  node.get_dependency(0).get_output_layout().data_padding.lower_size().raw[concat_axis]);

    // check if concatenation in place can be applied for inputs set
    size_t idx = 0;
    idx = 0;
    for (auto input : node.get_dependencies()) {
        // reverted condition - if any of this node's inputs is used by more than one primitive
        // and is not optimized concatenation then do not fuse buffers
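Why the `% 16` / `% 32` checks above matter: in the f-blocked layouts the f-axis is split into fixed-size blocks, so an input that ends in the middle of a block cannot simply be followed by the next input's data in the same padded buffer. A rough, illustrative offset calculation for b_fs_yx_fsv32 (a simplification; the real layout math lives in cldnn::layout/format):

```cpp
#include <cstddef>
#include <iostream>

// Approximate linear offset for b_fs_yx_fsv32: order is b, f-block, y, x, f-in-block.
static std::size_t fsv32_offset(std::size_t b, std::size_t f, std::size_t y, std::size_t x,
                                std::size_t F, std::size_t Y, std::size_t X) {
    const std::size_t fsv = 32;
    const std::size_t f_blocks = (F + fsv - 1) / fsv;   // f rounded up to whole blocks
    return (((b * f_blocks + f / fsv) * Y + y) * X + x) * fsv + f % fsv;
}

int main() {
    // With 8 + 32 features concatenated along f, feature index 8 (the first
    // feature of the second input) lands at inner slot 8 of block 0, i.e.
    // interleaved into the first input's partially filled block.
    std::cout << fsv32_offset(0, 7, 0, 0, 40, 2, 2) << ' '   // last feature of input 0
              << fsv32_offset(0, 8, 0, 0, 40, 2, 2) << '\n'; // first feature of input 1 -> same block
}
```

That is why only the last input may have a feature depth that is not a multiple of the block size: its partially filled block sits at the end of the buffer, with nothing written after it.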
@@ -1494,10 +1494,12 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format

        for (auto& dep : node.get_dependencies()) {
            if (dep->is_in_data_flow() && dep->get_preferred_impl_type() == impl_types::onednn) {
                preferred_impl = impl_types::onednn;
                break;
                return impl_types::onednn;
            }
        }
        if (format::is_blocked(node.get_output_layout().format)) {
            return impl_types::onednn;
        }
    // TODO: uncomment this code when onednn gemm implementations will have real perf improvements vs cldnn
    } else if (node.is_type<fully_connected>()/* || node.is_type<gemm>()*/) {
        if (!_optimization_attributes.use_onednn_impls)
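Condensed, the selection logic added in this branch of get_preferred_impl_type reads roughly as below: prefer oneDNN when any data-flow dependency already prefers it, or when the output format is blocked (e.g. b_fs_yx_fsv16/fsv32). This is a standalone sketch with hypothetical types, not the clDNN code itself.

```cpp
#include <vector>

enum class impl_types { ocl, onednn };

struct dep_info {
    bool in_data_flow;
    impl_types preferred;
};

static impl_types pick_impl(const std::vector<dep_info>& deps, bool output_is_blocked) {
    // A dependency already running through oneDNN pulls this node onto oneDNN too.
    for (const auto& dep : deps)
        if (dep.in_data_flow && dep.preferred == impl_types::onednn)
            return impl_types::onednn;
    // Blocked output formats also map to the oneDNN path.
    if (output_is_blocked)
        return impl_types::onednn;
    return impl_types::ocl;
}

int main() {
    std::vector<dep_info> deps = { {true, impl_types::ocl} };
    return pick_impl(deps, /*output_is_blocked=*/true) == impl_types::onednn ? 0 : 1;
}
```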
@@ -784,6 +784,152 @@ INSTANTIATE_TEST_SUITE_P(smoke_low_precision,
                         ),
                         concat_gpu::PrintToStringParamName);

template <typename Type>
struct concat_gpu_4d_implicit : public concat_gpu {
public:
    cldnn::memory::ptr run_concat_network(std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input, format::type fmt, build_options options) {
        auto data_type = type_to_data_type<Type>::value;
        auto& engine = get_test_engine();
        const size_t batch_num = testing::get<0>(GetParam());
        const std::vector<size_t> in_features = testing::get<1>(GetParam());
        const size_t input_y = testing::get<2>(GetParam());
        const size_t input_x = testing::get<3>(GetParam());
        size_t output_f = 0;
        for (auto& f : in_features)
            output_f += f;

        topology topology;

        std::vector<memory::ptr> in_memory;
        std::vector<primitive_id> input_ids;
        std::vector<primitive_id> pooling_ids;

        for (size_t i = 0; i < in_features.size(); i++) {
            auto size = tensor(static_cast<int32_t>(batch_num),
                               static_cast<int32_t>(in_features[i]),
                               static_cast<int32_t>(input_x),
                               static_cast<int32_t>(input_y));
            auto data = input[i];
            auto in_lay = layout(data_type, fmt, size);
            auto data_flat = std::vector<Type>(in_lay.get_linear_size(), 0);

            for (size_t bi = 0; bi < batch_num; ++bi) {
                for (size_t fi = 0; fi < in_features[i]; ++fi) {
                    for (size_t yi = 0; yi < input_y; ++yi) {
                        for (size_t xi = 0; xi < input_x; ++xi) {
                            auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
                            auto in_offset = in_lay.get_linear_offset(coords);
                            data_flat[in_offset] = data[bi][fi][yi][xi];
                        }
                    }
                }
            }

            auto in_mem = engine.allocate_memory(in_lay);
            set_values(in_mem, data_flat);
            in_memory.push_back(in_mem);

            topology.add(input_layout("input" + std::to_string(i), in_lay));
            topology.add(pooling("pool" + std::to_string(i), "input" + std::to_string(i), pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));

            input_ids.push_back("input" + std::to_string(i));
            pooling_ids.push_back("pool" + std::to_string(i));
        }

        topology.add(concatenation("concat", pooling_ids, concatenation::concatenation_axis::along_f));
        auto weights_lay = cldnn::layout(data_type, cldnn::format::bfyx, tensor(batch(output_f), feature(output_f)));
        auto weights_mem = engine.allocate_memory(weights_lay);
        weights_mem->fill(get_test_stream());
        get_test_stream().finish();
        {
            cldnn::mem_lock<Type> weights_ptr(weights_mem, get_test_stream());
            for (size_t fi = 0; fi < output_f; ++fi) {
                auto coords = tensor(batch(fi), feature(fi), spatial(0, 0, 0, 0));
                auto offset = weights_lay.get_linear_offset(coords);
                weights_ptr[offset] = static_cast<Type>(1.f);
            }
        }
        topology.add(data("weights" , weights_mem));
        topology.add(convolution("conv", "concat", { "weights" }));
        topology.add(pooling("pool_final", "conv", pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
        topology.add(reorder("reorder", "pool_final", layout(data_type, format::bfyx, {(int32_t)batch_num, (int32_t)output_f, (int32_t)input_y, (int32_t)input_x})));

        network concat_network(engine, topology, options);
        for (size_t i = 0; i < in_features.size(); i++) {
            concat_network.set_input_data(input_ids[i], in_memory[i]);
        }
        concat_network.execute();

        for (auto i : concat_network.get_primitives_info()) {
            // std::cout << " " << i.original_id << " " << i.kernel_id << std::endl;
            if (i.original_id == "concat") {
                if (options.get<build_option_type::optimize_data>()->enabled()) {
                    EXPECT_TRUE(i.kernel_id == "undef");
                } else {
                    EXPECT_FALSE(i.kernel_id == "undef");
                }
            }
        }

        return concat_network.get_output("reorder").get_memory();
    }

    std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> generate_input() {
        const size_t batch_num = testing::get<0>(GetParam());
        const std::vector<size_t> in_features = testing::get<1>(GetParam());
        const size_t input_y = testing::get<2>(GetParam());
        const size_t input_x = testing::get<3>(GetParam());

        std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input(in_features.size());
        for (size_t i = 0; i < in_features.size(); ++i) {
            input[i] = generate_random_4d<Type>(batch_num, in_features[i], input_y, input_x, -1, 1);
        }
        return input;
    }

    void test(format::type fmt) {
        auto input = generate_input();

        // implicit concat
        build_options options1;
        options1.set_option(build_option::optimize_data(true));
        auto out_mem1 = run_concat_network(input, fmt, options1);
        cldnn::mem_lock<Type> out_ptr1(out_mem1, get_test_stream());

        // explicit concat
        build_options options2;
        options2.set_option(build_option::optimize_data(false));
        auto out_mem2 = run_concat_network(input, fmt, options2);
        cldnn::mem_lock<Type> out_ptr2(out_mem2, get_test_stream());

        EXPECT_EQ(out_ptr1.size(), out_ptr2.size());
        size_t diff_count = 0;
        for (size_t i = 0; i < out_ptr1.size(); ++i) {
            if (out_ptr1[i] != out_ptr2[i]) diff_count++;
        }
        EXPECT_EQ(diff_count, 0);
    }
};

using concat_implicit_gpu_4d_f16 = concat_gpu_4d_implicit<FLOAT16>;
using concat_implicit_gpu_4d_i8 = concat_gpu_4d_implicit<int8_t>;

TEST_P(concat_implicit_gpu_4d_f16, input_order_opt_b_fs_yx_fsv16) {
    ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv16));
}

INSTANTIATE_TEST_SUITE_P(smoke,
                         concat_implicit_gpu_4d_f16,
                         ::testing::Values(
                             TestParamType_concat(1, { 16, 16 }, 2, 2),
                             TestParamType_concat(1, { 16, 8 }, 2, 2),
                             TestParamType_concat(1, { 8, 16 }, 2, 2)
                         ),
                         concat_gpu::PrintToStringParamName);

TEST_P(concat_implicit_gpu_4d_i8, input_order_opt_b_fs_yx_fsv32) {
    ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv32));
}

#ifdef ENABLE_ONEDNN_FOR_GPU
TEST(concat_gpu_onednn, basic_input_types) {
@@ -856,4 +1002,163 @@ TEST(concat_gpu_onednn, basic_input_types) {
        EXPECT_EQ(output_vec[x], output_ptr[x]);
    }
}

template <typename Type>
struct concat_gpu_4d_implicit_onednn : public concat_gpu {
public:
    cldnn::memory::ptr run_concat_network(std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input, format::type fmt, build_options options) {
        auto data_type = type_to_data_type<Type>::value;
        auto& engine = get_onednn_test_engine();
        const size_t batch_num = testing::get<0>(GetParam());
        const std::vector<size_t> in_features = testing::get<1>(GetParam());
        const size_t input_y = testing::get<2>(GetParam());
        const size_t input_x = testing::get<3>(GetParam());
        size_t output_f = 0;
        for (auto& f : in_features)
            output_f += f;

        topology topology;

        std::vector<memory::ptr> in_memory;
        std::vector<primitive_id> input_ids;
        std::vector<primitive_id> pooling_ids;

        for (size_t i = 0; i < in_features.size(); i++) {
            auto size = tensor(static_cast<int32_t>(batch_num),
                               static_cast<int32_t>(in_features[i]),
                               static_cast<int32_t>(input_x),
                               static_cast<int32_t>(input_y));
            auto data = input[i];
            auto in_lay = layout(data_type, fmt, size);
            auto data_flat = std::vector<Type>(in_lay.get_linear_size(), 0);

            for (size_t bi = 0; bi < batch_num; ++bi) {
                for (size_t fi = 0; fi < in_features[i]; ++fi) {
                    for (size_t yi = 0; yi < input_y; ++yi) {
                        for (size_t xi = 0; xi < input_x; ++xi) {
                            auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
                            auto in_offset = in_lay.get_linear_offset(coords);
                            data_flat[in_offset] = data[bi][fi][yi][xi];
                        }
                    }
                }
            }

            auto in_mem = engine.allocate_memory(in_lay);
            set_values(in_mem, data_flat);
            in_memory.push_back(in_mem);

            topology.add(input_layout("input" + std::to_string(i), in_lay));
            topology.add(pooling("pool" + std::to_string(i), "input" + std::to_string(i), pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));

            input_ids.push_back("input" + std::to_string(i));
            pooling_ids.push_back("pool" + std::to_string(i));
        }

        topology.add(concatenation("concat", pooling_ids, concatenation::concatenation_axis::along_f));
        auto weights_lay = cldnn::layout(data_type, cldnn::format::bfyx, tensor(batch(output_f), feature(output_f)));
        auto weights_mem = engine.allocate_memory(weights_lay);
        weights_mem->fill(get_test_stream());
        get_test_stream().finish();
        {
            cldnn::mem_lock<Type> weights_ptr(weights_mem, get_test_stream());
            for (size_t fi = 0; fi < output_f; ++fi) {
                auto coords = tensor(batch(fi), feature(fi), spatial(0, 0, 0, 0));
                auto offset = weights_lay.get_linear_offset(coords);
                weights_ptr[offset] = static_cast<Type>(1.f);
            }
        }
        topology.add(data("weights" , weights_mem));
        topology.add(convolution("conv", "concat", { "weights" }));
        topology.add(pooling("pool_final", "conv", pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
        topology.add(reorder("reorder", "pool_final", layout(data_type, format::bfyx, {(int32_t)batch_num, (int32_t)output_f, (int32_t)input_y, (int32_t)input_x})));

        network concat_network(engine, topology, options);
        for (size_t i = 0; i < in_features.size(); i++) {
            concat_network.set_input_data(input_ids[i], in_memory[i]);
        }
        concat_network.execute();

        for (auto i : concat_network.get_primitives_info()) {
            // std::cout << " " << i.original_id << " " << i.kernel_id << std::endl;
            if (i.original_id == "concat") {
                if (options.get<build_option_type::optimize_data>()->enabled()) {
                    EXPECT_TRUE(i.kernel_id == "undef");
                } else {
                    EXPECT_FALSE(i.kernel_id == "undef");
                }
            }
        }

        return concat_network.get_output("reorder").get_memory();
    }

    std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> generate_input() {
        const size_t batch_num = testing::get<0>(GetParam());
        const std::vector<size_t> in_features = testing::get<1>(GetParam());
        const size_t input_y = testing::get<2>(GetParam());
        const size_t input_x = testing::get<3>(GetParam());

        std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input(in_features.size());
        for (size_t i = 0; i < in_features.size(); ++i) {
            input[i] = generate_random_4d<Type>(batch_num, in_features[i], input_y, input_x, -1, 1);
        }
        return input;
    }

    void test(format::type fmt) {
        auto input = generate_input();

        // implicit concat
        build_options options1;
        options1.set_option(build_option::optimize_data(true));
        implementation_desc impl = { fmt, std::string(""), impl_types::onednn };
        options1.set_option(build_option::force_implementations({ {"conv", impl} }));
        auto out_mem1 = run_concat_network(input, fmt, options1);
        cldnn::mem_lock<Type> out_ptr1(out_mem1, get_test_stream());

        // explicit concat
        build_options options2;
        options2.set_option(build_option::optimize_data(false));
        auto out_mem2 = run_concat_network(input, fmt, options2);
        cldnn::mem_lock<Type> out_ptr2(out_mem2, get_test_stream());

        EXPECT_EQ(out_ptr1.size(), out_ptr2.size());
        size_t diff_count = 0;
        for (size_t i = 0; i < out_ptr1.size(); ++i) {
            if (out_ptr1[i] != out_ptr2[i]) diff_count++;
        }
        EXPECT_EQ(diff_count, 0);
    }
};


using concat_implicit_gpu_onednn_4d_f16 = concat_gpu_4d_implicit_onednn<FLOAT16>;
using concat_implicit_gpu_onednn_4d_i8 = concat_gpu_4d_implicit_onednn<int8_t>;

TEST_P(concat_implicit_gpu_onednn_4d_f16, input_order_opt_b_fs_yx_fsv16) {
    ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv16));
}

INSTANTIATE_TEST_SUITE_P(smoke,
                         concat_implicit_gpu_onednn_4d_f16,
                         ::testing::Values(
                             TestParamType_concat(1, { 16, 16 }, 2, 2),
                             TestParamType_concat(1, { 16, 8 }, 2, 2),
                             TestParamType_concat(1, { 8, 16 }, 2, 2)
                         ),
                         concat_gpu::PrintToStringParamName);

TEST_P(concat_implicit_gpu_onednn_4d_i8, input_order_opt_b_fs_yx_fsv32) {
    ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv32));
}

INSTANTIATE_TEST_SUITE_P(smoke,
                         concat_implicit_gpu_onednn_4d_i8,
                         ::testing::Values(
                             TestParamType_concat(1, { 32, 32 }, 2, 2),
                             TestParamType_concat(1, { 32, 8 }, 2, 2),
                             TestParamType_concat(1, { 8, 32 }, 2, 2)
                         ),
                         concat_gpu::PrintToStringParamName);
#endif