From 70cc27de535eefe2985d2c27743f50b3fda82e64 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Wed, 9 Nov 2022 16:46:48 -0800 Subject: [PATCH] [GPU] Fake alignment for fc kernel to improve dynamic shape performance (#13887) * Fake alignment infra + fake alignment for fc kernel to improve dynamic shape performance * Applied review * Added unittest * Removed unused function --- .../intel_gpu/src/graph/fully_connected.cpp | 31 ++++++ .../src/graph/include/fully_connected_inst.h | 1 + .../src/graph/include/primitive_inst.h | 4 + .../src/graph/include/primitive_type.h | 1 + .../src/graph/include/primitive_type_base.h | 4 +- .../intel_gpu/src/graph/primitive_inst.cpp | 15 +-- .../fake_alignment/fc_fake_alignment_test.cpp | 95 +++++++++++++++++++ 7 files changed, 144 insertions(+), 7 deletions(-) create mode 100644 src/plugins/intel_gpu/tests/fake_alignment/fc_fake_alignment_test.cpp diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index f543e412381..4db3b7aba9f 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -9,6 +9,7 @@ #include "json_object.h" #include #include +#include "utils.hpp" #include "matmul_shape_inference.hpp" @@ -165,6 +166,36 @@ std::vector fully_connected_inst::calc_output_layouts(fully_connected_no return { layout{output_shapes[0], output_type, output_format} }; } + +kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_params const& orig_impl_param) { + // fc_tiled_opt kernel is optimized for row shape aligned by 16. + // Thus, use fake aligned shape at kernel execution for better performance. 
+ auto orig_input_layout = orig_impl_param.get_input_layout(); + auto orig_output_layout = orig_impl_param.output_layout; + OPENVINO_ASSERT(orig_input_layout.is_static() && orig_output_layout.is_static(), + "in/out layouts should be static for fake alignment!"); + if (orig_input_layout.format == format::bfyx && orig_output_layout.format == format::bfyx) { + auto updated_param = orig_impl_param; + auto input_shape = orig_input_layout.get_partial_shape().to_shape(); + auto input_row_idx = input_shape.size() - 2; + input_shape[input_row_idx] = align_to(input_shape[input_row_idx], 16); + auto output_shape = orig_output_layout.get_partial_shape().to_shape(); + auto output_row_idx = output_shape.size() - 2; + output_shape[output_row_idx] = align_to(output_shape[output_row_idx], 16); + + updated_param.input_layouts[0] = layout(ov::PartialShape(input_shape), + orig_input_layout.data_type, + orig_input_layout.format, + orig_input_layout.data_padding); + updated_param.output_layout = layout(ov::PartialShape(output_shape), + orig_output_layout.data_type, + orig_output_layout.format, + orig_output_layout.data_padding); + return updated_param; + } + return std::move(orig_impl_param); +} + template std::vector fully_connected_inst::calc_output_layouts(fully_connected_node const& node, const kernel_impl_params& impl_param); diff --git a/src/plugins/intel_gpu/src/graph/include/fully_connected_inst.h b/src/plugins/intel_gpu/src/graph/include/fully_connected_inst.h index 00ec67bd9ca..1f5f8a367af 100644 --- a/src/plugins/intel_gpu/src/graph/include/fully_connected_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/fully_connected_inst.h @@ -47,6 +47,7 @@ public: template static std::vector calc_output_layouts(fully_connected_node const& /*node*/, const kernel_impl_params& impl_param); static layout calc_output_layout(fully_connected_node const& node, kernel_impl_params const& impl_param); + static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& 
orig_impl_param); static std::string to_string(fully_connected_node const& node); typed_primitive_inst(network& network, fully_connected_node const& node); diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index 6e0d9c9c054..27543aa70fe 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -363,6 +363,10 @@ public: template static std::vector calc_output_layouts(const typed_node& node, const kernel_impl_params& impl_param) { return {}; } + static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) { + return std::move(orig_impl_param); + } + typed_primitive_inst_base(network& network, typed_node const& node) : typed_primitive_inst_base(network, node, do_allocate_memory(node)) {} diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_type.h b/src/plugins/intel_gpu/src/graph/include/primitive_type.h index 5053afd9230..7ed8e4732ce 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_type.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_type.h @@ -40,6 +40,7 @@ struct primitive_type { virtual layout calc_output_layout(const program_node& node, const kernel_impl_params& params) const = 0; virtual std::vector calc_output_layouts(const program_node& node, const kernel_impl_params& impl_param) const = 0; + virtual kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) const = 0; virtual std::string to_string(const program_node& node) const = 0; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h b/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h index fe84d1bc760..964b79a53e9 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_type_base.h @@ -71,7 +71,9 @@ struct primitive_type_base : 
primitive_type { return typed_primitive_inst::template calc_output_layouts(node, impl_param); } - + kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) const override { + return typed_primitive_inst::get_fake_aligned_params(orig_impl_param); + } std::string to_string(const cldnn::program_node& node) const override { OPENVINO_ASSERT(node.type() == this, "[GPU] primitive_type_base::to_string: primitive type mismatch"); return typed_primitive_inst::to_string(node); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 7487cc75ef4..2564ae6651b 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -238,7 +238,9 @@ void primitive_inst::realloc_if_needed() { GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation); - auto actual_layout = _impl_params->output_layout; + // Update param if fake_alignment is available + auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params); + auto actual_layout = updated_params.output_layout; OPENVINO_ASSERT(actual_layout.is_static(), "[GPU] Can't realloc mem for dynamic layout"); // input_layout node is supposed to always use external memory in dynamic case @@ -267,31 +269,32 @@ void primitive_inst::update_impl() { GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation); auto prev_impl_str = _impl != nullptr ? 
_impl->get_kernel_name() : "nullptr"; if (!_node->is_type() && !(_node->is_type() && _node->get_dependencies().empty())) { + // Update param if fake_alignment is available + auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params); auto get_layout_key = [&]() -> size_t { size_t seed = 0; - auto& id = _impl_params->desc->id; + auto& id = updated_params.desc->id; for (size_t i = 0; i < id.size(); i++) { seed = hash_combine(seed, id[i]); } seed = hash_combine(seed, _node->get_unique_id()); - for (auto& layout : _impl_params->input_layouts) { + for (auto& layout : updated_params.input_layouts) { for (auto& d : layout.get_shape()) { seed = hash_combine(seed, d); } } - for (auto& d : _impl_params->output_layout.get_shape()) { + for (auto& d : updated_params.output_layout.get_shape()) { seed = hash_combine(seed, d); } return seed; }; - auto layout_key = get_layout_key(); auto& cache = get_network().get_implementations_cache(); if (cache.has(layout_key)) { _impl = cache.get(layout_key)->clone(); GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true); } else { - _impl = _node->type()->choose_impl(*_node, *_impl_params); + _impl = _node->type()->choose_impl(*_node, updated_params); auto& kernels_cache = get_network().get_kernels_cache(); auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source()); _impl->set_kernel_ids(kernel_ids); diff --git a/src/plugins/intel_gpu/tests/fake_alignment/fc_fake_alignment_test.cpp b/src/plugins/intel_gpu/tests/fake_alignment/fc_fake_alignment_test.cpp new file mode 100644 index 00000000000..0f6f8226608 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fake_alignment/fc_fake_alignment_test.cpp @@ -0,0 +1,95 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include +#include +#include + +#include "fully_connected_inst.h" + +#include "program_wrapper.h" + +#include +#include + +using namespace cldnn; +using namespace ::tests; + +namespace 
fake_alignment_tests { + +struct fc_fake_align_params { + layout input_layout; + layout weight_layout; + data_types data_type; + layout expected_input_layout; + layout expected_output_layout; +}; + +class fully_connected_fake_align_test : public testing::TestWithParam {}; + +TEST_P(fully_connected_fake_align_test, fake_alignment) { + auto p = GetParam(); + + auto& engine = get_test_engine(); + + auto input_layout_prim = std::make_shared("input", p.input_layout); + auto weight_layout_prim = std::make_shared("weight", p.weight_layout); + auto fully_connected_prim = std::make_shared("output", "input", "weight", "", p.data_type); + + cldnn::program prog(engine); + + auto& input_node = prog.get_or_create(input_layout_prim); + auto& weight_node = prog.get_or_create(weight_layout_prim); + auto& fully_connected_node = prog.get_or_create(fully_connected_prim); + program_wrapper::add_connection(prog, input_node, fully_connected_node); + program_wrapper::add_connection(prog, weight_node, fully_connected_node); + + auto impl_param = fully_connected_node.get_kernel_impl_params(); + impl_param->output_layout = fully_connected_inst::calc_output_layouts(fully_connected_node, *fully_connected_node.get_kernel_impl_params())[0]; + + if (impl_param->get_input_layout().is_dynamic() || impl_param->output_layout.is_dynamic()) { + EXPECT_THROW(fully_connected_inst::get_fake_aligned_params(*impl_param), std::exception); + } else { + auto updated_param = fully_connected_inst::get_fake_aligned_params(*impl_param); + ASSERT_EQ(updated_param.get_input_layout(), p.expected_input_layout); + ASSERT_EQ(updated_param.output_layout, p.expected_output_layout); + } +} + +INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test, + testing::ValuesIn(std::vector{ + { + layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout + layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout + data_types::f16, + 
layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout + layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout + }, + { + layout{ov::PartialShape{11, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout + layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout + data_types::f16, + layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout + layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout + }, + { + layout{ov::PartialShape{133, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout + layout{ov::PartialShape{800, 511}, data_types::i8, format::bfyx}, // weight layout + data_types::f16, + layout{ov::PartialShape{144, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout + layout{ov::PartialShape{144, 800}, data_types::f16, format::bfyx} // fake_aligned output layout + }, + { + layout{ov::PartialShape::dynamic(2), data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout + layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout + data_types::f16, + layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx}, // fake_aligned input layout // dummy + layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx} // fake_aligned output layout // dummy + }, + + })); + +} // namespace fake_alignment_tests