[GPU] Fake alignment for fc kernel to improve dynamic shape performance (#13887)
* Fake alignment infra + fake alignment for fc kernel to improve dynamic shape performance * Applied review * Added unit test * Removed unused function
This commit is contained in:
parent
aaefe0a256
commit
70cc27de53
@ -9,6 +9,7 @@
|
||||
#include "json_object.h"
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include "utils.hpp"
|
||||
|
||||
#include "matmul_shape_inference.hpp"
|
||||
|
||||
@ -165,6 +166,36 @@ std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_no
|
||||
return { layout{output_shapes[0], output_type, output_format} };
|
||||
}
|
||||
|
||||
|
||||
// Returns the kernel parameters to use in place of `orig_impl_param`.
//
// When both the first input layout and the output layout are in bfyx format, the result is a
// copy of `orig_impl_param` in which the row dimension (second-to-last) of those two layouts
// is rounded up to a multiple of 16; data type, format and padding are carried over unchanged.
// In every other case an unmodified copy of `orig_impl_param` is returned.
//
// Asserts (OPENVINO_ASSERT) that both the input and the output layout are static.
kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_params const& orig_impl_param) {
    // fc_tiled_opt kernel is optimized for row shape aligned by 16.
    // Thus, use fake aligned shape at kernel execution for better performance.
    constexpr size_t fake_align_base = 16;

    const auto& orig_input_layout = orig_impl_param.get_input_layout();
    const auto& orig_output_layout = orig_impl_param.output_layout;
    OPENVINO_ASSERT(orig_input_layout.is_static() && orig_output_layout.is_static(),
                    "in/out layouts should be static for fake alignment!");

    // Only bfyx -> bfyx is fake-aligned. The parameter is a const reference, so returning it
    // copies; wrapping it in std::move would not change that.
    if (orig_input_layout.format != format::bfyx || orig_output_layout.format != format::bfyx) {
        return orig_impl_param;
    }

    auto input_shape = orig_input_layout.get_partial_shape().to_shape();
    auto output_shape = orig_output_layout.get_partial_shape().to_shape();

    // The row index is computed as size() - 2, which would wrap around for shapes of rank < 2.
    // Such shapes have no row dimension to align, so leave the parameters untouched.
    if (input_shape.size() < 2 || output_shape.size() < 2) {
        return orig_impl_param;
    }

    const auto input_row_idx = input_shape.size() - 2;
    const auto output_row_idx = output_shape.size() - 2;
    input_shape[input_row_idx] = align_to(input_shape[input_row_idx], fake_align_base);
    output_shape[output_row_idx] = align_to(output_shape[output_row_idx], fake_align_base);

    auto updated_param = orig_impl_param;
    updated_param.input_layouts[0] = layout(ov::PartialShape(input_shape),
                                            orig_input_layout.data_type,
                                            orig_input_layout.format,
                                            orig_input_layout.data_padding);
    updated_param.output_layout = layout(ov::PartialShape(output_shape),
                                         orig_output_layout.data_type,
                                         orig_output_layout.format,
                                         orig_output_layout.data_padding);
    return updated_param;
}
|
||||
|
||||
template std::vector<layout> fully_connected_inst::calc_output_layouts<ov::PartialShape>(fully_connected_node const& node,
|
||||
const kernel_impl_params& impl_param);
|
||||
|
||||
|
@ -47,6 +47,7 @@ public:
|
||||
template<typename ShapeType>
|
||||
static std::vector<layout> calc_output_layouts(fully_connected_node const& /*node*/, const kernel_impl_params& impl_param);
|
||||
static layout calc_output_layout(fully_connected_node const& node, kernel_impl_params const& impl_param);
|
||||
static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param);
|
||||
static std::string to_string(fully_connected_node const& node);
|
||||
|
||||
typed_primitive_inst(network& network, fully_connected_node const& node);
|
||||
|
@ -363,6 +363,10 @@ public:
|
||||
// Returns an empty list of layouts; neither `node` nor `impl_param` is used.
template<typename T>
|
||||
static std::vector<layout> calc_output_layouts(const typed_node& node, const kernel_impl_params& impl_param) { return {}; }
|
||||
|
||||
// Returns an unmodified copy of `orig_impl_param`; this version performs no alignment.
// The parameter is a const reference, so the return always copies (std::move on it would be
// a no-op cast that only obscures that fact).
static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) {
    return orig_impl_param;
}
|
||||
|
||||
// Delegates to the three-argument constructor, passing the result of
// do_allocate_memory(node) as its third argument.
typed_primitive_inst_base(network& network, typed_node const& node)
|
||||
: typed_primitive_inst_base(network, node, do_allocate_memory(node)) {}
|
||||
|
||||
|
@ -40,6 +40,7 @@ struct primitive_type {
|
||||
|
||||
virtual layout calc_output_layout(const program_node& node, const kernel_impl_params& params) const = 0;
|
||||
virtual std::vector<layout> calc_output_layouts(const program_node& node, const kernel_impl_params& impl_param) const = 0;
|
||||
virtual kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) const = 0;
|
||||
virtual std::string to_string(const program_node& node) const = 0;
|
||||
};
|
||||
} // namespace cldnn
|
||||
|
@ -71,7 +71,9 @@ struct primitive_type_base : primitive_type {
|
||||
|
||||
return typed_primitive_inst<PType>::template calc_output_layouts<ov::PartialShape>(node, impl_param);
|
||||
}
|
||||
|
||||
// Forwards to the static get_fake_aligned_params() of typed_primitive_inst<PType> and
// returns its result.
kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) const override {
    using inst_type = typed_primitive_inst<PType>;
    return inst_type::get_fake_aligned_params(orig_impl_param);
}
|
||||
std::string to_string(const cldnn::program_node& node) const override {
|
||||
OPENVINO_ASSERT(node.type() == this, "[GPU] primitive_type_base::to_string: primitive type mismatch");
|
||||
return typed_primitive_inst<PType>::to_string(node);
|
||||
|
@ -238,7 +238,9 @@ void primitive_inst::realloc_if_needed() {
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
|
||||
|
||||
auto actual_layout = _impl_params->output_layout;
|
||||
// Update param if fake_alignment is available
|
||||
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
|
||||
auto actual_layout = updated_params.output_layout;
|
||||
OPENVINO_ASSERT(actual_layout.is_static(), "[GPU] Can't realloc mem for dynamic layout");
|
||||
|
||||
// input_layout node is supposed to always use external memory in dynamic case
|
||||
@ -267,31 +269,32 @@ void primitive_inst::update_impl() {
|
||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
|
||||
auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
|
||||
if (!_node->is_type<data>() && !(_node->is_type<mutable_data>() && _node->get_dependencies().empty())) {
|
||||
// Update param if fake_alignment is available
|
||||
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
|
||||
auto get_layout_key = [&]() -> size_t {
|
||||
size_t seed = 0;
|
||||
auto& id = _impl_params->desc->id;
|
||||
auto& id = updated_params.desc->id;
|
||||
for (size_t i = 0; i < id.size(); i++) {
|
||||
seed = hash_combine(seed, id[i]);
|
||||
}
|
||||
seed = hash_combine(seed, _node->get_unique_id());
|
||||
for (auto& layout : _impl_params->input_layouts) {
|
||||
for (auto& layout : updated_params.input_layouts) {
|
||||
for (auto& d : layout.get_shape()) {
|
||||
seed = hash_combine(seed, d);
|
||||
}
|
||||
}
|
||||
for (auto& d : _impl_params->output_layout.get_shape()) {
|
||||
for (auto& d : updated_params.output_layout.get_shape()) {
|
||||
seed = hash_combine(seed, d);
|
||||
}
|
||||
return seed;
|
||||
};
|
||||
|
||||
auto layout_key = get_layout_key();
|
||||
auto& cache = get_network().get_implementations_cache();
|
||||
if (cache.has(layout_key)) {
|
||||
_impl = cache.get(layout_key)->clone();
|
||||
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
|
||||
} else {
|
||||
_impl = _node->type()->choose_impl(*_node, *_impl_params);
|
||||
_impl = _node->type()->choose_impl(*_node, updated_params);
|
||||
auto& kernels_cache = get_network().get_kernels_cache();
|
||||
auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source());
|
||||
_impl->set_kernel_ids(kernel_ids);
|
||||
|
@ -0,0 +1,95 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "test_utils.h"
|
||||
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/fully_connected.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
|
||||
#include "fully_connected_inst.h"
|
||||
|
||||
#include "program_wrapper.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace cldnn;
|
||||
using namespace ::tests;
|
||||
|
||||
namespace fake_alignment_tests {
|
||||
|
||||
// One parameterized case for the fully connected fake-alignment test.
struct fc_fake_align_params {
|
||||
layout input_layout;  // layout given to the "input" input_layout primitive
|
||||
layout weight_layout;  // layout given to the "weight" input_layout primitive
|
||||
data_types data_type;  // passed as the last constructor argument of the fully_connected primitive
|
||||
layout expected_input_layout;  // compared with the input layout returned by get_fake_aligned_params
|
||||
layout expected_output_layout;  // compared with the output layout returned by get_fake_aligned_params
|
||||
};
|
||||
|
||||
// Value-parameterized fixture over fc_fake_align_params; it adds no members of its own.
class fully_connected_fake_align_test : public testing::TestWithParam<fc_fake_align_params> {};
|
||||
|
||||
// Builds a program containing input -> fully_connected <- weight, computes the output layout
// of the fully connected node and checks what get_fake_aligned_params() does with it:
//  - if the input or output layout is dynamic, the call is expected to throw;
//  - otherwise the returned input/output layouts must equal the expected ones of the case.
TEST_P(fully_connected_fake_align_test, fake_alignment) {
    auto test_case = GetParam();

    auto& engine = get_test_engine();

    auto in_prim = std::make_shared<input_layout>("input", test_case.input_layout);
    auto wei_prim = std::make_shared<input_layout>("weight", test_case.weight_layout);
    auto fc_prim = std::make_shared<fully_connected>("output", "input", "weight", "", test_case.data_type);

    cldnn::program prog(engine);

    auto& in_node = prog.get_or_create(in_prim);
    auto& wei_node = prog.get_or_create(wei_prim);
    auto& fc_node = prog.get_or_create(fc_prim);
    program_wrapper::add_connection(prog, in_node, fc_node);
    program_wrapper::add_connection(prog, wei_node, fc_node);

    auto fc_params = fc_node.get_kernel_impl_params();
    auto out_layouts = fully_connected_inst::calc_output_layouts<ov::PartialShape>(fc_node, *fc_node.get_kernel_impl_params());
    fc_params->output_layout = out_layouts[0];

    const bool has_dynamic_layout = fc_params->get_input_layout().is_dynamic() || fc_params->output_layout.is_dynamic();
    if (has_dynamic_layout) {
        // Fake alignment requires static layouts, so the call must fail here.
        EXPECT_THROW(fully_connected_inst::get_fake_aligned_params(*fc_params), std::exception);
        return;
    }

    auto aligned_params = fully_connected_inst::get_fake_aligned_params(*fc_params);
    ASSERT_EQ(aligned_params.get_input_layout(), test_case.expected_input_layout);
    ASSERT_EQ(aligned_params.output_layout, test_case.expected_output_layout);
}
|
||||
|
||||
// Cases for fully_connected_fake_align_test. Each entry lists, in order: input layout,
// weight layout, data type passed to the fully_connected primitive, expected input layout
// and expected output layout after get_fake_aligned_params().
INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
|
||||
testing::ValuesIn(std::vector<fc_fake_align_params>{
|
||||
// 0 input rows: the expected layouts keep 0 rows.
{
|
||||
layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
|
||||
layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout
|
||||
data_types::f16,
|
||||
layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
|
||||
layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout
|
||||
},
|
||||
// 11 input rows: the expected layouts have 16 rows; input padding is expected to be kept.
{
|
||||
layout{ov::PartialShape{11, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
|
||||
layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout
|
||||
data_types::f16,
|
||||
layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
|
||||
layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx} // fake_aligned output layout
|
||||
},
|
||||
// 133 input rows: the expected layouts have 144 rows; the 511 columns are expected unchanged.
{
|
||||
layout{ov::PartialShape{133, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
|
||||
layout{ov::PartialShape{800, 511}, data_types::i8, format::bfyx}, // weight layout
|
||||
data_types::f16,
|
||||
layout{ov::PartialShape{144, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
|
||||
layout{ov::PartialShape{144, 800}, data_types::f16, format::bfyx} // fake_aligned output layout
|
||||
},
|
||||
// Dynamic input shape: the test body takes its EXPECT_THROW branch when the layout is
// dynamic, so the two expected layouts below are placeholders and are not compared.
{
|
||||
layout{ov::PartialShape::dynamic(2), data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
|
||||
layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx}, // weight layout
|
||||
data_types::f16,
|
||||
layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx}, // fake_aligned input layout // dummy
|
||||
layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx} // fake_aligned output layout // dummy
|
||||
},
|
||||
|
||||
}));
|
||||
|
||||
} // fake_alignment_tests
|
Loading…
Reference in New Issue
Block a user