[GPU] Fake alignment for FC kernel to improve dynamic-shape performance (#13887)

* Fake alignment infrastructure, plus fake alignment for the FC kernel, to improve dynamic-shape performance

* Applied review

* Added unittest

* Removed unused function
This commit is contained in:
Taylor Yeonbok Lee 2022-11-09 16:46:48 -08:00 committed by GitHub
parent aaefe0a256
commit 70cc27de53
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 144 additions and 7 deletions

View File

@ -9,6 +9,7 @@
#include "json_object.h"
#include <string>
#include <algorithm>
#include "utils.hpp"
#include "matmul_shape_inference.hpp"
@ -165,6 +166,36 @@ std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_no
return { layout{output_shapes[0], output_type, output_format} };
}
// Returns a copy of @p orig_impl_param whose input/output row dimension is
// "fake aligned" up to a multiple of 16. The fc_tiled_opt kernel is optimized
// for row counts aligned by 16, so executing with the aligned shape improves
// dynamic-shape performance; the padded rows are scratch only.
//
// Requires static in/out layouts (asserted). If the layouts are not both
// bfyx, or the shapes have rank < 2, the parameters are returned unchanged.
kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_params const& orig_impl_param) {
    auto orig_input_layout = orig_impl_param.get_input_layout();
    auto orig_output_layout = orig_impl_param.output_layout;
    OPENVINO_ASSERT(orig_input_layout.is_static() && orig_output_layout.is_static(),
                    "in/out layouts should be static for fake alignment!");
    if (orig_input_layout.format == format::bfyx && orig_output_layout.format == format::bfyx) {
        auto input_shape = orig_input_layout.get_partial_shape().to_shape();
        auto output_shape = orig_output_layout.get_partial_shape().to_shape();
        // Guard rank: `size() - 2` on a size_t underflows for rank < 2 shapes.
        if (input_shape.size() >= 2 && output_shape.size() >= 2) {
            auto updated_param = orig_impl_param;
            // The row dimension is the second-to-last one for both tensors.
            auto input_row_idx = input_shape.size() - 2;
            input_shape[input_row_idx] = align_to(input_shape[input_row_idx], 16);
            auto output_row_idx = output_shape.size() - 2;
            output_shape[output_row_idx] = align_to(output_shape[output_row_idx], 16);
            // Only the shapes change; dtype/format/padding are preserved.
            updated_param.input_layouts[0] = layout(ov::PartialShape(input_shape),
                                                    orig_input_layout.data_type,
                                                    orig_input_layout.format,
                                                    orig_input_layout.data_padding);
            updated_param.output_layout = layout(ov::PartialShape(output_shape),
                                                 orig_output_layout.data_type,
                                                 orig_output_layout.format,
                                                 orig_output_layout.data_padding);
            return updated_param;
        }
    }
    // No alignment applied: return a plain copy. Note: `std::move` on a
    // const& is a no-op that still copies (clang-tidy performance-move-const-arg),
    // so the previous `return std::move(orig_impl_param);` was misleading.
    return orig_impl_param;
}
template std::vector<layout> fully_connected_inst::calc_output_layouts<ov::PartialShape>(fully_connected_node const& node,
const kernel_impl_params& impl_param);

View File

@ -47,6 +47,7 @@ public:
template<typename ShapeType>
static std::vector<layout> calc_output_layouts(fully_connected_node const& /*node*/, const kernel_impl_params& impl_param);
static layout calc_output_layout(fully_connected_node const& node, kernel_impl_params const& impl_param);
static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param);
static std::string to_string(fully_connected_node const& node);
typed_primitive_inst(network& network, fully_connected_node const& node);

View File

@ -363,6 +363,10 @@ public:
template<typename T>
static std::vector<layout> calc_output_layouts(const typed_node& node, const kernel_impl_params& impl_param) { return {}; }
// Default fake-alignment hook: primitives that do not benefit from fake
// alignment simply get back an unmodified copy of their parameters.
// Note: the previous `return std::move(orig_impl_param);` was a no-op move on
// a const& that still copies (clang-tidy performance-move-const-arg).
static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) {
    return orig_impl_param;
}
typed_primitive_inst_base(network& network, typed_node const& node)
: typed_primitive_inst_base(network, node, do_allocate_memory(node)) {}

View File

@ -40,6 +40,7 @@ struct primitive_type {
virtual layout calc_output_layout(const program_node& node, const kernel_impl_params& params) const = 0;
virtual std::vector<layout> calc_output_layouts(const program_node& node, const kernel_impl_params& impl_param) const = 0;
virtual kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) const = 0;
virtual std::string to_string(const program_node& node) const = 0;
};
} // namespace cldnn

View File

@ -71,7 +71,9 @@ struct primitive_type_base : primitive_type {
return typed_primitive_inst<PType>::template calc_output_layouts<ov::PartialShape>(node, impl_param);
}
// Virtual dispatch shim: forwards to the concrete primitive's static
// get_fake_aligned_params (the typed_primitive_inst_base default returns the
// parameters unchanged; e.g. fully_connected overrides it to align rows to 16).
kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) const override {
    return typed_primitive_inst<PType>::get_fake_aligned_params(orig_impl_param);
}
std::string to_string(const cldnn::program_node& node) const override {
OPENVINO_ASSERT(node.type() == this, "[GPU] primitive_type_base::to_string: primitive type mismatch");
return typed_primitive_inst<PType>::to_string(node);

View File

@ -238,7 +238,9 @@ void primitive_inst::realloc_if_needed() {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
auto actual_layout = _impl_params->output_layout;
// Update param if fake_alignment is available
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
auto actual_layout = updated_params.output_layout;
OPENVINO_ASSERT(actual_layout.is_static(), "[GPU] Can't realloc mem for dynamic layout");
// input_layout node is supposed to always use external memory in dynamic case
@ -267,31 +269,32 @@ void primitive_inst::update_impl() {
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
if (!_node->is_type<data>() && !(_node->is_type<mutable_data>() && _node->get_dependencies().empty())) {
// Update param if fake_alignment is available
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
auto get_layout_key = [&]() -> size_t {
size_t seed = 0;
auto& id = _impl_params->desc->id;
auto& id = updated_params.desc->id;
for (size_t i = 0; i < id.size(); i++) {
seed = hash_combine(seed, id[i]);
}
seed = hash_combine(seed, _node->get_unique_id());
for (auto& layout : _impl_params->input_layouts) {
for (auto& layout : updated_params.input_layouts) {
for (auto& d : layout.get_shape()) {
seed = hash_combine(seed, d);
}
}
for (auto& d : _impl_params->output_layout.get_shape()) {
for (auto& d : updated_params.output_layout.get_shape()) {
seed = hash_combine(seed, d);
}
return seed;
};
auto layout_key = get_layout_key();
auto& cache = get_network().get_implementations_cache();
if (cache.has(layout_key)) {
_impl = cache.get(layout_key)->clone();
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
} else {
_impl = _node->type()->choose_impl(*_node, *_impl_params);
_impl = _node->type()->choose_impl(*_node, updated_params);
auto& kernels_cache = get_network().get_kernels_cache();
auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source());
_impl->set_kernel_ids(kernel_ids);

View File

@ -0,0 +1,95 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/data.hpp>
#include "fully_connected_inst.h"
#include "program_wrapper.h"
#include <cmath>
#include <algorithm>
using namespace cldnn;
using namespace ::tests;
namespace fake_alignment_tests {
// One parameterized-test case: the FC node's input/weight layouts and the
// layouts expected back from fully_connected_inst::get_fake_aligned_params().
struct fc_fake_align_params {
    layout input_layout;            // activations layout fed to the FC node
    layout weight_layout;           // weights layout of the FC node
    data_types data_type;           // requested output data type of the FC primitive
    layout expected_input_layout;   // input layout after fake alignment (ignored for dynamic cases)
    layout expected_output_layout;  // output layout after fake alignment (ignored for dynamic cases)
};
// Parameterized fixture; cases are supplied via INSTANTIATE_TEST_SUITE_P below.
class fully_connected_fake_align_test : public testing::TestWithParam<fc_fake_align_params> {};

TEST_P(fully_connected_fake_align_test, fake_alignment) {
    auto p = GetParam();

    auto& engine = get_test_engine();

    // Build a minimal program graph: input -> fully_connected <- weight.
    auto input_layout_prim = std::make_shared<input_layout>("input", p.input_layout);
    auto weight_layout_prim = std::make_shared<input_layout>("weight", p.weight_layout);
    auto fully_connected_prim = std::make_shared<fully_connected>("output", "input", "weight", "", p.data_type);

    cldnn::program prog(engine);

    auto& input_node = prog.get_or_create(input_layout_prim);
    auto& weight_node = prog.get_or_create(weight_layout_prim);
    auto& fully_connected_node = prog.get_or_create(fully_connected_prim);
    program_wrapper::add_connection(prog, input_node, fully_connected_node);
    program_wrapper::add_connection(prog, weight_node, fully_connected_node);

    // Pre-compute the FC output layout so get_fake_aligned_params sees it.
    auto impl_param = fully_connected_node.get_kernel_impl_params();
    impl_param->output_layout = fully_connected_inst::calc_output_layouts<ov::PartialShape>(fully_connected_node, *fully_connected_node.get_kernel_impl_params())[0];

    if (impl_param->get_input_layout().is_dynamic() || impl_param->output_layout.is_dynamic()) {
        // Fake alignment asserts on dynamic layouts; expect it to throw.
        EXPECT_THROW(fully_connected_inst::get_fake_aligned_params(*impl_param), std::exception);
    } else {
        // Static case: the returned params must carry the 16-aligned shapes.
        auto updated_param = fully_connected_inst::get_fake_aligned_params(*impl_param);
        ASSERT_EQ(updated_param.get_input_layout(), p.expected_input_layout);
        ASSERT_EQ(updated_param.output_layout, p.expected_output_layout);
    }
}
// Cases: row count already 0 (unchanged), 11 -> 16, 133 -> 144, and a dynamic
// input (expected layouts are dummies; the test expects a throw instead).
INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
    testing::ValuesIn(std::vector<fc_fake_align_params>{
        {
            layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
            layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx},                     // weight layout
            data_types::f16,
            layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
            layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx}                        // fake_aligned output layout
        },
        {
            layout{ov::PartialShape{11, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
            layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx},                      // weight layout
            data_types::f16,
            layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
            layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx}                        // fake_aligned output layout
        },
        {
            layout{ov::PartialShape{133, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
            layout{ov::PartialShape{800, 511}, data_types::i8, format::bfyx},                        // weight layout
            data_types::f16,
            layout{ov::PartialShape{144, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
            layout{ov::PartialShape{144, 800}, data_types::f16, format::bfyx}                        // fake_aligned output layout
        },
        {
            layout{ov::PartialShape::dynamic(2), data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
            layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx},                        // weight layout
            data_types::f16,
            layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx},  // fake_aligned input layout // dummy
            layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx}  // fake_aligned output layout // dummy
        },
    }));
} // fake_alignment_tests