[GPU] Fake alignment for fc kernel to improve dynamic shape performance (#13887)
* Fake alignment infra + fake alignment for fc kernel to improve dynamic shape performance * Applied review * Added unit test * Removed unused function
This commit is contained in:
parent
aaefe0a256
commit
70cc27de53
@ -9,6 +9,7 @@
|
|||||||
#include "json_object.h"
|
#include "json_object.h"
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include "utils.hpp"
|
||||||
|
|
||||||
#include "matmul_shape_inference.hpp"
|
#include "matmul_shape_inference.hpp"
|
||||||
|
|
||||||
@ -165,6 +166,36 @@ std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_no
|
|||||||
return { layout{output_shapes[0], output_type, output_format} };
|
return { layout{output_shapes[0], output_type, output_format} };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Returns kernel parameters for fully_connected whose row dimension is "fake aligned" to 16.
///
/// The fc_tiled_opt kernel is optimized for row shapes aligned by 16, so the kernel is executed
/// with a rounded-up shape for better performance.
///
/// @param orig_impl_param  Parameters to derive from; input 0 and the output layout must be static
///                         (checked with OPENVINO_ASSERT).
/// @return A copy of @p orig_impl_param. When both input 0 and the output are bfyx and have at
///         least two dimensions, the second-to-last dimension of each is rounded up to a multiple
///         of 16 (data type, format and padding are preserved). Otherwise the copy is unmodified.
kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_params const& orig_impl_param) {
    constexpr size_t fake_row_alignment = 16;

    // Only read from the originals, so bind by reference instead of copying the layouts.
    const auto& orig_input_layout = orig_impl_param.get_input_layout();
    const auto& orig_output_layout = orig_impl_param.output_layout;
    OPENVINO_ASSERT(orig_input_layout.is_static() && orig_output_layout.is_static(),
                    "in/out layouts should be static for fake alignment!");

    const bool both_bfyx = orig_input_layout.format == format::bfyx && orig_output_layout.format == format::bfyx;
    if (!both_bfyx) {
        // No alignment rule for other formats. The argument is a const reference, so this is a copy
        // (std::move on it could never move).
        return orig_impl_param;
    }

    auto input_shape = orig_input_layout.get_partial_shape().to_shape();
    auto output_shape = orig_output_layout.get_partial_shape().to_shape();

    // The row index is computed as size() - 2 on an unsigned type; guard against wrap-around
    // (and the resulting out-of-bounds access) for shapes with fewer than two dimensions.
    if (input_shape.size() < 2 || output_shape.size() < 2) {
        return orig_impl_param;
    }

    const auto input_row_idx = input_shape.size() - 2;
    const auto output_row_idx = output_shape.size() - 2;
    input_shape[input_row_idx] = align_to(input_shape[input_row_idx], fake_row_alignment);
    output_shape[output_row_idx] = align_to(output_shape[output_row_idx], fake_row_alignment);

    auto updated_param = orig_impl_param;
    updated_param.input_layouts[0] = layout(ov::PartialShape(input_shape),
                                            orig_input_layout.data_type,
                                            orig_input_layout.format,
                                            orig_input_layout.data_padding);
    updated_param.output_layout = layout(ov::PartialShape(output_shape),
                                         orig_output_layout.data_type,
                                         orig_output_layout.format,
                                         orig_output_layout.data_padding);
    return updated_param;
}
|
||||||
|
|
||||||
template std::vector<layout> fully_connected_inst::calc_output_layouts<ov::PartialShape>(fully_connected_node const& node,
|
template std::vector<layout> fully_connected_inst::calc_output_layouts<ov::PartialShape>(fully_connected_node const& node,
|
||||||
const kernel_impl_params& impl_param);
|
const kernel_impl_params& impl_param);
|
||||||
|
|
||||||
|
@ -47,6 +47,7 @@ public:
|
|||||||
template<typename ShapeType>
|
template<typename ShapeType>
|
||||||
static std::vector<layout> calc_output_layouts(fully_connected_node const& /*node*/, const kernel_impl_params& impl_param);
|
static std::vector<layout> calc_output_layouts(fully_connected_node const& /*node*/, const kernel_impl_params& impl_param);
|
||||||
static layout calc_output_layout(fully_connected_node const& node, kernel_impl_params const& impl_param);
|
static layout calc_output_layout(fully_connected_node const& node, kernel_impl_params const& impl_param);
|
||||||
|
static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param);
|
||||||
static std::string to_string(fully_connected_node const& node);
|
static std::string to_string(fully_connected_node const& node);
|
||||||
|
|
||||||
typed_primitive_inst(network& network, fully_connected_node const& node);
|
typed_primitive_inst(network& network, fully_connected_node const& node);
|
||||||
|
@ -363,6 +363,10 @@ public:
|
|||||||
template<typename T>
|
template<typename T>
|
||||||
static std::vector<layout> calc_output_layouts(const typed_node& node, const kernel_impl_params& impl_param) { return {}; }
|
static std::vector<layout> calc_output_layouts(const typed_node& node, const kernel_impl_params& impl_param) { return {}; }
|
||||||
|
|
||||||
|
/// Default fake-alignment hook: returns an unmodified copy of @p orig_impl_param.
/// Primitive instance types with an alignment rule provide their own static function of this name.
static kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) {
    // The argument is a const reference, so std::move could never move from it; return a plain copy.
    return orig_impl_param;
}
|
||||||
|
|
||||||
typed_primitive_inst_base(network& network, typed_node const& node)
|
typed_primitive_inst_base(network& network, typed_node const& node)
|
||||||
: typed_primitive_inst_base(network, node, do_allocate_memory(node)) {}
|
: typed_primitive_inst_base(network, node, do_allocate_memory(node)) {}
|
||||||
|
|
||||||
|
@ -40,6 +40,7 @@ struct primitive_type {
|
|||||||
|
|
||||||
virtual layout calc_output_layout(const program_node& node, const kernel_impl_params& params) const = 0;
|
virtual layout calc_output_layout(const program_node& node, const kernel_impl_params& params) const = 0;
|
||||||
virtual std::vector<layout> calc_output_layouts(const program_node& node, const kernel_impl_params& impl_param) const = 0;
|
virtual std::vector<layout> calc_output_layouts(const program_node& node, const kernel_impl_params& impl_param) const = 0;
|
||||||
|
virtual kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) const = 0;
|
||||||
virtual std::string to_string(const program_node& node) const = 0;
|
virtual std::string to_string(const program_node& node) const = 0;
|
||||||
};
|
};
|
||||||
} // namespace cldnn
|
} // namespace cldnn
|
||||||
|
@ -71,7 +71,9 @@ struct primitive_type_base : primitive_type {
|
|||||||
|
|
||||||
return typed_primitive_inst<PType>::template calc_output_layouts<ov::PartialShape>(node, impl_param);
|
return typed_primitive_inst<PType>::template calc_output_layouts<ov::PartialShape>(node, impl_param);
|
||||||
}
|
}
|
||||||
|
/// Forwards to the static get_fake_aligned_params of the primitive instance type bound to PType.
kernel_impl_params get_fake_aligned_params(kernel_impl_params const& orig_impl_param) const override {
    using inst_type = typed_primitive_inst<PType>;
    return inst_type::get_fake_aligned_params(orig_impl_param);
}
|
||||||
std::string to_string(const cldnn::program_node& node) const override {
|
std::string to_string(const cldnn::program_node& node) const override {
|
||||||
OPENVINO_ASSERT(node.type() == this, "[GPU] primitive_type_base::to_string: primitive type mismatch");
|
OPENVINO_ASSERT(node.type() == this, "[GPU] primitive_type_base::to_string: primitive type mismatch");
|
||||||
return typed_primitive_inst<PType>::to_string(node);
|
return typed_primitive_inst<PType>::to_string(node);
|
||||||
|
@ -238,7 +238,9 @@ void primitive_inst::realloc_if_needed() {
|
|||||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
|
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
|
||||||
|
|
||||||
auto actual_layout = _impl_params->output_layout;
|
// Update param if fake_alignment is available
|
||||||
|
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
|
||||||
|
auto actual_layout = updated_params.output_layout;
|
||||||
OPENVINO_ASSERT(actual_layout.is_static(), "[GPU] Can't realloc mem for dynamic layout");
|
OPENVINO_ASSERT(actual_layout.is_static(), "[GPU] Can't realloc mem for dynamic layout");
|
||||||
|
|
||||||
// input_layout node is supposed to always use external memory in dynamic case
|
// input_layout node is supposed to always use external memory in dynamic case
|
||||||
@ -267,31 +269,32 @@ void primitive_inst::update_impl() {
|
|||||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
|
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
|
||||||
auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
|
auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
|
||||||
if (!_node->is_type<data>() && !(_node->is_type<mutable_data>() && _node->get_dependencies().empty())) {
|
if (!_node->is_type<data>() && !(_node->is_type<mutable_data>() && _node->get_dependencies().empty())) {
|
||||||
|
// Update param if fake_alignment is available
|
||||||
|
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
|
||||||
auto get_layout_key = [&]() -> size_t {
|
auto get_layout_key = [&]() -> size_t {
|
||||||
size_t seed = 0;
|
size_t seed = 0;
|
||||||
auto& id = _impl_params->desc->id;
|
auto& id = updated_params.desc->id;
|
||||||
for (size_t i = 0; i < id.size(); i++) {
|
for (size_t i = 0; i < id.size(); i++) {
|
||||||
seed = hash_combine(seed, id[i]);
|
seed = hash_combine(seed, id[i]);
|
||||||
}
|
}
|
||||||
seed = hash_combine(seed, _node->get_unique_id());
|
seed = hash_combine(seed, _node->get_unique_id());
|
||||||
for (auto& layout : _impl_params->input_layouts) {
|
for (auto& layout : updated_params.input_layouts) {
|
||||||
for (auto& d : layout.get_shape()) {
|
for (auto& d : layout.get_shape()) {
|
||||||
seed = hash_combine(seed, d);
|
seed = hash_combine(seed, d);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto& d : _impl_params->output_layout.get_shape()) {
|
for (auto& d : updated_params.output_layout.get_shape()) {
|
||||||
seed = hash_combine(seed, d);
|
seed = hash_combine(seed, d);
|
||||||
}
|
}
|
||||||
return seed;
|
return seed;
|
||||||
};
|
};
|
||||||
|
|
||||||
auto layout_key = get_layout_key();
|
auto layout_key = get_layout_key();
|
||||||
auto& cache = get_network().get_implementations_cache();
|
auto& cache = get_network().get_implementations_cache();
|
||||||
if (cache.has(layout_key)) {
|
if (cache.has(layout_key)) {
|
||||||
_impl = cache.get(layout_key)->clone();
|
_impl = cache.get(layout_key)->clone();
|
||||||
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
|
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
|
||||||
} else {
|
} else {
|
||||||
_impl = _node->type()->choose_impl(*_node, *_impl_params);
|
_impl = _node->type()->choose_impl(*_node, updated_params);
|
||||||
auto& kernels_cache = get_network().get_kernels_cache();
|
auto& kernels_cache = get_network().get_kernels_cache();
|
||||||
auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source());
|
auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source());
|
||||||
_impl->set_kernel_ids(kernel_ids);
|
_impl->set_kernel_ids(kernel_ids);
|
||||||
|
@ -0,0 +1,95 @@
|
|||||||
|
// Copyright (C) 2022 Intel Corporation
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "test_utils.h"
|
||||||
|
|
||||||
|
#include <intel_gpu/primitives/input_layout.hpp>
|
||||||
|
#include <intel_gpu/primitives/fully_connected.hpp>
|
||||||
|
#include <intel_gpu/primitives/data.hpp>
|
||||||
|
|
||||||
|
#include "fully_connected_inst.h"
|
||||||
|
|
||||||
|
#include "program_wrapper.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
using namespace cldnn;
|
||||||
|
using namespace ::tests;
|
||||||
|
|
||||||
|
namespace fake_alignment_tests {
|
||||||
|
|
||||||
|
struct fc_fake_align_params {
|
||||||
|
layout input_layout;
|
||||||
|
layout weight_layout;
|
||||||
|
data_types data_type;
|
||||||
|
layout expected_input_layout;
|
||||||
|
layout expected_output_layout;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Value-parameterized fixture: each fc_fake_align_params instance drives one fake-alignment check.
class fully_connected_fake_align_test : public testing::TestWithParam<fc_fake_align_params> {};
|
||||||
|
|
||||||
|
// Builds an input -> fully_connected <- weight program, computes the output layout, and checks
// fully_connected_inst::get_fake_aligned_params: it must throw for dynamic layouts and otherwise
// produce the expected input/output layouts.
TEST_P(fully_connected_fake_align_test, fake_alignment) {
    auto p = GetParam();

    auto& engine = get_test_engine();

    auto in_prim = std::make_shared<input_layout>("input", p.input_layout);
    auto wei_prim = std::make_shared<input_layout>("weight", p.weight_layout);
    auto fc_prim = std::make_shared<fully_connected>("output", "input", "weight", "", p.data_type);

    cldnn::program prog(engine);

    auto& in_node = prog.get_or_create(in_prim);
    auto& wei_node = prog.get_or_create(wei_prim);
    auto& fc_node = prog.get_or_create(fc_prim);
    program_wrapper::add_connection(prog, in_node, fc_node);
    program_wrapper::add_connection(prog, wei_node, fc_node);

    auto impl_param = fc_node.get_kernel_impl_params();
    auto calculated_layouts =
        fully_connected_inst::calc_output_layouts<ov::PartialShape>(fc_node, *fc_node.get_kernel_impl_params());
    impl_param->output_layout = calculated_layouts[0];

    // Fake alignment is only defined for static layouts; anything dynamic must be rejected.
    const bool has_dynamic_layout =
        impl_param->get_input_layout().is_dynamic() || impl_param->output_layout.is_dynamic();
    if (has_dynamic_layout) {
        EXPECT_THROW(fully_connected_inst::get_fake_aligned_params(*impl_param), std::exception);
        return;
    }

    auto updated_param = fully_connected_inst::get_fake_aligned_params(*impl_param);
    ASSERT_EQ(updated_param.get_input_layout(), p.expected_input_layout);
    ASSERT_EQ(updated_param.output_layout, p.expected_output_layout);
}
|
||||||
|
|
||||||
|
// Parameter sets: { input layout, weight layout, fc data type, expected input, expected output }.
INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
    testing::ValuesIn(std::vector<fc_fake_align_params>{
        // Zero rows: the expected layouts keep the row dimension at 0.
        {
            layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},
            layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx},
            data_types::f16,
            layout{ov::PartialShape{0, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},
            layout{ov::PartialShape{0, 1000}, data_types::f16, format::bfyx}
        },
        // 11 rows -> 16 rows.
        {
            layout{ov::PartialShape{11, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},
            layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx},
            data_types::f16,
            layout{ov::PartialShape{16, 1024}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},
            layout{ov::PartialShape{16, 1000}, data_types::f16, format::bfyx}
        },
        // 133 rows -> 144 rows.
        {
            layout{ov::PartialShape{133, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},
            layout{ov::PartialShape{800, 511}, data_types::i8, format::bfyx},
            data_types::f16,
            layout{ov::PartialShape{144, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},
            layout{ov::PartialShape{144, 800}, data_types::f16, format::bfyx}
        },
        // Dynamic input: the test expects a throw, so the expected layouts are dummies.
        {
            layout{ov::PartialShape::dynamic(2), data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}},
            layout{ov::PartialShape{1000, 1024}, data_types::i8, format::bfyx},
            data_types::f16,
            layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx},
            layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx}
        },
    }));
|
||||||
|
|
||||||
|
} // fake_alignment_tests
|
Loading…
Reference in New Issue
Block a user