[GPU] Improve fake alignment logic and fix missed zero point value (#21530)

* [GPU] Improve fake alignment logic and fix missed zero point value

* Add tests
Sergey Shlyapnikov 2023-12-08 16:20:35 +04:00 committed by GitHub
parent c565bf0c37
commit 0ad0a6bfaa
6 changed files with 125 additions and 17 deletions


@@ -174,15 +174,30 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
     auto orig_output_layout = orig_impl_param.get_output_layout();
     OPENVINO_ASSERT(orig_input_layout.is_static() && orig_output_layout.is_static(),
                     "in/out layouts should be static for fake alignment!");
-    if (orig_input_layout.format == format::bfyx && orig_output_layout.format == format::bfyx) {
+    auto input_shape = orig_input_layout.get_partial_shape().to_shape();
+    auto output_shape = orig_output_layout.get_partial_shape().to_shape();
+
+    // Allow padding only for feature and outermost dimension
+    auto can_apply_fake_alignment = true;
+    if (input_shape.size() == 3)
+        can_apply_fake_alignment &= orig_input_layout.data_padding.lower_size().sizes()[1] == 0 &&
+                                    orig_input_layout.data_padding.upper_size().sizes()[1] == 0;
+
+    if (output_shape.size() == 3)
+        can_apply_fake_alignment &= orig_output_layout.data_padding.lower_size().sizes()[1] == 0 &&
+                                    orig_output_layout.data_padding.upper_size().sizes()[1] == 0;
+
+    if (orig_input_layout.format == format::bfyx && orig_output_layout.format == format::bfyx && can_apply_fake_alignment) {
         auto updated_param = orig_impl_param;
-        auto input_shape = orig_input_layout.get_partial_shape().to_shape();
-        auto input_row_idx = input_shape.size() - 2;
-        auto output_shape = orig_output_layout.get_partial_shape().to_shape();
-        auto output_row_idx = output_shape.size() - 2;
+        auto batch_size = std::accumulate(input_shape.begin(),
+                                          input_shape.end() - 1,
+                                          size_t{1},
+                                          std::multiplies<size_t>());
         // Vector by matrix multiplication sometimes works slower if we align it
-        if (input_shape[input_row_idx] == 1 && output_shape[output_row_idx] == 1 && input_shape[input_shape.size() - 1] >= 1024) {
+        if (batch_size == 1 && input_shape.back() >= 1024) {
            return std::move(orig_impl_param);
        }
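For readers skimming the diff: instead of aligning the second-to-last ("row") dimension of input and output separately, the new code folds every dimension except the innermost one into a single batch count and applies the vector-by-matrix early-out to that product. A minimal standalone sketch of that computation (hypothetical helper name, not the plugin code itself):

    #include <functional>
    #include <numeric>
    #include <vector>

    // Product of all dimensions except the innermost (K) one,
    // e.g. {2, 55, 511} -> 2 * 55 = 110.
    size_t flattened_batch(const std::vector<size_t>& shape) {
        return std::accumulate(shape.begin(), shape.end() - 1,
                               size_t{1}, std::multiplies<size_t>());
    }

With this, a shape like {1, 1, 2048} yields a batch of 1 with K >= 1024 and skips fake alignment, matching the old single-row check while also covering 2D inputs.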
@@ -190,12 +205,15 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
         if (orig_impl_param.dev_type == cldnn::device_type::integrated_gpu) {
             auto weights_layout_dt = orig_impl_param.weights_layout.value().data_type;
             auto is_4bit = weights_layout_dt == data_types::i4 || weights_layout_dt == data_types::u4;
-            auto is_extra_alignment_needed = output_shape[output_row_idx] >= 256;
+            auto is_extra_alignment_needed = batch_size >= 256;
             fake_align_base = is_4bit && is_extra_alignment_needed ? 64 : 16;
         }
-        input_shape[input_row_idx] = align_to(input_shape[input_row_idx], fake_align_base);
-        output_shape[output_row_idx] = align_to(output_shape[output_row_idx], fake_align_base);
+        std::fill(input_shape.begin(), input_shape.end() - 1, 1);
+        std::fill(output_shape.begin(), output_shape.end() - 1, 1);
+        input_shape[0] = align_to(batch_size, fake_align_base);
+        output_shape[0] = align_to(batch_size, fake_align_base);
         updated_param.input_layouts[0] = layout(ov::PartialShape(input_shape),
                                                 orig_input_layout.data_type,
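The aligned shapes thus collapse to {aligned_batch, 1, ..., K}. Assuming align_to rounds a value up to the nearest multiple of the base, which is consistent with the expected layouts in the tests below, it behaves like this hypothetical one-liner:

    // Round `value` up to the next multiple of `base`,
    // e.g. align_to(110, 16) == 112, align_to(55, 8) == 56.
    size_t align_to(size_t value, size_t base) { return (value + base - 1) / base * base; }

On iGPU the base grows to 64 only for 4-bit weights with a flattened batch of at least 256; otherwise it stays at 16.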


@@ -470,6 +470,8 @@ void prepare_primitive_fusing::fuse_bias(program &p) {
             fc_with_bias_prim->compressed_weights = true;
             fc_with_bias_prim->decompression_scale = desc->decompression_scale;
             fc_with_bias_prim->decompression_zero_point = desc->decompression_zero_point;
+            if (desc->decompression_zero_point_scalar.has_value())
+                fc_with_bias_prim->decompression_zero_point_scalar = desc->decompression_zero_point_scalar.value();
         }
         auto& new_fc_node = p.get_or_create(fc_with_bias_prim);
         fuse_bias_f(fc, new_fc_node, bias_node, eltw_node);
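The functional fix is small: fuse_bias rebuilds the fully connected primitive, and previously the optional scalar zero point was not carried over, so compressed-weights models using a scalar zero point lost it after bias fusing. A reduced sketch of the pattern (struct and field names simplified from the real primitives):

    #include <optional>

    struct fc_desc {
        std::optional<float> decompression_zero_point_scalar;
    };

    // When a descriptor clone is built field by field, an std::optional
    // member must be copied explicitly; forgetting it silently drops the
    // zero point, which is the bug fixed above.
    void copy_zero_point(const fc_desc& src, fc_desc& dst) {
        if (src.decompression_zero_point_scalar.has_value())
            dst.decompression_zero_point_scalar = src.decompression_zero_point_scalar.value();
    }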


@@ -38,9 +38,10 @@ TEST_P(fully_connected_fake_align_test, fake_alignment) {
     auto& engine = get_test_engine();
+    auto input_size = p.input_layout.get_partial_shape().size();
     auto input_layout_prim = std::make_shared<input_layout>("input", p.input_layout);
     auto weight_layout_prim = std::make_shared<input_layout>("weight", p.weight_layout);
-    auto fully_connected_prim = std::make_shared<fully_connected>("output", input_info("input"), "weight", "", p.data_type);
+    auto fully_connected_prim = std::make_shared<fully_connected>("output", input_info("input"), "weight", "", p.data_type, padding(), input_size);
     cldnn::program prog(engine);
@@ -106,7 +107,51 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
         layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx},   // fake_aligned input layout_dgpu // dummy
         layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu // dummy
     },
+    {
+        layout{ov::PartialShape{1, 55, 511}, data_types::f16, format::bfyx},   // input_layout
+        layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},     // weight layout
+        data_types::f16,
+        layout{ov::PartialShape{64, 1, 511}, data_types::f16, format::bfyx},   // fake_aligned input layout_igpu
+        layout{ov::PartialShape{64, 1, 800}, data_types::f16, format::bfyx},   // fake_aligned output layout_igpu
+        layout{ov::PartialShape{56, 1, 511}, data_types::f16, format::bfyx},   // fake_aligned input layout_dgpu
+        layout{ov::PartialShape{56, 1, 800}, data_types::f16, format::bfyx}    // fake_aligned output layout_dgpu
+    },
+    {
+        layout{ov::PartialShape{2, 55, 511}, data_types::f16, format::bfyx},   // input_layout
+        layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},     // weight layout
+        data_types::f16,
+        layout{ov::PartialShape{112, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_igpu
+        layout{ov::PartialShape{112, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+        layout{ov::PartialShape{112, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
+        layout{ov::PartialShape{112, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+    },
+    {
+        layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx},   // input_layout
+        layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},     // weight layout
+        data_types::f16,
+        layout{ov::PartialShape{64, 1, 511}, data_types::f16, format::bfyx},   // fake_aligned input layout_igpu
+        layout{ov::PartialShape{64, 1, 800}, data_types::f16, format::bfyx},   // fake_aligned output layout_igpu
+        layout{ov::PartialShape{56, 1, 511}, data_types::f16, format::bfyx},   // fake_aligned input layout_dgpu
+        layout{ov::PartialShape{56, 1, 800}, data_types::f16, format::bfyx}    // fake_aligned output layout_dgpu
+    },
+    {
+        layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{2,0,1,0}, 0}},  // input_layout
+        layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},                           // weight layout
+        data_types::f16,
+        layout{ov::PartialShape{64, 1, 511}, data_types::f16, format::bfyx, padding{{2,0,1,0}, 0}},  // fake_aligned input layout_igpu
+        layout{ov::PartialShape{64, 1, 800}, data_types::f16, format::bfyx},                         // fake_aligned output layout_igpu
+        layout{ov::PartialShape{56, 1, 511}, data_types::f16, format::bfyx, padding{{2,0,1,0}, 0}},  // fake_aligned input layout_dgpu
+        layout{ov::PartialShape{56, 1, 800}, data_types::f16, format::bfyx}                          // fake_aligned output layout_dgpu
+    },
+    {
+        layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{0,1,1,0}, 0}},  // input_layout
+        layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},                           // weight layout
+        data_types::f16,
+        layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{0,1,1,0}, 0}},  // fake_aligned input layout_igpu
+        layout{ov::PartialShape{55, 1, 800}, data_types::f16, format::bfyx},                         // fake_aligned output layout_igpu
+        layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{0,1,1,0}, 0}},  // fake_aligned input layout_dgpu
+        layout{ov::PartialShape{55, 1, 800}, data_types::f16, format::bfyx}                          // fake_aligned output layout_dgpu
+    },
 }));
} // fake_alignment_tests
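The expected layouts follow from the new logic. All non-innermost dimensions flatten into the batch: {1, 55, 511} gives 55, {2, 55, 511} gives 110, {55, 1, 511} gives 55. The expectations are consistent with a fake_align_base of 16 on iGPU and 8 on dGPU for these f16 cases: align_to(55, 16) = 64, align_to(55, 8) = 56, and align_to(110, 16) = align_to(110, 8) = 112. Batch-axis padding ({2,0,1,0}) is tolerated, while the final case with feature-axis padding ({0,1,1,0}) fails the can_apply_fake_alignment check and keeps its original {55, 1, ...} shapes.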


@@ -71,6 +71,17 @@ public:
         auto bias_shape = p.out_shape.size() == 3 ? ov::PartialShape{1, 1, p.out_shape[2]} : ov::PartialShape{1, p.out_shape[1]};
         return layout{ bias_shape, p.default_type, p.default_format };
     }
+
+    layout get_scale_layout(fully_connected_test_params& p, size_t group_size = 1) {
+        if (p.weights_type == data_types::u8 || p.weights_type == data_types::i8) {
+            auto scale_shape = p.out_shape.size() == 3 ? ov::PartialShape{p.out_shape[2]} : ov::PartialShape{p.out_shape[1]};
+            return layout{ scale_shape, p.default_type, p.default_format };
+        } else {
+            auto groups_num = p.in_shape.size() == 3 ? p.in_shape[2] / group_size : p.in_shape[1] / group_size;
+            auto scale_shape = p.out_shape.size() == 3 ? ov::PartialShape{p.out_shape[2], groups_num} : ov::PartialShape{p.out_shape[1], groups_num};
+            return layout{ scale_shape, p.default_type, p.default_format };
+        }
+    }
 };
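For grouped (non-8-bit) weights, the scale tensor gets one column per group along the input dimension. For example, with p.in_shape {1, 128}, p.out_shape {1, 128}, and group_size 128, as in the int4 case added below, groups_num = 128 / 128 = 1 and the scale layout is {128, 1}: one scale per output channel, one group.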
@@ -165,6 +176,8 @@ public:
 #define CASE_FC_FP16_3D_1 { 2, 32, 3 }, { 2, 32, 16 }, { 16, 3, 1 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
 #define CASE_FC_FP16_3D_2 { 1, 1, 3 }, { 1, 1, 32 }, { 32, 3, 1 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_FP16_INT4_COMP_1 { 1, 128 }, { 1, 128 }, { 128, 128 }, data_types::f16, format::bfyx, data_types::u4, format::oiyx, data_types::f16, format::bfyx
+
 /* ----------------------------------------------------------------------------------------------------- */
 /* ---------------------------------------- FC cases --------------------------------------------------- */
 /* ----------------------------------------------------------------------------------------------------- */
@@ -276,6 +289,33 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp32_bias_dynamic, ::testing::ValuesIn(
     fully_connected_test_params{ DYN_CASE_FC_FP32_3D_3, 2, 3 },
 }));
+
+class fc_compressed_int8_bias_dynamic : public FullyConnectedFusingTest {};
+TEST_P(fc_compressed_int8_bias_dynamic, basic) {
+    auto p = GetParam();
+    auto test_input_layout = get_input_layout(p);
+    auto dynamic_input_layout = layout{ov::PartialShape::dynamic(test_input_layout.get_partial_shape().rank()), test_input_layout.data_type, test_input_layout.format};
+
+    auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, padding(), get_output_dim_size(p), get_input_weights_rank(p));
+    fc_prim.decompression_zero_point_scalar = 8.0f;
+
+    create_topologies(
+        input_layout("input", dynamic_input_layout),
+        data("weights", get_mem(get_weights_layout(p))),
+        data("scale", get_mem(get_scale_layout(p, 128))),
+        data("bias", get_mem(get_bias_layout(p))),
+        fc_prim,
+        eltwise("bias_add", { input_info("fc_prim"), input_info("bias") }, eltwise_mode::sum),
+        reorder("reorder_bfyx", input_info("bias_add"), p.default_format, data_types::f32)
+    );
+
+    tolerance = 1e-5f;
+    execute(p, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_compressed_int8_bias_dynamic, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
+    fully_connected_test_params{ CASE_FC_FP16_INT4_COMP_1, 2, 3 },
+}));
+
 class fc_int8_eltwise : public FullyConnectedFusingTest {};
 TEST_P(fc_int8_eltwise, basic) {
     auto p = GetParam();
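A note on the new test's constant: 8.0f is a plausible zero point for u4 weights (half of the 16-value range, analogous to 128 for u8). With the dropped-scalar bug, the fused network would dequantize weights without this offset, so the test's accuracy comparison against the unfused reference catches the regression.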


@@ -117,6 +117,9 @@ public:
         if (l.data_type == data_types::i8 || l.data_type == data_types::u8) {
             VF<uint8_t> rnd_vec = rg.generate_random_1d<uint8_t>(s.count(), min_random, max_random);
             set_values(prim, rnd_vec);
+        } else if (l.data_type == data_types::i4 || l.data_type == data_types::u4) {
+            VF<int8_t> rnd_vec = rg.generate_random_1d<int8_t>(l.bytes_count(), min_random, max_random);
+            set_values(prim, rnd_vec);
         } else if (l.data_type == data_types::f16) {
             VF<ov::float16> rnd_vec = rg.generate_random_1d<ov::float16>(s.count(), -1, 1);
             set_values(prim, rnd_vec);
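Since i4/u4 pack two elements per byte, the random vector here is sized by l.bytes_count() rather than the element count s.count(); each generated int8 byte therefore fills two packed nibbles at once.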


@@ -50,11 +50,11 @@ public:
     void test_fc_basic(bool is_caching_test) {
         auto& engine = get_test_engine();
-        const int32_t b = 1, in_f = 128, in_x = 1, in_y = 1, out_f = 65;
+        const int32_t b = 1, in_f = 128, out_f = 65;
-        auto input_prim = engine.allocate_memory({ { b, in_f, in_y, in_x }, data_types::f32, format::bfyx });
-        auto weights_prim = engine.allocate_memory({ { out_f, in_f, in_y, in_x }, data_types::f32, format::bfyx });
-        auto bias_prim = engine.allocate_memory({ { 1, 1, out_f, 1 }, data_types::f32, format::bfyx });
+        auto input_prim = engine.allocate_memory({ { b, in_f }, data_types::f32, format::bfyx });
+        auto weights_prim = engine.allocate_memory({ { out_f, in_f }, data_types::f32, format::bfyx });
+        auto bias_prim = engine.allocate_memory({ { out_f }, data_types::f32, format::bfyx });
         const auto key_prim_id = "fc";
         topology topology(
@@ -72,10 +72,10 @@ public:
         const auto params_hash = primitve->type->get_fake_aligned_params(*prim_inst->get_impl_params()).hash();
         if (!engine.get_device_info().supports_immad) {
             ASSERT_EQ(primitive_hash, 14259723886449306729UL);
-            ASSERT_EQ(params_hash, 1637150664489130388UL);
+            ASSERT_EQ(params_hash, 3365957578641948513UL);
         } else {
             ASSERT_EQ(primitive_hash, 14259723886449306729UL);
-            ASSERT_EQ(params_hash, 6343702278017463925UL);
+            ASSERT_EQ(params_hash, 9831190959346679696UL);
         }
     }