[GPU] Improve fake alignment logic and fix missed zero point value (#21530)
* [GPU] Improve fake alignment logic and fix missed zero point value
* Add tests
parent c565bf0c37
commit 0ad0a6bfaa
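In outline, the first diff below replaces per-row fake alignment with batch flattening: all leading dimensions are collapsed into a single batch, the collapsed dimensions are set to 1, and the outermost dimension is padded up to the alignment base; alignment is skipped entirely when a 3D in/out layout carries feature-dimension padding. A minimal standalone sketch of that shape rule, assuming only what the diff shows (fake_align is a hypothetical name for illustration; align_to mirrors the helper used in the diff):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Round value up to the next multiple of `multiple` (mirrors align_to in the diff).
static size_t align_to(size_t value, size_t multiple) {
    return (value + multiple - 1) / multiple * multiple;
}

// Hypothetical stand-in showing the new rule: collapse all leading dims into one
// batch, set them to 1, and pad the outermost dim up to the alignment base.
static std::vector<size_t> fake_align(std::vector<size_t> shape, size_t base) {
    const size_t batch = std::accumulate(shape.begin(), shape.end() - 1,
                                         size_t{1}, std::multiplies<size_t>());
    std::fill(shape.begin(), shape.end() - 1, size_t{1});
    shape[0] = align_to(batch, base);  // e.g. {2, 55, 511} with base 16 -> {112, 1, 511}
    return shape;
}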
@@ -174,15 +174,30 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
     auto orig_output_layout = orig_impl_param.get_output_layout();
     OPENVINO_ASSERT(orig_input_layout.is_static() && orig_output_layout.is_static(),
                     "in/out layouts should be static for fake alignment!");
-    if (orig_input_layout.format == format::bfyx && orig_output_layout.format == format::bfyx) {
+
+    auto input_shape = orig_input_layout.get_partial_shape().to_shape();
+    auto output_shape = orig_output_layout.get_partial_shape().to_shape();
+
+    // Allow padding only for feature and outermost dimension
+    auto can_apply_fake_alignment = true;
+    if (input_shape.size() == 3)
+        can_apply_fake_alignment &= orig_input_layout.data_padding.lower_size().sizes()[1] == 0 &&
+                                    orig_input_layout.data_padding.upper_size().sizes()[1] == 0;
+
+    if (output_shape.size() == 3)
+        can_apply_fake_alignment &= orig_output_layout.data_padding.lower_size().sizes()[1] == 0 &&
+                                    orig_output_layout.data_padding.upper_size().sizes()[1] == 0;
+
+    if (orig_input_layout.format == format::bfyx && orig_output_layout.format == format::bfyx && can_apply_fake_alignment) {
         auto updated_param = orig_impl_param;
-        auto input_shape = orig_input_layout.get_partial_shape().to_shape();
-        auto input_row_idx = input_shape.size() - 2;
-        auto output_shape = orig_output_layout.get_partial_shape().to_shape();
-        auto output_row_idx = output_shape.size() - 2;
+
+        auto batch_size = std::accumulate(input_shape.begin(),
+                                          input_shape.end() - 1,
+                                          size_t{1},
+                                          std::multiplies<size_t>());

         // Vector by matrix multiplication sometimes works slower if we align it
-        if (input_shape[input_row_idx] == 1 && output_shape[output_row_idx] == 1 && input_shape[input_shape.size() - 1] >= 1024) {
+        if (batch_size == 1 && input_shape.back() >= 1024) {
             return std::move(orig_impl_param);
         }
@@ -190,12 +205,15 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
         if (orig_impl_param.dev_type == cldnn::device_type::integrated_gpu) {
             auto weights_layout_dt = orig_impl_param.weights_layout.value().data_type;
             auto is_4bit = weights_layout_dt == data_types::i4 || weights_layout_dt == data_types::u4;
-            auto is_extra_alignment_needed = output_shape[output_row_idx] >= 256;
+            auto is_extra_alignment_needed = batch_size >= 256;
             fake_align_base = is_4bit && is_extra_alignment_needed ? 64 : 16;
         }

-        input_shape[input_row_idx] = align_to(input_shape[input_row_idx], fake_align_base);
-        output_shape[output_row_idx] = align_to(output_shape[output_row_idx], fake_align_base);
+        std::fill(input_shape.begin(), input_shape.end() - 1, 1);
+        std::fill(output_shape.begin(), output_shape.end() - 1, 1);
+
+        input_shape[0] = align_to(batch_size, fake_align_base);
+        output_shape[0] = align_to(batch_size, fake_align_base);

         updated_param.input_layouts[0] = layout(ov::PartialShape(input_shape),
                                                 orig_input_layout.data_type,
@@ -470,6 +470,8 @@ void prepare_primitive_fusing::fuse_bias(program &p) {
            fc_with_bias_prim->compressed_weights = true;
            fc_with_bias_prim->decompression_scale = desc->decompression_scale;
            fc_with_bias_prim->decompression_zero_point = desc->decompression_zero_point;
+           if (desc->decompression_zero_point_scalar.has_value())
+               fc_with_bias_prim->decompression_zero_point_scalar = desc->decompression_zero_point_scalar.value();
        }
        auto& new_fc_node = p.get_or_create(fc_with_bias_prim);
        fuse_bias_f(fc, new_fc_node, bias_node, eltw_node);
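This hunk is the "missed zero point value" half of the commit: when fuse_bias rebuilds a compressed fully_connected as a bias-fused primitive, the optional scalar zero point was previously not carried over, so the rebuilt primitive decompressed weights without its offset. A hedged sketch of the propagation pattern (fc_desc is a hypothetical stand-in for the relevant fields, not the cldnn primitive itself):

#include <optional>

// Hypothetical stand-in for the zero-point fields of the FC primitive descriptor.
struct fc_desc {
    bool compressed_weights = false;
    std::optional<float> decompression_zero_point_scalar;
};

// Mirrors the added lines: forward the scalar zero point only when it was set,
// so the rebuilt bias-fused primitive keeps the same decompression offset.
void propagate_zero_point(const fc_desc& src, fc_desc& dst) {
    if (src.decompression_zero_point_scalar.has_value())
        dst.decompression_zero_point_scalar = src.decompression_zero_point_scalar.value();
}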
@@ -38,9 +38,10 @@ TEST_P(fully_connected_fake_align_test, fake_alignment) {

     auto& engine = get_test_engine();

+    auto input_size = p.input_layout.get_partial_shape().size();
     auto input_layout_prim = std::make_shared<input_layout>("input", p.input_layout);
     auto weight_layout_prim = std::make_shared<input_layout>("weight", p.weight_layout);
-    auto fully_connected_prim = std::make_shared<fully_connected>("output", input_info("input"), "weight", "", p.data_type);
+    auto fully_connected_prim = std::make_shared<fully_connected>("output", input_info("input"), "weight", "", p.data_type, padding(), input_size);

     cldnn::program prog(engine);
@@ -106,7 +107,51 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
        layout{ov::PartialShape{-1, -1}, data_types::i8, format::bfyx},   // fake_aligned input layout_dgpu // dummy
        layout{ov::PartialShape{-1, -1}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu // dummy
    },
+   {
+       layout{ov::PartialShape{1, 55, 511}, data_types::f16, format::bfyx},  // input_layout
+       layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},    // weight layout
+       data_types::f16,
+       layout{ov::PartialShape{64, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_igpu
+       layout{ov::PartialShape{64, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+       layout{ov::PartialShape{56, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
+       layout{ov::PartialShape{56, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+   },
+   {
+       layout{ov::PartialShape{2, 55, 511}, data_types::f16, format::bfyx},  // input_layout
+       layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},    // weight layout
+       data_types::f16,
+       layout{ov::PartialShape{112, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_igpu
+       layout{ov::PartialShape{112, 1, 800}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
+       layout{ov::PartialShape{112, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_dgpu
+       layout{ov::PartialShape{112, 1, 800}, data_types::f16, format::bfyx}  // fake_aligned output layout_dgpu
+   },
+   {
+       layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx},  // input_layout
+       layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},    // weight layout
+       data_types::f16,
+       layout{ov::PartialShape{64, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_igpu
+       layout{ov::PartialShape{64, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+       layout{ov::PartialShape{56, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
+       layout{ov::PartialShape{56, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+   },
+   {
+       layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{2,0,1,0}, 0}},  // input_layout
+       layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},    // weight layout
+       data_types::f16,
+       layout{ov::PartialShape{64, 1, 511}, data_types::f16, format::bfyx, padding{{2,0,1,0}, 0}},  // fake_aligned input layout_igpu
+       layout{ov::PartialShape{64, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+       layout{ov::PartialShape{56, 1, 511}, data_types::f16, format::bfyx, padding{{2,0,1,0}, 0}},  // fake_aligned input layout_dgpu
+       layout{ov::PartialShape{56, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+   },
+   {
+       layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{0,1,1,0}, 0}},  // input_layout
+       layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},    // weight layout
+       data_types::f16,
+       layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{0,1,1,0}, 0}},  // fake_aligned input layout_igpu
+       layout{ov::PartialShape{55, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+       layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{0,1,1,0}, 0}},  // fake_aligned input layout_dgpu
+       layout{ov::PartialShape{55, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+   },
 }));

 } // fake_alignment_tests
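The expected fake-aligned shapes in the new test cases follow directly from the batch-flattening rule. A small self-checking sketch of the arithmetic, assuming an alignment base of 16 on iGPU and 8 on dGPU for these f16 cases (the bases are inferred from the expected values, not stated in the diff); the final case, padded on the feature dimension, stays at {55, 1, ...} because can_apply_fake_alignment rejects it:

#include <cassert>
#include <cstddef>

static size_t align_to(size_t v, size_t m) { return (v + m - 1) / m * m; }

int main() {
    // {1, 55, 511}: batch = 1 * 55 = 55
    assert(align_to(55, 16) == 64);    // iGPU -> {64, 1, 511} / {64, 1, 800}
    assert(align_to(55, 8) == 56);     // dGPU -> {56, 1, 511} / {56, 1, 800}
    // {2, 55, 511}: batch = 2 * 55 = 110
    assert(align_to(110, 16) == 112);  // iGPU -> {112, 1, 511} / {112, 1, 800}
    assert(align_to(110, 8) == 112);   // dGPU -> {112, 1, 511} / {112, 1, 800}
    return 0;
}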
@@ -71,6 +71,17 @@ public:
        auto bias_shape = p.out_shape.size() == 3 ? ov::PartialShape{1, 1, p.out_shape[2]} : ov::PartialShape{1, p.out_shape[1]};
        return layout{ bias_shape, p.default_type, p.default_format };
    }

+   layout get_scale_layout(fully_connected_test_params& p, size_t group_size = 1) {
+       if (p.weights_type == data_types::u8 || p.weights_type == data_types::i8) {
+           auto scale_shape = p.out_shape.size() == 3 ? ov::PartialShape{p.out_shape[2]} : ov::PartialShape{p.out_shape[1]};
+           return layout{ scale_shape, p.default_type, p.default_format };
+       } else {
+           auto groups_num = p.in_shape.size() == 3 ? p.in_shape[2] / group_size : p.in_shape[1] / group_size;
+           auto scale_shape = p.out_shape.size() == 3 ? ov::PartialShape{p.out_shape[2], groups_num} : ov::PartialShape{p.out_shape[1], groups_num};
+           return layout{ scale_shape, p.default_type, p.default_format };
+       }
+   }
 };
@@ -165,6 +176,8 @@ public:
 #define CASE_FC_FP16_3D_1 { 2, 32, 3 }, { 2, 32, 16 }, { 16, 3, 1 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx
 #define CASE_FC_FP16_3D_2 { 1, 1, 3 }, { 1, 1, 32 }, { 32, 3, 1 }, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f32, format::bfyx

+#define CASE_FC_FP16_INT4_COMP_1 { 1, 128 }, { 1, 128 }, { 128, 128 }, data_types::f16, format::bfyx, data_types::u4, format::oiyx, data_types::f16, format::bfyx
+
 /* ----------------------------------------------------------------------------------------------------- */
 /* ---------------------------------------- FC cases --------------------------------------------------- */
 /* ----------------------------------------------------------------------------------------------------- */
@@ -276,6 +289,33 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp32_bias_dynamic, ::testing::ValuesIn(
    fully_connected_test_params{ DYN_CASE_FC_FP32_3D_3, 2, 3 },
 }));

+class fc_compressed_int8_bias_dynamic : public FullyConnectedFusingTest {};
+TEST_P(fc_compressed_int8_bias_dynamic, basic) {
+    auto p = GetParam();
+    auto test_input_layout = get_input_layout(p);
+    auto dynamic_input_layout = layout{ov::PartialShape::dynamic(test_input_layout.get_partial_shape().rank()), test_input_layout.data_type, test_input_layout.format};
+
+    auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, padding(), get_output_dim_size(p), get_input_weights_rank(p));
+    fc_prim.decompression_zero_point_scalar = 8.0f;
+
+    create_topologies(
+        input_layout("input", dynamic_input_layout),
+        data("weights", get_mem(get_weights_layout(p))),
+        data("scale", get_mem(get_scale_layout(p, 128))),
+        data("bias", get_mem(get_bias_layout(p))),
+        fc_prim,
+        eltwise("bias_add", { input_info("fc_prim"), input_info("bias") }, eltwise_mode::sum),
+        reorder("reorder_bfyx", input_info("bias_add"), p.default_format, data_types::f32)
+    );
+
+    tolerance = 1e-5f;
+    execute(p, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_compressed_int8_bias_dynamic, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
+    fully_connected_test_params{ CASE_FC_FP16_INT4_COMP_1, 2, 3 },
+}));
+
 class fc_int8_eltwise : public FullyConnectedFusingTest {};
 TEST_P(fc_int8_eltwise, basic) {
    auto p = GetParam();
@@ -117,6 +117,9 @@ public:
        if (l.data_type == data_types::i8 || l.data_type == data_types::u8) {
            VF<uint8_t> rnd_vec = rg.generate_random_1d<uint8_t>(s.count(), min_random, max_random);
            set_values(prim, rnd_vec);
+       } else if (l.data_type == data_types::i4 || l.data_type == data_types::u4) {
+           VF<int8_t> rnd_vec = rg.generate_random_1d<int8_t>(l.bytes_count(), min_random, max_random);
+           set_values(prim, rnd_vec);
        } else if (l.data_type == data_types::f16) {
            VF<ov::float16> rnd_vec = rg.generate_random_1d<ov::float16>(s.count(), -1, 1);
            set_values(prim, rnd_vec);
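Context for the bytes_count() call above: i4/u4 elements pack two per byte, so the random buffer must be sized by the layout's byte count rather than its element count. An illustrative packing sketch (the nibble order is an assumption for illustration, not taken from the cldnn implementation):

#include <cstddef>
#include <cstdint>
#include <vector>

// Pack signed 4-bit values two per byte; low nibble holds the even index here,
// which is an assumed convention, not the plugin's documented one.
std::vector<uint8_t> pack_int4(const std::vector<int8_t>& vals) {
    std::vector<uint8_t> packed((vals.size() + 1) / 2);  // bytes_count() analogue
    for (size_t i = 0; i < vals.size(); ++i) {
        uint8_t nibble = static_cast<uint8_t>(vals[i]) & 0x0F;
        packed[i / 2] |= (i % 2 == 0) ? nibble : static_cast<uint8_t>(nibble << 4);
    }
    return packed;
}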
@@ -50,11 +50,11 @@ public:
    void test_fc_basic(bool is_caching_test) {
        auto& engine = get_test_engine();

-       const int32_t b = 1, in_f = 128, in_x = 1, in_y = 1, out_f = 65;
+       const int32_t b = 1, in_f = 128, out_f = 65;

-       auto input_prim = engine.allocate_memory({ { b, in_f, in_y, in_x }, data_types::f32, format::bfyx });
-       auto weights_prim = engine.allocate_memory({ { out_f, in_f, in_y, in_x }, data_types::f32, format::bfyx });
-       auto bias_prim = engine.allocate_memory({ { 1, 1, out_f, 1 }, data_types::f32, format::bfyx });
+       auto input_prim = engine.allocate_memory({ { b, in_f }, data_types::f32, format::bfyx });
+       auto weights_prim = engine.allocate_memory({ { out_f, in_f }, data_types::f32, format::bfyx });
+       auto bias_prim = engine.allocate_memory({ { out_f }, data_types::f32, format::bfyx });

        const auto key_prim_id = "fc";
        topology topology(
@@ -72,10 +72,10 @@ public:
        const auto params_hash = primitve->type->get_fake_aligned_params(*prim_inst->get_impl_params()).hash();
        if (!engine.get_device_info().supports_immad) {
            ASSERT_EQ(primitive_hash, 14259723886449306729UL);
-           ASSERT_EQ(params_hash, 1637150664489130388UL);
+           ASSERT_EQ(params_hash, 3365957578641948513UL);
        } else {
            ASSERT_EQ(primitive_hash, 14259723886449306729UL);
-           ASSERT_EQ(params_hash, 6343702278017463925UL);
+           ASSERT_EQ(params_hash, 9831190959346679696UL);
        }
    }