[GPU] Shape agnostic FC opt tiled kernel (#15396)

Sergey Shlyapnikov 2023-02-06 12:17:55 +04:00 committed by GitHub
parent cd48d76009
commit e003bf3af7
7 changed files with 270 additions and 65 deletions

View File

@@ -166,7 +166,7 @@ std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_no
kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_params const& orig_impl_param) {
// fc_tiled_opt kernel is optimized for row shape aligned by 16.
// fc_tiled_opt kernel is optimized for row shape aligned by 8.
// Thus, use fake aligned shape at kernel execution for better performance.
auto orig_input_layout = orig_impl_param.get_input_layout();
auto orig_output_layout = orig_impl_param.get_output_layout();
@@ -176,10 +176,10 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
auto updated_param = orig_impl_param;
auto input_shape = orig_input_layout.get_partial_shape().to_shape();
auto input_row_idx = input_shape.size() - 2;
input_shape[input_row_idx] = align_to(input_shape[input_row_idx], 16);
input_shape[input_row_idx] = align_to(input_shape[input_row_idx], 8);
auto output_shape = orig_output_layout.get_partial_shape().to_shape();
auto output_row_idx = output_shape.size() - 2;
output_shape[output_row_idx] = align_to(output_shape[output_row_idx], 16);
output_shape[output_row_idx] = align_to(output_shape[output_row_idx], 8);
updated_param.input_layouts[0] = layout(ov::PartialShape(input_shape),
orig_input_layout.data_type,
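For reference, a minimal sketch of the rounding relied on above, assuming align_to simply rounds a value up to the nearest multiple of the alignment. With the alignment relaxed from 16 to 8, the 133-row input from the fake-alignment unit test further down now pads to 136 rows instead of 144:

#include <cstddef>
#include <iostream>

// Assumed behaviour of align_to: round `value` up to the nearest multiple of `alignment`.
static size_t align_to(size_t value, size_t alignment) {
    return (value + alignment - 1) / alignment * alignment;
}

int main() {
    std::cout << align_to(133, 8)  << "\n";  // 136 rows dispatched after this change
    std::cout << align_to(133, 16) << "\n";  // 144 rows with the previous 16-row alignment
}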

View File

@@ -356,9 +356,9 @@ bool primitive_inst::update_impl() {
});
_impl = _dynamic_impl->clone();
_impl->update_dispatch_data(updated_params);
_impl->update_dispatch_data(*_impl_params);
update_shape_info(updated_params);
update_shape_info(*_impl_params);
} else {
_impl = _node->type()->choose_impl(*_node, updated_params);
auto& kernels_cache = get_network().get_kernels_cache();
@@ -1274,6 +1274,7 @@ size_t primitive_inst::get_impl_key(const kernel_impl_params& params) const {
}
return seed;
}
size_t primitive_inst::get_impl_key() const {
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
return get_impl_key(updated_params);
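Presumably the point of hashing the fake-aligned params here is to keep the cache key consistent with the parameters actually used to build the implementation, so real batches that pad to the same aligned shape map to the same entry. A small illustration under that assumption (align_to modeled as round-up to a multiple):

#include <cstddef>
#include <iostream>

// Two real batches that pad to the same aligned batch yield the same fake-aligned
// shape, and therefore the same implementation key (assumption, not the real API).
static std::size_t align_to(std::size_t v, std::size_t a) { return (v + a - 1) / a * a; }

int main() {
    std::cout << align_to(9, 8)  << "\n";  // 16
    std::cout << align_to(13, 8) << "\n";  // 16 -> same fake-aligned batch as above
}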

View File

@@ -79,6 +79,7 @@
REQD_SUB_GROUP_SIZE(SIMD)
KERNEL(fc)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
const __global FILTER_TYPE* weights
@@ -149,10 +150,10 @@ KERNEL(fc)(
weights_offset += TILE_K_OFM * SIMD;
unroll_for (uint kii = 0; kii < TILE_K; ++kii) {
unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
const uint total_k = ki * TILE_K + kii;
INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
const uint total_k = ki * TILE_K + kii;
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((FILTER_TYPE*)(&wei))[kii * TILE_OFM + fi];
}
}
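The loop nest above is reordered so that the sub-group shuffle that broadcasts one input element is issued once per (kii, bi) pair and reused for all TILE_OFM accumulators, instead of being repeated inside the fi loop. A simplified host-side C++ model of the reordered accumulation (tile sizes are illustrative, and the shuffle is replaced by a plain array read):

#include <array>

// Host-side model of the reordered inner loops; not the kernel itself.
constexpr unsigned TILE_B = 8, TILE_OFM = 2, TILE_K = 2, K_TOTAL = 16;

void accumulate_tile(const std::array<std::array<float, K_TOTAL>, TILE_B>& in_vals,
                     const std::array<std::array<float, TILE_OFM>, TILE_K>& wei,
                     std::array<std::array<float, TILE_OFM>, TILE_B>& acc,
                     unsigned ki) {
    for (unsigned kii = 0; kii < TILE_K; ++kii) {
        for (unsigned bi = 0; bi < TILE_B; ++bi) {
            const unsigned total_k = ki * TILE_K + kii;
            const float in_val = in_vals[bi][total_k];    // fetched once per (kii, bi)
            for (unsigned fi = 0; fi < TILE_OFM; ++fi) {  // reused for every output tile
                acc[bi][fi] += in_val * wei[kii][fi];
            }
        }
    }
}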
@@ -236,11 +237,18 @@ KERNEL(fc)(
uint output_offset = out_f * TILE_OUT_F_PITCH + out_b * TILE_OUT_B_PITCH + OUTPUT_OFFSET;
if (USE_BLOCK_WRITE && (TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || out_f + (TILE_OFM * SIMD) <= TILE_OUT_F_NUM)) {
#if IS_DYNAMIC
#define WRITE_OUTPUT(bi) do { \
if (bi + out_b < BATCH_SIZE) \
OUTPUT_BLOCK_WRITE(output, output_offset, result[bi]); \
output_offset += TILE_OUT_B_PITCH; \
} while (false)
#else
#define WRITE_OUTPUT(bi) do { \
OUTPUT_BLOCK_WRITE(output, output_offset, result[bi]); \
output_offset += TILE_OUT_B_PITCH; \
} while (false)
#endif
CONST_LOOP(TILE_B, WRITE_OUTPUT);
#undef WRITE_OUTPUT
} else {
@@ -270,8 +278,11 @@ KERNEL(fc)(
for (uint bi = 0; bi < TILE_B; ++bi) {
for (uint fi = 0; fi < TILE_OFM; ++fi) {
const bool should_write =
TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 ||
out_f + fi * SIMD + sglid < TILE_OUT_F_NUM;
#if IS_DYNAMIC
bi + out_b < BATCH_SIZE &&
#endif
(TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 ||
out_f + fi * SIMD + sglid < TILE_OUT_F_NUM);
if (should_write) {
output[output_offset] = ((OUTPUT_TYPE*)(&result[bi]))[fi];
}
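Because fake alignment rounds the dispatched batch up to a multiple of the batch tile, the last tile of a shape-agnostic dispatch can reach past the real number of rows; the added IS_DYNAMIC guard (bi + out_b < BATCH_SIZE) therefore suppresses those writes in both the block-write path and the per-element leftover path. A small host-side illustration, assuming TILE_B = 8 and an actual batch of 9 rows:

#include <cstdio>

int main() {
    constexpr int TILE_B = 8;
    constexpr int BATCH_SIZE = 9;                                        // real rows
    constexpr int padded = (BATCH_SIZE + TILE_B - 1) / TILE_B * TILE_B;  // 16 rows dispatched
    for (int out_b = 0; out_b < padded; out_b += TILE_B)
        for (int bi = 0; bi < TILE_B; ++bi)
            if (bi + out_b < BATCH_SIZE)                    // same predicate as the kernel guard
                std::printf("write row %d\n", bi + out_b);  // rows 9..15 are skipped
}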

View File

@@ -51,6 +51,7 @@ ParamsKey FullyConnected_bf_tiled::GetSupportedKey() const {
k.EnableTensorPitches();
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.EnableDynamicShapesSupport();
return k;
}
@@ -72,10 +73,17 @@ bool FullyConnected_bf_tiled::Validate(const Params& params, const optional_para
// Block reads must be aligned to 4 bytes, for fp16 we can correct for offset misalignment,
// but we need to ensure that batch pitch preserves alignment.
if (input.GetDType() == Datatype::F16) {
if (input.Batch().pitch % 2 != 0 && input.Batch().v > 1)
if (input.Batch().pitch % 2 != 0 && (input.Batch().v > 1 || fc_params.is_shape_agnostic))
return false;
// for 3d case we have to check feature alignment as well
if (output.GetLayout() == DataLayout::bfyx && input.Feature().pitch % 2 != 0 && input.Feature().v > 1)
if (output.GetLayout() == DataLayout::bfyx && input.Feature().pitch % 2 != 0 && (input.Feature().v > 1 || fc_params.is_shape_agnostic))
return false;
}
// Dynamic kernel doesn't support dynamic weights yet
if (fc_params.is_shape_agnostic && input.is_dynamic()) {
if ((output.GetLayout() == DataLayout::bfyx && input.Y().v == 0) ||
(output.GetLayout() == DataLayout::bf && input.Feature().v == 0))
return false;
}
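A note on the extra conditions: the added dynamic-shape check above treats a dimension whose value is 0 as "not known yet", so checks such as input.Batch().v > 1 would not fire for a shape-agnostic build even though the runtime batch may be large. The sketch below (hypothetical stand-alone helper, fp16 batch-pitch rule only) shows the conservative form the check takes under that assumption:

#include <cstddef>

// Hypothetical stand-alone version of the fp16 batch-pitch rule above:
// with shape-agnostic compilation the batch is unknown, so treat it as
// potentially greater than 1 and require an even (4-byte aligned) pitch.
bool fp16_batch_pitch_supported(std::size_t batch_pitch, std::size_t batch_v, bool is_shape_agnostic) {
    const bool batch_may_exceed_one = (batch_v > 1) || is_shape_agnostic;
    return !(batch_pitch % 2 != 0 && batch_may_exceed_one);
}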
@@ -141,7 +149,8 @@ bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params,
output_f = params.outputs[0].Y().v;
}
if (output_b % (tparams.tile_b * tparams.dispatch_bsv) != 0)
auto batch_size = params.is_shape_agnostic ? Align(output_b, tparams.tile_b) : output_b;
if (batch_size % (tparams.tile_b * tparams.dispatch_bsv) != 0)
return false;
if (CeilDiv(output_f, tparams.tile_ofm * simd) % tparams.dispatch_fsv != 0)
return false;
@@ -191,44 +200,52 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params,
while (max_tile_ofm * 2 * simd <= output_f && max_tile_ofm < 4)
max_tile_ofm *= 2;
if (dtype == Datatype::F16) {
// tune_params(tile_b, tile_ofm, tile_ifm, tile_k, dispatch_bsv, dispatch_fsv, exec_options)
selector.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 16, 2, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 16, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(16, std::min(max_tile_ofm, 2u), 1, 2, 4, 2, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 8, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(16, std::min(max_tile_ofm, 2u), 1, 2, 2, 2, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 4, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(16, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, EXE_MODE_AGE_BASED));
if (params.is_shape_agnostic) {
if (dtype == Datatype::F16) {
// tune_params(tile_b, tile_ofm, tile_ifm, tile_k, dispatch_bsv, dispatch_fsv, exec_options)
selector.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, EXE_MODE_AGE_BASED));
} else if (dtype == Datatype::F32) {
// tune_params(tile_b, tile_ofm, tile_ifm, tile_k, dispatch_bsv, dispatch_fsv, exec_options)
selector.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 1, 1, EXE_MODE_AGE_BASED));
}
} else {
if (dtype == Datatype::F16) {
// tune_params(tile_b, tile_ofm, tile_ifm, tile_k, dispatch_bsv, dispatch_fsv, exec_options)
selector.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 16, 2, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 16, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(16, std::min(max_tile_ofm, 2u), 1, 2, 4, 2, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 8, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(16, std::min(max_tile_ofm, 2u), 1, 2, 2, 2, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 4, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(16, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, EXE_MODE_AGE_BASED));
} else if (dtype == Datatype::F32) {
// tune_params(tile_b, tile_ofm, tile_ifm, tile_k, dispatch_bsv, dispatch_fsv, exec_options)
selector.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 16, 2, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 16, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 8, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 4, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 2, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 1, 1, EXE_MODE_AGE_BASED));
}
selector.Case([&](const fully_connected_params&) -> tune_params {
tune_params result(8, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, EXE_MODE_DEFAULT);
while (batch % result.tile_b != 0)
result.tile_b--;
result.dispatch_bsv = 16;
while (batch % (result.tile_b * result.dispatch_bsv) != 0)
result.dispatch_bsv--;
if (result.tile_b >= 8)
result.exec_options = EXE_MODE_AGE_BASED;
return result;
});
}
if (dtype == Datatype::F32) {
// tune_params(tile_b, tile_ofm, tile_ifm, tile_k, dispatch_bsv, dispatch_fsv, exec_options)
selector.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 16, 2, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 16, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 8, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 4, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 2, 1, EXE_MODE_AGE_BASED))
.Case(tune_params(8, std::min(max_tile_ofm, 2u), 1, 1, 1, 1, EXE_MODE_AGE_BASED));
}
selector.Case([&](const fully_connected_params&) -> tune_params {
tune_params result(8, std::min(max_tile_ofm, 2u), 1, 2, 1, 1, EXE_MODE_DEFAULT);
while (batch % result.tile_b != 0)
result.tile_b--;
result.dispatch_bsv = 16;
while (batch % (result.tile_b * result.dispatch_bsv) != 0)
result.dispatch_bsv--;
if (result.tile_b >= 8)
result.exec_options = EXE_MODE_AGE_BASED;
return result;
});
return selector.Default(tune_params(1, 1, 1, 1, 1, 1, EXE_MODE_DEFAULT));
}
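Shape-agnostic compilation cannot tune for a concrete batch, so the restructured selection above registers a single case per data type with dispatch_bsv = dispatch_fsv = 1; together with the Align(output_b, tile_b) change in VerifyTuneParams, the batch divisibility check then holds for any runtime batch. A hedged sketch of that interaction (helper names are illustrative):

#include <cstddef>

static std::size_t Align(std::size_t v, std::size_t a) { return (v + a - 1) / a * a; }

// Mirrors the batch check in VerifyTuneParams: the shape-agnostic path first
// rounds the (runtime) batch up to the batch tile, so tile_b = 8 with
// dispatch_bsv = 1 is accepted for any batch.
bool batch_check_passes(std::size_t output_b, std::size_t tile_b,
                        std::size_t dispatch_bsv, bool shape_agnostic) {
    const std::size_t batch = shape_agnostic ? Align(output_b, tile_b) : output_b;
    return batch % (tile_b * dispatch_bsv) == 0;
}
// batch_check_passes(9, 8, 1, true)   -> true  (aligned batch 16 divides by 8)
// batch_check_passes(9, 8, 16, false) -> false (static path keeps the strict check)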
@@ -238,12 +255,14 @@ FullyConnected_bf_tiled::SetDefault(const fully_connected_params& params, int au
auto tparams = GetAutoTuneParams(params, autoTuneIndex);
size_t feature_threads = CeilDiv(params.outputs[0].Feature().v, tparams.tile_ofm * simd);
size_t batch_threads = params.outputs[0].Batch().v / tparams.tile_b;
size_t batch_threads = params.outputs[0].Batch().v;
if (params.outputs[0].GetLayout() == DataLayout::bfyx) {
feature_threads = CeilDiv(params.outputs[0].Y().v, tparams.tile_ofm * simd);
batch_threads = (params.outputs[0].Batch().v * params.outputs[0].Feature().v) / tparams.tile_b;
batch_threads = params.outputs[0].Batch().v * params.outputs[0].Feature().v;
}
batch_threads = CeilDiv(batch_threads, tparams.tile_b);
dispatchData.gws[0] = feature_threads * batch_threads * simd;
dispatchData.gws[1] = 1;
dispatchData.gws[2] = 1;
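Rounding the thread count up instead of truncating means a batch that is not a multiple of tile_b still gets a final, partially filled tile (whose out-of-range rows are masked by the BATCH_SIZE guard in the kernel). A worked example with illustrative sizes, assuming CeilDiv(a, b) = (a + b - 1) / b and SIMD = 16:

#include <cstddef>
#include <iostream>

static std::size_t CeilDiv(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

int main() {
    const std::size_t simd = 16, tile_b = 8, tile_ofm = 2;
    const std::size_t batch = 9, output_f = 800;                            // example shapes
    const std::size_t feature_threads = CeilDiv(output_f, tile_ofm * simd); // 25
    const std::size_t batch_threads   = CeilDiv(batch, tile_b);             // 2 (9 / 8 == 1 before)
    std::cout << feature_threads * batch_threads * simd << "\n";            // gws[0] == 800
}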
@@ -308,11 +327,13 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("OUTPUT_3D", true));
jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM * OUTPUT_FEATURE_NUM)"));
} else {
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Feature().v));
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Batch().pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Batch().pitch));
jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM)"));
}
if (!params.fused_ops.empty()) {

View File

@@ -77,10 +77,10 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
},
{
layout{ov::PartialShape{133, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout
layout{ov::PartialShape{800, 511}, data_types::i8, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{144, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
layout{ov::PartialShape{144, 800}, data_types::f16, format::bfyx} // fake_aligned output layout
layout{ov::PartialShape{136, 511}, data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // fake_aligned input layout
layout{ov::PartialShape{136, 800}, data_types::f16, format::bfyx} // fake_aligned output layout
},
{
layout{ov::PartialShape::dynamic(2), data_types::i8, format::bfyx, padding{{1,1,1,1}, 0}}, // input_layout

View File

@@ -1791,7 +1791,7 @@ TEST(fully_connected_gpu, dynamic) {
auto output_prim_mem = outputs.begin()->second.get_memory();
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 16)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
ASSERT_EQ(out_l.batch(), input_b);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
@@ -1843,7 +1843,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
auto output_prim_mem = outputs.begin()->second.get_memory();
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 16)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
ASSERT_EQ(out_l.batch(), input_b);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
@@ -1867,7 +1867,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
auto output_prim_mem = outputs.begin()->second.get_memory();
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 16)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
ASSERT_EQ(out_l.batch(), input_b);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
@@ -1928,7 +1928,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_different_shape) {
auto output_prim_mem = outputs.begin()->second.get_memory();
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, 16)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, 8)); // fake_alignment
ASSERT_EQ(out_l.batch(), 2);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
@@ -1957,7 +1957,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_different_shape) {
auto output_prim_mem = outputs.begin()->second.get_memory();
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, 16)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, 8)); // fake_alignment
ASSERT_EQ(out_l.batch(), 1);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
@@ -2015,7 +2015,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_multiple_shapes) {
auto output_prim_mem = outputs.begin()->second.get_memory();
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, 16)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(2, 8)); // fake_alignment
ASSERT_EQ(out_l.batch(), 2); // fake_alignment
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
@@ -2044,7 +2044,7 @@ TEST(fully_connected_gpu, dynamic_multi_inference_multiple_shapes) {
auto output_prim_mem = outputs.begin()->second.get_memory();
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, 16)); // fake_alignment
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(1, 8)); // fake_alignment
ASSERT_EQ(out_l.batch(), 1); // fake_alignment
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
@@ -2059,3 +2059,175 @@ TEST(fully_connected_gpu, dynamic_multi_inference_multiple_shapes) {
}
}
}
namespace {
template <typename OutputT,
typename InputT,
typename WeightsT,
typename BiasT,
typename AccT = OutputT>
VF<OutputT> dynamic_fully_connected_reference_calc(ov::Dimension::value_type batch,
ov::Dimension::value_type input_f,
ov::Dimension::value_type output_f,
VF<InputT>& input,
VF<WeightsT>& weights,
VF<BiasT>& bias) {
VF<OutputT> result(batch * output_f);
for (int b = 0; b < batch; b++) {
for (int ofm = 0; ofm < output_f; ofm++) {
AccT acc = static_cast<AccT>(bias[ofm]);
for (int ifm = 0; ifm < input_f; ifm++) {
acc += weights[ofm * input_f + ifm] * input[b * input_f + ifm];
}
result[b * output_f + ofm] = acc;
}
}
return result;
}
} // namespace
using fully_connected_dynamic_test_params = std::tuple<
std::vector<ov::Dimension::value_type>, // batch_sizes
ov::Dimension::value_type, // input_f
ov::Dimension::value_type, // output_f
bool // 3D case
>;
template <typename InputT, typename WeightsT, typename BiasT, typename OutputT>
struct dynamic_fully_connected_gpu : ::testing::TestWithParam<fully_connected_dynamic_test_params> {
void run_test() {
std::vector<ov::Dimension::value_type> batch_sizes;
ov::Dimension::value_type input_f;
ov::Dimension::value_type output_f;
bool fc_3d = false;
std::tie(batch_sizes, input_f, output_f, fc_3d) = GetParam();
auto input_dt = cldnn::type_to_data_type<InputT>::value;
auto weights_dt = cldnn::type_to_data_type<WeightsT>::value;
auto output_dt = cldnn::type_to_data_type<OutputT>::value;
auto& engine = get_test_engine();
auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(), input_f }, input_dt, format::bfyx };
if (fc_3d)
input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(), ov::Dimension(), input_f }, input_dt, format::bfyx };
auto weights_mem = engine.allocate_memory({ ov::PartialShape{ output_f, input_f }, weights_dt, format::bfyx });
auto weights_data_vec = generate_random_1d<WeightsT>(output_f * input_f, -1, 1);
auto bias_mem = engine.allocate_memory({ ov::PartialShape{ output_f }, output_dt, format::bfyx });
auto bias_data_vec = generate_random_1d<OutputT>(output_f, 0, 1);
set_values(weights_mem, weights_data_vec);
set_values(bias_mem, bias_data_vec);
cldnn::topology topology{
input_layout("input", input_dyn_layout),
data("weights", weights_mem),
data("bias", bias_mem),
};
if (fc_3d)
topology.add(fully_connected("fc", input_info("input"), "weights", "bias", padding(), 3));
else
topology.add(fully_connected("fc", input_info("input"), "weights", "bias"));
ExecutionConfig config;
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
for (const auto& batch_size : batch_sizes) {
auto input_actual_layout = layout{ ov::PartialShape{ batch_size, input_f }, input_dt, format::bfyx };
if (fc_3d)
input_actual_layout = layout{ ov::PartialShape{ 1, batch_size, input_f }, input_dt, format::bfyx };
cldnn::memory_ptr input_mem = engine.allocate_memory(input_actual_layout);
std::vector<InputT> input_data_vec = generate_random_1d<InputT>(batch_size * input_f, 0, 1);
set_values(input_mem, input_data_vec);
network.set_input_data("input", input_mem);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "fc");
auto output_prim_mem = outputs.begin()->second.get_memory();
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(out_l.batch(), fc_3d ? 1 : batch_size);
ASSERT_EQ(out_l.feature(), fc_3d ? batch_size : output_f);
ASSERT_EQ(out_l.spatial(0), 1);
ASSERT_EQ(out_l.spatial(1), fc_3d ? output_f : 1);
cldnn::mem_lock<OutputT> output_ptr(output_prim_mem, get_test_stream());
auto ref_result = dynamic_fully_connected_reference_calc<OutputT>(batch_size,
input_f,
output_f,
input_data_vec,
weights_data_vec,
bias_data_vec);
for (int b = 0; b < batch_size; b++) {
for (int ofm = 0; ofm < output_f; ofm++) {
ASSERT_EQ(ref_result[b * output_f + ofm], output_ptr[b * output_f + ofm]);
}
}
}
}
};
using dynamic_fully_connected_gpu_f32_3d = dynamic_fully_connected_gpu<float, float, float, float>;
using dynamic_fully_connected_gpu_f16_3d = dynamic_fully_connected_gpu<FLOAT16, FLOAT16, FLOAT16, FLOAT16>;
static const std::vector<ov::Dimension::value_type>
dyn_batches_full = {1, 2, 4, 7, 8, 9, 15, 16, 31, 32, 33, 47, 48, 49, 58, 63, 64};
static const std::vector<ov::Dimension::value_type>
dyn_batches_smoke = {1, 2, 7, 8, 9, 16, 32, 33, 47, 48, 58};
TEST_P(dynamic_fully_connected_gpu_f32_3d, basic) {
run_test();
}
TEST_P(dynamic_fully_connected_gpu_f16_3d, basic) {
run_test();
}
INSTANTIATE_TEST_SUITE_P(
smoke,
dynamic_fully_connected_gpu_f32_3d,
::testing::Combine(
::testing::Values(dyn_batches_smoke),
::testing::Values(10, 32, 42, 53, 64, 128),
::testing::Values(2, 9, 128),
::testing::Values(false, true))
);
INSTANTIATE_TEST_SUITE_P(
smoke,
dynamic_fully_connected_gpu_f16_3d,
::testing::Combine(
::testing::Values(dyn_batches_smoke),
::testing::Values(10, 32, 42, 53, 64, 128),
::testing::Values(2, 9, 128),
::testing::Values(false, true))
);
INSTANTIATE_TEST_SUITE_P(
full,
dynamic_fully_connected_gpu_f32_3d,
::testing::Combine(
::testing::Values(dyn_batches_full),
::testing::Values(10, 32, 42, 53, 64, 128),
::testing::Values(2, 9, 16, 32, 64, 128),
::testing::Values(false, true))
);
INSTANTIATE_TEST_SUITE_P(
full,
dynamic_fully_connected_gpu_f16_3d,
::testing::Combine(
::testing::Values(dyn_batches_full),
::testing::Values(10, 32, 42, 53, 64, 128),
::testing::Values(2, 9, 16, 32, 64, 128),
::testing::Values(false, true))
);

View File

@@ -59,7 +59,7 @@ TEST(check_hash_value, fc_basic) {
auto weights_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { out_f, in_f, in_y, in_x } });
auto bias_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, out_f, 1 } });
auto key_prim_id = "eltwise";
const auto key_prim_id = "fc";
topology topology(
input_layout("input", input_prim->get_layout()),
data("weights", weights_prim),
@@ -79,7 +79,7 @@ TEST(check_hash_value, fc_basic) {
ASSERT_EQ(primitive_hash, 7881065839556591629UL);
ASSERT_EQ(prog_node_hash, 7881065839556591629UL);
ASSERT_EQ(prim_inst_hash, 2803059017090178132UL);
ASSERT_EQ(prim_inst_hash, 12327057149074647711UL);
}
TEST(check_hash_value, gather_basic) {