[GPU] Added shape agnostic optimized GEMM kernel (#15317)

* [GPU] Shape agnostic optimized GEMM kernel

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Fix CI failure

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Apply code review feedback

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Fix dynamic shape accuracy drop on SQuAD v1.1

- F1: 91.81%, EM: 85.25% on bert-small-uncased-whole-word-masking-squad-0001

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Apply code review feedback

Signed-off-by: Andrew Park <andrew.park@intel.com>

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
Andrew Kwangwoong Park 2023-02-03 14:26:35 +09:00 committed by GitHub
parent 9d8532e998
commit ab509ce164
6 changed files with 356 additions and 65 deletions

View File

@@ -201,6 +201,7 @@ kernel_selector::dim_tensor<T> convert_dim_vector(const tensor& t) {
void convert_fused_ops_to_legacy_activations(const kernel_impl_params& param_info, std::vector<kernel_selector::base_activation_params>& activations);
bool use_legacy_fused_ops(const kernel_impl_params& param_info);
bool is_shape_agnostic(const kernel_impl_params& param_info);
void set_params(const kernel_impl_params& param_info, kernel_selector::params& params);
@@ -215,6 +216,9 @@ inline params_t get_default_params(const kernel_impl_params& param_info) {
params.inputs[0] = convert_data_tensor(input_layout);
params.outputs[0] = convert_data_tensor(output_layout);
if (is_shape_agnostic(param_info)) {
params.is_shape_agnostic = true;
}
params.layerID = param_info.desc->id;
if (use_legacy_fused_ops(param_info)) {

View File

@@ -1057,6 +1057,16 @@ bool use_legacy_fused_ops(const kernel_impl_params& param_info) {
return true;
}
bool is_shape_agnostic(const kernel_impl_params& param_info) {
const auto& program = param_info.prog;
const auto& node = program->get_node(param_info.desc->id);
if (node.is_dynamic())
return true;
return false;
}
void set_params(const kernel_impl_params& param_info, kernel_selector::params& params) {
const auto& program = param_info.prog;
const auto& device_info = program->get_engine().get_device_info();

View File

@@ -27,7 +27,7 @@
#define BLOCK_WRITE_C(ptr, offset, data) BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, data)
#endif // TILE_N > SIMD_WIDTH
inline uint FUNC(get_input0_batch_offset)(uint b, uint f, uint w, uint z) {
inline uint FUNC(get_input0_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
#if INPUT0_SIMPLE
return GET_DATA_INDEX_6D_SAFE(INPUT0, b, f, w, z, 0, 0);
#else // INPUT0_SIMPLE
@@ -35,7 +35,7 @@ inline uint FUNC(get_input0_batch_offset)(uint b, uint f, uint w, uint z) {
#endif // INPUT0_SIMPLE
}
inline uint FUNC(get_input1_batch_offset)(uint b, uint f, uint w, uint z) {
inline uint FUNC(get_input1_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
#if INPUT1_SIMPLE
return GET_DATA_INDEX_6D_SAFE(INPUT1, b, f, w, z, 0, 0);
#else // INPUT1_SIMPLE
@@ -44,7 +44,7 @@ inline uint FUNC(get_input1_batch_offset)(uint b, uint f, uint w, uint z) {
}
#ifdef INPUT2_TYPE
inline uint FUNC(get_input2_batch_offset)(uint b, uint f, uint w, uint z) {
inline uint FUNC(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
#if INPUT2_SIMPLE
return GET_DATA_INDEX_6D_SAFE(INPUT2, b, f, w, z, 0, 0);
#else // INPUT2_SIMPLE
@@ -53,7 +53,7 @@ inline uint FUNC(get_input2_batch_offset)(uint b, uint f, uint w, uint z) {
}
#endif // INPUT2_TYPE
inline uint FUNC(get_output_batch_offset)(uint b, uint f, uint w, uint z) {
inline uint FUNC(get_output_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
#if OUTPUT_SIMPLE
return GET_DATA_INDEX_6D(OUTPUT, b, f, w, z, 0, 0);
#else // OUTPUT_SIMPLE
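
The OPTIONAL_SHAPE_INFO_ARG / OPTIONAL_SHAPE_INFO_TENSOR pair threaded through these helpers is what makes the kernel shape agnostic: the actual dimensions travel in a runtime shape_info buffer instead of being baked into the binary as JIT constants. A minimal sketch of the assumed expansion (the real definitions come from the JIT code generator, not from this diff):

#if IS_DYNAMIC
#define OPTIONAL_SHAPE_INFO_ARG    __global const int* shape_info,   /* assumed dynamic expansion */
#define OPTIONAL_SHAPE_INFO_TENSOR shape_info,
#else
#define OPTIONAL_SHAPE_INFO_ARG                                      /* empty in a static build */
#define OPTIONAL_SHAPE_INFO_TENSOR
#endif

With the empty static expansions, the same kernel source compiles to either signature without further #ifdefs at each call site.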
@@ -65,6 +65,7 @@ inline uint FUNC(get_output_batch_offset)(uint b, uint f, uint w, uint z) {
REQD_SUB_GROUP_SIZE(SIMD_WIDTH)
__attribute__((reqd_work_group_size(SIMD_WIDTH, 1, 1)))
KERNEL(gemm_tiled_opt)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input0,
const __global INPUT1_TYPE* input1,
#ifdef INPUT2_TYPE
@@ -86,19 +87,18 @@ KERNEL(gemm_tiled_opt)(
// Setting x and y for fusings indexing
// TODO: investigate how we can use only TILE_N_NOT_DIVISIBLE here for getting stable results in fusings
#if IS_DYNAMIC
const uint x = (uint)get_global_id(0);
#else // IS_DYNAMIC
#if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
const uint x = (uint)get_global_id(0);
#else // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
const uint x = tile_n_num * SIMD_WIDTH * B_VEC_SIZE;
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
#endif // IS_DYNAMIC
uint y = tile_m_offset;
#if TILE_M_NOT_DIVISIBLE
const uint tile_m_iterations = tile_m_num == (tile_m_size - 1) ? TILE_M_LEFTOVER : TILE_M;
#else // TILE_M_NOT_DIVISIBLE
const uint tile_m_iterations = TILE_M;
#endif // TILE_M_NOT_DIVISIBLE
const uint tile_m_iterations = TILE_M_NOT_DIVISIBLE ? (tile_m_num == (tile_m_size - 1) ? TILE_M_LEFTOVER : TILE_M) : TILE_M;
const uint z = batch_number % OUTPUT_SIZE_Z;
batch_number /= OUTPUT_SIZE_Z;
const uint w = batch_number % OUTPUT_SIZE_W;
@@ -108,12 +108,12 @@ KERNEL(gemm_tiled_opt)(
const uint b = batch_number % OUTPUT_BATCH_NUM;
// Batch offsets
const uint batch_offset_input0 = FUNC_CALL(get_input0_batch_offset)(b, f, w, z);
const uint batch_offset_input1 = FUNC_CALL(get_input1_batch_offset)(b, f, w, z);
const uint batch_offset_input0 = FUNC_CALL(get_input0_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
const uint batch_offset_input1 = FUNC_CALL(get_input1_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
#ifdef INPUT2_TYPE
const uint batch_offset_input2 = FUNC_CALL(get_input2_batch_offset)(b, f, w, z);
const uint batch_offset_input2 = FUNC_CALL(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
#endif // INPUT2_TYPE
const uint batch_offset_output = FUNC_CALL(get_output_batch_offset)(b, f, w, z);
const uint batch_offset_output = FUNC_CALL(get_output_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
// Start pointers offsets
#if !TRANSPOSE_INPUT0
@@ -152,11 +152,15 @@ KERNEL(gemm_tiled_opt)(
// Loading B tile
unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) {
#if IS_DYNAMIC
b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0);
#else // IS_DYNAMIC
#if TILE_N_NOT_DIVISIBLE
b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#else // TILE_N_NOT_DIVISIBLE
b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
#endif // TILE_N_NOT_DIVISIBLE
#endif // IS_DYNAMIC
#if !TRANSPOSE_INPUT1
b_ptr += N;
#else // !TRANSPOSE_INPUT1
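
This hunk shows the pattern that repeats through the rest of the kernel: the static build still selects one load variant with the preprocessor, while the shape-agnostic build compiles both variants and picks one per dispatch, since TILE_N_NOT_DIVISIBLE is now a runtime expression rather than a 0/1 literal. A stripped-down, hypothetical illustration of that selection (standalone demo, not from this diff; IS_DYNAMIC and TILE_N_NOT_DIVISIBLE assumed to be supplied with -D):

__kernel void load_select_demo(__global const float* src, __global float* dst, const int n) {
    const int gid = (int)get_global_id(0);
#if IS_DYNAMIC
    // both paths exist in the binary; divisibility is decided at enqueue time
    dst[gid] = (n % 16 != 0) ? ((gid < n) ? src[gid] : 0.0f) : src[gid];
#else
#if TILE_N_NOT_DIVISIBLE
    dst[gid] = (gid < n) ? src[gid] : 0.0f;   // guarded scalar load for leftovers
#else
    dst[gid] = src[gid];                      // unguarded fast path
#endif
#endif
}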
@@ -198,11 +202,15 @@ KERNEL(gemm_tiled_opt)(
// Loading A tile and tile C calculation
unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
#if !TRANSPOSE_INPUT0
#if IS_DYNAMIC
A_FLOATN a_read = TILE_K_NOT_DIVISIBLE ? a_ptr[dot_id * K + sglid] : BLOCK_READ_A(a_ptr, dot_id * K);
#else // IS_DYNAMIC
#if TILE_K_NOT_DIVISIBLE
A_FLOATN a_read = a_ptr[dot_id * K + sglid];
#else // TILE_K_NOT_DIVISIBLE
A_FLOATN a_read = BLOCK_READ_A(a_ptr, dot_id * K);
#endif // TILE_K_NOT_DIVISIBLE
#endif // IS_DYNAMIC
unroll_for (uint subtile_k_id = 0; subtile_k_id < TILE_K / SIMD_WIDTH; subtile_k_id++) {
unroll_for (uint simd_local_id = 0; simd_local_id < SIMD_WIDTH; simd_local_id++) {
@@ -261,6 +269,24 @@ KERNEL(gemm_tiled_opt)(
} // Full tile calculation end
#if IS_DYNAMIC
if (TILE_K_NOT_DIVISIBLE) {
// Loading leftovers of the matrix B
unroll_for (uint b_load_id = 0; b_load_id < TILE_K_LEFTOVER; b_load_id++) {
b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0);
b_ptr += N;
} // Loading leftovers of the matrix B end
// Loading leftovers of the matrix A and tile C calculation
unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
INPUT0_TYPE a_read = a_ptr[dot_id * K + sglid];
unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
}
} // Loading leftovers of the matrix A and tile C calculation end
}
#else // IS_DYNAMIC
#if TILE_K_NOT_DIVISIBLE
// Loading leftovers of the matrix B
unroll_for (uint b_load_id = 0; b_load_id < TILE_K_LEFTOVER; b_load_id++) {
@@ -281,17 +307,43 @@ KERNEL(gemm_tiled_opt)(
}
} // Loading leftovers of the matrix A and tile C calculation end
#endif // TILE_K_NOT_DIVISIBLE
#endif // IS_DYNAMIC
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
#if IS_DYNAMIC
FUSED_OPS_PRELOAD_SCALAR;
#else // IS_DYNAMIC
#if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
FUSED_OPS_PRELOAD_SCALAR;
#else // TILE_N_NOT_DIVISIBLE
FUSED_OPS_PRELOAD_VEC;
#endif // TILE_N_NOT_DIVISIBLE
#endif // IS_DYNAMIC
#endif // HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
// Writing result in the global memory
unroll_for (uint write_id = 0; write_id < tile_m_iterations; write_id++) {
#if IS_DYNAMIC
if (b_raw_global_id < N) {
#ifdef INPUT2_TYPE
ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_ptr[sglid];
#else // INPUT2_TYPE
ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id];
#endif // INPUT2_TYPE
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_CALC_SCALAR;
#else // FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_SCALAR;
#endif // FUSED_OPS_CAN_USE_PRELOAD
OUTPUT_TYPE res = FUSED_OPS_RESULT_SCALAR;
d_ptr[sglid] = res;
#else // HAS_FUSED_OPS
d_ptr[sglid] = dequantized;
#endif // HAS_FUSED_OPS
}
#else // IS_DYNAMIC
#if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
if (b_raw_global_id < N) {
#ifdef INPUT2_TYPE
@@ -335,6 +387,7 @@ KERNEL(gemm_tiled_opt)(
#endif // HAS_FUSED_OPS
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
#endif // IS_DYNAMIC
d_ptr += N;
#ifdef INPUT2_TYPE
c_ptr += N;

View File

@@ -631,6 +631,7 @@ struct base_params : public Params {
std::vector<fused_operation_desc> fused_ops = {};
MultiDataTensor inputs;
MultiDataTensor outputs;
bool is_shape_agnostic;
std::string to_string() const override;
std::string to_cache_string_v2() const override;
ParamsKey GetParamsKey() const override;
@@ -648,7 +649,7 @@ struct base_params : public Params {
}
protected:
explicit base_params(KernelType kt) : Params(kt, ""), inputs(1), outputs(1) {}
explicit base_params(KernelType kt) : Params(kt, ""), inputs(1), outputs(1), is_shape_agnostic(false) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -24,6 +24,7 @@ ParamsKey GemmKernelTiledOpt::GetSupportedKey() const {
k.EnableBatching();
k.EnableDifferentTypes();
k.EnableDynamicShapesSupport();
return k;
}
@@ -32,19 +33,20 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
const auto& output = params.outputs[0];
DispatchData dispatchData;
GemmTuningData td = SetTuningParams(params);
if (!params.has_dynamic_tensors()) {
GemmTuningData td = SetTuningParams(params);
auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
std::vector<size_t> global = { output.X().v, output.Y().v, total_batches };
auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
std::vector<size_t> global = { output.X().v, output.Y().v, total_batches };
dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
dispatchData.gws[2] = global[2];
dispatchData.lws[0] = td.simd_size;
dispatchData.lws[1] = 1;
dispatchData.lws[2] = 1;
dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
dispatchData.gws[2] = global[2];
dispatchData.lws[0] = td.simd_size;
dispatchData.lws[1] = 1;
dispatchData.lws[2] = 1;
}
return dispatchData;
}
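
Because the dynamic branch leaves gws/lws unset here, the dispatch data has to be recomputed at execution time, once the actual shapes are known and has_dynamic_tensors() turns false. A rough sketch of how that is commonly wired up in this kernel-selector design (hypothetical hookup; the actual wiring is outside this hunk):

// Illustrative sketch: recompute work-group sizes per enqueue for the
// shape-agnostic kernel, reusing SetDefault() with the now-static params.
kd.update_dispatch_data_func = [](const Params& params, KernelData& kd) {
    const auto& prim_params = static_cast<const gemm_params&>(params);
    auto dispatchData = GemmKernelTiledOpt().SetDefault(prim_params);
    kd.kernels[0].params.workGroups.global = dispatchData.gws;
    kd.kernels[0].params.workGroups.local = dispatchData.lws;
};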
@@ -53,25 +55,34 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
GemmKernelTiledOpt::GemmTuningData tuning_data;
auto m_size = output.Y().v;
auto n_size = output.X().v;
auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
if (!params.is_shape_agnostic) {
auto m_size = output.Y().v;
auto n_size = output.X().v;
auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
tuning_data.simd_size = 8;
auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
tuning_data.simd_size = 8;
tuning_data.tile_n_size = tuning_data.simd_size;
while (tuning_data.tile_n_size < 64 && n_size / (tuning_data.tile_n_size * 2) >= 1) {
tuning_data.tile_n_size *= 2;
}
tuning_data.tile_n_size = tuning_data.simd_size;
while (tuning_data.tile_n_size < 64 && n_size / (tuning_data.tile_n_size * 2) >= 1) {
tuning_data.tile_n_size *= 2;
}
// tuning_data.tile_k_size must be the same as simd_size when k % tile_k != 0
tuning_data.tile_k_size = tuning_data.simd_size;
tuning_data.tile_m_size = tuning_data.simd_size;
// tuning_data.tile_k_size must be the same as simd_size when k % tile_k != 0
tuning_data.tile_k_size = tuning_data.simd_size;
tuning_data.tile_m_size = tuning_data.simd_size;
bool leftovers = m_size % tuning_data.tile_m_size || k_size % tuning_data.tile_k_size || n_size % tuning_data.tile_n_size;
bool leftovers = m_size % tuning_data.tile_m_size || k_size % tuning_data.tile_k_size || n_size % tuning_data.tile_n_size;
if (leftovers || total_batches > 1 || params.transpose_input0 || params.transpose_input1) {
if (leftovers || total_batches > 1 || params.transpose_input0 || params.transpose_input1) {
tuning_data.simd_size = 16;
tuning_data.tile_n_size = tuning_data.simd_size;
tuning_data.tile_k_size = tuning_data.simd_size;
tuning_data.tile_m_size = tuning_data.simd_size;
}
} else {
// In the shape-agnostic kernel case, the vector size of FusedOpsConfiguration cannot be specified at build time,
// so the tile sizes must be the same as simd_size
tuning_data.simd_size = 16;
tuning_data.tile_n_size = tuning_data.simd_size;
tuning_data.tile_k_size = tuning_data.simd_size;
@@ -86,33 +97,62 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
const auto& output = params.outputs[0];
GemmTuningData tuning_data = SetTuningParams(params);
auto m_size = output.Y().v;
auto n_size = output.X().v;
auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
auto leftover_m = m_size % tuning_data.tile_m_size;
auto leftover_n = n_size % tuning_data.tile_n_size;
auto leftover_k = k_size % tuning_data.tile_k_size;
auto b_vec_size = tuning_data.tile_n_size / tuning_data.simd_size;
jit.Merge(MakeTypeJitConstants(params.inputs[0].GetDType(), "ACCUMULATOR"));
if (params.has_dynamic_tensors()) {
auto m_size = params.transpose_input0 ? toCodeString(params.inputs[0].X(), 5) : toCodeString(params.inputs[0].Y(), 4);
auto n_size = params.transpose_input1 ? toCodeString(params.inputs[1].Y(), 10) : toCodeString(params.inputs[1].X(), 11);
auto k_size = params.transpose_input0 ? toCodeString(params.inputs[0].Y(), 4) : toCodeString(params.inputs[0].X(), 5);
const std::string leftover_m = "(" + m_size + "%" + std::to_string(tuning_data.tile_m_size) + ")";
const std::string leftover_n = "(" + n_size + "%" + std::to_string(tuning_data.tile_n_size) + ")";
const std::string leftover_k = "(" + k_size + "%" + std::to_string(tuning_data.tile_k_size) + ")";
const std::string not_divisible_m = "(" + leftover_m + "!=0)";
const std::string not_divisible_n = "(" + leftover_n + "!=0)";
const std::string not_divisible_k = "(" + leftover_k + "!=0)";
const std::string full_iteration_k = "(" + k_size + "/" + std::to_string(tuning_data.tile_k_size) + ")";
jit.AddConstants({
MakeJitConstant("M", m_size),
MakeJitConstant("K", k_size),
MakeJitConstant("N", n_size),
MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
MakeJitConstant("TILE_M", tuning_data.tile_m_size),
MakeJitConstant("TILE_K", tuning_data.tile_k_size),
MakeJitConstant("TILE_N", tuning_data.tile_n_size),
MakeJitConstant("K_FULL_ITERATIONS", k_size / tuning_data.tile_k_size),
MakeJitConstant("TILE_M_NOT_DIVISIBLE", leftover_m != 0),
MakeJitConstant("TILE_K_NOT_DIVISIBLE", leftover_k != 0),
MakeJitConstant("TILE_N_NOT_DIVISIBLE", leftover_n != 0),
MakeJitConstant("TILE_M_LEFTOVER", leftover_m),
MakeJitConstant("TILE_K_LEFTOVER", leftover_k),
MakeJitConstant("TILE_N_LEFTOVER", leftover_n),
});
jit.AddConstants({
MakeJitConstant("M", m_size),
MakeJitConstant("K", k_size),
MakeJitConstant("N", n_size),
MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
MakeJitConstant("TILE_M", tuning_data.tile_m_size),
MakeJitConstant("TILE_K", tuning_data.tile_k_size),
MakeJitConstant("TILE_N", tuning_data.tile_n_size),
MakeJitConstant("K_FULL_ITERATIONS", full_iteration_k),
MakeJitConstant("TILE_M_NOT_DIVISIBLE", not_divisible_m),
MakeJitConstant("TILE_K_NOT_DIVISIBLE", not_divisible_k),
MakeJitConstant("TILE_N_NOT_DIVISIBLE", not_divisible_n),
MakeJitConstant("TILE_M_LEFTOVER", leftover_m),
MakeJitConstant("TILE_K_LEFTOVER", leftover_k),
MakeJitConstant("TILE_N_LEFTOVER", leftover_n),
});
} else {
auto m_size = output.Y().v;
auto n_size = output.X().v;
auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
auto leftover_m = m_size % tuning_data.tile_m_size;
auto leftover_n = n_size % tuning_data.tile_n_size;
auto leftover_k = k_size % tuning_data.tile_k_size;
jit.AddConstants({
MakeJitConstant("M", m_size),
MakeJitConstant("K", k_size),
MakeJitConstant("N", n_size),
MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
MakeJitConstant("TILE_M", tuning_data.tile_m_size),
MakeJitConstant("TILE_K", tuning_data.tile_k_size),
MakeJitConstant("TILE_N", tuning_data.tile_n_size),
MakeJitConstant("K_FULL_ITERATIONS", k_size / tuning_data.tile_k_size),
MakeJitConstant("TILE_M_NOT_DIVISIBLE", leftover_m != 0),
MakeJitConstant("TILE_K_NOT_DIVISIBLE", leftover_k != 0),
MakeJitConstant("TILE_N_NOT_DIVISIBLE", leftover_n != 0),
MakeJitConstant("TILE_M_LEFTOVER", leftover_m),
MakeJitConstant("TILE_K_LEFTOVER", leftover_k),
MakeJitConstant("TILE_N_LEFTOVER", leftover_n),
});
}
if (tuning_data.tile_k_size > tuning_data.simd_size) {
jit.AddConstants({
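
For intuition about the string-valued constants in the dynamic branch: assuming toCodeString() renders a dynamic dimension as a shape_info lookup at the given offset, the generated macros would look roughly like this for tile sizes of 16 (offsets hypothetical, illustrative only), in contrast to the integer literals produced by the static branch:

/* hypothetical JIT output for dynamic N and K */
#define N                    shape_info[11]
#define K                    shape_info[5]
#define TILE_N_LEFTOVER      (shape_info[11]%16)
#define TILE_N_NOT_DIVISIBLE ((shape_info[11]%16)!=0)
#define K_FULL_ITERATIONS    (shape_info[5]/16)

Since these evaluate at run time, the kernel-side changes above replace compile-time #if blocks with ternaries and plain if() branches.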
@@ -184,7 +224,9 @@ bool GemmKernelTiledOpt::Validate(const Params& params, const optional_params& o
const auto& gmm_params = static_cast<const gemm_params&>(params);
bool gemm_leftovers = gmm_params.inputs[0].X().v % 16 || gmm_params.inputs[0].Y().v % 16 ||
gmm_params.inputs[1].X().v % 16 || gmm_params.inputs[1].Y().v % 16;
if ((gmm_params.transpose_input0 || gmm_params.transpose_input1) && gemm_leftovers)
// If gmm_params has dynamic inputs, the actual dimension values cannot be obtained
// and the leftovers cannot be calculated, so Validate() returns false
if ((gmm_params.transpose_input0 || gmm_params.transpose_input1) && (gemm_leftovers || gmm_params.has_dynamic_inputs()))
return false;
for (size_t i = 1; i < gmm_params.inputs.size(); i++)

View File

@@ -298,8 +298,8 @@ TEST(gemm_gpu, dynamic) {
auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx});
std::vector<float> input1_data = {
1.f, -2.f, 3.f, -4.f,
5.f, 6.f, 1.f, 2.f,
1.f, -2.f, 3.f, -4.f,
5.f, 6.f, 1.f, 2.f,
3.f, 3.f, 2.f, -1.f,
};
@@ -342,6 +342,187 @@ TEST(gemm_gpu, dynamic) {
}
}
TEST(gemm_gpu, dynamic_multi_inference_same_shape) {
auto& engine = get_test_engine();
auto in1_dyn_layout = layout{ ov::PartialShape{ 1, 1, ov::Dimension(1, 10), 4 }, data_types::f32, format::bfyx };
auto in1_actual_layout = layout{ ov::PartialShape{ 1, 1, 3, 4 }, data_types::f32, format::bfyx };
auto in2_dyn_layout = layout{ ov::PartialShape{ 4, ov::Dimension(1, 10) }, data_types::f32, format::bfyx };
auto in2_actual_layout = layout{ ov::PartialShape{ 4, 1 }, data_types::f32, format::bfyx };
auto input1_1 = engine.allocate_memory(in1_actual_layout);
auto input1_2 = engine.allocate_memory(in1_actual_layout);
auto input2_1 = engine.allocate_memory(in2_actual_layout);
auto input2_2 = engine.allocate_memory(in2_actual_layout);
std::vector<float> input1_data1 = {
1.f, -2.f, 3.f, -4.f,
5.f, 6.f, 1.f, 2.f,
3.f, 3.f, 2.f, -1.f,
};
std::vector<float> input1_data2 = {
-1.f, 2.f, -3.f, 4.f,
5.f, 6.f, -1.f, 2.f,
3.f, -3.f, 2.f, 1.f,
};
std::vector<float> input2_data1 = {
2.f, 5.f, -4.f, -7.f,
};
std::vector<float> input2_data2 = {
4.f, 7.f, 2.f, 5.f,
};
set_values(input1_1, input1_data1);
set_values(input1_2, input1_data2);
set_values(input2_1, input2_data1);
set_values(input2_2, input2_data2);
std::vector<float> out_data1 = {
8.f, 22.f, 20.f
};
std::vector<float> out_data2 = {
24.f, 70.f, 0.f
};
topology topology;
topology.add(input_layout("input1", in1_dyn_layout),
input_layout("input2", in2_dyn_layout),
gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, false, 1.0f, 0.0f, 4, 2)
);
ExecutionConfig config;
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
{
network.set_input_data("input1", input1_1);
network.set_input_data("input2", input2_1);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "gemm");
auto prog = network.get_program();
auto& node = prog->get_node("gemm");
auto impl = node.get_selected_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());
auto output_prim_mem = outputs.begin()->second.get_memory();
cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
ASSERT_EQ(output_ptr.size(), (uint32_t)3);
for (uint32_t i = 0; i < out_data1.size(); ++i) {
ASSERT_FLOAT_EQ(output_ptr[i], out_data1[i]);
}
}
{
network.set_input_data("input1", input1_2);
network.set_input_data("input2", input2_2);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "gemm");
auto output_prim_mem = outputs.begin()->second.get_memory();
cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
ASSERT_EQ(output_ptr.size(), (uint32_t)3);
for (uint32_t i = 0; i < out_data2.size(); ++i) {
ASSERT_FLOAT_EQ(output_ptr[i], out_data2[i]);
}
}
}
TEST(gemm_gpu, dynamic_multi_inference_different_shape) {
auto& engine = get_test_engine();
auto in1_dyn_layout = layout{ ov::PartialShape{ 1, 1, ov::Dimension(1, 10), 4 }, data_types::f32, format::bfyx };
auto in1_actual_layout1 = layout{ ov::PartialShape{ 1, 1, 3, 4 }, data_types::f32, format::bfyx };
auto in1_actual_layout2 = layout{ ov::PartialShape{ 1, 1, 4, 4 }, data_types::f32, format::bfyx };
auto in2_dyn_layout = layout{ ov::PartialShape{ 4, ov::Dimension(1, 10) }, data_types::f32, format::bfyx };
auto in2_actual_layout = layout{ ov::PartialShape{ 4, 1 }, data_types::f32, format::bfyx };
auto input1_1 = engine.allocate_memory(in1_actual_layout1);
auto input1_2 = engine.allocate_memory(in1_actual_layout2);
auto input2 = engine.allocate_memory(in2_actual_layout);
std::vector<float> input1_data1 = {
1.f, -2.f, 3.f, -4.f,
5.f, 6.f, 1.f, 2.f,
3.f, 3.f, 2.f, -1.f,
};
std::vector<float> input1_data2 = {
-1.f, 2.f, -3.f, 4.f,
5.f, 6.f, -1.f, 2.f,
3.f, -3.f, 2.f, 1.f,
1.f, 2.f, -5.f, 6.f,
};
std::vector<float> input2_data = {
2.f, 5.f, -4.f, -7.f,
};
set_values(input1_1, input1_data1);
set_values(input1_2, input1_data2);
set_values(input2, input2_data);
std::vector<float> out_data1 = {
8.f, 22.f, 20.f
};
std::vector<float> out_data2 = {
-8.f, 30.f, -24.f, -10.f
};
topology topology;
topology.add(input_layout("input1", in1_dyn_layout),
input_layout("input2", in2_dyn_layout),
gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, false, 1.0f, 0.0f, 4, 2)
);
ExecutionConfig config;
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
{
network.set_input_data("input1", input1_1);
network.set_input_data("input2", input2);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "gemm");
auto prog = network.get_program();
auto& node = prog->get_node("gemm");
auto impl = node.get_selected_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());
auto output_prim_mem = outputs.begin()->second.get_memory();
cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
ASSERT_EQ(output_ptr.size(), (uint32_t)3);
for (uint32_t i = 0; i < out_data1.size(); ++i) {
ASSERT_FLOAT_EQ(output_ptr[i], out_data1[i]);
}
}
{
network.set_input_data("input1", input1_2);
network.set_input_data("input2", input2);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "gemm");
auto output_prim_mem = outputs.begin()->second.get_memory();
cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
ASSERT_EQ(output_ptr.size(), (uint32_t)4);
for (uint32_t i = 0; i < out_data2.size(); ++i) {
ASSERT_FLOAT_EQ(output_ptr[i], out_data2[i]);
}
}
}
INSTANTIATE_TEST_SUITE_P(
GemmGPUTest_t1t2,
GemmGPUTestRandom,