[GPU] Added shape agnostic optimized GEMM kernel (#15317)
* [GPU] Shape agnostic optimized gemm kernel
* Fix CI failure
* Apply code review
* Fix dynamic shape accuracy drop on SQuAD v1.1 - F1: 91.81%, EM: 85.25% @bert-small-uncased-whole-word-masking-squad-0001
* Apply code review

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
parent
9d8532e998
commit
ab509ce164
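Editor's note on the core of the change: for dynamic shapes the GEMM kernel can no longer bake M, N, and K into the compiled program, so tile-divisibility decisions move from JIT-time 0/1 constants to expressions the kernel evaluates on each dispatch (the IS_DYNAMIC branches in the diff below). A minimal C++ sketch of that idea, using stand-in types rather than the actual kernel_selector API:

// Illustration only: stand-ins for kernel_selector's JIT plumbing.
#include <cstddef>
#include <string>

struct JitConstant {      // becomes "#define <name> <value>" in the
    std::string name;     // generated OpenCL source
    std::string value;
};

// Static-shape build: M is known on the host, so the check folds to 0 or 1
// and the OpenCL compiler removes the dead branch.
JitConstant tile_m_not_divisible_static(std::size_t m, std::size_t tile_m) {
    return {"TILE_M_NOT_DIVISIBLE", std::to_string(m % tile_m != 0)};
}

// Shape-agnostic build: M is known only at dispatch time, so the macro
// expands to an expression over the runtime shape data (m_expr stands for
// whatever toCodeString produced, e.g. a reference into shape_info).
JitConstant tile_m_not_divisible_dynamic(const std::string& m_expr, std::size_t tile_m) {
    return {"TILE_M_NOT_DIVISIBLE", "((" + m_expr + " % " + std::to_string(tile_m) + ") != 0)"};
}

One compiled kernel then serves every shape; the trade-offs visible in the diff are runtime ternaries in gemm_tiled_opt.cl and tile sizes pinned to simd_size in SetTuningParams.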
@@ -201,6 +201,7 @@ kernel_selector::dim_tensor<T> convert_dim_vector(const tensor& t) {
 
 void convert_fused_ops_to_legacy_activations(const kernel_impl_params& param_info, std::vector<kernel_selector::base_activation_params>& activations);
 bool use_legacy_fused_ops(const kernel_impl_params& param_info);
+bool is_shape_agnostic(const kernel_impl_params& param_info);
 
 void set_params(const kernel_impl_params& param_info, kernel_selector::params& params);
 
@@ -215,6 +216,9 @@ inline params_t get_default_params(const kernel_impl_params& param_info) {
 
     params.inputs[0] = convert_data_tensor(input_layout);
     params.outputs[0] = convert_data_tensor(output_layout);
+    if (is_shape_agnostic(param_info)) {
+        params.is_shape_agnostic = true;
+    }
     params.layerID = param_info.desc->id;
 
     if (use_legacy_fused_ops(param_info)) {
@@ -1057,6 +1057,16 @@ bool use_legacy_fused_ops(const kernel_impl_params& param_info) {
     return true;
 }
 
+bool is_shape_agnostic(const kernel_impl_params& param_info) {
+    const auto& program = param_info.prog;
+    const auto& node = program->get_node(param_info.desc->id);
+
+    if (node.is_dynamic())
+        return true;
+
+    return false;
+}
+
 void set_params(const kernel_impl_params& param_info, kernel_selector::params& params) {
     const auto& program = param_info.prog;
     const auto& device_info = program->get_engine().get_device_info();
@@ -27,7 +27,7 @@
 #define BLOCK_WRITE_C(ptr, offset, data) BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, data)
 #endif // TILE_N > SIMD_WIDTH
 
-inline uint FUNC(get_input0_batch_offset)(uint b, uint f, uint w, uint z) {
+inline uint FUNC(get_input0_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
 #if INPUT0_SIMPLE
     return GET_DATA_INDEX_6D_SAFE(INPUT0, b, f, w, z, 0, 0);
 #else // INPUT0_SIMPLE
@@ -35,7 +35,7 @@ inline uint FUNC(get_input0_batch_offset)(uint b, uint f, uint w, uint z) {
 #endif // INPUT0_SIMPLE
 }
 
-inline uint FUNC(get_input1_batch_offset)(uint b, uint f, uint w, uint z) {
+inline uint FUNC(get_input1_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
 #if INPUT1_SIMPLE
     return GET_DATA_INDEX_6D_SAFE(INPUT1, b, f, w, z, 0, 0);
 #else // INPUT1_SIMPLE
@@ -44,7 +44,7 @@ inline uint FUNC(get_input1_batch_offset)(uint b, uint f, uint w, uint z) {
 }
 
 #ifdef INPUT2_TYPE
-inline uint FUNC(get_input2_batch_offset)(uint b, uint f, uint w, uint z) {
+inline uint FUNC(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
 #if INPUT2_SIMPLE
     return GET_DATA_INDEX_6D_SAFE(INPUT2, b, f, w, z, 0, 0);
 #else // INPUT2_SIMPLE
@@ -53,7 +53,7 @@ inline uint FUNC(get_input2_batch_offset)(uint b, uint f, uint w, uint z) {
 }
 #endif // INPUT2_TYPE
 
-inline uint FUNC(get_output_batch_offset)(uint b, uint f, uint w, uint z) {
+inline uint FUNC(get_output_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
 #if OUTPUT_SIMPLE
     return GET_DATA_INDEX_6D(OUTPUT, b, f, w, z, 0, 0);
 #else // OUTPUT_SIMPLE
@@ -65,6 +65,7 @@ inline uint FUNC(get_output_batch_offset)(uint b, uint f, uint w, uint z) {
 REQD_SUB_GROUP_SIZE(SIMD_WIDTH)
 __attribute__((reqd_work_group_size(SIMD_WIDTH, 1, 1)))
 KERNEL(gemm_tiled_opt)(
+    OPTIONAL_SHAPE_INFO_ARG
     const __global INPUT0_TYPE* input0,
     const __global INPUT1_TYPE* input1,
 #ifdef INPUT2_TYPE
@@ -86,19 +87,18 @@ KERNEL(gemm_tiled_opt)(
 
     // Setting x and y for fusings indexing
     // TODO: investigate how we can use only TILE_N_NOT_DIVISIBLE here for getting stable results in fusings
+#if IS_DYNAMIC
+    const uint x = (uint)get_global_id(0);
+#else // IS_DYNAMIC
 #if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
     const uint x = (uint)get_global_id(0);
 #else // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
     const uint x = tile_n_num * SIMD_WIDTH * B_VEC_SIZE;
 #endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
+#endif // IS_DYNAMIC
     uint y = tile_m_offset;
 
-#if TILE_M_NOT_DIVISIBLE
-    const uint tile_m_iterations = tile_m_num == (tile_m_size - 1) ? TILE_M_LEFTOVER : TILE_M;
-#else // TILE_M_NOT_DIVISIBLE
-    const uint tile_m_iterations = TILE_M;
-#endif // TILE_M_NOT_DIVISIBLE
-
+    const uint tile_m_iterations = TILE_M_NOT_DIVISIBLE ? (tile_m_num == (tile_m_size - 1) ? TILE_M_LEFTOVER : TILE_M) : TILE_M;
     const uint z = batch_number % OUTPUT_SIZE_Z;
     batch_number /= OUTPUT_SIZE_Z;
     const uint w = batch_number % OUTPUT_SIZE_W;
@@ -108,12 +108,12 @@ KERNEL(gemm_tiled_opt)(
     const uint b = batch_number % OUTPUT_BATCH_NUM;
 
     // Batch offsets
-    const uint batch_offset_input0 = FUNC_CALL(get_input0_batch_offset)(b, f, w, z);
-    const uint batch_offset_input1 = FUNC_CALL(get_input1_batch_offset)(b, f, w, z);
+    const uint batch_offset_input0 = FUNC_CALL(get_input0_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
+    const uint batch_offset_input1 = FUNC_CALL(get_input1_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
 #ifdef INPUT2_TYPE
-    const uint batch_offset_input2 = FUNC_CALL(get_input2_batch_offset)(b, f, w, z);
+    const uint batch_offset_input2 = FUNC_CALL(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
 #endif // INPUT2_TYPE
-    const uint batch_offset_output = FUNC_CALL(get_output_batch_offset)(b, f, w, z);
+    const uint batch_offset_output = FUNC_CALL(get_output_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
 
     // Start pointers offsets
 #if !TRANSPOSE_INPUT0
@@ -152,11 +152,15 @@ KERNEL(gemm_tiled_opt)(
 
         // Loading B tile
        unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) {
+#if IS_DYNAMIC
+            b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0);
+#else // IS_DYNAMIC
 #if TILE_N_NOT_DIVISIBLE
            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // TILE_N_NOT_DIVISIBLE
            b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
 #endif // TILE_N_NOT_DIVISIBLE
+#endif // IS_DYNAMIC
 #if !TRANSPOSE_INPUT1
            b_ptr += N;
 #else // !TRANSPOSE_INPUT1
@@ -198,11 +202,15 @@ KERNEL(gemm_tiled_opt)(
         // Loading A tile and tile C calculation
         unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
 #if !TRANSPOSE_INPUT0
+#if IS_DYNAMIC
+            A_FLOATN a_read = TILE_K_NOT_DIVISIBLE ? a_ptr[dot_id * K + sglid] : BLOCK_READ_A(a_ptr, dot_id * K);
+#else // IS_DYNAMIC
 #if TILE_K_NOT_DIVISIBLE
             A_FLOATN a_read = a_ptr[dot_id * K + sglid];
 #else // TILE_K_NOT_DIVISIBLE
             A_FLOATN a_read = BLOCK_READ_A(a_ptr, dot_id * K);
 #endif // TILE_K_NOT_DIVISIBLE
+#endif // IS_DYNAMIC
 
             unroll_for (uint subtile_k_id = 0; subtile_k_id < TILE_K / SIMD_WIDTH; subtile_k_id++) {
                 unroll_for (uint simd_local_id = 0; simd_local_id < SIMD_WIDTH; simd_local_id++) {
@@ -261,6 +269,24 @@ KERNEL(gemm_tiled_opt)(
 
     } // Full tile calculation end
 
+#if IS_DYNAMIC
+    if (TILE_K_NOT_DIVISIBLE) {
+        // Loading leftovers of the matrix B
+        unroll_for (uint b_load_id = 0; b_load_id < TILE_K_LEFTOVER; b_load_id++) {
+            b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0);
+            b_ptr += N;
+        } // Loading leftovers of the matrix B end
+
+        // Loading leftovers of the matrix A and tile C calculation
+        unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
+            INPUT0_TYPE a_read = a_ptr[dot_id * K + sglid];
+
+            unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
+                c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
+            }
+        } // Loading leftovers of the matrix A and tile C calculation end
+    }
+#else // IS_DYNAMIC
 #if TILE_K_NOT_DIVISIBLE
     // Loading leftovers of the matrix B
     unroll_for (uint b_load_id = 0; b_load_id < TILE_K_LEFTOVER; b_load_id++) {
@@ -281,17 +307,43 @@ KERNEL(gemm_tiled_opt)(
         }
     } // Loading leftovers of the matrix A and tile C calculation end
 #endif // TILE_K_NOT_DIVISIBLE
+#endif // IS_DYNAMIC
 
 #if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+#if IS_DYNAMIC
+    FUSED_OPS_PRELOAD_SCALAR;
+#else // IS_DYNAMIC
 #if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
     FUSED_OPS_PRELOAD_SCALAR;
 #else // TILE_N_NOT_DIVISIBLE
     FUSED_OPS_PRELOAD_VEC;
 #endif // TILE_N_NOT_DIVISIBLE
+#endif // IS_DYNAMIC
 #endif // HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
 
     // Writing result in the global memory
     unroll_for (uint write_id = 0; write_id < tile_m_iterations; write_id++) {
+#if IS_DYNAMIC
+        if (b_raw_global_id < N) {
+#ifdef INPUT2_TYPE
+            ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_ptr[sglid];
+#else // INPUT2_TYPE
+            ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id];
+#endif // INPUT2_TYPE
+
+#if HAS_FUSED_OPS
+#if FUSED_OPS_CAN_USE_PRELOAD
+            FUSED_OPS_CALC_SCALAR;
+#else // FUSED_OPS_CAN_USE_PRELOAD
+            FUSED_OPS_SCALAR;
+#endif // FUSED_OPS_CAN_USE_PRELOAD
+            OUTPUT_TYPE res = FUSED_OPS_RESULT_SCALAR;
+            d_ptr[sglid] = res;
+#else // HAS_FUSED_OPS
+            d_ptr[sglid] = dequantized;
+#endif // HAS_FUSED_OPS
+        }
+#else // IS_DYNAMIC
 #if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
         if (b_raw_global_id < N) {
 #ifdef INPUT2_TYPE
@@ -335,6 +387,7 @@ KERNEL(gemm_tiled_opt)(
 #endif // HAS_FUSED_OPS
 
 #endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
+#endif // IS_DYNAMIC
         d_ptr += N;
 #ifdef INPUT2_TYPE
         c_ptr += N;
@@ -631,6 +631,7 @@ struct base_params : public Params {
     std::vector<fused_operation_desc> fused_ops = {};
     MultiDataTensor inputs;
     MultiDataTensor outputs;
+    bool is_shape_agnostic;
     std::string to_string() const override;
     std::string to_cache_string_v2() const override;
     ParamsKey GetParamsKey() const override;
@@ -648,7 +649,7 @@ struct base_params : public Params {
     }
 
 protected:
-    explicit base_params(KernelType kt) : Params(kt, ""), inputs(1), outputs(1) {}
+    explicit base_params(KernelType kt) : Params(kt, ""), inputs(1), outputs(1), is_shape_agnostic(false) {}
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -24,6 +24,7 @@ ParamsKey GemmKernelTiledOpt::GetSupportedKey() const {
 
     k.EnableBatching();
     k.EnableDifferentTypes();
+    k.EnableDynamicShapesSupport();
 
     return k;
 }
@@ -32,19 +33,20 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
     const auto& output = params.outputs[0];
 
     DispatchData dispatchData;
-    GemmTuningData td = SetTuningParams(params);
+    if (!params.has_dynamic_tensors()) {
+        GemmTuningData td = SetTuningParams(params);
 
-    auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
-    std::vector<size_t> global = { output.X().v, output.Y().v, total_batches };
+        auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
+        std::vector<size_t> global = { output.X().v, output.Y().v, total_batches };
 
-    dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
-    dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
-    dispatchData.gws[2] = global[2];
-
-    dispatchData.lws[0] = td.simd_size;
-    dispatchData.lws[1] = 1;
-    dispatchData.lws[2] = 1;
+        dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
+        dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
+        dispatchData.gws[2] = global[2];
+
+        dispatchData.lws[0] = td.simd_size;
+        dispatchData.lws[1] = 1;
+        dispatchData.lws[2] = 1;
+    }
     return dispatchData;
 }
@@ -53,25 +55,34 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
 
     GemmKernelTiledOpt::GemmTuningData tuning_data;
 
-    auto m_size = output.Y().v;
-    auto n_size = output.X().v;
-    auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
+    if (!params.is_shape_agnostic) {
+        auto m_size = output.Y().v;
+        auto n_size = output.X().v;
+        auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
 
-    auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
-    tuning_data.simd_size = 8;
+        auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
+        tuning_data.simd_size = 8;
 
-    tuning_data.tile_n_size = tuning_data.simd_size;
-    while (tuning_data.tile_n_size < 64 && n_size / (tuning_data.tile_n_size * 2) >= 1) {
-        tuning_data.tile_n_size *= 2;
-    }
+        tuning_data.tile_n_size = tuning_data.simd_size;
+        while (tuning_data.tile_n_size < 64 && n_size / (tuning_data.tile_n_size * 2) >= 1) {
+            tuning_data.tile_n_size *= 2;
+        }
 
-    // tuning_data.tile_k_size must be the same as simd_size when k % tile_k != 0
-    tuning_data.tile_k_size = tuning_data.simd_size;
-    tuning_data.tile_m_size = tuning_data.simd_size;
+        // tuning_data.tile_k_size must be the same as simd_size when k % tile_k != 0
+        tuning_data.tile_k_size = tuning_data.simd_size;
+        tuning_data.tile_m_size = tuning_data.simd_size;
 
-    bool leftovers = m_size % tuning_data.tile_m_size || k_size % tuning_data.tile_k_size || n_size % tuning_data.tile_n_size;
+        bool leftovers = m_size % tuning_data.tile_m_size || k_size % tuning_data.tile_k_size || n_size % tuning_data.tile_n_size;
 
-    if (leftovers || total_batches > 1 || params.transpose_input0 || params.transpose_input1) {
+        if (leftovers || total_batches > 1 || params.transpose_input0 || params.transpose_input1) {
+            tuning_data.simd_size = 16;
+            tuning_data.tile_n_size = tuning_data.simd_size;
+            tuning_data.tile_k_size = tuning_data.simd_size;
+            tuning_data.tile_m_size = tuning_data.simd_size;
+        }
+    } else {
+        // In shape agnostic kernel case, the vector size of FusedOpsConfiguration cannot be specified at build time,
+        // so the tile sizes must be the same as simd_size
         tuning_data.simd_size = 16;
         tuning_data.tile_n_size = tuning_data.simd_size;
         tuning_data.tile_k_size = tuning_data.simd_size;
@@ -86,33 +97,62 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
 
     const auto& output = params.outputs[0];
     GemmTuningData tuning_data = SetTuningParams(params);
 
-    auto m_size = output.Y().v;
-    auto n_size = output.X().v;
-    auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
-    auto leftover_m = m_size % tuning_data.tile_m_size;
-    auto leftover_n = n_size % tuning_data.tile_n_size;
-    auto leftover_k = k_size % tuning_data.tile_k_size;
     auto b_vec_size = tuning_data.tile_n_size / tuning_data.simd_size;
 
     jit.Merge(MakeTypeJitConstants(params.inputs[0].GetDType(), "ACCUMULATOR"));
-    jit.AddConstants({
-        MakeJitConstant("M", m_size),
-        MakeJitConstant("K", k_size),
-        MakeJitConstant("N", n_size),
-        MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
-        MakeJitConstant("TILE_M", tuning_data.tile_m_size),
-        MakeJitConstant("TILE_K", tuning_data.tile_k_size),
-        MakeJitConstant("TILE_N", tuning_data.tile_n_size),
-        MakeJitConstant("K_FULL_ITERATIONS", k_size / tuning_data.tile_k_size),
-        MakeJitConstant("TILE_M_NOT_DIVISIBLE", leftover_m != 0),
-        MakeJitConstant("TILE_K_NOT_DIVISIBLE", leftover_k != 0),
-        MakeJitConstant("TILE_N_NOT_DIVISIBLE", leftover_n != 0),
-        MakeJitConstant("TILE_M_LEFTOVER", leftover_m),
-        MakeJitConstant("TILE_K_LEFTOVER", leftover_k),
-        MakeJitConstant("TILE_N_LEFTOVER", leftover_n),
-    });
+    if (params.has_dynamic_tensors()) {
+        auto m_size = params.transpose_input0 ? toCodeString(params.inputs[0].X(), 5) : toCodeString(params.inputs[0].Y(), 4);
+        auto n_size = params.transpose_input1 ? toCodeString(params.inputs[1].Y(), 10) : toCodeString(params.inputs[1].X(), 11);
+        auto k_size = params.transpose_input0 ? toCodeString(params.inputs[0].Y(), 4) : toCodeString(params.inputs[0].X(), 5);
+        const std::string leftover_m = "(" + m_size + "%" + std::to_string(tuning_data.tile_m_size) + ")";
+        const std::string leftover_n = "(" + n_size + "%" + std::to_string(tuning_data.tile_n_size) + ")";
+        const std::string leftover_k = "(" + k_size + "%" + std::to_string(tuning_data.tile_k_size) + ")";
+        const std::string not_divisible_m = "(" + leftover_m + "!=0)";
+        const std::string not_divisible_n = "(" + leftover_n + "!=0)";
+        const std::string not_divisible_k = "(" + leftover_k + "!=0)";
+        const std::string full_iteration_k = "(" + k_size + "/" + std::to_string(tuning_data.tile_k_size) + ")";
+
+        jit.AddConstants({
+            MakeJitConstant("M", m_size),
+            MakeJitConstant("K", k_size),
+            MakeJitConstant("N", n_size),
+            MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
+            MakeJitConstant("TILE_M", tuning_data.tile_m_size),
+            MakeJitConstant("TILE_K", tuning_data.tile_k_size),
+            MakeJitConstant("TILE_N", tuning_data.tile_n_size),
+            MakeJitConstant("K_FULL_ITERATIONS", full_iteration_k),
+            MakeJitConstant("TILE_M_NOT_DIVISIBLE", not_divisible_m),
+            MakeJitConstant("TILE_K_NOT_DIVISIBLE", not_divisible_k),
+            MakeJitConstant("TILE_N_NOT_DIVISIBLE", not_divisible_n),
+            MakeJitConstant("TILE_M_LEFTOVER", leftover_m),
+            MakeJitConstant("TILE_K_LEFTOVER", leftover_k),
+            MakeJitConstant("TILE_N_LEFTOVER", leftover_n),
+        });
+    } else {
+        auto m_size = output.Y().v;
+        auto n_size = output.X().v;
+        auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
+        auto leftover_m = m_size % tuning_data.tile_m_size;
+        auto leftover_n = n_size % tuning_data.tile_n_size;
+        auto leftover_k = k_size % tuning_data.tile_k_size;
+
+        jit.AddConstants({
+            MakeJitConstant("M", m_size),
+            MakeJitConstant("K", k_size),
+            MakeJitConstant("N", n_size),
+            MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
+            MakeJitConstant("TILE_M", tuning_data.tile_m_size),
+            MakeJitConstant("TILE_K", tuning_data.tile_k_size),
+            MakeJitConstant("TILE_N", tuning_data.tile_n_size),
+            MakeJitConstant("K_FULL_ITERATIONS", k_size / tuning_data.tile_k_size),
+            MakeJitConstant("TILE_M_NOT_DIVISIBLE", leftover_m != 0),
+            MakeJitConstant("TILE_K_NOT_DIVISIBLE", leftover_k != 0),
+            MakeJitConstant("TILE_N_NOT_DIVISIBLE", leftover_n != 0),
+            MakeJitConstant("TILE_M_LEFTOVER", leftover_m),
+            MakeJitConstant("TILE_K_LEFTOVER", leftover_k),
+            MakeJitConstant("TILE_N_LEFTOVER", leftover_n),
+        });
+    }
 
     if (tuning_data.tile_k_size > tuning_data.simd_size) {
         jit.AddConstants({
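Editor's note on the two branches above: the static branch hands TILE_M_NOT_DIVISIBLE a host-computed 0 or 1, so the OpenCL compiler deletes the dead path, while the dynamic branch hands it a textual expression assembled from toCodeString output, roughly of the shape ((dim % 16) != 0) with the dimension read from runtime shape data (the exact text depends on the offsets passed to toCodeString). The same compiled binary therefore re-evaluates divisibility on every enqueue instead of being specialized per shape.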
@@ -184,7 +224,9 @@ bool GemmKernelTiledOpt::Validate(const Params& params, const optional_params& o
     const auto& gmm_params = static_cast<const gemm_params&>(params);
     bool gemm_leftovers = gmm_params.inputs[0].X().v % 16 || gmm_params.inputs[0].Y().v % 16 ||
                           gmm_params.inputs[1].X().v % 16 || gmm_params.inputs[1].Y().v % 16;
-    if ((gmm_params.transpose_input0 || gmm_params.transpose_input1) && gemm_leftovers)
+    // If gmm_params has dynamic inputs, the correct dimension value cannot be obtained
+    // and leftovers cannot be calculated, so it returns false
+    if ((gmm_params.transpose_input0 || gmm_params.transpose_input1) && (gemm_leftovers || gmm_params.has_dynamic_inputs()))
         return false;
 
     for (size_t i = 1; i < gmm_params.inputs.size(); i++)
@@ -298,8 +298,8 @@ TEST(gemm_gpu, dynamic) {
     auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx});
 
     std::vector<float> input1_data = {
         1.f, -2.f, 3.f, -4.f,
         5.f, 6.f, 1.f, 2.f,
         1.f, -2.f, 3.f, -4.f,
         5.f, 6.f, 1.f, 2.f,
         3.f, 3.f, 2.f, -1.f,
     };
@@ -342,6 +342,187 @@ TEST(gemm_gpu, dynamic) {
     }
 }
 
+TEST(gemm_gpu, dynamic_multi_inference_same_shape) {
+    auto& engine = get_test_engine();
+
+    auto in1_dyn_layout = layout{ ov::PartialShape{ 1, 1, ov::Dimension(1, 10), 4 }, data_types::f32, format::bfyx };
+    auto in1_actual_layout = layout{ ov::PartialShape{ 1, 1, 3, 4 }, data_types::f32, format::bfyx };
+    auto in2_dyn_layout = layout{ ov::PartialShape{ 4, ov::Dimension(1, 10) }, data_types::f32, format::bfyx };
+    auto in2_actual_layout = layout{ ov::PartialShape{ 4, 1 }, data_types::f32, format::bfyx };
+    auto input1_1 = engine.allocate_memory(in1_actual_layout);
+    auto input1_2 = engine.allocate_memory(in1_actual_layout);
+    auto input2_1 = engine.allocate_memory(in2_actual_layout);
+    auto input2_2 = engine.allocate_memory(in2_actual_layout);
+
+    std::vector<float> input1_data1 = {
+        1.f, -2.f, 3.f, -4.f,
+        5.f, 6.f, 1.f, 2.f,
+        3.f, 3.f, 2.f, -1.f,
+    };
+    std::vector<float> input1_data2 = {
+        -1.f, 2.f, -3.f, 4.f,
+        5.f, 6.f, -1.f, 2.f,
+        3.f, -3.f, 2.f, 1.f,
+    };
+    std::vector<float> input2_data1 = {
+        2.f, 5.f, -4.f, -7.f,
+    };
+    std::vector<float> input2_data2 = {
+        4.f, 7.f, 2.f, 5.f,
+    };
+    set_values(input1_1, input1_data1);
+    set_values(input1_2, input1_data2);
+    set_values(input2_1, input2_data1);
+    set_values(input2_2, input2_data2);
+
+    std::vector<float> out_data1 = {
+        8.f, 22.f, 20.f
+    };
+    std::vector<float> out_data2 = {
+        24.f, 70.f, 0.f
+    };
+
+    topology topology;
+    topology.add(input_layout("input1", in1_dyn_layout),
+                 input_layout("input2", in2_dyn_layout),
+                 gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, false, 1.0f, 0.0f, 4, 2)
+    );
+
+    ExecutionConfig config;
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+
+    {
+        network.set_input_data("input1", input1_1);
+        network.set_input_data("input2", input2_1);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "gemm");
+
+        auto prog = network.get_program();
+        auto& node = prog->get_node("gemm");
+        auto impl = node.get_selected_impl();
+        ASSERT_TRUE(impl != nullptr);
+        ASSERT_TRUE(impl->is_dynamic());
+
+        auto output_prim_mem = outputs.begin()->second.get_memory();
+        cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), (uint32_t)3);
+        for (uint32_t i = 0; i < out_data1.size(); ++i) {
+            ASSERT_FLOAT_EQ(output_ptr[i], out_data1[i]);
+        }
+    }
+
+    {
+        network.set_input_data("input1", input1_2);
+        network.set_input_data("input2", input2_2);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "gemm");
+
+        auto output_prim_mem = outputs.begin()->second.get_memory();
+        cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), (uint32_t)3);
+        for (uint32_t i = 0; i < out_data2.size(); ++i) {
+            ASSERT_FLOAT_EQ(output_ptr[i], out_data2[i]);
+        }
+    }
+}
+
+TEST(gemm_gpu, dynamic_multi_inference_different_shape) {
+    auto& engine = get_test_engine();
+
+    auto in1_dyn_layout = layout{ ov::PartialShape{ 1, 1, ov::Dimension(1, 10), 4 }, data_types::f32, format::bfyx };
+    auto in1_actual_layout1 = layout{ ov::PartialShape{ 1, 1, 3, 4 }, data_types::f32, format::bfyx };
+    auto in1_actual_layout2 = layout{ ov::PartialShape{ 1, 1, 4, 4 }, data_types::f32, format::bfyx };
+    auto in2_dyn_layout = layout{ ov::PartialShape{ 4, ov::Dimension(1, 10) }, data_types::f32, format::bfyx };
+    auto in2_actual_layout = layout{ ov::PartialShape{ 4, 1 }, data_types::f32, format::bfyx };
+    auto input1_1 = engine.allocate_memory(in1_actual_layout1);
+    auto input1_2 = engine.allocate_memory(in1_actual_layout2);
+    auto input2 = engine.allocate_memory(in2_actual_layout);
+
+    std::vector<float> input1_data1 = {
+        1.f, -2.f, 3.f, -4.f,
+        5.f, 6.f, 1.f, 2.f,
+        3.f, 3.f, 2.f, -1.f,
+    };
+    std::vector<float> input1_data2 = {
+        -1.f, 2.f, -3.f, 4.f,
+        5.f, 6.f, -1.f, 2.f,
+        3.f, -3.f, 2.f, 1.f,
+        1.f, 2.f, -5.f, 6.f,
+    };
+    std::vector<float> input2_data = {
+        2.f, 5.f, -4.f, -7.f,
+    };
+    set_values(input1_1, input1_data1);
+    set_values(input1_2, input1_data2);
+    set_values(input2, input2_data);
+
+    std::vector<float> out_data1 = {
+        8.f, 22.f, 20.f
+    };
+    std::vector<float> out_data2 = {
+        -8.f, 30.f, -24.f, -10.f
+    };
+
+    topology topology;
+    topology.add(input_layout("input1", in1_dyn_layout),
+                 input_layout("input2", in2_dyn_layout),
+                 gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, false, 1.0f, 0.0f, 4, 2)
+    );
+
+    ExecutionConfig config;
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+
+    {
+        network.set_input_data("input1", input1_1);
+        network.set_input_data("input2", input2);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "gemm");
+
+        auto prog = network.get_program();
+        auto& node = prog->get_node("gemm");
+        auto impl = node.get_selected_impl();
+        ASSERT_TRUE(impl != nullptr);
+        ASSERT_TRUE(impl->is_dynamic());
+
+        auto output_prim_mem = outputs.begin()->second.get_memory();
+        cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), (uint32_t)3);
+        for (uint32_t i = 0; i < out_data1.size(); ++i) {
+            ASSERT_FLOAT_EQ(output_ptr[i], out_data1[i]);
+        }
+    }
+
+    {
+        network.set_input_data("input1", input1_2);
+        network.set_input_data("input2", input2);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "gemm");
+
+        auto output_prim_mem = outputs.begin()->second.get_memory();
+        cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), (uint32_t)4);
+        for (uint32_t i = 0; i < out_data2.size(); ++i) {
+            ASSERT_FLOAT_EQ(output_ptr[i], out_data2[i]);
+        }
+    }
+}
+
 INSTANTIATE_TEST_SUITE_P(
     GemmGPUTest_t1t2,
     GemmGPUTestRandom,