[GPU] Added shape agnostic optimized GEMM kernel (#15317)
* [GPU] Shape agnostic optimized gemm kernel
* Fix CI failure
* Apply code review
* Fix dynamic shape accuracy drop on SQuAD v1.1 - F1: 91.81%, EM: 85.25% @bert-small-uncased-whole-word-masking-squad-0001
* Apply code review

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
parent
9d8532e998
commit
ab509ce164
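Editor's note on the core of the change: for dynamic shapes the GEMM kernel can no longer bake M, N, and K into the compiled program, so tile-divisibility decisions move from JIT-time 0/1 constants to expressions the kernel evaluates on each dispatch (the IS_DYNAMIC branches in the diff below). A minimal C++ sketch of that idea, using stand-in types rather than the actual kernel_selector API:

// Illustration only: stand-ins for kernel_selector's JIT plumbing.
#include <cstddef>
#include <string>

struct JitConstant {      // becomes "#define <name> <value>" in the
    std::string name;     // generated OpenCL source
    std::string value;
};

// Static-shape build: M is known on the host, so the check folds to 0 or 1
// and the OpenCL compiler removes the dead branch.
JitConstant tile_m_not_divisible_static(std::size_t m, std::size_t tile_m) {
    return {"TILE_M_NOT_DIVISIBLE", std::to_string(m % tile_m != 0)};
}

// Shape-agnostic build: M is known only at dispatch time, so the macro
// expands to an expression over the runtime shape data (m_expr stands for
// whatever toCodeString produced, e.g. a reference into shape_info).
JitConstant tile_m_not_divisible_dynamic(const std::string& m_expr, std::size_t tile_m) {
    return {"TILE_M_NOT_DIVISIBLE", "((" + m_expr + " % " + std::to_string(tile_m) + ") != 0)"};
}

One compiled kernel then serves every shape; the trade-offs visible in the diff are runtime ternaries in gemm_tiled_opt.cl and tile sizes pinned to simd_size in SetTuningParams.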
@@ -201,6 +201,7 @@ kernel_selector::dim_tensor<T> convert_dim_vector(const tensor& t) {
 
 void convert_fused_ops_to_legacy_activations(const kernel_impl_params& param_info, std::vector<kernel_selector::base_activation_params>& activations);
 bool use_legacy_fused_ops(const kernel_impl_params& param_info);
+bool is_shape_agnostic(const kernel_impl_params& param_info);
 
 void set_params(const kernel_impl_params& param_info, kernel_selector::params& params);
 
@@ -215,6 +216,9 @@ inline params_t get_default_params(const kernel_impl_params& param_info) {
 
     params.inputs[0] = convert_data_tensor(input_layout);
     params.outputs[0] = convert_data_tensor(output_layout);
+    if (is_shape_agnostic(param_info)) {
+        params.is_shape_agnostic = true;
+    }
     params.layerID = param_info.desc->id;
 
     if (use_legacy_fused_ops(param_info)) {
@@ -1057,6 +1057,16 @@ bool use_legacy_fused_ops(const kernel_impl_params& param_info) {
     return true;
 }
 
+bool is_shape_agnostic(const kernel_impl_params& param_info) {
+    const auto& program = param_info.prog;
+    const auto& node = program->get_node(param_info.desc->id);
+
+    if (node.is_dynamic())
+        return true;
+
+    return false;
+}
+
 void set_params(const kernel_impl_params& param_info, kernel_selector::params& params) {
     const auto& program = param_info.prog;
     const auto& device_info = program->get_engine().get_device_info();
@@ -27,7 +27,7 @@
 #define BLOCK_WRITE_C(ptr, offset, data) BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, data)
 #endif // TILE_N > SIMD_WIDTH
 
-inline uint FUNC(get_input0_batch_offset)(uint b, uint f, uint w, uint z) {
+inline uint FUNC(get_input0_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
 #if INPUT0_SIMPLE
     return GET_DATA_INDEX_6D_SAFE(INPUT0, b, f, w, z, 0, 0);
 #else // INPUT0_SIMPLE
@@ -35,7 +35,7 @@ inline uint FUNC(get_input0_batch_offset)(uint b, uint f, uint w, uint z) {
 #endif // INPUT0_SIMPLE
 }
 
-inline uint FUNC(get_input1_batch_offset)(uint b, uint f, uint w, uint z) {
+inline uint FUNC(get_input1_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
 #if INPUT1_SIMPLE
     return GET_DATA_INDEX_6D_SAFE(INPUT1, b, f, w, z, 0, 0);
 #else // INPUT1_SIMPLE
@@ -44,7 +44,7 @@ inline uint FUNC(get_input1_batch_offset)(uint b, uint f, uint w, uint z) {
 }
 
 #ifdef INPUT2_TYPE
-inline uint FUNC(get_input2_batch_offset)(uint b, uint f, uint w, uint z) {
+inline uint FUNC(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
 #if INPUT2_SIMPLE
     return GET_DATA_INDEX_6D_SAFE(INPUT2, b, f, w, z, 0, 0);
 #else // INPUT2_SIMPLE
@@ -53,7 +53,7 @@ inline uint FUNC(get_input2_batch_offset)(uint b, uint f, uint w, uint z) {
 }
 #endif // INPUT2_TYPE
 
-inline uint FUNC(get_output_batch_offset)(uint b, uint f, uint w, uint z) {
+inline uint FUNC(get_output_batch_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z) {
 #if OUTPUT_SIMPLE
     return GET_DATA_INDEX_6D(OUTPUT, b, f, w, z, 0, 0);
 #else // OUTPUT_SIMPLE
@@ -65,6 +65,7 @@ inline uint FUNC(get_output_batch_offset)(uint b, uint f, uint w, uint z) {
 REQD_SUB_GROUP_SIZE(SIMD_WIDTH)
 __attribute__((reqd_work_group_size(SIMD_WIDTH, 1, 1)))
 KERNEL(gemm_tiled_opt)(
+    OPTIONAL_SHAPE_INFO_ARG
     const __global INPUT0_TYPE* input0,
     const __global INPUT1_TYPE* input1,
 #ifdef INPUT2_TYPE
@@ -86,19 +87,18 @@ KERNEL(gemm_tiled_opt)(
 
     // Setting x and y for fusings indexing
     // TODO: investigate how we can use only TILE_N_NOT_DIVISIBLE here for getting stable results in fusings
+#if IS_DYNAMIC
+    const uint x = (uint)get_global_id(0);
+#else // IS_DYNAMIC
 #if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
     const uint x = (uint)get_global_id(0);
 #else // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
     const uint x = tile_n_num * SIMD_WIDTH * B_VEC_SIZE;
 #endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
+#endif // IS_DYNAMIC
     uint y = tile_m_offset;
 
-#if TILE_M_NOT_DIVISIBLE
-    const uint tile_m_iterations = tile_m_num == (tile_m_size - 1) ? TILE_M_LEFTOVER : TILE_M;
-#else // TILE_M_NOT_DIVISIBLE
-    const uint tile_m_iterations = TILE_M;
-#endif // TILE_M_NOT_DIVISIBLE
-
+    const uint tile_m_iterations = TILE_M_NOT_DIVISIBLE ? (tile_m_num == (tile_m_size - 1) ? TILE_M_LEFTOVER : TILE_M) : TILE_M;
     const uint z = batch_number % OUTPUT_SIZE_Z;
     batch_number /= OUTPUT_SIZE_Z;
     const uint w = batch_number % OUTPUT_SIZE_W;
@@ -108,12 +108,12 @@ KERNEL(gemm_tiled_opt)(
     const uint b = batch_number % OUTPUT_BATCH_NUM;
 
     // Batch offsets
-    const uint batch_offset_input0 = FUNC_CALL(get_input0_batch_offset)(b, f, w, z);
-    const uint batch_offset_input1 = FUNC_CALL(get_input1_batch_offset)(b, f, w, z);
+    const uint batch_offset_input0 = FUNC_CALL(get_input0_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
+    const uint batch_offset_input1 = FUNC_CALL(get_input1_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
 #ifdef INPUT2_TYPE
-    const uint batch_offset_input2 = FUNC_CALL(get_input2_batch_offset)(b, f, w, z);
+    const uint batch_offset_input2 = FUNC_CALL(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
 #endif // INPUT2_TYPE
-    const uint batch_offset_output = FUNC_CALL(get_output_batch_offset)(b, f, w, z);
+    const uint batch_offset_output = FUNC_CALL(get_output_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
 
     // Start pointers offsets
 #if !TRANSPOSE_INPUT0
@@ -152,11 +152,15 @@ KERNEL(gemm_tiled_opt)(
 
         // Loading B tile
        unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) {
+#if IS_DYNAMIC
+            b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0);
+#else // IS_DYNAMIC
 #if TILE_N_NOT_DIVISIBLE
            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // TILE_N_NOT_DIVISIBLE
            b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
 #endif // TILE_N_NOT_DIVISIBLE
+#endif // IS_DYNAMIC
 #if !TRANSPOSE_INPUT1
            b_ptr += N;
 #else // !TRANSPOSE_INPUT1
@@ -198,11 +202,15 @@ KERNEL(gemm_tiled_opt)(
         // Loading A tile and tile C calculation
         unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
 #if !TRANSPOSE_INPUT0
+#if IS_DYNAMIC
+            A_FLOATN a_read = TILE_K_NOT_DIVISIBLE ? a_ptr[dot_id * K + sglid] : BLOCK_READ_A(a_ptr, dot_id * K);
+#else // IS_DYNAMIC
 #if TILE_K_NOT_DIVISIBLE
             A_FLOATN a_read = a_ptr[dot_id * K + sglid];
 #else // TILE_K_NOT_DIVISIBLE
             A_FLOATN a_read = BLOCK_READ_A(a_ptr, dot_id * K);
 #endif // TILE_K_NOT_DIVISIBLE
+#endif // IS_DYNAMIC
 
             unroll_for (uint subtile_k_id = 0; subtile_k_id < TILE_K / SIMD_WIDTH; subtile_k_id++) {
                 unroll_for (uint simd_local_id = 0; simd_local_id < SIMD_WIDTH; simd_local_id++) {
@@ -261,6 +269,24 @@ KERNEL(gemm_tiled_opt)(
 
     } // Full tile calculation end
 
+#if IS_DYNAMIC
+    if (TILE_K_NOT_DIVISIBLE) {
+        // Loading leftovers of the matrix B
+        unroll_for (uint b_load_id = 0; b_load_id < TILE_K_LEFTOVER; b_load_id++) {
+            b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0);
+            b_ptr += N;
+        } // Loading leftovers of the matrix B end
+
+        // Loading leftovers of the matrix A and tile C calculation
+        unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
+            INPUT0_TYPE a_read = a_ptr[dot_id * K + sglid];
+
+            unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
+                c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
+            }
+        } // Loading leftovers of the matrix A and tile C calculation end
+    }
+#else // IS_DYNAMIC
 #if TILE_K_NOT_DIVISIBLE
     // Loading leftovers of the matrix B
     unroll_for (uint b_load_id = 0; b_load_id < TILE_K_LEFTOVER; b_load_id++) {
@@ -281,17 +307,43 @@ KERNEL(gemm_tiled_opt)(
         }
     } // Loading leftovers of the matrix A and tile C calculation end
 #endif // TILE_K_NOT_DIVISIBLE
+#endif // IS_DYNAMIC
 
 #if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+#if IS_DYNAMIC
+    FUSED_OPS_PRELOAD_SCALAR;
+#else // IS_DYNAMIC
 #if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
     FUSED_OPS_PRELOAD_SCALAR;
 #else // TILE_N_NOT_DIVISIBLE
     FUSED_OPS_PRELOAD_VEC;
 #endif // TILE_N_NOT_DIVISIBLE
+#endif // IS_DYNAMIC
 #endif // HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
 
     // Writing result in the global memory
     unroll_for (uint write_id = 0; write_id < tile_m_iterations; write_id++) {
+#if IS_DYNAMIC
+        if (b_raw_global_id < N) {
+#ifdef INPUT2_TYPE
+            ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_ptr[sglid];
+#else // INPUT2_TYPE
+            ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id];
+#endif // INPUT2_TYPE
+
+#if HAS_FUSED_OPS
+#if FUSED_OPS_CAN_USE_PRELOAD
+            FUSED_OPS_CALC_SCALAR;
+#else // FUSED_OPS_CAN_USE_PRELOAD
+            FUSED_OPS_SCALAR;
+#endif // FUSED_OPS_CAN_USE_PRELOAD
+            OUTPUT_TYPE res = FUSED_OPS_RESULT_SCALAR;
+            d_ptr[sglid] = res;
+#else // HAS_FUSED_OPS
+            d_ptr[sglid] = dequantized;
+#endif // HAS_FUSED_OPS
+        }
+#else // IS_DYNAMIC
 #if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
         if (b_raw_global_id < N) {
 #ifdef INPUT2_TYPE
@@ -335,6 +387,7 @@ KERNEL(gemm_tiled_opt)(
 #endif // HAS_FUSED_OPS
 
 #endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
+#endif // IS_DYNAMIC
         d_ptr += N;
 #ifdef INPUT2_TYPE
         c_ptr += N;
@@ -631,6 +631,7 @@ struct base_params : public Params {
     std::vector<fused_operation_desc> fused_ops = {};
     MultiDataTensor inputs;
     MultiDataTensor outputs;
+    bool is_shape_agnostic;
     std::string to_string() const override;
     std::string to_cache_string_v2() const override;
     ParamsKey GetParamsKey() const override;
@@ -648,7 +649,7 @@ struct base_params : public Params {
     }
 
 protected:
-    explicit base_params(KernelType kt) : Params(kt, ""), inputs(1), outputs(1) {}
+    explicit base_params(KernelType kt) : Params(kt, ""), inputs(1), outputs(1), is_shape_agnostic(false) {}
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -24,6 +24,7 @@ ParamsKey GemmKernelTiledOpt::GetSupportedKey() const {
 
     k.EnableBatching();
     k.EnableDifferentTypes();
+    k.EnableDynamicShapesSupport();
 
     return k;
 }
@@ -32,19 +33,20 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
     const auto& output = params.outputs[0];
 
     DispatchData dispatchData;
-    GemmTuningData td = SetTuningParams(params);
+    if (!params.has_dynamic_tensors()) {
+        GemmTuningData td = SetTuningParams(params);
 
-    auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
-    std::vector<size_t> global = { output.X().v, output.Y().v, total_batches };
+        auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
+        std::vector<size_t> global = { output.X().v, output.Y().v, total_batches };
 
-    dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
-    dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
-    dispatchData.gws[2] = global[2];
-
-    dispatchData.lws[0] = td.simd_size;
-    dispatchData.lws[1] = 1;
-    dispatchData.lws[2] = 1;
+        dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
+        dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
+        dispatchData.gws[2] = global[2];
+
+        dispatchData.lws[0] = td.simd_size;
+        dispatchData.lws[1] = 1;
+        dispatchData.lws[2] = 1;
+    }
     return dispatchData;
 }
@@ -53,25 +55,34 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
 
     GemmKernelTiledOpt::GemmTuningData tuning_data;
 
-    auto m_size = output.Y().v;
-    auto n_size = output.X().v;
-    auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
+    if (!params.is_shape_agnostic) {
+        auto m_size = output.Y().v;
+        auto n_size = output.X().v;
+        auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
 
-    auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
-    tuning_data.simd_size = 8;
+        auto total_batches = output.LogicalSize() / (output.X().v * output.Y().v);
+        tuning_data.simd_size = 8;
 
-    tuning_data.tile_n_size = tuning_data.simd_size;
-    while (tuning_data.tile_n_size < 64 && n_size / (tuning_data.tile_n_size * 2) >= 1) {
-        tuning_data.tile_n_size *= 2;
-    }
+        tuning_data.tile_n_size = tuning_data.simd_size;
+        while (tuning_data.tile_n_size < 64 && n_size / (tuning_data.tile_n_size * 2) >= 1) {
+            tuning_data.tile_n_size *= 2;
+        }
 
-    // tuning_data.tile_k_size must be the same as simd_size when k % tile_k != 0
-    tuning_data.tile_k_size = tuning_data.simd_size;
-    tuning_data.tile_m_size = tuning_data.simd_size;
+        // tuning_data.tile_k_size must be the same as simd_size when k % tile_k != 0
+        tuning_data.tile_k_size = tuning_data.simd_size;
+        tuning_data.tile_m_size = tuning_data.simd_size;
 
-    bool leftovers = m_size % tuning_data.tile_m_size || k_size % tuning_data.tile_k_size || n_size % tuning_data.tile_n_size;
+        bool leftovers = m_size % tuning_data.tile_m_size || k_size % tuning_data.tile_k_size || n_size % tuning_data.tile_n_size;
 
-    if (leftovers || total_batches > 1 || params.transpose_input0 || params.transpose_input1) {
+        if (leftovers || total_batches > 1 || params.transpose_input0 || params.transpose_input1) {
+            tuning_data.simd_size = 16;
+            tuning_data.tile_n_size = tuning_data.simd_size;
+            tuning_data.tile_k_size = tuning_data.simd_size;
+            tuning_data.tile_m_size = tuning_data.simd_size;
+        }
+    } else {
+        // In shape agnostic kernel case, the vector size of FusedOpsConfiguration cannot be specified at build time,
+        // so the tile sizes must be the same as simd_size
         tuning_data.simd_size = 16;
         tuning_data.tile_n_size = tuning_data.simd_size;
         tuning_data.tile_k_size = tuning_data.simd_size;
@@ -86,33 +97,62 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
 
     const auto& output = params.outputs[0];
     GemmTuningData tuning_data = SetTuningParams(params);
 
-    auto m_size = output.Y().v;
-    auto n_size = output.X().v;
-    auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
-    auto leftover_m = m_size % tuning_data.tile_m_size;
-    auto leftover_n = n_size % tuning_data.tile_n_size;
-    auto leftover_k = k_size % tuning_data.tile_k_size;
     auto b_vec_size = tuning_data.tile_n_size / tuning_data.simd_size;
 
     jit.Merge(MakeTypeJitConstants(params.inputs[0].GetDType(), "ACCUMULATOR"));
-    jit.AddConstants({
-        MakeJitConstant("M", m_size),
-        MakeJitConstant("K", k_size),
-        MakeJitConstant("N", n_size),
-        MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
-        MakeJitConstant("TILE_M", tuning_data.tile_m_size),
-        MakeJitConstant("TILE_K", tuning_data.tile_k_size),
-        MakeJitConstant("TILE_N", tuning_data.tile_n_size),
-        MakeJitConstant("K_FULL_ITERATIONS", k_size / tuning_data.tile_k_size),
-        MakeJitConstant("TILE_M_NOT_DIVISIBLE", leftover_m != 0),
-        MakeJitConstant("TILE_K_NOT_DIVISIBLE", leftover_k != 0),
-        MakeJitConstant("TILE_N_NOT_DIVISIBLE", leftover_n != 0),
-        MakeJitConstant("TILE_M_LEFTOVER", leftover_m),
-        MakeJitConstant("TILE_K_LEFTOVER", leftover_k),
-        MakeJitConstant("TILE_N_LEFTOVER", leftover_n),
-    });
+    if (params.has_dynamic_tensors()) {
+        auto m_size = params.transpose_input0 ? toCodeString(params.inputs[0].X(), 5) : toCodeString(params.inputs[0].Y(), 4);
+        auto n_size = params.transpose_input1 ? toCodeString(params.inputs[1].Y(), 10) : toCodeString(params.inputs[1].X(), 11);
+        auto k_size = params.transpose_input0 ? toCodeString(params.inputs[0].Y(), 4) : toCodeString(params.inputs[0].X(), 5);
+        const std::string leftover_m = "(" + m_size + "%" + std::to_string(tuning_data.tile_m_size) + ")";
+        const std::string leftover_n = "(" + n_size + "%" + std::to_string(tuning_data.tile_n_size) + ")";
+        const std::string leftover_k = "(" + k_size + "%" + std::to_string(tuning_data.tile_k_size) + ")";
+        const std::string not_divisible_m = "(" + leftover_m + "!=0)";
+        const std::string not_divisible_n = "(" + leftover_n + "!=0)";
+        const std::string not_divisible_k = "(" + leftover_k + "!=0)";
+        const std::string full_iteration_k = "(" + k_size + "/" + std::to_string(tuning_data.tile_k_size) + ")";
+
+        jit.AddConstants({
+            MakeJitConstant("M", m_size),
+            MakeJitConstant("K", k_size),
+            MakeJitConstant("N", n_size),
+            MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
+            MakeJitConstant("TILE_M", tuning_data.tile_m_size),
+            MakeJitConstant("TILE_K", tuning_data.tile_k_size),
+            MakeJitConstant("TILE_N", tuning_data.tile_n_size),
+            MakeJitConstant("K_FULL_ITERATIONS", full_iteration_k),
+            MakeJitConstant("TILE_M_NOT_DIVISIBLE", not_divisible_m),
+            MakeJitConstant("TILE_K_NOT_DIVISIBLE", not_divisible_k),
+            MakeJitConstant("TILE_N_NOT_DIVISIBLE", not_divisible_n),
+            MakeJitConstant("TILE_M_LEFTOVER", leftover_m),
+            MakeJitConstant("TILE_K_LEFTOVER", leftover_k),
+            MakeJitConstant("TILE_N_LEFTOVER", leftover_n),
+        });
+    } else {
+        auto m_size = output.Y().v;
+        auto n_size = output.X().v;
+        auto k_size = params.transpose_input0 ? params.inputs[0].Y().v : params.inputs[0].X().v;
+        auto leftover_m = m_size % tuning_data.tile_m_size;
+        auto leftover_n = n_size % tuning_data.tile_n_size;
+        auto leftover_k = k_size % tuning_data.tile_k_size;
+
+        jit.AddConstants({
+            MakeJitConstant("M", m_size),
+            MakeJitConstant("K", k_size),
+            MakeJitConstant("N", n_size),
+            MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
+            MakeJitConstant("TILE_M", tuning_data.tile_m_size),
+            MakeJitConstant("TILE_K", tuning_data.tile_k_size),
+            MakeJitConstant("TILE_N", tuning_data.tile_n_size),
+            MakeJitConstant("K_FULL_ITERATIONS", k_size / tuning_data.tile_k_size),
+            MakeJitConstant("TILE_M_NOT_DIVISIBLE", leftover_m != 0),
+            MakeJitConstant("TILE_K_NOT_DIVISIBLE", leftover_k != 0),
+            MakeJitConstant("TILE_N_NOT_DIVISIBLE", leftover_n != 0),
+            MakeJitConstant("TILE_M_LEFTOVER", leftover_m),
+            MakeJitConstant("TILE_K_LEFTOVER", leftover_k),
+            MakeJitConstant("TILE_N_LEFTOVER", leftover_n),
+        });
+    }
 
     if (tuning_data.tile_k_size > tuning_data.simd_size) {
         jit.AddConstants({
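Editor's note on the two branches above: the static branch hands TILE_M_NOT_DIVISIBLE a host-computed 0 or 1, so the OpenCL compiler deletes the dead path, while the dynamic branch hands it a textual expression assembled from toCodeString output, roughly of the shape ((dim % 16) != 0) with the dimension read from runtime shape data (the exact text depends on the offsets passed to toCodeString). The same compiled binary therefore re-evaluates divisibility on every enqueue instead of being specialized per shape.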
@@ -184,7 +224,9 @@ bool GemmKernelTiledOpt::Validate(const Params& params, const optional_params& o
     const auto& gmm_params = static_cast<const gemm_params&>(params);
     bool gemm_leftovers = gmm_params.inputs[0].X().v % 16 || gmm_params.inputs[0].Y().v % 16 ||
                           gmm_params.inputs[1].X().v % 16 || gmm_params.inputs[1].Y().v % 16;
-    if ((gmm_params.transpose_input0 || gmm_params.transpose_input1) && gemm_leftovers)
+    // If gmm_params has dynamic inputs, the correct dimension value cannot be obtained
+    // and leftovers cannot be calculated, so it returns false
+    if ((gmm_params.transpose_input0 || gmm_params.transpose_input1) && (gemm_leftovers || gmm_params.has_dynamic_inputs()))
         return false;
 
     for (size_t i = 1; i < gmm_params.inputs.size(); i++)
@@ -298,8 +298,8 @@ TEST(gemm_gpu, dynamic) {
     auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx});
 
     std::vector<float> input1_data = {
         1.f, -2.f, 3.f, -4.f,
         5.f, 6.f, 1.f, 2.f,
         1.f, -2.f, 3.f, -4.f,
         5.f, 6.f, 1.f, 2.f,
         3.f, 3.f, 2.f, -1.f,
     };
@@ -342,6 +342,187 @@ TEST(gemm_gpu, dynamic) {
     }
 }
 
+TEST(gemm_gpu, dynamic_multi_inference_same_shape) {
+    auto& engine = get_test_engine();
+
+    auto in1_dyn_layout = layout{ ov::PartialShape{ 1, 1, ov::Dimension(1, 10), 4 }, data_types::f32, format::bfyx };
+    auto in1_actual_layout = layout{ ov::PartialShape{ 1, 1, 3, 4 }, data_types::f32, format::bfyx };
+    auto in2_dyn_layout = layout{ ov::PartialShape{ 4, ov::Dimension(1, 10) }, data_types::f32, format::bfyx };
+    auto in2_actual_layout = layout{ ov::PartialShape{ 4, 1 }, data_types::f32, format::bfyx };
+    auto input1_1 = engine.allocate_memory(in1_actual_layout);
+    auto input1_2 = engine.allocate_memory(in1_actual_layout);
+    auto input2_1 = engine.allocate_memory(in2_actual_layout);
+    auto input2_2 = engine.allocate_memory(in2_actual_layout);
+
+    std::vector<float> input1_data1 = {
+        1.f, -2.f, 3.f, -4.f,
+        5.f, 6.f, 1.f, 2.f,
+        3.f, 3.f, 2.f, -1.f,
+    };
+    std::vector<float> input1_data2 = {
+        -1.f, 2.f, -3.f, 4.f,
+        5.f, 6.f, -1.f, 2.f,
+        3.f, -3.f, 2.f, 1.f,
+    };
+    std::vector<float> input2_data1 = {
+        2.f, 5.f, -4.f, -7.f,
+    };
+    std::vector<float> input2_data2 = {
+        4.f, 7.f, 2.f, 5.f,
+    };
+    set_values(input1_1, input1_data1);
+    set_values(input1_2, input1_data2);
+    set_values(input2_1, input2_data1);
+    set_values(input2_2, input2_data2);
+
+    std::vector<float> out_data1 = {
+        8.f, 22.f, 20.f
+    };
+    std::vector<float> out_data2 = {
+        24.f, 70.f, 0.f
+    };
+
+    topology topology;
+    topology.add(input_layout("input1", in1_dyn_layout),
+                 input_layout("input2", in2_dyn_layout),
+                 gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, false, 1.0f, 0.0f, 4, 2)
+    );
+
+    ExecutionConfig config;
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+
+    {
+        network.set_input_data("input1", input1_1);
+        network.set_input_data("input2", input2_1);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "gemm");
+
+        auto prog = network.get_program();
+        auto& node = prog->get_node("gemm");
+        auto impl = node.get_selected_impl();
+        ASSERT_TRUE(impl != nullptr);
+        ASSERT_TRUE(impl->is_dynamic());
+
+        auto output_prim_mem = outputs.begin()->second.get_memory();
+        cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), (uint32_t)3);
+        for (uint32_t i = 0; i < out_data1.size(); ++i) {
+            ASSERT_FLOAT_EQ(output_ptr[i], out_data1[i]);
+        }
+    }
+
+    {
+        network.set_input_data("input1", input1_2);
+        network.set_input_data("input2", input2_2);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "gemm");
+
+        auto output_prim_mem = outputs.begin()->second.get_memory();
+        cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), (uint32_t)3);
+        for (uint32_t i = 0; i < out_data2.size(); ++i) {
+            ASSERT_FLOAT_EQ(output_ptr[i], out_data2[i]);
+        }
+    }
+}
+
+TEST(gemm_gpu, dynamic_multi_inference_different_shape) {
+    auto& engine = get_test_engine();
+
+    auto in1_dyn_layout = layout{ ov::PartialShape{ 1, 1, ov::Dimension(1, 10), 4 }, data_types::f32, format::bfyx };
+    auto in1_actual_layout1 = layout{ ov::PartialShape{ 1, 1, 3, 4 }, data_types::f32, format::bfyx };
+    auto in1_actual_layout2 = layout{ ov::PartialShape{ 1, 1, 4, 4 }, data_types::f32, format::bfyx };
+    auto in2_dyn_layout = layout{ ov::PartialShape{ 4, ov::Dimension(1, 10) }, data_types::f32, format::bfyx };
+    auto in2_actual_layout = layout{ ov::PartialShape{ 4, 1 }, data_types::f32, format::bfyx };
+    auto input1_1 = engine.allocate_memory(in1_actual_layout1);
+    auto input1_2 = engine.allocate_memory(in1_actual_layout2);
+    auto input2 = engine.allocate_memory(in2_actual_layout);
+
+    std::vector<float> input1_data1 = {
+        1.f, -2.f, 3.f, -4.f,
+        5.f, 6.f, 1.f, 2.f,
+        3.f, 3.f, 2.f, -1.f,
+    };
+    std::vector<float> input1_data2 = {
+        -1.f, 2.f, -3.f, 4.f,
+        5.f, 6.f, -1.f, 2.f,
+        3.f, -3.f, 2.f, 1.f,
+        1.f, 2.f, -5.f, 6.f,
+    };
+    std::vector<float> input2_data = {
+        2.f, 5.f, -4.f, -7.f,
+    };
+    set_values(input1_1, input1_data1);
+    set_values(input1_2, input1_data2);
+    set_values(input2, input2_data);
+
+    std::vector<float> out_data1 = {
+        8.f, 22.f, 20.f
+    };
+    std::vector<float> out_data2 = {
+        -8.f, 30.f, -24.f, -10.f
+    };
+
+    topology topology;
+    topology.add(input_layout("input1", in1_dyn_layout),
+                 input_layout("input2", in2_dyn_layout),
+                 gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f32, false, false, 1.0f, 0.0f, 4, 2)
+    );
+
+    ExecutionConfig config;
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+
+    {
+        network.set_input_data("input1", input1_1);
+        network.set_input_data("input2", input2);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "gemm");
+
+        auto prog = network.get_program();
+        auto& node = prog->get_node("gemm");
+        auto impl = node.get_selected_impl();
+        ASSERT_TRUE(impl != nullptr);
+        ASSERT_TRUE(impl->is_dynamic());
+
+        auto output_prim_mem = outputs.begin()->second.get_memory();
+        cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), (uint32_t)3);
+        for (uint32_t i = 0; i < out_data1.size(); ++i) {
+            ASSERT_FLOAT_EQ(output_ptr[i], out_data1[i]);
+        }
+    }
+
+    {
+        network.set_input_data("input1", input1_2);
+        network.set_input_data("input2", input2);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "gemm");
+
+        auto output_prim_mem = outputs.begin()->second.get_memory();
+        cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), (uint32_t)4);
+        for (uint32_t i = 0; i < out_data2.size(); ++i) {
+            ASSERT_FLOAT_EQ(output_ptr[i], out_data2[i]);
+        }
+    }
+}
+
 INSTANTIATE_TEST_SUITE_P(
     GemmGPUTest_t1t2,
     GemmGPUTestRandom,