[GPU] Fixed gemm_tiled_opt fusing error and refactored code in fusing tests (#9303)

This commit is contained in:
Ilya Znamenskiy
2021-12-22 14:38:24 +03:00
committed by GitHub
parent b7750f8f0d
commit eca9ff6981
4 changed files with 5325 additions and 5177 deletions

View File

@@ -160,7 +160,7 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
"dequantized",
input_dt,
1,
LoadType::LT_ALIGNED_READ,
LoadType::LT_UNALIGNED,
BoundaryCheck::ENABLED,
IndexType::TENSOR_COORD,
Tensor::DataChannelName::Y };

View File

@@ -85,11 +85,12 @@ KERNEL(gemm_tiled_opt)(
const uint sglid = (uint)get_sub_group_local_id();
// Setting x and y for fusings indexing
#if B_VEC_SIZE == 1
// TODO: investigate how we can use only TILE_N_NOT_DIVISIBLE here for getting stable results in fusings
#if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
const uint x = (uint)get_global_id(0);
#else // B_VEC_SIZE == 1
#else // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
const uint x = tile_n_num * SIMD_WIDTH * B_VEC_SIZE;
#endif // B_VEC_SIZE == 1
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
uint y = tile_m_offset;
#if TILE_M_NOT_DIVISIBLE
@@ -282,7 +283,7 @@ KERNEL(gemm_tiled_opt)(
#endif // TILE_K_NOT_DIVISIBLE
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
#if TILE_N_NOT_DIVISIBLE
#if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
FUSED_OPS_PRELOAD_SCALAR;
#else // TILE_N_NOT_DIVISIBLE
FUSED_OPS_PRELOAD_VEC;
@@ -291,7 +292,7 @@ KERNEL(gemm_tiled_opt)(
// Writing result in the global memory
unroll_for (uint write_id = 0; write_id < tile_m_iterations; write_id++) {
#if TILE_N_NOT_DIVISIBLE
#if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
if (b_raw_global_id < N) {
#ifdef INPUT2_TYPE
ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_ptr[sglid];
@@ -312,7 +313,7 @@ KERNEL(gemm_tiled_opt)(
#endif // HAS_FUSED_OPS
}
#else // TILE_N_NOT_DIVISIBLE
#else // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
#ifdef INPUT2_TYPE
B_FLOATN c_val = BLOCK_READ_B(c_ptr, 0);
@@ -333,7 +334,7 @@ KERNEL(gemm_tiled_opt)(
BLOCK_WRITE_C(d_ptr, 0, dequantized);
#endif // HAS_FUSED_OPS
#endif // TILE_N_NOT_DIVISIBLE
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
d_ptr += N;
#ifdef INPUT2_TYPE
c_ptr += N;

View File

@@ -1868,10 +1868,10 @@ std::string FusedOpsCodeGenerator::GetJitLoad(const FusedOpsConfiguration& conf,
return block_read;
} else if (input_tensor.LogicalSize() > 1) {
// Currently we assume that in such scenario we can safely load sub_group_size elements from the pointer
return Broadcast(block_read, input_dt, conf.vec_size);
return Broadcast(block_read, input_dt, vec_size);
} else {
// Input has only one element, so broadcast it for the whole vector size
return Broadcast(GetInputPtrName(input_id) + "[" + index_func_call + "]", input_dt, conf.vec_size);
return Broadcast(GetInputPtrName(input_id) + "[" + index_func_call + "]", input_dt, vec_size);
}
} else {
if (vec_size > 1) {

File diff suppressed because it is too large. (Load Diff)