[GPU] Fixed gemm_tiled_opt fusing error and refactored code in fusing tests (#9303)

This commit is contained in:
Ilya Znamenskiy
2021-12-22 14:38:24 +03:00
committed by GitHub
parent b7750f8f0d
commit eca9ff6981
4 changed files with 5325 additions and 5177 deletions

View File

@@ -160,7 +160,7 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
"dequantized",
input_dt,
1,
LoadType::LT_ALIGNED_READ,
LoadType::LT_UNALIGNED,
BoundaryCheck::ENABLED,
IndexType::TENSOR_COORD,
Tensor::DataChannelName::Y };

View File

@@ -85,11 +85,12 @@ KERNEL(gemm_tiled_opt)(
const uint sglid = (uint)get_sub_group_local_id();
// Setting x and y for fusings indexing
#if B_VEC_SIZE == 1
// TODO: investigate how we can use only TILE_N_NOT_DIVISIBLE here for getting stable results in fusings
#if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
const uint x = (uint)get_global_id(0);
#else // B_VEC_SIZE == 1
#else // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
const uint x = tile_n_num * SIMD_WIDTH * B_VEC_SIZE;
#endif // B_VEC_SIZE == 1
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
uint y = tile_m_offset;
#if TILE_M_NOT_DIVISIBLE
@@ -282,7 +283,7 @@ KERNEL(gemm_tiled_opt)(
#endif // TILE_K_NOT_DIVISIBLE
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
#if TILE_N_NOT_DIVISIBLE
#if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
FUSED_OPS_PRELOAD_SCALAR;
#else // TILE_N_NOT_DIVISIBLE
FUSED_OPS_PRELOAD_VEC;
@@ -291,7 +292,7 @@ KERNEL(gemm_tiled_opt)(
// Writing result in the global memory
unroll_for (uint write_id = 0; write_id < tile_m_iterations; write_id++) {
#if TILE_N_NOT_DIVISIBLE
#if TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
if (b_raw_global_id < N) {
#ifdef INPUT2_TYPE
ACCUMULATOR_TYPE dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_ptr[sglid];
@@ -312,7 +313,7 @@ KERNEL(gemm_tiled_opt)(
#endif // HAS_FUSED_OPS
}
#else // TILE_N_NOT_DIVISIBLE
#else // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
#ifdef INPUT2_TYPE
B_FLOATN c_val = BLOCK_READ_B(c_ptr, 0);
@@ -333,7 +334,7 @@ KERNEL(gemm_tiled_opt)(
BLOCK_WRITE_C(d_ptr, 0, dequantized);
#endif // HAS_FUSED_OPS
#endif // TILE_N_NOT_DIVISIBLE
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
d_ptr += N;
#ifdef INPUT2_TYPE
c_ptr += N;

View File

@@ -1868,10 +1868,10 @@ std::string FusedOpsCodeGenerator::GetJitLoad(const FusedOpsConfiguration& conf,
return block_read;
} else if (input_tensor.LogicalSize() > 1) {
// Currently we assume that in such scenario we can safely load sub_group_size elements from the pointer
return Broadcast(block_read, input_dt, conf.vec_size);
return Broadcast(block_read, input_dt, vec_size);
} else {
// Input has only one element, so broadcast it for the whole vector size
return Broadcast(GetInputPtrName(input_id) + "[" + index_func_call + "]", input_dt, conf.vec_size);
return Broadcast(GetInputPtrName(input_id) + "[" + index_func_call + "]", input_dt, vec_size);
}
} else {
if (vec_size > 1) {

File diff suppressed because it is too large. (Load Diff)