[GPU] Applied w/a to resolve softmax accuracy issue (#16818)

* Applied w/a to resolve softmax accuracy issue
The original impl resulted in an accuracy issue if the leftover is not aligned with the subgroup size.
(e.g., for shape [1024, 306] where the lws = 32, itemsNum = 9, leftover = 18, subgroup size = 16)
In such a case, the result got wrong if subgroup block read/write is used.
As a w/a, do not use subgroup block read/write if the leftover is not aligned with the subgroup size.
However, we can come up with better itemsNum sizing / leftover handling in the follow-up work.

* Fix build error & minor revise

* Fix condition
This commit is contained in:
Taylor Yeonbok Lee 2023-04-11 10:01:22 -07:00 committed by GitHub
parent 4fbd094cba
commit 7513e9dee1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 33 deletions

View File

@ -21,12 +21,7 @@
#define CALC_POWER(n) ({uint pos = 0; uint i = n; do { i >>= 1; ++pos; } while (i); --pos;})
#endif
#define SUB_GROUP_SIZE 16
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
#if !IS_DYNAMIC
__attribute__((reqd_work_group_size(LWS, 1, 1)))
#endif
KERNEL (softmax_gpu_continuous_bfyx)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
@ -61,24 +56,24 @@ KERNEL (softmax_gpu_continuous_bfyx)(
__local INPUT0_TYPE lg_storage[SLM_SIZE];
uint i=0;
#if SUBGROUP_BLOCK_SIZE != 1
if (workers_per_data_set > SUB_GROUP_SIZE)
{
for (; i<items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
{
INPUT0_TYPE tmp = input[data_set_offset + subgroup_offset + i * get_sub_group_size() + get_sub_group_local_id()];
my_maximum = max(my_maximum, tmp);
my_chunk[i] = tmp;
BLOCK_TYPE vec_tmp = BLOCK_READ(input, data_set_offset + subgroup_offset + i * get_sub_group_size());
#if SUBGROUP_BLOCK_SIZE == 1
my_maximum = max(my_maximum, vec_tmp);
my_chunk[i] = vec_tmp;
#else
for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
{
INPUT0_TYPE tmp = vec_tmp[j];
my_maximum = max(my_maximum, tmp);
my_chunk[i+j] = tmp;
}
#endif
}
}
#endif
for (; i<items_num; i++)
{
INPUT0_TYPE tmp = input[data_set_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()];
@ -142,29 +137,25 @@ KERNEL (softmax_gpu_continuous_bfyx)(
my_sum = lg_storage[0];
i=0;
i=0;
#if HAS_FUSED_OPS
#if SUBGROUP_BLOCK_SIZE != 1
if (workers_per_data_set > SUB_GROUP_SIZE)
{
for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
{
BLOCK_TYPE vec_tmp;
#if SUBGROUP_BLOCK_SIZE == 1
ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
FUSED_OPS_MAIN;
vec_tmp = FUSED_OPS_RESULT_MAIN;
#else
for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
{
ACTIVATION_TYPE dequantized = my_chunk[i + j] / my_sum;
FUSED_OPS_MAIN;
vec_tmp[j] = FUSED_OPS_RESULT_MAIN;
}
#endif
BLOCK_WRITE(output, data_set_offset + subgroup_offset + i * get_sub_group_size(), vec_tmp);
}
}
#endif
for (; i<items_num; i++)
{
ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
@ -178,20 +169,18 @@ KERNEL (softmax_gpu_continuous_bfyx)(
output[data_set_offset + workers_per_data_set * items_num + in_data_set_idx] = FUSED_OPS_RESULT_LEFTOVERS;
}
#else
#if SUBGROUP_BLOCK_SIZE != 1
if (workers_per_data_set > SUB_GROUP_SIZE)
{
for (; i<items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
{
BLOCK_TYPE vec_tmp;
#if SUBGROUP_BLOCK_SIZE == 1
vec_tmp = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
#else
for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
vec_tmp[j] = ACTIVATION(my_chunk[i + j] / my_sum, ACTIVATION_PARAMS);
#endif
BLOCK_WRITE(output, data_set_offset + subgroup_offset + i * get_sub_group_size(), vec_tmp);
}
}
#endif
for (; i < items_num; i++)
{
output[data_set_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()] = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);

View File

@ -7,6 +7,7 @@
#include <algorithm>
namespace kernel_selector {
static constexpr size_t subgroup_size = 16;
ParamsKey SoftmaxKernel_bf::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
@ -52,19 +53,24 @@ SoftmaxKernel_bf::Parent::DispatchData SoftmaxKernel_bf::SetDefault(const softma
dispatchData.itemsNum /= 2;
}
if (dispatchData.itemsNum >> 3)
dispatchData.subgroupBlockSize = 8;
else if (dispatchData.itemsNum >> 2)
dispatchData.subgroupBlockSize = 4;
else if (dispatchData.itemsNum >> 1)
dispatchData.subgroupBlockSize = 2;
else
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
//if (dispatchData.leftovers % subgroup_size) {
// To use subgroup read/write, the starting address should be aligned to 128 bit
if ((dispatchData.dataSetSize * params.inputs[0].ElementSize()) >> 4) {
dispatchData.subgroupBlockSize = 1;
} else {
if (dispatchData.itemsNum >> 3)
dispatchData.subgroupBlockSize = 8;
else if (dispatchData.itemsNum >> 2)
dispatchData.subgroupBlockSize = 4;
else if (dispatchData.itemsNum >> 1)
dispatchData.subgroupBlockSize = 2;
else
dispatchData.subgroupBlockSize = 1;
}
assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?");
dispatchData.gws[0] = dispatchData.lws[0];
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0);
} else {
@ -115,7 +121,6 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis
MakeJitConstant("DATA_SETS_COUNT", data_set_count),
MakeJitConstant("DATA_SET_SIZE", data_set_size),
MakeJitConstant("STACK_SIZE", stack_size),
MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize),
});
} else {
jit.AddConstants({
@ -126,9 +131,10 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
MakeJitConstant("STACK_SIZE", dispatchData.itemsNum + 1),
MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize),
});
}
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
jit.AddConstant(MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize));
auto activation_dt = GetActivationType(params);
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));

View File

@ -20,7 +20,10 @@ const std::vector<ov::Shape> inputShapes2D = {
{1, 100},
{100, 1},
{10, 10},
{100, 10}
{100, 10},
{1024, 300},
{1024, 304},
{1024, 306}
};
const std::vector<int64_t> axis2D = {