[GPU] Applied w/a to resolve softmax accuracy issue (#16818)
* Applied w/a to resolve softmax accuracy issue. The original implementation produced incorrect results when the leftover is not aligned with the subgroup size (e.g., for shape [1024, 306], where lws = 32, itemsNum = 9, leftover = 18, subgroup size = 16). In such a case, the result was wrong when subgroup block read/write was used. As a workaround, subgroup block read/write is not used when the leftover is not aligned with the subgroup size. We can come up with better itemsNum sizing / leftover handling in follow-up work. * Fix build error & minor revise * Fix condition
This commit is contained in:
parent
4fbd094cba
commit
7513e9dee1
@ -21,12 +21,7 @@
|
||||
#define CALC_POWER(n) ({uint pos = 0; uint i = n; do { i >>= 1; ++pos; } while (i); --pos;})
|
||||
#endif
|
||||
|
||||
#define SUB_GROUP_SIZE 16
|
||||
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
#if !IS_DYNAMIC
|
||||
__attribute__((reqd_work_group_size(LWS, 1, 1)))
|
||||
#endif
|
||||
KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
OPTIONAL_SHAPE_INFO_ARG
|
||||
const __global INPUT0_TYPE* input,
|
||||
@ -61,24 +56,24 @@ KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
__local INPUT0_TYPE lg_storage[SLM_SIZE];
|
||||
|
||||
uint i=0;
|
||||
#if SUBGROUP_BLOCK_SIZE != 1
|
||||
if (workers_per_data_set > SUB_GROUP_SIZE)
|
||||
{
|
||||
for (; i<items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
|
||||
{
|
||||
INPUT0_TYPE tmp = input[data_set_offset + subgroup_offset + i * get_sub_group_size() + get_sub_group_local_id()];
|
||||
my_maximum = max(my_maximum, tmp);
|
||||
my_chunk[i] = tmp;
|
||||
BLOCK_TYPE vec_tmp = BLOCK_READ(input, data_set_offset + subgroup_offset + i * get_sub_group_size());
|
||||
#if SUBGROUP_BLOCK_SIZE == 1
|
||||
my_maximum = max(my_maximum, vec_tmp);
|
||||
my_chunk[i] = vec_tmp;
|
||||
#else
|
||||
for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
|
||||
{
|
||||
INPUT0_TYPE tmp = vec_tmp[j];
|
||||
my_maximum = max(my_maximum, tmp);
|
||||
my_chunk[i+j] = tmp;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; i<items_num; i++)
|
||||
{
|
||||
INPUT0_TYPE tmp = input[data_set_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()];
|
||||
@ -142,29 +137,25 @@ KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
|
||||
my_sum = lg_storage[0];
|
||||
|
||||
i=0;
|
||||
|
||||
i=0;
|
||||
#if HAS_FUSED_OPS
|
||||
#if SUBGROUP_BLOCK_SIZE != 1
|
||||
if (workers_per_data_set > SUB_GROUP_SIZE)
|
||||
{
|
||||
for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
|
||||
{
|
||||
BLOCK_TYPE vec_tmp;
|
||||
#if SUBGROUP_BLOCK_SIZE == 1
|
||||
ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
|
||||
FUSED_OPS_MAIN;
|
||||
vec_tmp = FUSED_OPS_RESULT_MAIN;
|
||||
#else
|
||||
for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
|
||||
{
|
||||
ACTIVATION_TYPE dequantized = my_chunk[i + j] / my_sum;
|
||||
FUSED_OPS_MAIN;
|
||||
vec_tmp[j] = FUSED_OPS_RESULT_MAIN;
|
||||
}
|
||||
#endif
|
||||
BLOCK_WRITE(output, data_set_offset + subgroup_offset + i * get_sub_group_size(), vec_tmp);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; i<items_num; i++)
|
||||
{
|
||||
ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
|
||||
@ -178,20 +169,18 @@ KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
output[data_set_offset + workers_per_data_set * items_num + in_data_set_idx] = FUSED_OPS_RESULT_LEFTOVERS;
|
||||
}
|
||||
#else
|
||||
#if SUBGROUP_BLOCK_SIZE != 1
|
||||
if (workers_per_data_set > SUB_GROUP_SIZE)
|
||||
{
|
||||
for (; i<items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
|
||||
{
|
||||
BLOCK_TYPE vec_tmp;
|
||||
#if SUBGROUP_BLOCK_SIZE == 1
|
||||
vec_tmp = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
|
||||
#else
|
||||
for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
|
||||
vec_tmp[j] = ACTIVATION(my_chunk[i + j] / my_sum, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
BLOCK_WRITE(output, data_set_offset + subgroup_offset + i * get_sub_group_size(), vec_tmp);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; i < items_num; i++)
|
||||
{
|
||||
output[data_set_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()] = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <algorithm>
|
||||
|
||||
namespace kernel_selector {
|
||||
static constexpr size_t subgroup_size = 16;
|
||||
ParamsKey SoftmaxKernel_bf::GetSupportedKey() const {
|
||||
ParamsKey k;
|
||||
k.EnableInputDataType(Datatype::F16);
|
||||
@ -52,19 +53,24 @@ SoftmaxKernel_bf::Parent::DispatchData SoftmaxKernel_bf::SetDefault(const softma
|
||||
dispatchData.itemsNum /= 2;
|
||||
}
|
||||
|
||||
if (dispatchData.itemsNum >> 3)
|
||||
dispatchData.subgroupBlockSize = 8;
|
||||
else if (dispatchData.itemsNum >> 2)
|
||||
dispatchData.subgroupBlockSize = 4;
|
||||
else if (dispatchData.itemsNum >> 1)
|
||||
dispatchData.subgroupBlockSize = 2;
|
||||
else
|
||||
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
|
||||
//if (dispatchData.leftovers % subgroup_size) {
|
||||
// To use subgroup read/write, the starting address should be aligned to 128 bit
|
||||
if ((dispatchData.dataSetSize * params.inputs[0].ElementSize()) >> 4) {
|
||||
dispatchData.subgroupBlockSize = 1;
|
||||
|
||||
} else {
|
||||
if (dispatchData.itemsNum >> 3)
|
||||
dispatchData.subgroupBlockSize = 8;
|
||||
else if (dispatchData.itemsNum >> 2)
|
||||
dispatchData.subgroupBlockSize = 4;
|
||||
else if (dispatchData.itemsNum >> 1)
|
||||
dispatchData.subgroupBlockSize = 2;
|
||||
else
|
||||
dispatchData.subgroupBlockSize = 1;
|
||||
}
|
||||
assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?");
|
||||
|
||||
dispatchData.gws[0] = dispatchData.lws[0];
|
||||
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
|
||||
|
||||
assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0);
|
||||
} else {
|
||||
@ -115,7 +121,6 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis
|
||||
MakeJitConstant("DATA_SETS_COUNT", data_set_count),
|
||||
MakeJitConstant("DATA_SET_SIZE", data_set_size),
|
||||
MakeJitConstant("STACK_SIZE", stack_size),
|
||||
MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize),
|
||||
});
|
||||
} else {
|
||||
jit.AddConstants({
|
||||
@ -126,9 +131,10 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis
|
||||
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
|
||||
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
|
||||
MakeJitConstant("STACK_SIZE", dispatchData.itemsNum + 1),
|
||||
MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize),
|
||||
});
|
||||
}
|
||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
|
||||
jit.AddConstant(MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize));
|
||||
auto activation_dt = GetActivationType(params);
|
||||
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
|
||||
|
||||
|
@ -20,7 +20,10 @@ const std::vector<ov::Shape> inputShapes2D = {
|
||||
{1, 100},
|
||||
{100, 1},
|
||||
{10, 10},
|
||||
{100, 10}
|
||||
{100, 10},
|
||||
{1024, 300},
|
||||
{1024, 304},
|
||||
{1024, 306}
|
||||
};
|
||||
|
||||
const std::vector<int64_t> axis2D = {
|
||||
|
Loading…
Reference in New Issue
Block a user