[GPU] Applied w/a to resolve softmax accuracy issue (#16818)
* Applied w/a to resolve softmax accuracy issue. The original implementation produced incorrect results when the leftover is not aligned with the subgroup size (e.g., for shape [1024, 306], where lws = 32, itemsNum = 9, leftover = 18, subgroup size = 16). In such a case, the result was wrong when subgroup block read/write was used. As a workaround, subgroup block read/write is not used when the leftover is not aligned with the subgroup size. We can come up with better itemsNum sizing / leftover handling in follow-up work. * Fix build error & minor revise * Fix condition
This commit is contained in:
parent
4fbd094cba
commit
7513e9dee1
@ -21,12 +21,7 @@
|
||||
#define CALC_POWER(n) ({uint pos = 0; uint i = n; do { i >>= 1; ++pos; } while (i); --pos;})
|
||||
#endif
|
||||
|
||||
#define SUB_GROUP_SIZE 16
|
||||
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
#if !IS_DYNAMIC
|
||||
__attribute__((reqd_work_group_size(LWS, 1, 1)))
|
||||
#endif
|
||||
KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
OPTIONAL_SHAPE_INFO_ARG
|
||||
const __global INPUT0_TYPE* input,
|
||||
@ -61,24 +56,24 @@ KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
__local INPUT0_TYPE lg_storage[SLM_SIZE];
|
||||
|
||||
uint i=0;
|
||||
#if SUBGROUP_BLOCK_SIZE != 1
|
||||
if (workers_per_data_set > SUB_GROUP_SIZE)
|
||||
{
|
||||
for (; i<items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
|
||||
{
|
||||
INPUT0_TYPE tmp = input[data_set_offset + subgroup_offset + i * get_sub_group_size() + get_sub_group_local_id()];
|
||||
my_maximum = max(my_maximum, tmp);
|
||||
my_chunk[i] = tmp;
|
||||
BLOCK_TYPE vec_tmp = BLOCK_READ(input, data_set_offset + subgroup_offset + i * get_sub_group_size());
|
||||
#if SUBGROUP_BLOCK_SIZE == 1
|
||||
my_maximum = max(my_maximum, vec_tmp);
|
||||
my_chunk[i] = vec_tmp;
|
||||
#else
|
||||
for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
|
||||
{
|
||||
INPUT0_TYPE tmp = vec_tmp[j];
|
||||
my_maximum = max(my_maximum, tmp);
|
||||
my_chunk[i+j] = tmp;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; i<items_num; i++)
|
||||
{
|
||||
INPUT0_TYPE tmp = input[data_set_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()];
|
||||
@ -142,29 +137,25 @@ KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
|
||||
my_sum = lg_storage[0];
|
||||
|
||||
i=0;
|
||||
|
||||
i=0;
|
||||
#if HAS_FUSED_OPS
|
||||
#if SUBGROUP_BLOCK_SIZE != 1
|
||||
if (workers_per_data_set > SUB_GROUP_SIZE)
|
||||
{
|
||||
for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
|
||||
{
|
||||
BLOCK_TYPE vec_tmp;
|
||||
#if SUBGROUP_BLOCK_SIZE == 1
|
||||
ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
|
||||
FUSED_OPS_MAIN;
|
||||
vec_tmp = FUSED_OPS_RESULT_MAIN;
|
||||
#else
|
||||
for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
|
||||
{
|
||||
ACTIVATION_TYPE dequantized = my_chunk[i + j] / my_sum;
|
||||
FUSED_OPS_MAIN;
|
||||
vec_tmp[j] = FUSED_OPS_RESULT_MAIN;
|
||||
}
|
||||
#endif
|
||||
BLOCK_WRITE(output, data_set_offset + subgroup_offset + i * get_sub_group_size(), vec_tmp);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; i<items_num; i++)
|
||||
{
|
||||
ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
|
||||
@ -178,20 +169,18 @@ KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
output[data_set_offset + workers_per_data_set * items_num + in_data_set_idx] = FUSED_OPS_RESULT_LEFTOVERS;
|
||||
}
|
||||
#else
|
||||
#if SUBGROUP_BLOCK_SIZE != 1
|
||||
if (workers_per_data_set > SUB_GROUP_SIZE)
|
||||
{
|
||||
for (; i<items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
|
||||
{
|
||||
BLOCK_TYPE vec_tmp;
|
||||
#if SUBGROUP_BLOCK_SIZE == 1
|
||||
vec_tmp = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
|
||||
#else
|
||||
for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
|
||||
vec_tmp[j] = ACTIVATION(my_chunk[i + j] / my_sum, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
BLOCK_WRITE(output, data_set_offset + subgroup_offset + i * get_sub_group_size(), vec_tmp);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; i < items_num; i++)
|
||||
{
|
||||
output[data_set_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()] = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <algorithm>
|
||||
|
||||
namespace kernel_selector {
|
||||
static constexpr size_t subgroup_size = 16;
|
||||
ParamsKey SoftmaxKernel_bf::GetSupportedKey() const {
|
||||
ParamsKey k;
|
||||
k.EnableInputDataType(Datatype::F16);
|
||||
@ -52,19 +53,24 @@ SoftmaxKernel_bf::Parent::DispatchData SoftmaxKernel_bf::SetDefault(const softma
|
||||
dispatchData.itemsNum /= 2;
|
||||
}
|
||||
|
||||
if (dispatchData.itemsNum >> 3)
|
||||
dispatchData.subgroupBlockSize = 8;
|
||||
else if (dispatchData.itemsNum >> 2)
|
||||
dispatchData.subgroupBlockSize = 4;
|
||||
else if (dispatchData.itemsNum >> 1)
|
||||
dispatchData.subgroupBlockSize = 2;
|
||||
else
|
||||
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
|
||||
//if (dispatchData.leftovers % subgroup_size) {
|
||||
// To use subgroup read/write, the starting address should be aligned to 128 bit
|
||||
if ((dispatchData.dataSetSize * params.inputs[0].ElementSize()) >> 4) {
|
||||
dispatchData.subgroupBlockSize = 1;
|
||||
|
||||
} else {
|
||||
if (dispatchData.itemsNum >> 3)
|
||||
dispatchData.subgroupBlockSize = 8;
|
||||
else if (dispatchData.itemsNum >> 2)
|
||||
dispatchData.subgroupBlockSize = 4;
|
||||
else if (dispatchData.itemsNum >> 1)
|
||||
dispatchData.subgroupBlockSize = 2;
|
||||
else
|
||||
dispatchData.subgroupBlockSize = 1;
|
||||
}
|
||||
assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?");
|
||||
|
||||
dispatchData.gws[0] = dispatchData.lws[0];
|
||||
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
|
||||
|
||||
assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0);
|
||||
} else {
|
||||
@ -115,7 +121,6 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis
|
||||
MakeJitConstant("DATA_SETS_COUNT", data_set_count),
|
||||
MakeJitConstant("DATA_SET_SIZE", data_set_size),
|
||||
MakeJitConstant("STACK_SIZE", stack_size),
|
||||
MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize),
|
||||
});
|
||||
} else {
|
||||
jit.AddConstants({
|
||||
@ -126,9 +131,10 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis
|
||||
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
|
||||
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
|
||||
MakeJitConstant("STACK_SIZE", dispatchData.itemsNum + 1),
|
||||
MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize),
|
||||
});
|
||||
}
|
||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
|
||||
jit.AddConstant(MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize));
|
||||
auto activation_dt = GetActivationType(params);
|
||||
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
|
||||
|
||||
|
@ -20,7 +20,10 @@ const std::vector<ov::Shape> inputShapes2D = {
|
||||
{1, 100},
|
||||
{100, 1},
|
||||
{10, 10},
|
||||
{100, 10}
|
||||
{100, 10},
|
||||
{1024, 300},
|
||||
{1024, 304},
|
||||
{1024, 306}
|
||||
};
|
||||
|
||||
const std::vector<int64_t> axis2D = {
|
||||
|
Loading…
Reference in New Issue
Block a user