diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/softmax_gpu_bf.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/softmax_gpu_bf.cl
index 9e65869700d..56107020372 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/softmax_gpu_bf.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/softmax_gpu_bf.cl
@@ -3,11 +3,27 @@
 //
 
 #include "include/batch_headers/common.cl"
+#include "include/batch_headers/fetch_data.cl"
+#include "include/batch_headers/sub_group_block_read.cl"
+#include "include/batch_headers/sub_group_block_write.cl"
+
+#if SUBGROUP_BLOCK_SIZE == 1
+#define BLOCK_READ(ptr, offset) DT_INPUT_BLOCK_READ(ptr, offset)
+#define BLOCK_WRITE(ptr, offset, val) DT_OUTPUT_BLOCK_WRITE(ptr, offset, val)
+#define BLOCK_TYPE INPUT0_TYPE
+#else
+#define BLOCK_READ(ptr, offset) CAT(DT_INPUT_BLOCK_READ, SUBGROUP_BLOCK_SIZE)(ptr, offset)
+#define BLOCK_WRITE(ptr, offset, val) CAT(DT_OUTPUT_BLOCK_WRITE, SUBGROUP_BLOCK_SIZE)(ptr, offset, val)
+#define BLOCK_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, SUBGROUP_BLOCK_SIZE)
+#endif
 
 #if IS_DYNAMIC
 #define CALC_POWER(n) ({uint pos = 0; uint i = n; do { i >>= 1; ++pos; } while (i); --pos;})
 #endif
 
+#define SUB_GROUP_SIZE 16
+
+REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
 #if !IS_DYNAMIC
 __attribute__((reqd_work_group_size(LWS, 1, 1)))
 #endif
@@ -36,36 +52,54 @@ KERNEL (softmax_gpu_continuous_bfyx)(
 #endif
 
     const uint data_set_offset = data_set_idx * data_set_size;
-    const uint my_data_offset = data_set_offset + in_data_set_idx;
+    const uint subgroup_offset = get_sub_group_id() * get_sub_group_size() * items_num;
 
     INPUT0_TYPE my_chunk[STACK_SIZE];
     INPUT0_TYPE my_maximum = -UNIT_VAL_MAX;
     INPUT0_TYPE my_sum = UNIT_VAL_ZERO;
-    INPUT0_TYPE tmp;
-
+
     __local INPUT0_TYPE lg_storage[SLM_SIZE];
 
-    //each WI reads items_num consecutive items from batch
-    for (uint i=0; i<items_num; ++i)
-    {
-        tmp = input[my_data_offset + i * workers_per_data_set];
-        my_maximum = max(my_maximum, tmp);
-        my_chunk[i] = tmp;
-    }
+    uint i = 0;
+    //each sub-group reads a contiguous run of items from its data set
+    if (workers_per_data_set > SUB_GROUP_SIZE)
+    {
+        for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
+        {
+            BLOCK_TYPE vec_tmp = BLOCK_READ(input, data_set_offset + subgroup_offset + i * get_sub_group_size());
+#if SUBGROUP_BLOCK_SIZE == 1
+            my_maximum = max(my_maximum, vec_tmp);
+            my_chunk[i] = vec_tmp;
+#else
+            for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
+            {
+                INPUT0_TYPE tmp = vec_tmp[j];
+                my_maximum = max(my_maximum, tmp);
+                my_chunk[i + j] = tmp;
+            }
+#endif
+        }
+    }
+
+    for (; i < items_num; ++i)
+    {
+        INPUT0_TYPE tmp = input[data_set_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()];
+        my_maximum = max(my_maximum, tmp);
+        my_chunk[i] = tmp;
+    }
@@ ... @@ KERNEL (softmax_gpu_continuous_bfyx)(
 #if HAS_FUSED_OPS
-    for (uint i=0; i<items_num; ++i)
-    {
-        ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
-        FUSED_OPS_MAIN;
-        output[my_data_offset + i * workers_per_data_set] = FUSED_OPS_RESULT_MAIN;
-    }
+    i = 0;
+    if (workers_per_data_set > SUB_GROUP_SIZE)
+    {
+        for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
+        {
+            BLOCK_TYPE vec_tmp;
+#if SUBGROUP_BLOCK_SIZE == 1
+            ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
+            FUSED_OPS_MAIN;
+            vec_tmp = FUSED_OPS_RESULT_MAIN;
+#else
+            for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
+            {
+                ACTIVATION_TYPE dequantized = my_chunk[i + j] / my_sum;
+                FUSED_OPS_MAIN;
+                vec_tmp[j] = FUSED_OPS_RESULT_MAIN;
+            }
+#endif
+            BLOCK_WRITE(output, data_set_offset + subgroup_offset + i * get_sub_group_size(), vec_tmp);
+        }
+    }
+    for (; i < items_num; i++)
+    {
+        ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
+        FUSED_OPS_MAIN;
+        output[data_set_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()] = FUSED_OPS_RESULT_MAIN;
+    }
 #else
-    for (uint i=0; i<items_num; ++i)
-        output[my_data_offset + i * workers_per_data_set] = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
+    i = 0;
+    if (workers_per_data_set > SUB_GROUP_SIZE)
+    {
+        for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE); i+=SUBGROUP_BLOCK_SIZE)
+        {
+            BLOCK_TYPE vec_tmp;
+#if SUBGROUP_BLOCK_SIZE == 1
+            vec_tmp = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
+#else
+            for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
+                vec_tmp[j] = ACTIVATION(my_chunk[i + j] / my_sum, ACTIVATION_PARAMS);
+#endif
+            BLOCK_WRITE(output, data_set_offset + subgroup_offset + i * get_sub_group_size(), vec_tmp);
+        }
+    }
+    for (; i < items_num; i++)
+        output[data_set_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()] = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
 #endif
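Note on the kernel change above (not part of the patch): the old code had each work-item gather items_num elements with stride workers_per_data_set; the new code assigns every 16-wide sub-group a contiguous run of 16 * items_num elements and pulls them in with SUBGROUP_BLOCK_SIZE-wide block reads, leaving a scalar loop for the items_num % SUBGROUP_BLOCK_SIZE tail. A minimal sketch of the new index arithmetic, in plain C++ rather than OpenCL, with toy values for items_num, the block size, and the sub-group id (all illustrative assumptions):

    // Emulates the addresses produced by BLOCK_READ and the scalar tail above.
    #include <cstdio>

    int main() {
        const unsigned sub_group_size  = 16; // SUB_GROUP_SIZE in the kernel
        const unsigned items_num       = 6;  // items per work-item (toy value)
        const unsigned block           = 4;  // SUBGROUP_BLOCK_SIZE (toy value)
        const unsigned sub_group_id    = 2;  // example sub-group in the work-group
        const unsigned data_set_offset = 0;

        // subgroup_offset = get_sub_group_id() * get_sub_group_size() * items_num
        const unsigned subgroup_offset = sub_group_id * sub_group_size * items_num;

        unsigned i = 0;
        // Vectorized part: one block read covers block * 16 consecutive elements.
        for (; i < items_num - (items_num % block); i += block)
            printf("block read covers [%3u, %3u)\n",
                   data_set_offset + subgroup_offset + i * sub_group_size,
                   data_set_offset + subgroup_offset + (i + block) * sub_group_size);

        // Scalar tail: lane `lid` of the sub-group reads one element per iteration.
        for (; i < items_num; ++i)
            for (unsigned lid = 0; lid < sub_group_size; ++lid)
                printf("scalar read lane %2u -> %3u\n",
                       lid, data_set_offset + subgroup_offset + lid + i * sub_group_size);
        return 0;
    }

Either way, the 16 lanes of a sub-group touch 16 consecutive addresses per access, which is what makes the block read/write forms usable here and keeps the loads coalesced.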
Lws too small?"); dispatchData.gws[0] = dispatchData.lws[0]; dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0]; assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0); + } else { + dispatchData.subgroupBlockSize = 1; } return dispatchData; } @@ -106,6 +118,7 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis MakeJitConstant("DATA_SETS_COUNT", data_set_count), MakeJitConstant("DATA_SET_SIZE", data_set_size), MakeJitConstant("STACK_SIZE", stack_size), + MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize), }); } else { jit.AddConstants({ @@ -116,6 +129,7 @@ JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, Dis MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize), MakeJitConstant("LEFTOVERS", dispatchData.leftovers), MakeJitConstant("STACK_SIZE", dispatchData.itemsNum + 1), + MakeJitConstant("SUBGROUP_BLOCK_SIZE", dispatchData.subgroupBlockSize), }); } auto activation_dt = GetActivationType(params); diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/softmax.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/softmax.cpp index e83327d17d5..a135f1a75b1 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/softmax.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/softmax.cpp @@ -84,6 +84,18 @@ INSTANTIATE_TEST_SUITE_P( testing::Values(ov::AnyMap())), SoftMax8LayerTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P( + smoke_SoftMaxStableDiffusion, + SoftMax8LayerTest, + testing::Combine(testing::ValuesIn(netPrecisions), + ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::undefined), + ::testing::ValuesIn(ov::test::static_shapes_to_test_representation({{16, 4096, 4096}})), + testing::Values(-1), + testing::Values(CommonTestUtils::DEVICE_GPU), + testing::Values(ov::AnyMap())), + SoftMax8LayerTest::getTestCaseName); + const std::vector inputShapes5D = { {1, 100, 1, 1, 1}, {1, 3, 4, 3, 4}, diff --git a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp index db7818d5c21..9b197162aa0 100644 --- a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp +++ b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp @@ -812,6 +812,21 @@ ov::runtime::Tensor generate(const std::shared_ptr& node, return Activation::generate(elemType, targetShape, InputGenerateData(-10, 20, 4)); } +ov::runtime::Tensor generate(const std::shared_ptr& node, + size_t port, + const ov::element::Type& elemType, + const ov::Shape& targetShape) { + auto axis = node->get_axis(); + axis = axis < 0 ? targetShape.size() + axis : axis; + unsigned datasetSize = std::accumulate(targetShape.begin() + axis, targetShape.end(), 1, + [](std::size_t a, size_t b) { return a * b; }); + // Generate small negative values for datasets which exceed 2048 size + // to avoid NaN values in Softmax results for fp16 precision + if (datasetSize >= 2048 && static_cast(elemType) == ov::element::Type_t::f16) + return ov::test::utils::create_and_fill_tensor_normal_distribution(elemType, targetShape, -5.f, 0.5f, 7235346); + return generate(std::dynamic_pointer_cast(node), port, elemType, targetShape); +} + template ov::runtime::Tensor generateInput(const std::shared_ptr& node, size_t port,