diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/softmax_gpu_bf.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/softmax_gpu_bf.cl index 94421930ea3..9e65869700d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/softmax_gpu_bf.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/softmax_gpu_bf.cl @@ -4,43 +4,60 @@ #include "include/batch_headers/common.cl" +#if IS_DYNAMIC +#define CALC_POWER(n) ({uint pos = 0; uint i = n; do { i >>= 1; ++pos; } while (i); --pos;}) +#endif + +#if !IS_DYNAMIC __attribute__((reqd_work_group_size(LWS, 1, 1))) +#endif KERNEL (softmax_gpu_continuous_bfyx)( + OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output #if HAS_FUSED_OPS_DECLS , FUSED_OPS_DECLS #endif ) { - const uint data_set_idx = get_global_id(1); //in processing of which data set this WI participates? - const uint workers_per_data_set = LWS; //how many WI participates in processing of one data set - const uint in_data_set_idx = get_global_id(0); //this WI's id in group of items processing single data set - const uint data_set_size = DATA_SET_SIZE; //how many elements are in one data set - const uint data_sets_count = DATA_SETS_COUNT; //how many data sets are in the processing payload + const uint data_set_idx = get_global_id(1); // in processing of which data set this WI participates? + const uint workers_per_data_set = LWS; // how many WI participates in processing of one data set + const uint in_data_set_idx = get_global_id(0); // this WI's id in group of items processing single data set + const uint data_set_size = DATA_SET_SIZE; // how many elements are in one data set + const uint data_sets_count = DATA_SETS_COUNT; // how many data sets are in the processing payload +#if !IS_DYNAMIC + const uint items_num = ITEMS_NUM; // how many elements are processed per one WI + const uint leftovers = LEFTOVERS; +#else + // since workers_per_data_set is calculated by power of 2 + // items_num can be calculated by dividing data_set_size by power of 2 + const uint power = CALC_POWER(workers_per_data_set); + const uint items_num = data_set_size>>power; + const uint leftovers = data_set_size-(items_num<(params); KernelData kd = KernelData::Default(params); - kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { - const auto& prim_params = static_cast(params); - auto dispatchData = SetDefault(prim_params); - OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); - kd.kernels[0].params.workGroups.global = dispatchData.gws; - kd.kernels[0].params.workGroups.local = dispatchData.lws; - kd.internalBufferSizes.clear(); - kd.internalBufferSizes.push_back(prim_params.inputs[0].PhysicalSizeInBytes()); - kd.internalBufferDataType = prim_params.inputs[0].GetDType(); - }; - auto dispatchData = SetDefault(orgParams); auto cldnn_jit = GetJitConstants(orgParams, dispatchData); auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, params, options); auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - auto& kernel = kd.kernels[0]; - bool is_dynamic = orgParams.outputs[0].is_dynamic(); - - FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, + FillCLKernelData(kernel, + dispatchData, + params.engineInfo, + kernelName, + jit, + entry_point, EXE_MODE_DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params), 1, - is_dynamic); - - if (is_dynamic) { - auto& args = kernel.params.arguments; - args.clear(); - args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); - args.push_back({ArgumentDescriptor::Types::INPUT, 0}); - args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); - args.push_back({ArgumentDescriptor::Types::OUTPUT, 0}); - - kd.internalBufferSizes.clear(); - kd.internalBufferSizes.push_back(orgParams.inputs[0].PhysicalSizeInBytes()); - kd.internalBufferDataType = orgParams.inputs[0].GetDType(); - } + orgParams.outputs[0].is_dynamic()); return {kd}; } @@ -122,13 +92,21 @@ bool SoftmaxKernelBaseBF::Validate(const Params& p, const optional_params& o) co switch (params.dim) { case SoftmaxDim::X: - return input.Y().v == 1 && input.Z().v == 1 && input.Feature().v == 1; + return !input.Y().is_dynamic && input.Y().v == 1 && + !input.Z().is_dynamic && input.Z().v == 1 && + !input.Feature().is_dynamic && input.Feature().v == 1; case SoftmaxDim::Y: - return input.X().v == 1 && input.Z().v == 1 && (input.Feature().v == 1 || input.GetLayout() == DataLayout::bfyx); + return !input.X().is_dynamic && input.X().v == 1 && + !input.Z().is_dynamic && input.Z().v == 1 && + ((!input.Feature().is_dynamic && input.Feature().v == 1) || input.GetLayout() == DataLayout::bfyx); case SoftmaxDim::Z: - return input.X().v == 1 && input.Y().v == 1 && input.Feature().v == 1; + return !input.X().is_dynamic && input.X().v == 1 && + !input.Y().is_dynamic && input.Y().v == 1 && + !input.Feature().is_dynamic && input.Feature().v == 1; case SoftmaxDim::FEATURE: - return input.X().v == 1 && input.Y().v == 1 && input.Z().v == 1; + return !input.X().is_dynamic && input.X().v == 1 && + !input.Y().is_dynamic && input.Y().v == 1 && + !input.Z().is_dynamic && input.Z().v == 1; default: return false; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_base.h index eef245aab40..b523e05e294 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_base.h @@ -43,6 +43,7 @@ public: size_t leftovers; size_t dataSetsCount; size_t dataSetSize; + size_t maxSlmSize; size_t normIndex; // which dimension (from in-memory representation) is normalized, e.g. for bfyx and // softmax::normalize_f, it will be f's index == 2 (used only by naive kernel) }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp index c11624febfa..b2923ff82b6 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_bf.cpp @@ -23,39 +23,40 @@ ParamsKey SoftmaxKernel_bf::GetSupportedKey() const { k.EnableSoftmaxDim(SoftmaxDim::FEATURE); k.EnableDifferentTypes(); k.EnableBatching(); + k.EnableDynamicShapesSupport(); return k; } SoftmaxKernel_bf::Parent::DispatchData SoftmaxKernel_bf::SetDefault(const softmax_params& params) const { auto dispatchData = Parent::SetDefault(params); - // start with 1 thread per data set - dispatchData.gws[0] = 1; - dispatchData.gws[1] = dispatchData.dataSetsCount; - dispatchData.itemsNum = dispatchData.dataSetSize; - dispatchData.normIndex = 0; - // We have two units of data per work item in current implementation. auto local_mem_per_wi = 2 * BytesPerElement(params.inputs[0].GetDType()); // Combining device execution and local memory restrictions to compute maximum possible LWS. auto max_lws = std::min(params.engineInfo.maxWorkGroupSize, params.engineInfo.maxLocalMemSize / local_mem_per_wi); + dispatchData.maxSlmSize = max_lws; + if (!params.has_dynamic_tensors()) { + // start with 1 thread per data set + dispatchData.gws[0] = 1; + dispatchData.gws[1] = dispatchData.dataSetsCount; + dispatchData.itemsNum = dispatchData.dataSetSize; - dispatchData.lws[0] = 1; - // Compute maximum possible LWS that does not exceed device capabilities and optimizes number of global memory - // reads. - while ((dispatchData.itemsNum > 32 || dispatchData.lws[0] < dispatchData.itemsNum) && (2 * dispatchData.lws[0] <= max_lws)) { - dispatchData.lws[0] *= 2; - dispatchData.itemsNum /= 2; + dispatchData.lws[0] = 1; + // Compute maximum possible LWS that does not exceed device capabilities and optimizes number of global memory + // reads. + while ((dispatchData.itemsNum > 32 || dispatchData.lws[0] < dispatchData.itemsNum) && (2 * dispatchData.lws[0] <= max_lws)) { + dispatchData.lws[0] *= 2; + dispatchData.itemsNum /= 2; + } + + assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?"); + + dispatchData.gws[0] = dispatchData.lws[0]; + dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0]; + + assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0); } - - assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?"); - - dispatchData.gws[0] = dispatchData.lws[0]; - dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0]; - - assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0); - return dispatchData; } @@ -64,24 +65,71 @@ KernelsPriority SoftmaxKernel_bf::GetKernelsPriority(const Params& /*params*/, c } KernelsData SoftmaxKernel_bf::GetKernelsData(const Params& params, const optional_params& optionalParams) const { - return GetCommonKernelsData(params, optionalParams); + KernelsData kds = GetCommonKernelsData(params, optionalParams); + if (!kds.empty()) { + kds[0].update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + }; + } + + return kds; } JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const { auto jit = Parent::GetJitConstants(params, dispatchData); - jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + if (params.has_dynamic_tensors()) { + const auto& input = params.inputs[0]; + auto x = toCodeString(input.X(), 5); + auto y = toCodeString(input.Y(), 4); + auto z = toCodeString(input.Z(), 3); + auto w = toCodeString(input.W(), 2); + auto f = toCodeString(input.Feature(), 1); + auto b = toCodeString(input.Batch(), 0); + auto softmax_dim_y_bfyx = (params.dim == SoftmaxDim::Y && input.GetLayout() == DataLayout::bfyx); + const std::string flatten_bf = "(SOFTMAX_DIM_Y_BFYX&&(" + f + ">1))"; + const std::string lws_0 = "get_local_size(0)"; + const std::string data_set_count = "(FLATTEN_BF?" + toVectorMulString({f, b}) + ":" + b + ")"; + const std::string data_set_size = "(FLATTEN_BF?" + y + ":" + toVectorMulString({x, y, z, f}) + ")"; + // It can be expected that the maximum possible itemsNum will not exceed 32 + // Therefore, in dynamic shape, stack_size including additional buffer is set to 33 + constexpr size_t stack_size = 33; // The size of stack for my_chunk + jit.AddConstants({ + MakeJitConstant("SOFTMAX_DIM_Y_BFYX", softmax_dim_y_bfyx), + MakeJitConstant("FLATTEN_BF", flatten_bf), + MakeJitConstant("LWS", lws_0), + MakeJitConstant("SLM_SIZE", dispatchData.maxSlmSize), + MakeJitConstant("DATA_SETS_COUNT", data_set_count), + MakeJitConstant("DATA_SET_SIZE", data_set_size), + MakeJitConstant("STACK_SIZE", stack_size), + }); + } else { + jit.AddConstants({ + MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum), + MakeJitConstant("LWS", dispatchData.lws[0]), + MakeJitConstant("SLM_SIZE", dispatchData.lws[0]), + MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount), + MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize), + MakeJitConstant("LEFTOVERS", dispatchData.leftovers), + MakeJitConstant("STACK_SIZE", dispatchData.itemsNum + 1), + }); + } + auto activation_dt = GetActivationType(params); + jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION")); if (!params.fused_ops.empty()) { - auto input_dt = GetActivationType(params); FusedOpsConfiguration conf_main = {"_MAIN", {"data_set_offset", "in_data_set_idx + i * workers_per_data_set", "0", "0"}, "dequantized", - input_dt}; + activation_dt}; FusedOpsConfiguration conf_leftovers = {"_LEFTOVERS", {"data_set_offset", "workers_per_data_set * ITEMS_NUM + in_data_set_idx", "0", "0"}, "dequantized", - input_dt}; + activation_dt}; jit.Merge(MakeFusedOpsJitConstants(params, {conf_main, conf_leftovers})); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_fb.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_fb.cpp index 5d907eee85d..2d0f1eaa1d1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_fb.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_fb.cpp @@ -98,18 +98,26 @@ KernelsData SoftmaxKernel_fb::GetKernelsData(const Params& params, const optiona JitConstants SoftmaxKernel_fb::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const { auto jit = Parent::GetJitConstants(params, dispatchData); - jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.AddConstants({ + MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum), + MakeJitConstant("LWS", dispatchData.lws[0]), + MakeJitConstant("GWS", dispatchData.gws[0]), + MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount), + MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize), + MakeJitConstant("LEFTOVERS", dispatchData.leftovers), + }); + auto activation_dt = GetActivationType(params); + jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION")); if (!params.fused_ops.empty()) { - auto input_dt = GetActivationType(params); FusedOpsConfiguration conf_main = {"_MAIN", {"global_id", "LWS * i", "0", "0"}, "dequantized", - input_dt}; + activation_dt}; FusedOpsConfiguration conf_leftovers = {"_LEFTOVERS", {"global_id", "LWS * ITEMS_NUM", "0", "0"}, "dequantized", - input_dt}; + activation_dt}; jit.Merge(MakeFusedOpsJitConstants(params, {conf_main, conf_leftovers})); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp index 87d7e6869ea..d684f34875f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp @@ -77,8 +77,11 @@ KernelsPriority SoftmaxKerneItemsClassOptimized::GetKernelsPriority(const Params JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const { auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, dispatchData); - jit.AddConstant(MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes)); - jit.AddConstant(MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.supports_imad)); + jit.AddConstants({ + MakeJitConstant("LEFTOVERS", dispatchData.leftovers), + MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes), + MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.supports_imad), + }); return jit; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_ref.cpp index fca3d40250a..c3653555f31 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_ref.cpp @@ -45,7 +45,36 @@ KernelsPriority SoftmaxKernelRef::GetKernelsPriority(const Params& /*params*/, c } KernelsData SoftmaxKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); + KernelsData kds = GetCommonKernelsData(params, options); + if (!kds.empty()) { + const softmax_params& orgParams = static_cast(params); + bool is_dynamic = orgParams.outputs[0].is_dynamic(); + + kds[0].update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + kd.internalBufferSizes.clear(); + kd.internalBufferSizes.push_back(prim_params.inputs[0].PhysicalSizeInBytes()); + kd.internalBufferDataType = prim_params.inputs[0].GetDType(); + }; + + if (is_dynamic) { + auto& args = kds[0].kernels[0].params.arguments; + args.clear(); + args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); + args.push_back({ArgumentDescriptor::Types::INPUT, 0}); + args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + args.push_back({ArgumentDescriptor::Types::OUTPUT, 0}); + + kds[0].internalBufferSizes.clear(); + kds[0].internalBufferSizes.push_back(orgParams.inputs[0].PhysicalSizeInBytes()); + kds[0].internalBufferDataType = orgParams.inputs[0].GetDType(); + } + } + return kds; } JitConstants SoftmaxKernelRef::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const { diff --git a/src/plugins/intel_gpu/tests/test_cases/softmax_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/softmax_gpu_test.cpp index d76bd1ed1a8..35658f33ba3 100644 --- a/src/plugins/intel_gpu/tests/test_cases/softmax_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/softmax_gpu_test.cpp @@ -1122,3 +1122,82 @@ TEST_P(softmax_gpu_formats_test_f16, softmax_gpu_formats_test_f16_cached) { ASSERT_NO_FATAL_FAILURE(test(true)); } #endif + +TEST(softmax_gpu_bfyx_f32, bf_opt_normalize_f_dynamic) { + auto& engine = get_test_engine(); + + const int64_t x = 1, y = 1, f = 3, b = 2; + const int64_t buf_size = b*f*y*x; + auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), ov::Dimension::dynamic(), y, x}, + data_types::f32, format::bfyx}; + auto input_layout_static = layout{ov::PartialShape{b, f, y, x}, data_types::f32, format::bfyx}; + + auto input = engine.allocate_memory(input_layout_static); + topology topology; + topology.add(input_layout("input", input_layout_dynamic)); + topology.add(softmax("softmax", input_info("input"), 1)); + + vector input_vec = { + //y0x0 + /*b0f0*/0.1f, + /*b0f1*/0.2f, + /*b0f2*/0.2f, + /*b1f0*/3.f, + /*b1f1*/4.f, + /*b1f2*/0.2f, + }; + set_values(input, input_vec); + + float expected_max_values[2] = { + 0.344253346f, //b=0, y=0, x=0 + 0.719294981f //b=1, y=0, x=0 + }; + + ExecutionConfig config; + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + network network(engine, topology, config); + network.set_input_data("input", input); + + auto inst = network.get_primitive("softmax"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic()); + + auto outputs = network.execute(); + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "softmax"); + + auto output = outputs.at("softmax").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + float out_buffer[buf_size]; + for (uint32_t i = 0; i < buf_size; i++) { + out_buffer[i] = output_ptr[i]; + } + + float temp_max = 0; + float expected_sum = 1.0f; + int max_value_buffer_index = 0; + for (uint32_t i = 0; i < b; i++) { //this for loops will sum results in a batch per feature, we expect that: sum = 1.0f + for (uint32_t j = 0; j < y; j++) { + for (uint32_t k = 0; k < x; k++) { + float sum = 0.0f; + for (uint32_t l = 0; l < f; l++) { + int index = i * f * x * y + + l * x * y + + j * x + + k; + if (out_buffer[index] >= temp_max) { + temp_max = out_buffer[index]; + } + sum += out_buffer[index]; + } + ASSERT_TRUE(are_equal(temp_max, expected_max_values[max_value_buffer_index])); + temp_max = 0; + max_value_buffer_index++; + + ASSERT_TRUE(are_equal(sum, expected_sum)); + sum = 0.0f; + } + } + } +}