[GPU] Added shape agnostic optimized SoftMax kernel (#15834)

* [GPU] Added shape agnostic optimized SoftMax kernel

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Update SoftmaxKernelBaseBF::Validate policy for shape agnostic kernel

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Add softmax_gpu_bf shape agnostic TC for ov_gpu_unit_tests

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Fix failed TCs for ie-tests-linux-ubuntu20-gpu

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Update to use stack array instead of global buffer

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Remove global buffer usage completely

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Add #undef directive

Signed-off-by: Andrew Park <andrew.park@intel.com>

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park
2023-03-07 02:10:29 +09:00
committed by GitHub
parent 41fd836196
commit 7123e8879e
8 changed files with 266 additions and 100 deletions

View File

@@ -4,43 +4,60 @@
#include "include/batch_headers/common.cl"
#if IS_DYNAMIC
#define CALC_POWER(n) ({uint pos = 0; uint i = n; do { i >>= 1; ++pos; } while (i); --pos;})
#endif
#if !IS_DYNAMIC
__attribute__((reqd_work_group_size(LWS, 1, 1)))
#endif
KERNEL (softmax_gpu_continuous_bfyx)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
) {
const uint data_set_idx = get_global_id(1); //in processing of which data set this WI participates?
const uint workers_per_data_set = LWS; //how many WI participates in processing of one data set
const uint in_data_set_idx = get_global_id(0); //this WI's id in group of items processing single data set
const uint data_set_size = DATA_SET_SIZE; //how many elements are in one data set
const uint data_sets_count = DATA_SETS_COUNT; //how many data sets are in the processing payload
const uint data_set_idx = get_global_id(1); // in processing of which data set this WI participates?
const uint workers_per_data_set = LWS; // how many WI participates in processing of one data set
const uint in_data_set_idx = get_global_id(0); // this WI's id in group of items processing single data set
const uint data_set_size = DATA_SET_SIZE; // how many elements are in one data set
const uint data_sets_count = DATA_SETS_COUNT; // how many data sets are in the processing payload
#if !IS_DYNAMIC
const uint items_num = ITEMS_NUM; // how many elements are processed per one WI
const uint leftovers = LEFTOVERS;
#else
// since workers_per_data_set is calculated by power of 2
// items_num can be calculated by dividing data_set_size by power of 2
const uint power = CALC_POWER(workers_per_data_set);
const uint items_num = data_set_size>>power;
const uint leftovers = data_set_size-(items_num<<power);
#endif
const uint data_set_offset = data_set_idx * data_set_size;
const uint my_data_offset = data_set_offset + in_data_set_idx;
INPUT0_TYPE my_chunk[ITEMS_NUM + 1];
INPUT0_TYPE my_chunk[STACK_SIZE];
INPUT0_TYPE my_maximum = -UNIT_VAL_MAX;
INPUT0_TYPE my_sum = UNIT_VAL_ZERO;
INPUT0_TYPE tmp;
__local INPUT0_TYPE lg_storage[LWS];
__local INPUT0_TYPE lg_storage[SLM_SIZE];
//each WI reads ITEMS_NUM consecutive items from batch
for (uint i=0; i<ITEMS_NUM; ++i)
//each WI reads items_num consecutive items from batch
for (uint i=0; i<items_num; ++i)
{
tmp = input[my_data_offset + i * workers_per_data_set];
my_maximum = max(my_maximum, tmp);
my_chunk[i] = tmp;
}
if (in_data_set_idx < LEFTOVERS)
if (in_data_set_idx < leftovers)
{
tmp = input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx];
tmp = input[data_set_offset + workers_per_data_set * items_num + in_data_set_idx];
my_maximum = max(my_maximum, tmp);
my_chunk[ITEMS_NUM] = tmp;
my_chunk[items_num] = tmp;
}
lg_storage[in_data_set_idx] = my_maximum;
@@ -60,18 +77,18 @@ KERNEL (softmax_gpu_continuous_bfyx)(
barrier(CLK_LOCAL_MEM_FENCE);
for (uint i=0; i<ITEMS_NUM; ++i)
for (uint i=0; i<items_num; ++i)
{
tmp = native_exp(my_chunk[i] - my_maximum);
my_sum += tmp;
my_chunk[i] = tmp;
}
if (in_data_set_idx < LEFTOVERS)
if (in_data_set_idx < leftovers)
{
tmp = native_exp(my_chunk[ITEMS_NUM] - my_maximum);
tmp = native_exp(my_chunk[items_num] - my_maximum);
my_sum += tmp;
my_chunk[ITEMS_NUM] = tmp;
my_chunk[items_num] = tmp;
}
lg_storage[in_data_set_idx] = my_sum;
@@ -89,22 +106,25 @@ KERNEL (softmax_gpu_continuous_bfyx)(
my_sum = lg_storage[0];
#if HAS_FUSED_OPS
for (uint i=0; i<ITEMS_NUM; ++i)
for (uint i=0; i<items_num; ++i)
{
ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
FUSED_OPS_MAIN;
output[my_data_offset + i * workers_per_data_set] = FUSED_OPS_RESULT_MAIN;
}
if (in_data_set_idx < LEFTOVERS)
if (in_data_set_idx < leftovers)
{
ACTIVATION_TYPE dequantized = my_chunk[ITEMS_NUM] / my_sum;
ACTIVATION_TYPE dequantized = my_chunk[items_num] / my_sum;
FUSED_OPS_LEFTOVERS;
output[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx] = FUSED_OPS_RESULT_LEFTOVERS;
output[data_set_offset + workers_per_data_set * items_num + in_data_set_idx] = FUSED_OPS_RESULT_LEFTOVERS;
}
#else
for (uint i=0; i<ITEMS_NUM; ++i)
for (uint i=0; i<items_num; ++i)
output[my_data_offset + i * workers_per_data_set] = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
if (in_data_set_idx < LEFTOVERS)
output[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx] = ACTIVATION(my_chunk[ITEMS_NUM] / my_sum, ACTIVATION_PARAMS);
if (in_data_set_idx < leftovers)
output[data_set_offset + workers_per_data_set * items_num + in_data_set_idx] = ACTIVATION(my_chunk[items_num] / my_sum, ACTIVATION_PARAMS);
#endif
}
#ifdef CALC_POWER
#undef CALC_POWER
#endif

View File

@@ -7,20 +7,11 @@
namespace kernel_selector {
JitConstants SoftmaxKernelBase::GetJitConstants(const softmax_params& params,
SoftmaxKernelBase::DispatchData dispatchData) const {
JitConstants mem_consts = MakeBaseParamsJitConstants(params);
JitConstants jit = MakeBaseParamsJitConstants(params);
mem_consts.AddConstants({MakeJitConstant("ALONG_" + toString(params.dim), "")});
jit.AddConstants({MakeJitConstant("ALONG_" + toString(params.dim), "1")});
mem_consts.AddConstants({
MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum),
MakeJitConstant("LWS", dispatchData.lws[0]),
MakeJitConstant("GWS", dispatchData.gws[0]),
MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount),
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
});
return mem_consts;
return jit;
}
SoftmaxKernelBase::DispatchData SoftmaxKernelBase::SetDefault(const softmax_params&) const {
@@ -39,6 +30,7 @@ SoftmaxKernelBase::DispatchData SoftmaxKernelBase::SetDefault(const softmax_para
dispatchData.normIndex = 0;
dispatchData.dataSetsCount = 0;
dispatchData.dataSetSize = 0;
dispatchData.maxSlmSize = 0;
return dispatchData;
}
@@ -59,47 +51,25 @@ KernelsData SoftmaxKernelBase::GetCommonKernelsData(const Params& params, const
const softmax_params& orgParams = static_cast<const softmax_params&>(params);
KernelData kd = KernelData::Default<softmax_params>(params);
kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
const auto& prim_params = static_cast<const softmax_params&>(params);
auto dispatchData = SetDefault(prim_params);
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
kd.kernels[0].params.workGroups.global = dispatchData.gws;
kd.kernels[0].params.workGroups.local = dispatchData.lws;
kd.internalBufferSizes.clear();
kd.internalBufferSizes.push_back(prim_params.inputs[0].PhysicalSizeInBytes());
kd.internalBufferDataType = prim_params.inputs[0].GetDType();
};
auto dispatchData = SetDefault(orgParams);
auto cldnn_jit = GetJitConstants(orgParams, dispatchData);
auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, params, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = kd.kernels[0];
bool is_dynamic = orgParams.outputs[0].is_dynamic();
FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
FillCLKernelData(kernel,
dispatchData,
params.engineInfo,
kernelName,
jit,
entry_point,
EXE_MODE_DEFAULT,
false,
false,
1,
GetFusedPrimitiveInputsCount(params),
1,
is_dynamic);
if (is_dynamic) {
auto& args = kernel.params.arguments;
args.clear();
args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0});
args.push_back({ArgumentDescriptor::Types::INPUT, 0});
args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
args.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
kd.internalBufferSizes.clear();
kd.internalBufferSizes.push_back(orgParams.inputs[0].PhysicalSizeInBytes());
kd.internalBufferDataType = orgParams.inputs[0].GetDType();
}
orgParams.outputs[0].is_dynamic());
return {kd};
}
@@ -122,13 +92,21 @@ bool SoftmaxKernelBaseBF::Validate(const Params& p, const optional_params& o) co
switch (params.dim) {
case SoftmaxDim::X:
return input.Y().v == 1 && input.Z().v == 1 && input.Feature().v == 1;
return !input.Y().is_dynamic && input.Y().v == 1 &&
!input.Z().is_dynamic && input.Z().v == 1 &&
!input.Feature().is_dynamic && input.Feature().v == 1;
case SoftmaxDim::Y:
return input.X().v == 1 && input.Z().v == 1 && (input.Feature().v == 1 || input.GetLayout() == DataLayout::bfyx);
return !input.X().is_dynamic && input.X().v == 1 &&
!input.Z().is_dynamic && input.Z().v == 1 &&
((!input.Feature().is_dynamic && input.Feature().v == 1) || input.GetLayout() == DataLayout::bfyx);
case SoftmaxDim::Z:
return input.X().v == 1 && input.Y().v == 1 && input.Feature().v == 1;
return !input.X().is_dynamic && input.X().v == 1 &&
!input.Y().is_dynamic && input.Y().v == 1 &&
!input.Feature().is_dynamic && input.Feature().v == 1;
case SoftmaxDim::FEATURE:
return input.X().v == 1 && input.Y().v == 1 && input.Z().v == 1;
return !input.X().is_dynamic && input.X().v == 1 &&
!input.Y().is_dynamic && input.Y().v == 1 &&
!input.Z().is_dynamic && input.Z().v == 1;
default:
return false;
}

View File

@@ -43,6 +43,7 @@ public:
size_t leftovers;
size_t dataSetsCount;
size_t dataSetSize;
size_t maxSlmSize;
size_t normIndex; // which dimension (from in-memory representation) is normalized, e.g. for bfyx and
// softmax::normalize_f, it will be f's index == 2 (used only by naive kernel)
};

View File

@@ -23,39 +23,40 @@ ParamsKey SoftmaxKernel_bf::GetSupportedKey() const {
k.EnableSoftmaxDim(SoftmaxDim::FEATURE);
k.EnableDifferentTypes();
k.EnableBatching();
k.EnableDynamicShapesSupport();
return k;
}
SoftmaxKernel_bf::Parent::DispatchData SoftmaxKernel_bf::SetDefault(const softmax_params& params) const {
auto dispatchData = Parent::SetDefault(params);
// start with 1 thread per data set
dispatchData.gws[0] = 1;
dispatchData.gws[1] = dispatchData.dataSetsCount;
dispatchData.itemsNum = dispatchData.dataSetSize;
dispatchData.normIndex = 0;
// We have two units of data per work item in current implementation.
auto local_mem_per_wi = 2 * BytesPerElement(params.inputs[0].GetDType());
// Combining device execution and local memory restrictions to compute maximum possible LWS.
auto max_lws = std::min(params.engineInfo.maxWorkGroupSize, params.engineInfo.maxLocalMemSize / local_mem_per_wi);
dispatchData.maxSlmSize = max_lws;
if (!params.has_dynamic_tensors()) {
// start with 1 thread per data set
dispatchData.gws[0] = 1;
dispatchData.gws[1] = dispatchData.dataSetsCount;
dispatchData.itemsNum = dispatchData.dataSetSize;
dispatchData.lws[0] = 1;
// Compute maximum possible LWS that does not exceed device capabilities and optimizes number of global memory
// reads.
while ((dispatchData.itemsNum > 32 || dispatchData.lws[0] < dispatchData.itemsNum) && (2 * dispatchData.lws[0] <= max_lws)) {
dispatchData.lws[0] *= 2;
dispatchData.itemsNum /= 2;
dispatchData.lws[0] = 1;
// Compute maximum possible LWS that does not exceed device capabilities and optimizes number of global memory
// reads.
while ((dispatchData.itemsNum > 32 || dispatchData.lws[0] < dispatchData.itemsNum) && (2 * dispatchData.lws[0] <= max_lws)) {
dispatchData.lws[0] *= 2;
dispatchData.itemsNum /= 2;
}
assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?");
dispatchData.gws[0] = dispatchData.lws[0];
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0);
}
assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?");
dispatchData.gws[0] = dispatchData.lws[0];
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0);
return dispatchData;
}
@@ -64,24 +65,71 @@ KernelsPriority SoftmaxKernel_bf::GetKernelsPriority(const Params& /*params*/, c
}
KernelsData SoftmaxKernel_bf::GetKernelsData(const Params& params, const optional_params& optionalParams) const {
return GetCommonKernelsData(params, optionalParams);
KernelsData kds = GetCommonKernelsData(params, optionalParams);
if (!kds.empty()) {
kds[0].update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
const auto& prim_params = static_cast<const softmax_params&>(params);
auto dispatchData = SetDefault(prim_params);
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
kd.kernels[0].params.workGroups.global = dispatchData.gws;
kd.kernels[0].params.workGroups.local = dispatchData.lws;
};
}
return kds;
}
JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
auto jit = Parent::GetJitConstants(params, dispatchData);
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
if (params.has_dynamic_tensors()) {
const auto& input = params.inputs[0];
auto x = toCodeString(input.X(), 5);
auto y = toCodeString(input.Y(), 4);
auto z = toCodeString(input.Z(), 3);
auto w = toCodeString(input.W(), 2);
auto f = toCodeString(input.Feature(), 1);
auto b = toCodeString(input.Batch(), 0);
auto softmax_dim_y_bfyx = (params.dim == SoftmaxDim::Y && input.GetLayout() == DataLayout::bfyx);
const std::string flatten_bf = "(SOFTMAX_DIM_Y_BFYX&&(" + f + ">1))";
const std::string lws_0 = "get_local_size(0)";
const std::string data_set_count = "(FLATTEN_BF?" + toVectorMulString({f, b}) + ":" + b + ")";
const std::string data_set_size = "(FLATTEN_BF?" + y + ":" + toVectorMulString({x, y, z, f}) + ")";
// It can be expected that the maximum possible itemsNum will not exceed 32
// Therefore, in dynamic shape, stack_size including additional buffer is set to 33
constexpr size_t stack_size = 33; // The size of stack for my_chunk
jit.AddConstants({
MakeJitConstant("SOFTMAX_DIM_Y_BFYX", softmax_dim_y_bfyx),
MakeJitConstant("FLATTEN_BF", flatten_bf),
MakeJitConstant("LWS", lws_0),
MakeJitConstant("SLM_SIZE", dispatchData.maxSlmSize),
MakeJitConstant("DATA_SETS_COUNT", data_set_count),
MakeJitConstant("DATA_SET_SIZE", data_set_size),
MakeJitConstant("STACK_SIZE", stack_size),
});
} else {
jit.AddConstants({
MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum),
MakeJitConstant("LWS", dispatchData.lws[0]),
MakeJitConstant("SLM_SIZE", dispatchData.lws[0]),
MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount),
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
MakeJitConstant("STACK_SIZE", dispatchData.itemsNum + 1),
});
}
auto activation_dt = GetActivationType(params);
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_main = {"_MAIN",
{"data_set_offset", "in_data_set_idx + i * workers_per_data_set", "0", "0"},
"dequantized",
input_dt};
activation_dt};
FusedOpsConfiguration conf_leftovers = {"_LEFTOVERS",
{"data_set_offset", "workers_per_data_set * ITEMS_NUM + in_data_set_idx", "0", "0"},
"dequantized",
input_dt};
activation_dt};
jit.Merge(MakeFusedOpsJitConstants(params, {conf_main, conf_leftovers}));
}

View File

@@ -98,18 +98,26 @@ KernelsData SoftmaxKernel_fb::GetKernelsData(const Params& params, const optiona
JitConstants SoftmaxKernel_fb::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
auto jit = Parent::GetJitConstants(params, dispatchData);
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
jit.AddConstants({
MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum),
MakeJitConstant("LWS", dispatchData.lws[0]),
MakeJitConstant("GWS", dispatchData.gws[0]),
MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount),
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
});
auto activation_dt = GetActivationType(params);
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_main = {"_MAIN",
{"global_id", "LWS * i", "0", "0"},
"dequantized",
input_dt};
activation_dt};
FusedOpsConfiguration conf_leftovers = {"_LEFTOVERS",
{"global_id", "LWS * ITEMS_NUM", "0", "0"},
"dequantized",
input_dt};
activation_dt};
jit.Merge(MakeFusedOpsJitConstants(params, {conf_main, conf_leftovers}));
}

View File

@@ -77,8 +77,11 @@ KernelsPriority SoftmaxKerneItemsClassOptimized::GetKernelsPriority(const Params
JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, dispatchData);
jit.AddConstant(MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes));
jit.AddConstant(MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.supports_imad));
jit.AddConstants({
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes),
MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.supports_imad),
});
return jit;
}

View File

@@ -45,7 +45,36 @@ KernelsPriority SoftmaxKernelRef::GetKernelsPriority(const Params& /*params*/, c
}
KernelsData SoftmaxKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
return GetCommonKernelsData(params, options);
KernelsData kds = GetCommonKernelsData(params, options);
if (!kds.empty()) {
const softmax_params& orgParams = static_cast<const softmax_params&>(params);
bool is_dynamic = orgParams.outputs[0].is_dynamic();
kds[0].update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
const auto& prim_params = static_cast<const softmax_params&>(params);
auto dispatchData = SetDefault(prim_params);
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
kd.kernels[0].params.workGroups.global = dispatchData.gws;
kd.kernels[0].params.workGroups.local = dispatchData.lws;
kd.internalBufferSizes.clear();
kd.internalBufferSizes.push_back(prim_params.inputs[0].PhysicalSizeInBytes());
kd.internalBufferDataType = prim_params.inputs[0].GetDType();
};
if (is_dynamic) {
auto& args = kds[0].kernels[0].params.arguments;
args.clear();
args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0});
args.push_back({ArgumentDescriptor::Types::INPUT, 0});
args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
args.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
kds[0].internalBufferSizes.clear();
kds[0].internalBufferSizes.push_back(orgParams.inputs[0].PhysicalSizeInBytes());
kds[0].internalBufferDataType = orgParams.inputs[0].GetDType();
}
}
return kds;
}
JitConstants SoftmaxKernelRef::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {

View File

@@ -1122,3 +1122,82 @@ TEST_P(softmax_gpu_formats_test_f16, softmax_gpu_formats_test_f16_cached) {
ASSERT_NO_FATAL_FAILURE(test(true));
}
#endif
// Verifies the shape-agnostic (dynamic-shape) SoftMax GPU path: a network built
// with a dynamic input layout must pick a dynamic impl (impl->is_dynamic()) and
// still normalize correctly along the feature axis (softmax dim = 1) for a
// 2x3x1x1 bfyx f32 input bound at set_input_data time.
TEST(softmax_gpu_bfyx_f32, bf_opt_normalize_f_dynamic) {
auto& engine = get_test_engine();
const int64_t x = 1, y = 1, f = 3, b = 2;
const int64_t buf_size = b*f*y*x;
// Batch and feature dims are left dynamic; only the static layout below is
// used to allocate the actual input tensor.
auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), ov::Dimension::dynamic(), y, x},
data_types::f32, format::bfyx};
auto input_layout_static = layout{ov::PartialShape{b, f, y, x}, data_types::f32, format::bfyx};
auto input = engine.allocate_memory(input_layout_static);
topology topology;
topology.add(input_layout("input", input_layout_dynamic));
// dim = 1 -> normalize across the feature axis.
topology.add(softmax("softmax", input_info("input"), 1));
vector<float> input_vec = {
//y0x0
/*b0f0*/0.1f,
/*b0f1*/0.2f,
/*b0f2*/0.2f,
/*b1f0*/3.f,
/*b1f1*/4.f,
/*b1f2*/0.2f,
};
set_values(input, input_vec);
// Largest softmax output per batch, precomputed from input_vec above.
float expected_max_values[2] = {
0.344253346f, //b=0, y=0, x=0
0.719294981f //b=1, y=0, x=0
};
ExecutionConfig config;
// Required to enable the dynamic-shape (new shape infer) execution path.
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
network.set_input_data("input", input);
// The selected implementation must be the shape-agnostic (dynamic) one.
auto inst = network.get_primitive("softmax");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "softmax");
auto output = outputs.at("softmax").get_memory();
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
// Copy the device results into a host buffer for validation.
float out_buffer[buf_size];
for (uint32_t i = 0; i < buf_size; i++) {
out_buffer[i] = output_ptr[i];
}
float temp_max = 0;
float expected_sum = 1.0f;
int max_value_buffer_index = 0;
// For each (b, y, x) position, sum the outputs over the feature axis;
// softmax guarantees the sum is 1.0f and the per-batch maximum matches
// expected_max_values.
for (uint32_t i = 0; i < b; i++) {
for (uint32_t j = 0; j < y; j++) {
for (uint32_t k = 0; k < x; k++) {
float sum = 0.0f;
for (uint32_t l = 0; l < f; l++) {
// bfyx linear offset of element (i, l, j, k).
int index = i * f * x * y +
l * x * y +
j * x +
k;
if (out_buffer[index] >= temp_max) {
temp_max = out_buffer[index];
}
sum += out_buffer[index];
}
ASSERT_TRUE(are_equal(temp_max, expected_max_values[max_value_buffer_index]));
temp_max = 0;
max_value_buffer_index++;
ASSERT_TRUE(are_equal(sum, expected_sum));
sum = 0.0f;
}
}
}
}