[GPU] Added shape agnostic optimized SoftMax kernel (#15834)
* [GPU] Added shape agnostic optimized SoftMax kernel Signed-off-by: Andrew Park <andrew.park@intel.com> * Update SoftmaxKernelBaseBF::Validate policy for shape agnostic kernel Signed-off-by: Andrew Park <andrew.park@intel.com> * Add softmax_gpu_bf shape agnostic TC for ov_gpu_unit_tests Signed-off-by: Andrew Park <andrew.park@intel.com> * Fix failed TCs for ie-tests-linux-ubuntu20-gpu Signed-off-by: Andrew Park <andrew.park@intel.com> * Update to use stack array instead of global buffer Signed-off-by: Andrew Park <andrew.park@intel.com> * Remove global buffer usage completely Signed-off-by: Andrew Park <andrew.park@intel.com> * Add #undef directive Signed-off-by: Andrew Park <andrew.park@intel.com> --------- Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
committed by
GitHub
parent
41fd836196
commit
7123e8879e
@@ -4,43 +4,60 @@
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
|
||||
#if IS_DYNAMIC
|
||||
#define CALC_POWER(n) ({uint pos = 0; uint i = n; do { i >>= 1; ++pos; } while (i); --pos;})
|
||||
#endif
|
||||
|
||||
#if !IS_DYNAMIC
|
||||
__attribute__((reqd_work_group_size(LWS, 1, 1)))
|
||||
#endif
|
||||
KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
OPTIONAL_SHAPE_INFO_ARG
|
||||
const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
, FUSED_OPS_DECLS
|
||||
#endif
|
||||
) {
|
||||
const uint data_set_idx = get_global_id(1); //in processing of which data set this WI participates?
|
||||
const uint workers_per_data_set = LWS; //how many WI participates in processing of one data set
|
||||
const uint in_data_set_idx = get_global_id(0); //this WI's id in group of items processing single data set
|
||||
const uint data_set_size = DATA_SET_SIZE; //how many elements are in one data set
|
||||
const uint data_sets_count = DATA_SETS_COUNT; //how many data sets are in the processing payload
|
||||
const uint data_set_idx = get_global_id(1); // in processing of which data set this WI participates?
|
||||
const uint workers_per_data_set = LWS; // how many WI participates in processing of one data set
|
||||
const uint in_data_set_idx = get_global_id(0); // this WI's id in group of items processing single data set
|
||||
const uint data_set_size = DATA_SET_SIZE; // how many elements are in one data set
|
||||
const uint data_sets_count = DATA_SETS_COUNT; // how many data sets are in the processing payload
|
||||
#if !IS_DYNAMIC
|
||||
const uint items_num = ITEMS_NUM; // how many elements are processed per one WI
|
||||
const uint leftovers = LEFTOVERS;
|
||||
#else
|
||||
// since workers_per_data_set is calculated by power of 2
|
||||
// items_num can be calculated by dividing data_set_size by power of 2
|
||||
const uint power = CALC_POWER(workers_per_data_set);
|
||||
const uint items_num = data_set_size>>power;
|
||||
const uint leftovers = data_set_size-(items_num<<power);
|
||||
#endif
|
||||
|
||||
const uint data_set_offset = data_set_idx * data_set_size;
|
||||
const uint my_data_offset = data_set_offset + in_data_set_idx;
|
||||
|
||||
INPUT0_TYPE my_chunk[ITEMS_NUM + 1];
|
||||
INPUT0_TYPE my_chunk[STACK_SIZE];
|
||||
INPUT0_TYPE my_maximum = -UNIT_VAL_MAX;
|
||||
INPUT0_TYPE my_sum = UNIT_VAL_ZERO;
|
||||
INPUT0_TYPE tmp;
|
||||
|
||||
__local INPUT0_TYPE lg_storage[LWS];
|
||||
__local INPUT0_TYPE lg_storage[SLM_SIZE];
|
||||
|
||||
//each WI reads ITEMS_NUM consecutive items from batch
|
||||
for (uint i=0; i<ITEMS_NUM; ++i)
|
||||
//each WI reads items_num consecutive items from batch
|
||||
for (uint i=0; i<items_num; ++i)
|
||||
{
|
||||
tmp = input[my_data_offset + i * workers_per_data_set];
|
||||
my_maximum = max(my_maximum, tmp);
|
||||
my_chunk[i] = tmp;
|
||||
}
|
||||
|
||||
if (in_data_set_idx < LEFTOVERS)
|
||||
if (in_data_set_idx < leftovers)
|
||||
{
|
||||
tmp = input[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx];
|
||||
tmp = input[data_set_offset + workers_per_data_set * items_num + in_data_set_idx];
|
||||
my_maximum = max(my_maximum, tmp);
|
||||
my_chunk[ITEMS_NUM] = tmp;
|
||||
my_chunk[items_num] = tmp;
|
||||
}
|
||||
|
||||
lg_storage[in_data_set_idx] = my_maximum;
|
||||
@@ -60,18 +77,18 @@ KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (uint i=0; i<ITEMS_NUM; ++i)
|
||||
for (uint i=0; i<items_num; ++i)
|
||||
{
|
||||
tmp = native_exp(my_chunk[i] - my_maximum);
|
||||
my_sum += tmp;
|
||||
my_chunk[i] = tmp;
|
||||
}
|
||||
|
||||
if (in_data_set_idx < LEFTOVERS)
|
||||
if (in_data_set_idx < leftovers)
|
||||
{
|
||||
tmp = native_exp(my_chunk[ITEMS_NUM] - my_maximum);
|
||||
tmp = native_exp(my_chunk[items_num] - my_maximum);
|
||||
my_sum += tmp;
|
||||
my_chunk[ITEMS_NUM] = tmp;
|
||||
my_chunk[items_num] = tmp;
|
||||
}
|
||||
|
||||
lg_storage[in_data_set_idx] = my_sum;
|
||||
@@ -89,22 +106,25 @@ KERNEL (softmax_gpu_continuous_bfyx)(
|
||||
my_sum = lg_storage[0];
|
||||
|
||||
#if HAS_FUSED_OPS
|
||||
for (uint i=0; i<ITEMS_NUM; ++i)
|
||||
for (uint i=0; i<items_num; ++i)
|
||||
{
|
||||
ACTIVATION_TYPE dequantized = my_chunk[i] / my_sum;
|
||||
FUSED_OPS_MAIN;
|
||||
output[my_data_offset + i * workers_per_data_set] = FUSED_OPS_RESULT_MAIN;
|
||||
}
|
||||
if (in_data_set_idx < LEFTOVERS)
|
||||
if (in_data_set_idx < leftovers)
|
||||
{
|
||||
ACTIVATION_TYPE dequantized = my_chunk[ITEMS_NUM] / my_sum;
|
||||
ACTIVATION_TYPE dequantized = my_chunk[items_num] / my_sum;
|
||||
FUSED_OPS_LEFTOVERS;
|
||||
output[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx] = FUSED_OPS_RESULT_LEFTOVERS;
|
||||
output[data_set_offset + workers_per_data_set * items_num + in_data_set_idx] = FUSED_OPS_RESULT_LEFTOVERS;
|
||||
}
|
||||
#else
|
||||
for (uint i=0; i<ITEMS_NUM; ++i)
|
||||
for (uint i=0; i<items_num; ++i)
|
||||
output[my_data_offset + i * workers_per_data_set] = ACTIVATION(my_chunk[i] / my_sum, ACTIVATION_PARAMS);
|
||||
if (in_data_set_idx < LEFTOVERS)
|
||||
output[data_set_offset + workers_per_data_set * ITEMS_NUM + in_data_set_idx] = ACTIVATION(my_chunk[ITEMS_NUM] / my_sum, ACTIVATION_PARAMS);
|
||||
if (in_data_set_idx < leftovers)
|
||||
output[data_set_offset + workers_per_data_set * items_num + in_data_set_idx] = ACTIVATION(my_chunk[items_num] / my_sum, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
}
|
||||
#ifdef CALC_POWER
|
||||
#undef CALC_POWER
|
||||
#endif
|
||||
|
||||
@@ -7,20 +7,11 @@
|
||||
namespace kernel_selector {
|
||||
JitConstants SoftmaxKernelBase::GetJitConstants(const softmax_params& params,
|
||||
SoftmaxKernelBase::DispatchData dispatchData) const {
|
||||
JitConstants mem_consts = MakeBaseParamsJitConstants(params);
|
||||
JitConstants jit = MakeBaseParamsJitConstants(params);
|
||||
|
||||
mem_consts.AddConstants({MakeJitConstant("ALONG_" + toString(params.dim), "")});
|
||||
jit.AddConstants({MakeJitConstant("ALONG_" + toString(params.dim), "1")});
|
||||
|
||||
mem_consts.AddConstants({
|
||||
MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum),
|
||||
MakeJitConstant("LWS", dispatchData.lws[0]),
|
||||
MakeJitConstant("GWS", dispatchData.gws[0]),
|
||||
MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount),
|
||||
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
|
||||
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
|
||||
});
|
||||
|
||||
return mem_consts;
|
||||
return jit;
|
||||
}
|
||||
|
||||
SoftmaxKernelBase::DispatchData SoftmaxKernelBase::SetDefault(const softmax_params&) const {
|
||||
@@ -39,6 +30,7 @@ SoftmaxKernelBase::DispatchData SoftmaxKernelBase::SetDefault(const softmax_para
|
||||
dispatchData.normIndex = 0;
|
||||
dispatchData.dataSetsCount = 0;
|
||||
dispatchData.dataSetSize = 0;
|
||||
dispatchData.maxSlmSize = 0;
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
@@ -59,47 +51,25 @@ KernelsData SoftmaxKernelBase::GetCommonKernelsData(const Params& params, const
|
||||
const softmax_params& orgParams = static_cast<const softmax_params&>(params);
|
||||
KernelData kd = KernelData::Default<softmax_params>(params);
|
||||
|
||||
kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
|
||||
const auto& prim_params = static_cast<const softmax_params&>(params);
|
||||
auto dispatchData = SetDefault(prim_params);
|
||||
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
|
||||
kd.kernels[0].params.workGroups.global = dispatchData.gws;
|
||||
kd.kernels[0].params.workGroups.local = dispatchData.lws;
|
||||
kd.internalBufferSizes.clear();
|
||||
kd.internalBufferSizes.push_back(prim_params.inputs[0].PhysicalSizeInBytes());
|
||||
kd.internalBufferDataType = prim_params.inputs[0].GetDType();
|
||||
};
|
||||
|
||||
auto dispatchData = SetDefault(orgParams);
|
||||
auto cldnn_jit = GetJitConstants(orgParams, dispatchData);
|
||||
auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, params, options);
|
||||
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
|
||||
|
||||
|
||||
auto& kernel = kd.kernels[0];
|
||||
bool is_dynamic = orgParams.outputs[0].is_dynamic();
|
||||
|
||||
FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
|
||||
FillCLKernelData(kernel,
|
||||
dispatchData,
|
||||
params.engineInfo,
|
||||
kernelName,
|
||||
jit,
|
||||
entry_point,
|
||||
EXE_MODE_DEFAULT,
|
||||
false,
|
||||
false,
|
||||
1,
|
||||
GetFusedPrimitiveInputsCount(params),
|
||||
1,
|
||||
is_dynamic);
|
||||
|
||||
if (is_dynamic) {
|
||||
auto& args = kernel.params.arguments;
|
||||
args.clear();
|
||||
args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0});
|
||||
args.push_back({ArgumentDescriptor::Types::INPUT, 0});
|
||||
args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
|
||||
args.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
|
||||
|
||||
kd.internalBufferSizes.clear();
|
||||
kd.internalBufferSizes.push_back(orgParams.inputs[0].PhysicalSizeInBytes());
|
||||
kd.internalBufferDataType = orgParams.inputs[0].GetDType();
|
||||
}
|
||||
orgParams.outputs[0].is_dynamic());
|
||||
|
||||
return {kd};
|
||||
}
|
||||
@@ -122,13 +92,21 @@ bool SoftmaxKernelBaseBF::Validate(const Params& p, const optional_params& o) co
|
||||
|
||||
switch (params.dim) {
|
||||
case SoftmaxDim::X:
|
||||
return input.Y().v == 1 && input.Z().v == 1 && input.Feature().v == 1;
|
||||
return !input.Y().is_dynamic && input.Y().v == 1 &&
|
||||
!input.Z().is_dynamic && input.Z().v == 1 &&
|
||||
!input.Feature().is_dynamic && input.Feature().v == 1;
|
||||
case SoftmaxDim::Y:
|
||||
return input.X().v == 1 && input.Z().v == 1 && (input.Feature().v == 1 || input.GetLayout() == DataLayout::bfyx);
|
||||
return !input.X().is_dynamic && input.X().v == 1 &&
|
||||
!input.Z().is_dynamic && input.Z().v == 1 &&
|
||||
((!input.Feature().is_dynamic && input.Feature().v == 1) || input.GetLayout() == DataLayout::bfyx);
|
||||
case SoftmaxDim::Z:
|
||||
return input.X().v == 1 && input.Y().v == 1 && input.Feature().v == 1;
|
||||
return !input.X().is_dynamic && input.X().v == 1 &&
|
||||
!input.Y().is_dynamic && input.Y().v == 1 &&
|
||||
!input.Feature().is_dynamic && input.Feature().v == 1;
|
||||
case SoftmaxDim::FEATURE:
|
||||
return input.X().v == 1 && input.Y().v == 1 && input.Z().v == 1;
|
||||
return !input.X().is_dynamic && input.X().v == 1 &&
|
||||
!input.Y().is_dynamic && input.Y().v == 1 &&
|
||||
!input.Z().is_dynamic && input.Z().v == 1;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -43,6 +43,7 @@ public:
|
||||
size_t leftovers;
|
||||
size_t dataSetsCount;
|
||||
size_t dataSetSize;
|
||||
size_t maxSlmSize;
|
||||
size_t normIndex; // which dimension (from in-memory representation) is normalized, e.g. for bfyx and
|
||||
// softmax::normalize_f, it will be f's index == 2 (used only by naive kernel)
|
||||
};
|
||||
|
||||
@@ -23,39 +23,40 @@ ParamsKey SoftmaxKernel_bf::GetSupportedKey() const {
|
||||
k.EnableSoftmaxDim(SoftmaxDim::FEATURE);
|
||||
k.EnableDifferentTypes();
|
||||
k.EnableBatching();
|
||||
k.EnableDynamicShapesSupport();
|
||||
return k;
|
||||
}
|
||||
|
||||
SoftmaxKernel_bf::Parent::DispatchData SoftmaxKernel_bf::SetDefault(const softmax_params& params) const {
|
||||
auto dispatchData = Parent::SetDefault(params);
|
||||
|
||||
// start with 1 thread per data set
|
||||
dispatchData.gws[0] = 1;
|
||||
dispatchData.gws[1] = dispatchData.dataSetsCount;
|
||||
dispatchData.itemsNum = dispatchData.dataSetSize;
|
||||
|
||||
dispatchData.normIndex = 0;
|
||||
|
||||
// We have two units of data per work item in current implementation.
|
||||
auto local_mem_per_wi = 2 * BytesPerElement(params.inputs[0].GetDType());
|
||||
// Combining device execution and local memory restrictions to compute maximum possible LWS.
|
||||
auto max_lws = std::min(params.engineInfo.maxWorkGroupSize, params.engineInfo.maxLocalMemSize / local_mem_per_wi);
|
||||
dispatchData.maxSlmSize = max_lws;
|
||||
if (!params.has_dynamic_tensors()) {
|
||||
// start with 1 thread per data set
|
||||
dispatchData.gws[0] = 1;
|
||||
dispatchData.gws[1] = dispatchData.dataSetsCount;
|
||||
dispatchData.itemsNum = dispatchData.dataSetSize;
|
||||
|
||||
dispatchData.lws[0] = 1;
|
||||
// Compute maximum possible LWS that does not exceed device capabilities and optimizes number of global memory
|
||||
// reads.
|
||||
while ((dispatchData.itemsNum > 32 || dispatchData.lws[0] < dispatchData.itemsNum) && (2 * dispatchData.lws[0] <= max_lws)) {
|
||||
dispatchData.lws[0] *= 2;
|
||||
dispatchData.itemsNum /= 2;
|
||||
dispatchData.lws[0] = 1;
|
||||
// Compute maximum possible LWS that does not exceed device capabilities and optimizes number of global memory
|
||||
// reads.
|
||||
while ((dispatchData.itemsNum > 32 || dispatchData.lws[0] < dispatchData.itemsNum) && (2 * dispatchData.lws[0] <= max_lws)) {
|
||||
dispatchData.lws[0] *= 2;
|
||||
dispatchData.itemsNum /= 2;
|
||||
}
|
||||
|
||||
assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?");
|
||||
|
||||
dispatchData.gws[0] = dispatchData.lws[0];
|
||||
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
|
||||
|
||||
assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0);
|
||||
}
|
||||
|
||||
assert((dispatchData.itemsNum + 1) * dispatchData.lws[0] >= dispatchData.dataSetSize && "More than 'lws[0]' items per batch remains! Lws too small?");
|
||||
|
||||
dispatchData.gws[0] = dispatchData.lws[0];
|
||||
dispatchData.leftovers = dispatchData.dataSetSize % dispatchData.lws[0];
|
||||
|
||||
assert(dispatchData.itemsNum > 0 && dispatchData.lws[0] && dispatchData.gws[0] > 0);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
||||
@@ -64,24 +65,71 @@ KernelsPriority SoftmaxKernel_bf::GetKernelsPriority(const Params& /*params*/, c
|
||||
}
|
||||
|
||||
KernelsData SoftmaxKernel_bf::GetKernelsData(const Params& params, const optional_params& optionalParams) const {
|
||||
return GetCommonKernelsData(params, optionalParams);
|
||||
KernelsData kds = GetCommonKernelsData(params, optionalParams);
|
||||
if (!kds.empty()) {
|
||||
kds[0].update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
|
||||
const auto& prim_params = static_cast<const softmax_params&>(params);
|
||||
auto dispatchData = SetDefault(prim_params);
|
||||
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
|
||||
kd.kernels[0].params.workGroups.global = dispatchData.gws;
|
||||
kd.kernels[0].params.workGroups.local = dispatchData.lws;
|
||||
};
|
||||
}
|
||||
|
||||
return kds;
|
||||
}
|
||||
|
||||
JitConstants SoftmaxKernel_bf::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
|
||||
auto jit = Parent::GetJitConstants(params, dispatchData);
|
||||
|
||||
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
|
||||
if (params.has_dynamic_tensors()) {
|
||||
const auto& input = params.inputs[0];
|
||||
auto x = toCodeString(input.X(), 5);
|
||||
auto y = toCodeString(input.Y(), 4);
|
||||
auto z = toCodeString(input.Z(), 3);
|
||||
auto w = toCodeString(input.W(), 2);
|
||||
auto f = toCodeString(input.Feature(), 1);
|
||||
auto b = toCodeString(input.Batch(), 0);
|
||||
auto softmax_dim_y_bfyx = (params.dim == SoftmaxDim::Y && input.GetLayout() == DataLayout::bfyx);
|
||||
const std::string flatten_bf = "(SOFTMAX_DIM_Y_BFYX&&(" + f + ">1))";
|
||||
const std::string lws_0 = "get_local_size(0)";
|
||||
const std::string data_set_count = "(FLATTEN_BF?" + toVectorMulString({f, b}) + ":" + b + ")";
|
||||
const std::string data_set_size = "(FLATTEN_BF?" + y + ":" + toVectorMulString({x, y, z, f}) + ")";
|
||||
// It can be expected that the maximum possible itemsNum will not exceed 32
|
||||
// Therefore, in dynamic shape, stack_size including additional buffer is set to 33
|
||||
constexpr size_t stack_size = 33; // The size of stack for my_chunk
|
||||
jit.AddConstants({
|
||||
MakeJitConstant("SOFTMAX_DIM_Y_BFYX", softmax_dim_y_bfyx),
|
||||
MakeJitConstant("FLATTEN_BF", flatten_bf),
|
||||
MakeJitConstant("LWS", lws_0),
|
||||
MakeJitConstant("SLM_SIZE", dispatchData.maxSlmSize),
|
||||
MakeJitConstant("DATA_SETS_COUNT", data_set_count),
|
||||
MakeJitConstant("DATA_SET_SIZE", data_set_size),
|
||||
MakeJitConstant("STACK_SIZE", stack_size),
|
||||
});
|
||||
} else {
|
||||
jit.AddConstants({
|
||||
MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum),
|
||||
MakeJitConstant("LWS", dispatchData.lws[0]),
|
||||
MakeJitConstant("SLM_SIZE", dispatchData.lws[0]),
|
||||
MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount),
|
||||
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
|
||||
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
|
||||
MakeJitConstant("STACK_SIZE", dispatchData.itemsNum + 1),
|
||||
});
|
||||
}
|
||||
auto activation_dt = GetActivationType(params);
|
||||
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
|
||||
|
||||
if (!params.fused_ops.empty()) {
|
||||
auto input_dt = GetActivationType(params);
|
||||
FusedOpsConfiguration conf_main = {"_MAIN",
|
||||
{"data_set_offset", "in_data_set_idx + i * workers_per_data_set", "0", "0"},
|
||||
"dequantized",
|
||||
input_dt};
|
||||
activation_dt};
|
||||
FusedOpsConfiguration conf_leftovers = {"_LEFTOVERS",
|
||||
{"data_set_offset", "workers_per_data_set * ITEMS_NUM + in_data_set_idx", "0", "0"},
|
||||
"dequantized",
|
||||
input_dt};
|
||||
activation_dt};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf_main, conf_leftovers}));
|
||||
}
|
||||
|
||||
|
||||
@@ -98,18 +98,26 @@ KernelsData SoftmaxKernel_fb::GetKernelsData(const Params& params, const optiona
|
||||
JitConstants SoftmaxKernel_fb::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
|
||||
auto jit = Parent::GetJitConstants(params, dispatchData);
|
||||
|
||||
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
|
||||
jit.AddConstants({
|
||||
MakeJitConstant("ITEMS_NUM", dispatchData.itemsNum),
|
||||
MakeJitConstant("LWS", dispatchData.lws[0]),
|
||||
MakeJitConstant("GWS", dispatchData.gws[0]),
|
||||
MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount),
|
||||
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
|
||||
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
|
||||
});
|
||||
auto activation_dt = GetActivationType(params);
|
||||
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
|
||||
|
||||
if (!params.fused_ops.empty()) {
|
||||
auto input_dt = GetActivationType(params);
|
||||
FusedOpsConfiguration conf_main = {"_MAIN",
|
||||
{"global_id", "LWS * i", "0", "0"},
|
||||
"dequantized",
|
||||
input_dt};
|
||||
activation_dt};
|
||||
FusedOpsConfiguration conf_leftovers = {"_LEFTOVERS",
|
||||
{"global_id", "LWS * ITEMS_NUM", "0", "0"},
|
||||
"dequantized",
|
||||
input_dt};
|
||||
activation_dt};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf_main, conf_leftovers}));
|
||||
}
|
||||
|
||||
|
||||
@@ -77,8 +77,11 @@ KernelsPriority SoftmaxKerneItemsClassOptimized::GetKernelsPriority(const Params
|
||||
JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
|
||||
auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, dispatchData);
|
||||
|
||||
jit.AddConstant(MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes));
|
||||
jit.AddConstant(MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.supports_imad));
|
||||
jit.AddConstants({
|
||||
MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
|
||||
MakeJitConstant("WORKITEMS_PER_CLASSES", workitems_per_classes),
|
||||
MakeJitConstant("HAS_DRIVER_PROBLEMS", params.engineInfo.supports_imad),
|
||||
});
|
||||
|
||||
return jit;
|
||||
}
|
||||
|
||||
@@ -45,7 +45,36 @@ KernelsPriority SoftmaxKernelRef::GetKernelsPriority(const Params& /*params*/, c
|
||||
}
|
||||
|
||||
KernelsData SoftmaxKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
|
||||
return GetCommonKernelsData(params, options);
|
||||
KernelsData kds = GetCommonKernelsData(params, options);
|
||||
if (!kds.empty()) {
|
||||
const softmax_params& orgParams = static_cast<const softmax_params&>(params);
|
||||
bool is_dynamic = orgParams.outputs[0].is_dynamic();
|
||||
|
||||
kds[0].update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
|
||||
const auto& prim_params = static_cast<const softmax_params&>(params);
|
||||
auto dispatchData = SetDefault(prim_params);
|
||||
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
|
||||
kd.kernels[0].params.workGroups.global = dispatchData.gws;
|
||||
kd.kernels[0].params.workGroups.local = dispatchData.lws;
|
||||
kd.internalBufferSizes.clear();
|
||||
kd.internalBufferSizes.push_back(prim_params.inputs[0].PhysicalSizeInBytes());
|
||||
kd.internalBufferDataType = prim_params.inputs[0].GetDType();
|
||||
};
|
||||
|
||||
if (is_dynamic) {
|
||||
auto& args = kds[0].kernels[0].params.arguments;
|
||||
args.clear();
|
||||
args.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0});
|
||||
args.push_back({ArgumentDescriptor::Types::INPUT, 0});
|
||||
args.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
|
||||
args.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
|
||||
|
||||
kds[0].internalBufferSizes.clear();
|
||||
kds[0].internalBufferSizes.push_back(orgParams.inputs[0].PhysicalSizeInBytes());
|
||||
kds[0].internalBufferDataType = orgParams.inputs[0].GetDType();
|
||||
}
|
||||
}
|
||||
return kds;
|
||||
}
|
||||
|
||||
JitConstants SoftmaxKernelRef::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
|
||||
|
||||
@@ -1122,3 +1122,82 @@ TEST_P(softmax_gpu_formats_test_f16, softmax_gpu_formats_test_f16_cached) {
|
||||
ASSERT_NO_FATAL_FAILURE(test(true));
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST(softmax_gpu_bfyx_f32, bf_opt_normalize_f_dynamic) {
|
||||
auto& engine = get_test_engine();
|
||||
|
||||
const int64_t x = 1, y = 1, f = 3, b = 2;
|
||||
const int64_t buf_size = b*f*y*x;
|
||||
auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), ov::Dimension::dynamic(), y, x},
|
||||
data_types::f32, format::bfyx};
|
||||
auto input_layout_static = layout{ov::PartialShape{b, f, y, x}, data_types::f32, format::bfyx};
|
||||
|
||||
auto input = engine.allocate_memory(input_layout_static);
|
||||
topology topology;
|
||||
topology.add(input_layout("input", input_layout_dynamic));
|
||||
topology.add(softmax("softmax", input_info("input"), 1));
|
||||
|
||||
vector<float> input_vec = {
|
||||
//y0x0
|
||||
/*b0f0*/0.1f,
|
||||
/*b0f1*/0.2f,
|
||||
/*b0f2*/0.2f,
|
||||
/*b1f0*/3.f,
|
||||
/*b1f1*/4.f,
|
||||
/*b1f2*/0.2f,
|
||||
};
|
||||
set_values(input, input_vec);
|
||||
|
||||
float expected_max_values[2] = {
|
||||
0.344253346f, //b=0, y=0, x=0
|
||||
0.719294981f //b=1, y=0, x=0
|
||||
};
|
||||
|
||||
ExecutionConfig config;
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
network network(engine, topology, config);
|
||||
network.set_input_data("input", input);
|
||||
|
||||
auto inst = network.get_primitive("softmax");
|
||||
auto impl = inst->get_impl();
|
||||
ASSERT_TRUE(impl != nullptr);
|
||||
ASSERT_TRUE(impl->is_dynamic());
|
||||
|
||||
auto outputs = network.execute();
|
||||
ASSERT_EQ(outputs.size(), size_t(1));
|
||||
ASSERT_EQ(outputs.begin()->first, "softmax");
|
||||
|
||||
auto output = outputs.at("softmax").get_memory();
|
||||
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
|
||||
float out_buffer[buf_size];
|
||||
for (uint32_t i = 0; i < buf_size; i++) {
|
||||
out_buffer[i] = output_ptr[i];
|
||||
}
|
||||
|
||||
float temp_max = 0;
|
||||
float expected_sum = 1.0f;
|
||||
int max_value_buffer_index = 0;
|
||||
for (uint32_t i = 0; i < b; i++) { //this for loops will sum results in a batch per feature, we expect that: sum = 1.0f
|
||||
for (uint32_t j = 0; j < y; j++) {
|
||||
for (uint32_t k = 0; k < x; k++) {
|
||||
float sum = 0.0f;
|
||||
for (uint32_t l = 0; l < f; l++) {
|
||||
int index = i * f * x * y +
|
||||
l * x * y +
|
||||
j * x +
|
||||
k;
|
||||
if (out_buffer[index] >= temp_max) {
|
||||
temp_max = out_buffer[index];
|
||||
}
|
||||
sum += out_buffer[index];
|
||||
}
|
||||
ASSERT_TRUE(are_equal(temp_max, expected_max_values[max_value_buffer_index]));
|
||||
temp_max = 0;
|
||||
max_value_buffer_index++;
|
||||
|
||||
ASSERT_TRUE(are_equal(sum, expected_sum));
|
||||
sum = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user