[GPU] Merge mvn kernel updates (#7736)

* [GPU] Merge mvn kernel updates

+ Support bs_fs_yx_bsv32_fsv32 format for i8 and u8 dtype.
+ Support bs_fs_yx_bsv32_fsv16 format for fp16 dtype.

* [GPU] Fix a bug of resample kernel selector.

* [GPU] fix a build error of mvn unit tests
This commit is contained in:
Jade Cho
2021-10-05 16:22:33 +09:00
committed by GitHub
parent c6b98d5e9b
commit d89b146259
8 changed files with 803 additions and 14 deletions

View File

@@ -82,16 +82,32 @@ MVNKernelBase::DispatchData MVNKernel_b_fs_yx_fsv16_imad::SetDefault(const mvn_p
return dispatchData;
}
// Chooses the in-kernel accumulation precision from the input data type:
// 8-bit integer inputs are summed in INT32 to avoid rounding, everything
// else (including unexpected types) falls back to F32.
Datatype MVNKernel_b_fs_yx_fsv16_imad::GetAccumulatorType(const mvn_params& params) const {
    const auto input_dt = params.inputs[0].GetDType();
    if (input_dt == Datatype::INT8 || input_dt == Datatype::UINT8)
        return Datatype::INT32;
    return Datatype::F32;
}
JitConstants MVNKernel_b_fs_yx_fsv16_imad::GetJitConstants(const mvn_params& params, DispatchData dispatchData) const {
auto jits = Parent::GetJitConstants(params, dispatchData);
auto activation_dt = GetActivationType(params);
jits.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
jits.Merge(MakeTypeJitConstants(activation_dt, "MEAN"));
jits.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
jits.AddConstant(MakeJitConstant("SIMD", simd));
jits.AddConstant(MakeJitConstant("LWS", dispatchData.lws[0]));
jits.AddConstant(MakeJitConstant("GWS", dispatchData.gws[0]));
jits.AddConstant(MakeJitConstant("ITEM_GROUPS", dispatchData.itemsNum));
jits.AddConstant(MakeJitConstant("INPUT_SLICE_PITCH", 16));
if (!params.fused_ops.empty()) {
std::vector<std::string> idx_order;

View File

@@ -42,5 +42,8 @@ protected:
KernelsData GetMultiStageKernelsData(const mvn_params& params, const optional_params&) const;
MultiDispatchData SetDefaultForMulti(const mvn_params& params) const;
private:
Datatype GetAccumulatorType(const mvn_params& params) const;
};
} // namespace kernel_selector

View File

@@ -0,0 +1,400 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "mvn_kernel_bs_fs_yx_bsv32.hpp"
#include "common/common_tools.h"
#include <string>
#include <algorithm>
#include <iostream>
namespace kernel_selector {
static constexpr size_t simd = 16;
static constexpr size_t fsv = 16;
static constexpr size_t pref_work_groups = 16;
// Advertises the data types, layouts and MVN modes this kernel handles.
ParamsKey MVNKernel_bs_fs_yx_bsv32::GetSupportedKey() const {
    ParamsKey k;
    // Both quantized (i8/u8) and floating point (f16/f32) tensors are
    // supported on input and output.
    for (auto dt : {Datatype::INT8, Datatype::UINT8, Datatype::F16, Datatype::F32}) {
        k.EnableInputDataType(dt);
        k.EnableOutputDataType(dt);
    }
    // Batch-blocked layouts: bsv32 with either fsv16 or fsv32 feature blocks.
    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableDifferentTypes();
    k.EnableBatching();
    // Only per-channel statistics are implemented (no across-channels mode).
    k.EnableMVNMode(MVNMode::WITHIN_CHANNELS);
    k.EnableMVNNormalizeVariance();
    return k;
}
// Rejects parameter combinations this kernel cannot execute.
// Returns false when the base checks fail or when the input has x/y padding.
bool MVNKernel_bs_fs_yx_bsv32::Validate(const Params& p, const optional_params& options) const {
    if (!Parent::Validate(p, options))
        return false;

    // Bind by const reference: the original `auto params = ...` silently made
    // a full copy of the mvn_params struct just to read two fields.
    const auto& params = static_cast<const mvn_params&>(p);

    // TODO Add support for input padding via iterating over y (parallel or in kernel).
    if (params.inputs[0].X().pad.Total() != 0 || params.inputs[0].Y().pad.Total() != 0)
        return false;

    return true;
}
// Accumulation precision: INT32 for 8-bit integer inputs, F32 otherwise.
// (F32/F16 inputs and any unrecognized type all accumulate in F32, so the
// floating point cases can share the switch default.)
Datatype MVNKernel_bs_fs_yx_bsv32::GetAccumulatorType(const mvn_params& params) const {
    switch (params.inputs[0].GetDType()) {
    case Datatype::INT8:
    case Datatype::UINT8:
        return Datatype::INT32;
    default:
        return Datatype::F32;
    }
}
// Builds the JIT constants for one dispatch stage of the bsv32 MVN kernel.
JitConstants MVNKernel_bs_fs_yx_bsv32::GetJitConstants(const mvn_params& params, DispatchData dispatchData) const {
    auto jit = Parent::GetJitConstants(params, dispatchData);

    const auto activation_dt = GetActivationType(params);
    jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
    // Statistics (mean / inverse variance) are always kept in full precision.
    jit.Merge(MakeTypeJitConstants(Datatype::F32, "MEAN"));
    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));

    jit.AddConstant(MakeJitConstant("SIMD", simd));
    jit.AddConstant(MakeJitConstant("LWS", dispatchData.lws[0]));
    jit.AddConstant(MakeJitConstant("GWS", dispatchData.gws[0]));
    jit.AddConstant(MakeJitConstant("ITEM_GROUPS", dispatchData.itemsNum));

    // Element pitch of one (bsv x fsv) block: 32*32 for bsv32_fsv32,
    // 32*16 for bsv32_fsv16.
    const bool is_fsv32 = params.inputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32;
    jit.AddConstant(MakeJitConstant("INPUT_SLICE_PITCH", static_cast<size_t>(is_fsv32 ? 32 * 32 : 32 * 16)));

    if (!params.fused_ops.empty()) {
        std::vector<std::string> idx_order = {"b", "(f + fi)", "(y)", "(x)"};
        auto conf = FusedOpsConfiguration("", idx_order, "normalized", activation_dt);
        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
    }
    return jit;
}
std::vector<size_t> MVNKernel_bs_fs_yx_bsv32::GetFinalKernelLws(const std::vector<size_t>& gws, uint64_t max_wg) const {
std::vector<size_t> lws(3);
lws[0] = 1;
lws[1] = gws[1];
lws[2] = gws[2];
// gws[1] is CeilDiv(feature, simd)
while (lws[1] > 16 || gws[1] % lws[1] != 0) {
lws[1] -= 1;
}
// gws[2] is Align(batch, simd)
while (lws[1] * lws[2] > max_wg && lws[2] > 16) {
lws[2] -= simd;
}
return lws;
}
// Computes the dispatch sizes for every stage of the pipeline.
// When has_enough_data is true the reductions run in two stages: stage_1
// produces ITEM_GROUPS partial results per data set in parallel, stage_2
// reduces them; otherwise a single work-group (stage_1 only) handles each
// data set. stage_final is the normalization pass over all outputs.
// NOTE(review): items_num here is X*Y while GetKernelsData uses X*Y*Z —
// fine for the supported yx-only layouts (Z == 1), but worth confirming if
// 3D layouts are ever added.
MVNKernel_bs_fs_yx_bsv32::MultiDispatchData MVNKernel_bs_fs_yx_bsv32::SetDefaultForMulti(const mvn_params& params,
                                                                                         bool has_enough_data) const {
    MultiDispatchData dispatchData;
    auto items_num = params.output.X().v * params.output.Y().v;
    auto max_wg = params.engineInfo.maxWorkGroupSize;
    // Each sub-group needs fsv 4-byte slots of shared local memory, which
    // bounds how many sub-groups (and hence how large an LWS) we can use.
    auto slm_per_sg = fsv * 4;
    auto max_slm = params.engineInfo.maxLocalMemSize;
    auto max_sgs = max_slm / slm_per_sg;
    auto max_lws = std::min(max_wg, max_sgs * simd);
    // Largest multiple of simd that is <= min(items_num, max_lws), min 1*simd.
    auto lws = std::max(std::min(items_num, max_lws) / simd, (size_t)1) * simd;
    // TODO Check if larger number of work-groups does not provide benefit
    size_t item_groups = pref_work_groups;
    dispatchData.item_groups = item_groups;
    size_t stage1_lws = lws;
    if (has_enough_data) {
        // Stage 1: item_groups work-groups per data set compute partial sums.
        dispatchData.stage_1.gws[0] = stage1_lws * item_groups;
        dispatchData.stage_1.gws[1] = CeilDiv(params.output.Feature().v, fsv);
        dispatchData.stage_1.gws[2] = params.output.Batch().v;
        dispatchData.stage_1.lws[0] = stage1_lws;
        dispatchData.stage_1.lws[1] = 1;
        dispatchData.stage_1.lws[2] = 1;
        dispatchData.stage_1.itemsNum = item_groups;
        // Stage 2: one work-group per data set reduces the partial results.
        size_t stage2_lws = std::max(std::min(item_groups, max_lws) / simd, (size_t)1) * simd;
        dispatchData.stage_2.gws[0] = stage2_lws;
        dispatchData.stage_2.gws[1] = CeilDiv(params.output.Feature().v, fsv);
        dispatchData.stage_2.gws[2] = params.output.Batch().v;
        dispatchData.stage_2.lws[0] = stage2_lws;
        dispatchData.stage_2.lws[1] = 1;
        dispatchData.stage_2.lws[2] = 1;
        dispatchData.stage_2.itemsNum = item_groups;
    } else {
        // Not enough work to justify the split: one work-group computes the
        // full mean (and variance) for each data set in a single pass.
        dispatchData.stage_1.gws[0] = lws;
        dispatchData.stage_1.gws[1] = CeilDiv(params.output.Feature().v, fsv);
        dispatchData.stage_1.gws[2] = params.output.Batch().v;
        dispatchData.stage_1.lws[0] = lws;
        dispatchData.stage_1.lws[1] = 1;
        dispatchData.stage_1.lws[2] = 1;
        dispatchData.stage_1.itemsNum = 1;
    }
    // Final pass: one work-item per spatial position; batch is aligned up to
    // simd (the kernel guards out-of-range batches itself).
    dispatchData.stage_final.gws[0] = items_num;
    dispatchData.stage_final.gws[1] = CeilDiv(params.output.Feature().v, fsv);
    dispatchData.stage_final.gws[2] = Align(params.output.Batch().v, simd);
    dispatchData.stage_final.lws = GetFinalKernelLws(dispatchData.stage_final.gws, max_wg);
    dispatchData.stage_final.itemsNum = 1;
    return dispatchData;
}
// Builds the multi-kernel pipeline.
//
// has_enough_data == true (5 kernels when variance is normalized, else 3):
//   [0] mean stage 1 : input             -> buffer 0 (partial sums)
//   [1] mean stage 2 : buffer 0          -> buffer 1 (final means)
//   [2] var  stage 1 : input + buffer 1  -> buffer 0 (partial sq. deviations;
//                                           buffer 0 is reused)
//   [3] var  stage 2 : buffer 0          -> buffer 2 (inverse variances)
//   [last] final     : input + buffer 1 (+ buffer 2) -> output
//
// has_enough_data == false (2 kernels):
//   [0] mean+var     : input -> buffer 0 (means) [+ buffer 1 (inv. variances)]
//   [1] final        : input + buffer 0 (+ buffer 1) -> output
//
// The order of the pushed arguments and internalBufferSizes entries must
// match the buffer indices used above; do not reorder.
KernelsData MVNKernel_bs_fs_yx_bsv32::GetMultiStageKernelsData(const mvn_params& params,
                                                               const optional_params& options,
                                                               bool has_enough_data) const {
    if (!Validate(params, options))
        return {};
    // Every intermediate buffer holds 4-byte elements (F32 / INT32).
    constexpr size_t intermediate_bytes = 4;
    auto dispatchData = SetDefaultForMulti(params, has_enough_data);

    KernelData kd;
    size_t entry_part_id = 0;
    if (has_enough_data) {
        size_t kernels_num = params.mvnNormalizeVariance ? 5 : 3;
        kd = KernelData::Default<mvn_params>(params, kernels_num);
        auto finalKernelName = GetKernelName(params);
        {
            // Mean first stage
            auto cldnn_jit = GetJitConstants(params, dispatchData.stage_1);
            cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_1", 1));
            auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++);
            auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
            auto& kernel = kd.kernels[0];
            FillCLKernelData(kernel,
                             dispatchData.stage_1,
                             params.engineInfo,
                             finalKernelName,
                             jit,
                             entry_point,
                             "",
                             false,
                             false,
                             0,
                             0);
            kernel.params.arguments.clear();  // Clear original output argument
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
            // Buffer 0: one partial sum per (batch, feature slice, work-group).
            kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) *
                                             dispatchData.item_groups * intermediate_bytes);
        }
        {
            // Mean second stage
            auto cldnn_jit = GetJitConstants(params, dispatchData.stage_2);
            cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_2", 1));
            auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++);
            auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
            auto& kernel = kd.kernels[1];
            FillCLKernelData(kernel,
                             dispatchData.stage_2,
                             params.engineInfo,
                             finalKernelName,
                             jit,
                             entry_point,
                             "",
                             false,
                             false,
                             0,
                             0);
            kernel.params.arguments.clear();  // Clear original output argument
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
            // Buffer 1: final mean per (batch, feature slice).
            kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) *
                                             intermediate_bytes);
        }
        if (params.mvnNormalizeVariance) {
            // Variance first stage
            auto cldnn_jit = GetJitConstants(params, dispatchData.stage_1);
            cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_1", 1));
            auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++);
            auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
            auto& kernel = kd.kernels[2];
            FillCLKernelData(kernel,
                             dispatchData.stage_1,
                             params.engineInfo,
                             finalKernelName,
                             jit,
                             entry_point,
                             "",
                             false,
                             false,
                             0,
                             0);
            kernel.params.arguments.clear();  // Clear original output argument
            // Reads means from buffer 1, overwrites buffer 0 with partial
            // squared-deviation sums (no extra allocation needed).
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
        }
        if (params.mvnNormalizeVariance) {
            // Variance second stage
            auto cldnn_jit = GetJitConstants(params, dispatchData.stage_2);
            cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_2", 1));
            auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++);
            auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
            auto& kernel = kd.kernels[3];
            FillCLKernelData(kernel,
                             dispatchData.stage_2,
                             params.engineInfo,
                             finalKernelName,
                             jit,
                             entry_point,
                             "",
                             false,
                             false,
                             0,
                             0);
            kernel.params.arguments.clear();  // Clear original output argument
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2});
            // Buffer 2: inverse variance per (batch, feature slice).
            kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) *
                                             intermediate_bytes);
        }
        {  // Final
            auto cldnn_jit = GetJitConstants(params, dispatchData.stage_final);
            cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MAIN_BSV32", 1));
            cldnn_jit.AddConstant(MakeJitConstant("PRECALC_VARIANCE", params.mvnNormalizeVariance));
            auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++);
            auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
            auto& kernel = kd.kernels[kernels_num - 1];
            // Unlike the statistics stages, the final kernel keeps the real
            // output argument and appends fused-op inputs.
            FillCLKernelData(kernel,
                             dispatchData.stage_final,
                             params.engineInfo,
                             finalKernelName,
                             jit,
                             entry_point,
                             "",
                             false,
                             false,
                             1,
                             GetFusedPrimitiveInputsCount(params));
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
            if (params.mvnNormalizeVariance) {
                kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2});
            }
        }
        kd.internalBufferDataType = Datatype::F32;
    } else {  // not enough data
        kd = KernelData::Default<mvn_params>(params, 2);
        auto finalKernelName = GetKernelName(params);
        {
            // Mean and Variance stage
            auto cldnn_jit = GetJitConstants(params, dispatchData.stage_1);
            cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_VAR_BSV32", 1));
            auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++);
            auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
            auto& kernel = kd.kernels[0];
            FillCLKernelData(kernel,
                             dispatchData.stage_1,
                             params.engineInfo,
                             finalKernelName,
                             jit,
                             entry_point,
                             "",
                             false,
                             false,
                             0,
                             0);
            kernel.params.arguments.clear();  // Clear original output argument
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
            // Buffer 0: means per (batch, feature slice).
            kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) *
                                             intermediate_bytes);
            if (params.mvnNormalizeVariance) {
                // Buffer 1: inverse variances, same shape as the means.
                kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
                kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) *
                                                 intermediate_bytes);
            }
        }
        {  // Final
            auto cldnn_jit = GetJitConstants(params, dispatchData.stage_final);
            cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MAIN_BSV32", 1));
            cldnn_jit.AddConstant(MakeJitConstant("PRECALC_VARIANCE", params.mvnNormalizeVariance));
            auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++);
            auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
            auto& kernel = kd.kernels[1];
            FillCLKernelData(kernel,
                             dispatchData.stage_final,
                             params.engineInfo,
                             finalKernelName,
                             jit,
                             entry_point,
                             "",
                             false,
                             false,
                             1,
                             GetFusedPrimitiveInputsCount(params));
            kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
            if (params.mvnNormalizeVariance) {
                kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
            }
        }
        kd.internalBufferDataType = Datatype::F32;
    }
    return {kd};
}
// Entry point: decides whether there is enough spatial work (and enough SLM
// and work-group capacity) to run the parallel two-stage reduction, then
// builds the kernel pipeline accordingly.
KernelsData MVNKernel_bs_fs_yx_bsv32::GetKernelsData(const Params& params,
                                                     const optional_params& optParams) const {
    const mvn_params& mvnParams = static_cast<const mvn_params&>(params);

    const auto max_slm = params.engineInfo.maxLocalMemSize;
    const auto slm_per_sg = fsv * 4;  // SLM bytes used per sub-group (fsv 4-byte slots)
    const auto max_lws = params.engineInfo.maxWorkGroupSize;
    const auto items_num = mvnParams.output.X().v * mvnParams.output.Y().v * mvnParams.output.Z().v;

    const bool enough_slm = max_lws / simd * simd * slm_per_sg <= max_slm;
    const bool enough_lws = max_lws / simd >= 1;
    const bool enough_items = items_num >= max_lws / simd * simd * pref_work_groups;

    return GetMultiStageKernelsData(mvnParams, optParams, enough_slm && enough_lws && enough_items);
}
// Fixed selection priority used when several MVN implementations support the
// same parameters.
KernelsPriority MVNKernel_bs_fs_yx_bsv32::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
    return FORCE_PRIORITY_4;
}
} // namespace kernel_selector

View File

@@ -0,0 +1,49 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "mvn_kernel_base.h"
#include <string>
#include <vector>
namespace kernel_selector {
// MVN kernel for bs_fs_yx_bsv32_fsv16 / bs_fs_yx_bsv32_fsv32 layouts.
// Reuses the "mvn_gpu_b_fs_yx_fsv16_imad" OpenCL file, which also contains
// the BSV32-specific kernel branches (MVN_KERNEL_MAIN_BSV32 etc.).
class MVNKernel_bs_fs_yx_bsv32 : public MVNKernelBase {
public:
    using Parent = MVNKernelBase;
    MVNKernel_bs_fs_yx_bsv32() : MVNKernelBase("mvn_gpu_b_fs_yx_fsv16_imad") {}
    virtual ~MVNKernel_bs_fs_yx_bsv32() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    // Work sizes for each stage of the multi-kernel pipeline.
    struct MultiDispatchData {
        DispatchData stage_1;      // partial (or single-pass) statistics
        DispatchData stage_2;      // reduction of partial results
        DispatchData stage_final;  // normalization over all outputs
        size_t item_groups;        // number of parallel groups in stage 1
    };

    bool Validate(const Params&, const optional_params&) const override;
    JitConstants GetJitConstants(const mvn_params& params, DispatchData dispatchData) const override;
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return {
            FusedOpType::ACTIVATION,
            FusedOpType::QUANTIZE,
            FusedOpType::ELTWISE,
            FusedOpType::SCALE
        };
    }
    KernelsData GetMultiStageKernelsData(const mvn_params& params, const optional_params&, bool) const;
    MultiDispatchData SetDefaultForMulti(const mvn_params& params, bool) const;

private:
    Datatype GetAccumulatorType(const mvn_params& params) const;
    std::vector<size_t> GetFinalKernelLws(const std::vector<size_t>& gws, uint64_t max_wg) const;
};
} // namespace kernel_selector

View File

@@ -6,12 +6,14 @@
#include "mvn_kernel_ref.h"
#include "mvn_kernel_bfyx_opt.h"
#include "mvn_kernel_b_fs_yx_fsv16_imad.hpp"
#include "mvn_kernel_bs_fs_yx_bsv32.hpp"
namespace kernel_selector {
// Registers every available MVN implementation; the selector later picks the
// best supported one for the given parameters.
mvn_kernel_selector::mvn_kernel_selector() {
    Attach<MVNKernelRef>();
    Attach<MVNKernelBfyxOpt>();
    Attach<MVNKernel_b_fs_yx_fsv16_imad>();
    Attach<MVNKernel_bs_fs_yx_bsv32>();
}
KernelsData mvn_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {

View File

@@ -76,7 +76,7 @@ ResampleKernelBase::DispatchData ResampleKernelOpt::SetDefault(const kernel_sele
opt_x_block_size = GetOptimalDivisor(out.X().v, 32);
}
dispatchData.gws[0] = CeilDiv(out.X().v, GetOptimalBlockSize(arg)) * out.Y().v;
dispatchData.gws[0] = CeilDiv(out.X().v, opt_x_block_size) * out.Y().v;
dispatchData.gws[1] = Align(out.Feature().v, sub_group_size);
dispatchData.gws[2] = arg.output.Batch().v;

View File

@@ -63,7 +63,6 @@
// ITEM_GROUPS - Number of work-groups performing accumulation in parallel mode. Should be the same in both stages of parallel kernels.
#define FSV 16
#define INPUT_SLICE_PITCH 16
#define SG_NUM (LWS / SIMD)
#define INPUT_TYPE2 MAKE_VECTOR_TYPE(INPUT0_TYPE, 2)
@@ -73,8 +72,11 @@
#define OUTPUT_PACKED_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, FSV)
#define MEAN_PACKED_TYPE MAKE_VECTOR_TYPE(MEAN_TYPE, FSV)
#define INT_PACKED_TYPE MAKE_VECTOR_TYPE(int, FSV)
#define ACC_PACKED_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, FSV)
#define ACT_PACKED_TYPE MAKE_VECTOR_TYPE(ACTIVATION_TYPE, FSV)
#define TO_MEAN_PACKED_TYPE CAT(convert_, MEAN_PACKED_TYPE)
#define TO_ACT_PACKED_TYPE CAT(convert_, ACT_PACKED_TYPE)
#define ITEMS_NUM (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z)
@@ -83,18 +85,18 @@
// ================================================================================================
#if MVN_KERNEL_MEAN_1
DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, GWS, ACCUMULATE_SUM)
DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, ACCUMULATOR_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, GWS, ACCUMULATE_SUM)
#if SG_NUM != 1
DECLARE_WG_PACKED_REDUCE_ADD(reduce_sum_across_sg, int, FSV, SG_NUM, REDUCE_NO_POST_OP)
DECLARE_WG_PACKED_REDUCE_ADD(reduce_sum_across_sg, ACCUMULATOR_TYPE, FSV, SG_NUM, REDUCE_NO_POST_OP)
#else
DECLARE_SG_PACKED_REDUCE_ADD(reduce_sum_inside_sg, int, FSV, REDUCE_NO_POST_OP)
DECLARE_SG_PACKED_REDUCE_ADD(reduce_sum_inside_sg, ACCUMULATOR_TYPE, FSV, REDUCE_NO_POST_OP)
#endif
__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(LWS, 1, 1)))
KERNEL(mvn_mean_1)(const __global INPUT0_TYPE* input,
__global int* intermidiate_sum) {
__global ACCUMULATOR_TYPE* intermidiate_sum) {
uint b = get_global_id(2);
uint f = get_global_id(1) * FSV;
uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
@@ -109,7 +111,7 @@ KERNEL(mvn_mean_1)(const __global INPUT0_TYPE* input,
const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0);
#endif
INT_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_global_id(0));
ACC_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_global_id(0));
#if SG_NUM != 1
__local int slm_acc[(SG_NUM - 1) * FSV];
@@ -125,7 +127,7 @@ KERNEL(mvn_mean_1)(const __global INPUT0_TYPE* input,
// ================================================================================================
#elif MVN_KERNEL_MEAN_2
DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, int, FSV, INPUT_SLICE_PITCH, ITEM_GROUPS, LWS, ACCUMULATE_SUM)
DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, ACCUMULATOR_TYPE, ACCUMULATOR_TYPE, FSV, FSV, ITEM_GROUPS, LWS, ACCUMULATE_SUM)
#define CALC_MEAN(sum) ((sum) / ITEMS_NUM)
#if SG_NUM != 1
@@ -136,7 +138,7 @@ DECLARE_SG_PACKED_REDUCE_ADD(reduce_mean_inside_sg, MEAN_TYPE, FSV, CALC_MEAN)
__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(LWS, 1, 1)))
KERNEL(mvn_mean_2)(const __global int* intermidiate_sum,
KERNEL(mvn_mean_2)(const __global ACCUMULATOR_TYPE* intermidiate_sum,
__global MEAN_TYPE* intermidiate_mean) {
uint b = get_global_id(2);
uint f = get_global_id(1) * FSV;
@@ -147,7 +149,7 @@ KERNEL(mvn_mean_2)(const __global int* intermidiate_sum,
const uint data_sets_offset = flat_data_set_group * ITEM_GROUPS * FSV;
INT_PACKED_TYPE complete_sum = FUNC_CALL(accumulate_sum_input)(intermidiate_sum, data_sets_offset, get_local_id(0));
ACC_PACKED_TYPE complete_sum = FUNC_CALL(accumulate_sum_input)(intermidiate_sum, data_sets_offset, get_local_id(0));
#if SG_NUM != 1
__local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
@@ -212,7 +214,7 @@ KERNEL(mvn_var_1)(const __global INPUT0_TYPE* input,
// ================================================================================================
#elif MVN_KERNEL_VAR_2
DECLARE_PACKED_ACCUMULATE(accumulate_sum, MEAN_TYPE, MEAN_TYPE, FSV, INPUT_SLICE_PITCH, ITEM_GROUPS, LWS, ACCUMULATE_SUM)
DECLARE_PACKED_ACCUMULATE(accumulate_sum, MEAN_TYPE, MEAN_TYPE, FSV, FSV, ITEM_GROUPS, LWS, ACCUMULATE_SUM)
#if defined EPS_OUTSIDE_SQRT
#define CALC_INVERSE_VARIANCE(sum_diff_sq) native_powr(native_sqrt((sum_diff_sq) / ITEMS_NUM) + (MEAN_TYPE)EPSILON, (MEAN_TYPE)-1.f);
#elif defined EPS_INSIDE_SQRT
@@ -251,6 +253,138 @@ KERNEL(mvn_var_2)(const __global MEAN_TYPE* intermidiate_sum,
intermidiate_ivar[flat_data_set_group * FSV + sglid] = inv_variance;
}
}
// ================================================================================================
#elif MVN_KERNEL_MAIN_BSV32
// Final stage: normalizes each input value with the precomputed mean (and,
// when PRECALC_VARIANCE, inverse variance), applies fused ops, and stores a
// full FSV-wide packed vector per work-item.
__attribute__((intel_reqd_sub_group_size(SIMD)))
KERNEL(mvn_final_bsv32)(
    const __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* restrict output
#if HAS_FUSED_OPS_DECLS
    , FUSED_OPS_DECLS
#endif
    , const __global MEAN_TYPE* means
#if PRECALC_VARIANCE
    , const __global MEAN_TYPE* variances
#endif
) {
    uint b = get_global_id(2);
    uint f = get_global_id(1) * FSV;
    // Index of this (batch, feature-slice) data set in the statistics buffers.
    uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
    // NOTE(review): the mean/variance loads below happen BEFORE the bounds
    // check, and gws[2] is Align(batch, SIMD) — for padded batch indices this
    // may read past the intermediate buffers; confirm buffer sizing covers
    // the aligned range or hoist the early-return above these loads.
    MEAN_PACKED_TYPE mean_vals = ((const __global MEAN_PACKED_TYPE*)(means + (flat_data_set_group * FSV)))[0];
#if PRECALC_VARIANCE
    MEAN_PACKED_TYPE inv_variance = ((const __global MEAN_PACKED_TYPE*)(variances + (flat_data_set_group * FSV)))[0];
#else // !PRECALC_VARIANCE
    // Variance normalization disabled: multiply by 1 (mean subtraction only).
    MEAN_PACKED_TYPE inv_variance = (MEAN_PACKED_TYPE)(MEAN_VAL_ONE);
#endif
    if (b >= OUTPUT_BATCH_NUM || f >= OUTPUT_FEATURE_NUM)
        return;
    const uint output_spatial = get_global_id(0);
    uint x = output_spatial % OUTPUT_SIZE_X;
    uint y = output_spatial / OUTPUT_SIZE_X;
    uint input_offset = INPUT0_GET_INDEX(b, f, y, x);
    uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
    // Load FSV consecutive feature values at once and normalize them together.
    INPUT_PACKED_TYPE in_pack = ((const __global INPUT_PACKED_TYPE*)(input + input_offset))[0];
    ACT_PACKED_TYPE normalized_vec = fma((TO_ACT_PACKED_TYPE(in_pack) - TO_ACT_PACKED_TYPE(mean_vals)),
                                         TO_ACT_PACKED_TYPE(inv_variance), (ACT_PACKED_TYPE)0);
    OUTPUT_PACKED_TYPE result_vec = OUTPUT_VAL_ZERO;
    // Apply fused ops (or a plain type conversion) lane by lane.
    __attribute__((opencl_unroll_hint))
    for (uint fi = 0; fi < FSV; fi++) {
        ACTIVATION_TYPE normalized = normalized_vec[fi];
#   if HAS_FUSED_OPS
        FUSED_OPS;
        result_vec[fi] = FUSED_OPS_RESULT;
#   else
        result_vec[fi] = TO_OUTPUT_TYPE(normalized);
#   endif
    }
    vstore16(result_vec, 0, &output[output_offset]);
}
#elif MVN_KERNEL_MEAN_VAR_BSV32
// Single-pass statistics kernel: one work-group computes the mean and
// (when NORMALIZE_VARIANCE) the inverse variance for each
// (batch, feature-slice) data set, without intermediate partial buffers.
// Mean:
DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, ACCUMULATOR_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, LWS, ACCUMULATE_SUM)
#define CALC_MEAN(sum) ((sum) / ITEMS_NUM)
#if SG_NUM != 1
DECLARE_WG_PACKED_REDUCE_ADD(reduce_mean, MEAN_TYPE, FSV, SG_NUM, CALC_MEAN)
#else
DECLARE_SG_PACKED_REDUCE_ADD(reduce_mean, MEAN_TYPE, FSV, CALC_MEAN)
#endif
// Variance: accumulate squared deviations from the already-reduced mean.
// The mean for lane `idx` is fetched via sub-group shuffle.
#define EXTRA_ARGS_DECL_IMPL , MEAN_TYPE mean
#define EXTRA_ARGS_IMPL , mean
#define EXTRA_ARGS_DECL EXTRA_ARGS_DECL_IMPL
#define EXTRA_ARGS EXTRA_ARGS_IMPL
#define ACCUMULATE_SUM_SQ_DEV(curr, next, idx, mean) ACCUMULATE_SUM_SQ(curr, TO_MEAN_TYPE(next) - intel_sub_group_shuffle(mean, idx), idx)
DECLARE_PACKED_ACCUMULATE_EARGS(accumulate_sum_sq_dev, MEAN_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, LWS, ACCUMULATE_SUM_SQ_DEV, EXTRA_ARGS_DECL, EXTRA_ARGS)
// Epsilon placement matches the MVN primitive attribute (inside/outside sqrt).
#if defined EPS_OUTSIDE_SQRT
#define CALC_INVERSE_VARIANCE(sum_diff_sq) native_powr(native_sqrt((sum_diff_sq) / ITEMS_NUM) + (MEAN_TYPE)EPSILON, (MEAN_TYPE)-1.f);
#elif defined EPS_INSIDE_SQRT
#define CALC_INVERSE_VARIANCE(sum_diff_sq) native_powr((sum_diff_sq) / ITEMS_NUM + (MEAN_TYPE)EPSILON, (MEAN_TYPE)-0.5f)
#endif
#if SG_NUM != 1
DECLARE_WG_PACKED_REDUCE_ADD(reduce_inverse_variance, MEAN_TYPE, FSV, SG_NUM, CALC_INVERSE_VARIANCE)
#else
DECLARE_SG_PACKED_REDUCE_ADD(reduce_inverse_variance, MEAN_TYPE, FSV, CALC_INVERSE_VARIANCE)
#endif
__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(LWS, 1, 1)))
KERNEL(mvn_mean_var_bsv32)(
    const __global INPUT0_TYPE* input,
    __global MEAN_TYPE* means
#if NORMALIZE_VARIANCE
    , __global MEAN_TYPE* variances
#endif
) {
    uint b = get_global_id(2);
    uint f = get_global_id(1) * FSV;
    uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
    const uint sgid = get_sub_group_id();
    const uint sglid = get_sub_group_local_id();
    const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0);
#if SG_NUM != 1
    // Scratch for the cross-sub-group reduction (one row per extra sub-group).
    __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
#endif
    ACC_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_local_id(0));
#if SG_NUM != 1
    MEAN_TYPE mean = FUNC_CALL(reduce_mean)(TO_MEAN_PACKED_TYPE(partial_sum), slm_acc);
#else
    MEAN_TYPE mean = FUNC_CALL(reduce_mean)(TO_MEAN_PACKED_TYPE(partial_sum));
#endif
#if NORMALIZE_VARIANCE
    // Second pass over the input: sum of squared deviations, then reduce to
    // the inverse variance.
    MEAN_PACKED_TYPE partial_dev = FUNC_CALL(accumulate_sum_sq_dev)(input, data_sets_offset, get_local_id(0), mean);
#if SG_NUM != 1
    MEAN_TYPE inv_variance = FUNC_CALL(reduce_inverse_variance)(partial_dev, slm_acc);
#else
    MEAN_TYPE inv_variance = FUNC_CALL(reduce_inverse_variance)(partial_dev);
#endif
#endif
    // Only the first sub-group's first FSV lanes publish the results.
    if (sgid == 0 && (sglid < FSV || SIMD == FSV)) {
        means[flat_data_set_group * FSV + sglid] = mean;
#if NORMALIZE_VARIANCE
        variances[flat_data_set_group * FSV + sglid] = inv_variance;
#endif
    }
}
// ================================================================================================
#else // MVN_KERNEL_MAIN

View File

@@ -12,6 +12,8 @@
#include <iostream>
#include <cldnn/runtime/debug_configuration.hpp>
using namespace cldnn;
using namespace ::tests;
@@ -638,8 +640,8 @@ struct mvn_random_test : ::testing::TestWithParam<mvn_basic_test_params> {
};
TEST_P(mvn_random_test, random) {
auto& eng = tests::get_test_engine();
this->execute(GetParam(), eng);
auto& engine = tests::get_test_engine();
this->execute(GetParam(), engine);
}
struct mvn_test_case_generator : std::vector<mvn_basic_test_params> {
@@ -712,3 +714,186 @@ INSTANTIATE_TEST_SUITE_P(extended,
testing::ValuesIn(mvn_test_case_generator()
.extended_tests(format::b_fs_yx_fsv16, data_types::i8)
.extended_tests(format::b_fs_yx_fsv16, data_types::u8)));
// Randomized comparison test for the bsv32 MVN kernels: runs the reference
// "mvn_gpu_bfyx_opt" implementation on bfyx data and the optimized kernel on
// the target blocked layout, then compares the outputs element-wise.
struct mvn_random_test_bsv32 : ::testing::TestWithParam<mvn_basic_test_params> {
    // Copies generated 5-D data into a cldnn memory object, honoring the
    // memory's linear layout. Assumes the x dimension is contiguous in the
    // target memory (true for the bfyx allocation used below).
    template <typename T>
    void fill_data(cldnn::memory::ptr mem, const tests::VVVVVF<T>& data) {
        auto size = mem->get_layout().size;
        cldnn::mem_lock<T> ptr(mem, get_test_stream());
        for (size_t bi = 0; bi < static_cast<size_t>(size.batch[0]); ++bi) {
            for (size_t fi = 0; fi < static_cast<size_t>(size.feature[0]); ++fi) {
                for (size_t zi = 0; zi < static_cast<size_t>(size.spatial[2]); ++zi) {
                    for (size_t yi = 0; yi < static_cast<size_t>(size.spatial[1]); ++yi) {
                        auto tensor_addr = tensor(batch(bi), feature(fi), spatial(0, yi, zi, 0));
                        auto offset = mem->get_layout().get_linear_offset(tensor_addr);
                        for (size_t xi = 0; xi < static_cast<size_t>(size.spatial[0]); ++xi) {
                            // note: generator dimension order is (b, f, x, y, z)
                            ptr[offset + xi] = data[bi][fi][xi][yi][zi];
                        }
                    }
                }
            }
        }
    }

    // Fills `mem` with random values in [min, max]; k controls the generator's
    // value granularity (see tests::generate_random_5d).
    template <typename T>
    void fill_random_data(cldnn::memory::ptr mem, int min, int max, int k = 8) {
        auto size = mem->get_layout().size;
        auto input_data = tests::generate_random_5d<T>(size.batch[0],
                                                       size.feature[0],
                                                       size.spatial[0],
                                                       size.spatial[1],
                                                       size.spatial[2],
                                                       min,
                                                       max,
                                                       k);
        fill_data(mem, input_data);
    }

    // Linear-offset distance between x and x+1 in the given layout.
    size_t get_x_pitch(layout& layout) {
        auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0));
        auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0));
        auto x0 = layout.get_linear_offset(tensor_x0);
        auto x1 = layout.get_linear_offset(tensor_x1);
        return (x1 - x0);
    }

    // Element-wise comparison of reference vs optimized output. Failures are
    // recorded through EXPECT_NEAR; the boolean return is always true and the
    // gtest failure state carries the actual result.
    template <typename T>
    bool compare_outputs(const cldnn::memory::ptr out_ref, const cldnn::memory::ptr out_opt) {
        auto output_lay = out_ref->get_layout();
        auto opt_output_lay = out_opt->get_layout();
        size_t b = output_lay.size.batch[0];
        size_t f = output_lay.size.feature[0];
        size_t x = output_lay.size.spatial[0];
        size_t y = output_lay.size.spatial[1];
        cldnn::mem_lock<T> ref_ptr(out_ref, get_test_stream());
        cldnn::mem_lock<T> opt_ptr(out_opt, get_test_stream());
        auto ref_x_pitch = get_x_pitch(output_lay);
        auto opt_x_pitch = get_x_pitch(opt_output_lay);
        for (size_t bi = 0; bi < b; ++bi) {
            for (size_t fi = 0; fi < f; ++fi) {
                for (size_t yi = 0; yi < y; ++yi) {
                    auto ref_out_coords = tensor(batch(bi), feature(fi), spatial(0, yi, 0, 0));
                    auto ref_out_offset = output_lay.get_linear_offset(ref_out_coords);
                    auto opt_out_offset = opt_output_lay.get_linear_offset(ref_out_coords);
                    for (size_t xi = 0; xi < x; ++xi) {
                        auto ref_out_val = ref_ptr[ref_out_offset + xi * ref_x_pitch];
                        auto opt_out_val = opt_ptr[opt_out_offset + xi * opt_x_pitch];
                        EXPECT_NEAR(static_cast<float>(opt_out_val), static_cast<float>(ref_out_val), 1.e-1f);
                    }
                }
            }
        }
        return true;
    }

    // Builds and runs both networks for one parameter set and compares results.
    void execute(const mvn_basic_test_params& params) {
        auto& size = params.input_size;
        auto& output_pad = params.output_pad;
        auto& engine = get_test_engine();
        auto input = engine.allocate_memory({params.input_type, format::bfyx, params.input_size});
        switch (params.input_type) {
        case data_types::f32:
            fill_random_data<float>(input, -127, 127);
            break;
        case data_types::f16:
            fill_random_data<FLOAT16>(input, -127, 127, 1);
            break;
        case data_types::i8:
            fill_random_data<int8_t>(input, -127, 127, 1);
            break;
        case data_types::u8:
            fill_random_data<uint8_t>(input, 0, 255, 1);
            break;
        default:
            break;
        }
        // Reference network: plain bfyx + the bfyx-optimized MVN kernel.
        topology topo;
        topo.add(input_layout("input", input->get_layout()));
        auto prim = mvn("mvn", "input", params.normalize_variance, 1e-10f, false, params.across_channels);
        prim.output_padding = output_pad;
        topo.add(prim);
        auto build_opts = build_options();
        build_opts.set_option(build_option::outputs({"mvn"}));
        build_opts.set_option(build_option::force_implementations({ {"mvn", {format::type::bfyx, "mvn_gpu_bfyx_opt"}} }));
        network net(engine, topo, build_opts);
        net.set_input_data("input", input);
        auto outputs = net.execute();
        auto output = outputs.at("mvn").get_memory();
        // Optimized network: reorder to the target blocked layout first.
        // The forced kernel name is "mvn_gpu_b_fs_yx_fsv16_imad" — the bsv32
        // kernel class shares this OpenCL file/kernel name (see
        // MVNKernel_bs_fs_yx_bsv32's constructor), so with a bsv32 format the
        // bsv32 implementation is selected.
        topology topo_opt;
        topo_opt.add(input_layout("input", input->get_layout()));
        topo_opt.add(reorder("input_to_target_layout", "input", {params.input_type, params.input_format, size}));
        auto prim_opt = mvn("mvn_opt", "input_to_target_layout", params.normalize_variance, 1e-10f, false, params.across_channels);
        prim_opt.output_padding = output_pad;
        topo_opt.add(prim_opt);
        auto build_opts_opt = build_options();
        build_opts_opt.set_option(build_option::outputs({"mvn_opt", "input_to_target_layout"}));
        build_opts_opt.set_option(build_option::force_implementations({ {"mvn_opt", {params.input_format, "mvn_gpu_b_fs_yx_fsv16_imad"}} }));
        network net_opt(engine, topo_opt, build_opts_opt);
        net_opt.set_input_data("input", input);
        auto outputs_opt = net_opt.execute();
        auto output_opt = outputs_opt.at("mvn_opt").get_memory();
        // Dispatch the typed comparison on the (matching) output data type.
        auto output_dtype = output->get_layout().data_type;
        auto output_opt_dtype = output_opt->get_layout().data_type;
        if (output_dtype == output_opt_dtype) {
            if(output_dtype == data_types::f32) {
                compare_outputs<float>(output, output_opt);
            } else if (output_dtype == data_types::f16) {
                compare_outputs<FLOAT16>(output, output_opt);
            } else if (output_dtype == data_types::i8) {
                compare_outputs<int8_t>(output, output_opt);
            } else if (output_dtype == data_types::u8) {
                compare_outputs<uint8_t>(output, output_opt);
            } else {
                FAIL() << "Not supported data type: " << static_cast<size_t>(params.input_type);
            }
        } else {
            FAIL() << "Outputs have diffent data types: "
                   << static_cast<size_t>(output_dtype) << ", "
                   << static_cast<size_t>(output_opt_dtype);
        }
    }
};
// Runs the reference-vs-optimized comparison for each generated test case.
TEST_P(mvn_random_test_bsv32, random) {
    this->execute(GetParam());
}
// Fluent builder that accumulates parameter combinations for the bsv32 tests.
struct mvn_test_case_generator_bsv32 : std::vector<mvn_basic_test_params> {
    // Appends one explicit case.
    mvn_test_case_generator_bsv32& add(mvn_basic_test_params params) {
        push_back(params);
        return *this;
    }

    // Appends one variance-normalizing and one mean-only case for the given
    // format and data type at a fixed 32x32x10x10 size.
    mvn_test_case_generator_bsv32& bsv32_tests(format::type fmt, data_types in_dt) {
        for (bool normalize_variance : {true, false}) {
            push_back(mvn_basic_test_params{fmt, in_dt, {32, 32, 10, 10}, normalize_variance, false, false, padding()});
        }
        return *this;
    }
};
// Instantiations covering both new bsv32 layouts (i8 on fsv32, f16 on fsv16)
// plus a b_fs_yx_fsv16/i8 baseline through the same test fixture.
INSTANTIATE_TEST_SUITE_P(mvn_bsv32_fsv32,
                         mvn_random_test_bsv32,
                         testing::ValuesIn(mvn_test_case_generator_bsv32()
                                               .bsv32_tests(format::bs_fs_yx_bsv32_fsv32, data_types::i8)));
INSTANTIATE_TEST_SUITE_P(mvn_bsv32_fsv16,
                         mvn_random_test_bsv32,
                         testing::ValuesIn(mvn_test_case_generator_bsv32()
                                               .bsv32_tests(format::bs_fs_yx_bsv32_fsv16, data_types::f16)));
INSTANTIATE_TEST_SUITE_P(mvn_fsv16,
                         mvn_random_test_bsv32,
                         testing::ValuesIn(mvn_test_case_generator_bsv32()
                                               .bsv32_tests(format::b_fs_yx_fsv16, data_types::i8)));