From d89b14625995bf3eb7a0e821305202379a86f0c7 Mon Sep 17 00:00:00 2001 From: Jade Cho Date: Tue, 5 Oct 2021 16:22:33 +0900 Subject: [PATCH] [GPU] Merge mvn kernel updates (#7736) * [GPU] Merge mvn kernel updates + Support bs_fs_yx_bsv32_fsv32 format for i8 and u8 dtype. + Support bs_fs_yx_bsv32_fsv16 format for fp16 dtype. * [GPU] Fix a bug of resample kernel selector. * [GPU] fix a build error of mvn unit tests --- .../mvn/mvn_kernel_b_fs_yx_fsv16_imad.cpp | 18 +- .../mvn/mvn_kernel_b_fs_yx_fsv16_imad.hpp | 3 + .../mvn/mvn_kernel_bs_fs_yx_bsv32.cpp | 400 ++++++++++++++++++ .../mvn/mvn_kernel_bs_fs_yx_bsv32.hpp | 49 +++ .../mvn/mvn_kernel_selector.cpp | 2 + .../resample/resample_kernel_opt.cpp | 2 +- .../cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad.cl | 154 ++++++- .../clDNN/tests/test_cases/mvn_gpu_test.cpp | 189 ++++++++- 8 files changed, 803 insertions(+), 14 deletions(-) create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bs_fs_yx_bsv32.cpp create mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bs_fs_yx_bsv32.hpp diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.cpp index 031963920e3..cb07837be42 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.cpp @@ -82,16 +82,32 @@ MVNKernelBase::DispatchData MVNKernel_b_fs_yx_fsv16_imad::SetDefault(const mvn_p return dispatchData; } +Datatype MVNKernel_b_fs_yx_fsv16_imad::GetAccumulatorType(const mvn_params& params) const { + const auto& input_dt = params.inputs[0].GetDType(); + + switch (input_dt) { + case Datatype::F32: + case Datatype::F16: + return Datatype::F32; + case 
Datatype::INT8: + case Datatype::UINT8: + return Datatype::INT32; + default: return Datatype::F32; + } +} + JitConstants MVNKernel_b_fs_yx_fsv16_imad::GetJitConstants(const mvn_params& params, DispatchData dispatchData) const { auto jits = Parent::GetJitConstants(params, dispatchData); auto activation_dt = GetActivationType(params); + jits.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION")); jits.Merge(MakeTypeJitConstants(activation_dt, "MEAN")); + jits.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); jits.AddConstant(MakeJitConstant("SIMD", simd)); jits.AddConstant(MakeJitConstant("LWS", dispatchData.lws[0])); jits.AddConstant(MakeJitConstant("GWS", dispatchData.gws[0])); jits.AddConstant(MakeJitConstant("ITEM_GROUPS", dispatchData.itemsNum)); - + jits.AddConstant(MakeJitConstant("INPUT_SLICE_PITCH", 16)); if (!params.fused_ops.empty()) { std::vector idx_order; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.hpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.hpp index 623da1b642d..39767649238 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.hpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.hpp @@ -42,5 +42,8 @@ protected: KernelsData GetMultiStageKernelsData(const mvn_params& params, const optional_params&) const; MultiDispatchData SetDefaultForMulti(const mvn_params& params) const; + +private: + Datatype GetAccumulatorType(const mvn_params& params) const; }; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bs_fs_yx_bsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bs_fs_yx_bsv32.cpp new file mode 100644 index 00000000000..67aa4634e68 --- /dev/null +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bs_fs_yx_bsv32.cpp @@ -0,0 +1,400 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mvn_kernel_bs_fs_yx_bsv32.hpp" +#include "common/common_tools.h" + +#include +#include +#include + +namespace kernel_selector { + +static constexpr size_t simd = 16; +static constexpr size_t fsv = 16; +static constexpr size_t pref_work_groups = 16; + +ParamsKey MVNKernel_bs_fs_yx_bsv32::GetSupportedKey() const { + ParamsKey k; + + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + + k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv16); + k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv16); + k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv32); + k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv32); + + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableDifferentTypes(); + k.EnableBatching(); + + k.EnableMVNMode(MVNMode::WITHIN_CHANNELS); + k.EnableMVNNormalizeVariance(); + + return k; +} + +bool MVNKernel_bs_fs_yx_bsv32::Validate(const Params& p, const optional_params& options) const { + if (!Parent::Validate(p, options)) + return false; + + auto params = static_cast(p); + + // TODO Add support for input padding via iterating over y (parallel or in kernel). 
+ if (params.inputs[0].X().pad.Total() != 0 || params.inputs[0].Y().pad.Total() != 0) + return false; + + return true; +} + +Datatype MVNKernel_bs_fs_yx_bsv32::GetAccumulatorType(const mvn_params& params) const { + const auto& input_dt = params.inputs[0].GetDType(); + + switch (input_dt) { + case Datatype::F32: + case Datatype::F16: + return Datatype::F32; + case Datatype::INT8: + case Datatype::UINT8: + return Datatype::INT32; + default: return Datatype::F32; + } +} + +JitConstants MVNKernel_bs_fs_yx_bsv32::GetJitConstants(const mvn_params& params, DispatchData dispatchData) const { + auto jits = Parent::GetJitConstants(params, dispatchData); + + auto activation_dt = GetActivationType(params); + jits.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION")); + jits.Merge(MakeTypeJitConstants(Datatype::F32, "MEAN")); + jits.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + jits.AddConstant(MakeJitConstant("SIMD", simd)); + jits.AddConstant(MakeJitConstant("LWS", dispatchData.lws[0])); + jits.AddConstant(MakeJitConstant("GWS", dispatchData.gws[0])); + jits.AddConstant(MakeJitConstant("ITEM_GROUPS", dispatchData.itemsNum)); + const auto input_layout = params.inputs[0].GetLayout(); + + if (input_layout == DataLayout::bs_fs_yx_bsv32_fsv32) { + jits.AddConstant(MakeJitConstant("INPUT_SLICE_PITCH", (size_t)(32 * 32))); + } else { // DataLayout::bs_fs_yx_bsv32_fsv16 + jits.AddConstant(MakeJitConstant("INPUT_SLICE_PITCH", (size_t)(32 * 16))); + } + + if (!params.fused_ops.empty()) { + std::vector idx_order = {"b", "(f + fi)", "(y)", "(x)"}; + auto conf = FusedOpsConfiguration("", idx_order, "normalized", activation_dt); + jits.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + + return jits; +} + +std::vector MVNKernel_bs_fs_yx_bsv32::GetFinalKernelLws(const std::vector& gws, uint64_t max_wg) const { + std::vector lws(3); + lws[0] = 1; + lws[1] = gws[1]; + lws[2] = gws[2]; + + // gws[1] is CeilDiv(feature, simd) + while (lws[1] > 16 || gws[1] % 
lws[1] != 0) { + lws[1] -= 1; + } + + // gws[2] is Align(batch, simd) + while (lws[1] * lws[2] > max_wg && lws[2] > 16) { + lws[2] -= simd; + } + + return lws; +} + +MVNKernel_bs_fs_yx_bsv32::MultiDispatchData MVNKernel_bs_fs_yx_bsv32::SetDefaultForMulti(const mvn_params& params, + bool has_enough_data) const { + MultiDispatchData dispatchData; + + auto items_num = params.output.X().v * params.output.Y().v; + auto max_wg = params.engineInfo.maxWorkGroupSize; + auto slm_per_sg = fsv * 4; + auto max_slm = params.engineInfo.maxLocalMemSize; + auto max_sgs = max_slm / slm_per_sg; + + auto max_lws = std::min(max_wg, max_sgs * simd); + auto lws = std::max(std::min(items_num, max_lws) / simd, (size_t)1) * simd; + + // TODO Check if larger number of work-groups does not provide benefit + size_t item_groups = pref_work_groups; + dispatchData.item_groups = item_groups; + + size_t stage1_lws = lws; + + if (has_enough_data) { + dispatchData.stage_1.gws[0] = stage1_lws * item_groups; + dispatchData.stage_1.gws[1] = CeilDiv(params.output.Feature().v, fsv); + dispatchData.stage_1.gws[2] = params.output.Batch().v; + + dispatchData.stage_1.lws[0] = stage1_lws; + dispatchData.stage_1.lws[1] = 1; + dispatchData.stage_1.lws[2] = 1; + + dispatchData.stage_1.itemsNum = item_groups; + + size_t stage2_lws = std::max(std::min(item_groups, max_lws) / simd, (size_t)1) * simd; + + dispatchData.stage_2.gws[0] = stage2_lws; + dispatchData.stage_2.gws[1] = CeilDiv(params.output.Feature().v, fsv); + dispatchData.stage_2.gws[2] = params.output.Batch().v; + + dispatchData.stage_2.lws[0] = stage2_lws; + dispatchData.stage_2.lws[1] = 1; + dispatchData.stage_2.lws[2] = 1; + + dispatchData.stage_2.itemsNum = item_groups; + } else { + dispatchData.stage_1.gws[0] = lws; + dispatchData.stage_1.gws[1] = CeilDiv(params.output.Feature().v, fsv); + dispatchData.stage_1.gws[2] = params.output.Batch().v; + + dispatchData.stage_1.lws[0] = lws; + dispatchData.stage_1.lws[1] = 1; + dispatchData.stage_1.lws[2] = 1; 
+ + dispatchData.stage_1.itemsNum = 1; + } + + dispatchData.stage_final.gws[0] = items_num; + dispatchData.stage_final.gws[1] = CeilDiv(params.output.Feature().v, fsv); + dispatchData.stage_final.gws[2] = Align(params.output.Batch().v, simd); + + dispatchData.stage_final.lws = GetFinalKernelLws(dispatchData.stage_final.gws, max_wg); + dispatchData.stage_final.itemsNum = 1; + + return dispatchData; +} + +KernelsData MVNKernel_bs_fs_yx_bsv32::GetMultiStageKernelsData(const mvn_params& params, + const optional_params& options, + bool has_enough_data) const { + if (!Validate(params, options)) + return {}; + + constexpr size_t intermediate_bytes = 4; + auto dispatchData = SetDefaultForMulti(params, has_enough_data); + + KernelData kd; + size_t entry_part_id = 0; + + if (has_enough_data) { + size_t kernels_num = params.mvnNormalizeVariance ? 5 : 3; + kd = KernelData::Default(params, kernels_num); + + auto finalKernelName = GetKernelName(params); + { + // Mean first stage + auto cldnn_jit = GetJitConstants(params, dispatchData.stage_1); + cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_1", 1)); + auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData.stage_1, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 0, + 0); + kernel.params.arguments.clear(); // Clear original output argument + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) * + dispatchData.item_groups * intermediate_bytes); + } + { + // Mean second stage + auto cldnn_jit = GetJitConstants(params, dispatchData.stage_2); + cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_2", 
1)); + auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[1]; + FillCLKernelData(kernel, + dispatchData.stage_2, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 0, + 0); + kernel.params.arguments.clear(); // Clear original output argument + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) * + intermediate_bytes); + } + if (params.mvnNormalizeVariance) { + // Variance first stage + auto cldnn_jit = GetJitConstants(params, dispatchData.stage_1); + cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_1", 1)); + auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[2]; + FillCLKernelData(kernel, + dispatchData.stage_1, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 0, + 0); + kernel.params.arguments.clear(); // Clear original output argument + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + } + if (params.mvnNormalizeVariance) { + // Variance second stage + auto cldnn_jit = GetJitConstants(params, dispatchData.stage_2); + cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_2", 1)); + auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[3]; + 
FillCLKernelData(kernel, + dispatchData.stage_2, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 0, + 0); + kernel.params.arguments.clear(); // Clear original output argument + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2}); + kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) * + intermediate_bytes); + } + { // Final + auto cldnn_jit = GetJitConstants(params, dispatchData.stage_final); + cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MAIN_BSV32", 1)); + cldnn_jit.AddConstant(MakeJitConstant("PRECALC_VARIANCE", params.mvnNormalizeVariance)); + auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[kernels_num - 1]; + FillCLKernelData(kernel, + dispatchData.stage_final, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 1, + GetFusedPrimitiveInputsCount(params)); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + if (params.mvnNormalizeVariance) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2}); + } + } + kd.internalBufferDataType = Datatype::F32; + } else { // not enough data + kd = KernelData::Default(params, 2); + auto finalKernelName = GetKernelName(params); + { + // Mean and Variance stage + auto cldnn_jit = GetJitConstants(params, dispatchData.stage_1); + cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_VAR_BSV32", 1)); + auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData.stage_1, + params.engineInfo, + 
finalKernelName, + jit, + entry_point, + "", + false, + false, + 0, + 0); + kernel.params.arguments.clear(); // Clear original output argument + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) * + intermediate_bytes); + if (params.mvnNormalizeVariance) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) * + intermediate_bytes); + } + } + { // Final + auto cldnn_jit = GetJitConstants(params, dispatchData.stage_final); + cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MAIN_BSV32", 1)); + cldnn_jit.AddConstant(MakeJitConstant("PRECALC_VARIANCE", params.mvnNormalizeVariance)); + auto entry_point = GetEntryPoint(finalKernelName, params.layerID, params, options, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[1]; + FillCLKernelData(kernel, + dispatchData.stage_final, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 1, + GetFusedPrimitiveInputsCount(params)); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + if (params.mvnNormalizeVariance) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + } + } + kd.internalBufferDataType = Datatype::F32; + } + + return {kd}; +} + +KernelsData MVNKernel_bs_fs_yx_bsv32::GetKernelsData(const Params& params, + const optional_params& optParams) const { + const mvn_params& orgParams = static_cast(params); + + auto max_slm = params.engineInfo.maxLocalMemSize; + auto slm_per_sg = fsv * 4; + auto max_lws = params.engineInfo.maxWorkGroupSize; + auto items_num = orgParams.output.X().v * orgParams.output.Y().v * 
orgParams.output.Z().v; + + auto enough_slm = max_lws / simd * simd * slm_per_sg <= max_slm; + auto enough_lws = max_lws / simd >= 1; + auto enough_items = items_num >= max_lws / simd * simd * pref_work_groups; + + return GetMultiStageKernelsData(orgParams, optParams, enough_slm && enough_lws && enough_items); +} + +KernelsPriority MVNKernel_bs_fs_yx_bsv32::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const { + return FORCE_PRIORITY_4; +} +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bs_fs_yx_bsv32.hpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bs_fs_yx_bsv32.hpp new file mode 100644 index 00000000000..cfd4f0528a7 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_bs_fs_yx_bsv32.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "mvn_kernel_base.h" +#include +#include + +namespace kernel_selector { +class MVNKernel_bs_fs_yx_bsv32 : public MVNKernelBase { +public: + using Parent = MVNKernelBase; + MVNKernel_bs_fs_yx_bsv32() : MVNKernelBase("mvn_gpu_b_fs_yx_fsv16_imad") {} + virtual ~MVNKernel_bs_fs_yx_bsv32() {} + + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; + +protected: + struct MultiDispatchData { + DispatchData stage_1; + DispatchData stage_2; + DispatchData stage_final; + + size_t item_groups; + }; + + bool Validate(const Params&, const optional_params&) const override; + JitConstants GetJitConstants(const mvn_params& params, DispatchData dispatchData) const override; + std::vector GetSupportedFusedOps() const override { + return { + FusedOpType::ACTIVATION, 
+ FusedOpType::QUANTIZE, + FusedOpType::ELTWISE, + FusedOpType::SCALE + }; + } + + KernelsData GetMultiStageKernelsData(const mvn_params& params, const optional_params&, bool) const; + MultiDispatchData SetDefaultForMulti(const mvn_params& params, bool) const; + +private: + Datatype GetAccumulatorType(const mvn_params& params) const; + std::vector GetFinalKernelLws(const std::vector& gws, uint64_t max_wg) const; +}; +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_selector.cpp index 5f40288352e..de2a135848d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_selector.cpp @@ -6,12 +6,14 @@ #include "mvn_kernel_ref.h" #include "mvn_kernel_bfyx_opt.h" #include "mvn_kernel_b_fs_yx_fsv16_imad.hpp" +#include "mvn_kernel_bs_fs_yx_bsv32.hpp" namespace kernel_selector { mvn_kernel_selector::mvn_kernel_selector() { Attach(); Attach(); Attach(); + Attach(); } KernelsData mvn_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp index b0184dcfa1a..69bb21d2b64 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp @@ -76,7 +76,7 @@ ResampleKernelBase::DispatchData ResampleKernelOpt::SetDefault(const kernel_sele opt_x_block_size = GetOptimalDivisor(out.X().v, 32); } - dispatchData.gws[0] = CeilDiv(out.X().v, 
GetOptimalBlockSize(arg)) * out.Y().v; + dispatchData.gws[0] = CeilDiv(out.X().v, opt_x_block_size) * out.Y().v; dispatchData.gws[1] = Align(out.Feature().v, sub_group_size); dispatchData.gws[2] = arg.output.Batch().v; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad.cl index d5c1600dfb7..7a4a590c08c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad.cl @@ -63,7 +63,6 @@ // ITEM_GROUPS - Number of work-groups performing accumulation in parallel mode. Should be the same in both stages of parallel kernels. #define FSV 16 -#define INPUT_SLICE_PITCH 16 #define SG_NUM (LWS / SIMD) #define INPUT_TYPE2 MAKE_VECTOR_TYPE(INPUT0_TYPE, 2) @@ -73,8 +72,11 @@ #define OUTPUT_PACKED_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, FSV) #define MEAN_PACKED_TYPE MAKE_VECTOR_TYPE(MEAN_TYPE, FSV) #define INT_PACKED_TYPE MAKE_VECTOR_TYPE(int, FSV) +#define ACC_PACKED_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, FSV) +#define ACT_PACKED_TYPE MAKE_VECTOR_TYPE(ACTIVATION_TYPE, FSV) #define TO_MEAN_PACKED_TYPE CAT(convert_, MEAN_PACKED_TYPE) +#define TO_ACT_PACKED_TYPE CAT(convert_, ACT_PACKED_TYPE) #define ITEMS_NUM (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z) @@ -83,18 +85,18 @@ // ================================================================================================ #if MVN_KERNEL_MEAN_1 -DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, GWS, ACCUMULATE_SUM) +DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, ACCUMULATOR_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, GWS, ACCUMULATE_SUM) #if SG_NUM != 1 -DECLARE_WG_PACKED_REDUCE_ADD(reduce_sum_across_sg, int, FSV, SG_NUM, REDUCE_NO_POST_OP) 
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_sum_across_sg, ACCUMULATOR_TYPE, FSV, SG_NUM, REDUCE_NO_POST_OP) #else -DECLARE_SG_PACKED_REDUCE_ADD(reduce_sum_inside_sg, int, FSV, REDUCE_NO_POST_OP) +DECLARE_SG_PACKED_REDUCE_ADD(reduce_sum_inside_sg, ACCUMULATOR_TYPE, FSV, REDUCE_NO_POST_OP) #endif __attribute__((intel_reqd_sub_group_size(SIMD))) __attribute__((reqd_work_group_size(LWS, 1, 1))) KERNEL(mvn_mean_1)(const __global INPUT0_TYPE* input, - __global int* intermidiate_sum) { + __global ACCUMULATOR_TYPE* intermidiate_sum) { uint b = get_global_id(2); uint f = get_global_id(1) * FSV; uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1); @@ -109,7 +111,7 @@ KERNEL(mvn_mean_1)(const __global INPUT0_TYPE* input, const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0); #endif - INT_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_global_id(0)); + ACC_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_global_id(0)); #if SG_NUM != 1 __local int slm_acc[(SG_NUM - 1) * FSV]; @@ -125,7 +127,7 @@ KERNEL(mvn_mean_1)(const __global INPUT0_TYPE* input, // ================================================================================================ #elif MVN_KERNEL_MEAN_2 -DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, int, FSV, INPUT_SLICE_PITCH, ITEM_GROUPS, LWS, ACCUMULATE_SUM) +DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, ACCUMULATOR_TYPE, ACCUMULATOR_TYPE, FSV, FSV, ITEM_GROUPS, LWS, ACCUMULATE_SUM) #define CALC_MEAN(sum) ((sum) / ITEMS_NUM) #if SG_NUM != 1 @@ -136,7 +138,7 @@ DECLARE_SG_PACKED_REDUCE_ADD(reduce_mean_inside_sg, MEAN_TYPE, FSV, CALC_MEAN) __attribute__((intel_reqd_sub_group_size(SIMD))) __attribute__((reqd_work_group_size(LWS, 1, 1))) -KERNEL(mvn_mean_2)(const __global int* intermidiate_sum, +KERNEL(mvn_mean_2)(const __global ACCUMULATOR_TYPE* intermidiate_sum, __global MEAN_TYPE* intermidiate_mean) { uint b = get_global_id(2); uint f = 
get_global_id(1) * FSV; @@ -147,7 +149,7 @@ KERNEL(mvn_mean_2)(const __global int* intermidiate_sum, const uint data_sets_offset = flat_data_set_group * ITEM_GROUPS * FSV; - INT_PACKED_TYPE complete_sum = FUNC_CALL(accumulate_sum_input)(intermidiate_sum, data_sets_offset, get_local_id(0)); + ACC_PACKED_TYPE complete_sum = FUNC_CALL(accumulate_sum_input)(intermidiate_sum, data_sets_offset, get_local_id(0)); #if SG_NUM != 1 __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV]; @@ -212,7 +214,7 @@ KERNEL(mvn_var_1)(const __global INPUT0_TYPE* input, // ================================================================================================ #elif MVN_KERNEL_VAR_2 -DECLARE_PACKED_ACCUMULATE(accumulate_sum, MEAN_TYPE, MEAN_TYPE, FSV, INPUT_SLICE_PITCH, ITEM_GROUPS, LWS, ACCUMULATE_SUM) +DECLARE_PACKED_ACCUMULATE(accumulate_sum, MEAN_TYPE, MEAN_TYPE, FSV, FSV, ITEM_GROUPS, LWS, ACCUMULATE_SUM) #if defined EPS_OUTSIDE_SQRT #define CALC_INVERSE_VARIANCE(sum_diff_sq) native_powr(native_sqrt((sum_diff_sq) / ITEMS_NUM) + (MEAN_TYPE)EPSILON, (MEAN_TYPE)-1.f); #elif defined EPS_INSIDE_SQRT @@ -251,6 +253,138 @@ KERNEL(mvn_var_2)(const __global MEAN_TYPE* intermidiate_sum, intermidiate_ivar[flat_data_set_group * FSV + sglid] = inv_variance; } } + +// ================================================================================================ +#elif MVN_KERNEL_MAIN_BSV32 + +__attribute__((intel_reqd_sub_group_size(SIMD))) +KERNEL(mvn_final_bsv32)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* restrict output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif + , const __global MEAN_TYPE* means +#if PRECALC_VARIANCE + , const __global MEAN_TYPE* variances +#endif +) { + uint b = get_global_id(2); + uint f = get_global_id(1) * FSV; + uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1); + + MEAN_PACKED_TYPE mean_vals = ((const __global MEAN_PACKED_TYPE*)(means + (flat_data_set_group * FSV)))[0]; + +#if PRECALC_VARIANCE + 
MEAN_PACKED_TYPE inv_variance = ((const __global MEAN_PACKED_TYPE*)(variances + (flat_data_set_group * FSV)))[0]; +#else // !PRECALC_VARIANCE + MEAN_PACKED_TYPE inv_variance = (MEAN_PACKED_TYPE)(MEAN_VAL_ONE); +#endif + + if (b >= OUTPUT_BATCH_NUM || f >= OUTPUT_FEATURE_NUM) + return; + + const uint output_spatial = get_global_id(0); + uint x = output_spatial % OUTPUT_SIZE_X; + uint y = output_spatial / OUTPUT_SIZE_X; + uint input_offset = INPUT0_GET_INDEX(b, f, y, x); + uint output_offset = OUTPUT_GET_INDEX(b, f, y, x); + + INPUT_PACKED_TYPE in_pack = ((const __global INPUT_PACKED_TYPE*)(input + input_offset))[0]; + ACT_PACKED_TYPE normalized_vec = fma((TO_ACT_PACKED_TYPE(in_pack) - TO_ACT_PACKED_TYPE(mean_vals)), + TO_ACT_PACKED_TYPE(inv_variance), (ACT_PACKED_TYPE)0); + OUTPUT_PACKED_TYPE result_vec = OUTPUT_VAL_ZERO; + + __attribute__((opencl_unroll_hint)) + for (uint fi = 0; fi < FSV; fi++) { + ACTIVATION_TYPE normalized = normalized_vec[fi]; +# if HAS_FUSED_OPS + FUSED_OPS; + result_vec[fi] = FUSED_OPS_RESULT; +# else + result_vec[fi] = TO_OUTPUT_TYPE(normalized); +# endif + } + + vstore16(result_vec, 0, &output[output_offset]); +} + +#elif MVN_KERNEL_MEAN_VAR_BSV32 + +// Mean: +DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, ACCUMULATOR_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, LWS, ACCUMULATE_SUM) + +#define CALC_MEAN(sum) ((sum) / ITEMS_NUM) +#if SG_NUM != 1 +DECLARE_WG_PACKED_REDUCE_ADD(reduce_mean, MEAN_TYPE, FSV, SG_NUM, CALC_MEAN) +#else +DECLARE_SG_PACKED_REDUCE_ADD(reduce_mean, MEAN_TYPE, FSV, CALC_MEAN) +#endif + +// Variance: +#define EXTRA_ARGS_DECL_IMPL , MEAN_TYPE mean +#define EXTRA_ARGS_IMPL , mean +#define EXTRA_ARGS_DECL EXTRA_ARGS_DECL_IMPL +#define EXTRA_ARGS EXTRA_ARGS_IMPL +#define ACCUMULATE_SUM_SQ_DEV(curr, next, idx, mean) ACCUMULATE_SUM_SQ(curr, TO_MEAN_TYPE(next) - intel_sub_group_shuffle(mean, idx), idx) +DECLARE_PACKED_ACCUMULATE_EARGS(accumulate_sum_sq_dev, MEAN_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, 
LWS, ACCUMULATE_SUM_SQ_DEV, EXTRA_ARGS_DECL, EXTRA_ARGS) + +#if defined EPS_OUTSIDE_SQRT + #define CALC_INVERSE_VARIANCE(sum_diff_sq) native_powr(native_sqrt((sum_diff_sq) / ITEMS_NUM) + (MEAN_TYPE)EPSILON, (MEAN_TYPE)-1.f); +#elif defined EPS_INSIDE_SQRT + #define CALC_INVERSE_VARIANCE(sum_diff_sq) native_powr((sum_diff_sq) / ITEMS_NUM + (MEAN_TYPE)EPSILON, (MEAN_TYPE)-0.5f) +#endif +#if SG_NUM != 1 +DECLARE_WG_PACKED_REDUCE_ADD(reduce_inverse_variance, MEAN_TYPE, FSV, SG_NUM, CALC_INVERSE_VARIANCE) +#else +DECLARE_SG_PACKED_REDUCE_ADD(reduce_inverse_variance, MEAN_TYPE, FSV, CALC_INVERSE_VARIANCE) +#endif + +__attribute__((intel_reqd_sub_group_size(SIMD))) +__attribute__((reqd_work_group_size(LWS, 1, 1))) +KERNEL(mvn_mean_var_bsv32)( + const __global INPUT0_TYPE* input, + __global MEAN_TYPE* means +#if NORMALIZE_VARIANCE + , __global MEAN_TYPE* variances +#endif +) { + uint b = get_global_id(2); + uint f = get_global_id(1) * FSV; + uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1); + + const uint sgid = get_sub_group_id(); + const uint sglid = get_sub_group_local_id(); + const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0); + +#if SG_NUM != 1 + __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV]; +#endif + + ACC_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_local_id(0)); +#if SG_NUM != 1 + MEAN_TYPE mean = FUNC_CALL(reduce_mean)(TO_MEAN_PACKED_TYPE(partial_sum), slm_acc); +#else + MEAN_TYPE mean = FUNC_CALL(reduce_mean)(TO_MEAN_PACKED_TYPE(partial_sum)); +#endif + +#if NORMALIZE_VARIANCE + MEAN_PACKED_TYPE partial_dev = FUNC_CALL(accumulate_sum_sq_dev)(input, data_sets_offset, get_local_id(0), mean); + #if SG_NUM != 1 + MEAN_TYPE inv_variance = FUNC_CALL(reduce_inverse_variance)(partial_dev, slm_acc); + #else + MEAN_TYPE inv_variance = FUNC_CALL(reduce_inverse_variance)(partial_dev); + #endif +#endif + + if (sgid == 0 && (sglid < FSV || SIMD == FSV)) { + means[flat_data_set_group * FSV + 
sglid] = mean; + #if NORMALIZE_VARIANCE + variances[flat_data_set_group * FSV + sglid] = inv_variance; + #endif + } +} + // ================================================================================================ #else // MVN_KERNEL_MAIN diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/mvn_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/mvn_gpu_test.cpp index 3437ea51f27..1f15560dd45 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/mvn_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/mvn_gpu_test.cpp @@ -12,6 +12,8 @@ #include +#include + using namespace cldnn; using namespace ::tests; @@ -638,8 +640,8 @@ struct mvn_random_test : ::testing::TestWithParam { }; TEST_P(mvn_random_test, random) { - auto& eng = tests::get_test_engine(); - this->execute(GetParam(), eng); + auto& engine = tests::get_test_engine(); + this->execute(GetParam(), engine); } struct mvn_test_case_generator : std::vector { @@ -712,3 +714,186 @@ INSTANTIATE_TEST_SUITE_P(extended, testing::ValuesIn(mvn_test_case_generator() .extended_tests(format::b_fs_yx_fsv16, data_types::i8) .extended_tests(format::b_fs_yx_fsv16, data_types::u8))); + +struct mvn_random_test_bsv32 : ::testing::TestWithParam { + template + void fill_data(cldnn::memory::ptr mem, const tests::VVVVVF& data) { + auto size = mem->get_layout().size; + cldnn::mem_lock ptr(mem, get_test_stream()); + for (size_t bi = 0; bi < static_cast(size.batch[0]); ++bi) { + for (size_t fi = 0; fi < static_cast(size.feature[0]); ++fi) { + for (size_t zi = 0; zi < static_cast(size.spatial[2]); ++zi) { + for (size_t yi = 0; yi < static_cast(size.spatial[1]); ++yi) { + auto tensor_addr = tensor(batch(bi), feature(fi), spatial(0, yi, zi, 0)); + auto offset = mem->get_layout().get_linear_offset(tensor_addr); + for (size_t xi = 0; xi < static_cast(size.spatial[0]); ++xi) { + ptr[offset + xi] = data[bi][fi][xi][yi][zi]; + } + } + } + } + } + } + + template + void 
fill_random_data(cldnn::memory::ptr mem, int min, int max, int k = 8) { + auto size = mem->get_layout().size; + auto input_data = tests::generate_random_5d(size.batch[0], + size.feature[0], + size.spatial[0], + size.spatial[1], + size.spatial[2], + min, + max, + k); + fill_data(mem, input_data); + } + + size_t get_x_pitch(layout& layout) { + auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0)); + auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0)); + auto x0 = layout.get_linear_offset(tensor_x0); + auto x1 = layout.get_linear_offset(tensor_x1); + return (x1 - x0); + } + + template + bool compare_outputs(const cldnn::memory::ptr out_ref, const cldnn::memory::ptr out_opt) { + auto output_lay = out_ref->get_layout(); + auto opt_output_lay = out_opt->get_layout(); + + size_t b = output_lay.size.batch[0]; + size_t f = output_lay.size.feature[0]; + size_t x = output_lay.size.spatial[0]; + size_t y = output_lay.size.spatial[1]; + cldnn::mem_lock ref_ptr(out_ref, get_test_stream()); + cldnn::mem_lock opt_ptr(out_opt, get_test_stream()); + + auto ref_x_pitch = get_x_pitch(output_lay); + auto opt_x_pitch = get_x_pitch(opt_output_lay); + + for (size_t bi = 0; bi < b; ++bi) { + for (size_t fi = 0; fi < f; ++fi) { + for (size_t yi = 0; yi < y; ++yi) { + auto ref_out_coords = tensor(batch(bi), feature(fi), spatial(0, yi, 0, 0)); + auto ref_out_offset = output_lay.get_linear_offset(ref_out_coords); + auto opt_out_offset = opt_output_lay.get_linear_offset(ref_out_coords); + for (size_t xi = 0; xi < x; ++xi) { + auto ref_out_val = ref_ptr[ref_out_offset + xi * ref_x_pitch]; + auto opt_out_val = opt_ptr[opt_out_offset + xi * opt_x_pitch]; + EXPECT_NEAR(static_cast(opt_out_val), static_cast(ref_out_val), 1.e-1f); + } + } + } + } + + return true; + } + + void execute(const mvn_basic_test_params& params) { + auto& size = params.input_size; + auto& output_pad = params.output_pad; + auto& engine = get_test_engine(); + auto input = 
engine.allocate_memory({params.input_type, format::bfyx, params.input_size}); + switch (params.input_type) { + case data_types::f32: + fill_random_data(input, -127, 127); + break; + case data_types::f16: + fill_random_data(input, -127, 127, 1); + break; + case data_types::i8: + fill_random_data(input, -127, 127, 1); + break; + case data_types::u8: + fill_random_data(input, 0, 255, 1); + break; + default: + break; + } + + topology topo; + topo.add(input_layout("input", input->get_layout())); + auto prim = mvn("mvn", "input", params.normalize_variance, 1e-10f, false, params.across_channels); + prim.output_padding = output_pad; + topo.add(prim); + auto build_opts = build_options(); + build_opts.set_option(build_option::outputs({"mvn"})); + build_opts.set_option(build_option::force_implementations({ {"mvn", {format::type::bfyx, "mvn_gpu_bfyx_opt"}} })); + + network net(engine, topo, build_opts); + net.set_input_data("input", input); + + auto outputs = net.execute(); + auto output = outputs.at("mvn").get_memory(); + + topology topo_opt; + topo_opt.add(input_layout("input", input->get_layout())); + topo_opt.add(reorder("input_to_target_layout", "input", {params.input_type, params.input_format, size})); + auto prim_opt = mvn("mvn_opt", "input_to_target_layout", params.normalize_variance, 1e-10f, false, params.across_channels); + prim_opt.output_padding = output_pad; + topo_opt.add(prim_opt); + auto build_opts_opt = build_options(); + build_opts_opt.set_option(build_option::outputs({"mvn_opt", "input_to_target_layout"})); + build_opts_opt.set_option(build_option::force_implementations({ {"mvn_opt", {params.input_format, "mvn_gpu_b_fs_yx_fsv16_imad"}} })); + + network net_opt(engine, topo_opt, build_opts_opt); + net_opt.set_input_data("input", input); + + auto outputs_opt = net_opt.execute(); + auto output_opt = outputs_opt.at("mvn_opt").get_memory(); + + auto output_dtype = output->get_layout().data_type; + auto output_opt_dtype = output_opt->get_layout().data_type; + if 
(output_dtype == output_opt_dtype) { + if(output_dtype == data_types::f32) { + compare_outputs(output, output_opt); + } else if (output_dtype == data_types::f16) { + compare_outputs(output, output_opt); + } else if (output_dtype == data_types::i8) { + compare_outputs(output, output_opt); + } else if (output_dtype == data_types::u8) { + compare_outputs(output, output_opt); + } else { + FAIL() << "Not supported data type: " << static_cast(params.input_type); + } + } else { + FAIL() << "Outputs have different data types: " + << static_cast(output_dtype) << ", " + << static_cast(output_opt_dtype); + } + } +}; + +TEST_P(mvn_random_test_bsv32, random) { + this->execute(GetParam()); +} + +struct mvn_test_case_generator_bsv32 : std::vector { + mvn_test_case_generator_bsv32& add(mvn_basic_test_params params) { + push_back(params); + return *this; + } + + mvn_test_case_generator_bsv32& bsv32_tests(format::type fmt, data_types in_dt) { + push_back(mvn_basic_test_params{fmt, in_dt, {32, 32, 10, 10}, true, false, false, padding()}); + push_back(mvn_basic_test_params{fmt, in_dt, {32, 32, 10, 10}, false, false, false, padding()}); + return *this; + } +}; + +INSTANTIATE_TEST_SUITE_P(mvn_bsv32_fsv32, + mvn_random_test_bsv32, + testing::ValuesIn(mvn_test_case_generator_bsv32() + .bsv32_tests(format::bs_fs_yx_bsv32_fsv32, data_types::i8))); + + +INSTANTIATE_TEST_SUITE_P(mvn_bsv32_fsv16, + mvn_random_test_bsv32, + testing::ValuesIn(mvn_test_case_generator_bsv32() + .bsv32_tests(format::bs_fs_yx_bsv32_fsv16, data_types::f16))); + +INSTANTIATE_TEST_SUITE_P(mvn_fsv16, + mvn_random_test_bsv32, + testing::ValuesIn(mvn_test_case_generator_bsv32() + .bsv32_tests(format::b_fs_yx_fsv16, data_types::i8)));