[IE CLDNN] Convolution fsv16 improvements: several fixes after code review (#3637)

Ilya Znamenskiy 2021-01-21 11:06:43 +03:00 committed by GitHub
parent d4f774f3c7
commit 72c3e0e4a6
6 changed files with 364 additions and 148 deletions

File: convolution_kernel_b_fs_yx_fsv16.cpp

@@ -19,16 +19,13 @@
namespace kernel_selector {
static const size_t sub_group_size = 16;
static const size_t feature_block_size = 16;
ConvolutionKernel_b_fs_yx_fsv16::ConvolutionKernel_b_fs_yx_fsv16() : ConvolutionKernelBase("convolution_gpu_bfyx_f16") {
std::vector<size_t> outputBlockWidths = {2, 4, 8};
std::vector<size_t> outputBlockWidths = { 2, 4, 8 };
std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
for (auto w : outputBlockWidths) {
for (auto exeMode : executionModes) {
autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
autoTuneOptions.emplace_back(AutoTuneOption{ w, exeMode });
}
}
}
@@ -39,7 +36,7 @@ ConvolutionKernel_b_fs_yx_fsv16::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16:
auto x = cp.output.X().v;
auto f = cp.output.Feature().v;
if (x * f <= 256) {
if ( x <= 8 || x * f <= 128)
if (x <= 8 || x * f <= 128)
return { 2, DEFAULT };
else
return { 4, DEFAULT };
@@ -55,6 +52,42 @@ ConvolutionKernel_b_fs_yx_fsv16::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16:
}
}
float ConvolutionKernel_b_fs_yx_fsv16::EstimateOccupancy(const convolution_params& params,
const ConvolutionTuningData& tuning_data) const {
auto tuneOptions = GetAutoTuneOptions(params, 0);
auto blockWidth = tuneOptions.blockWidth;
auto x = params.output.X().v;
auto y = params.output.Y().v;
auto f = params.output.Feature().v;
auto b = params.output.Batch().v;
auto threads = CeilDiv(x, blockWidth) * y * CeilDiv(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor * b;
return static_cast<float>(threads) / static_cast<float>(params.engineInfo.maxThreadsPerDevice);
}
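
The estimate above is just the number of launched threads divided by the device's thread capacity; anything under 4.0 is treated as headroom to split work further. A worked example with hypothetical shape and device numbers (none of them come from the patch):

    #include <cstddef>
    #include <cstdio>

    // Same rounding helper the kernel selector uses.
    static size_t CeilDiv(size_t a, size_t b) { return (a + b - 1) / b; }

    int main() {
        // Hypothetical: 28x28x128 output, batch 1, blockWidth 4,
        // feature_block_size 16, slm_div_factor 1, 448 device threads.
        size_t threads = CeilDiv(28, 4) * 28 * CeilDiv(128, 16) * 1 * 1;
        std::printf("occupancy = %.2f\n", threads / 448.0);  // 1568 / 448 = 3.50
    }

At 3.5 the loop in GetTuningParams below would still try to double slm_div_factor.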
ConvolutionKernel_b_fs_yx_fsv16::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16::GetTuningParams(const convolution_params& params) const {
ConvolutionTuningData tuning_data;
const auto& input = params.inputs[0];
size_t ic_blocks = CeilDiv(input.Feature().v / params.groups, tuning_data.feature_block_size);
size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;
bool slm_exception = params.output.X().v == 3 && params.output.Y().v == 3 && params.output.ElementSize() == 4 && params.output.Feature().v <= 512;
if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.bIMADSupport && !slm_exception)
while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
EstimateOccupancy(params, tuning_data) < 4.0)
tuning_data.slm_div_factor *= 2;
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
return tuning_data;
}
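
The selection loop keeps slm_div_factor a power of two that evenly divides the input-channel blocks, never exceeds what the device's work-group limit allows, and stops growing once occupancy is healthy. A minimal standalone sketch of that logic, with illustrative names:

    #include <cstddef>

    // Sketch only: estimate_occupancy stands in for EstimateOccupancy above.
    size_t pick_slm_div_factor(size_t ic_blocks, size_t max_work_group_size,
                               size_t sub_group_size,
                               double (*estimate_occupancy)(size_t)) {
        size_t slm_div_factor = 1;
        const size_t max_slm_div_factor = max_work_group_size / sub_group_size;
        // Each doubling halves the IC blocks one sub-group handles, so the
        // factor must keep dividing ic_blocks evenly and fit the device limit.
        while (ic_blocks % (slm_div_factor * 2) == 0 &&
               slm_div_factor * 2 <= max_slm_div_factor &&
               estimate_occupancy(slm_div_factor) < 4.0)
            slm_div_factor *= 2;
        return slm_div_factor;
    }

The slm_exception guard above skips the split for one specific shape family (3x3 fp32 outputs with at most 512 features on integrated GPUs) where it evidently does not pay off.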
ParamsKey ConvolutionKernel_b_fs_yx_fsv16::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
@@ -92,6 +125,8 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16::SetDefault(
int autoTuneIndex) const {
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
ConvolutionTuningData tuning_data = GetTuningParams(params);
const auto& out = params.output;
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
@@ -103,11 +138,11 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16::SetDefault(
auto b = out.Batch().v;
dispatchData.gws[0] = CeilDiv(x, autoTune.blockWidth) * y;
dispatchData.gws[1] = Align(f, sub_group_size);
dispatchData.gws[1] = Align(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor;
dispatchData.gws[2] = b;
dispatchData.lws[0] = 1;
dispatchData.lws[1] = sub_group_size;
dispatchData.lws[1] = tuning_data.work_group_size;
dispatchData.lws[2] = 1;
return dispatchData;
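
With the split, the feature axis of the dispatch carries both quantities. For a hypothetical f = 128 with slm_div_factor = 2: gws[1] = Align(128, 16) * 2 = 256 and lws[1] = 2 * 16 = 32, i.e. eight work-groups along the feature axis, each holding two sub-groups that cooperate on a single 16-feature block.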
@@ -126,27 +161,29 @@ bool ConvolutionKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_p
const auto& params = static_cast<const convolution_params&>(p);
ConvolutionTuningData tuning_data = GetTuningParams(params);
const auto& input = params.inputs[0];
const auto& output = params.output;
if (params.groups > 1) {
auto outFeaturesPerGroup = output.Feature().v / params.groups;
auto inFeaturesPerGroup = input.Feature().v / params.groups;
auto multipleGroupsInputPreload = (feature_block_size % outFeaturesPerGroup == 0) &&
(feature_block_size % inFeaturesPerGroup == 0) &&
(feature_block_size / outFeaturesPerGroup > 1) &&
(feature_block_size / inFeaturesPerGroup > 1) &&
auto multipleGroupsInputPreload = (tuning_data.feature_block_size % outFeaturesPerGroup == 0) &&
(tuning_data.feature_block_size % inFeaturesPerGroup == 0) &&
(tuning_data.feature_block_size / outFeaturesPerGroup > 1) &&
(tuning_data.feature_block_size / inFeaturesPerGroup > 1) &&
(outFeaturesPerGroup != 1) &&
(inFeaturesPerGroup != 1);
auto grouped = inFeaturesPerGroup % sub_group_size == 0 &&
(outFeaturesPerGroup % sub_group_size == 0 || sub_group_size % outFeaturesPerGroup == 0);
auto grouped = inFeaturesPerGroup % tuning_data.sub_group_size == 0 &&
(outFeaturesPerGroup % tuning_data.sub_group_size == 0 || tuning_data.sub_group_size % outFeaturesPerGroup == 0);
if (!multipleGroupsInputPreload && !grouped)
return false;
}
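
A worked example of the preload condition with hypothetical group sizes: for 4 output and 8 input features per group, 16 % 4 == 0, 16 % 8 == 0, 16 / 4 > 1, and 16 / 8 > 1 all hold, so one 16-wide feature slice covers several whole groups and their input can be preloaded together. At 16 features per group the quotient is 1, the preload test fails, and the regular grouped path (the `grouped` check) applies instead.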
// Check that padding before features doesn't miss-align the blocks
if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0)
if (input.Feature().pad.before % tuning_data.feature_block_size != 0 || output.Feature().pad.before % tuning_data.feature_block_size != 0)
return false;
if (!params.bias.empty() && params.bias[0].GetDType() != input.GetDType())
@@ -161,11 +198,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
auto output = params.output;
auto jit = Parent::GetJitConstants(params, dispatchData);
ConvolutionTuningData tuning_data = GetTuningParams(params);
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_vec = { "_VEC",
{"b", "(f_block*16)", "y", "x"},
{"b", "(feature_block * 16)", "y", "x"},
"dst",
input_dt,
blockWidth,
@@ -174,7 +213,7 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
IndexType::TENSOR_COORD,
Tensor::DataChannelName::X };
FusedOpsConfiguration conf_scalar = { "_SCALAR",
{"b", "(f_block*16)", "y", "(x+i)"},
{"b", "(feature_block * 16)", "y", "(x + i)"},
"dst[i]",
input_dt,
1,
@@ -190,23 +229,25 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
auto outFeaturesPerGroup = output.Feature().v / params.groups;
auto inFeaturesPerGroup = input.Feature().v / params.groups;
auto multipleGroupsInputPreload = (feature_block_size % outFeaturesPerGroup == 0) &&
(feature_block_size % inFeaturesPerGroup == 0) &&
(feature_block_size / outFeaturesPerGroup > 1) &&
(feature_block_size / inFeaturesPerGroup > 1);
auto multipleGroupsInputPreload = (tuning_data.feature_block_size % outFeaturesPerGroup == 0) &&
(tuning_data.feature_block_size % inFeaturesPerGroup == 0) &&
(tuning_data.feature_block_size / outFeaturesPerGroup > 1) &&
(tuning_data.feature_block_size / inFeaturesPerGroup > 1);
if (multipleGroupsInputPreload)
jit.AddConstant(MakeJitConstant("MULTIPLE_GROUPS_INPUT_PRELOAD", 1));
jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(inFeaturesPerGroup, feature_block_size)));
if (params.output.Feature().v % feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(inFeaturesPerGroup, tuning_data.feature_block_size)));
if (params.output.Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
}
if (inFeaturesPerGroup % feature_block_size != 0 && !multipleGroupsInputPreload) {
if (inFeaturesPerGroup % tuning_data.feature_block_size != 0 && !multipleGroupsInputPreload) {
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
}
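
For a build that settles on a split, the constants above land in the kernel's preprocessor roughly like this (hypothetical values for blockWidth = 4, slm_div_factor = 2, 128 input channels):

    // Illustrative expansion of the jit constants; values are made up.
    #define OUTPUT_X_BLOCK_SIZE 4
    #define SUB_GROUP_SIZE 16
    #define SLM_DIV_FACTOR 2
    #define WORK_GROUP_SIZE 32   // SLM_DIV_FACTOR * SUB_GROUP_SIZE
    #define IC_BLOCKS 8          // CeilDiv(128, 16)

These are exactly the names the reworked .cl kernels below consume in their reqd_work_group_size attribute and SLM reduction.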

File: convolution_kernel_b_fs_yx_fsv16.h

@@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -58,7 +58,16 @@ private:
std::string exeMode;
};
struct ConvolutionTuningData {
const size_t sub_group_size = 16;
const size_t feature_block_size = 16;
size_t slm_div_factor = 1;
size_t work_group_size = 1;
};
std::vector<AutoTuneOption> autoTuneOptions;
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
ConvolutionTuningData GetTuningParams(const convolution_params& params) const;
float EstimateOccupancy(const convolution_params& params, const ConvolutionTuningData& tuning_data) const;
};
} // namespace kernel_selector

File: convolution_kernel_b_fs_yx_fsv16_1x1.cpp

@@ -19,11 +19,9 @@
#include <string>
namespace kernel_selector {
static const size_t sub_group_size = 16;
static const size_t feature_block_size = 16;
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : ConvolutionKernelBase("convolution_gpu_bfyx_f16_1x1") {
std::vector<size_t> outputBlockWidths = {2, 4, 8};
std::vector<size_t> outputBlockWidths = { 1, 2, 4, 8 };
std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
for (auto w : outputBlockWidths) {
@@ -36,10 +34,15 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : Con
ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params,
int /*autoTuneIndex*/) const {
const convolution_params& cp = static_cast<const convolution_params&>(params);
auto x = cp.output.X().v;
auto y = cp.output.Y().v;
auto f = cp.output.Feature().v;
if (x * f <= 256) {
if ( x < 8 || x * f <= 128)
if (x == 1 && y == 1) {
return { 1, DEFAULT };
} else if (x * f <= 256) {
if (x < 8 || x * f <= 128)
return { 2, DEFAULT };
else
return { 4, DEFAULT };
@@ -50,6 +53,40 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fs
}
}
float ConvolutionKernel_b_fs_yx_fsv16_1x1::EstimateOccupancy(const convolution_params& params,
const ConvolutionTuningData& tuning_data) const {
auto tuneOptions = GetAutoTuneOptions(params, 0);
auto blockWidth = tuneOptions.blockWidth;
auto x = params.output.X().v;
auto y = params.output.Y().v;
auto f = params.output.Feature().v;
auto b = params.output.Batch().v;
auto threads = CeilDiv(x * y, blockWidth) * CeilDiv(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor * b;
return static_cast<float>(threads) / static_cast<float>(params.engineInfo.maxThreadsPerDevice);
}
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetTuningParams(const convolution_params& params) const {
ConvolutionTuningData tuning_data;
const auto& input = params.inputs[0];
size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size);
size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;
bool block_size_one_is_better = params.output.X().v == 1 && params.output.Y().v == 1 && input.Feature().v >= 2048;
if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.bIMADSupport && !block_size_one_is_better)
while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
EstimateOccupancy(params, tuning_data) < 4.0)
tuning_data.slm_div_factor *= 2;
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
return tuning_data;
}
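
The 1x1 variant mirrors the tuning logic above with two twists: occupancy is computed over the flattened x*y extent, and the SLM split is deliberately skipped when block_size_one_is_better holds, since a 1x1 spatial output with 2048+ input features is instead routed to the new blockWidth = 1 path that GetAutoTuneOptions now returns for x == 1 && y == 1.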
ParamsKey ConvolutionKernel_b_fs_yx_fsv16_1x1::GetSupportedKey() const {
ParamsKey k;
@@ -75,21 +112,24 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
int autoTuneIndex) const {
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
ConvolutionTuningData tuning_data = GetTuningParams(params);
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
dispatchData.cldnnStyle.blockWidth = autoTune.blockWidth;
const auto& out = params.output;
auto x = out.X().v;
auto y = out.Y().v;
auto f = out.Feature().v;
auto b = out.Batch().v;
dispatchData.gws[0] = CeilDiv(x * y, autoTune.blockWidth);
dispatchData.gws[1] = Align(f, feature_block_size);
dispatchData.gws[0] = x == 1 && y == 1 ? 1 : CeilDiv(x * y, autoTune.blockWidth);
dispatchData.gws[1] = Align(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor;
dispatchData.gws[2] = b;
dispatchData.lws[0] = 1;
dispatchData.lws[1] = sub_group_size;
dispatchData.lws[1] = tuning_data.work_group_size;
dispatchData.lws[2] = 1;
return dispatchData;
@@ -124,14 +164,16 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p, const option
const auto& params = static_cast<const convolution_params&>(p);
ConvolutionTuningData tuning_data = GetTuningParams(params);
const auto& input = params.inputs[0];
const auto& output = params.output;
const bool bOutputSizes =
output.X().v != input.X().v || output.Y().v != input.Y().v || output.Feature().v % 16 != 0;
const bool bOutputSizes = output.X().v != input.X().v || output.Y().v != input.Y().v || output.Feature().v % 16 != 0;
const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1;
const bool bStride = params.stride.x != 1 || params.stride.y != 1;
const bool bPadding = input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0;
const bool bPadding = input.Feature().pad.before % tuning_data.feature_block_size != 0 ||
output.Feature().pad.before % tuning_data.feature_block_size != 0;
if (bOutputSizes || bFilterSize || bStride || bPadding) {
return false;
@@ -144,11 +186,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
const DispatchData& dispatchData) const {
auto jit = Parent::GetJitConstants(params, dispatchData);
ConvolutionTuningData tuning_data = GetTuningParams(params);
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
if (!params.fused_ops.empty()) {
auto input_dt = GetUnitType(params);
FusedOpsConfiguration conf_vec = { "_VEC",
{"b", "(f_block*16)", "y", "x"},
{"b", "(feature_block * 16)", "y", "x"},
"dst",
input_dt,
blockWidth,
@@ -156,8 +200,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
BoundaryCheck::ENABLED,
IndexType::TENSOR_COORD,
Tensor::DataChannelName::X };
FusedOpsConfiguration conf_scalar = { "_SCALAR",
{"b", "(f_block*16)", "yi", "xi"},
FusedOpsConfiguration conf_scalar1 = { "_SCALAR",
{"b", "(feature_block * 16)", "yi", "xi"},
"dst[i]",
input_dt,
1,
@@ -165,10 +209,19 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
BoundaryCheck::ENABLED,
IndexType::TENSOR_COORD,
Tensor::DataChannelName::X };
jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
FusedOpsConfiguration conf_scalar2 = { "_SCALAR_B1",
{"b", "(feature_block * 16)", "0", "0"},
"dst",
input_dt,
1,
LoadType::LT_ALIGNED_READ,
BoundaryCheck::ENABLED,
IndexType::TENSOR_COORD,
Tensor::DataChannelName::X };
jit.Merge(MakeFusedOpsJitConstants(params, { conf_vec, conf_scalar1, conf_scalar2 }));
}
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
bool padded_output = params.output.X().pad.Total() != 0;
@@ -194,11 +247,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.output.X().v, blockWidth)));
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, feature_block_size)));
if (params.output.Feature().v % feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size)));
if (params.output.Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
}
if (params.inputs[0].Feature().v % feature_block_size != 0) {
if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
}
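
There are now three fused-ops entry points: _VEC processes a whole X block, _SCALAR handles per-element leftovers when X_BLOCK_SIZE > 1 (indexing with yi/xi), and the new _SCALAR_B1 serves the blockWidth = 1 path, where dst is a scalar and the spatial coordinates collapse to zero. The kernel below pairs them up accordingly, using FUSED_OPS_SCALAR_B1 / FUSED_OPS_RESULT_SCALAR_B1 in its X_BLOCK_SIZE == 1 branches.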

File: convolution_kernel_b_fs_yx_fsv16_1x1.h

@@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -51,7 +51,16 @@ protected:
std::string exeMode;
};
struct ConvolutionTuningData {
const size_t sub_group_size = 16;
const size_t feature_block_size = 16;
size_t slm_div_factor = 1;
size_t work_group_size = 1;
};
std::vector<AutoTuneOption> autoTuneOptions;
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
ConvolutionTuningData GetTuningParams(const convolution_params& params) const;
float EstimateOccupancy(const convolution_params& params, const ConvolutionTuningData& tuning_data) const;
};
} // namespace kernel_selector

File: convolution_gpu_bfyx_f16.cl

@@ -40,7 +40,7 @@
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read4((__global uint*)(ptr) + (offset)))
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16.cl - unsupported input type.
# error convolution_gpu_bfyx_f16.cl: unsupported input type
#endif
#if FILTER_TYPE_SIZE == 2
@@ -48,7 +48,7 @@
#elif FILTER_TYPE_SIZE == 4
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16.cl - unsupported filter type.
# error convolution_gpu_bfyx_f16.cl: unsupported filter type
#endif
#if OUTPUT_TYPE_SIZE == 1
@@ -67,7 +67,7 @@
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
#else
# error convolution_gpu_bfyx_f16.cl - unsupported output type.
# error convolution_gpu_bfyx_f16.cl: unsupported output type
#endif
#if INPUT0_TYPE_SIZE == 2
@@ -77,12 +77,14 @@
#else
# define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
#endif
#define FEATURE_SLICE_SIZE 16
#define FILTER_OFM_NUM_ALIGNED (((FILTER_OFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
#define FILTER_IFM_NUM_ALIGNED (((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
KERNEL(convolution_bfyx_f16)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -94,26 +96,29 @@ KERNEL(convolution_bfyx_f16)(
FUSED_OPS_DECLS,
#endif
uint split_idx) {
#if GROUPED
const int f_block = get_group_id(1);
const int group = (f_block * FEATURE_SLICE_SIZE) / FILTER_OFM_NUM;
const int prev_group_leftover = (FILTER_OFM_NUM * (group + 1)) - (f_block * FEATURE_SLICE_SIZE);
int groups_per_sub_group = 1;
if (prev_group_leftover < 16)
groups_per_sub_group += ((FEATURE_SLICE_SIZE - prev_group_leftover - 1) / FILTER_OFM_NUM) + 1;
#else
const int f_block = get_group_id(1);
const int group = split_idx;
const int groups_per_sub_group = 1;
#endif // GROUPED
const int lid = get_sub_group_local_id();
const int sglid = get_sub_group_local_id();
const int b = (uint)get_global_id(2);
const int xy = get_global_id(0);
const int x = (xy % X_BLOCKS) * OUTPUT_X_BLOCK_SIZE;
const int y = (xy / X_BLOCKS);
const int lid1 = (int)get_local_id(1);
const int feature_per_wg = (int)get_local_size(1) / SLM_DIV_FACTOR;
const int feature_sub_block = lid1 / feature_per_wg;
const int feature_block = (int)get_group_id(1);
#if GROUPED
const int group = (feature_block * FEATURE_SLICE_SIZE) / FILTER_OFM_NUM;
const int prev_group_leftover = (FILTER_OFM_NUM * (group + 1)) - (feature_block * FEATURE_SLICE_SIZE);
int groups_per_sub_group = 1;
if (prev_group_leftover < 16)
groups_per_sub_group += ((FEATURE_SLICE_SIZE - prev_group_leftover - 1) / FILTER_OFM_NUM) + 1;
#else
const int group = split_idx;
const int groups_per_sub_group = 1;
#endif // GROUPED
typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) vec_t;
const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
@@ -143,7 +148,7 @@ KERNEL(convolution_bfyx_f16)(
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
const uint output_offset = b * output_b_pitch +
(f_block + output_fs_pad_before) * output_fs_pitch +
(feature_block + output_fs_pad_before) * output_fs_pitch +
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
(x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
@@ -155,54 +160,71 @@
const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
#if BIAS_TERM
uint bias_offset = f_block * FEATURE_SLICE_SIZE;
vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, bias_offset));
#if SLM_DIV_FACTOR == 1
vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
#else
vec_t dst;
if (feature_sub_block == 0) {
dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
} else {
dst = INPUT0_VAL_ZERO;
}
#endif // SLM_DIV_FACTOR == 1
#else
vec_t dst = INPUT0_VAL_ZERO;
#endif // BIAS_TERM
#endif // BIAS_TERM
#if SLM_DIV_FACTOR > 1
__local vec_t partial_summ[WORK_GROUP_SIZE];
#endif
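
Note the asymmetric bias init: only the sub-groups with feature_sub_block == 0 read the bias, the rest start from zero, so the bias is counted exactly once when the partial sums from all SLM_DIV_FACTOR slices are reduced through partial_summ further down.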
#if MULTIPLE_GROUPS_INPUT_PRELOAD
const uint in_split_offset = f_block * input_fs_pitch;
const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
const uint in_split_offset = feature_block * input_fs_pitch;
const uint g = sglid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
const uint ofm_in_group = sglid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
#else
#if GROUPED
for (uint g = group; g < group + groups_per_sub_group; g++) {
const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE);
const uint filter_split_offset = g * FILTER_GROUPS_PITCH;
const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
const uint filter_offset = (feature_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
#else
const uint in_split_offset = 0;
const uint filter_split_offset = 0;
const uint filter_offset = f_block * filter_os_pitch;
const uint filter_offset = feature_block * filter_os_pitch;
#endif // GROUPED
const uint grouped_filter_offset = filter_offset + filter_split_offset;
#endif // MULTIPLE_GROUPS_INPUT_PRELOAD
const uint grouped_input_offset = input_offset + in_split_offset;
for (uint icb = 0; icb < IC_BLOCKS; icb++) {
#if SLM_DIV_FACTOR > 1
for (int icb = feature_sub_block * IC_BLOCKS / SLM_DIV_FACTOR; icb < (feature_sub_block + 1) * IC_BLOCKS / SLM_DIV_FACTOR; icb++) {
#else
for (int icb = 0; icb < IC_BLOCKS; icb++) {
#endif // SLM_DIV_FACTOR > 1
__attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
for (int kh = 0; kh < FILTER_SIZE_Y; kh++) {
if (input_y + kh*DILATION_SIZE_Y < 0 || input_y + kh*DILATION_SIZE_Y >= INPUT0_SIZE_Y)
if (input_y + kh * DILATION_SIZE_Y < 0 || input_y + kh * DILATION_SIZE_Y >= INPUT0_SIZE_Y)
continue;
INPUT_TYPE line_cache[INPUT_LINE_SIZE];
#if INPUT_LEFTOVERS
if ((icb+1)*FEATURE_SLICE_SIZE >= FILTER_IFM_NUM)
if ((icb + 1) * FEATURE_SLICE_SIZE >= FILTER_IFM_NUM)
{
for (int xb = 0; xb < INPUT_LINE_SIZE; xb++)
{
if (icb*FEATURE_SLICE_SIZE + lid >= FILTER_IFM_NUM)
if (icb * FEATURE_SLICE_SIZE + sglid >= FILTER_IFM_NUM)
line_cache[xb] = 0;
else
line_cache[xb] = input[grouped_input_offset +
icb * input_fs_pitch +
kh * DILATION_SIZE_Y * input_y_pitch +
xb * input_x_pitch +
lid];
sglid];
}
}
else
@@ -251,7 +273,7 @@ KERNEL(convolution_bfyx_f16)(
#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
src[i] = line_cache[i];
#else
src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
src[i] = line_cache[kw * DILATION_SIZE_X + STRIDE_SIZE_X * i];
#endif // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
}
#if MULTIPLE_GROUPS_INPUT_PRELOAD
@@ -300,7 +322,7 @@ KERNEL(convolution_bfyx_f16)(
dst = mad(wei0.s6, src6, dst);
dst = mad(wei0.s7, src7, dst);
#else
#error Unsupported input feature size for multiple groups input preload
# error convolution_gpu_bfyx_f16.cl: unsupported input feature size for multiple groups input preload
#endif // FILTER_IFM_NUM
#else
FILTER_TYPE8 wei0 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
@@ -352,13 +374,24 @@
#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
}
#endif // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
#if SLM_DIV_FACTOR > 1
partial_summ[lid1] = dst;
barrier(CLK_LOCAL_MEM_FENCE);
if (feature_sub_block == 0) {
__attribute__((opencl_unroll_hint))
for (int i = 1; i < SLM_DIV_FACTOR; i++)
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
#endif // SLM_DIV_FACTOR > 1
dst = ACTIVATION(dst, ACTIVATION_PARAMS);
typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t;
out_vec_t res;
#if OUTPUT_LEFTOVERS
if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
#if HAS_FUSED_OPS
FUSED_OPS_SCALAR;
@@ -366,8 +399,8 @@ KERNEL(convolution_bfyx_f16)(
#else
res[i] = TO_OUTPUT_TYPE(dst[i]);
#endif
if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
output[output_offset + i * output_x_pitch + lid] = res[i];
if ((feature_block * FEATURE_SLICE_SIZE + sglid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
output[output_offset + i * output_x_pitch + sglid] = res[i];
}
}
}
@@ -391,7 +424,7 @@ KERNEL(convolution_bfyx_f16)(
#elif OUTPUT_X_BLOCK_SIZE == 1
OUTPUT_BLOCK_WRITE(output, output_offset, res);
#else
# error convolution_gpu_bfyx_f16.cl: Unsupported output x block size.
# error convolution_gpu_bfyx_f16.cl: unsupported output x block size
#endif
} else {
for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
@@ -405,6 +438,10 @@ KERNEL(convolution_bfyx_f16)(
}
}
}
#if SLM_DIV_FACTOR > 1
}
#endif
}
#undef AS_INPUT_SRC
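
The core of the new SLM path in this kernel: each of the SLM_DIV_FACTOR sub-groups accumulates over its own slice of IC_BLOCKS, publishes the result to local memory, and after the barrier the first sub-group folds the partials together. A host-side C++ model of that reduction shape, with illustrative names (one scalar standing in for each work-item's vector accumulator):

    #include <cstddef>
    #include <vector>

    // partial[] plays the role of the kernel's partial_summ in SLM.
    float reduce_ic_split(const std::vector<float>& per_ic_block,
                          size_t slm_div_factor) {
        const size_t ic_blocks = per_ic_block.size();
        std::vector<float> partial(slm_div_factor, 0.0f);
        // Each "sub-group" s sums its contiguous slice of the IC blocks,
        // mirroring the loop bounds feature_sub_block * IC_BLOCKS / SLM_DIV_FACTOR.
        for (size_t s = 0; s < slm_div_factor; ++s)
            for (size_t k = s * ic_blocks / slm_div_factor;
                 k < (s + 1) * ic_blocks / slm_div_factor; ++k)
                partial[s] += per_ic_block[k];
        // After the barrier, sub-group 0 adds everyone else's partial to its own.
        float dst = partial[0];
        for (size_t i = 1; i < slm_div_factor; ++i)
            dst += partial[i];
        return dst;
    }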

File: convolution_gpu_bfyx_f16_1x1.cl

@@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -16,16 +16,24 @@
#include "include/unit_type.cl"
#include "include/mmad.cl"
#define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE), \
intel_sub_group_shuffle( \
AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data), \
id))
#if X_BLOCK_SIZE > 1
# define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE), \
intel_sub_group_shuffle( \
AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data), \
id))
#else
# define GET_SRC(data, id) AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
#endif
#define FEATURE_SLICE_SIZE 16
#if X_BLOCK_SIZE > 1
# define UNIT_BLOCK_READ_VEC(ptr, offset) CAT(UNIT_BLOCK_READ, X_BLOCK_SIZE)(ptr, offset)
# define UNIT_BLOCK_WRITE_VEC(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, X_BLOCK_SIZE)(ptr, offset, val)
#endif
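
The CAT-based helpers replace the per-size #if ladder that the write path used before (see the removed UNIT_BLOCK_WRITE8/4/2 chain further down): token pasting picks the right block read/write at compile time. Illustrative expansion, assuming the usual two-level paste that CAT resolves to in the common includes:

    #define CAT_(a, b) a##b
    #define CAT(a, b) CAT_(a, b)       // expands X_BLOCK_SIZE first
    // With X_BLOCK_SIZE defined to 8:
    //   UNIT_BLOCK_READ_VEC(p, off)        -> UNIT_BLOCK_READ8(p, off)
    //   UNIT_BLOCK_WRITE_VEC(p, off, val)  -> UNIT_BLOCK_WRITE8(p, off, val)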
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
KERNEL(convolution_b_fs_yx_fsv16_1x1)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -37,16 +45,21 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
FUSED_OPS_DECLS,
#endif
uint split_idx) {
const int xy = get_global_id(0);
const int f_block = get_group_id(1);
const int b = get_global_id(2);
const int lid = get_sub_group_local_id();
#if X_BLOCK_SIZE > 1
const uint xy = (int)get_global_id(0);
const uint x = (xy * X_BLOCK_SIZE) % OUTPUT_SIZE_X;
const uint y = (xy * X_BLOCK_SIZE) / OUTPUT_SIZE_X;
const int x = (xy * X_BLOCK_SIZE) % OUTPUT_SIZE_X;
const int y = (xy * X_BLOCK_SIZE) / OUTPUT_SIZE_X;
const uint input_x = x;
const uint input_y = y;
#endif
const uint b = (int)get_global_id(2);
const uint sglid = (int)get_sub_group_local_id();
const int input_x = x;
const int input_y = y;
const uint lid1 = (int)get_local_id(1);
const uint feature_per_wg = (int)get_local_size(1) / SLM_DIV_FACTOR;
const uint feature_sub_block = lid1 / feature_per_wg;
const uint feature_block = (int)get_group_id(1);
// Input offset calculations:
const uint input_x_pitch = FEATURE_SLICE_SIZE;
@@ -71,8 +84,8 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
const uint output_offset = b * output_b_pitch +
(f_block + output_fs_pad_before) * output_fs_pitch +
const uint output_offset = b * output_b_pitch +
(feature_block + output_fs_pad_before) * output_fs_pitch +
(OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
(OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
@@ -83,71 +96,91 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
const uint filter_is_pitch = filter_y_pitch * FILTER_SIZE_Y;
const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
const uint filter_offset = f_block * filter_os_pitch;
const uint filter_offset = feature_block * filter_os_pitch;
#if X_BLOCK_SIZE > 1
typedef MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE) vec_t;
#if BIAS_TERM
vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
#else
vec_t dst = UNIT_VAL_ZERO;
typedef UNIT_TYPE vec_t;
#endif
for (uint k = 0; k < IC_BLOCKS; ++k)
#if BIAS_TERM
#if SLM_DIV_FACTOR == 1
vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
#else
vec_t dst;
if (feature_sub_block == 0) {
dst = (vec_t)(UNIT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
} else {
dst = UNIT_VAL_ZERO;
}
#endif // SLM_DIV_FACTOR == 1
#else
vec_t dst = UNIT_VAL_ZERO;
#endif // BIAS_TERM
#if SLM_DIV_FACTOR > 1
__local vec_t partial_summ[WORK_GROUP_SIZE];
for (uint k = feature_sub_block * IC_BLOCKS / SLM_DIV_FACTOR; k < (feature_sub_block + 1) * IC_BLOCKS / SLM_DIV_FACTOR; k++)
{
#else
for (uint k = 0; k < IC_BLOCKS; k++)
{
#endif // SLM_DIV_FACTOR > 1
vec_t src = 0;
#if INPUT_LEFTOVERS
if ((k+1)*FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)
if ((k + 1) * FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)
{
if (k*FEATURE_SLICE_SIZE + lid < INPUT0_FEATURE_NUM)
if (k * FEATURE_SLICE_SIZE + sglid < INPUT0_FEATURE_NUM)
{
#if X_BLOCK_SIZE > 1
__attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
for (int i = 0; i < X_BLOCK_SIZE; i++)
{
const uint xb = (x + i) % INPUT0_SIZE_X;
const uint yb = y + (x + i) / INPUT0_SIZE_X;
const uint input_idx = input_offset + k * input_fs_pitch +
yb * input_y_pitch +
xb * input_x_pitch;
src[i] = input[input_idx + lid];
const uint input_idx = input_offset + k * input_fs_pitch + yb * input_y_pitch + xb * input_x_pitch;
src[i] = input[input_idx + sglid];
}
#else
src = input[input_offset + k * input_fs_pitch + sglid];
#endif // X_BLOCK_SIZE > 1
}
}
else
#endif
#endif // INPUT_LEFTOVERS
{
#if PADDED_INPUT
#if X_BLOCK_SIZE > 1
__attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
for (int i = 0; i < X_BLOCK_SIZE; i++)
{
const uint xb = (x + i) % INPUT0_SIZE_X;
const uint yb = y + (x + i) / INPUT0_SIZE_X;
const uint input_idx = input_offset + k * input_fs_pitch +
yb * input_y_pitch +
xb * input_x_pitch;
const uint input_idx = input_offset + k * input_fs_pitch + yb * input_y_pitch + xb * input_x_pitch;
src[i] = UNIT_BLOCK_READ(input, input_idx);
}
#else
#if X_BLOCK_SIZE == 8
src = UNIT_BLOCK_READ8(input, input_offset + k * input_fs_pitch +
input_y * input_y_pitch +
input_x * input_x_pitch);
#elif X_BLOCK_SIZE == 4
src = UNIT_BLOCK_READ4(input, input_offset + k * input_fs_pitch +
input_y * input_y_pitch +
input_x * input_x_pitch);
#elif X_BLOCK_SIZE == 2
src = UNIT_BLOCK_READ2(input, input_offset + k * input_fs_pitch +
input_y * input_y_pitch +
input_x * input_x_pitch);
#endif
#endif
}
src = UNIT_BLOCK_READ(input, input_offset + k * input_fs_pitch);
#endif // X_BLOCK_SIZE > 1
#else // PADDED_INPUT
#if X_BLOCK_SIZE > 1
src = UNIT_BLOCK_READ_VEC(input, input_offset + k * input_fs_pitch + input_y * input_y_pitch + input_x * input_x_pitch);
#else
src = UNIT_BLOCK_READ(input, input_offset + k * input_fs_pitch);
#endif // X_BLOCK_SIZE > 1
#endif // PADDED_INPUT
}
UNIT_TYPE8 wei0 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch);
UNIT_TYPE8 wei1 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch + 8 * filter_isv_pitch);
const vec_t src0 = GET_SRC(src, 0);
const vec_t src1 = GET_SRC(src, 1);
const vec_t src2 = GET_SRC(src, 2);
@@ -183,29 +216,52 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
dst = mad(wei1.s7, src15, dst);
}
#if SLM_DIV_FACTOR > 1
partial_summ[lid1] = dst;
barrier(CLK_LOCAL_MEM_FENCE);
if (feature_sub_block == 0) {
__attribute__((opencl_unroll_hint))
for (int i = 1; i < SLM_DIV_FACTOR; i++)
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
#endif // SLM_DIV_FACTOR > 1
dst = ACTIVATION(dst, ACTIVATION_PARAMS);
#if OUTPUT_LEFTOVERS
if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM)
if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM)
{
#if X_BLOCK_SIZE > 1
for (int i = 0; i < X_BLOCK_SIZE; i++) {
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y ||
f_block*FEATURE_SLICE_SIZE + lid >= OUTPUT_FEATURE_NUM)
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || feature_block * FEATURE_SLICE_SIZE + sglid >= OUTPUT_FEATURE_NUM)
return;
int xi = (x+i) % OUTPUT_SIZE_X;
int yi = y + ((x+i) / OUTPUT_SIZE_X);
int xi = (x + i) % OUTPUT_SIZE_X;
int yi = y + ((x + i) / OUTPUT_SIZE_X);
#if HAS_FUSED_OPS
FUSED_OPS_SCALAR;
dst[i] = FUSED_OPS_RESULT_SCALAR;
#endif
output[output_offset + yi * output_y_pitch + xi * output_x_pitch + lid] = dst[i];
output[output_offset + yi * output_y_pitch + xi * output_x_pitch + sglid] = dst[i];
}
#else // X_BLOCK_SIZE > 1
if (feature_block * FEATURE_SLICE_SIZE + sglid >= OUTPUT_FEATURE_NUM)
return;
#if HAS_FUSED_OPS
FUSED_OPS_SCALAR_B1;
dst = FUSED_OPS_RESULT_SCALAR_B1;
#endif
output[output_offset + sglid] = dst;
#endif // X_BLOCK_SIZE > 1
}
else
#endif
#endif // OUTPUT_LEFTOVERS
#if X_BLOCK_SIZE > 1
{
#if !PADDED_OUTPUT && !NON_UNIT_FUSED_OP_SPATIAL
if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
@@ -216,20 +272,14 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
FUSED_OPS_VEC;
dst = FUSED_OPS_RESULT_VEC;
#endif
#if X_BLOCK_SIZE == 8
UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
#elif X_BLOCK_SIZE == 4
UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
#elif X_BLOCK_SIZE == 2
UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
#endif
UNIT_BLOCK_WRITE_VEC(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
} else {
for (int i = 0; i < X_BLOCK_SIZE; i++) {
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
return;
int xi = (x+i) % OUTPUT_SIZE_X;
int yi = y + ((x+i) / OUTPUT_SIZE_X);
int xi = (x + i) % OUTPUT_SIZE_X;
int yi = y + ((x + i) / OUTPUT_SIZE_X);
#if HAS_FUSED_OPS
FUSED_OPS_SCALAR;
@@ -240,7 +290,22 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
}
}
}
#else // X_BLOCK_SIZE > 1
{
#if HAS_FUSED_OPS
FUSED_OPS_SCALAR_B1;
dst = FUSED_OPS_RESULT_SCALAR_B1;
#endif
UNIT_BLOCK_WRITE(output, output_offset, dst);
}
#endif // X_BLOCK_SIZE > 1
#if SLM_DIV_FACTOR > 1
}
#endif
}
#undef GET_SRC
#undef FEATURE_SLICE_SIZE
#undef UNIT_BLOCK_READ_VEC
#undef UNIT_BLOCK_WRITE_VEC