[IE CLDNN] Convolution fsv16 improvements several fixes after code review (#3637)
This commit is contained in:
parent
d4f774f3c7
commit
72c3e0e4a6
@ -19,16 +19,13 @@
|
||||
|
||||
namespace kernel_selector {
|
||||
|
||||
static const size_t sub_group_size = 16;
|
||||
static const size_t feature_block_size = 16;
|
||||
|
||||
ConvolutionKernel_b_fs_yx_fsv16::ConvolutionKernel_b_fs_yx_fsv16() : ConvolutionKernelBase("convolution_gpu_bfyx_f16") {
|
||||
std::vector<size_t> outputBlockWidths = {2, 4, 8};
|
||||
std::vector<size_t> outputBlockWidths = { 2, 4, 8 };
|
||||
std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
|
||||
|
||||
for (auto w : outputBlockWidths) {
|
||||
for (auto exeMode : executionModes) {
|
||||
autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
|
||||
autoTuneOptions.emplace_back(AutoTuneOption{ w, exeMode });
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -39,7 +36,7 @@ ConvolutionKernel_b_fs_yx_fsv16::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16:
|
||||
auto x = cp.output.X().v;
|
||||
auto f = cp.output.Feature().v;
|
||||
if (x * f <= 256) {
|
||||
if ( x <= 8 || x * f <= 128)
|
||||
if (x <= 8 || x * f <= 128)
|
||||
return { 2, DEFAULT };
|
||||
else
|
||||
return { 4, DEFAULT };
|
||||
@ -55,6 +52,42 @@ ConvolutionKernel_b_fs_yx_fsv16::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16:
|
||||
}
|
||||
}
|
||||
|
||||
float ConvolutionKernel_b_fs_yx_fsv16::EstimateOccupancy(const convolution_params& params,
|
||||
const ConvolutionTuningData& tuning_data) const {
|
||||
auto tuneOptions = GetAutoTuneOptions(params, 0);
|
||||
auto blockWidth = tuneOptions.blockWidth;
|
||||
|
||||
auto x = params.output.X().v;
|
||||
auto y = params.output.Y().v;
|
||||
auto f = params.output.Feature().v;
|
||||
auto b = params.output.Batch().v;
|
||||
|
||||
auto threads = CeilDiv(x, blockWidth) * y * CeilDiv(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor * b;
|
||||
|
||||
return static_cast<float>(threads) / static_cast<float>(params.engineInfo.maxThreadsPerDevice);
|
||||
}
|
||||
|
||||
ConvolutionKernel_b_fs_yx_fsv16::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16::GetTuningParams(const convolution_params& params) const {
|
||||
ConvolutionTuningData tuning_data;
|
||||
|
||||
const auto& input = params.inputs[0];
|
||||
|
||||
size_t ic_blocks = CeilDiv(input.Feature().v / params.groups, tuning_data.feature_block_size);
|
||||
|
||||
size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;
|
||||
|
||||
bool slm_exception = params.output.X().v == 3 && params.output.Y().v == 3 && params.output.ElementSize() == 4 && params.output.Feature().v <= 512;
|
||||
|
||||
if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.bIMADSupport && !slm_exception)
|
||||
while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
|
||||
EstimateOccupancy(params, tuning_data) < 4.0)
|
||||
tuning_data.slm_div_factor *= 2;
|
||||
|
||||
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
|
||||
|
||||
return tuning_data;
|
||||
}
|
||||
|
||||
ParamsKey ConvolutionKernel_b_fs_yx_fsv16::GetSupportedKey() const {
|
||||
ParamsKey k;
|
||||
k.EnableInputDataType(Datatype::F16);
|
||||
@ -92,6 +125,8 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16::SetDefault(
|
||||
int autoTuneIndex) const {
|
||||
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
|
||||
|
||||
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||
|
||||
const auto& out = params.output;
|
||||
|
||||
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
|
||||
@ -103,11 +138,11 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16::SetDefault(
|
||||
auto b = out.Batch().v;
|
||||
|
||||
dispatchData.gws[0] = CeilDiv(x, autoTune.blockWidth) * y;
|
||||
dispatchData.gws[1] = Align(f, sub_group_size);
|
||||
dispatchData.gws[1] = Align(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor;
|
||||
dispatchData.gws[2] = b;
|
||||
|
||||
dispatchData.lws[0] = 1;
|
||||
dispatchData.lws[1] = sub_group_size;
|
||||
dispatchData.lws[1] = tuning_data.work_group_size;
|
||||
dispatchData.lws[2] = 1;
|
||||
|
||||
return dispatchData;
|
||||
@ -126,27 +161,29 @@ bool ConvolutionKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_p
|
||||
|
||||
const auto& params = static_cast<const convolution_params&>(p);
|
||||
|
||||
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||
|
||||
const auto& input = params.inputs[0];
|
||||
const auto& output = params.output;
|
||||
|
||||
if (params.groups > 1) {
|
||||
auto outFeaturesPerGroup = output.Feature().v / params.groups;
|
||||
auto inFeaturesPerGroup = input.Feature().v / params.groups;
|
||||
auto multipleGroupsInputPreload = (feature_block_size % outFeaturesPerGroup == 0) &&
|
||||
(feature_block_size % inFeaturesPerGroup == 0) &&
|
||||
(feature_block_size / outFeaturesPerGroup > 1) &&
|
||||
(feature_block_size / inFeaturesPerGroup > 1) &&
|
||||
auto multipleGroupsInputPreload = (tuning_data.feature_block_size % outFeaturesPerGroup == 0) &&
|
||||
(tuning_data.feature_block_size % inFeaturesPerGroup == 0) &&
|
||||
(tuning_data.feature_block_size / outFeaturesPerGroup > 1) &&
|
||||
(tuning_data.feature_block_size / inFeaturesPerGroup > 1) &&
|
||||
(outFeaturesPerGroup != 1) &&
|
||||
(inFeaturesPerGroup != 1);
|
||||
auto grouped = inFeaturesPerGroup % sub_group_size == 0 &&
|
||||
(outFeaturesPerGroup % sub_group_size == 0 || sub_group_size % outFeaturesPerGroup == 0);
|
||||
auto grouped = inFeaturesPerGroup % tuning_data.sub_group_size == 0 &&
|
||||
(outFeaturesPerGroup % tuning_data.sub_group_size == 0 || tuning_data.sub_group_size % outFeaturesPerGroup == 0);
|
||||
|
||||
if (!multipleGroupsInputPreload && !grouped)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that padding before features doesn't miss-align the blocks
|
||||
if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0)
|
||||
if (input.Feature().pad.before % tuning_data.feature_block_size != 0 || output.Feature().pad.before % tuning_data.feature_block_size != 0)
|
||||
return false;
|
||||
|
||||
if (!params.bias.empty() && params.bias[0].GetDType() != input.GetDType())
|
||||
@ -161,11 +198,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
|
||||
auto output = params.output;
|
||||
auto jit = Parent::GetJitConstants(params, dispatchData);
|
||||
|
||||
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||
|
||||
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
|
||||
if (!params.fused_ops.empty()) {
|
||||
auto input_dt = GetActivationType(params);
|
||||
FusedOpsConfiguration conf_vec = { "_VEC",
|
||||
{"b", "(f_block*16)", "y", "x"},
|
||||
{"b", "(feature_block * 16)", "y", "x"},
|
||||
"dst",
|
||||
input_dt,
|
||||
blockWidth,
|
||||
@ -174,7 +213,7 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
|
||||
IndexType::TENSOR_COORD,
|
||||
Tensor::DataChannelName::X };
|
||||
FusedOpsConfiguration conf_scalar = { "_SCALAR",
|
||||
{"b", "(f_block*16)", "y", "(x+i)"},
|
||||
{"b", "(feature_block * 16)", "y", "(x + i)"},
|
||||
"dst[i]",
|
||||
input_dt,
|
||||
1,
|
||||
@ -190,23 +229,25 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
|
||||
|
||||
auto outFeaturesPerGroup = output.Feature().v / params.groups;
|
||||
auto inFeaturesPerGroup = input.Feature().v / params.groups;
|
||||
auto multipleGroupsInputPreload = (feature_block_size % outFeaturesPerGroup == 0) &&
|
||||
(feature_block_size % inFeaturesPerGroup == 0) &&
|
||||
(feature_block_size / outFeaturesPerGroup > 1) &&
|
||||
(feature_block_size / inFeaturesPerGroup > 1);
|
||||
auto multipleGroupsInputPreload = (tuning_data.feature_block_size % outFeaturesPerGroup == 0) &&
|
||||
(tuning_data.feature_block_size % inFeaturesPerGroup == 0) &&
|
||||
(tuning_data.feature_block_size / outFeaturesPerGroup > 1) &&
|
||||
(tuning_data.feature_block_size / inFeaturesPerGroup > 1);
|
||||
|
||||
if (multipleGroupsInputPreload)
|
||||
jit.AddConstant(MakeJitConstant("MULTIPLE_GROUPS_INPUT_PRELOAD", 1));
|
||||
|
||||
jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
|
||||
jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
|
||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
|
||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
|
||||
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));
|
||||
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(inFeaturesPerGroup, feature_block_size)));
|
||||
if (params.output.Feature().v % feature_block_size != 0) {
|
||||
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
|
||||
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
|
||||
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(inFeaturesPerGroup, tuning_data.feature_block_size)));
|
||||
if (params.output.Feature().v % tuning_data.feature_block_size != 0) {
|
||||
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
|
||||
}
|
||||
if (inFeaturesPerGroup % feature_block_size != 0 && !multipleGroupsInputPreload) {
|
||||
if (inFeaturesPerGroup % tuning_data.feature_block_size != 0 && !multipleGroupsInputPreload) {
|
||||
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@ -58,7 +58,16 @@ private:
|
||||
std::string exeMode;
|
||||
};
|
||||
|
||||
struct ConvolutionTuningData {
|
||||
const size_t sub_group_size = 16;
|
||||
const size_t feature_block_size = 16;
|
||||
size_t slm_div_factor = 1;
|
||||
size_t work_group_size = 1;
|
||||
};
|
||||
|
||||
std::vector<AutoTuneOption> autoTuneOptions;
|
||||
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
|
||||
ConvolutionTuningData GetTuningParams(const convolution_params& params) const;
|
||||
float EstimateOccupancy(const convolution_params& params, const ConvolutionTuningData& tuning_data) const;
|
||||
};
|
||||
} // namespace kernel_selector
|
||||
|
@ -19,11 +19,9 @@
|
||||
#include <string>
|
||||
|
||||
namespace kernel_selector {
|
||||
static const size_t sub_group_size = 16;
|
||||
static const size_t feature_block_size = 16;
|
||||
|
||||
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : ConvolutionKernelBase("convolution_gpu_bfyx_f16_1x1") {
|
||||
std::vector<size_t> outputBlockWidths = {2, 4, 8};
|
||||
std::vector<size_t> outputBlockWidths = { 1, 2, 4, 8 };
|
||||
std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
|
||||
|
||||
for (auto w : outputBlockWidths) {
|
||||
@ -36,10 +34,15 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : Con
|
||||
ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params,
|
||||
int /*autoTuneIndex*/) const {
|
||||
const convolution_params& cp = static_cast<const convolution_params&>(params);
|
||||
|
||||
auto x = cp.output.X().v;
|
||||
auto y = cp.output.Y().v;
|
||||
auto f = cp.output.Feature().v;
|
||||
if (x * f <= 256) {
|
||||
if ( x < 8 || x * f <= 128)
|
||||
|
||||
if (x == 1 && y == 1) {
|
||||
return { 1, DEFAULT };
|
||||
} else if (x * f <= 256) {
|
||||
if (x < 8 || x * f <= 128)
|
||||
return { 2, DEFAULT };
|
||||
else
|
||||
return { 4, DEFAULT };
|
||||
@ -50,6 +53,40 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fs
|
||||
}
|
||||
}
|
||||
|
||||
float ConvolutionKernel_b_fs_yx_fsv16_1x1::EstimateOccupancy(const convolution_params& params,
|
||||
const ConvolutionTuningData& tuning_data) const {
|
||||
auto tuneOptions = GetAutoTuneOptions(params, 0);
|
||||
auto blockWidth = tuneOptions.blockWidth;
|
||||
|
||||
auto x = params.output.X().v;
|
||||
auto y = params.output.Y().v;
|
||||
auto f = params.output.Feature().v;
|
||||
auto b = params.output.Batch().v;
|
||||
|
||||
auto threads = CeilDiv(x * y, blockWidth) * CeilDiv(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor * b;
|
||||
|
||||
return static_cast<float>(threads) / static_cast<float>(params.engineInfo.maxThreadsPerDevice);
|
||||
}
|
||||
|
||||
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetTuningParams(const convolution_params& params) const {
|
||||
ConvolutionTuningData tuning_data;
|
||||
|
||||
const auto& input = params.inputs[0];
|
||||
|
||||
size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size);
|
||||
|
||||
size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;
|
||||
bool block_size_one_is_better = params.output.X().v == 1 && params.output.Y().v == 1 && input.Feature().v >= 2048;
|
||||
|
||||
if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.bIMADSupport && !block_size_one_is_better)
|
||||
while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
|
||||
EstimateOccupancy(params, tuning_data) < 4.0)
|
||||
tuning_data.slm_div_factor *= 2;
|
||||
|
||||
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
|
||||
|
||||
return tuning_data;
|
||||
}
|
||||
|
||||
ParamsKey ConvolutionKernel_b_fs_yx_fsv16_1x1::GetSupportedKey() const {
|
||||
ParamsKey k;
|
||||
@ -75,21 +112,24 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
|
||||
int autoTuneIndex) const {
|
||||
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
|
||||
|
||||
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||
|
||||
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
|
||||
dispatchData.cldnnStyle.blockWidth = autoTune.blockWidth;
|
||||
|
||||
const auto& out = params.output;
|
||||
|
||||
auto x = out.X().v;
|
||||
auto y = out.Y().v;
|
||||
auto f = out.Feature().v;
|
||||
auto b = out.Batch().v;
|
||||
|
||||
dispatchData.gws[0] = CeilDiv(x * y, autoTune.blockWidth);
|
||||
dispatchData.gws[1] = Align(f, feature_block_size);
|
||||
dispatchData.gws[0] = x == 1 && y == 1 ? 1 : CeilDiv(x * y, autoTune.blockWidth);
|
||||
dispatchData.gws[1] = Align(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor;
|
||||
dispatchData.gws[2] = b;
|
||||
|
||||
dispatchData.lws[0] = 1;
|
||||
dispatchData.lws[1] = sub_group_size;
|
||||
dispatchData.lws[1] = tuning_data.work_group_size;
|
||||
dispatchData.lws[2] = 1;
|
||||
|
||||
return dispatchData;
|
||||
@ -124,14 +164,16 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p, const option
|
||||
|
||||
const auto& params = static_cast<const convolution_params&>(p);
|
||||
|
||||
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||
|
||||
const auto& input = params.inputs[0];
|
||||
const auto& output = params.output;
|
||||
|
||||
const bool bOutputSizes =
|
||||
output.X().v != input.X().v || output.Y().v != input.Y().v || output.Feature().v % 16 != 0;
|
||||
const bool bOutputSizes = output.X().v != input.X().v || output.Y().v != input.Y().v || output.Feature().v % 16 != 0;
|
||||
const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1;
|
||||
const bool bStride = params.stride.x != 1 || params.stride.y != 1;
|
||||
const bool bPadding = input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0;
|
||||
const bool bPadding = input.Feature().pad.before % tuning_data.feature_block_size != 0 ||
|
||||
output.Feature().pad.before % tuning_data.feature_block_size != 0;
|
||||
|
||||
if (bOutputSizes || bFilterSize || bStride || bPadding) {
|
||||
return false;
|
||||
@ -144,11 +186,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
|
||||
const DispatchData& dispatchData) const {
|
||||
auto jit = Parent::GetJitConstants(params, dispatchData);
|
||||
|
||||
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||
|
||||
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
|
||||
if (!params.fused_ops.empty()) {
|
||||
auto input_dt = GetUnitType(params);
|
||||
FusedOpsConfiguration conf_vec = { "_VEC",
|
||||
{"b", "(f_block*16)", "y", "x"},
|
||||
{"b", "(feature_block * 16)", "y", "x"},
|
||||
"dst",
|
||||
input_dt,
|
||||
blockWidth,
|
||||
@ -156,8 +200,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
|
||||
BoundaryCheck::ENABLED,
|
||||
IndexType::TENSOR_COORD,
|
||||
Tensor::DataChannelName::X };
|
||||
FusedOpsConfiguration conf_scalar = { "_SCALAR",
|
||||
{"b", "(f_block*16)", "yi", "xi"},
|
||||
FusedOpsConfiguration conf_scalar1 = { "_SCALAR",
|
||||
{"b", "(feature_block * 16)", "yi", "xi"},
|
||||
"dst[i]",
|
||||
input_dt,
|
||||
1,
|
||||
@ -165,10 +209,19 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
|
||||
BoundaryCheck::ENABLED,
|
||||
IndexType::TENSOR_COORD,
|
||||
Tensor::DataChannelName::X };
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
|
||||
FusedOpsConfiguration conf_scalar2 = { "_SCALAR_B1",
|
||||
{"b", "(feature_block * 16)", "0", "0"},
|
||||
"dst",
|
||||
input_dt,
|
||||
1,
|
||||
LoadType::LT_ALIGNED_READ,
|
||||
BoundaryCheck::ENABLED,
|
||||
IndexType::TENSOR_COORD,
|
||||
Tensor::DataChannelName::X };
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, { conf_vec, conf_scalar1, conf_scalar2 }));
|
||||
}
|
||||
|
||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
|
||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
|
||||
jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
|
||||
|
||||
bool padded_output = params.output.X().pad.Total() != 0;
|
||||
@ -194,11 +247,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
|
||||
|
||||
jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
|
||||
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.output.X().v, blockWidth)));
|
||||
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, feature_block_size)));
|
||||
if (params.output.Feature().v % feature_block_size != 0) {
|
||||
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
|
||||
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
|
||||
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size)));
|
||||
if (params.output.Feature().v % tuning_data.feature_block_size != 0) {
|
||||
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
|
||||
}
|
||||
if (params.inputs[0].Feature().v % feature_block_size != 0) {
|
||||
if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) {
|
||||
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@ -51,7 +51,16 @@ protected:
|
||||
std::string exeMode;
|
||||
};
|
||||
|
||||
struct ConvolutionTuningData {
|
||||
const size_t sub_group_size = 16;
|
||||
const size_t feature_block_size = 16;
|
||||
size_t slm_div_factor = 1;
|
||||
size_t work_group_size = 1;
|
||||
};
|
||||
|
||||
std::vector<AutoTuneOption> autoTuneOptions;
|
||||
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
|
||||
ConvolutionTuningData GetTuningParams(const convolution_params& params) const;
|
||||
float EstimateOccupancy(const convolution_params& params, const ConvolutionTuningData& tuning_data) const;
|
||||
};
|
||||
} // namespace kernel_selector
|
||||
|
@ -40,7 +40,7 @@
|
||||
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read4((__global uint*)(ptr) + (offset)))
|
||||
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16.cl - unsupported input type.
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported input type
|
||||
#endif
|
||||
|
||||
#if FILTER_TYPE_SIZE == 2
|
||||
@ -48,7 +48,7 @@
|
||||
#elif FILTER_TYPE_SIZE == 4
|
||||
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16.cl - unsupported filter type.
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported filter type
|
||||
#endif
|
||||
|
||||
#if OUTPUT_TYPE_SIZE == 1
|
||||
@ -67,7 +67,7 @@
|
||||
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
|
||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16.cl - unsupported output type.
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported output type
|
||||
#endif
|
||||
|
||||
#if INPUT0_TYPE_SIZE == 2
|
||||
@ -77,12 +77,14 @@
|
||||
#else
|
||||
# define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
|
||||
#endif
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
|
||||
#define FILTER_OFM_NUM_ALIGNED (((FILTER_OFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
|
||||
#define FILTER_IFM_NUM_ALIGNED (((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
|
||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
|
||||
KERNEL(convolution_bfyx_f16)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
@ -94,26 +96,29 @@ KERNEL(convolution_bfyx_f16)(
|
||||
FUSED_OPS_DECLS,
|
||||
#endif
|
||||
uint split_idx) {
|
||||
#if GROUPED
|
||||
const int f_block = get_group_id(1);
|
||||
const int group = (f_block * FEATURE_SLICE_SIZE) / FILTER_OFM_NUM;
|
||||
const int prev_group_leftover = (FILTER_OFM_NUM * (group + 1)) - (f_block * FEATURE_SLICE_SIZE);
|
||||
int groups_per_sub_group = 1;
|
||||
if (prev_group_leftover < 16)
|
||||
groups_per_sub_group += ((FEATURE_SLICE_SIZE - prev_group_leftover - 1) / FILTER_OFM_NUM) + 1;
|
||||
#else
|
||||
const int f_block = get_group_id(1);
|
||||
const int group = split_idx;
|
||||
const int groups_per_sub_group = 1;
|
||||
#endif // GROUPED
|
||||
|
||||
const int lid = get_sub_group_local_id();
|
||||
const int sglid = get_sub_group_local_id();
|
||||
const int b = (uint)get_global_id(2);
|
||||
|
||||
const int xy = get_global_id(0);
|
||||
const int x = (xy % X_BLOCKS) * OUTPUT_X_BLOCK_SIZE;
|
||||
const int y = (xy / X_BLOCKS);
|
||||
|
||||
const int lid1 = (int)get_local_id(1);
|
||||
const int feature_per_wg = (int)get_local_size(1) / SLM_DIV_FACTOR;
|
||||
const int feature_sub_block = lid1 / feature_per_wg;
|
||||
const int feature_block = (int)get_group_id(1);
|
||||
|
||||
#if GROUPED
|
||||
const int group = (feature_block * FEATURE_SLICE_SIZE) / FILTER_OFM_NUM;
|
||||
const int prev_group_leftover = (FILTER_OFM_NUM * (group + 1)) - (feature_block * FEATURE_SLICE_SIZE);
|
||||
int groups_per_sub_group = 1;
|
||||
if (prev_group_leftover < 16)
|
||||
groups_per_sub_group += ((FEATURE_SLICE_SIZE - prev_group_leftover - 1) / FILTER_OFM_NUM) + 1;
|
||||
#else
|
||||
const int group = split_idx;
|
||||
const int groups_per_sub_group = 1;
|
||||
#endif // GROUPED
|
||||
|
||||
typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) vec_t;
|
||||
|
||||
const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
|
||||
@ -143,7 +148,7 @@ KERNEL(convolution_bfyx_f16)(
|
||||
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
|
||||
|
||||
const uint output_offset = b * output_b_pitch +
|
||||
(f_block + output_fs_pad_before) * output_fs_pitch +
|
||||
(feature_block + output_fs_pad_before) * output_fs_pitch +
|
||||
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
||||
(x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
|
||||
|
||||
@ -155,54 +160,71 @@ KERNEL(convolution_bfyx_f16)(
|
||||
const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
|
||||
|
||||
#if BIAS_TERM
|
||||
uint bias_offset = f_block * FEATURE_SLICE_SIZE;
|
||||
vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, bias_offset));
|
||||
#if SLM_DIV_FACTOR == 1
|
||||
vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||
#else
|
||||
vec_t dst;
|
||||
|
||||
if (feature_sub_block == 0) {
|
||||
dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||
} else {
|
||||
dst = INPUT0_VAL_ZERO;
|
||||
}
|
||||
#endif // SLM_DIV_FACTOR == 1
|
||||
#else
|
||||
vec_t dst = INPUT0_VAL_ZERO;
|
||||
#endif // BIAS_TERM
|
||||
#endif // BIAS_TERM
|
||||
|
||||
#if SLM_DIV_FACTOR > 1
|
||||
__local vec_t partial_summ[WORK_GROUP_SIZE];
|
||||
#endif
|
||||
|
||||
#if MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||
const uint in_split_offset = f_block * input_fs_pitch;
|
||||
const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
|
||||
const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
|
||||
const uint in_split_offset = feature_block * input_fs_pitch;
|
||||
const uint g = sglid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
|
||||
const uint ofm_in_group = sglid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
|
||||
const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
|
||||
#else
|
||||
#if GROUPED
|
||||
for (uint g = group; g < group + groups_per_sub_group; g++) {
|
||||
const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE);
|
||||
const uint filter_split_offset = g * FILTER_GROUPS_PITCH;
|
||||
const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
|
||||
const uint filter_offset = (feature_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
|
||||
#else
|
||||
const uint in_split_offset = 0;
|
||||
const uint filter_split_offset = 0;
|
||||
const uint filter_offset = f_block * filter_os_pitch;
|
||||
const uint filter_offset = feature_block * filter_os_pitch;
|
||||
#endif // GROUPED
|
||||
const uint grouped_filter_offset = filter_offset + filter_split_offset;
|
||||
#endif // MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||
|
||||
const uint grouped_input_offset = input_offset + in_split_offset;
|
||||
|
||||
for (uint icb = 0; icb < IC_BLOCKS; icb++) {
|
||||
#if SLM_DIV_FACTOR > 1
|
||||
for (int icb = feature_sub_block * IC_BLOCKS / SLM_DIV_FACTOR; icb < (feature_sub_block + 1) * IC_BLOCKS / SLM_DIV_FACTOR; icb++) {
|
||||
#else
|
||||
for (int icb = 0; icb < IC_BLOCKS; icb++) {
|
||||
#endif // SLM_DIV_FACTOR > 1
|
||||
__attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
|
||||
for (int kh = 0; kh < FILTER_SIZE_Y; kh++) {
|
||||
if (input_y + kh*DILATION_SIZE_Y < 0 || input_y + kh*DILATION_SIZE_Y >= INPUT0_SIZE_Y)
|
||||
if (input_y + kh * DILATION_SIZE_Y < 0 || input_y + kh * DILATION_SIZE_Y >= INPUT0_SIZE_Y)
|
||||
continue;
|
||||
|
||||
INPUT_TYPE line_cache[INPUT_LINE_SIZE];
|
||||
|
||||
#if INPUT_LEFTOVERS
|
||||
if ((icb+1)*FEATURE_SLICE_SIZE >= FILTER_IFM_NUM)
|
||||
if ((icb + 1) * FEATURE_SLICE_SIZE >= FILTER_IFM_NUM)
|
||||
{
|
||||
for (int xb = 0; xb < INPUT_LINE_SIZE; xb++)
|
||||
{
|
||||
if (icb*FEATURE_SLICE_SIZE + lid >= FILTER_IFM_NUM)
|
||||
if (icb * FEATURE_SLICE_SIZE + sglid >= FILTER_IFM_NUM)
|
||||
line_cache[xb] = 0;
|
||||
else
|
||||
line_cache[xb] = input[grouped_input_offset +
|
||||
icb * input_fs_pitch +
|
||||
kh * DILATION_SIZE_Y * input_y_pitch +
|
||||
xb * input_x_pitch +
|
||||
lid];
|
||||
sglid];
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -251,7 +273,7 @@ KERNEL(convolution_bfyx_f16)(
|
||||
#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
|
||||
src[i] = line_cache[i];
|
||||
#else
|
||||
src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
|
||||
src[i] = line_cache[kw * DILATION_SIZE_X + STRIDE_SIZE_X * i];
|
||||
#endif // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
|
||||
}
|
||||
#if MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||
@ -300,7 +322,7 @@ KERNEL(convolution_bfyx_f16)(
|
||||
dst = mad(wei0.s6, src6, dst);
|
||||
dst = mad(wei0.s7, src7, dst);
|
||||
#else
|
||||
#error Unsupported input feature size for multiple groups input preload
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported input feature size for multiple groups input preload
|
||||
#endif // FILTER_IFM_NUM
|
||||
#else
|
||||
FILTER_TYPE8 wei0 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
|
||||
@ -352,13 +374,24 @@ KERNEL(convolution_bfyx_f16)(
|
||||
#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||
}
|
||||
#endif // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||
|
||||
#if SLM_DIV_FACTOR > 1
|
||||
partial_summ[lid1] = dst;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (feature_sub_block == 0) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (int i = 1; i < SLM_DIV_FACTOR; i++)
|
||||
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
|
||||
#endif // SLM_DIV_FACTOR > 1
|
||||
|
||||
dst = ACTIVATION(dst, ACTIVATION_PARAMS);
|
||||
|
||||
typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t;
|
||||
out_vec_t res;
|
||||
|
||||
#if OUTPUT_LEFTOVERS
|
||||
if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
|
||||
if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
|
||||
for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
|
||||
#if HAS_FUSED_OPS
|
||||
FUSED_OPS_SCALAR;
|
||||
@ -366,8 +399,8 @@ KERNEL(convolution_bfyx_f16)(
|
||||
#else
|
||||
res[i] = TO_OUTPUT_TYPE(dst[i]);
|
||||
#endif
|
||||
if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
|
||||
output[output_offset + i * output_x_pitch + lid] = res[i];
|
||||
if ((feature_block * FEATURE_SLICE_SIZE + sglid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
|
||||
output[output_offset + i * output_x_pitch + sglid] = res[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -391,7 +424,7 @@ KERNEL(convolution_bfyx_f16)(
|
||||
#elif OUTPUT_X_BLOCK_SIZE == 1
|
||||
OUTPUT_BLOCK_WRITE(output, output_offset, res);
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16.cl: Unsupported output x block size.
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported output x block size
|
||||
#endif
|
||||
} else {
|
||||
for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
|
||||
@ -405,6 +438,10 @@ KERNEL(convolution_bfyx_f16)(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if SLM_DIV_FACTOR > 1
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef AS_INPUT_SRC
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@ -16,16 +16,24 @@
|
||||
#include "include/unit_type.cl"
|
||||
#include "include/mmad.cl"
|
||||
|
||||
#define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE), \
|
||||
intel_sub_group_shuffle( \
|
||||
AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data), \
|
||||
id))
|
||||
#if X_BLOCK_SIZE > 1
|
||||
# define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE), \
|
||||
intel_sub_group_shuffle( \
|
||||
AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data), \
|
||||
id))
|
||||
#else
|
||||
# define GET_SRC(data, id) AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
|
||||
#endif
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
|
||||
#if X_BLOCK_SIZE > 1
|
||||
# define UNIT_BLOCK_READ_VEC(ptr, offset) CAT(UNIT_BLOCK_READ, X_BLOCK_SIZE)(ptr, offset)
|
||||
# define UNIT_BLOCK_WRITE_VEC(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, X_BLOCK_SIZE)(ptr, offset, val)
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
|
||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
|
||||
KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
@ -37,16 +45,21 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||
FUSED_OPS_DECLS,
|
||||
#endif
|
||||
uint split_idx) {
|
||||
const int xy = get_global_id(0);
|
||||
const int f_block = get_group_id(1);
|
||||
const int b = get_global_id(2);
|
||||
const int lid = get_sub_group_local_id();
|
||||
#if X_BLOCK_SIZE > 1
|
||||
const uint xy = (int)get_global_id(0);
|
||||
const uint x = (xy * X_BLOCK_SIZE) % OUTPUT_SIZE_X;
|
||||
const uint y = (xy * X_BLOCK_SIZE) / OUTPUT_SIZE_X;
|
||||
|
||||
const int x = (xy * X_BLOCK_SIZE) % OUTPUT_SIZE_X;
|
||||
const int y = (xy * X_BLOCK_SIZE) / OUTPUT_SIZE_X;
|
||||
const uint input_x = x;
|
||||
const uint input_y = y;
|
||||
#endif
|
||||
const uint b = (int)get_global_id(2);
|
||||
const uint sglid = (int)get_sub_group_local_id();
|
||||
|
||||
const int input_x = x;
|
||||
const int input_y = y;
|
||||
const uint lid1 = (int)get_local_id(1);
|
||||
const uint feature_per_wg = (int)get_local_size(1) / SLM_DIV_FACTOR;
|
||||
const uint feature_sub_block = lid1 / feature_per_wg;
|
||||
const uint feature_block = (int)get_group_id(1);
|
||||
|
||||
// Input offset calculations:
|
||||
const uint input_x_pitch = FEATURE_SLICE_SIZE;
|
||||
@ -71,8 +84,8 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||
|
||||
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
|
||||
|
||||
const uint output_offset = b * output_b_pitch +
|
||||
(f_block + output_fs_pad_before) * output_fs_pitch +
|
||||
const uint output_offset = b * output_b_pitch +
|
||||
(feature_block + output_fs_pad_before) * output_fs_pitch +
|
||||
(OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
||||
(OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
|
||||
|
||||
@ -83,71 +96,91 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||
const uint filter_is_pitch = filter_y_pitch * FILTER_SIZE_Y;
|
||||
const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
|
||||
|
||||
const uint filter_offset = f_block * filter_os_pitch;
|
||||
const uint filter_offset = feature_block * filter_os_pitch;
|
||||
|
||||
#if X_BLOCK_SIZE > 1
|
||||
typedef MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE) vec_t;
|
||||
|
||||
|
||||
#if BIAS_TERM
|
||||
vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
|
||||
#else
|
||||
vec_t dst = UNIT_VAL_ZERO;
|
||||
typedef UNIT_TYPE vec_t;
|
||||
#endif
|
||||
|
||||
for (uint k = 0; k < IC_BLOCKS; ++k)
|
||||
#if BIAS_TERM
|
||||
#if SLM_DIV_FACTOR == 1
|
||||
vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||
#else
|
||||
vec_t dst;
|
||||
|
||||
if (feature_sub_block == 0) {
|
||||
dst = (vec_t)(UNIT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||
} else {
|
||||
dst = UNIT_VAL_ZERO;
|
||||
}
|
||||
#endif // SLM_DIV_FACTOR == 1
|
||||
#else
|
||||
vec_t dst = UNIT_VAL_ZERO;
|
||||
#endif // BIAS_TERM
|
||||
|
||||
#if SLM_DIV_FACTOR > 1
|
||||
__local vec_t partial_summ[WORK_GROUP_SIZE];
|
||||
|
||||
for (uint k = feature_sub_block * IC_BLOCKS / SLM_DIV_FACTOR; k < (feature_sub_block + 1) * IC_BLOCKS / SLM_DIV_FACTOR; k++)
|
||||
{
|
||||
#else
|
||||
for (uint k = 0; k < IC_BLOCKS; k++)
|
||||
{
|
||||
#endif // SLM_DIV_FACTOR > 1
|
||||
vec_t src = 0;
|
||||
#if INPUT_LEFTOVERS
|
||||
if ((k+1)*FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)
|
||||
if ((k + 1) * FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)
|
||||
{
|
||||
if (k*FEATURE_SLICE_SIZE + lid < INPUT0_FEATURE_NUM)
|
||||
if (k * FEATURE_SLICE_SIZE + sglid < INPUT0_FEATURE_NUM)
|
||||
{
|
||||
#if X_BLOCK_SIZE > 1
|
||||
__attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
|
||||
for (int i = 0; i < X_BLOCK_SIZE; i++)
|
||||
{
|
||||
const uint xb = (x + i) % INPUT0_SIZE_X;
|
||||
const uint yb = y + (x + i) / INPUT0_SIZE_X;
|
||||
const uint input_idx = input_offset + k * input_fs_pitch +
|
||||
yb * input_y_pitch +
|
||||
xb * input_x_pitch;
|
||||
src[i] = input[input_idx + lid];
|
||||
const uint input_idx = input_offset + k * input_fs_pitch + yb * input_y_pitch + xb * input_x_pitch;
|
||||
|
||||
src[i] = input[input_idx + sglid];
|
||||
}
|
||||
#else
|
||||
src = input[input_offset + k * input_fs_pitch + sglid];
|
||||
#endif // X_BLOCK_SIZE > 1
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#endif // INPUT_LEFTOVERS
|
||||
{
|
||||
#if PADDED_INPUT
|
||||
#if X_BLOCK_SIZE > 1
|
||||
__attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
|
||||
for (int i = 0; i < X_BLOCK_SIZE; i++)
|
||||
{
|
||||
const uint xb = (x + i) % INPUT0_SIZE_X;
|
||||
const uint yb = y + (x + i) / INPUT0_SIZE_X;
|
||||
const uint input_idx = input_offset + k * input_fs_pitch +
|
||||
yb * input_y_pitch +
|
||||
xb * input_x_pitch;
|
||||
const uint input_idx = input_offset + k * input_fs_pitch + yb * input_y_pitch + xb * input_x_pitch;
|
||||
|
||||
src[i] = UNIT_BLOCK_READ(input, input_idx);
|
||||
}
|
||||
#else
|
||||
#if X_BLOCK_SIZE == 8
|
||||
src = UNIT_BLOCK_READ8(input, input_offset + k * input_fs_pitch +
|
||||
input_y * input_y_pitch +
|
||||
input_x * input_x_pitch);
|
||||
#elif X_BLOCK_SIZE == 4
|
||||
src = UNIT_BLOCK_READ4(input, input_offset + k * input_fs_pitch +
|
||||
input_y * input_y_pitch +
|
||||
input_x * input_x_pitch);
|
||||
#elif X_BLOCK_SIZE == 2
|
||||
src = UNIT_BLOCK_READ2(input, input_offset + k * input_fs_pitch +
|
||||
input_y * input_y_pitch +
|
||||
input_x * input_x_pitch);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
src = UNIT_BLOCK_READ(input, input_offset + k * input_fs_pitch);
|
||||
#endif // X_BLOCK_SIZE > 1
|
||||
|
||||
#else // PADDED_INPUT
|
||||
|
||||
#if X_BLOCK_SIZE > 1
|
||||
src = UNIT_BLOCK_READ_VEC(input, input_offset + k * input_fs_pitch + input_y * input_y_pitch + input_x * input_x_pitch);
|
||||
#else
|
||||
src = UNIT_BLOCK_READ(input, input_offset + k * input_fs_pitch);
|
||||
#endif // X_BLOCK_SIZE > 1
|
||||
#endif // PADDED_INPUT
|
||||
}
|
||||
|
||||
UNIT_TYPE8 wei0 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch);
|
||||
UNIT_TYPE8 wei1 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch + 8 * filter_isv_pitch);
|
||||
|
||||
const vec_t src0 = GET_SRC(src, 0);
|
||||
const vec_t src1 = GET_SRC(src, 1);
|
||||
const vec_t src2 = GET_SRC(src, 2);
|
||||
@ -183,29 +216,52 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||
dst = mad(wei1.s7, src15, dst);
|
||||
}
|
||||
|
||||
#if SLM_DIV_FACTOR > 1
|
||||
partial_summ[lid1] = dst;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (feature_sub_block == 0) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (int i = 1; i < SLM_DIV_FACTOR; i++)
|
||||
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
|
||||
#endif // SLM_DIV_FACTOR > 1
|
||||
|
||||
dst = ACTIVATION(dst, ACTIVATION_PARAMS);
|
||||
|
||||
#if OUTPUT_LEFTOVERS
|
||||
if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM)
|
||||
if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM)
|
||||
{
|
||||
#if X_BLOCK_SIZE > 1
|
||||
for (int i = 0; i < X_BLOCK_SIZE; i++) {
|
||||
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y ||
|
||||
f_block*FEATURE_SLICE_SIZE + lid >= OUTPUT_FEATURE_NUM)
|
||||
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || feature_block * FEATURE_SLICE_SIZE + sglid >= OUTPUT_FEATURE_NUM)
|
||||
return;
|
||||
|
||||
int xi = (x+i) % OUTPUT_SIZE_X;
|
||||
int yi = y + ((x+i) / OUTPUT_SIZE_X);
|
||||
int xi = (x + i) % OUTPUT_SIZE_X;
|
||||
int yi = y + ((x + i) / OUTPUT_SIZE_X);
|
||||
|
||||
#if HAS_FUSED_OPS
|
||||
FUSED_OPS_SCALAR;
|
||||
dst[i] = FUSED_OPS_RESULT_SCALAR;
|
||||
#endif
|
||||
|
||||
output[output_offset + yi * output_y_pitch + xi * output_x_pitch + lid] = dst[i];
|
||||
output[output_offset + yi * output_y_pitch + xi * output_x_pitch + sglid] = dst[i];
|
||||
}
|
||||
#else // X_BLOCK_SIZE > 1
|
||||
if (feature_block * FEATURE_SLICE_SIZE + sglid >= OUTPUT_FEATURE_NUM)
|
||||
return;
|
||||
|
||||
#if HAS_FUSED_OPS
|
||||
FUSED_OPS_SCALAR_B1;
|
||||
dst = FUSED_OPS_RESULT_SCALAR_B1;
|
||||
#endif
|
||||
|
||||
output[output_offset + sglid] = dst;
|
||||
#endif // X_BLOCK_SIZE > 1
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#endif // OUTPUT_LEFTOVERS
|
||||
|
||||
#if X_BLOCK_SIZE > 1
|
||||
{
|
||||
#if !PADDED_OUTPUT && !NON_UNIT_FUSED_OP_SPATIAL
|
||||
if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
|
||||
@ -216,20 +272,14 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||
FUSED_OPS_VEC;
|
||||
dst = FUSED_OPS_RESULT_VEC;
|
||||
#endif
|
||||
#if X_BLOCK_SIZE == 8
|
||||
UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
|
||||
#elif X_BLOCK_SIZE == 4
|
||||
UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
|
||||
#elif X_BLOCK_SIZE == 2
|
||||
UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
|
||||
#endif
|
||||
UNIT_BLOCK_WRITE_VEC(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
|
||||
} else {
|
||||
for (int i = 0; i < X_BLOCK_SIZE; i++) {
|
||||
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
|
||||
return;
|
||||
|
||||
int xi = (x+i) % OUTPUT_SIZE_X;
|
||||
int yi = y + ((x+i) / OUTPUT_SIZE_X);
|
||||
int xi = (x + i) % OUTPUT_SIZE_X;
|
||||
int yi = y + ((x + i) / OUTPUT_SIZE_X);
|
||||
|
||||
#if HAS_FUSED_OPS
|
||||
FUSED_OPS_SCALAR;
|
||||
@ -240,7 +290,22 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||
}
|
||||
}
|
||||
}
|
||||
#else // X_BLOCK_SIZE > 1
|
||||
{
|
||||
#if HAS_FUSED_OPS
|
||||
FUSED_OPS_SCALAR_B1;
|
||||
dst = FUSED_OPS_RESULT_SCALAR_B1;
|
||||
#endif
|
||||
UNIT_BLOCK_WRITE(output, output_offset, dst);
|
||||
}
|
||||
#endif // X_BLOCK_SIZE > 1
|
||||
|
||||
#if SLM_DIV_FACTOR > 1
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef GET_SRC
|
||||
#undef FEATURE_SLICE_SIZE
|
||||
#undef UNIT_BLOCK_READ_VEC
|
||||
#undef UNIT_BLOCK_WRITE_VEC
|
||||
|
Loading…
Reference in New Issue
Block a user