[IE CLDNN] Convolution fsv16 improvements several fixes after code review (#3637)
This commit is contained in:
parent
d4f774f3c7
commit
72c3e0e4a6
@ -19,16 +19,13 @@
|
|||||||
|
|
||||||
namespace kernel_selector {
|
namespace kernel_selector {
|
||||||
|
|
||||||
static const size_t sub_group_size = 16;
|
|
||||||
static const size_t feature_block_size = 16;
|
|
||||||
|
|
||||||
ConvolutionKernel_b_fs_yx_fsv16::ConvolutionKernel_b_fs_yx_fsv16() : ConvolutionKernelBase("convolution_gpu_bfyx_f16") {
|
ConvolutionKernel_b_fs_yx_fsv16::ConvolutionKernel_b_fs_yx_fsv16() : ConvolutionKernelBase("convolution_gpu_bfyx_f16") {
|
||||||
std::vector<size_t> outputBlockWidths = {2, 4, 8};
|
std::vector<size_t> outputBlockWidths = { 2, 4, 8 };
|
||||||
std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
|
std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
|
||||||
|
|
||||||
for (auto w : outputBlockWidths) {
|
for (auto w : outputBlockWidths) {
|
||||||
for (auto exeMode : executionModes) {
|
for (auto exeMode : executionModes) {
|
||||||
autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
|
autoTuneOptions.emplace_back(AutoTuneOption{ w, exeMode });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -39,7 +36,7 @@ ConvolutionKernel_b_fs_yx_fsv16::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16:
|
|||||||
auto x = cp.output.X().v;
|
auto x = cp.output.X().v;
|
||||||
auto f = cp.output.Feature().v;
|
auto f = cp.output.Feature().v;
|
||||||
if (x * f <= 256) {
|
if (x * f <= 256) {
|
||||||
if ( x <= 8 || x * f <= 128)
|
if (x <= 8 || x * f <= 128)
|
||||||
return { 2, DEFAULT };
|
return { 2, DEFAULT };
|
||||||
else
|
else
|
||||||
return { 4, DEFAULT };
|
return { 4, DEFAULT };
|
||||||
@ -55,6 +52,42 @@ ConvolutionKernel_b_fs_yx_fsv16::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
float ConvolutionKernel_b_fs_yx_fsv16::EstimateOccupancy(const convolution_params& params,
|
||||||
|
const ConvolutionTuningData& tuning_data) const {
|
||||||
|
auto tuneOptions = GetAutoTuneOptions(params, 0);
|
||||||
|
auto blockWidth = tuneOptions.blockWidth;
|
||||||
|
|
||||||
|
auto x = params.output.X().v;
|
||||||
|
auto y = params.output.Y().v;
|
||||||
|
auto f = params.output.Feature().v;
|
||||||
|
auto b = params.output.Batch().v;
|
||||||
|
|
||||||
|
auto threads = CeilDiv(x, blockWidth) * y * CeilDiv(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor * b;
|
||||||
|
|
||||||
|
return static_cast<float>(threads) / static_cast<float>(params.engineInfo.maxThreadsPerDevice);
|
||||||
|
}
|
||||||
|
|
||||||
|
ConvolutionKernel_b_fs_yx_fsv16::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16::GetTuningParams(const convolution_params& params) const {
|
||||||
|
ConvolutionTuningData tuning_data;
|
||||||
|
|
||||||
|
const auto& input = params.inputs[0];
|
||||||
|
|
||||||
|
size_t ic_blocks = CeilDiv(input.Feature().v / params.groups, tuning_data.feature_block_size);
|
||||||
|
|
||||||
|
size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;
|
||||||
|
|
||||||
|
bool slm_exception = params.output.X().v == 3 && params.output.Y().v == 3 && params.output.ElementSize() == 4 && params.output.Feature().v <= 512;
|
||||||
|
|
||||||
|
if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.bIMADSupport && !slm_exception)
|
||||||
|
while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
|
||||||
|
EstimateOccupancy(params, tuning_data) < 4.0)
|
||||||
|
tuning_data.slm_div_factor *= 2;
|
||||||
|
|
||||||
|
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
|
||||||
|
|
||||||
|
return tuning_data;
|
||||||
|
}
|
||||||
|
|
||||||
ParamsKey ConvolutionKernel_b_fs_yx_fsv16::GetSupportedKey() const {
|
ParamsKey ConvolutionKernel_b_fs_yx_fsv16::GetSupportedKey() const {
|
||||||
ParamsKey k;
|
ParamsKey k;
|
||||||
k.EnableInputDataType(Datatype::F16);
|
k.EnableInputDataType(Datatype::F16);
|
||||||
@ -92,6 +125,8 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16::SetDefault(
|
|||||||
int autoTuneIndex) const {
|
int autoTuneIndex) const {
|
||||||
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
|
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
|
||||||
|
|
||||||
|
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||||
|
|
||||||
const auto& out = params.output;
|
const auto& out = params.output;
|
||||||
|
|
||||||
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
|
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
|
||||||
@ -103,11 +138,11 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16::SetDefault(
|
|||||||
auto b = out.Batch().v;
|
auto b = out.Batch().v;
|
||||||
|
|
||||||
dispatchData.gws[0] = CeilDiv(x, autoTune.blockWidth) * y;
|
dispatchData.gws[0] = CeilDiv(x, autoTune.blockWidth) * y;
|
||||||
dispatchData.gws[1] = Align(f, sub_group_size);
|
dispatchData.gws[1] = Align(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor;
|
||||||
dispatchData.gws[2] = b;
|
dispatchData.gws[2] = b;
|
||||||
|
|
||||||
dispatchData.lws[0] = 1;
|
dispatchData.lws[0] = 1;
|
||||||
dispatchData.lws[1] = sub_group_size;
|
dispatchData.lws[1] = tuning_data.work_group_size;
|
||||||
dispatchData.lws[2] = 1;
|
dispatchData.lws[2] = 1;
|
||||||
|
|
||||||
return dispatchData;
|
return dispatchData;
|
||||||
@ -126,27 +161,29 @@ bool ConvolutionKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_p
|
|||||||
|
|
||||||
const auto& params = static_cast<const convolution_params&>(p);
|
const auto& params = static_cast<const convolution_params&>(p);
|
||||||
|
|
||||||
|
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||||
|
|
||||||
const auto& input = params.inputs[0];
|
const auto& input = params.inputs[0];
|
||||||
const auto& output = params.output;
|
const auto& output = params.output;
|
||||||
|
|
||||||
if (params.groups > 1) {
|
if (params.groups > 1) {
|
||||||
auto outFeaturesPerGroup = output.Feature().v / params.groups;
|
auto outFeaturesPerGroup = output.Feature().v / params.groups;
|
||||||
auto inFeaturesPerGroup = input.Feature().v / params.groups;
|
auto inFeaturesPerGroup = input.Feature().v / params.groups;
|
||||||
auto multipleGroupsInputPreload = (feature_block_size % outFeaturesPerGroup == 0) &&
|
auto multipleGroupsInputPreload = (tuning_data.feature_block_size % outFeaturesPerGroup == 0) &&
|
||||||
(feature_block_size % inFeaturesPerGroup == 0) &&
|
(tuning_data.feature_block_size % inFeaturesPerGroup == 0) &&
|
||||||
(feature_block_size / outFeaturesPerGroup > 1) &&
|
(tuning_data.feature_block_size / outFeaturesPerGroup > 1) &&
|
||||||
(feature_block_size / inFeaturesPerGroup > 1) &&
|
(tuning_data.feature_block_size / inFeaturesPerGroup > 1) &&
|
||||||
(outFeaturesPerGroup != 1) &&
|
(outFeaturesPerGroup != 1) &&
|
||||||
(inFeaturesPerGroup != 1);
|
(inFeaturesPerGroup != 1);
|
||||||
auto grouped = inFeaturesPerGroup % sub_group_size == 0 &&
|
auto grouped = inFeaturesPerGroup % tuning_data.sub_group_size == 0 &&
|
||||||
(outFeaturesPerGroup % sub_group_size == 0 || sub_group_size % outFeaturesPerGroup == 0);
|
(outFeaturesPerGroup % tuning_data.sub_group_size == 0 || tuning_data.sub_group_size % outFeaturesPerGroup == 0);
|
||||||
|
|
||||||
if (!multipleGroupsInputPreload && !grouped)
|
if (!multipleGroupsInputPreload && !grouped)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that padding before features doesn't miss-align the blocks
|
// Check that padding before features doesn't miss-align the blocks
|
||||||
if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0)
|
if (input.Feature().pad.before % tuning_data.feature_block_size != 0 || output.Feature().pad.before % tuning_data.feature_block_size != 0)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (!params.bias.empty() && params.bias[0].GetDType() != input.GetDType())
|
if (!params.bias.empty() && params.bias[0].GetDType() != input.GetDType())
|
||||||
@ -161,11 +198,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
|
|||||||
auto output = params.output;
|
auto output = params.output;
|
||||||
auto jit = Parent::GetJitConstants(params, dispatchData);
|
auto jit = Parent::GetJitConstants(params, dispatchData);
|
||||||
|
|
||||||
|
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||||
|
|
||||||
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
|
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
|
||||||
if (!params.fused_ops.empty()) {
|
if (!params.fused_ops.empty()) {
|
||||||
auto input_dt = GetActivationType(params);
|
auto input_dt = GetActivationType(params);
|
||||||
FusedOpsConfiguration conf_vec = { "_VEC",
|
FusedOpsConfiguration conf_vec = { "_VEC",
|
||||||
{"b", "(f_block*16)", "y", "x"},
|
{"b", "(feature_block * 16)", "y", "x"},
|
||||||
"dst",
|
"dst",
|
||||||
input_dt,
|
input_dt,
|
||||||
blockWidth,
|
blockWidth,
|
||||||
@ -174,7 +213,7 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
|
|||||||
IndexType::TENSOR_COORD,
|
IndexType::TENSOR_COORD,
|
||||||
Tensor::DataChannelName::X };
|
Tensor::DataChannelName::X };
|
||||||
FusedOpsConfiguration conf_scalar = { "_SCALAR",
|
FusedOpsConfiguration conf_scalar = { "_SCALAR",
|
||||||
{"b", "(f_block*16)", "y", "(x+i)"},
|
{"b", "(feature_block * 16)", "y", "(x + i)"},
|
||||||
"dst[i]",
|
"dst[i]",
|
||||||
input_dt,
|
input_dt,
|
||||||
1,
|
1,
|
||||||
@ -190,23 +229,25 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
|
|||||||
|
|
||||||
auto outFeaturesPerGroup = output.Feature().v / params.groups;
|
auto outFeaturesPerGroup = output.Feature().v / params.groups;
|
||||||
auto inFeaturesPerGroup = input.Feature().v / params.groups;
|
auto inFeaturesPerGroup = input.Feature().v / params.groups;
|
||||||
auto multipleGroupsInputPreload = (feature_block_size % outFeaturesPerGroup == 0) &&
|
auto multipleGroupsInputPreload = (tuning_data.feature_block_size % outFeaturesPerGroup == 0) &&
|
||||||
(feature_block_size % inFeaturesPerGroup == 0) &&
|
(tuning_data.feature_block_size % inFeaturesPerGroup == 0) &&
|
||||||
(feature_block_size / outFeaturesPerGroup > 1) &&
|
(tuning_data.feature_block_size / outFeaturesPerGroup > 1) &&
|
||||||
(feature_block_size / inFeaturesPerGroup > 1);
|
(tuning_data.feature_block_size / inFeaturesPerGroup > 1);
|
||||||
|
|
||||||
if (multipleGroupsInputPreload)
|
if (multipleGroupsInputPreload)
|
||||||
jit.AddConstant(MakeJitConstant("MULTIPLE_GROUPS_INPUT_PRELOAD", 1));
|
jit.AddConstant(MakeJitConstant("MULTIPLE_GROUPS_INPUT_PRELOAD", 1));
|
||||||
|
|
||||||
jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
|
jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
|
||||||
jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
|
jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
|
||||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
|
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
|
||||||
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));
|
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));
|
||||||
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(inFeaturesPerGroup, feature_block_size)));
|
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
|
||||||
if (params.output.Feature().v % feature_block_size != 0) {
|
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
|
||||||
|
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(inFeaturesPerGroup, tuning_data.feature_block_size)));
|
||||||
|
if (params.output.Feature().v % tuning_data.feature_block_size != 0) {
|
||||||
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
|
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
|
||||||
}
|
}
|
||||||
if (inFeaturesPerGroup % feature_block_size != 0 && !multipleGroupsInputPreload) {
|
if (inFeaturesPerGroup % tuning_data.feature_block_size != 0 && !multipleGroupsInputPreload) {
|
||||||
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
|
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
// Copyright (c) 2016-2019 Intel Corporation
|
// Copyright (c) 2016-2020 Intel Corporation
|
||||||
//
|
//
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
// you may not use this file except in compliance with the License.
|
// you may not use this file except in compliance with the License.
|
||||||
@ -58,7 +58,16 @@ private:
|
|||||||
std::string exeMode;
|
std::string exeMode;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ConvolutionTuningData {
|
||||||
|
const size_t sub_group_size = 16;
|
||||||
|
const size_t feature_block_size = 16;
|
||||||
|
size_t slm_div_factor = 1;
|
||||||
|
size_t work_group_size = 1;
|
||||||
|
};
|
||||||
|
|
||||||
std::vector<AutoTuneOption> autoTuneOptions;
|
std::vector<AutoTuneOption> autoTuneOptions;
|
||||||
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
|
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
|
||||||
|
ConvolutionTuningData GetTuningParams(const convolution_params& params) const;
|
||||||
|
float EstimateOccupancy(const convolution_params& params, const ConvolutionTuningData& tuning_data) const;
|
||||||
};
|
};
|
||||||
} // namespace kernel_selector
|
} // namespace kernel_selector
|
||||||
|
@ -19,11 +19,9 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
namespace kernel_selector {
|
namespace kernel_selector {
|
||||||
static const size_t sub_group_size = 16;
|
|
||||||
static const size_t feature_block_size = 16;
|
|
||||||
|
|
||||||
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : ConvolutionKernelBase("convolution_gpu_bfyx_f16_1x1") {
|
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : ConvolutionKernelBase("convolution_gpu_bfyx_f16_1x1") {
|
||||||
std::vector<size_t> outputBlockWidths = {2, 4, 8};
|
std::vector<size_t> outputBlockWidths = { 1, 2, 4, 8 };
|
||||||
std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
|
std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
|
||||||
|
|
||||||
for (auto w : outputBlockWidths) {
|
for (auto w : outputBlockWidths) {
|
||||||
@ -36,10 +34,15 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : Con
|
|||||||
ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params,
|
ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params,
|
||||||
int /*autoTuneIndex*/) const {
|
int /*autoTuneIndex*/) const {
|
||||||
const convolution_params& cp = static_cast<const convolution_params&>(params);
|
const convolution_params& cp = static_cast<const convolution_params&>(params);
|
||||||
|
|
||||||
auto x = cp.output.X().v;
|
auto x = cp.output.X().v;
|
||||||
|
auto y = cp.output.Y().v;
|
||||||
auto f = cp.output.Feature().v;
|
auto f = cp.output.Feature().v;
|
||||||
if (x * f <= 256) {
|
|
||||||
if ( x < 8 || x * f <= 128)
|
if (x == 1 && y == 1) {
|
||||||
|
return { 1, DEFAULT };
|
||||||
|
} else if (x * f <= 256) {
|
||||||
|
if (x < 8 || x * f <= 128)
|
||||||
return { 2, DEFAULT };
|
return { 2, DEFAULT };
|
||||||
else
|
else
|
||||||
return { 4, DEFAULT };
|
return { 4, DEFAULT };
|
||||||
@ -50,6 +53,40 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fs
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
float ConvolutionKernel_b_fs_yx_fsv16_1x1::EstimateOccupancy(const convolution_params& params,
|
||||||
|
const ConvolutionTuningData& tuning_data) const {
|
||||||
|
auto tuneOptions = GetAutoTuneOptions(params, 0);
|
||||||
|
auto blockWidth = tuneOptions.blockWidth;
|
||||||
|
|
||||||
|
auto x = params.output.X().v;
|
||||||
|
auto y = params.output.Y().v;
|
||||||
|
auto f = params.output.Feature().v;
|
||||||
|
auto b = params.output.Batch().v;
|
||||||
|
|
||||||
|
auto threads = CeilDiv(x * y, blockWidth) * CeilDiv(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor * b;
|
||||||
|
|
||||||
|
return static_cast<float>(threads) / static_cast<float>(params.engineInfo.maxThreadsPerDevice);
|
||||||
|
}
|
||||||
|
|
||||||
|
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetTuningParams(const convolution_params& params) const {
|
||||||
|
ConvolutionTuningData tuning_data;
|
||||||
|
|
||||||
|
const auto& input = params.inputs[0];
|
||||||
|
|
||||||
|
size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size);
|
||||||
|
|
||||||
|
size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;
|
||||||
|
bool block_size_one_is_better = params.output.X().v == 1 && params.output.Y().v == 1 && input.Feature().v >= 2048;
|
||||||
|
|
||||||
|
if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.bIMADSupport && !block_size_one_is_better)
|
||||||
|
while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
|
||||||
|
EstimateOccupancy(params, tuning_data) < 4.0)
|
||||||
|
tuning_data.slm_div_factor *= 2;
|
||||||
|
|
||||||
|
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
|
||||||
|
|
||||||
|
return tuning_data;
|
||||||
|
}
|
||||||
|
|
||||||
ParamsKey ConvolutionKernel_b_fs_yx_fsv16_1x1::GetSupportedKey() const {
|
ParamsKey ConvolutionKernel_b_fs_yx_fsv16_1x1::GetSupportedKey() const {
|
||||||
ParamsKey k;
|
ParamsKey k;
|
||||||
@ -75,21 +112,24 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
|
|||||||
int autoTuneIndex) const {
|
int autoTuneIndex) const {
|
||||||
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
|
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
|
||||||
|
|
||||||
|
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||||
|
|
||||||
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
|
auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
|
||||||
dispatchData.cldnnStyle.blockWidth = autoTune.blockWidth;
|
dispatchData.cldnnStyle.blockWidth = autoTune.blockWidth;
|
||||||
|
|
||||||
const auto& out = params.output;
|
const auto& out = params.output;
|
||||||
|
|
||||||
auto x = out.X().v;
|
auto x = out.X().v;
|
||||||
auto y = out.Y().v;
|
auto y = out.Y().v;
|
||||||
auto f = out.Feature().v;
|
auto f = out.Feature().v;
|
||||||
auto b = out.Batch().v;
|
auto b = out.Batch().v;
|
||||||
|
|
||||||
dispatchData.gws[0] = CeilDiv(x * y, autoTune.blockWidth);
|
dispatchData.gws[0] = x == 1 && y == 1 ? 1 : CeilDiv(x * y, autoTune.blockWidth);
|
||||||
dispatchData.gws[1] = Align(f, feature_block_size);
|
dispatchData.gws[1] = Align(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor;
|
||||||
dispatchData.gws[2] = b;
|
dispatchData.gws[2] = b;
|
||||||
|
|
||||||
dispatchData.lws[0] = 1;
|
dispatchData.lws[0] = 1;
|
||||||
dispatchData.lws[1] = sub_group_size;
|
dispatchData.lws[1] = tuning_data.work_group_size;
|
||||||
dispatchData.lws[2] = 1;
|
dispatchData.lws[2] = 1;
|
||||||
|
|
||||||
return dispatchData;
|
return dispatchData;
|
||||||
@ -124,14 +164,16 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p, const option
|
|||||||
|
|
||||||
const auto& params = static_cast<const convolution_params&>(p);
|
const auto& params = static_cast<const convolution_params&>(p);
|
||||||
|
|
||||||
|
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||||
|
|
||||||
const auto& input = params.inputs[0];
|
const auto& input = params.inputs[0];
|
||||||
const auto& output = params.output;
|
const auto& output = params.output;
|
||||||
|
|
||||||
const bool bOutputSizes =
|
const bool bOutputSizes = output.X().v != input.X().v || output.Y().v != input.Y().v || output.Feature().v % 16 != 0;
|
||||||
output.X().v != input.X().v || output.Y().v != input.Y().v || output.Feature().v % 16 != 0;
|
|
||||||
const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1;
|
const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1;
|
||||||
const bool bStride = params.stride.x != 1 || params.stride.y != 1;
|
const bool bStride = params.stride.x != 1 || params.stride.y != 1;
|
||||||
const bool bPadding = input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0;
|
const bool bPadding = input.Feature().pad.before % tuning_data.feature_block_size != 0 ||
|
||||||
|
output.Feature().pad.before % tuning_data.feature_block_size != 0;
|
||||||
|
|
||||||
if (bOutputSizes || bFilterSize || bStride || bPadding) {
|
if (bOutputSizes || bFilterSize || bStride || bPadding) {
|
||||||
return false;
|
return false;
|
||||||
@ -144,11 +186,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
|
|||||||
const DispatchData& dispatchData) const {
|
const DispatchData& dispatchData) const {
|
||||||
auto jit = Parent::GetJitConstants(params, dispatchData);
|
auto jit = Parent::GetJitConstants(params, dispatchData);
|
||||||
|
|
||||||
|
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||||
|
|
||||||
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
|
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
|
||||||
if (!params.fused_ops.empty()) {
|
if (!params.fused_ops.empty()) {
|
||||||
auto input_dt = GetUnitType(params);
|
auto input_dt = GetUnitType(params);
|
||||||
FusedOpsConfiguration conf_vec = { "_VEC",
|
FusedOpsConfiguration conf_vec = { "_VEC",
|
||||||
{"b", "(f_block*16)", "y", "x"},
|
{"b", "(feature_block * 16)", "y", "x"},
|
||||||
"dst",
|
"dst",
|
||||||
input_dt,
|
input_dt,
|
||||||
blockWidth,
|
blockWidth,
|
||||||
@ -156,8 +200,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
|
|||||||
BoundaryCheck::ENABLED,
|
BoundaryCheck::ENABLED,
|
||||||
IndexType::TENSOR_COORD,
|
IndexType::TENSOR_COORD,
|
||||||
Tensor::DataChannelName::X };
|
Tensor::DataChannelName::X };
|
||||||
FusedOpsConfiguration conf_scalar = { "_SCALAR",
|
FusedOpsConfiguration conf_scalar1 = { "_SCALAR",
|
||||||
{"b", "(f_block*16)", "yi", "xi"},
|
{"b", "(feature_block * 16)", "yi", "xi"},
|
||||||
"dst[i]",
|
"dst[i]",
|
||||||
input_dt,
|
input_dt,
|
||||||
1,
|
1,
|
||||||
@ -165,10 +209,19 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
|
|||||||
BoundaryCheck::ENABLED,
|
BoundaryCheck::ENABLED,
|
||||||
IndexType::TENSOR_COORD,
|
IndexType::TENSOR_COORD,
|
||||||
Tensor::DataChannelName::X };
|
Tensor::DataChannelName::X };
|
||||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
|
FusedOpsConfiguration conf_scalar2 = { "_SCALAR_B1",
|
||||||
|
{"b", "(feature_block * 16)", "0", "0"},
|
||||||
|
"dst",
|
||||||
|
input_dt,
|
||||||
|
1,
|
||||||
|
LoadType::LT_ALIGNED_READ,
|
||||||
|
BoundaryCheck::ENABLED,
|
||||||
|
IndexType::TENSOR_COORD,
|
||||||
|
Tensor::DataChannelName::X };
|
||||||
|
jit.Merge(MakeFusedOpsJitConstants(params, { conf_vec, conf_scalar1, conf_scalar2 }));
|
||||||
}
|
}
|
||||||
|
|
||||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
|
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
|
||||||
jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
|
jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
|
||||||
|
|
||||||
bool padded_output = params.output.X().pad.Total() != 0;
|
bool padded_output = params.output.X().pad.Total() != 0;
|
||||||
@ -194,11 +247,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
|
|||||||
|
|
||||||
jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
|
jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
|
||||||
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.output.X().v, blockWidth)));
|
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.output.X().v, blockWidth)));
|
||||||
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, feature_block_size)));
|
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
|
||||||
if (params.output.Feature().v % feature_block_size != 0) {
|
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
|
||||||
|
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size)));
|
||||||
|
if (params.output.Feature().v % tuning_data.feature_block_size != 0) {
|
||||||
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
|
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
|
||||||
}
|
}
|
||||||
if (params.inputs[0].Feature().v % feature_block_size != 0) {
|
if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) {
|
||||||
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
|
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
// Copyright (c) 2016-2019 Intel Corporation
|
// Copyright (c) 2016-2020 Intel Corporation
|
||||||
//
|
//
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
// you may not use this file except in compliance with the License.
|
// you may not use this file except in compliance with the License.
|
||||||
@ -51,7 +51,16 @@ protected:
|
|||||||
std::string exeMode;
|
std::string exeMode;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ConvolutionTuningData {
|
||||||
|
const size_t sub_group_size = 16;
|
||||||
|
const size_t feature_block_size = 16;
|
||||||
|
size_t slm_div_factor = 1;
|
||||||
|
size_t work_group_size = 1;
|
||||||
|
};
|
||||||
|
|
||||||
std::vector<AutoTuneOption> autoTuneOptions;
|
std::vector<AutoTuneOption> autoTuneOptions;
|
||||||
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
|
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
|
||||||
|
ConvolutionTuningData GetTuningParams(const convolution_params& params) const;
|
||||||
|
float EstimateOccupancy(const convolution_params& params, const ConvolutionTuningData& tuning_data) const;
|
||||||
};
|
};
|
||||||
} // namespace kernel_selector
|
} // namespace kernel_selector
|
||||||
|
@ -40,7 +40,7 @@
|
|||||||
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read4((__global uint*)(ptr) + (offset)))
|
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read4((__global uint*)(ptr) + (offset)))
|
||||||
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
|
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
|
||||||
#else
|
#else
|
||||||
# error convolution_gpu_bfyx_f16.cl - unsupported input type.
|
# error convolution_gpu_bfyx_f16.cl: unsupported input type
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if FILTER_TYPE_SIZE == 2
|
#if FILTER_TYPE_SIZE == 2
|
||||||
@ -48,7 +48,7 @@
|
|||||||
#elif FILTER_TYPE_SIZE == 4
|
#elif FILTER_TYPE_SIZE == 4
|
||||||
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
|
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
|
||||||
#else
|
#else
|
||||||
# error convolution_gpu_bfyx_f16.cl - unsupported filter type.
|
# error convolution_gpu_bfyx_f16.cl: unsupported filter type
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if OUTPUT_TYPE_SIZE == 1
|
#if OUTPUT_TYPE_SIZE == 1
|
||||||
@ -67,7 +67,7 @@
|
|||||||
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
|
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
|
||||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
|
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
|
||||||
#else
|
#else
|
||||||
# error convolution_gpu_bfyx_f16.cl - unsupported output type.
|
# error convolution_gpu_bfyx_f16.cl: unsupported output type
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if INPUT0_TYPE_SIZE == 2
|
#if INPUT0_TYPE_SIZE == 2
|
||||||
@ -77,12 +77,14 @@
|
|||||||
#else
|
#else
|
||||||
# define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
|
# define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define FEATURE_SLICE_SIZE 16
|
#define FEATURE_SLICE_SIZE 16
|
||||||
|
|
||||||
#define FILTER_OFM_NUM_ALIGNED (((FILTER_OFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
|
#define FILTER_OFM_NUM_ALIGNED (((FILTER_OFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
|
||||||
#define FILTER_IFM_NUM_ALIGNED (((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
|
#define FILTER_IFM_NUM_ALIGNED (((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
|
||||||
|
|
||||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
|
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
|
||||||
KERNEL(convolution_bfyx_f16)(
|
KERNEL(convolution_bfyx_f16)(
|
||||||
__global INPUT0_TYPE* input,
|
__global INPUT0_TYPE* input,
|
||||||
__global OUTPUT_TYPE* output,
|
__global OUTPUT_TYPE* output,
|
||||||
@ -94,26 +96,29 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
FUSED_OPS_DECLS,
|
FUSED_OPS_DECLS,
|
||||||
#endif
|
#endif
|
||||||
uint split_idx) {
|
uint split_idx) {
|
||||||
#if GROUPED
|
const int sglid = get_sub_group_local_id();
|
||||||
const int f_block = get_group_id(1);
|
|
||||||
const int group = (f_block * FEATURE_SLICE_SIZE) / FILTER_OFM_NUM;
|
|
||||||
const int prev_group_leftover = (FILTER_OFM_NUM * (group + 1)) - (f_block * FEATURE_SLICE_SIZE);
|
|
||||||
int groups_per_sub_group = 1;
|
|
||||||
if (prev_group_leftover < 16)
|
|
||||||
groups_per_sub_group += ((FEATURE_SLICE_SIZE - prev_group_leftover - 1) / FILTER_OFM_NUM) + 1;
|
|
||||||
#else
|
|
||||||
const int f_block = get_group_id(1);
|
|
||||||
const int group = split_idx;
|
|
||||||
const int groups_per_sub_group = 1;
|
|
||||||
#endif // GROUPED
|
|
||||||
|
|
||||||
const int lid = get_sub_group_local_id();
|
|
||||||
const int b = (uint)get_global_id(2);
|
const int b = (uint)get_global_id(2);
|
||||||
|
|
||||||
const int xy = get_global_id(0);
|
const int xy = get_global_id(0);
|
||||||
const int x = (xy % X_BLOCKS) * OUTPUT_X_BLOCK_SIZE;
|
const int x = (xy % X_BLOCKS) * OUTPUT_X_BLOCK_SIZE;
|
||||||
const int y = (xy / X_BLOCKS);
|
const int y = (xy / X_BLOCKS);
|
||||||
|
|
||||||
|
const int lid1 = (int)get_local_id(1);
|
||||||
|
const int feature_per_wg = (int)get_local_size(1) / SLM_DIV_FACTOR;
|
||||||
|
const int feature_sub_block = lid1 / feature_per_wg;
|
||||||
|
const int feature_block = (int)get_group_id(1);
|
||||||
|
|
||||||
|
#if GROUPED
|
||||||
|
const int group = (feature_block * FEATURE_SLICE_SIZE) / FILTER_OFM_NUM;
|
||||||
|
const int prev_group_leftover = (FILTER_OFM_NUM * (group + 1)) - (feature_block * FEATURE_SLICE_SIZE);
|
||||||
|
int groups_per_sub_group = 1;
|
||||||
|
if (prev_group_leftover < 16)
|
||||||
|
groups_per_sub_group += ((FEATURE_SLICE_SIZE - prev_group_leftover - 1) / FILTER_OFM_NUM) + 1;
|
||||||
|
#else
|
||||||
|
const int group = split_idx;
|
||||||
|
const int groups_per_sub_group = 1;
|
||||||
|
#endif // GROUPED
|
||||||
|
|
||||||
typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) vec_t;
|
typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) vec_t;
|
||||||
|
|
||||||
const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
|
const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
|
||||||
@ -143,7 +148,7 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
|
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
|
||||||
|
|
||||||
const uint output_offset = b * output_b_pitch +
|
const uint output_offset = b * output_b_pitch +
|
||||||
(f_block + output_fs_pad_before) * output_fs_pitch +
|
(feature_block + output_fs_pad_before) * output_fs_pitch +
|
||||||
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
||||||
(x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
|
(x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
|
||||||
|
|
||||||
@ -155,54 +160,71 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
|
const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
|
||||||
|
|
||||||
#if BIAS_TERM
|
#if BIAS_TERM
|
||||||
uint bias_offset = f_block * FEATURE_SLICE_SIZE;
|
#if SLM_DIV_FACTOR == 1
|
||||||
vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, bias_offset));
|
vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||||
|
#else
|
||||||
|
vec_t dst;
|
||||||
|
|
||||||
|
if (feature_sub_block == 0) {
|
||||||
|
dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||||
|
} else {
|
||||||
|
dst = INPUT0_VAL_ZERO;
|
||||||
|
}
|
||||||
|
#endif // SLM_DIV_FACTOR == 1
|
||||||
#else
|
#else
|
||||||
vec_t dst = INPUT0_VAL_ZERO;
|
vec_t dst = INPUT0_VAL_ZERO;
|
||||||
#endif // BIAS_TERM
|
#endif // BIAS_TERM
|
||||||
|
|
||||||
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
__local vec_t partial_summ[WORK_GROUP_SIZE];
|
||||||
|
#endif
|
||||||
|
|
||||||
#if MULTIPLE_GROUPS_INPUT_PRELOAD
|
#if MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||||
const uint in_split_offset = f_block * input_fs_pitch;
|
const uint in_split_offset = feature_block * input_fs_pitch;
|
||||||
const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
|
const uint g = sglid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
|
||||||
const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
|
const uint ofm_in_group = sglid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
|
||||||
const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
|
const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
|
||||||
#else
|
#else
|
||||||
#if GROUPED
|
#if GROUPED
|
||||||
for (uint g = group; g < group + groups_per_sub_group; g++) {
|
for (uint g = group; g < group + groups_per_sub_group; g++) {
|
||||||
const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE);
|
const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE);
|
||||||
const uint filter_split_offset = g * FILTER_GROUPS_PITCH;
|
const uint filter_split_offset = g * FILTER_GROUPS_PITCH;
|
||||||
const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
|
const uint filter_offset = (feature_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
|
||||||
#else
|
#else
|
||||||
const uint in_split_offset = 0;
|
const uint in_split_offset = 0;
|
||||||
const uint filter_split_offset = 0;
|
const uint filter_split_offset = 0;
|
||||||
const uint filter_offset = f_block * filter_os_pitch;
|
const uint filter_offset = feature_block * filter_os_pitch;
|
||||||
#endif // GROUPED
|
#endif // GROUPED
|
||||||
const uint grouped_filter_offset = filter_offset + filter_split_offset;
|
const uint grouped_filter_offset = filter_offset + filter_split_offset;
|
||||||
#endif // MULTIPLE_GROUPS_INPUT_PRELOAD
|
#endif // MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||||
|
|
||||||
const uint grouped_input_offset = input_offset + in_split_offset;
|
const uint grouped_input_offset = input_offset + in_split_offset;
|
||||||
|
|
||||||
for (uint icb = 0; icb < IC_BLOCKS; icb++) {
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
for (int icb = feature_sub_block * IC_BLOCKS / SLM_DIV_FACTOR; icb < (feature_sub_block + 1) * IC_BLOCKS / SLM_DIV_FACTOR; icb++) {
|
||||||
|
#else
|
||||||
|
for (int icb = 0; icb < IC_BLOCKS; icb++) {
|
||||||
|
#endif // SLM_DIV_FACTOR > 1
|
||||||
__attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
|
__attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
|
||||||
for (int kh = 0; kh < FILTER_SIZE_Y; kh++) {
|
for (int kh = 0; kh < FILTER_SIZE_Y; kh++) {
|
||||||
if (input_y + kh*DILATION_SIZE_Y < 0 || input_y + kh*DILATION_SIZE_Y >= INPUT0_SIZE_Y)
|
if (input_y + kh * DILATION_SIZE_Y < 0 || input_y + kh * DILATION_SIZE_Y >= INPUT0_SIZE_Y)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
INPUT_TYPE line_cache[INPUT_LINE_SIZE];
|
INPUT_TYPE line_cache[INPUT_LINE_SIZE];
|
||||||
|
|
||||||
#if INPUT_LEFTOVERS
|
#if INPUT_LEFTOVERS
|
||||||
if ((icb+1)*FEATURE_SLICE_SIZE >= FILTER_IFM_NUM)
|
if ((icb + 1) * FEATURE_SLICE_SIZE >= FILTER_IFM_NUM)
|
||||||
{
|
{
|
||||||
for (int xb = 0; xb < INPUT_LINE_SIZE; xb++)
|
for (int xb = 0; xb < INPUT_LINE_SIZE; xb++)
|
||||||
{
|
{
|
||||||
if (icb*FEATURE_SLICE_SIZE + lid >= FILTER_IFM_NUM)
|
if (icb * FEATURE_SLICE_SIZE + sglid >= FILTER_IFM_NUM)
|
||||||
line_cache[xb] = 0;
|
line_cache[xb] = 0;
|
||||||
else
|
else
|
||||||
line_cache[xb] = input[grouped_input_offset +
|
line_cache[xb] = input[grouped_input_offset +
|
||||||
icb * input_fs_pitch +
|
icb * input_fs_pitch +
|
||||||
kh * DILATION_SIZE_Y * input_y_pitch +
|
kh * DILATION_SIZE_Y * input_y_pitch +
|
||||||
xb * input_x_pitch +
|
xb * input_x_pitch +
|
||||||
lid];
|
sglid];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -251,7 +273,7 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
|
#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
|
||||||
src[i] = line_cache[i];
|
src[i] = line_cache[i];
|
||||||
#else
|
#else
|
||||||
src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
|
src[i] = line_cache[kw * DILATION_SIZE_X + STRIDE_SIZE_X * i];
|
||||||
#endif // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
|
#endif // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
|
||||||
}
|
}
|
||||||
#if MULTIPLE_GROUPS_INPUT_PRELOAD
|
#if MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||||
@ -300,7 +322,7 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
dst = mad(wei0.s6, src6, dst);
|
dst = mad(wei0.s6, src6, dst);
|
||||||
dst = mad(wei0.s7, src7, dst);
|
dst = mad(wei0.s7, src7, dst);
|
||||||
#else
|
#else
|
||||||
#error Unsupported input feature size for multiple groups input preload
|
# error convolution_gpu_bfyx_f16.cl: unsupported input feature size for multiple groups input preload
|
||||||
#endif // FILTER_IFM_NUM
|
#endif // FILTER_IFM_NUM
|
||||||
#else
|
#else
|
||||||
FILTER_TYPE8 wei0 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
|
FILTER_TYPE8 wei0 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
|
||||||
@ -352,13 +374,24 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
|
#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||||
}
|
}
|
||||||
#endif // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
|
#endif // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
|
||||||
|
|
||||||
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
partial_summ[lid1] = dst;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (feature_sub_block == 0) {
|
||||||
|
__attribute__((opencl_unroll_hint))
|
||||||
|
for (int i = 1; i < SLM_DIV_FACTOR; i++)
|
||||||
|
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
|
||||||
|
#endif // SLM_DIV_FACTOR > 1
|
||||||
|
|
||||||
dst = ACTIVATION(dst, ACTIVATION_PARAMS);
|
dst = ACTIVATION(dst, ACTIVATION_PARAMS);
|
||||||
|
|
||||||
typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t;
|
typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t;
|
||||||
out_vec_t res;
|
out_vec_t res;
|
||||||
|
|
||||||
#if OUTPUT_LEFTOVERS
|
#if OUTPUT_LEFTOVERS
|
||||||
if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
|
if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
|
||||||
for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
|
for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
|
||||||
#if HAS_FUSED_OPS
|
#if HAS_FUSED_OPS
|
||||||
FUSED_OPS_SCALAR;
|
FUSED_OPS_SCALAR;
|
||||||
@ -366,8 +399,8 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
#else
|
#else
|
||||||
res[i] = TO_OUTPUT_TYPE(dst[i]);
|
res[i] = TO_OUTPUT_TYPE(dst[i]);
|
||||||
#endif
|
#endif
|
||||||
if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
|
if ((feature_block * FEATURE_SLICE_SIZE + sglid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
|
||||||
output[output_offset + i * output_x_pitch + lid] = res[i];
|
output[output_offset + i * output_x_pitch + sglid] = res[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -391,7 +424,7 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
#elif OUTPUT_X_BLOCK_SIZE == 1
|
#elif OUTPUT_X_BLOCK_SIZE == 1
|
||||||
OUTPUT_BLOCK_WRITE(output, output_offset, res);
|
OUTPUT_BLOCK_WRITE(output, output_offset, res);
|
||||||
#else
|
#else
|
||||||
# error convolution_gpu_bfyx_f16.cl: Unsupported output x block size.
|
# error convolution_gpu_bfyx_f16.cl: unsupported output x block size
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
|
for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
|
||||||
@ -405,6 +438,10 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef AS_INPUT_SRC
|
#undef AS_INPUT_SRC
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
// Copyright (c) 2016-2019 Intel Corporation
|
// Copyright (c) 2016-2020 Intel Corporation
|
||||||
//
|
//
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
// you may not use this file except in compliance with the License.
|
// you may not use this file except in compliance with the License.
|
||||||
@ -16,16 +16,24 @@
|
|||||||
#include "include/unit_type.cl"
|
#include "include/unit_type.cl"
|
||||||
#include "include/mmad.cl"
|
#include "include/mmad.cl"
|
||||||
|
|
||||||
#define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE), \
|
#if X_BLOCK_SIZE > 1
|
||||||
intel_sub_group_shuffle( \
|
# define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE), \
|
||||||
AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data), \
|
intel_sub_group_shuffle( \
|
||||||
id))
|
AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data), \
|
||||||
|
id))
|
||||||
|
#else
|
||||||
|
# define GET_SRC(data, id) AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
|
||||||
|
#endif
|
||||||
|
|
||||||
#define FEATURE_SLICE_SIZE 16
|
#define FEATURE_SLICE_SIZE 16
|
||||||
|
|
||||||
|
#if X_BLOCK_SIZE > 1
|
||||||
|
# define UNIT_BLOCK_READ_VEC(ptr, offset) CAT(UNIT_BLOCK_READ, X_BLOCK_SIZE)(ptr, offset)
|
||||||
|
# define UNIT_BLOCK_WRITE_VEC(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, X_BLOCK_SIZE)(ptr, offset, val)
|
||||||
|
#endif
|
||||||
|
|
||||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
|
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
|
||||||
KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||||
__global INPUT0_TYPE* input,
|
__global INPUT0_TYPE* input,
|
||||||
__global OUTPUT_TYPE* output,
|
__global OUTPUT_TYPE* output,
|
||||||
@ -37,16 +45,21 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
|||||||
FUSED_OPS_DECLS,
|
FUSED_OPS_DECLS,
|
||||||
#endif
|
#endif
|
||||||
uint split_idx) {
|
uint split_idx) {
|
||||||
const int xy = get_global_id(0);
|
#if X_BLOCK_SIZE > 1
|
||||||
const int f_block = get_group_id(1);
|
const uint xy = (int)get_global_id(0);
|
||||||
const int b = get_global_id(2);
|
const uint x = (xy * X_BLOCK_SIZE) % OUTPUT_SIZE_X;
|
||||||
const int lid = get_sub_group_local_id();
|
const uint y = (xy * X_BLOCK_SIZE) / OUTPUT_SIZE_X;
|
||||||
|
|
||||||
const int x = (xy * X_BLOCK_SIZE) % OUTPUT_SIZE_X;
|
const uint input_x = x;
|
||||||
const int y = (xy * X_BLOCK_SIZE) / OUTPUT_SIZE_X;
|
const uint input_y = y;
|
||||||
|
#endif
|
||||||
|
const uint b = (int)get_global_id(2);
|
||||||
|
const uint sglid = (int)get_sub_group_local_id();
|
||||||
|
|
||||||
const int input_x = x;
|
const uint lid1 = (int)get_local_id(1);
|
||||||
const int input_y = y;
|
const uint feature_per_wg = (int)get_local_size(1) / SLM_DIV_FACTOR;
|
||||||
|
const uint feature_sub_block = lid1 / feature_per_wg;
|
||||||
|
const uint feature_block = (int)get_group_id(1);
|
||||||
|
|
||||||
// Input offset calculations:
|
// Input offset calculations:
|
||||||
const uint input_x_pitch = FEATURE_SLICE_SIZE;
|
const uint input_x_pitch = FEATURE_SLICE_SIZE;
|
||||||
@ -71,8 +84,8 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
|||||||
|
|
||||||
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
|
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
|
||||||
|
|
||||||
const uint output_offset = b * output_b_pitch +
|
const uint output_offset = b * output_b_pitch +
|
||||||
(f_block + output_fs_pad_before) * output_fs_pitch +
|
(feature_block + output_fs_pad_before) * output_fs_pitch +
|
||||||
(OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
(OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
||||||
(OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
|
(OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
|
||||||
|
|
||||||
@ -83,71 +96,91 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
|||||||
const uint filter_is_pitch = filter_y_pitch * FILTER_SIZE_Y;
|
const uint filter_is_pitch = filter_y_pitch * FILTER_SIZE_Y;
|
||||||
const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
|
const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
|
||||||
|
|
||||||
const uint filter_offset = f_block * filter_os_pitch;
|
const uint filter_offset = feature_block * filter_os_pitch;
|
||||||
|
|
||||||
|
#if X_BLOCK_SIZE > 1
|
||||||
typedef MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE) vec_t;
|
typedef MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE) vec_t;
|
||||||
|
|
||||||
|
|
||||||
#if BIAS_TERM
|
|
||||||
vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
|
|
||||||
#else
|
#else
|
||||||
vec_t dst = UNIT_VAL_ZERO;
|
typedef UNIT_TYPE vec_t;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (uint k = 0; k < IC_BLOCKS; ++k)
|
#if BIAS_TERM
|
||||||
|
#if SLM_DIV_FACTOR == 1
|
||||||
|
vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||||
|
#else
|
||||||
|
vec_t dst;
|
||||||
|
|
||||||
|
if (feature_sub_block == 0) {
|
||||||
|
dst = (vec_t)(UNIT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||||
|
} else {
|
||||||
|
dst = UNIT_VAL_ZERO;
|
||||||
|
}
|
||||||
|
#endif // SLM_DIV_FACTOR == 1
|
||||||
|
#else
|
||||||
|
vec_t dst = UNIT_VAL_ZERO;
|
||||||
|
#endif // BIAS_TERM
|
||||||
|
|
||||||
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
__local vec_t partial_summ[WORK_GROUP_SIZE];
|
||||||
|
|
||||||
|
for (uint k = feature_sub_block * IC_BLOCKS / SLM_DIV_FACTOR; k < (feature_sub_block + 1) * IC_BLOCKS / SLM_DIV_FACTOR; k++)
|
||||||
{
|
{
|
||||||
|
#else
|
||||||
|
for (uint k = 0; k < IC_BLOCKS; k++)
|
||||||
|
{
|
||||||
|
#endif // SLM_DIV_FACTOR > 1
|
||||||
vec_t src = 0;
|
vec_t src = 0;
|
||||||
#if INPUT_LEFTOVERS
|
#if INPUT_LEFTOVERS
|
||||||
if ((k+1)*FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)
|
if ((k + 1) * FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)
|
||||||
{
|
{
|
||||||
if (k*FEATURE_SLICE_SIZE + lid < INPUT0_FEATURE_NUM)
|
if (k * FEATURE_SLICE_SIZE + sglid < INPUT0_FEATURE_NUM)
|
||||||
{
|
{
|
||||||
|
#if X_BLOCK_SIZE > 1
|
||||||
__attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
|
__attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
|
||||||
for (int i = 0; i < X_BLOCK_SIZE; i++)
|
for (int i = 0; i < X_BLOCK_SIZE; i++)
|
||||||
{
|
{
|
||||||
const uint xb = (x + i) % INPUT0_SIZE_X;
|
const uint xb = (x + i) % INPUT0_SIZE_X;
|
||||||
const uint yb = y + (x + i) / INPUT0_SIZE_X;
|
const uint yb = y + (x + i) / INPUT0_SIZE_X;
|
||||||
const uint input_idx = input_offset + k * input_fs_pitch +
|
const uint input_idx = input_offset + k * input_fs_pitch + yb * input_y_pitch + xb * input_x_pitch;
|
||||||
yb * input_y_pitch +
|
|
||||||
xb * input_x_pitch;
|
src[i] = input[input_idx + sglid];
|
||||||
src[i] = input[input_idx + lid];
|
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
src = input[input_offset + k * input_fs_pitch + sglid];
|
||||||
|
#endif // X_BLOCK_SIZE > 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif // INPUT_LEFTOVERS
|
||||||
{
|
{
|
||||||
#if PADDED_INPUT
|
#if PADDED_INPUT
|
||||||
|
#if X_BLOCK_SIZE > 1
|
||||||
__attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
|
__attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
|
||||||
for (int i = 0; i < X_BLOCK_SIZE; i++)
|
for (int i = 0; i < X_BLOCK_SIZE; i++)
|
||||||
{
|
{
|
||||||
const uint xb = (x + i) % INPUT0_SIZE_X;
|
const uint xb = (x + i) % INPUT0_SIZE_X;
|
||||||
const uint yb = y + (x + i) / INPUT0_SIZE_X;
|
const uint yb = y + (x + i) / INPUT0_SIZE_X;
|
||||||
const uint input_idx = input_offset + k * input_fs_pitch +
|
const uint input_idx = input_offset + k * input_fs_pitch + yb * input_y_pitch + xb * input_x_pitch;
|
||||||
yb * input_y_pitch +
|
|
||||||
xb * input_x_pitch;
|
|
||||||
src[i] = UNIT_BLOCK_READ(input, input_idx);
|
src[i] = UNIT_BLOCK_READ(input, input_idx);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
#if X_BLOCK_SIZE == 8
|
src = UNIT_BLOCK_READ(input, input_offset + k * input_fs_pitch);
|
||||||
src = UNIT_BLOCK_READ8(input, input_offset + k * input_fs_pitch +
|
#endif // X_BLOCK_SIZE > 1
|
||||||
input_y * input_y_pitch +
|
|
||||||
input_x * input_x_pitch);
|
|
||||||
#elif X_BLOCK_SIZE == 4
|
|
||||||
src = UNIT_BLOCK_READ4(input, input_offset + k * input_fs_pitch +
|
|
||||||
input_y * input_y_pitch +
|
|
||||||
input_x * input_x_pitch);
|
|
||||||
#elif X_BLOCK_SIZE == 2
|
|
||||||
src = UNIT_BLOCK_READ2(input, input_offset + k * input_fs_pitch +
|
|
||||||
input_y * input_y_pitch +
|
|
||||||
input_x * input_x_pitch);
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
|
#else // PADDED_INPUT
|
||||||
|
|
||||||
|
#if X_BLOCK_SIZE > 1
|
||||||
|
src = UNIT_BLOCK_READ_VEC(input, input_offset + k * input_fs_pitch + input_y * input_y_pitch + input_x * input_x_pitch);
|
||||||
|
#else
|
||||||
|
src = UNIT_BLOCK_READ(input, input_offset + k * input_fs_pitch);
|
||||||
|
#endif // X_BLOCK_SIZE > 1
|
||||||
|
#endif // PADDED_INPUT
|
||||||
|
}
|
||||||
|
|
||||||
UNIT_TYPE8 wei0 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch);
|
UNIT_TYPE8 wei0 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch);
|
||||||
UNIT_TYPE8 wei1 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch + 8 * filter_isv_pitch);
|
UNIT_TYPE8 wei1 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch + 8 * filter_isv_pitch);
|
||||||
|
|
||||||
const vec_t src0 = GET_SRC(src, 0);
|
const vec_t src0 = GET_SRC(src, 0);
|
||||||
const vec_t src1 = GET_SRC(src, 1);
|
const vec_t src1 = GET_SRC(src, 1);
|
||||||
const vec_t src2 = GET_SRC(src, 2);
|
const vec_t src2 = GET_SRC(src, 2);
|
||||||
@ -183,29 +216,52 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
|||||||
dst = mad(wei1.s7, src15, dst);
|
dst = mad(wei1.s7, src15, dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
partial_summ[lid1] = dst;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (feature_sub_block == 0) {
|
||||||
|
__attribute__((opencl_unroll_hint))
|
||||||
|
for (int i = 1; i < SLM_DIV_FACTOR; i++)
|
||||||
|
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
|
||||||
|
#endif // SLM_DIV_FACTOR > 1
|
||||||
|
|
||||||
dst = ACTIVATION(dst, ACTIVATION_PARAMS);
|
dst = ACTIVATION(dst, ACTIVATION_PARAMS);
|
||||||
|
|
||||||
#if OUTPUT_LEFTOVERS
|
#if OUTPUT_LEFTOVERS
|
||||||
if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM)
|
if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM)
|
||||||
{
|
{
|
||||||
|
#if X_BLOCK_SIZE > 1
|
||||||
for (int i = 0; i < X_BLOCK_SIZE; i++) {
|
for (int i = 0; i < X_BLOCK_SIZE; i++) {
|
||||||
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y ||
|
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || feature_block * FEATURE_SLICE_SIZE + sglid >= OUTPUT_FEATURE_NUM)
|
||||||
f_block*FEATURE_SLICE_SIZE + lid >= OUTPUT_FEATURE_NUM)
|
|
||||||
return;
|
return;
|
||||||
|
|
||||||
int xi = (x+i) % OUTPUT_SIZE_X;
|
int xi = (x + i) % OUTPUT_SIZE_X;
|
||||||
int yi = y + ((x+i) / OUTPUT_SIZE_X);
|
int yi = y + ((x + i) / OUTPUT_SIZE_X);
|
||||||
|
|
||||||
#if HAS_FUSED_OPS
|
#if HAS_FUSED_OPS
|
||||||
FUSED_OPS_SCALAR;
|
FUSED_OPS_SCALAR;
|
||||||
dst[i] = FUSED_OPS_RESULT_SCALAR;
|
dst[i] = FUSED_OPS_RESULT_SCALAR;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
output[output_offset + yi * output_y_pitch + xi * output_x_pitch + lid] = dst[i];
|
output[output_offset + yi * output_y_pitch + xi * output_x_pitch + sglid] = dst[i];
|
||||||
}
|
}
|
||||||
|
#else // X_BLOCK_SIZE > 1
|
||||||
|
if (feature_block * FEATURE_SLICE_SIZE + sglid >= OUTPUT_FEATURE_NUM)
|
||||||
|
return;
|
||||||
|
|
||||||
|
#if HAS_FUSED_OPS
|
||||||
|
FUSED_OPS_SCALAR_B1;
|
||||||
|
dst = FUSED_OPS_RESULT_SCALAR_B1;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
output[output_offset + sglid] = dst;
|
||||||
|
#endif // X_BLOCK_SIZE > 1
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif // OUTPUT_LEFTOVERS
|
||||||
|
|
||||||
|
#if X_BLOCK_SIZE > 1
|
||||||
{
|
{
|
||||||
#if !PADDED_OUTPUT && !NON_UNIT_FUSED_OP_SPATIAL
|
#if !PADDED_OUTPUT && !NON_UNIT_FUSED_OP_SPATIAL
|
||||||
if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
|
if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
|
||||||
@ -216,20 +272,14 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
|||||||
FUSED_OPS_VEC;
|
FUSED_OPS_VEC;
|
||||||
dst = FUSED_OPS_RESULT_VEC;
|
dst = FUSED_OPS_RESULT_VEC;
|
||||||
#endif
|
#endif
|
||||||
#if X_BLOCK_SIZE == 8
|
UNIT_BLOCK_WRITE_VEC(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
|
||||||
UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
|
|
||||||
#elif X_BLOCK_SIZE == 4
|
|
||||||
UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
|
|
||||||
#elif X_BLOCK_SIZE == 2
|
|
||||||
UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
|
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < X_BLOCK_SIZE; i++) {
|
for (int i = 0; i < X_BLOCK_SIZE; i++) {
|
||||||
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
|
if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
int xi = (x+i) % OUTPUT_SIZE_X;
|
int xi = (x + i) % OUTPUT_SIZE_X;
|
||||||
int yi = y + ((x+i) / OUTPUT_SIZE_X);
|
int yi = y + ((x + i) / OUTPUT_SIZE_X);
|
||||||
|
|
||||||
#if HAS_FUSED_OPS
|
#if HAS_FUSED_OPS
|
||||||
FUSED_OPS_SCALAR;
|
FUSED_OPS_SCALAR;
|
||||||
@ -240,7 +290,22 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else // X_BLOCK_SIZE > 1
|
||||||
|
{
|
||||||
|
#if HAS_FUSED_OPS
|
||||||
|
FUSED_OPS_SCALAR_B1;
|
||||||
|
dst = FUSED_OPS_RESULT_SCALAR_B1;
|
||||||
|
#endif
|
||||||
|
UNIT_BLOCK_WRITE(output, output_offset, dst);
|
||||||
|
}
|
||||||
|
#endif // X_BLOCK_SIZE > 1
|
||||||
|
|
||||||
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef GET_SRC
|
#undef GET_SRC
|
||||||
#undef FEATURE_SLICE_SIZE
|
#undef FEATURE_SLICE_SIZE
|
||||||
|
#undef UNIT_BLOCK_READ_VEC
|
||||||
|
#undef UNIT_BLOCK_WRITE_VEC
|
||||||
|
Loading…
Reference in New Issue
Block a user