[IE CLDNN] Optimize 1x1 imad convolution kernel (#757)

This commit is contained in:
Konrad Dobros
2020-06-08 15:44:50 +02:00
committed by GitHub
parent 626bc4f3d4
commit d155483573
8 changed files with 633 additions and 259 deletions

View File

@@ -73,6 +73,8 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
{ WeightsLayout::os_i_osv16__ai8, { -1, -1, -1, 0, 1, -1, -1, -1 } },
{ WeightsLayout::os_i_osv16, { -1, -1, -1, 0, 1, -1, -1, -1 } },
{ WeightsLayout::os_is_yx_osv16_isv16, { 0, 1, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::os_is_zyx_osv32_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::os_is_zyx_osv64_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::i_yxs_os_yxsv2_osv16, { 1, 2, -1, 3, 0, -1, -1, -1 } },
{ WeightsLayout::iy_xs_os_xsv2_osv16__ao32, { 1, 2, -1, 3, 0, -1, -1, -1 } },
{ WeightsLayout::iy_xs_os_xsv2_osv8__ao32, { 1, 2, -1, 3, 0, -1, -1, -1 } },
@@ -633,6 +635,14 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
newDims[2] = RoundUp(newDims[2], 16);
newDims[3] = RoundUp(newDims[3], 16);
break;
case os_is_zyx_osv32_isv16:
newDims[3] = RoundUp(newDims[3], 16);
newDims[4] = RoundUp(newDims[4], 32);
break;
case os_is_zyx_osv64_isv16:
newDims[3] = RoundUp(newDims[3], 16);
newDims[4] = RoundUp(newDims[4], 64);
break;
case gs_oi_yxs_gsv16_yxsv4:
newDims[4] = RoundUp(newDims[4], 16);
break;

View File

@@ -91,6 +91,8 @@ enum WeightsLayout {
os_i_osv16__ai8,
os_i_osv16,
os_is_yx_osv16_isv16, // wieghts for int8 blocked conv
os_is_zyx_osv32_isv16,
os_is_zyx_osv64_isv16,
i_yxs_os_yxsv2_osv16,
iy_xs_os_xsv2_osv16__ao32,
iy_xs_os_xsv2_osv8__ao32,

View File

@@ -19,66 +19,30 @@
#include <vector>
#include <iostream>
#include <algorithm>
#include <string>
//
// Kernel specific constants
//
#define SIMD_SIZE 16
static constexpr size_t fsv = 16;
static constexpr size_t simd = 16;
namespace kernel_selector {
namespace {
size_t getOutBlock_X(size_t output_size_x, size_t stride_x) {
size_t output_block_width = 0;
size_t max_block_size = std::min((SIMD_SIZE - 1) / stride_x + 1, output_size_x);
if (output_size_x <= max_block_size)
return output_size_x;
for (size_t block = 4; block <= max_block_size; ++block) {
if (output_size_x % block == 0)
output_block_width = block;
}
if (output_block_width == 0 && output_size_x < max_block_size * 3) {
size_t min_overhang = max_block_size;
for (size_t block = 4; block <= max_block_size; ++block) {
size_t overhang = block - output_size_x % block;
if (overhang <= min_overhang) {
min_overhang = overhang;
output_block_width = block;
}
}
}
if (output_block_width == 0) {
output_block_width = max_block_size;
}
return output_block_width;
}
bool should_k_slice(const convolution_params& params, size_t output_block_width) {
constexpr float preferred_eu_occupancy = 5.f;
if (params.inputs[0].Feature().v % (16 * 4) != 0)
return false;
size_t eu_count = params.engineInfo.computeUnitsCount;
auto global_size = CeilDiv(params.output.X().v, output_block_width) *
params.output.Y().v *
params.output.Batch().v * Align(CeilDiv(params.output.Feature().v, 2), SIMD_SIZE);
auto threads = global_size / SIMD_SIZE;
auto optimal_threads_num = eu_count * preferred_eu_occupancy;
return threads < optimal_threads_num;
}
} // namespace
Convolution_kernel_b_fs_yx_fsv16_imad_1x1::Convolution_kernel_b_fs_yx_fsv16_imad_1x1()
: ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad_1x1") {
for (size_t bw = 1; bw <= SIMD_SIZE; ++bw) {
for (auto exe : ConvolutionKernelBase::autoTuneOptions) {
all_tune_params.push_back(AutoTuneParams{ bw, true, exe });
all_tune_params.push_back(AutoTuneParams{ bw, false, exe });
constexpr size_t max_block_elements = 32;
for (size_t bs = 1; bs <= 2 * simd; ++bs) {
for (size_t bf = 1; bf <= 4; ++bf) {
if (bs * bf > max_block_elements)
continue;
for (size_t split = 1; split <= 8; ++split) {
if (bf > split)
continue;
for (auto exe : ConvolutionKernelBase::autoTuneOptions) {
all_tune_params.push_back(AutoTuneParams{ bs, bf, split, exe });
}
}
}
}
}
@@ -94,6 +58,7 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetSupportedKey() const {
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputWeightsType(WeightsType::UINT8);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
@@ -106,19 +71,33 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetSupportedKey() const {
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}
JitConstants Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetJitConstants(const convolution_params& params,
const DispatchData& kd) const {
auto mem_consts = Parent::GetJitConstants(params, kd);
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", kd.cldnnStyle.blockWidth));
mem_consts.AddConstant(MakeJitConstant("FEATURE_LWS_SPLIT", kd.cldnnStyle.prefetch));
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_SPATIAL", kd.cldnnStyle.blockWidth));
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_FEATURES", kd.cldnnStyle.blockHeight));
mem_consts.AddConstant(MakeJitConstant("FEATURE_SLM_SPLIT", kd.cldnnStyle.prefetch));
mem_consts.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
mem_consts.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_scalar = {"", {"out_b", "out_f + out_f_offset", "out_y", "out_x + i"}, "dequantized", input_dt, 1 };
conf_scalar.SetLoopAxes({ Tensor::DataChannelName::X }, true);
std::vector<std::string> idx_order = { "out_b",
"(out_f + ofb * SIMD)",
"intel_sub_group_shuffle(out_y_shuffle[os / SIMD], os % SIMD)",
"intel_sub_group_shuffle(out_x_shuffle[os / SIMD], os % SIMD)" };
FusedOpsConfiguration conf_scalar = {"_SCALAR",
idx_order,
"dequantized[ofb][os]",
input_dt,
1,
LoadType::LT_UNALIGNED,
BoundaryCheck::DISABLED };
conf_scalar.SetLoopAxes({ Tensor::DataChannelName::X, Tensor::DataChannelName::Y }, true);
mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
}
@@ -130,24 +109,62 @@ ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::S
DispatchData kd;
const auto& output = params.output;
auto tune_params = GetAutoTuneParams(params, index);
size_t k_slices = tune_params.k_slicing ? 4 : 1;
size_t k_slices = tune_params.feature_slm_split;
kd.gws0 = CeilDiv(output.X().v, tune_params.out_block_width);
kd.gws1 = output.Y().v;
kd.gws2 = output.Batch().v * Align(CeilDiv(output.Feature().v, 2), SIMD_SIZE) * k_slices;
kd.gws0 = CeilDiv(output.X().v * output.Y().v, tune_params.out_block_spatial);
kd.gws1 = CeilDiv(output.Feature().v, tune_params.out_block_features * simd) * simd * k_slices;
kd.gws2 = output.Batch().v;
kd.lws0 = 1;
kd.lws1 = 1;
kd.lws2 = SIMD_SIZE * k_slices;
kd.lws1 = simd * k_slices;
kd.lws2 = 1;
kd.cldnnStyle = {0, 0, 0, 0, 0};
kd.gemmStyle = {0, 0, 0, 0, 0, 0};
kd.cldnnStyle.blockWidth = tune_params.out_block_width;
kd.cldnnStyle.blockWidth = tune_params.out_block_spatial;
kd.cldnnStyle.blockHeight = tune_params.out_block_features;
kd.cldnnStyle.prefetch = k_slices;
kd.efficiency = FORCE_PRIORITY_2;
auto in_f = params.weights.IFM().v;
auto out_f = params.weights.OFM().v;
auto batch = output.Batch().v;
auto out_x = output.X().v;
auto out_y = output.Y().v;
bool x_strided = params.stride.x != 1;
bool general_is_faster = false;
// This kernel cannot split for large x, but general could
general_is_faster |= CeilDiv(in_f, fsv) % 4 == 0
&& (out_x % 15 == 0 || out_x % 16 == 0)
&& tune_params.feature_slm_split == 1
&& tune_params.out_block_spatial <= 8;
// List of known cases where general kernel is better
general_is_faster |= in_f == 24 && out_f == 144 && out_x == 75 && out_y == 75 && batch == 1;
general_is_faster |= in_f == 192 && out_f == 64 && out_x == 28 && out_y == 28 && batch == 1;
general_is_faster |= in_f == 576 && out_f == 96 && out_x == 19 && out_y == 19 && batch == 1;
general_is_faster |= in_f == 384 && out_f == 96 && out_x == 19 && out_y == 19 && batch == 1;
general_is_faster |= in_f == 384 && out_f == 64 && out_x == 19 && out_y == 19 && batch == 1;
general_is_faster |= in_f == 192 && out_f == 64 && out_x == 19 && out_y == 19 && batch == 1;
general_is_faster |= in_f == 96 && out_f == 576 && out_x == 19 && out_y == 19 && batch == 1;
general_is_faster |= in_f == 1024 && out_f == 256 && out_x == 14 && out_y == 14 && batch == 1;
general_is_faster |= in_f == 256 && out_f == 256 && out_x == 14 && out_y == 14 && batch == 1;
general_is_faster |= in_f == 136 && out_f == 816 && out_x == 14 && out_y == 14 && batch == 1;
general_is_faster |= in_f == 1280 && out_f == 256 && out_x == 10 && out_y == 10 && batch == 1;
general_is_faster |= in_f == 256 && out_f == 128 && out_x == 3 && out_y == 3 && batch == 1;
if (general_is_faster && !x_strided) {
kd.efficiency = FORCE_PRIORITY_3;
}
// Better to use kernel with 4 input features in a loop
if (static_cast<float>(params.weights.IFM().v) / static_cast<float>(Align(params.weights.IFM().v, fsv)) < 0.5f)
kd.efficiency = FORCE_PRIORITY_4;
return kd;
} // SetDefault
@@ -165,40 +182,169 @@ bool Convolution_kernel_b_fs_yx_fsv16_imad_1x1::Validate(const Params& params, c
return false;
}
if ((newParams.stride.x != newParams.stride.y) ||
(newParams.stride.x != 1 && newParams.stride.x != 2)) {
// Strides must be 1x1 or 2x2
return false;
}
if (newParams.groups != 1 || newParams.split != 1)
return false;
return true;
}
WeightsLayout Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetPreferredWeightsLayout(const convolution_params& params) const {
// TODO Auto tune index is needed in GetPreferredWeightsLayout to select correct weights layout
auto tparams = GetAutoTuneParams(params, -1);
if (tparams.out_block_features == 2)
return WeightsLayout::os_is_zyx_osv32_isv16;
if (tparams.out_block_features == 4)
return WeightsLayout::os_is_zyx_osv64_isv16;
return WeightsLayout::os_is_yx_osv16_isv16;
}
Convolution_kernel_b_fs_yx_fsv16_imad_1x1::AutoTuneParams
Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetAutoTuneParams(const convolution_params& params, int index) const {
if (index >= 0 && index < static_cast<int>(all_tune_params.size())) {
return all_tune_params[index];
}
AutoTuneParams default_params;
default_params.out_block_width = getOutBlock_X(params.output.X().v, params.stride.x);
default_params.k_slicing = should_k_slice(params, default_params.out_block_width);
default_params.exe_mode = DEFAULT;
return default_params;
size_t block_spatial = 1;
size_t block_features = 1;
size_t feature_slm_split = 1;
std::string exe_mode = DEFAULT;
size_t total_spatial = params.output.X().v * params.output.Y().v;
// Try two features per work-item
if (params.output.Feature().v % 32 == 0 || params.output.Feature().v > 32 * 2)
block_features = 2;
// Non strict inequality here leads to some regressions, ie: [1, 64, 19, 19] (*) [384, 64, 1, 1]
bool can_split = params.weights.IFM().v > 4 * fsv;
// Select block size in spatial dimension
{
size_t max_spatial = std::min(2 * simd / block_features, total_spatial);
size_t min_efficient_spatial = 8;
if (max_spatial <= min_efficient_spatial) {
block_spatial = max_spatial;
} else {
auto minimum_params = AutoTuneParams{ min_efficient_spatial, block_features, 1, exe_mode };
bool preserve_occupancy = EstimateOccupancy(params, minimum_params) >= 1.f;
size_t min_overhang = max_spatial;
size_t best_block = min_efficient_spatial;
bool block_write_found = false;
bool output_pad = params.output.X().pad.Total() != 0;
for (size_t block = min_efficient_spatial; block <= max_spatial; ++block) {
bool c_occupancy = EstimateOccupancy(params, { block, block_features, 1, exe_mode }) >= 1.f;
auto overhang = Align(total_spatial, block) - total_spatial;
bool c_block_write = (overhang == 0 && !output_pad) || params.output.X().v % block == 0;
// Kernel work-around for spills/inefficient loop order
if (can_split && !c_occupancy && block > 14 && block_features > 1)
break;
if (preserve_occupancy && !c_occupancy)
break;
if (overhang <= min_overhang && (!block_write_found || c_block_write)) {
best_block = block;
min_overhang = overhang;
block_write_found = c_block_write;
}
}
block_spatial = best_block;
}
}
// Try to split features using slm to increase occupancy
{
auto dummy_params = AutoTuneParams{ block_spatial, block_features, 1, exe_mode };
bool enough_occupancy = EstimateOccupancy(params, dummy_params) >= 1.f;
if (!enough_occupancy && can_split) {
std::vector<size_t> check_split = { 4 };
size_t ifm_blocks = CeilDiv(params.weights.IFM().v, fsv);
for (auto split : check_split) {
if (split > ifm_blocks)
break;
auto tmp_tune = AutoTuneParams{ block_spatial, block_features, split, exe_mode };
bool c_lws = split * simd <= params.engineInfo.maxWorkGroupSize;
bool c_slm = EstimateSLMUsage(params, tmp_tune) <= 1.f;
bool c_fb = block_features <= split;
bool c_occupancy = EstimateOccupancy(params, tmp_tune) >= 1.f;
if (c_lws && c_slm && c_fb) {
feature_slm_split = split;
}
// Increasing split will only increase memory and work-group size, don't check bigger split
if (!c_slm || !c_lws || c_occupancy)
break;
}
}
}
// Occupancy is still extremely low, try to decrease spatials
{
auto dummy_params = AutoTuneParams{ block_spatial, block_features, feature_slm_split, exe_mode };
constexpr float default_threshold = 5.f / 7.f;
constexpr float split_threshold = 4.f / 7.f;
float threshold_occupancy = feature_slm_split == 1 ? default_threshold : split_threshold;
if (EstimateOccupancy(params, dummy_params) < threshold_occupancy && block_spatial != 1) {
for (size_t block = block_spatial - 1; block >= 4; --block) {
auto tmp_params = AutoTuneParams{ block, block_features, feature_slm_split, exe_mode };
bool c_mul = total_spatial % block == 0;
bool c_occupancy = EstimateOccupancy(params, tmp_params) >= threshold_occupancy;
if (c_mul) {
block_spatial = block;
if (c_occupancy)
break;
}
}
}
}
return AutoTuneParams{ block_spatial, block_features, feature_slm_split, exe_mode };
}
float Convolution_kernel_b_fs_yx_fsv16_imad_1x1::EstimateOccupancy(const convolution_params& params, const AutoTuneParams& tparams) const {
size_t blocks_s = CeilDiv(params.output.X().v * params.output.Y().v, tparams.out_block_spatial);
size_t blocks_f = CeilDiv(params.output.Feature().v, tparams.out_block_features * simd) * tparams.feature_slm_split;
size_t block_b = params.output.Batch().v;
auto threads = blocks_s * blocks_f * block_b;
constexpr size_t max_threads_per_cu = 7;
size_t compute_units = params.engineInfo.computeUnitsCount;
size_t max_threads = compute_units * max_threads_per_cu;
return static_cast<float>(threads) / static_cast<float>(max_threads);
}
float Convolution_kernel_b_fs_yx_fsv16_imad_1x1::EstimateSLMUsage(const convolution_params& params, const AutoTuneParams& tparams) const {
size_t slm_elements = tparams.out_block_spatial * tparams.out_block_features * fsv * (tparams.feature_slm_split - 1);
size_t slm_bytes = slm_elements * BytesPerElement(GetAccumulatorType(params));
// TODO Actual maximum slm should also depend on number of work-groups, but this is device specific
size_t max_slm_bytes = params.engineInfo.maxLocalMemSize;
return static_cast<float>(slm_bytes) / static_cast<float>(max_slm_bytes);
}
bool Convolution_kernel_b_fs_yx_fsv16_imad_1x1::ValidateAutoTuneParams(const convolution_params& params,
const AutoTuneParams& tune_params) const {
if (tune_params.k_slicing && params.inputs[0].Feature().v % (16 * 4) != 0)
return false;
bool c_ifm = CeilDiv(params.weights.IFM().v, fsv) >= tune_params.feature_slm_split;
bool c_slm = EstimateSLMUsage(params, tune_params) <= 1.f;
bool c_lws = tune_params.feature_slm_split * simd <= params.engineInfo.maxWorkGroupSize;
size_t max_block_size = std::min(static_cast<size_t>((SIMD_SIZE - 1) / params.stride.x + 1), params.output.X().v);
if (tune_params.out_block_width > max_block_size)
return false;
// Work-around for lack of actual AutoTuneParams in GetPreferredWeightsLayout
auto default_params = GetAutoTuneParams(params, -1);
bool c_wa_fb = default_params.out_block_features == tune_params.out_block_features;
return true;
return c_ifm && c_slm && c_lws && c_wa_fb;
}
KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetKernelsData(const Params& params,

View File

@@ -38,9 +38,7 @@ protected:
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
return WeightsLayout::os_is_yx_osv16_isv16;
}
WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ELTWISE,
@@ -50,13 +48,17 @@ protected:
}
struct AutoTuneParams {
size_t out_block_width;
bool k_slicing;
size_t out_block_spatial;
size_t out_block_features;
size_t feature_slm_split;
std::string exe_mode;
};
std::vector<AutoTuneParams> all_tune_params;
bool ValidateAutoTuneParams(const convolution_params& params, const AutoTuneParams& tune_params) const;
AutoTuneParams GetAutoTuneParams(const convolution_params& params, int index) const;
float EstimateOccupancy(const convolution_params& params, const AutoTuneParams& tune) const;
float EstimateSLMUsage(const convolution_params& params, const AutoTuneParams& tune) const;
};
} // namespace kernel_selector

View File

@@ -17,28 +17,38 @@
#include "include/fetch.cl"
#include "include/imad.cl"
#include "include/mmad.cl"
#include "include/data_types.cl"
#define FSV 16
#define SIMD 16
#if FILTER_LAYOUT_OS_IS_YX_OSV16_ISV16
# define GET_WEIGHTS_INDEX(o, i, z, y, x) GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, o, i, y, x)
# define WEIGHTS_FEATURE_BLOCK_PITCH (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FSV)
# define WEIGHTS_IS_PITCH (FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y)
#elif FILTER_LAYOUT_OS_IS_ZYX_OSV32_ISV16
# define GET_WEIGHTS_INDEX(o, i, z, y, x) GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(FILTER, o, i, z, y, x)
# define WEIGHTS_FEATURE_BLOCK_PITCH (FSV * FSV)
# define WEIGHTS_IS_PITCH (2 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)
#elif FILTER_LAYOUT_OS_IS_ZYX_OSV64_ISV16
# define GET_WEIGHTS_INDEX(o, i, z, y, x) GET_FILTER_OS_IS_ZYX_OSV64_ISV16_INDEX(FILTER, o, i, z, y, x)
# define WEIGHTS_FEATURE_BLOCK_PITCH (FSV * FSV)
# define WEIGHTS_IS_PITCH (4 * FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z)
#if QUANTIZATION_TERM
#define ACCUMULATOR_TYPE int
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
#define ACTIVATION_TYPE float
#define TO_ACTIVATION_TYPE(x) convert_float(x)
#else
#define ACCUMULATOR_TYPE INPUT0_TYPE
#define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
#define ACTIVATION_TYPE INPUT0_TYPE
#define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
#endif
#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
__attribute__((intel_reqd_sub_group_size(16)))
__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(1, SIMD * FEATURE_SLM_SPLIT, 1)))
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
const __global INPUT0_TYPE *conv_input,
__global OUTPUT_TYPE *output,
@@ -51,182 +61,343 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#endif
uint split_idx)
{
#define LUT_VALUE_CLAMP(x) ((x) < (OUT_BLOCK_WIDTH - 1) * STRIDE_SIZE_X + 1 ? (x) : 0)
const int tmp[16] = {
LUT_VALUE_CLAMP(0),
LUT_VALUE_CLAMP(1),
LUT_VALUE_CLAMP(2),
LUT_VALUE_CLAMP(3),
LUT_VALUE_CLAMP(4),
LUT_VALUE_CLAMP(5),
LUT_VALUE_CLAMP(6),
LUT_VALUE_CLAMP(7),
LUT_VALUE_CLAMP(8),
LUT_VALUE_CLAMP(9),
LUT_VALUE_CLAMP(10),
LUT_VALUE_CLAMP(11),
LUT_VALUE_CLAMP(12),
LUT_VALUE_CLAMP(13),
LUT_VALUE_CLAMP(14),
LUT_VALUE_CLAMP(15)
};
#undef LUT_VALUE_CLAMP
// Use group ids to ease sub-group uniform variables optimization for compiler
const uint out_yx_sg = (uint)get_group_id(0) * OUT_BLOCK_SPATIAL;
uint out_fg = (uint)get_group_id(1) * OUT_BLOCK_FEATURES * SIMD;
const uint out_b = (uint)get_group_id(2);
uint out_f = out_fg + get_sub_group_local_id();
#if FEATURE_LWS_SPLIT != 1
const uint subgroup_id = get_sub_group_id();
#else
const uint subgroup_id = 0;
#endif
const uint subgroup_local_id = get_sub_group_local_id();
const uint sglid = get_sub_group_local_id();
const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
const uint out_y = get_global_id(1);
const uint out_b = (uint)(get_group_id(2) * 32) / ALIGN(OUTPUT_FEATURE_NUM, 32);
const uint out_fg = (uint)(get_group_id(2) * 32) % ALIGN(OUTPUT_FEATURE_NUM, 32);
const uint out_f = out_fg + subgroup_local_id;
uint out_x_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
uint out_y_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
const uint feature_offset = subgroup_id * INPUT0_FEATURE_NUM / FEATURE_LWS_SPLIT;
ACCUMULATOR_TYPE dotProd[OUT_BLOCK_WIDTH * 2] = { 0 };
const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, feature_offset, 0, 0);
uint filter_idx2 = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + 16, feature_offset, 0, 0);
__attribute__((opencl_unroll_hint(1)))
for(uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16)/FEATURE_LWS_SPLIT; k++ ) {
uint4 weights_val = vload4(0, (__global uint*)(weights + filter_idx));
uint4 weights_val2 = vload4(0, (__global uint *)(weights + filter_idx2));
uint input_idx = GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, out_b, feature_offset + k * 16, input_y, input_x + tmp[get_sub_group_local_id()]);
uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
const uint ow_offset = ow + OUT_BLOCK_WIDTH;
dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X)), as_char4(weights_val.s0)));
dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X)), as_char4(weights_val.s1)));
dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X)), as_char4(weights_val.s2)));
dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X)), as_char4(weights_val.s3)));
dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s0)));
dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s1)));
dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s2)));
dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s3)));
}
filter_idx += 16 * 16;
filter_idx2 += 16 * 16;
const uint max_out_yx = OUTPUT_SIZE_X * OUTPUT_SIZE_Y;
uint max_local_yx = min(max_out_yx, out_yx_sg + OUT_BLOCK_SPATIAL);
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
uint out_yx_shuffle = out_yx_sg + sglid + os * SIMD;
uint out_yx_clamp = max_out_yx % OUT_BLOCK_SPATIAL == 0
? out_yx_shuffle
: min(out_yx_shuffle, max_local_yx - 1);
out_x_shuffle[os] = out_yx_clamp % OUTPUT_SIZE_X;
out_y_shuffle[os] = out_yx_clamp / OUTPUT_SIZE_X;
}
#if FEATURE_LWS_SPLIT != 1
__local ACCUMULATOR_TYPE partial_acc[16 * OUT_BLOCK_WIDTH * (FEATURE_LWS_SPLIT - 1) * 2];
if (subgroup_id == 0) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
partial_acc[16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
const uint ifm_blocks = CEIL_DIV(INPUT0_FEATURE_NUM, FSV);
const uint ifm_blocks_per_sg = ifm_blocks / FEATURE_SLM_SPLIT;
const uint ifm_per_sg = ifm_blocks_per_sg * FSV;
uint feature_offset = 0;
uint feature_blocks = ifm_blocks_per_sg;
#if FEATURE_SLM_SPLIT != 1
feature_offset = get_sub_group_id() * ifm_per_sg;
if (ifm_blocks % FEATURE_SLM_SPLIT != 0) {
bool bigger_sg = get_sub_group_id() < ifm_blocks % FEATURE_SLM_SPLIT;
feature_blocks = bigger_sg ? ifm_blocks_per_sg + 1 : ifm_blocks_per_sg;
feature_offset += bigger_sg ? get_sub_group_id() * FSV : ifm_blocks % FEATURE_SLM_SPLIT * FSV;
}
#endif
uint filter_idx = GET_WEIGHTS_INDEX(out_f, feature_offset, 0, 0, 0);
uint input_idx[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
uint input_x = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
uint input_y = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
input_idx[os] = INPUT0_GET_INDEX(out_b, feature_offset, input_y, input_x);
}
ACCUMULATOR_TYPE dotProd[OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL] = { };
__attribute__((opencl_unroll_hint(1)))
for (uint k = 0; k < feature_blocks; ++k) {
uint4 weights_val[OUT_BLOCK_FEATURES] = { };
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
weights_val[ofb] = vload4(0, (__global uint*)(weights + filter_idx + ofb * WEIGHTS_FEATURE_BLOCK_PITCH));
}
} else if (subgroup_id == 1) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
partial_acc[i * 16 + subgroup_local_id] = dotProd[i];
dotProd[i] = dotProd[i + OUT_BLOCK_WIDTH];
uint4 input_val[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
input_val[os] = vload4(0, (__global uint *)(conv_input + input_idx[os]));
}
} else if (subgroup_id == 2) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
partial_acc[2 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
partial_acc[3 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
#if OUT_BLOCK_FEATURES > 1 && FEATURE_SLM_SPLIT != 1 && OUT_BLOCK_SPATIAL > 14
// For some cases compiler spills here due to loop order
// Use suboptimal order to avoid this at cost of instruction dispatch delays.
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ++ive) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
#else
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ++ive) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#endif
dotProd[ofb][os] = IMAD(dotProd[ofb][os],
AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD)),
AS_FILTER_TYPE_4(weights_val[ofb][ive]));
}
}
}
} else if (subgroup_id == 3) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
partial_acc[4 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
partial_acc[5 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
filter_idx += WEIGHTS_IS_PITCH;
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
input_idx[os] += INPUT0_FEATURE_PITCH * FSV;
}
}
#if FEATURE_SLM_SPLIT != 1
// Additional local memory reduction for feature split mode
# if FEATURE_SLM_SPLIT < OUT_BLOCK_FEATURES
# error convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl - OUT_BLOCK_FEATURES must be less or equal to FEATURE_SLM_SPLIT
# endif
const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL;
__local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
uint sgid_start_idx = get_sub_group_id();
sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
__local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL + sglid;
if (get_sub_group_id() < OUT_BLOCK_FEATURES) {
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
if (get_sub_group_id() == wg) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < wg; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_SPATIAL * SIMD +
os * SIMD;
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
}
}
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
dotProd[0][os] = dotProd[wg][os];
}
__attribute__((opencl_unroll_hint))
for (uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
((wg != 0) ? OUT_BLOCK_SPATIAL * OUT_BLOCK_FEATURES * SIMD : 0) +
ofb * OUT_BLOCK_SPATIAL * SIMD +
os * SIMD;
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
}
}
}
}
} else {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_SPATIAL * SIMD +
os * SIMD;
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (subgroup_id >= 2)
if (get_sub_group_id() >= OUT_BLOCK_FEATURES)
return;
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
dotProd[i] += partial_acc[(i + subgroup_id * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
dotProd[i] += partial_acc[(i + (subgroup_id + 2) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
dotProd[i] += partial_acc[(i + (subgroup_id + 4) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_SPATIAL * SIMD + sglid;
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
wg * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL +
os * SIMD;
dotProd[0][os] += partial_acc_ptr[partial_acc_ptr_idx];
}
}
#endif
#if FEATURE_LWS_SPLIT == 1
# define OUTPUT_FEATURES_PER_WI 2
# if BIAS_TERM
BIAS_TYPE bias[OUTPUT_FEATURES_PER_WI] = { biases[out_f], biases[out_f + 16] };
# endif
#if FEATURE_SLM_SPLIT == 1
# define FINAL_OUT_BLOCK_FEATURES (OUT_BLOCK_FEATURES)
#else
# define OUTPUT_FEATURES_PER_WI 1
# if BIAS_TERM
BIAS_TYPE bias[OUTPUT_FEATURES_PER_WI] = { biases[out_f + subgroup_id * 16] };
# endif
# define FINAL_OUT_BLOCK_FEATURES 1
out_f += get_sub_group_id() * SIMD;
out_fg += get_sub_group_id() * SIMD;
if (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OUT_BLOCK_FEATURES != 0 && out_fg >= OUTPUT_FEATURE_NUM)
return;
#endif
for (uint j = 0; j < OUTPUT_FEATURES_PER_WI; j++) {
uint out_f_offset = subgroup_id * 16 + j * 16;
#if OUTPUT_FEATURE_NUM % 32 != 0 && OUTPUT_FEATURE_NUM % 32 <= 16
if (out_fg + 32 > OUTPUT_FEATURE_NUM && out_f_offset >= OUTPUT_FEATURE_NUM % 32)
break;
#endif
const uint dst_index = GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, out_b, out_f + out_f_offset, out_y, out_x);
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_PRELOAD
#endif
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && i >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
break;
#endif
ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
#if BIAS_TERM
dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i] + bias[j];
#else
dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i];
#endif
OUTPUT_TYPE result;
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_CALC
#else
FUSED_OPS
#endif
result = FUSED_OPS_RESULT;
#else
result = TO_OUTPUT_TYPE(dequantized);
// Preload bias
BIAS_TYPE bias_val[FINAL_OUT_BLOCK_FEATURES];
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
bias_val[ofb] = biases[out_f + ofb * SIMD];
}
#endif
#if OUTPUT_FEATURE_NUM % 16 != 0
if (out_fg + out_f_offset + 16 > OUTPUT_FEATURE_NUM && subgroup_local_id >= OUTPUT_FEATURE_NUM % 16)
result = (OUTPUT_TYPE)0;
// Convert accumulator type to activation type
ACTIVATION_TYPE dequantized[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
dequantized[ofb][os] = TO_ACTIVATION_TYPE(dotProd[ofb][os]);
#if BIAS_TERM
dequantized[ofb][os] += TO_ACTIVATION_TYPE(bias_val[ofb]);
#endif
output[dst_index + i * 16] = result;
}
}
#undef OUTPUT_FEATURES_PER_WI
// Fused ops/activation
OUTPUT_TYPE result[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
FUSED_OPS_PRELOAD_SCALAR;
#endif
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
FUSED_OPS_CALC_SCALAR;
#else
FUSED_OPS_SCALAR;
#endif
result[ofb][os] = FUSED_OPS_RESULT_SCALAR;
#else
result[ofb][os] = TO_OUTPUT_TYPE(ACTIVATION(dequantized[ofb][os], ACTIVATION_PARAMS));
#endif
}
}
// Store output
// Check if can use block writes
bool only_x_block = OUTPUT_SIZE_X % OUT_BLOCK_SPATIAL == 0;
bool at_least_one_x_block = OUTPUT_SIZE_X >= OUT_BLOCK_SPATIAL;
bool full_x = out_yx_sg % OUTPUT_SIZE_X <= OUTPUT_SIZE_X - OUT_BLOCK_SPATIAL;
bool can_write_x = only_x_block || (at_least_one_x_block && full_x);
bool no_x_pad = OUTPUT_PAD_BEFORE_SIZE_X == 0 && OUTPUT_PAD_AFTER_SIZE_X == 0;
bool exact_spatial = max_out_yx % OUT_BLOCK_SPATIAL == 0;
bool full_spatial = out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL;
bool can_write_spatial = no_x_pad && (exact_spatial || full_spatial);
bool full_feature_block = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM);
bool can_use_full_block_write = full_feature_block && (can_write_x || can_write_spatial);
if (can_use_full_block_write) {
uint output_idx = OUTPUT_GET_INDEX(out_b,
out_fg,
intel_sub_group_shuffle(out_y_shuffle[0], 0),
intel_sub_group_shuffle(out_x_shuffle[0], 0));
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
|| (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
|| (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
if (good_of_block) {
uint os = 0;
#if OUTPUT_TYPE_SIZE == 1
for (; os + 8 <= OUT_BLOCK_SPATIAL; os += 8) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 8; ++i) {
result_val[i] = result[ofb][os + i];
}
DT_OUTPUT_BLOCK_WRITE8(output, output_idx, result_val);
output_idx += 8 * SIMD;
}
#endif
#if OUTPUT_TYPE_SIZE <= 2
for (; os + 4 <= OUT_BLOCK_SPATIAL; os += 4) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 4; ++i) {
result_val[i] = result[ofb][os + i];
}
DT_OUTPUT_BLOCK_WRITE4(output, output_idx, result_val);
output_idx += 4 * SIMD;
}
#endif
for (; os + 2 <= OUT_BLOCK_SPATIAL; os += 2) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 2; ++i) {
result_val[i] = result[ofb][os + i];
}
DT_OUTPUT_BLOCK_WRITE2(output, output_idx, result_val);
output_idx += 2 * SIMD;
}
if (OUT_BLOCK_SPATIAL % 2 == 1) {
OUTPUT_TYPE result_val = result[ofb][os];
DT_OUTPUT_BLOCK_WRITE(output, output_idx, result_val);
output_idx += 1 * SIMD;
}
}
output_idx += OUTPUT_FEATURE_PITCH * FSV - OUT_BLOCK_SPATIAL * SIMD;
}
} else {
uint output_idx_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
output_idx_shuffle[os] = OUTPUT_GET_INDEX(out_b, out_fg, out_y_shuffle[os], out_x_shuffle[os]);
}
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
|| (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
|| (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
if (good_of_block) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
bool good_os = (max_out_yx % OUT_BLOCK_SPATIAL == 0) || (out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL) || (os < max_out_yx % OUT_BLOCK_SPATIAL);
if (!good_os)
break;
uint output_idx = intel_sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
bool good_of = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_f + ofb * SIMD < OUTPUT_FEATURE_NUM);
if (!good_of)
result[ofb][os] = (OUTPUT_TYPE)0;
output[output_idx + sglid] = result[ofb][os];
}
}
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
output_idx_shuffle[os] += OUTPUT_FEATURE_PITCH * FSV;
}
}
}
#undef FINAL_OUT_BLOCK_FEATURES
}
#undef AS_INPUT0_TYPE_4
#undef AS_FILTER_TYPE_4
#undef AS_TYPE_N
#undef AS_TYPE_N_
#undef MAKE_VECTOR_TYPE
#undef TO_ACTIVATION_TYPE
#undef ACTIVATION_TYPE
#undef TO_ACCUMULATOR_TYPE
#undef ACCUMULATOR_TYPE
#undef CEIL_DIV
#undef ALIGN
#undef FSV
#undef SIMD

View File

@@ -281,32 +281,65 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x,
CAT(prefix, _OFFSET) \
)
#define GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(prefix, o, i, y, x) \
FUNC_CALL(get_os_is_yx_osv16_isv16_index)( \
o, i, y, x, \
CAT(prefix, _SIZE_X), \
CAT(prefix, _SIZE_Y), \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _OFM_NUM))
inline uint FUNC(get_os_is_yx_osv16_isv16_index)(uint o, uint i, uint y, uint x,
uint x_size, uint y_size, uint i_size, uint o_size)
inline uint FUNC(get_os_is_zyx_osv_isv_index)(uint o, uint i, uint z, uint y, uint x,
uint x_size, uint y_size, uint z_size, uint i_size, uint o_size, uint osv_size, uint isv_size)
{
const uint isv = i % 16;
const uint osv = o % 16;
const uint is = i / 16;
const uint os = o / 16;
const uint isv = i % isv_size;
const uint osv = o % osv_size;
const uint is = i / isv_size;
const uint os = o / osv_size;
const uint x_pitch = 16 * 16;
const uint x_pitch = osv_size * isv_size;
const uint y_pitch = x_pitch * x_size;
const uint is_pitch = y_pitch * y_size;
const uint os_pitch = is_pitch * ((i_size + 16 - 1) / 16);
const uint z_pitch = y_pitch * y_size;
const uint is_pitch = z_pitch * z_size;
const uint os_pitch = is_pitch * ((i_size + isv_size - 1) / isv_size);
const uint output_offset = isv + osv * 16 + x * x_pitch + y * y_pitch + is * is_pitch + os * os_pitch;
const uint output_offset =
isv +
osv * isv_size +
x * x_pitch +
y * y_pitch +
z * z_pitch +
is * is_pitch +
os * os_pitch;
return output_offset;
}
#define GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(prefix, o, i, y, x) \
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
o, i, 0, y, x, \
CAT(prefix, _SIZE_X), \
CAT(prefix, _SIZE_Y), \
1, \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _OFM_NUM), \
16, \
16)
#define GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(prefix, o, i, z, y, x) \
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
o, i, z, y, x, \
CAT(prefix, _SIZE_X), \
CAT(prefix, _SIZE_Y), \
CAT(prefix, _SIZE_Z), \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _OFM_NUM), \
32, \
16)
#define GET_FILTER_OS_IS_ZYX_OSV64_ISV16_INDEX(prefix, o, i, z, y, x) \
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
o, i, z, y, x, \
CAT(prefix, _SIZE_X), \
CAT(prefix, _SIZE_Y), \
CAT(prefix, _SIZE_Z), \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _OFM_NUM), \
64, \
16)
#define GET_FILTER_G_OS_IS_YX_ISV8_OSV16_ISV2_INDEX(prefix, g, o, i, y, x, sub_group_size) \
FUNC_CALL(get_os_is_zyx_isv8_osv16_isv2_index)( \
g, o, i, 0, y, x, \

View File

@@ -93,6 +93,10 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x
return GET_FILTER_GOIYX(INPUT0, g, o, i, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_YX_OSV16_ISV16
return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(INPUT0, o, i, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV32_ISV16
return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(INPUT0, o, i, z, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV64_ISV16
return GET_FILTER_OS_IS_ZYX_OSV64_ISV16_INDEX(INPUT0, o, i, z, y, x);
#elif defined INPUT0_LAYOUT_GS_OI_YXS_GSV16_YXSV4
return GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(INPUT0, g, o, i, y, x);
#elif defined INPUT0_LAYOUT_GS_OI_YXS_GSV32_YXSV4
@@ -220,6 +224,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
return GET_FILTER_G_OS_IS_YX_ISV16_OSV16_INDEX(OUTPUT, g, o, i, y, x, SUB_GROUP_SIZE);
#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSV16_ISV16
return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(OUTPUT, o, i, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV32_ISV16
return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(OUTPUT, o, i, z, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV64_ISV16
return GET_FILTER_OS_IS_ZYX_OSV64_ISV16_INDEX(OUTPUT, o, i, z, y, x);
#elif defined OUTPUT_LAYOUT_GS_OI_YXS_GSV16_YXSV4
return GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(OUTPUT, g, o, i, y, x);
#elif defined OUTPUT_LAYOUT_GS_OI_YXS_GSV32_YXSV4

View File

@@ -307,6 +307,8 @@ std::string toString(WeightsLayout layout) {
case WeightsLayout::yxio: return "YXIO";
case WeightsLayout::os_is_yx_isv16_osv16: return "OS_IS_YX_ISV16_OSV16";
case WeightsLayout::os_is_yx_osv16_isv16: return "OS_IS_YX_OSV16_ISV16";
case WeightsLayout::os_is_zyx_osv32_isv16: return "OS_IS_ZYX_OSV32_ISV16";
case WeightsLayout::os_is_zyx_osv64_isv16: return "OS_IS_ZYX_OSV64_ISV16";
case WeightsLayout::os_iyx_osv16: return "OS_IYX_OSV16";
case WeightsLayout::os_iyx_osv32: return "OS_IYX_OSV32";
case WeightsLayout::os_iyx_osv32__ai32: return "OS_IYX_OSV32__AI32";