[IE CLDNN] Add 3d spatials support to conv & pool imad kernels and unblock any in/out feature sizes (#1693)

This commit is contained in:
Jedrzej Hajduczenia
2020-08-20 13:54:54 +02:00
committed by GitHub
parent 94bfff2011
commit 1880958f8b
28 changed files with 1565 additions and 973 deletions

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -170,6 +170,7 @@ struct format {
os_is_y_x8_osv8_isv4, ///< format for weights for 1x1 MMAD convolutions
os_is_y_x8_osv8_isv4_swizzled_by_4, ///< format for weights for 1x1 MMAD convolutions
os_is_yx_osv16_isv4, ///< format for weights for IMAD convolutions
os_is_zyx_osv16_isv16, ///< format for weights for IMAD convolutions
os_is_yx_osv32_isv4_swizzled_by_2, ///< format for weights for IMAD convolutions
os_is_yx_osv32_isv4, ///< format for weights for IMAD convolutions
os_is_yx_osv32_isv32p, ///< format for weights for binary convolutions
@@ -188,6 +189,7 @@ struct format {
gs_oiyx_gsv32, ///< format used for weights for 2D convolution
g_is_os_zyx_osv16_isv16, ///< format used for grouped weights for blocked 3D deconvolution
g_os_is_yx_osv16_isv4,
g_os_is_zyx_osv16_isv16,
g_is_os_yx_osv16_isv16,
g_os_is_zyx_isv8_osv16_isv2,
g_os_is_yx_isv8_osv16_isv2,
@@ -237,7 +239,7 @@ struct format {
{ b_fs_yx_32fp, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
{ b_fs_zyx_fsv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{1, 16}}}},
{ bs_fs_zyx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{0, 16 }, {1, 16}}}},
{ bs_fs_yx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfyx", "bfxy?", {{0, 16 }, {1, 16}}}},
{ bs_fs_yx_bsv16_fsv16, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{0, 16 }, {1, 16}}}},
{ nv12, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
{ image_2d_rgba, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
@@ -265,7 +267,8 @@ struct format {
{ is_o32_yx_isv32_swizzled_by_4, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
{ os_is_y_x8_osv8_isv4, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
{ os_is_y_x8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
{ os_is_yx_osv16_isv4, { 1, 1, 2, 0, 0, "bfxy", "bfxy?", {{0, 16}, {1, 4}}}},
{ os_is_yx_osv16_isv4, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{0, 16}, {1, 4}}}},
{ os_is_zyx_osv16_isv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{0, 16}, {1, 16}}}},
{ os_is_yx_osv32_isv4_swizzled_by_2, { 1, 1, 2, 0, 0, "bfxy", "bfxy?", {{0, 32}, {1, 4}}}},
{ os_is_yx_osv32_isv4, { 1, 1, 2, 0, 0, "bfxy", "bfxy?", {{0, 32}, {1, 4}}}},
{ os_is_yx_osv32_isv32p, { 1, 1, 1, 0, 0, "bfxy", "bfxy?", {}}},
@@ -291,7 +294,8 @@ struct format {
{ g_os_is_zyx_isv8_osv16_isv2, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{1, 8}, {0, 16}, {1, 2}}}},
{ g_os_is_yx_isv8_osv16_isv2, { 1, 1, 2, 0, 1, "gbfyx", "bfxy????g", {{1, 8}, {0, 16}, {1, 2}}}},
{ g_os_is_zyx_isv16_osv16, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{0, 16}, {1, 16}}}},
{ g_os_is_yx_osv16_isv4, { 1, 1, 2, 0, 1, "gbfxy", "bfxy????g", {{0, 16}, {1, 4}}}},
{ g_os_is_yx_osv16_isv4, { 1, 1, 2, 0, 1, "gbfyx", "bfxy????g", {{0, 16}, {1, 4}}}},
{ g_os_is_zyx_osv16_isv16, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{0, 16}, {1, 16}}}},
{ g_os_zyx_is_osv16_isv4, { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g", {{0, 16}, {1, 4}}}},
{ g_os_zyx_is_osv16_isv16, { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g", {{0, 16}, {1, 16}}}},
{ g_os_zyx_is_osv16_isv32, { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g", {{0, 16}, {1, 32}}}},

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -73,6 +73,8 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
{ WeightsLayout::os_i_osv16__ai8, { -1, -1, -1, 0, 1, -1, -1, -1 } },
{ WeightsLayout::os_i_osv16, { -1, -1, -1, 0, 1, -1, -1, -1 } },
{ WeightsLayout::os_is_yx_osv16_isv16, { 0, 1, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::os_is_zyx_osv16_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::g_os_is_zyx_osv16_isv16, { 0, 1, 2, 3, 4, -1, -1, 5 } },
{ WeightsLayout::os_is_zyx_osv32_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::os_is_zyx_osv64_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::i_yxs_os_yxsv2_osv16, { 1, 2, -1, 3, 0, -1, -1, -1 } },
@@ -425,7 +427,7 @@ DataTensor DataTensor::FlattenEverything() const {
NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l) {
std::vector<size_t> newDims = d;
// TOOD: it's not the right pitches. it's here in order to calculate physical size
// TODO: It's not the right pitches. it's here in order to calculate physical size
switch (l) {
case os_iyx_osv16:
case os_iyx_osv16_rotate_180:
@@ -635,6 +637,16 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
newDims[2] = RoundUp(newDims[2], 16);
newDims[3] = RoundUp(newDims[3], 16);
break;
case os_is_zyx_osv16_isv16:
assert(newDims.size() == 5);
newDims[3] = RoundUp(newDims[3], 16);
newDims[4] = RoundUp(newDims[4], 16);
break;
case g_os_is_zyx_osv16_isv16:
assert(newDims.size() == 6);
newDims[3] = RoundUp(newDims[3], 16);
newDims[4] = RoundUp(newDims[4], 16);
break;
case os_is_zyx_osv32_isv16:
newDims[3] = RoundUp(newDims[3], 16);
newDims[4] = RoundUp(newDims[4], 32);

View File

@@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -92,6 +92,7 @@ enum WeightsLayout {
os_i_osv16__ai8,
os_i_osv16,
os_is_yx_osv16_isv16, // weights for int8 blocked conv
os_is_zyx_osv16_isv16,
os_is_zyx_osv32_isv16,
os_is_zyx_osv64_isv16,
i_yxs_os_yxsv2_osv16,
@@ -142,6 +143,7 @@ enum WeightsLayout {
g_os_is_zyx_isv8_osv16_isv2,
g_os_is_yx_isv8_osv16_isv2,
g_os_is_zyx_isv16_osv16,
g_os_is_zyx_osv16_isv16,
giy_xs_os_xsv2_osv16__ao32,
giy_xs_os_xsv2_osv8__ao32,
g_os_is_yx_isv16_osv16,

View File

@@ -24,11 +24,13 @@ ParamsKey ConcatenationKernel_simple_Ref::GetSupportedKey() const {
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableInputLayout(DataLayout::bfyx);

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
#include "convolution_kernel_b_fs_zyx_fsv16_imad.h"
#include "kernel_selector_utils.h"
#include "common_tools.h"
#include <vector>
@@ -63,8 +63,8 @@ static size_t getOutBlock_X(const size_t output_size_x, const size_t stride_x, c
namespace kernel_selector {
Convolution_kernel_b_fs_yx_fsv16_imad::BlockParams
Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
Convolution_kernel_b_fs_zyx_fsv16_imad::BlockParams
Convolution_kernel_b_fs_zyx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
constexpr float max_reg_pressure = 0.75f;
// TODO Investigate whether below algorithm for selecting optimal block params could be reduced to:
@@ -85,9 +85,9 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
size_t block_features = simd;
{
size_t tmp_block_features = simd * 2;
auto block2_params = BlockParams{ block_width, 1, tmp_block_features, in_block_width, 1, 1 };
auto block2_params = BlockParams{ block_width, 1, 1, tmp_block_features, in_block_width, 1, 1, 1 };
bool c_mul_f = params.output.Feature().v % tmp_block_features == 0;
bool c_mul_f = params.weights.OFM().v % tmp_block_features == 0;
bool c_reg_pressure = EstimateRegPressure(params, block2_params) <= max_reg_pressure;
if (c_mul_f && c_reg_pressure) {
@@ -97,7 +97,9 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
// If not enough occupancy try to perform feature split or/and block reduction
size_t feature_slm_split = 1;
auto no_split_params = BlockParams{ block_width, 1, block_features, in_block_width, 1, 1 };
auto no_split_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 1 };
if (EstimateOccupancy(params, no_split_params) < 1.f) {
// Temporary variables for possible reductions in block sizes
bool update_block_params = false;
@@ -107,7 +109,8 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
// Feature split requires extra registers, so check if it can be done with current block sizes
bool can_split =
EstimateRegPressure(params, BlockParams{ block_width, 1, block_features, in_block_width, 1, 2 }) <= max_reg_pressure;
EstimateRegPressure(params, BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 2 }) <= max_reg_pressure;
// Has the occupancy reached sufficient level
bool enough_occupancy = false;
// Reductions to reduce register pressure
@@ -116,7 +119,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
// At most twice reduction in output block width is acceptable
for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
auto dummy_split_params = BlockParams{ w, 1, block_features, tmp_in_width, 1, 2 };
auto dummy_split_params = BlockParams{ w, 1, 1, block_features, tmp_in_width, 1, 1, 2 };
bool c_reg_pressure = EstimateRegPressure(params, dummy_split_params) <= max_reg_pressure;
bool c_mul_x = params.output.X().v % w == 0;
@@ -139,7 +142,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
}
// Check if previous reductions haven't improved occupancy enough
{
auto reduced_params = BlockParams{ split_block_width, 1, split_block_features, split_in_block_width, 1, 1 };
auto reduced_params = BlockParams{ split_block_width, 1, 1, split_block_features, split_in_block_width, 1, 1, 1 };
enough_occupancy = EstimateOccupancy(params, reduced_params) >= 1.f;
update_block_params = enough_occupancy;
}
@@ -147,7 +150,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
if (can_split && !enough_occupancy) {
// TODO Try other split sizes
for (size_t split = 4; split < 5; ++split) {
auto tmp_params = BlockParams{ block_width, 1, block_features, in_block_width, 1, split };
auto tmp_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, split };
bool c_ifm_mul = CeilDiv(params.weights.IFM().v, fsv) % split == 0;
bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
@@ -172,7 +175,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
// At most twice reduction in output block width is acceptable
for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
auto tmp_params = BlockParams{ w, 1, split_block_features, tmp_in_width, 1, feature_slm_split };
auto tmp_params = BlockParams{ w, 1, 1, split_block_features, tmp_in_width, 1, 1, feature_slm_split };
bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
bool c_mul_x = params.output.X().v % w == 0;
@@ -194,44 +197,60 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
}
}
// Select biggest block height that fits into registers
// Select biggest block height and depth that fits into registers
size_t block_height = 1;
size_t block_depth = 1;
size_t in_block_height = 1;
for (size_t h = 2; h < 16; ++h) {
if (params.output.Y().v % h != 0)
size_t in_block_depth = 1;
bool break_external_loop = false;
for (size_t d = 1; d < 16; ++d) {
if (params.output.Z().v % d != 0)
continue;
for (size_t h = 2; h < 16; ++h) {
if (params.output.Y().v % h != 0)
continue;
size_t tmp_in_block_depth = (d - 1) * params.stride.z + (params.filterSize.z - 1) * params.dilation.z + 1;
size_t tmp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
auto tmp_params = BlockParams{ block_width, h, d, block_features, in_block_width, tmp_in_block_height, tmp_in_block_depth, feature_slm_split };
size_t tmp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
auto tmp_params = BlockParams{ block_width, h, block_features, in_block_width, tmp_in_block_height, feature_slm_split };
bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
if (c_reg_pressure && c_occupancy && c_slm) {
block_height = h;
block_depth = d;
in_block_height = tmp_in_block_height;
in_block_depth = tmp_in_block_depth;
} else {
break_external_loop = true;
break;
}
}
if (c_reg_pressure && c_occupancy && c_slm) {
block_height = h;
in_block_height = tmp_in_block_height;
} else {
if (break_external_loop) {
break;
}
}
return BlockParams{ block_width, block_height, block_features, in_block_width, in_block_height, feature_slm_split };
return BlockParams{ block_width, block_height, block_depth, block_features, in_block_width, in_block_height, in_block_depth, feature_slm_split };
}
float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateRegPressure(const convolution_params& params, const BlockParams& block) const {
float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateRegPressure(const convolution_params& params, const BlockParams& block) const {
size_t bytes_used = 0;
// accumulator
size_t accumulator_elements = block.output_block_width * block.output_block_height * block.output_block_features;
size_t accumulator_elements = block.output_block_width * block.output_block_height * block.output_block_depth * block.output_block_features;
bytes_used += accumulator_elements * BytesPerElement(GetAccumulatorType(params));
// input block
size_t input_block_elements = block.input_block_height * Align(block.input_block_width, simd) * fsv;
size_t input_block_elements = block.input_block_depth * block.input_block_height * Align(block.input_block_width, simd) * fsv;
bytes_used += input_block_elements * BytesPerElement(params.inputs[0].GetDType());
// weights block
size_t weights_block_elements = block.output_block_features * fsv;
bytes_used += weights_block_elements * BytesPerElement(params.weights.GetDType());
// Experimentally selected number of registers needed for extra variables (eg. out_x, out_y, filter_idx, etc.)
// Experimentally selected number of registers needed for extra variables (eg. out_x, out_y, out_z, filter_idx, etc.)
constexpr size_t experimental_extra_regs = 8 * 32;
bytes_used += experimental_extra_regs;
@@ -248,13 +267,14 @@ float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateRegPressure(const convoluti
return static_cast<float>(bytes_used) / static_cast<float>(max_reg_bytes);
}
float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateOccupancy(const convolution_params& params, const BlockParams& block) const {
float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateOccupancy(const convolution_params& params, const BlockParams& block) const {
size_t blocks_w = CeilDiv(params.output.X().v, block.output_block_width);
size_t blocks_h = CeilDiv(params.output.Y().v, block.output_block_height);
size_t blocks_f = CeilDiv(params.output.Feature().v, block.output_block_features) * block.feature_slm_split;
size_t blocks_d = CeilDiv(params.output.Z().v, block.output_block_depth);
size_t blocks_f = CeilDiv(params.weights.OFM().v, block.output_block_features) * params.groups * block.feature_slm_split;
size_t block_b = params.output.Batch().v;
auto threads = blocks_w * blocks_h * blocks_f * block_b;
auto threads = blocks_w * blocks_h * blocks_d * blocks_f * block_b;
constexpr size_t max_threads_per_cu = 7;
size_t compute_units = params.engineInfo.computeUnitsCount;
size_t max_threads = compute_units * max_threads_per_cu;
@@ -262,17 +282,18 @@ float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateOccupancy(const convolution
return static_cast<float>(threads) / static_cast<float>(max_threads);
}
float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_features * (block.feature_slm_split - 1);
float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_depth *
block.output_block_features * (block.feature_slm_split - 1);
size_t slm_bytes = slm_elements * BytesPerElement(GetAccumulatorType(params));
// TODO Actual maximum slm should also depend on number of work-groups, but this is device specific
// TODO: Actual maximum slm should also depend on number of work-groups, but this is device specific
size_t max_slm_bytes = params.engineInfo.maxLocalMemSize;
return static_cast<float>(slm_bytes) / static_cast<float>(max_slm_bytes);
}
ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
@@ -284,6 +305,9 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
@@ -294,31 +318,36 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableGroupedConvolution();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.EnableDilation();
k.DisableTuning();
return k;
}
KernelsData Convolution_kernel_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params,
KernelsData Convolution_kernel_b_fs_zyx_fsv16_imad::GetKernelsData(const Params& params,
const optional_params& options) const {
return GetCommonKernelsData(params, options);
}
JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convolution_params& params,
JitConstants Convolution_kernel_b_fs_zyx_fsv16_imad::GetJitConstants(const convolution_params& params,
const DispatchData& kd) const {
auto mem_consts = Parent::GetJitConstants(params, kd);
auto block_params = GetBlockParams(params);
bool unroll_filter_y = block_params.output_block_height != 1;
bool unroll_filter_z = block_params.output_block_depth != 1;
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block_params.output_block_width));
mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", block_params.input_block_width));
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block_params.output_block_height));
mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_HEIGHT", block_params.input_block_height));
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_DEPTH", block_params.output_block_depth));
mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_DEPTH", block_params.input_block_depth));
mem_consts.AddConstant(MakeJitConstant("FILTER_SIZE_Y_UNROLL", unroll_filter_y ? params.filterSize.y : 1));
mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", block_params.output_block_features / simd));
mem_consts.AddConstant(MakeJitConstant("FILTER_SIZE_Z_UNROLL", unroll_filter_z ? params.filterSize.z : 1));
mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", static_cast<int>(std::ceil(block_params.output_block_features / simd))));
mem_consts.AddConstant(MakeJitConstant("OFM_SIZE_PER_SIMD", block_params.output_block_features));
mem_consts.AddConstant(MakeJitConstant("FEATURE_SLM_SPLIT", block_params.feature_slm_split));
mem_consts.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
@@ -327,7 +356,20 @@ JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convol
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
std::vector<std::string> idx_order = { "out_b", "(out_f + ofb * 16)", "(out_y + oh)", "(out_x + ow)" };
if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
idx_order = { "out_b", "(out_f + ofb * 16)", "(out_z + od)", "(out_y + oh)", "(out_x + ow)" };
}
std::vector<Tensor::DataChannelName> loop_axes = { Tensor::DataChannelName::X };
if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
if (block_params.output_block_depth != 1) {
loop_axes.push_back(Tensor::DataChannelName::Z);
} else {
idx_order[idx_order.size() - 3] = "out_z";
}
}
if (block_params.output_block_height != 1) {
loop_axes.push_back(Tensor::DataChannelName::Y);
} else {
@@ -349,15 +391,16 @@ JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convol
return mem_consts;
} // GetJitConstants
ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad::SetDefault(const convolution_params& params,
ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_zyx_fsv16_imad::SetDefault(const convolution_params& params,
int) const {
DispatchData kd;
const auto& output = params.output;
const auto& weights = params.weights;
auto block_params = GetBlockParams(params);
kd.gws0 = CeilDiv(output.X().v, block_params.output_block_width);
kd.gws1 = CeilDiv(output.Y().v, block_params.output_block_height);
kd.gws2 = output.Batch().v * CeilDiv(output.Feature().v, block_params.output_block_features) * simd * block_params.feature_slm_split;
kd.gws1 = CeilDiv(output.Y().v, block_params.output_block_height) * CeilDiv(output.Z().v, block_params.output_block_depth);
kd.gws2 = output.Batch().v * CeilDiv(weights.OFM().v, block_params.output_block_features) * params.groups * simd * block_params.feature_slm_split;
kd.lws0 = 1;
kd.lws1 = 1;
@@ -367,16 +410,13 @@ ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad::SetDe
kd.gemmStyle = {0, 0, 0, 0, 0, 0};
kd.efficiency = FORCE_PRIORITY_2;
// TODO Optimize 1x1, because this kernel is better in most cases
//if (params.filterSize.x == 1 && params.filterSize.y == 1)
// kd.efficiency = FORCE_PRIORITY_1;
if (static_cast<float>(params.weights.IFM().v) / static_cast<float>(Align(params.weights.IFM().v, fsv)) < 0.5f)
kd.efficiency = FORCE_PRIORITY_4;
return kd;
} // SetDefault
bool Convolution_kernel_b_fs_yx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
bool Convolution_kernel_b_fs_zyx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
if (!Parent::Validate(params, options)) {
return false;
}
@@ -384,7 +424,7 @@ bool Convolution_kernel_b_fs_yx_fsv16_imad::Validate(const Params& params, const
KernelData kd = KernelData::Default<convolution_params>(params);
convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
if (newParams.groups != 1 || newParams.split != 1)
if (newParams.split != 1)
return false;
return true;

View File

@@ -21,11 +21,11 @@
namespace kernel_selector {
class Convolution_kernel_b_fs_yx_fsv16_imad : public ConvolutionKernelBase {
class Convolution_kernel_b_fs_zyx_fsv16_imad : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
Convolution_kernel_b_fs_yx_fsv16_imad() : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad") {}
virtual ~Convolution_kernel_b_fs_yx_fsv16_imad() {}
Convolution_kernel_b_fs_zyx_fsv16_imad() : ConvolutionKernelBase("convolution_gpu_b_fs_zyx_fsv16_imad") {}
virtual ~Convolution_kernel_b_fs_zyx_fsv16_imad() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
@@ -35,8 +35,8 @@ protected:
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool NeedPaddedInput() const override { return true; }
WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
return WeightsLayout::os_is_yx_osv16_isv16;
WeightsLayout GetPreferredWeightsLayout(const convolution_params& p) const override {
return p.groups > 1 ? WeightsLayout::g_os_is_zyx_osv16_isv16 : WeightsLayout::os_is_zyx_osv16_isv16;
}
std::vector<FusedOpType> GetSupportedFusedOps() const override {
@@ -49,10 +49,13 @@ protected:
struct BlockParams {
size_t output_block_width;
size_t output_block_height;
size_t output_block_depth;
size_t output_block_features;
size_t input_block_width;
size_t input_block_height;
size_t input_block_depth;
size_t feature_slm_split;
};

View File

@@ -60,7 +60,7 @@
#include "convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.h"
#include "convolution_kernel_bfyx_to_bs_fs_yx_bsv16_fsv16.h"
#include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
#include "convolution_kernel_b_fs_zyx_fsv16_imad.h"
#include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1.h"
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3.h"
@@ -71,9 +71,9 @@ convolution_kernel_selector::convolution_kernel_selector() {
Attach<ConvolutionKernel_Ref>();
Attach<DeformableConvolutionKernel_bfyx_Ref>();
// b_fs_yx_fsv16 int8
// b_fs_yx_fsv16 and b_fs_zyx_fsv16 int8
Attach<Convolution_kernel_b_fs_yx_fsv16_imad_1x1>();
Attach<Convolution_kernel_b_fs_yx_fsv16_imad>();
Attach<Convolution_kernel_b_fs_zyx_fsv16_imad>();
// b_fs_yx_fsv16 and b_fs_zyx_fsv16
Attach<ConvolutionKernel_b_fs_yx_fsv16_depthwise>();

View File

@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
#include "pooling_kernel_gpu_b_fs_zyx_fsv16_imad.h"
#include "kernel_selector_utils.h"
#define FEATURE_SLICE_SIZE 16
namespace kernel_selector {
ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
ParamsKey PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
@@ -27,6 +27,8 @@ ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
k.EnableOutputDataType(Datatype::F32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
@@ -41,17 +43,18 @@ ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
return k;
}
PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_yx_fsv16_imad::SetDefault(const pooling_params& params) const {
PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_zyx_fsv16_imad::SetDefault(const pooling_params& params) const {
DispatchData runInfo = PoolingKernelBase::SetDefault(params);
const auto& out = params.output;
auto x = out.X().v;
auto y = out.Y().v;
auto z = out.Z().v;
auto f = out.Feature().v;
auto b = out.Batch().v;
runInfo.gws0 = x;
runInfo.gws1 = y;
runInfo.gws1 = y * z;
// input is in b_fs_yx_fsv16 format; each work-item processes 16 features
runInfo.gws2 = CeilDiv(f, FEATURE_SLICE_SIZE) * b;
@@ -64,19 +67,24 @@ PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_yx_fsv16_imad::SetDefault(
return runInfo;
}
JitConstants PoolingKernelGPU_b_fs_yx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData kd) const {
JitConstants PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData kd) const {
auto jit = PoolingKernelBase::GetJitConstants(params, kd);
const size_t in_x_pitch = FEATURE_SLICE_SIZE;
const size_t in_y_pitch = FEATURE_SLICE_SIZE * params.inputs[0].X().LogicalDimPadded();
const size_t in_z_pitch = FEATURE_SLICE_SIZE * params.inputs[0].Y().LogicalDimPadded() * params.inputs[0].X().LogicalDimPadded();
jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
jit.AddConstant(MakeJitConstant("IN_Z_PITCH", in_z_pitch));
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
if (!params.fused_ops.empty()) {
auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
FusedOpsConfiguration conf = {"", {"b", "(f+i)", "y", "x"}, "pool_result[i]", input_dt, 1};
if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
conf = {"", {"b", "(f+i)", "z", "y", "x"}, "pool_result[i]", input_dt, 1 };
}
conf.SetLoopAxes({ Tensor::DataChannelName::FEATURE }, true);
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
}
@@ -84,19 +92,11 @@ JitConstants PoolingKernelGPU_b_fs_yx_fsv16_imad::GetJitConstants(const pooling_
return jit;
}
KernelsData PoolingKernelGPU_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& options) const {
KernelsData PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& options) const {
return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
}
bool PoolingKernelGPU_b_fs_yx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
if (!PoolingKernelBase::Validate(params, options)) {
return false;
}
auto p = dynamic_cast<const pooling_params&>(params);
if (p.inputs[0].Feature().v % FEATURE_SLICE_SIZE != 0)
return false;
return true;
bool PoolingKernelGPU_b_fs_zyx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
return PoolingKernelBase::Validate(params, options);
}
} // namespace kernel_selector

View File

@@ -19,10 +19,10 @@
#include <vector>
namespace kernel_selector {
class PoolingKernelGPU_b_fs_yx_fsv16_imad: public PoolingKernelBase{
class PoolingKernelGPU_b_fs_zyx_fsv16_imad: public PoolingKernelBase{
public:
PoolingKernelGPU_b_fs_yx_fsv16_imad() : PoolingKernelBase("pooling_gpu_b_fs_yx_fsv16_imad") {}
virtual ~PoolingKernelGPU_b_fs_yx_fsv16_imad() {}
PoolingKernelGPU_b_fs_zyx_fsv16_imad() : PoolingKernelBase("pooling_gpu_b_fs_zyx_fsv16_imad") {}
virtual ~PoolingKernelGPU_b_fs_zyx_fsv16_imad() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;

View File

@@ -24,7 +24,7 @@
#include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
#include "pooling_kernel_gpu_b_fs_yx_fsv16.h"
#include "pooling_kernel_gpu_bsv16_fsv16.h"
#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
#include "pooling_kernel_gpu_b_fs_zyx_fsv16_imad.h"
#include "pooling_kernel_gpu_bs_fs_yx_bsv16_fsv16.h"
namespace kernel_selector {
@@ -40,7 +40,7 @@ pooling_kernel_selector::pooling_kernel_selector() {
Attach<PoolingKerneGPU_fs_b_yx_fsv32>();
Attach<PoolingKernel_b_fs_yx_fsv16>();
Attach<PoolingKernel_bsv16_fsv16>();
Attach<PoolingKernelGPU_b_fs_yx_fsv16_imad>();
Attach<PoolingKernelGPU_b_fs_zyx_fsv16_imad>();
Attach<Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16>();
}

View File

@@ -1,390 +0,0 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#include "include/mmad.cl"
#include "include/data_types.cl"
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define SIMD 16
#define FSV 16
// int8 conv_input and weights data is packed to int32 "batches",
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
// Int8 IMAD convolution for the b_fs_yx_fsv16 data layout.
// Each sub-group (SIMD = 16 lanes) accumulates an OUT_BLOCK_HEIGHT x
// OUT_BLOCK_WIDTH spatial block for OFM_BLOCKS_PER_SIMD blocks of SIMD output
// features. When FEATURE_SLM_SPLIT > 1 the IFM reduction is split across the
// sub-groups of the work-group and partial sums are merged through local
// memory (see the FEATURE_SLM_SPLIT != 1 section below).
__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad)(
    const __global INPUT0_TYPE *conv_input,
    __global OUTPUT_TYPE *output,
    const __global FILTER_TYPE *weights,
#if BIAS_TERM
    const __global BIAS_TYPE *biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx) {
    // Per-lane column offset used for the tail load of an input row: lanes
    // past the IN_BLOCK_WIDTH remainder are clamped to column 0 so the tail
    // vload does not read out of bounds.
#define LUT_VALUE_CLAMP(x) (( (IN_BLOCK_WIDTH % SIMD == 0) || ((x) < IN_BLOCK_WIDTH % SIMD) ) ? (x) : 0)
    const int tmp = LUT_VALUE_CLAMP(get_sub_group_local_id());
#undef LUT_VALUE_CLAMP

    // Output coordinates handled by this work-item / sub-group.
    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
    const uint out_y = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT;
    // gid2 packs (batch, output-feature slice); unpack via the OFM-aligned extent.
    const uint out_b = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) / ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
    uint out_fg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
    uint out_f = out_fg + get_sub_group_local_id();

    // Top-left input coordinate of this output block (may be negative inside padding).
    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    // First IFM slice owned by this sub-group when the reduction is split.
#if FEATURE_SLM_SPLIT == 1
    const uint k_start = 0;
#else
    const uint k_start = get_sub_group_id() * FSV;
#endif

    uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, k_start, 0, 0);
    // Distance between consecutive osv16 output-feature blocks in the weights.
    const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, 16) * FILTER_SIZE_X * FILTER_SIZE_Y * 16);

    uint input_start_idx = INPUT0_GET_INDEX(out_b, k_start, input_y, input_x);

    // Accumulators: [ofm block][y][x], zero-initialized.
    ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };

    // Input cache: each uint4 carries 16 int8 values (one fsv16 slice) per lane.
    uint4 input_val[IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];

    // Main reduction over IFM slices and filter rows.
    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16) / FEATURE_SLM_SPLIT; k++) {
        __attribute__((opencl_unroll_hint(1)))
        for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
            // Load input block IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
            __attribute__((opencl_unroll_hint))
            for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
                __attribute__((opencl_unroll_hint))
                for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
                    uint input_idx = input_start_idx + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
                    if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
                        input_val[iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * 16));
                    } else {
                        // Tail of the row: use the clamped lane offset computed above.
                        input_val[iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * 16));
                    }
                }
            }

            __attribute__((opencl_unroll_hint))
            for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
                __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
                for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
                    // 16 int8 weights per lane for each output-feature block.
                    uint4 weights_val[OFM_BLOCKS_PER_SIMD];
                    __attribute__((opencl_unroll_hint))
                    for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
                        weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
                    }
                    // ive walks the 4 uints (= 4 groups of 4 int8) of each vector.
                    __attribute__((opencl_unroll_hint))
                    for (uint ive = 0; ive < 4; ive++) {
                        __attribute__((opencl_unroll_hint))
                        for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
                            __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
                            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                                __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
                                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
                                    const uint ow_offset = ow + OUT_BLOCK_WIDTH; // NOTE(review): appears unused — candidate for removal
                                    const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
                                    const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
                                    // Fetch the cached column from the lane that owns it.
                                    const uint shuffle_wi = x_block_idx % SIMD;
                                    const uint shuffle_idx = x_block_idx / SIMD;
                                    dotProd[ofb][oh][ow] = TO_ACCUMULATOR_TYPE(
                                        IMAD(dotProd[ofb][oh][ow],
                                             AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[y_block_idx][shuffle_idx][ive], shuffle_wi)),
                                             AS_FILTER_TYPE_4(weights_val[ofb][ive])));
                                }
                            }
                        }
                    }
                    filter_idx += FSV * FSV;
                }
            }
            input_start_idx += DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
        }
        // Next IFM slice owned by this sub-group; rewind the y offset walked above.
        input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL) * DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
        filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * (FEATURE_SLM_SPLIT - 1);
    }

#if FEATURE_SLM_SPLIT != 1
    // Additional local memory reduction for feature split mode
# if FEATURE_SLM_SPLIT < OFM_BLOCKS_PER_SIMD
# error convolution_gpu_b_fs_yx_fsv16_imad.cl - OFM_BLOCKS_PER_SIMD must be less or equal to FEATURE_SLM_SPLIT
# endif
    // Scratch for the partial sums of all sub-groups except the one that keeps
    // the corresponding block in registers.
    const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH;
    __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];

    uint sgid_start_idx = get_sub_group_id();
    sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
    __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH
                                                + get_sub_group_local_id();

    if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
        __attribute__((opencl_unroll_hint))
        for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
            if (get_sub_group_id() == wg) {
                // Sub-group wg keeps block wg in registers; all other blocks are
                // spilled to local memory for the owning sub-groups to pick up.
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < wg; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                        __attribute__((opencl_unroll_hint))
                        for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                            const uint partial_acc_ptr_idx =
                                ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                                oh * OUT_BLOCK_WIDTH * SIMD +
                                ow * SIMD;
                            partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
                        }
                    }
                }
                __attribute__((opencl_unroll_hint))
                for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                    __attribute__((opencl_unroll_hint))
                    for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                        dotProd[0][oh][ow] = dotProd[wg][oh][ow];
                    }
                }
                __attribute__((opencl_unroll_hint))
                for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                        __attribute__((opencl_unroll_hint))
                        for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                            const uint partial_acc_ptr_idx =
                                ((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OFM_SIZE_PER_SIMD : 0) +
                                ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                                oh * OUT_BLOCK_WIDTH * SIMD +
                                ow * SIMD;
                            partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
                        }
                    }
                }
            }
        }
    } else {
        // Sub-groups that only computed partials spill everything.
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
            __attribute__((opencl_unroll_hint))
            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                __attribute__((opencl_unroll_hint))
                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                    const uint partial_acc_ptr_idx =
                        ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                        oh * OUT_BLOCK_WIDTH * SIMD +
                        ow * SIMD;
                    partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
                }
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Only the first OFM_BLOCKS_PER_SIMD sub-groups accumulate and store results.
    if (get_sub_group_id() >= OFM_BLOCKS_PER_SIMD)
        return;

    partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * SIMD + get_sub_group_local_id();
    __attribute__((opencl_unroll_hint))
    for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
        __attribute__((opencl_unroll_hint))
        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
            __attribute__((opencl_unroll_hint))
            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                const uint partial_acc_ptr_idx =
                    wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
                    oh * OUT_BLOCK_WIDTH * SIMD +
                    ow * SIMD;
                dotProd[0][oh][ow] += partial_acc_ptr[partial_acc_ptr_idx];
            }
        }
    }
#endif

    // Number of SIMD-wide feature blocks each work-item finalizes.
#if FEATURE_SLM_SPLIT == 1
# define OFM_VALUES_PER_WI (OFM_BLOCKS_PER_SIMD)
#else
# define OFM_VALUES_PER_WI 1
    // In split mode each surviving sub-group owns a different feature block.
    out_f += get_sub_group_id() * SIMD;
    out_fg += get_sub_group_id() * SIMD;
#endif

#if BIAS_TERM
    BIAS_TYPE bias[OFM_VALUES_PER_WI];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
        bias[ofb] = biases[out_f + ofb * SIMD];
    }
#endif

    // Convert accumulators to the activation type and apply bias.
    ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
        __attribute__((opencl_unroll_hint))
        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
            __attribute__((opencl_unroll_hint))
            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                dequantized[ofb][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][oh][ow]);
#if BIAS_TERM
                dequantized[ofb][oh][ow] += bias[ofb];
#endif
            }
        }
    }

    // Apply fused operations (if any) and convert to the output type.
    OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
        FUSED_OPS_PRELOAD_SCALAR;
#endif
        __attribute__((opencl_unroll_hint))
        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
            __attribute__((opencl_unroll_hint))
            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                ACTIVATION_TYPE dequantized_val = dequantized[ofb][oh][ow];
#if HAS_FUSED_OPS
# if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
                FUSED_OPS_CALC_SCALAR;
# else
                FUSED_OPS_SCALAR;
# endif
                result[ofb][oh][ow] = FUSED_OPS_RESULT_SCALAR;
#else
                result[ofb][oh][ow] = TO_OUTPUT_TYPE(dequantized_val);
#endif
            }
        }
    }

    uint dst_index = OUTPUT_GET_INDEX(out_b, out_fg, out_y, out_x);
    // Fast path: x-block fully inside the output and features block-aligned —
    // sub-group block writes, widest first (8-, 4-, 2-, then 1-wide).
    if ((OUTPUT_SIZE_X % OUT_BLOCK_WIDTH == 0 || out_x + OUT_BLOCK_WIDTH <= OUTPUT_SIZE_X)
        && (OUTPUT_FEATURE_NUM % OFM_BLOCKS_PER_SIMD == 0) ) {
        __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
        for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_fg + ofb * SIMD <= OUTPUT_FEATURE_NUM);
            if (good_of_block) {
                __attribute__((opencl_unroll_hint))
                for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                    bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
                    if (good_y) {
                        uint ow = 0;
#if OUTPUT_TYPE_SIZE == 1
                        __attribute__((opencl_unroll_hint))
                        for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
                            MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
                            __attribute__((opencl_unroll_hint))
                            for (uint i = 0; i < 8; ++i) {
                                result_val[i] = result[ofb][oh][ow + i];
                            }
                            DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
                            dst_index += 8 * SIMD;
                        }
#endif
#if OUTPUT_TYPE_SIZE <= 2
                        __attribute__((opencl_unroll_hint))
                        for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
                            MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
                            __attribute__((opencl_unroll_hint))
                            for (uint i = 0; i < 4; ++i) {
                                result_val[i] = result[ofb][oh][ow + i];
                            }
                            DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
                            dst_index += 4 * SIMD;
                        }
#endif
                        __attribute__((opencl_unroll_hint))
                        for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
                            MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
                            __attribute__((opencl_unroll_hint))
                            for (uint i = 0; i < 2; ++i) {
                                result_val[i] = result[ofb][oh][ow + i];
                            }
                            DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
                            dst_index += 2 * SIMD;
                        }
                        if (OUT_BLOCK_WIDTH % 2 == 1) {
                            OUTPUT_TYPE result_val = result[ofb][oh][ow];
                            DT_OUTPUT_BLOCK_WRITE(output, dst_index, result_val);
                            dst_index += 1 * SIMD;
                        }
                    } // if (good_y)
                    dst_index += OUTPUT_Y_PITCH * FSV - OUT_BLOCK_WIDTH * FSV;
                } // for (OUT_BLOCK_HEIGHT)
            } // if (good_of_block)
            dst_index += OUTPUT_FEATURE_PITCH * FSV - OUTPUT_Y_PITCH * FSV * OUT_BLOCK_HEIGHT;
        } // for (OFM_VALUES_PER_WI)
    } else {
        // Slow path: scalar stores with bounds checks on x and feature tails.
        __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
        for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
            bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_fg + ofb * SIMD <= OUTPUT_FEATURE_NUM);
            if (good_of_block) {
                const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_y, out_x);
                __attribute__((opencl_unroll_hint))
                for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                    bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
                    if (good_y) {
                        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
                        for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
                            if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
                                break;
#endif
#if OUTPUT_FEATURE_NUM % SIMD != 0
                            // Zero lanes past the feature tail so padded features stay clean.
                            if (out_fg + (ofb + 1) * SIMD >= OUTPUT_FEATURE_NUM && get_sub_group_local_id() >= OUTPUT_FEATURE_NUM % SIMD)
                                result[ofb][oh][ow] = (OUTPUT_TYPE)0;
#endif
                            output[dst_index + ow * FSV + oh * OUTPUT_Y_PITCH * FSV] = result[ofb][oh][ow];
                        }
                    }
                }
            }
        }
    }
}
#undef AS_INPUT0_TYPE_4
#undef AS_TYPE_N
#undef AS_TYPE_N_
#undef AS_FILTER_TYPE_4
#undef CEIL_DIV
#undef ALIGN
#undef SIMD
#undef FSV
#undef OFM_VALUES_PER_WI

View File

@@ -0,0 +1,516 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#include "include/mmad.cl"
#include "include/data_types.cl"
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define SIMD 16
#define FSV 16
// int8 conv_input and weights data is packed to int32 "batches",
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
// Int8 IMAD convolution for the b_fs_yx_fsv16 / b_fs_zyx_fsv16 layouts
// (2D and 3D spatials) with grouped-convolution support.
// Each sub-group (SIMD = 16 lanes) accumulates an OUT_BLOCK_DEPTH x
// OUT_BLOCK_HEIGHT x OUT_BLOCK_WIDTH spatial block for OFM_BLOCKS_PER_SIMD
// blocks of SIMD output features. When FEATURE_SLM_SPLIT > 1 the IFM
// reduction is split across the work-group's sub-groups and merged via
// local memory below.
__attribute__((intel_reqd_sub_group_size(SIMD)))
__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
    const __global INPUT0_TYPE *conv_input,
    __global OUTPUT_TYPE *output,
    const __global FILTER_TYPE *weights,
#if BIAS_TERM
    const __global BIAS_TYPE *biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx) {
    // Per-lane column offset for the tail load of an input row: lanes past the
    // IN_BLOCK_WIDTH remainder are clamped to column 0 to avoid OOB reads.
#define LUT_VALUE_CLAMP(x) (( (IN_BLOCK_WIDTH % SIMD == 0) || ((x) < IN_BLOCK_WIDTH % SIMD) ) ? (x) : 0)
    const int tmp = LUT_VALUE_CLAMP(get_sub_group_local_id());
#undef LUT_VALUE_CLAMP

    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
    // gid1 packs (y block, z block); z collapses to 0 for 4-D input.
    const uint out_y = ((uint)get_global_id(1) / ALIGN(OUTPUT_SIZE_Z, OUT_BLOCK_DEPTH)) * OUT_BLOCK_HEIGHT;
#if INPUT0_DIMS == 4
    const uint out_z = 0;
#else
    const uint out_z = ((uint)get_global_id(1) % ALIGN(OUTPUT_SIZE_Z, OUT_BLOCK_DEPTH)) * OUT_BLOCK_DEPTH;
#endif
    // gid2 packs (batch, group, ofm slice).
    const uint out_b = (uint)(get_group_id(2) / CEIL_DIV(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD)) / FILTER_GROUPS_NUM;
    const uint g = (uint)(get_group_id(2) / CEIL_DIV(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD)) % FILTER_GROUPS_NUM;
    uint out_f_sg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % (ALIGN(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD) * FILTER_GROUPS_NUM);
    uint out_f = out_f_sg + get_sub_group_local_id();
    // Output feature index within the group (includes alignment padding).
    uint out_f_g = (out_f % ALIGN(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD));
#if FILTER_OFM_NUM % SIMD != 0
    // Compress global feature indices when per-group OFM is not SIMD-aligned,
    // removing the padding lanes of all preceding groups.
    out_f = out_f - (out_f / ALIGN(FILTER_OFM_NUM, SIMD)) * (SIMD - (FILTER_OFM_NUM % SIMD));
#endif

    // Top-left-front input coordinate (may be negative inside padding).
    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
    const int input_z = out_z * STRIDE_SIZE_Z - PADDING_SIZE_Z;

    // First IFM slice owned by this sub-group when the reduction is split.
#if FEATURE_SLM_SPLIT == 1
    const uint k_start = 0;
#else
    const uint k_start = get_sub_group_id() * FSV;
#endif

    uint filter_idx = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, k_start, 0, 0, 0);
    // Distance between consecutive osv16 output-feature blocks in the weights.
    const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * FSV);

#if INPUT0_DIMS == 4
    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_y, input_x);
#else
    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_z, input_y, input_x);
#endif

    // Accumulators: [ofm block][z][y][x], zero-initialized.
    ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };

#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
    // Offset of this group's first IFM inside its fsv16 slice (grouped conv
    // with IFM not a multiple of 16: loads may straddle two slices).
    uint in_f_offset = (g * FILTER_IFM_NUM) % FSV;
#endif

    // Input cache: each uint4 carries 16 int8 values (one fsv16 slice) per lane.
    uint4 input_val[IN_BLOCK_DEPTH][IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];

    // Main reduction over IFM slices, filter planes and filter rows.
    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < CEIL_DIV(FILTER_IFM_NUM, FSV) / FEATURE_SLM_SPLIT; k++) {
        __attribute__((opencl_unroll_hint(1)))
        for (uint fzn = 0; fzn < FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL; fzn++) {
            __attribute__((opencl_unroll_hint(1)))
            for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
                // Load input block IN_BLOCK_DEPTH x IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
                __attribute__((opencl_unroll_hint))
                for (uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
                        __attribute__((opencl_unroll_hint))
                        for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
                            uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
                            if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
                                if (in_f_offset == 0) {
                                    input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + get_sub_group_local_id() * FSV));
#else
                                input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
#endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
                                } else {
                                    // Unaligned group start: gather the 16 bytes across the fsv16 slice boundary.
                                    INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
                                    __attribute__((opencl_unroll_hint(FSV)))
                                    for (uint v = 0; v < FSV; v++) {
                                        if (v + in_f_offset < FSV) {
                                            input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
                                        } else {
                                            // Jump to the same spatial position in the next feature slice.
                                            input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v +
                                                                           ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
                                                                            (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
                                                                            (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
                                                                           FSV];
                                        }
                                    }
                                }
#endif
                            } else {
                                // Tail of the row: use the clamped lane offset (tmp).
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
                                if (in_f_offset == 0) {
                                    input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + tmp * FSV));
#else
                                input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
#endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
                                } else {
                                    INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
                                    __attribute__((opencl_unroll_hint(FSV)))
                                    for (uint v = 0; v < FSV; v++) {
                                        if (v + in_f_offset < FSV) {
                                            input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
                                        } else {
                                            input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v +
                                                                           ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
                                                                            (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
                                                                            (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
                                                                           FSV];
                                        }
                                    }
                                }
#endif
                            }
                        }
                    }
                }

                __attribute__((opencl_unroll_hint))
                for (uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
                    __attribute__((opencl_unroll_hint))
                    for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
                        __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
                        for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
                            // 16 int8 weights per lane for each output-feature block.
                            uint4 weights_val[OFM_BLOCKS_PER_SIMD];
                            __attribute__((opencl_unroll_hint))
                            for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
                                weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
                            }
                            // ive walks the 4 uints (= 4 groups of 4 int8) of each vector.
                            __attribute__((opencl_unroll_hint))
                            for (uint ive = 0; ive < 4; ive++) {
                                __attribute__((opencl_unroll_hint))
                                for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
                                    __attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
                                    for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
                                        __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
                                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                                            __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
                                            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
                                                const uint ow_offset = ow + OUT_BLOCK_WIDTH; // NOTE(review): appears unused — candidate for removal
                                                const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
                                                const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
                                                const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
                                                // Fetch the cached column from the lane that owns it.
                                                const uint shuffle_wi = x_block_idx % SIMD;
                                                const uint shuffle_idx = x_block_idx / SIMD;
                                                dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
                                                    IMAD(dotProd[ofb][od][oh][ow],
                                                         AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
                                                                                                  shuffle_wi)),
                                                         AS_FILTER_TYPE_4(weights_val[ofb][ive])));
                                            }
                                        }
                                    }
                                }
                            }
                            filter_idx += FSV * FSV;
                        }
                    }
                }
                input_start_idx += DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
            }
            // Next filter plane: advance z, rewind the y offset walked above.
            input_start_idx += DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV - (FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL) * DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
        }
        // Next IFM slice owned by this sub-group: advance features, rewind z.
        input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL) * DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV;
        filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * (FEATURE_SLM_SPLIT - 1);
    }

#if FEATURE_SLM_SPLIT != 1
    // Additional local memory reduction for feature split mode
# if FEATURE_SLM_SPLIT < OFM_BLOCKS_PER_SIMD
# error convolution_gpu_b_fs_zyx_fsv16_imad.cl - OFM_BLOCKS_PER_SIMD must be less or equal to FEATURE_SLM_SPLIT
# endif
    // Scratch for the partial sums of all sub-groups except the one that keeps
    // the corresponding block in registers.
    const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH;
    __local ACCUMULATOR_TYPE partial_acc[partial_acc_size];

    uint sgid_start_idx = get_sub_group_id();
    sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
    __local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
                                                get_sub_group_local_id();

    if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
        __attribute__((opencl_unroll_hint))
        for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
            if (get_sub_group_id() == wg) {
                // Sub-group wg keeps block wg in registers; other blocks are
                // spilled to local memory for the owning sub-groups.
                __attribute__((opencl_unroll_hint))
                for (uint ofb = 0; ofb < wg; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
                        __attribute__((opencl_unroll_hint))
                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                            __attribute__((opencl_unroll_hint))
                            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                                const uint partial_acc_ptr_idx =
                                    ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                                    od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                                    oh * OUT_BLOCK_WIDTH * SIMD +
                                    ow * SIMD;
                                partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
                            }
                        }
                    }
                }
                __attribute__((opencl_unroll_hint))
                for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
                    __attribute__((opencl_unroll_hint))
                    for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                        __attribute__((opencl_unroll_hint))
                        for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                            dotProd[0][od][oh][ow] = dotProd[wg][od][oh][ow];
                        }
                    }
                }
                __attribute__((opencl_unroll_hint))
                for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
                    __attribute__((opencl_unroll_hint))
                    for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
                        __attribute__((opencl_unroll_hint))
                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                            __attribute__((opencl_unroll_hint))
                            for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                                const uint partial_acc_ptr_idx =
                                    ((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * OFM_SIZE_PER_SIMD : 0) +
                                    ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                                    od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                                    oh * OUT_BLOCK_WIDTH * SIMD +
                                    ow * SIMD;
                                partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
                            }
                        }
                    }
                }
            }
        }
    } else {
        // Sub-groups that only computed partials spill everything.
        __attribute__((opencl_unroll_hint))
        for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
            __attribute__((opencl_unroll_hint))
            for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
                __attribute__((opencl_unroll_hint))
                for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                    __attribute__((opencl_unroll_hint))
                    for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                        const uint partial_acc_ptr_idx =
                            ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                            od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                            oh * OUT_BLOCK_WIDTH * SIMD +
                            ow * SIMD;
                        partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
                    }
                }
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Only the first OFM_BLOCKS_PER_SIMD sub-groups accumulate and store results.
    if (get_sub_group_id() >= OFM_BLOCKS_PER_SIMD)
        return;

    partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD + get_sub_group_local_id();
    __attribute__((opencl_unroll_hint))
    for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
        __attribute__((opencl_unroll_hint))
        for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
            __attribute__((opencl_unroll_hint))
            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                __attribute__((opencl_unroll_hint))
                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                    const uint partial_acc_ptr_idx =
                        wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
                        od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
                        oh * OUT_BLOCK_WIDTH * SIMD +
                        ow * SIMD;
                    dotProd[0][od][oh][ow] += partial_acc_ptr[partial_acc_ptr_idx];
                }
            }
        }
    }
#endif

    // Number of SIMD-wide feature blocks each work-item finalizes.
#if FEATURE_SLM_SPLIT == 1
# define OFM_VALUES_PER_WI (OFM_BLOCKS_PER_SIMD)
#else
# define OFM_VALUES_PER_WI 1
    // In split mode each surviving sub-group owns a different feature block.
    out_f += get_sub_group_id() * SIMD;
    out_f_sg += get_sub_group_id() * SIMD;
#endif

#if BIAS_TERM
    BIAS_TYPE bias[OFM_VALUES_PER_WI];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
        bias[ofb] = biases[out_f + ofb * SIMD];
    }
#endif

    // Convert accumulators to the activation type and apply bias.
    ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
        __attribute__((opencl_unroll_hint))
        for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
            __attribute__((opencl_unroll_hint))
            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                __attribute__((opencl_unroll_hint))
                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                    dequantized[ofb][od][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][od][oh][ow]);
#if BIAS_TERM
                    dequantized[ofb][od][oh][ow] += bias[ofb];
#endif
                }
            }
        }
    }

    // Apply fused operations (if any) and convert to the output type.
    OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
    __attribute__((opencl_unroll_hint))
    for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
        FUSED_OPS_PRELOAD_SCALAR;
#endif
        __attribute__((opencl_unroll_hint))
        for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
            __attribute__((opencl_unroll_hint))
            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                __attribute__((opencl_unroll_hint))
                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
                    ACTIVATION_TYPE dequantized_val = dequantized[ofb][od][oh][ow];
#if HAS_FUSED_OPS
# if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
                    FUSED_OPS_CALC_SCALAR;
# else
                    FUSED_OPS_SCALAR;
# endif
                    result[ofb][od][oh][ow] = FUSED_OPS_RESULT_SCALAR;
#else
                    result[ofb][od][oh][ow] = TO_OUTPUT_TYPE(dequantized_val);
#endif
                }
            }
        }
    }

#if OUTPUT_DIMS == 4
    uint dst_index = OUTPUT_GET_INDEX(out_b, out_f_sg, out_y, out_x);
#else
    uint dst_index = OUTPUT_GET_INDEX(out_b, out_f_sg, out_z, out_y, out_x);
#endif

    // Fast path is compiled only when features are block-aligned; otherwise
    // only the scalar fallback below exists.
#if ((FILTER_OFM_NUM % OFM_BLOCKS_PER_SIMD == 0) && ((FILTER_GROUPS_NUM == 1) || (FILTER_OFM_NUM % SIMD == 0)))
    if ((OUTPUT_SIZE_X % OUT_BLOCK_WIDTH == 0 || out_x + OUT_BLOCK_WIDTH <= OUTPUT_SIZE_X)) {
        __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
        for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
            bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
            if (good_of_block) {
                __attribute__((opencl_unroll_hint))
                for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
                    bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
                    if (good_z) {
                        __attribute__((opencl_unroll_hint))
                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                            bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
                            if (good_y) {
                                uint ow = 0;
                                // Widest available block writes first: 8-, 4-, 2-, then 1-wide.
#if OUTPUT_TYPE_SIZE == 1
                                __attribute__((opencl_unroll_hint))
                                for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
                                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
                                    __attribute__((opencl_unroll_hint))
                                    for (uint i = 0; i < 8; ++i) {
                                        result_val[i] = result[ofb][od][oh][ow + i];
                                    }
                                    DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
                                    dst_index += 8 * SIMD;
                                }
#endif
#if OUTPUT_TYPE_SIZE <= 2
                                __attribute__((opencl_unroll_hint))
                                for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
                                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
                                    __attribute__((opencl_unroll_hint))
                                    for (uint i = 0; i < 4; ++i) {
                                        result_val[i] = result[ofb][od][oh][ow + i];
                                    }
                                    DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
                                    dst_index += 4 * SIMD;
                                }
#endif
                                __attribute__((opencl_unroll_hint))
                                for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
                                    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
                                    __attribute__((opencl_unroll_hint))
                                    for (uint i = 0; i < 2; ++i) {
                                        result_val[i] = result[ofb][od][oh][ow + i];
                                    }
                                    DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
                                    dst_index += 2 * SIMD;
                                }
                                if (OUT_BLOCK_WIDTH % 2 == 1) {
                                    OUTPUT_TYPE result_val = result[ofb][od][oh][ow];
                                    DT_OUTPUT_BLOCK_WRITE(output, dst_index, result_val);
                                    dst_index += 1 * SIMD;
                                }
                            } // if (good_y)
                            dst_index += OUTPUT_Y_PITCH * FSV - OUT_BLOCK_WIDTH * FSV;
                        } // for (OUT_BLOCK_HEIGHT)
                    } // if (good_z)
                    dst_index += OUTPUT_Z_PITCH * FSV - OUTPUT_Y_PITCH * OUT_BLOCK_HEIGHT * FSV;
                } // for (OUT_BLOCK_DEPTH)
            } // if (good_of_block)
            dst_index += OUTPUT_FEATURE_PITCH * FSV - OUTPUT_Z_PITCH * OUT_BLOCK_DEPTH * FSV;
        } // for (OFM_VALUES_PER_WI)
    } else {
#endif
        // Fallback: scalar stores with bounds checks on x / z / y and feature tails.
        __attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
        for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
            bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
            if (good_of_block) {
#if OUTPUT_DIMS == 4
                const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_y, out_x);
#else
                const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_z, out_y, out_x);
#endif
                __attribute__((opencl_unroll_hint))
                for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
                    bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
                    if (good_z) {
                        __attribute__((opencl_unroll_hint))
                        for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                            bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
                            if (good_y) {
                                __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
                                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
                                    if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
                                        break;
#endif
                                    // Skip lanes that fall into per-group OFM padding.
                                    if (out_f_g < FILTER_OFM_NUM) {
                                        output[dst_index + ow * FSV + oh * OUTPUT_Y_PITCH * FSV + od * OUTPUT_Z_PITCH * FSV] = result[ofb][od][oh][ow];
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
#if ((FILTER_OFM_NUM % OFM_BLOCKS_PER_SIMD == 0) && ((FILTER_GROUPS_NUM == 1) || (FILTER_OFM_NUM % SIMD == 0)))
    }
#endif
}
#undef AS_INPUT0_TYPE_4
#undef AS_TYPE_N
#undef AS_TYPE_N_
#undef AS_FILTER_TYPE_4
#undef CEIL_DIV
#undef ALIGN
#undef SIMD
#undef FSV
#undef OFM_VALUES_PER_WI

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -353,6 +353,45 @@ inline uint FUNC(get_os_is_zyx_osv_isv_index)(uint o, uint i, uint z, uint y, ui
return output_offset;
}
// Linear offset into a g_os_is_zyx_osv_isv weight layout.
// Innermost dims are the isv/osv sub-blocks; x, y, z, the IFM-slice index,
// the OFM-slice index and the group index follow in increasing pitch order.
// Slice counts are rounded up, so partially filled tail slices are padded.
inline uint FUNC(get_g_os_is_zyx_osv_isv_index)(uint g, uint o, uint i, uint z, uint y, uint x,
    uint x_size, uint y_size, uint z_size, uint i_size, uint o_size, uint osv_size, uint isv_size)
{
    // Position inside the osv/isv sub-block and the sub-block (slice) indices.
    const uint i_within_slice = i % isv_size;
    const uint o_within_slice = o % osv_size;
    const uint i_slice = i / isv_size;
    const uint o_slice = o / osv_size;
    const uint i_slice_count = (i_size + isv_size - 1) / isv_size;
    const uint o_slice_count = (o_size + osv_size - 1) / osv_size;
    // Strides between neighboring elements of each coordinate.
    const uint pitch_x  = osv_size * isv_size;
    const uint pitch_y  = pitch_x * x_size;
    const uint pitch_z  = pitch_y * y_size;
    const uint pitch_is = pitch_z * z_size;
    const uint pitch_os = pitch_is * i_slice_count;
    const uint pitch_g  = pitch_os * o_slice_count;
    return i_within_slice
         + o_within_slice * isv_size
         + x * pitch_x
         + y * pitch_y
         + z * pitch_z
         + i_slice * pitch_is
         + o_slice * pitch_os
         + g * pitch_g;
}
#define GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(prefix, g, o, i, z, y, x) \
FUNC_CALL(get_g_os_is_zyx_osv_isv_index)( \
g, o, i, z, y, x, \
CAT(prefix, _SIZE_X), \
CAT(prefix, _SIZE_Y), \
CAT(prefix, _SIZE_Z), \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _OFM_NUM), \
16, \
16)
#define GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(prefix, o, i, y, x) \
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
o, i, 0, y, x, \
@@ -364,6 +403,17 @@ inline uint FUNC(get_os_is_zyx_osv_isv_index)(uint o, uint i, uint z, uint y, ui
16, \
16)
#define GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(prefix, o, i, z, y, x) \
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
o, i, z, y, x, \
CAT(prefix, _SIZE_X), \
CAT(prefix, _SIZE_Y), \
CAT(prefix, _SIZE_Z), \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _OFM_NUM), \
16, \
16)
#define GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(prefix, o, i, z, y, x) \
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
o, i, z, y, x, \

View File

@@ -1,206 +0,0 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/include_all.cl"
#include "include/data_types.cl"
#define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple))
#define AS_TYPE(type, val) CAT(as_, type)(val)
#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
#define ACTIVATION_VEC16 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 16)
#define TO_ACTIVATION_VEC16 CAT(convert_, ACTIVATION_VEC16)
#define FEATURE_SLICE_SIZE 16
#if MAX_POOLING
#define INIT_VAL ACCUMULATOR_VAL_MIN
#elif AVG_POOLING
#define INIT_VAL ACCUMULATOR_VAL_ZERO
#else
#error
#endif
// Single pooling reduction step: folds one input value into the running
// accumulator. The reduction operator is chosen at compile time via the
// MAX_POOLING / AVG_POOLING defines.
inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE acc, ACCUMULATOR_TYPE val)
{
#if MAX_POOLING
    // Max pooling keeps the running maximum.
    return ACCUMULATOR_MAX_FUNC(acc, val);
#elif AVG_POOLING
    // Average pooling accumulates a sum; the caller divides afterwards.
    return acc + val;
#endif
}
// Max/avg pooling over a 2D window for int8/uint8 data in b_fs_yx_fsv16
// layout. Each work-item produces one (x, y) output location for a slice of
// FEATURE_SLICE_SIZE (16) features; get_global_id(2) enumerates packed
// (batch, feature-slice) pairs.
// NOTE(review): this kernel assumes the feature count is padded to a full
// 16-wide slice — the vload4/vstore4 below always touch 16 bytes.
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
KERNEL(pooling_gpu_b_fs_yx_fsv16)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
const uint x = (uint)get_global_id(0);
const uint y = (uint)get_global_id(1);
// Decompose gid(2) into batch index and first feature of this 16-slice.
const uint bf = (uint)get_global_id(2);
const uint f = (bf * FEATURE_SLICE_SIZE) % ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
const uint b = (bf * FEATURE_SLICE_SIZE) / ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
// Top-left corner of the pooling window in input coordinates; may be
// negative when the window starts in the padded border.
const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
// One accumulator per feature channel in the slice, seeded with the
// operation's neutral element (min for max-pooling, zero for avg).
ACCUMULATOR_TYPE result[FEATURE_SLICE_SIZE] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL,
INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
#ifdef CHECK_BOUNDRY
// Slow path: the window can stick out of the input, so every tap is
// bounds-checked. Bail out early if the window is entirely outside.
if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
{
return;
}
#ifdef DYNAMIC_KERNEL_DIVIDER
// Average divisor counts only the taps that landed inside the input.
uint num_elements = 0;
#endif
const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0);
__attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
for(uint j = 0; j < POOL_SIZE_Y; j++)
{
int input_offset_y = offset_y + j;
bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
if(!zero_y)
{
__attribute__((opencl_unroll_hint(POOL_SIZE_X)))
for(uint i = 0; i < POOL_SIZE_X; i++)
{
int input_offset_x = offset_x + i;
bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
if(!zero)
{
const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
// Load all 16 one-byte features in one 16-byte (int4) read.
int4 int_data = vload4(0, (__global int*)(input + input_idx));
IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
{
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
}
#ifdef DYNAMIC_KERNEL_DIVIDER
num_elements++;
#endif
}
}
}
}
#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
// Divisor spans the window clipped to input-plus-padding extents.
const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
const uint num_elements = (hend - offset_y) * (wend - offset_x);
#endif
#else // !CHECK_BOUNDRY
// Fast path: the whole window is guaranteed inside the input, so walk it
// with incremental pitch updates and no bounds checks.
uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
__attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
for(uint j = 0; j < POOL_SIZE_Y; j++)
{
__attribute__((opencl_unroll_hint(POOL_SIZE_X)))
for(uint i = 0; i < POOL_SIZE_X; i++)
{
int4 int_data = vload4(0, (__global int*)(input + input_idx));
IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
{
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
}
input_idx += IN_X_PITCH;
}
// Rewind the X advance and step one row down.
input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
}
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
const uint num_elements = POOL_SIZE_X*POOL_SIZE_Y;
#endif
#endif
// Post-process accumulators: average pooling divides by the element count
// (optionally with rounding); max pooling copies accumulators through.
ACTIVATION_VEC16 pool_result;
#if defined AVG_POOLING
#if ENABLE_ROUND
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
#else
pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
#endif
}
#else
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
#else
pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
#endif
}
#endif // ENABLE_ROUND
#else // AVG_POOLING
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
pool_result[i] = result[i];
}
#endif // AVG_POOLING
// Apply fused ops (or plain activation) per channel and pack the output.
OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_PRELOAD;
#endif
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_CALC;
#else
FUSED_OPS;
#endif
final_result[i] = FUSED_OPS_RESULT;
#else
final_result[i] = TO_OUTPUT_TYPE(ACTIVATION(pool_result[i], ACTIVATION_PARAMS));
#endif
}
const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
#if OUTPUT_TYPE_SIZE == 1
// 1-byte outputs: store 16 channels as four uints in a single vstore4.
vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
#else
*((__global OUT_VEC16*)(output + output_pos)) = final_result;
#endif
}
#undef ALIGN_TO
#undef AS_TYPE
#undef IN_VEC16
#undef OUT_VEC16
#undef ACTIVATION_VEC16
#undef TO_ACTIVATION_VEC16
#undef INIT_VAL
#undef FEATURE_SLICE_SIZE

View File

@@ -0,0 +1,311 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/include_all.cl"
#include "include/data_types.cl"
#define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple))
#define AS_TYPE(type, val) CAT(as_, type)(val)
#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
#define ACTIVATION_VEC16 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 16)
#define TO_ACTIVATION_VEC16 CAT(convert_, ACTIVATION_VEC16)
#define FEATURE_SLICE_SIZE 16
#if MAX_POOLING
#define INIT_VAL ACCUMULATOR_VAL_MIN
#elif AVG_POOLING
#define INIT_VAL ACCUMULATOR_VAL_ZERO
#else
#error
#endif
// Single pooling reduction step: folds one input value into the running
// accumulator. The reduction operator is chosen at compile time via the
// MAX_POOLING / AVG_POOLING defines.
inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE acc, ACCUMULATOR_TYPE val)
{
#if MAX_POOLING
    // Max pooling keeps the running maximum.
    return ACCUMULATOR_MAX_FUNC(acc, val);
#elif AVG_POOLING
    // Average pooling accumulates a sum; the caller divides afterwards.
    return acc + val;
#endif
}
// Max/avg pooling over a 2D or 3D window for int8/uint8 data in
// b_fs_yx_fsv16 / b_fs_zyx_fsv16 layouts. Each work-item produces one
// (x, y, z) output location for a slice of FEATURE_SLICE_SIZE (16) features;
// get_global_id(2) enumerates packed (batch, feature-slice) pairs. Unlike
// the pure 2D variant, this kernel handles a partial last feature slice
// (INPUT0_FEATURE_NUM not divisible by 16) via scalar tail loads/stores.
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
KERNEL(pooling_gpu_b_fs_zyx_fsv16)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
const uint x = (uint)get_global_id(0);
#if OUTPUT_DIMS == 4
// 2D case: gid(1) is plain Y; Z collapses to zero.
const uint y = (uint)get_global_id(1);
const uint z = 0;
#else
// 3D case: gid(1) packs (z, y); unpack it.
const uint zy = (uint)get_global_id(1);
const uint y = zy % OUTPUT_SIZE_Y;
const uint z = zy / OUTPUT_SIZE_Y;
#endif
// Decompose gid(2) into batch index and first feature of this 16-slice.
const uint bf = (uint)get_global_id(2);
const uint f = (bf * FEATURE_SLICE_SIZE) % ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
const uint b = (bf * FEATURE_SLICE_SIZE) / ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
// True for the last (possibly partial) feature slice; selects the scalar
// tail path when the feature count is not a multiple of 16.
const bool last_in_f_group = (f == FEATURE_SLICE_SIZE * ((INPUT0_FEATURE_NUM - 1) / FEATURE_SLICE_SIZE));
// Front corner of the pooling window in input coordinates; may be negative
// when the window starts in the padded border.
const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
const int offset_z = (int)z*STRIDE_SIZE_Z - PADDING_SIZE_Z;
// One accumulator per feature channel, seeded with the op's neutral element.
ACCUMULATOR_TYPE result[FEATURE_SLICE_SIZE] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL,
INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
#ifdef CHECK_BOUNDRY
// Slow path: every tap is bounds-checked. Bail out if the whole window is
// outside the input volume.
if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y ||
offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z)
{
return;
}
#ifdef DYNAMIC_KERNEL_DIVIDER
// Average divisor counts only the taps that landed inside the input.
uint num_elements = 0;
#endif
#if INPUT0_DIMS == 4
const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0);
#else
const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0, 0);
#endif
__attribute__((opencl_unroll_hint(POOL_SIZE_Z)))
for(uint pz = 0; pz < POOL_SIZE_Z; pz++)
{
int input_offset_z = offset_z + pz;
bool zero_z = input_offset_z >= INPUT0_SIZE_Z || input_offset_z < 0;
if(!zero_z)
{
__attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
for(uint py = 0; py < POOL_SIZE_Y; py++)
{
int input_offset_y = offset_y + py;
bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
if(!zero_y)
{
__attribute__((opencl_unroll_hint(POOL_SIZE_X)))
for(uint px = 0; px < POOL_SIZE_X; px++)
{
int input_offset_x = offset_x + px;
bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
if(!zero)
{
const uint input_idx = batch_and_feature_offset + input_offset_z*IN_Z_PITCH + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
IN_VEC16 ch16_data;
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
if (!last_in_f_group) {
#endif
// Full slice: load all 16 one-byte features as one int4.
ch16_data = AS_TYPE(IN_VEC16, vload4(0, (__global int*)(input + input_idx)));
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
} else {
// Partial last slice: scalar loads for the valid channels only.
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
ch16_data[k] = input[input_idx + k];
}
}
#endif
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
if (!last_in_f_group) {
#endif
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
{
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
}
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
} else {
// Accumulate only the channels that exist in the partial slice.
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++)
{
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
}
}
#endif
#ifdef DYNAMIC_KERNEL_DIVIDER
num_elements++;
#endif
}
}
}
}
}
}
#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
// Divisor spans the window clipped to input-plus-padding extents.
const int dend = min(offset_z + POOL_SIZE_Z, INPUT0_SIZE_Z + PADDING_SIZE_Z);
const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
const uint num_elements = (dend - offset_z) * (hend - offset_y) * (wend - offset_x);
#endif
#else // !CHECK_BOUNDRY
// Fast path: the whole window is guaranteed inside the input; walk it with
// incremental pitch updates and no bounds checks.
#if INPUT0_DIMS == 4
uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
#else
uint input_idx = INPUT0_GET_INDEX(b, f, offset_z, offset_y, offset_x);
#endif
__attribute__((opencl_unroll_hint(POOL_SIZE_Z)))
for(uint pz = 0; pz < POOL_SIZE_Z; pz++)
{
__attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
for(uint py = 0; py < POOL_SIZE_Y; py++)
{
__attribute__((opencl_unroll_hint(POOL_SIZE_X)))
for(uint px = 0; px < POOL_SIZE_X; px++)
{
IN_VEC16 ch16_data;
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
if (!last_in_f_group) {
#endif
// Full slice: one vectorized 16-byte load.
ch16_data = AS_TYPE(IN_VEC16, vload4(0, (__global int*)(input + input_idx)));
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
} else {
// Partial last slice: scalar loads for the valid channels only.
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
ch16_data[k] = input[input_idx + k];
}
}
#endif
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
if (!last_in_f_group) {
#endif
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
{
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
}
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
} else {
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++)
{
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
}
}
#endif
input_idx += IN_X_PITCH;
}
// Rewind the X advance and step one row down.
input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
}
// Rewind the Y advance and step one plane deeper.
input_idx += (IN_Z_PITCH - POOL_SIZE_Y*IN_Y_PITCH);
}
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
const uint num_elements = POOL_SIZE_X*POOL_SIZE_Y*POOL_SIZE_Z;
#endif
#endif
// Post-process accumulators: average pooling divides by the element count
// (optionally with rounding); max pooling copies accumulators through.
ACTIVATION_VEC16 pool_result;
#if defined AVG_POOLING
#if ENABLE_ROUND
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
#else
pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X)));
#endif
}
#else
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
#else
pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X);
#endif
}
#endif // ENABLE_ROUND
#else // AVG_POOLING
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
pool_result[i] = result[i];
}
#endif // AVG_POOLING
// Apply fused ops (or plain activation) per channel and pack the output.
OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_PRELOAD;
#endif
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_CALC;
#else
FUSED_OPS;
#endif
final_result[i] = FUSED_OPS_RESULT;
#else
final_result[i] = TO_OUTPUT_TYPE(ACTIVATION(pool_result[i], ACTIVATION_PARAMS));
#endif
}
#if OUTPUT_DIMS == 4
const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
#else
const uint output_pos = OUTPUT_GET_INDEX(b, f, z, y, x);
#endif
#if OUTPUT_TYPE_SIZE == 1
#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
if (!last_in_f_group) {
#endif
// 1-byte outputs, full slice: store 16 channels as one uint4.
vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
} else {
// Partial last slice: scalar stores so we never write past the tail.
__attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE)))
for(uint k = 0; k < OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
output[output_pos + k] = final_result[k];
}
}
#endif
#else
#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
if (!last_in_f_group) {
#endif
// Wider output types: store the whole 16-wide vector at once.
*((__global OUT_VEC16*)(output + output_pos)) = final_result;
#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
} else {
// Partial last slice: scalar stores so we never write past the tail.
__attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE)))
for(uint k = 0; k < OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
output[output_pos + k] = final_result[k];
}
}
#endif
#endif
}
#undef ALIGN_TO
#undef AS_TYPE
#undef IN_VEC16
#undef OUT_VEC16
#undef ACTIVATION_VEC16
#undef TO_ACTIVATION_VEC16
#undef INIT_VAL
#undef FEATURE_SLICE_SIZE

View File

@@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -93,6 +93,10 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x
return GET_FILTER_GOIYX(INPUT0, g, o, i, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_YX_OSV16_ISV16
return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(INPUT0, o, i, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV16_ISV16
return GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(INPUT0, o, i, z, y, x);
#elif defined INPUT0_LAYOUT_G_OS_IS_ZYX_OSV16_ISV16
return GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(INPUT0, g, o, i, z, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV32_ISV16
return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(INPUT0, o, i, z, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV64_ISV16
@@ -224,6 +228,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
return GET_FILTER_G_OS_IS_YX_ISV16_OSV16_INDEX(OUTPUT, g, o, i, y, x, SUB_GROUP_SIZE);
#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSV16_ISV16
return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(OUTPUT, o, i, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV16_ISV16
return GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(OUTPUT, o, i, z, y, x);
#elif defined OUTPUT_LAYOUT_G_OS_IS_ZYX_OSV16_ISV16
return GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(OUTPUT, g, o, i, z, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV32_ISV16
return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(OUTPUT, o, i, z, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV64_ISV16

View File

@@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -306,6 +306,7 @@ std::string toString(WeightsLayout layout) {
case WeightsLayout::yxio: return "YXIO";
case WeightsLayout::os_is_yx_isv16_osv16: return "OS_IS_YX_ISV16_OSV16";
case WeightsLayout::os_is_yx_osv16_isv16: return "OS_IS_YX_OSV16_ISV16";
case WeightsLayout::os_is_zyx_osv16_isv16: return "OS_IS_ZYX_OSV16_ISV16";
case WeightsLayout::os_is_zyx_osv32_isv16: return "OS_IS_ZYX_OSV32_ISV16";
case WeightsLayout::os_is_zyx_osv64_isv16: return "OS_IS_ZYX_OSV64_ISV16";
case WeightsLayout::os_iyx_osv16: return "OS_IYX_OSV16";
@@ -371,6 +372,7 @@ std::string toString(WeightsLayout layout) {
case WeightsLayout::gs_oi_yxs_gsv32_yxsv4: return "GS_OI_YXS_GSV32_YXSV4";
case WeightsLayout::g_os_is_yx_isv16_osv16: return "G_OS_IS_YX_ISV16_OSV16";
case WeightsLayout::g_os_is_yx_osv16_isv4: return "G_OS_IS_YX_OSV16_ISV4";
case WeightsLayout::g_os_is_zyx_osv16_isv16: return "G_OS_IS_ZYX_OSV16_ISV16";
case WeightsLayout::g_os_zyx_is_osv16_isv4: return "G_OS_ZYX_IS_OSV16_ISV4";
case WeightsLayout::g_os_zyx_is_osv16_isv16: return "G_OS_ZYX_IS_OSV16_ISV16";
case WeightsLayout::g_os_zyx_is_osv16_isv32: return "G_OS_ZYX_IS_OSV16_ISV32";

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2018 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -118,7 +118,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
// window size spatial Y", filter_size.spatial[1], "First convolution is outside of image. please reduce input
// offset Y");
if (input_layout.format == format::bfzyx) {
if (input_layout.format.spatial_num() == 3) {
// convolution 3D
CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
"Stride spatial Z",

View File

@@ -199,6 +199,8 @@ attach_convolution_gpu::attach_convolution_gpu() {
// block i8 format
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
// MMAD
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);

View File

@@ -110,6 +110,8 @@ attach_scale_gpu::attach_scale_gpu() {
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), val_fw);
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw);

View File

@@ -72,12 +72,7 @@ void pre_replace_deconv::run(program_impl& p) {
!((_lo.get_optimization_attributes().b_fs_yx_fsv16_network || input_node.get_output_layout().format == format::b_fs_yx_fsv16) &&
_lo.is_format_optimized(node->as<deconvolution>(), format::b_fs_yx_fsv16));
// int8/uint8 input
perform_opt |= (input_node.get_output_layout().data_type == data_types::i8 || input_node.get_output_layout().data_type == data_types::u8) &&
// imad convolution kernel limitation for groups
(groups == 1 || weights_node.get_output_layout().size.feature[0] % 4 == 0 ||
groups == static_cast<uint32_t>(input_node.get_output_layout().size.feature[0])) &&
// no uint8/int8 3D convolution support
input_node.get_output_layout().format.dimension() == 4;
perform_opt |= (input_node.get_output_layout().data_type == data_types::i8 || input_node.get_output_layout().data_type == data_types::u8);
if (!perform_opt)
continue;

View File

@@ -123,6 +123,7 @@ void prepare_padding::run(program_impl& p) {
if (conv_layout.format != cldnn::format::bfyx &&
conv_layout.format != cldnn::format::bf8_xy16 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
conv_layout.format != cldnn::format::b_fs_zyx_fsv16 &&
conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
conv_layout.format != cldnn::format::byxf_af32 &&
conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
@@ -154,11 +155,15 @@ void prepare_padding::run(program_impl& p) {
(filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] +
(filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
auto input_limit_z = input_offset.spatial[2] + (conv_layout.size.spatial[2] - 1) * stride.spatial[2] +
(filter_layout.size.spatial[2] - 1) * dilation.spatial[2] + 1;
auto left_padding = std::max(-input_offset.spatial[0], 0);
auto top_padding = std::max(-input_offset.spatial[1], 0);
auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
auto padding_begin_x = std::max(-input_offset.spatial[0], 0);
auto padding_begin_y = std::max(-input_offset.spatial[1], 0);
auto padding_begin_z = std::max(-input_offset.spatial[2], 0);
auto padding_end_x = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
auto padding_end_y = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
auto padding_end_z = std::max(input_limit_z - prev_prim_output_layout.size.spatial[2], 0);
// Adjust right padding, so entire buffer size in X dimension is properly aligned.
// TODO: NOTE: Will be reenabled with next check-in once heuristic for line-aligned algorithm will be added.
@@ -166,7 +171,7 @@ void prepare_padding::run(program_impl& p) {
// round_up_to(left_padding + prev_prim_output_layout.size.spatial[0] + right_padding, 16));
// right_padding = needed_buffer_size_x - left_padding - prev_prim_output_layout.size.spatial[0];
cldnn::padding needed_padding({0, 0, left_padding, top_padding}, {0, 0, right_padding, bottom_padding}, 0);
cldnn::padding needed_padding({0, 0, padding_begin_x, padding_begin_y, padding_begin_z}, {0, 0, padding_end_x, padding_end_y, padding_end_z}, 0);
needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
p.apply_needed_padding(node, conv_input_node, needed_padding);
}
@@ -209,13 +214,17 @@ void prepare_padding::run(program_impl& p) {
(filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] +
(filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
auto input_limit_z = input_offset.spatial[2] + (conv_layout.size.spatial[2] - 1) * stride.spatial[2] +
(filter_layout.size.spatial[2] - 1) * dilation.spatial[2] + 1;
auto left_padding = std::max(-input_offset.spatial[0], 0);
auto top_padding = std::max(-input_offset.spatial[1], 0);
auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
auto padding_begin_x = std::max(-input_offset.spatial[0], 0);
auto padding_begin_y = std::max(-input_offset.spatial[1], 0);
auto padding_begin_z = std::max(-input_offset.spatial[2], 0);
auto padding_end_x = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
auto padding_end_y = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
auto padding_end_z = std::max(input_limit_z - prev_prim_output_layout.size.spatial[2], 0);
cldnn::padding needed_padding({0, 0, left_padding, top_padding}, {0, 0, right_padding, bottom_padding}, 0);
cldnn::padding needed_padding({0, 0, padding_begin_x, padding_begin_y, padding_begin_z}, {0, 0, padding_end_x, padding_end_y, padding_end_z}, 0);
needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
p.apply_needed_padding(node, conv_input_node, needed_padding);

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2017-2019 Intel Corporation
// Copyright (c) 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -205,6 +205,8 @@ inline std::string fmt_to_str(format fmt) {
return "g_os_is_zyx_isv16_osv16";
case format::g_os_is_yx_osv16_isv4:
return "g_os_is_yx_osv16_isv4";
case format::g_os_is_zyx_osv16_isv16:
return "g_os_is_zyx_osv16_isv16";
case format::g_os_zyx_is_osv16_isv4:
return "g_os_zyx_is_osv16_isv4";
case format::g_os_zyx_is_osv16_isv16:

View File

@@ -327,6 +327,10 @@ kernel_selector::weights_layout to_weights_layout(format f) {
return kernel_selector::weights_layout::g_os_is_zyx_isv16_osv16;
case format::g_os_is_yx_osv16_isv4:
return kernel_selector::weights_layout::g_os_is_yx_osv16_isv4;
case format::os_is_zyx_osv16_isv16:
return kernel_selector::weights_layout::os_is_zyx_osv16_isv16;
case format::g_os_is_zyx_osv16_isv16:
return kernel_selector::weights_layout::g_os_is_zyx_osv16_isv16;
case format::g_os_zyx_is_osv16_isv4:
return kernel_selector::weights_layout::g_os_zyx_is_osv16_isv4;
case format::g_os_zyx_is_osv16_isv16:
@@ -450,6 +454,10 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
return cldnn::format::g_os_is_zyx_isv16_osv16;
case kernel_selector::weights_layout::os_is_yx_osv16_isv4:
return cldnn::format::g_os_is_yx_osv16_isv4;
case kernel_selector::weights_layout::os_is_zyx_osv16_isv16:
return cldnn::format::os_is_zyx_osv16_isv16;
case kernel_selector::weights_layout::g_os_is_zyx_osv16_isv16:
return cldnn::format::g_os_is_zyx_osv16_isv16;
case kernel_selector::weights_layout::g_os_zyx_is_osv16_isv4:
return cldnn::format::g_os_zyx_is_osv16_isv4;
case kernel_selector::weights_layout::g_os_zyx_is_osv16_isv16:

View File

@@ -375,9 +375,14 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
auto ks_x = weights_layout.size.spatial[0];
auto ks_y = weights_layout.size.spatial[1];
size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
if (weights_layout.format.group_num() > 0) {
out_features_per_group = weights_layout.size.batch[0];
}
// Check for non-grouped or depthwise convolution
if (input_layout.size.spatial[2] == 1 &&
input_layout.size.batch[0] < 16 &&
((ks_x == 7 && ks_y == 7) || (ks_x == 3 && ks_y == 3) || (ks_x == 1 && ks_y == 1) || (ks_x == 5 && ks_y == 5)) &&
weights_layout.size.batch[0] >= 16 &&
((conv->groups == 1 && conv->split() == 1) ||
@@ -388,12 +393,17 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
return true;
// Check for grouped convolution
else if (input_layout.size.spatial[2] == 1 && input_layout.size.batch[0] < 16 &&
weights_layout.size.batch[0] >= 16 &&
((input_layout.size.feature[0] / conv->groups) % 4 == 0) &&
((conv->dilation.spatial[0] + 1) * (ks_x - 1)) < 16 &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
return true;
out_features_per_group >= 16 &&
// Need to extend imad fsv4 kernel to handle e.g. 3 input features per group
(in_features_per_group % 4 == 0) &&
((conv->dilation.spatial[0] + 1) * (ks_x - 1)) <= 16 &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
return true;
// Check for fsv16 imad kernel
else if ((input_layout.format.dimension() == 4) &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
(!((conv->groups > 1) && (in_features_per_group == 1) && (out_features_per_group == 1))))
return true;
return false;
}
// A set of rules that define when b_fs_yx_fsv16 mem format can be used for fp16/fp32 case
@@ -459,6 +469,20 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout
(weights_layout.size.batch[0] % 16 == 0 || (weights_layout.size.batch[0] == 8 && conv->groups > 1)) &&
conv->dilation == tensor(1))
return true;
size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
if (weights_layout.format.group_num() > 0) {
out_features_per_group = weights_layout.size.batch[0];
}
// Check for fsv16 imad kernel
if ((input_layout.format.dimension() == 5) &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
(input_layout.data_type == data_types::i8 || input_layout.data_type == data_types::u8) &&
(weights_layout.data_type == data_types::i8 || weights_layout.data_type == data_types::u8) &&
(!((conv->groups > 1) && (in_features_per_group == 1) && (out_features_per_group == 1))))
return true;
return false;
}
@@ -650,6 +674,9 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
} else if ((_optimization_attributes.b_fs_yx_fsv16_network &&
convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
expected_format = cldnn::format::b_fs_yx_fsv16;
} else if ((_optimization_attributes.b_fs_zyx_fsv16_network &&
convolution_b_fs_zyx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
expected_format = cldnn::format::b_fs_zyx_fsv16;
} else {
expected_format = imad_case(node);
}
@@ -807,7 +834,8 @@ format layout_optimizer::get_preferred_format(program_node& node) {
layout{ data_types::f32, format::bfyx, tensor{} }).format;
} else if (node.is_type<quantize>()) {
auto layout = node.get_output_layout();
if ((layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
if (layout.format.spatial_num() == 2 &&
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
layout.size.batch[0] % 16 == 0)
expected = format::b_fs_yx_fsv4;
} else if (node.is_type<reorder>() || node.is_type<input_layout>()) {

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -149,26 +149,32 @@ layout pooling_inst::calc_output_layout(parent::typed_node const& node) {
0,
"Input offset in batch is not supported");
if (input_layout.format == format::bfzyx) {
if (input_layout.format.spatial_num() == 3) {
// 3D
CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
"stride spatial Z",
stride.spatial[1],
"",
0,
"Stride spatial Z must be positive (>= 1)");
"stride spatial Z",
stride.spatial[1],
"",
0,
"Stride spatial Z must be positive (>= 1)");
CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
"window size spatial Z",
window_size.spatial[2],
"",
0,
"Size Z (of pooling window) must be positive (>= 1)");
"window size spatial Z",
window_size.spatial[2],
"",
0,
"Size Z (of pooling window) must be positive (>= 1)");
CLDNN_ERROR_GREATER_THAN(node.id(),
"Input offset spatial Z",
2 * input_offset.spatial[2],
"input layout size spatial Z",
input_layout.size.spatial[2],
"Input offset is greater than input data range. There is no input data to process");
"Input offset spatial Z",
2 * input_offset.spatial[2],
"input layout size spatial Z",
input_layout.size.spatial[2],
"Input offset is greater than input data range. There is no input data to process");
CLDNN_ERROR_GREATER_THAN(node.id(),
"Negate input offset spatial Z",
-input_offset.spatial[2],
"input window size spatial Z",
window_size.spatial[2],
"First pool is outside of image. please reduce input offset Z");
}
if (desc->with_output_size) {

View File

@@ -78,83 +78,169 @@ struct convolution_accumulator<uint8_t> {
};
template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT, typename AccT = typename convolution_accumulator<InputT>::type>
VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
VVVF<OutputT> reference_convolve(VVVVF<InputT> &input, VVVVF<WeightsT> &filter,
int stride_z, int stride_y, int stride_x,
float bias,
int dilation_z = 1, int dilation_y = 1, int dilation_x = 1,
int input_padding_z = 0, int input_padding_y = 0, int input_padding_x = 0,
int output_padding_z = 0, int output_padding_y = 0, int output_padding_x = 0,
size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
{
size_t kernel_extent_y = dilation_y * (filter[0].size() - 1) + 1;
size_t kernel_extent_x = dilation_x * (filter[0][0].size() - 1) + 1;
size_t output_y = 1 + (input[0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
size_t output_x = 1 + (input[0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
size_t kernel_extent_z = dilation_z * (filter[0].size() - 1) + 1;
size_t kernel_extent_y = dilation_y * (filter[0][0].size() - 1) + 1;
size_t kernel_extent_x = dilation_x * (filter[0][0][0].size() - 1) + 1;
size_t output_z = 1 + (input[0].size() - kernel_extent_z + 2 * input_padding_z) / stride_z + 2 * output_padding_z;
size_t output_y = 1 + (input[0][0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
size_t output_x = 1 + (input[0][0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
bool asymm_data = !data_zp.empty();
bool asymm_weights = weights_zp != static_cast<WeightsT>(0);
VVF<OutputT> output(output_y, VF<OutputT>(output_x, 0));
VVVF<OutputT> output(output_z, VVF<OutputT>(output_y, VF<OutputT>(output_x, 0)));
size_t filter_begin = f_begin ? f_begin : 0;
size_t filter_end = f_end ? f_end : filter.size();
for (size_t f = filter_begin; f < filter_end; ++f) {
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
VF<AccT> values;
values.reserve(filter[0].size() * filter[0][0].size());
for (size_t yf = 0; yf < filter[0].size(); ++yf) {
int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
bool yi_inside = yi >= 0 && (int)input[0].size() > yi;
if (!yi_inside) continue;
for (size_t xf = 0; xf < filter[0][0].size(); ++xf) {
int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
bool xi_inside = xi >= 0 && (int)input[0][0].size() > xi;
if (!xi_inside) continue;
for (size_t z = 0; z < (output_z - 2 * output_padding_z); ++z) {
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
VF<AccT> values;
values.reserve(filter[0].size() * filter[0][0].size() * filter[0][0][0].size());
for (size_t zf = 0; zf < filter[0].size(); ++zf) {
int zi = -input_padding_z + (int)zf * dilation_z + stride_z * (int)z;
bool zi_inside = zi >= 0 && (int)input[0].size() > zi;
if (!zi_inside) continue;
for (size_t yf = 0; yf < filter[0][0].size(); ++yf) {
int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
bool yi_inside = yi >= 0 && (int)input[0][0].size() > yi;
if (!yi_inside) continue;
for (size_t xf = 0; xf < filter[0][0][0].size(); ++xf) {
int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
bool xi_inside = xi >= 0 && (int)input[0][0][0].size() > xi;
if (!xi_inside) continue;
auto input_val = static_cast<AccT>(input[f][yi][xi]);
auto input_val = static_cast<AccT>(input[f][zi][yi][xi]);
if (asymm_data) {
input_val = input_val - static_cast<AccT>(data_zp[f]);
if (asymm_data) {
input_val = input_val - static_cast<AccT>(data_zp[f]);
}
AccT weights_val;
if (!depthwise && !grouped) {
weights_val = static_cast<AccT>(filter[f][zf][yf][xf]);
} else if (grouped) {
weights_val = static_cast<AccT>(filter[f - filter_begin][zf][yf][xf]);
}
else {
weights_val = static_cast<AccT>(filter[0][zf][yf][xf]);
}
if (asymm_weights) {
weights_val = weights_val - static_cast<AccT>(weights_zp);
}
//std::cout << std::endl << "f=" << f << ", z=" << z << ", y=" << y << ", x=" << x << ", zf=" << zf << ", yf=" << yf << ", xf=" << xf << ": " << (int)input_val << " * " << (int)weights_val;
values.push_back(input_val * weights_val);
}
}
AccT weights_val;
if (!depthwise && !grouped) {
weights_val = static_cast<AccT>(filter[f][yf][xf]);
} else if (grouped) {
weights_val = static_cast<AccT>(filter[f - filter_begin][yf][xf]);
}
else {
weights_val = static_cast<AccT>(filter[0][yf][xf]);
}
if (asymm_weights) {
weights_val = weights_val - static_cast<AccT>(weights_zp);
}
values.push_back(input_val * weights_val);
}
output[z + output_padding_z][y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(kahan_summation<AccT>(values));
}
output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(kahan_summation<AccT>(values));
}
}
}
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(bias);
for (size_t z = 0; z < (output_z - 2 * output_padding_z); ++z) {
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
output[z + output_padding_z][y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(bias);
}
}
}
return output;
}
template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT, typename AccT = typename convolution_accumulator<InputT>::type>
VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
                        int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
                        int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
                        const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
{
    // 2D compatibility wrapper over the 3D reference convolution:
    // promote input and filter to 3D with a unit z-dimension, run the 3D
    // implementation with neutral z parameters (stride 1, dilation 1,
    // padding 0), then strip the z-dimension from the result.
    VVVVF<InputT> input_3d;
    input_3d.reserve(input.size());
    for (const auto& feature_plane : input) {
        // VVVF(1, plane) wraps the 2D plane as a single z-slice.
        input_3d.emplace_back(1, feature_plane);
    }

    VVVVF<WeightsT> filter_3d;
    filter_3d.reserve(filter.size());
    for (const auto& kernel_plane : filter) {
        filter_3d.emplace_back(1, kernel_plane);
    }

    VVVF<OutputT> output_3d = reference_convolve<InputT, OutputT, WeightsT, AccT>(
        input_3d, filter_3d,
        1, stride_y, stride_x,                  // strides (z neutral)
        bias,
        1, dilation_y, dilation_x,              // dilation (z neutral)
        0, input_padding_y, input_padding_x,    // input padding (no z padding)
        0, output_padding_y, output_padding_x,  // output padding (no z padding)
        f_begin, f_end, depthwise, grouped,
        data_zp, weights_zp);

    // With a unit z-extent and neutral z parameters the output z-size is
    // exactly 1, so the first slice is the full 2D result.
    return output_3d[0];
}
template <typename T>
VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
VVVF<T> reference_scale_post_op(const VVVF<T>& input, const T& scale, const T& shift) {
auto output = input;
auto size_y = input.size();
auto size_x = input[0].size();
for (size_t yi = 0; yi < size_y; ++yi) {
for (size_t xi = 0; xi < size_x; ++xi) {
output[yi][xi] = output[yi][xi] * scale + shift;
auto size_z = input.size();
auto size_y = input[0].size();
auto size_x = input[0][0].size();
for (size_t zi = 0; zi < size_z; ++zi) {
for (size_t yi = 0; yi < size_y; ++yi) {
for (size_t xi = 0; xi < size_x; ++xi) {
output[zi][yi][xi] = output[zi][yi][xi] * scale + shift;
}
}
}
return output;
}
template <typename T>
VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
    // 2D compatibility wrapper: wrap the plane in a singleton z-dimension,
    // delegate to the 3D scale/shift implementation, then unwrap.
    VVVF<T> input_3d(1, input);
    VVVF<T> output_3d = reference_scale_post_op<T>(input_3d, scale, shift);
    // The z-extent is 1 by construction, so the only slice is the result.
    return output_3d[0];
}
void dump_buffer(memory const& mem, std::string const& name)
{
std::ofstream out(name);
@@ -172,14 +258,17 @@ void dump_buffer(memory const& mem, std::string const& name)
for (int f = 0; f < size.feature[0]; ++f)
{
out << "feature " << f << ":\n";
for (int y = 0; y < size.spatial[1]; ++y)
for (int z = 0; z < size.spatial[2]; ++z)
{
for (int x = 0; x < size.spatial[0]; ++x)
for (int y = 0; y < size.spatial[1]; ++y)
{
size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + y * pitches.spatial[1] + x * pitches.spatial[0];
out << ptr[idx] << " ";
for (int x = 0; x < size.spatial[0]; ++x)
{
size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + z * pitches.spatial[2] + y * pitches.spatial[1] + x * pitches.spatial[0];
out << ptr[idx] << " ";
}
out << "\n";
}
out << "\n";
}
out << "\n";
@@ -4798,15 +4887,18 @@ using TestParamType_convolution_depthwise_gpu = ::testing::tuple<int, // 0 - I
bool>; // 6 - With bias
using TestParamType_grouped_convolution_gpu = ::testing::tuple< int, // 0 - Input X size
int, // 1 - Input Y size
int, // 2 - Input features
int, // 3 - Output features
int, // 4 - Kernel sizeX
int, // 5 - Kernel sizeY
int, // 6 - Groups number
int, // 7 - Stride
int, // 8 - Batch
format>; // 9 - Input data format
int, // 1 - Input Y size
int, // 2 - Input Z size
int, // 3 - Input features
int, // 4 - Output features
int, // 5 - Kernel sizeX
int, // 6 - Kernel sizeY
int, // 7 - Kernel sizeZ
int, // 8 - Groups number
int, // 9 - Stride
int, // 10 - Batch
format, // 11 - Input data format
std::string>; // 12 - Implementation name
struct convolution_gpu : public ::testing::TestWithParam<TestParamType_convolution_gpu>
{
@@ -4880,16 +4972,24 @@ struct convolution_grouped_gpu : public ::testing::TestWithParam<TestParamType_g
static std::string PrintToStringParamName(
testing::TestParamInfo<TestParamType_grouped_convolution_gpu> param_info) {
// construct a readable name
return "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
std::to_string(testing::get<1>(param_info.param)) + "y" +
std::to_string(testing::get<2>(param_info.param)) + "f" +
"_output" + std::to_string(testing::get<3>(param_info.param)) + "f" +
"_filter" + std::to_string(testing::get<4>(param_info.param)) + "x" +
std::to_string(testing::get<5>(param_info.param)) + "y" +
"_groups" + std::to_string(testing::get<6>(param_info.param)) +
"_stride" + std::to_string(testing::get<7>(param_info.param)) +
"_batch" + std::to_string(testing::get<8>(param_info.param)) +
"_format" + std::to_string(testing::get<9>(param_info.param));
std::string res = "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
std::to_string(testing::get<1>(param_info.param)) + "y" +
std::to_string(testing::get<2>(param_info.param)) + "z" +
std::to_string(testing::get<3>(param_info.param)) + "f" +
"_output" + std::to_string(testing::get<4>(param_info.param)) + "f" +
"_filter" + std::to_string(testing::get<5>(param_info.param)) + "x" +
std::to_string(testing::get<6>(param_info.param)) + "y" +
std::to_string(testing::get<7>(param_info.param)) + "z" +
"_groups" + std::to_string(testing::get<8>(param_info.param)) +
"_stride" + std::to_string(testing::get<9>(param_info.param)) +
"_batch" + std::to_string(testing::get<10>(param_info.param)) +
"_format" + std::to_string(testing::get<11>(param_info.param));
if (testing::get<12>(param_info.param) != "") {
res += "_impl_" + testing::get<12>(param_info.param);
}
return res;
}
};
@@ -6957,26 +7057,47 @@ INSTANTIATE_TEST_CASE_P(convolution_depthwise_gpu_bfyx,
INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
convolution_grouped_gpu,
::testing::Values(
// Input X size, Input Y size, Input features, Output features, Kernel size X, Kernel size Y,
// Groups number, Stride, Output padding, Batch, Input data format
// Input X size, Input Y size, Input Z size, Input features, Output features,
// Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
// Input data format, Implementation name
// Format: b_fs_yx_fsv4
TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),
// Format: b_fs_yx_fsv16
TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv16)
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""),
// Format: b_fs_zyx_fsv16
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 4, 8, 4, 2, 2, 2, 2, 1, 4, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 8, 16, 16, 4, 4, 4, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(17, 17, 17, 32, 96, 3, 3, 3, 2, 2, 2, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(16, 16, 16, 8, 48, 2, 2, 2, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 3, 48, 96, 2, 2, 2, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(6, 6, 6, 8, 26, 3, 3, 3, 2, 4, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, format::b_fs_zyx_fsv16, "")
),
convolution_grouped_gpu::PrintToStringParamName);
@@ -6985,42 +7106,66 @@ TEST_P(convolution_grouped_gpu, base) {
const int input_x = testing::get<0>(GetParam()),
input_y = testing::get<1>(GetParam()),
input_f = testing::get<2>(GetParam()),
output_f = testing::get<3>(GetParam()),
filter_x = testing::get<4>(GetParam()),
filter_y = testing::get<5>(GetParam()),
groups = testing::get<6>(GetParam()),
stride = testing::get<7>(GetParam()),
batch_num = testing::get<8>(GetParam()),
input_z = testing::get<2>(GetParam()),
input_f = testing::get<3>(GetParam()),
output_f = testing::get<4>(GetParam()),
filter_x = testing::get<5>(GetParam()),
filter_y = testing::get<6>(GetParam()),
filter_z = testing::get<7>(GetParam()),
groups = testing::get<8>(GetParam()),
stride = testing::get<9>(GetParam()),
batch_num = testing::get<10>(GetParam()),
output_padding = 0,
input_offset_y = (filter_x - 1) / 2,
input_offset_x = (filter_y - 1) / 2;
auto input_data_format = testing::get<9>(GetParam());
input_offset_z = (filter_z - 1) / 2,
input_offset_y = (filter_y - 1) / 2,
input_offset_x = (filter_x - 1) / 2;
auto input_data_format = testing::get<11>(GetParam());
auto impl_name = testing::get<12>(GetParam());
auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y));
auto input_rnd = generate_random_4d<uint8_t>(batch_num, input_f, input_y, input_x, 0, 255);
auto input_rnd_vec = flatten_4d<uint8_t>(format::bfyx, input_rnd);
auto input = memory::allocate(engine, {data_types::u8, format::bfyx, input_size});
set_values(input, input_rnd_vec);
auto num_in_spatial_dims = input_data_format.spatial_num();
auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y));
VVVVVF<int8_t> weights_rnd = generate_random_5d<int8_t>(groups, output_f / groups, input_f / groups, filter_y, filter_x, -127, 127);
auto weights_lay = layout(data_types::i8, format::goiyx, weights_size);
auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z));
auto input_rnd = generate_random_5d<uint8_t>(batch_num, input_f, input_z, input_y, input_x, 0, 255);
auto input_lay = layout(data_types::u8, format::bfzyx, input_size);
if (num_in_spatial_dims == 2) {
input_lay = layout(data_types::u8, format::bfyx, input_size);
}
std::vector<uint8_t> input_flat(input_lay.get_linear_size());
for (int b = 0; b < batch_num; b++)
for (int f = 0; f < input_f; f++)
for (int z = 0; z < input_z; z++)
for (int y = 0; y < input_y; y++)
for (int x = 0; x < input_x; x++) {
tensor coords = tensor(batch(b), feature(f), spatial(x, y, z, 0));
size_t offset = input_lay.get_linear_offset(coords);
input_flat[offset] = input_rnd[b][f][z][y][x];
}
auto input = memory::allocate(engine, input_lay);
set_values(input, input_flat);
auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y, filter_z));
VVVVVVF<int8_t> weights_rnd = generate_random_6d<int8_t>(groups, output_f / groups, input_f / groups, filter_z, filter_y, filter_x, -127, 127);
auto weights_lay = layout(data_types::i8, format::goizyx, weights_size);
if (num_in_spatial_dims == 2) {
weights_lay = layout(data_types::i8, format::goiyx, weights_size);
}
std::vector<int8_t> weights_flat(weights_lay.get_linear_size());
for (int gi = 0; gi < groups; ++gi)
for (int ofi = 0; ofi < output_f / groups; ++ofi)
for (int ifi = 0; ifi < input_f / groups; ++ifi)
for (int kyi = 0; kyi < filter_y; ++kyi)
for (int kxi = 0; kxi < filter_x; ++kxi) {
tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, 0, 0));
size_t offset = weights_lay.get_linear_offset(coords);
weights_flat[offset] = weights_rnd[gi][ofi][ifi][kyi][kxi];
}
auto weights = memory::allocate(engine, {data_types::i8, format::goiyx, weights_size});
for (int kzi = 0; kzi < filter_z; ++kzi)
for (int kyi = 0; kyi < filter_y; ++kyi)
for (int kxi = 0; kxi < filter_x; ++kxi) {
tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, kzi, 0));
size_t offset = weights_lay.get_linear_offset(coords);
weights_flat[offset] = weights_rnd[gi][ofi][ifi][kzi][kyi][kxi];
}
auto weights = memory::allocate(engine, weights_lay);
set_values(weights, weights_flat);
VVVVF<float> expected_result(batch_num, VVVF<float>(output_f));
VVVVVF<float> expected_result(batch_num, VVVVF<float>(output_f));
// Calculate reference values without bias
for (int bi = 0; bi < batch_num; ++bi)
@@ -7031,15 +7176,15 @@ TEST_P(convolution_grouped_gpu, base) {
int f_end = gi * input_f / groups + input_f / groups;
expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
input_rnd[bi], weights_rnd[gi][ofi], // input, weights
stride, stride, // strides
0, // bias
1, 1, // dilation
input_offset_y, input_offset_x, // input padding
0, 0, // output_padding
f_begin, f_end, // f_begin, f_end
false, // depthwise
grouped); // grouped
input_rnd[bi], weights_rnd[gi][ofi], // input, weights
stride, stride, stride, // strides
0, // bias
1, 1, 1, // dilation
input_offset_z, input_offset_y, input_offset_x, // input padding
0, 0, 0, // output_padding
f_begin, f_end, // f_begin, f_end
false, // depthwise
grouped); // grouped
}
topology topology(input_layout("input", input.get_layout()),
@@ -7049,14 +7194,14 @@ TEST_P(convolution_grouped_gpu, base) {
"input_fsv",
{"weights"},
groups,
{1, 1, stride, stride},
{0, 0, -input_offset_x, -input_offset_y},
{1, 1, 1, 1},
padding({0, 0, output_padding, output_padding}, 0.f)));
tensor(batch(1), feature(1), spatial(stride, stride, stride, 1)),
tensor(batch(0), feature(0), spatial(-input_offset_x, -input_offset_y, -input_offset_z, 0)),
tensor(batch(1), feature(1), spatial(1, 1, 1, 1)),
padding({0, 0, output_padding, output_padding, output_padding}, 0.f)));
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = {input_data_format, "fused_conv_eltwise_gpu_imad"};
implementation_desc conv_impl = {input_data_format, impl_name};
options.set_option(build_option::force_implementations({{"conv", conv_impl}}));
network network(engine, topology, options);
@@ -7070,24 +7215,26 @@ TEST_P(convolution_grouped_gpu, base) {
ASSERT_EQ(out_mem.get_layout().format, input_data_format);
ASSERT_EQ(out_lay.size.batch[0], expected_result.size());
ASSERT_EQ(out_lay.size.feature[0], expected_result[0].size());
ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0].size());
ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0].size());
ASSERT_EQ(out_lay.size.spatial[2], expected_result[0][0].size());
ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0][0].size());
ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0][0].size());
for (int bi = 0; bi < batch_num; ++bi)
for (int ofi = 0; ofi < output_f; ++ofi)
for (int yi = 0; yi < (int)expected_result[0][0].size(); ++yi)
for (int xi = 0; xi < (int)expected_result[0][0][0].size(); ++xi) {
tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, 0, 0));
auto offset = out_lay.get_linear_offset(coords);
auto val = out_ptr[offset];
auto val_ref = expected_result[bi][ofi][yi][xi];
auto equal = are_equal(val_ref, val, 1e-2f);
if (!equal) {
std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
for (int zi = 0; zi < (int)expected_result[0][0].size(); ++zi)
for (int yi = 0; yi < (int)expected_result[0][0][0].size(); ++yi)
for (int xi = 0; xi < (int)expected_result[0][0][0][0].size(); ++xi) {
tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, zi, 0));
auto offset = out_lay.get_linear_offset(coords);
auto val = out_ptr[offset];
auto val_ref = expected_result[bi][ofi][zi][yi][xi];
auto equal = are_equal(val_ref, val, 1e-2f);
if (!equal) {
std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", z: " << zi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", z: " << zi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
}
EXPECT_TRUE(equal);
}
EXPECT_TRUE(equal);
}
}
template <typename InputT, typename WeightsT, typename OutputT>

View File

@@ -86,7 +86,7 @@ struct pooling_accumulator<InputT, pooling_mode::max> {
_acc = max(_acc, val);
}
output_t get(size_t /*pool_x*/, size_t /*pool_y*/) {
output_t get(size_t /*pool_x*/, size_t /*pool_y*/, size_t /*pool_z*/) {
return static_cast<output_t>(_acc);
}
@@ -105,7 +105,7 @@ struct pooling_accumulator<InputT, pooling_mode::average_no_padding> {
_acc += static_cast<output_t>(val);
}
output_t get(size_t /*pool_x*/, size_t /*pool_y*/) {
output_t get(size_t /*pool_x*/, size_t /*pool_y*/, size_t /*pool_z*/) {
return _acc / _cnt;
}
@@ -128,8 +128,8 @@ struct pooling_accumulator<InputT, pooling_mode::average> {
_acc += static_cast<output_t>(val);
}
output_t get(size_t pool_x, size_t pool_y) {
return static_cast<output_t>(_acc / static_cast<InputT>(pool_x * pool_y));
output_t get(size_t pool_x, size_t pool_y, size_t pool_z) {
return static_cast<output_t>(_acc / static_cast<InputT>(pool_x * pool_y * pool_z));
}
void reset() {
@@ -140,46 +140,59 @@ struct pooling_accumulator<InputT, pooling_mode::average> {
};
template <typename InputT, pooling_mode Mode>
VVF<typename pooling_mode_output<InputT, Mode>::type> reference_pooling(const VVF<InputT>& input, size_t pool_x, size_t pool_y, int stride_x, int stride_y, int offset_x, int offset_y) {
VVVF<typename pooling_mode_output<InputT, Mode>::type> reference_pooling(const VVVF<InputT>& input, size_t pool_x, size_t pool_y, size_t pool_z, int stride_x, int stride_y, int stride_z, int offset_x, int offset_y, int offset_z) {
using output_t = typename pooling_mode_output<InputT, Mode>::type;
VVF<output_t> result;
auto size_x = input[0].size();
auto size_y = input.size();
VVVF<output_t> result;
auto size_x = input[0][0].size();
auto size_y = input[0].size();
auto size_z = input.size();
auto accumulator = pooling_accumulator<InputT, Mode>();
for (int yi = offset_y; yi + static_cast<int>(pool_y) <= static_cast<int>(size_y) - offset_y; yi += stride_y) {
VF<output_t> result_row;
for (int xi = offset_x; xi + static_cast<int>(pool_x) <= static_cast<int>(size_x) - offset_x; xi += stride_x) {
accumulator.reset();
for (int fyi = 0; fyi < static_cast<int>(pool_y); ++fyi) {
int index_y = yi + fyi;
if (index_y < 0 || index_y >= static_cast<int>(size_y))
continue;
for (int fxi = 0; fxi < static_cast<int>(pool_x); ++fxi) {
int index_x = xi + fxi;
if (index_x < 0 || index_x >= static_cast<int>(size_x))
for (int zi = offset_z; zi + static_cast<int>(pool_z) <= static_cast<int>(size_z) - offset_z; zi += stride_z) {
VVF<output_t> result_matrix;
for (int yi = offset_y; yi + static_cast<int>(pool_y) <= static_cast<int>(size_y) - offset_y; yi += stride_y) {
VF<output_t> result_row;
for (int xi = offset_x; xi + static_cast<int>(pool_x) <= static_cast<int>(size_x) - offset_x; xi += stride_x) {
accumulator.reset();
for (int fzi = 0; fzi < static_cast<int>(pool_z); ++fzi) {
int index_z = zi + fzi;
if (index_z < 0 || index_z >= static_cast<int>(size_z))
continue;
for (int fyi = 0; fyi < static_cast<int>(pool_y); ++fyi) {
int index_y = yi + fyi;
if (index_y < 0 || index_y >= static_cast<int>(size_y))
continue;
for (int fxi = 0; fxi < static_cast<int>(pool_x); ++fxi) {
int index_x = xi + fxi;
if (index_x < 0 || index_x >= static_cast<int>(size_x))
continue;
auto input_val = input[static_cast<size_t>(index_y)][static_cast<size_t>(index_x)];
accumulator.accumulate(input_val);
auto input_val = input[static_cast<size_t>(index_z)][static_cast<size_t>(index_y)][static_cast<size_t>(index_x)];
accumulator.accumulate(input_val);
}
}
}
result_row.push_back(accumulator.get(pool_x, pool_y, pool_z));
}
result_row.push_back(accumulator.get(pool_x, pool_y));
result_matrix.emplace_back(std::move(result_row));
}
result.emplace_back(std::move(result_row));
result.emplace_back(std::move(result_matrix));
}
return result;
}
template <typename T>
VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
VVVF<T> reference_scale_post_op(const VVVF<T>& input, const T& scale, const T& shift) {
auto output = input;
auto size_y = input.size();
auto size_x = input[0].size();
for (size_t yi = 0; yi < size_y; ++yi) {
for (size_t xi = 0; xi < size_x; ++xi) {
output[yi][xi] = output[yi][xi] * scale + shift;
auto size_z = input.size();
auto size_y = input[0].size();
auto size_x = input[0][0].size();
for (size_t zi = 0; zi < size_z; ++zi) {
for (size_t yi = 0; yi < size_y; ++yi) {
for (size_t xi = 0; xi < size_x; ++xi) {
output[zi][yi][xi] = output[zi][yi][xi] * scale + shift;
}
}
}
return output;
@@ -2355,7 +2368,7 @@ public:
using output_t = typename pooling_mode_output<InputT, Mode>::type;
virtual topology build_topology(const engine& /*eng*/) {
auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y()));
auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y(), input_z()));
auto input_lay = layout(input_type(),
input_format(),
input_size);
@@ -2365,9 +2378,9 @@ public:
pooling("pool",
"input",
pool_mode(),
tensor(batch(0), feature(0), spatial(pool_x(), pool_y())),
tensor(batch(0), feature(0), spatial(stride_x(), stride_y())),
tensor(batch(0), feature(0), spatial(offset_x(), offset_y())))
tensor(batch(0), feature(0), spatial(pool_x(), pool_y(), pool_z())),
tensor(batch(0), feature(0), spatial(stride_x(), stride_y(), stride_z())),
tensor(batch(0), feature(0), spatial(offset_x(), offset_y(), offset_z())))
);
return topo;
}
@@ -2376,7 +2389,8 @@ public:
return "pool";
}
virtual void run_expect(const VVVVF<output_t>& expected) {
virtual void run_expect(const VVVVVF<output_t>& expected) {
auto eng = get_test_engine();
auto topo = build_topology(eng);
auto opts = build_options(
@@ -2384,7 +2398,7 @@ public:
);
auto net = network(eng, topo, opts);
auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y()));
auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y(), input_z()));
auto input_lay = layout(input_type(),
input_format(),
input_size);
@@ -2392,12 +2406,13 @@ public:
std::vector<InputT> input_flat(input_lay.get_linear_size(), static_cast<InputT>(0));
for (size_t bi = 0; bi < batch_num(); ++bi)
for (size_t fi = 0; fi < input_features(); ++fi)
for (size_t yi = 0; yi < input_y(); ++yi)
for (size_t xi = 0; xi < input_x(); ++xi) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
size_t offset = input_lay.get_linear_offset(coords);
input_flat[offset] = _input[bi][fi][yi][xi];
}
for (size_t zi = 0; zi < input_z(); ++zi)
for (size_t yi = 0; yi < input_y(); ++yi)
for (size_t xi = 0; xi < input_x(); ++xi) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, 0));
size_t offset = input_lay.get_linear_offset(coords);
input_flat[offset] = _input[bi][fi][zi][yi][xi];
}
set_values(input_mem, input_flat);
net.set_input_data("input", input_mem);
@@ -2418,35 +2433,37 @@ public:
ASSERT_EQ(out_lay.data_type, output_type());
ASSERT_EQ(out_lay.size.batch[0], expected.size());
ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size());
ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size());
ASSERT_EQ(out_lay.size.spatial[2], expected[0][0].size());
ASSERT_EQ(out_lay.size.spatial[1], expected[0][0][0].size());
ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0][0].size());
bool compare_with_tolerance = input_type() == data_types::f16;
for (size_t bi = 0; bi < batch_num(); ++bi)
for (size_t fi = 0; fi < expected[0].size(); ++fi)
for (size_t yi = 0; yi < expected[0][0].size(); ++yi)
for (size_t xi = 0; xi < expected[0][0][0].size(); ++xi) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
size_t offset = out_lay.get_linear_offset(coords);
auto ref_val = static_cast<float>(expected[bi][fi][yi][xi]);
auto actual_val = static_cast<float>(out_ptr[offset]);
if (compare_with_tolerance) {
auto tolerance = 1;
ASSERT_NEAR(ref_val, actual_val, tolerance)
<< "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
} else {
EXPECT_TRUE(are_equal(ref_val, actual_val))
<< "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
for (size_t zi = 0; zi < expected[0][0].size(); ++zi)
for (size_t yi = 0; yi < expected[0][0][0].size(); ++yi)
for (size_t xi = 0; xi < expected[0][0][0][0].size(); ++xi) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, 0));
size_t offset = out_lay.get_linear_offset(coords);
auto ref_val = static_cast<float>(expected[bi][fi][zi][yi][xi]);
auto actual_val = static_cast<float>(out_ptr[offset]);
if (compare_with_tolerance) {
auto tolerance = 1;
ASSERT_NEAR(ref_val, actual_val, tolerance)
<< "at b= " << bi << ", f= " << fi << ", z= " << zi << ", y= " << yi << ", x= " << xi;
} else {
EXPECT_TRUE(are_equal(ref_val, actual_val))
<< "at b= " << bi << ", f= " << fi << ", z= " << zi << ", y= " << yi << ", x= " << xi;
}
}
}
}
size_t batch_num() { return _input.size(); }
size_t input_features() { return _input[0].size(); }
size_t input_x() { return _input[0][0][0].size(); }
size_t input_y() { return _input[0][0].size(); }
size_t input_x() { return _input[0][0][0][0].size(); }
size_t input_y() { return _input[0][0][0].size(); }
size_t input_z() { return _input[0][0].size(); }
format::type input_format() { return _input_fmt; }
data_types input_type() {
@@ -2460,46 +2477,52 @@ public:
pooling_mode pool_mode() { return Mode; }
size_t pool_x() { return _pool_x; }
size_t pool_y() { return _pool_y; }
size_t pool_z() { return _pool_z; }
int stride_x() { return _stride_x; }
int stride_y() { return _stride_y; }
int stride_z() { return _stride_z; }
int offset_x() { return _offset_x; }
int offset_y() { return _offset_y; }
int offset_z() { return _offset_z; }
void set_input(format::type input_fmt, VVVVF<InputT> input_data) {
void set_input(format::type input_fmt, VVVVVF<InputT> input_data) {
_input_fmt = input_fmt;
_input = std::move(input_data);
}
void set_pool_size(size_t x, size_t y) {
void set_pool_size(size_t x, size_t y, size_t z) {
_pool_x = x;
_pool_y = y;
_pool_z = z;
}
void set_strides(int x, int y) {
void set_strides(int x, int y, int z) {
_stride_x = x;
_stride_y = y;
_stride_z = z;
}
void set_offsets(int x, int y) {
void set_offsets(int x, int y, int z) {
_offset_x = x;
_offset_y = y;
_offset_z = z;
}
VVVVF<InputT> _input;
VVVVVF<InputT> _input;
format::type _input_fmt;
size_t _pool_x, _pool_y;
int _stride_x, _stride_y;
int _offset_x, _offset_y;
size_t _pool_x, _pool_y, _pool_z;
int _stride_x, _stride_y, _stride_z;
int _offset_x, _offset_y, _offset_z;
};
using pooling_random_test_params = std::tuple<
size_t, // batch
size_t, // features
std::tuple<size_t, size_t>, // input x, y
std::tuple<size_t, size_t>, // pool x, y
std::tuple<int, int>, // stride x, y
std::tuple<int, int>, // offset x, y
format::type // input format
size_t, // batch
size_t, // features
std::tuple<size_t, size_t, size_t>, // input x, y, z
std::tuple<size_t, size_t, size_t>, // pool x, y, z
std::tuple<int, int, int>, // stride x, y, z
std::tuple<int, int, int>, // offset x, y, z
format::type // input format
>;
template <typename InputT, pooling_mode Mode>
@@ -2508,44 +2531,47 @@ public:
using parent = pooling_test_base<InputT, Mode>;
using output_t = typename parent::output_t;
virtual VVVVF<output_t> calculate_reference() {
VVVVF<output_t> reference(this->batch_num(), VVVF<output_t>(this->input_features()));
virtual VVVVVF<output_t> calculate_reference() {
VVVVVF<output_t> reference(this->batch_num(), VVVVF<output_t>(this->input_features()));
for (size_t bi = 0; bi < this->batch_num(); ++bi) {
for (size_t fi = 0; fi < this->input_features(); ++fi) {
reference[bi][fi] = reference_pooling<InputT, Mode>(
this->_input[bi][fi],
this->pool_x(),
this->pool_y(),
this->pool_z(),
this->stride_x(),
this->stride_y(),
this->stride_z(),
this->offset_x(),
this->offset_y());
this->offset_y(),
this->offset_z());
}
}
return reference;
}
virtual void param_set_up(const pooling_random_test_params& params) {
size_t b, f, in_x, in_y, p_x, p_y;
int s_x, s_y, o_x, o_y;
size_t b, f, in_x, in_y, in_z, p_x, p_y, p_z;
int s_x, s_y, s_z, o_x, o_y, o_z;
format::type in_fmt;
std::forward_as_tuple(
b,
f,
std::forward_as_tuple(in_x, in_y),
std::forward_as_tuple(p_x, p_y),
std::forward_as_tuple(s_x, s_y),
std::forward_as_tuple(o_x, o_y),
std::forward_as_tuple(in_x, in_y, in_z),
std::forward_as_tuple(p_x, p_y, p_z),
std::forward_as_tuple(s_x, s_y, s_z),
std::forward_as_tuple(o_x, o_y, o_z),
in_fmt
) = params;
auto input_data = generate_random_4d<InputT>(b, f, in_y, in_x, -256, 256);
auto input_data = generate_random_5d<InputT>(b, f, in_z, in_y, in_x, -256, 256);
this->set_input(in_fmt, std::move(input_data));
this->set_pool_size(p_x, p_y);
this->set_strides(s_x, s_y);
this->set_offsets(o_x, o_y);
this->set_pool_size(p_x, p_y, p_z);
this->set_strides(s_x, s_y, s_z);
this->set_offsets(o_x, o_y, o_z);
}
void run_random(const pooling_random_test_params& params) {
@@ -2583,14 +2609,14 @@ TEST_P(pooling_random_test, avg_u8) {
}
INSTANTIATE_TEST_CASE_P(
smoke_low_precision,
smoke_low_precision_2d_spatial,
pooling_random_test,
testing::Combine(testing::Values(1, 2),
testing::Values(3, 8, 64),
testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
testing::Values(std::tuple<int, int>(2, 2)),
testing::Values(std::tuple<int, int>(0, 0)),
testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 1), std::tuple<size_t, size_t, size_t>(24, 24, 1)),
testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 1), std::tuple<size_t, size_t, size_t>(2, 2, 1)),
testing::Values(std::tuple<int, int, int>(2, 2, 1)),
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::yxfb,
format::bfyx,
format::byxf_af32,
@@ -2599,16 +2625,29 @@ INSTANTIATE_TEST_CASE_P(
format::b_fs_yx_fsv32)),
testing::internal::DefaultParamName<pooling_random_test_params>);
INSTANTIATE_TEST_CASE_P(
smoke_low_precision_3d_spatial,
pooling_random_test,
testing::Combine(testing::Values(1, 2),
testing::Values(3, 8, 64),
testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 12), std::tuple<size_t, size_t, size_t>(24, 24, 24)),
testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 4), std::tuple<size_t, size_t, size_t>(2, 2, 2)),
testing::Values(std::tuple<int, int, int>(2, 2, 2)),
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::bfzyx,
format::b_fs_zyx_fsv16)),
testing::internal::DefaultParamName<pooling_random_test_params>);
INSTANTIATE_TEST_CASE_P(
batched_low_precision,
pooling_random_test,
testing::Combine(
testing::Values(16),
testing::Values(16, 32),
testing::Values(std::tuple<size_t, size_t>(3, 3), std::tuple<size_t, size_t>(8, 8)),
testing::Values(std::tuple<size_t, size_t>(1, 1), std::tuple<size_t, size_t>(3, 3)),
testing::Values(std::tuple<int, int>(1, 1)),
testing::Values(std::tuple<int, int>(0, 0)),
testing::Values(std::tuple<size_t, size_t, size_t>(3, 3, 1), std::tuple<size_t, size_t, size_t>(8, 8, 1)),
testing::Values(std::tuple<size_t, size_t, size_t>(1, 1, 1), std::tuple<size_t, size_t, size_t>(3, 3, 1)),
testing::Values(std::tuple<int, int, int>(1, 1, 1)),
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::bs_fs_yx_bsv16_fsv16)
),
testing::internal::DefaultParamName<pooling_random_test_params>);
@@ -2622,7 +2661,7 @@ public:
topology build_topology(const engine& eng) override {
topology topo = parent::build_topology(eng);
auto scale_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1)));
auto scale_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1, 1, 1)));
auto scale_mem = memory::allocate(eng, scale_lay);
auto shift_mem = memory::allocate(eng, scale_lay);
set_values(scale_mem, _scale);
@@ -2640,7 +2679,7 @@ public:
return "scale_wa_out";
}
VVVVF<output_t> calculate_reference() override {
VVVVVF<output_t> calculate_reference() override {
auto expected = parent::calculate_reference();
for (size_t bi = 0; bi < this->batch_num(); ++bi)
@@ -2688,10 +2727,10 @@ INSTANTIATE_TEST_CASE_P(
pooling_random_test_fp16_fp32,
testing::Combine(testing::Values(1, 2),
testing::Values(3, 8),
testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
testing::Values(std::tuple<int, int>(2, 2)),
testing::Values(std::tuple<int, int>(0, 0)),
testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 1), std::tuple<size_t, size_t, size_t>(24, 24, 1)),
testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 1), std::tuple<size_t, size_t, size_t>(2, 2, 1)),
testing::Values(std::tuple<int, int, int>(2, 2, 1)),
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::yxfb,
format::bfyx,
format::byxf,