[IE CLDNN] Add 3d spatials support to conv & pool imad kernels and unblock any in/out feature sizes (#1693)
This commit is contained in:
committed by
GitHub
parent
94bfff2011
commit
1880958f8b
12
inference-engine/thirdparty/clDNN/api/tensor.hpp
vendored
12
inference-engine/thirdparty/clDNN/api/tensor.hpp
vendored
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -170,6 +170,7 @@ struct format {
|
||||
os_is_y_x8_osv8_isv4, ///< format for weights for 1x1 MMAD convolutions
|
||||
os_is_y_x8_osv8_isv4_swizzled_by_4, ///< format for weights for 1x1 MMAD convolutions
|
||||
os_is_yx_osv16_isv4, ///< format for weights for IMAD convolutions
|
||||
os_is_zyx_osv16_isv16, ///< format for weights for IMAD convolutions
|
||||
os_is_yx_osv32_isv4_swizzled_by_2, ///< format for weights for IMAD convolutions
|
||||
os_is_yx_osv32_isv4, ///< format for weights for IMAD convolutions
|
||||
os_is_yx_osv32_isv32p, ///< format for weights for binary convolutions
|
||||
@@ -188,6 +189,7 @@ struct format {
|
||||
gs_oiyx_gsv32, ///< format used for weights for 2D convolution
|
||||
g_is_os_zyx_osv16_isv16, ///< format used for grouped weights for blocked 3D deconvolution
|
||||
g_os_is_yx_osv16_isv4,
|
||||
g_os_is_zyx_osv16_isv16,
|
||||
g_is_os_yx_osv16_isv16,
|
||||
g_os_is_zyx_isv8_osv16_isv2,
|
||||
g_os_is_yx_isv8_osv16_isv2,
|
||||
@@ -237,7 +239,7 @@ struct format {
|
||||
{ b_fs_yx_32fp, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
|
||||
{ b_fs_zyx_fsv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{1, 16}}}},
|
||||
{ bs_fs_zyx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{0, 16 }, {1, 16}}}},
|
||||
{ bs_fs_yx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfyx", "bfxy?", {{0, 16 }, {1, 16}}}},
|
||||
{ bs_fs_yx_bsv16_fsv16, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{0, 16 }, {1, 16}}}},
|
||||
{ nv12, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
|
||||
{ image_2d_rgba, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {}}},
|
||||
|
||||
@@ -265,7 +267,8 @@ struct format {
|
||||
{ is_o32_yx_isv32_swizzled_by_4, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
|
||||
{ os_is_y_x8_osv8_isv4, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
|
||||
{ os_is_y_x8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
|
||||
{ os_is_yx_osv16_isv4, { 1, 1, 2, 0, 0, "bfxy", "bfxy?", {{0, 16}, {1, 4}}}},
|
||||
{ os_is_yx_osv16_isv4, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{0, 16}, {1, 4}}}},
|
||||
{ os_is_zyx_osv16_isv16, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {{0, 16}, {1, 16}}}},
|
||||
{ os_is_yx_osv32_isv4_swizzled_by_2, { 1, 1, 2, 0, 0, "bfxy", "bfxy?", {{0, 32}, {1, 4}}}},
|
||||
{ os_is_yx_osv32_isv4, { 1, 1, 2, 0, 0, "bfxy", "bfxy?", {{0, 32}, {1, 4}}}},
|
||||
{ os_is_yx_osv32_isv32p, { 1, 1, 1, 0, 0, "bfxy", "bfxy?", {}}},
|
||||
@@ -291,7 +294,8 @@ struct format {
|
||||
{ g_os_is_zyx_isv8_osv16_isv2, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{1, 8}, {0, 16}, {1, 2}}}},
|
||||
{ g_os_is_yx_isv8_osv16_isv2, { 1, 1, 2, 0, 1, "gbfyx", "bfxy????g", {{1, 8}, {0, 16}, {1, 2}}}},
|
||||
{ g_os_is_zyx_isv16_osv16, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{0, 16}, {1, 16}}}},
|
||||
{ g_os_is_yx_osv16_isv4, { 1, 1, 2, 0, 1, "gbfxy", "bfxy????g", {{0, 16}, {1, 4}}}},
|
||||
{ g_os_is_yx_osv16_isv4, { 1, 1, 2, 0, 1, "gbfyx", "bfxy????g", {{0, 16}, {1, 4}}}},
|
||||
{ g_os_is_zyx_osv16_isv16, { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g", {{0, 16}, {1, 16}}}},
|
||||
{ g_os_zyx_is_osv16_isv4, { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g", {{0, 16}, {1, 4}}}},
|
||||
{ g_os_zyx_is_osv16_isv16, { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g", {{0, 16}, {1, 16}}}},
|
||||
{ g_os_zyx_is_osv16_isv32, { 1, 1, 3, 0, 1, "gbzyxi", "bfxyz???g", {{0, 16}, {1, 32}}}},
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -73,6 +73,8 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
|
||||
{ WeightsLayout::os_i_osv16__ai8, { -1, -1, -1, 0, 1, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_i_osv16, { -1, -1, -1, 0, 1, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_yx_osv16_isv16, { 0, 1, -1, 2, 3, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_zyx_osv16_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
|
||||
{ WeightsLayout::g_os_is_zyx_osv16_isv16, { 0, 1, 2, 3, 4, -1, -1, 5 } },
|
||||
{ WeightsLayout::os_is_zyx_osv32_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_zyx_osv64_isv16, { 0, 1, 2, 3, 4, -1, -1, -1 } },
|
||||
{ WeightsLayout::i_yxs_os_yxsv2_osv16, { 1, 2, -1, 3, 0, -1, -1, -1 } },
|
||||
@@ -425,7 +427,7 @@ DataTensor DataTensor::FlattenEverything() const {
|
||||
NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l) {
|
||||
std::vector<size_t> newDims = d;
|
||||
|
||||
// TOOD: it's not the right pitches. it's here in order to calculate physical size
|
||||
// TODO: It's not the right pitches. it's here in order to calculate physical size
|
||||
switch (l) {
|
||||
case os_iyx_osv16:
|
||||
case os_iyx_osv16_rotate_180:
|
||||
@@ -635,6 +637,16 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
|
||||
newDims[2] = RoundUp(newDims[2], 16);
|
||||
newDims[3] = RoundUp(newDims[3], 16);
|
||||
break;
|
||||
case os_is_zyx_osv16_isv16:
|
||||
assert(newDims.size() == 5);
|
||||
newDims[3] = RoundUp(newDims[3], 16);
|
||||
newDims[4] = RoundUp(newDims[4], 16);
|
||||
break;
|
||||
case g_os_is_zyx_osv16_isv16:
|
||||
assert(newDims.size() == 6);
|
||||
newDims[3] = RoundUp(newDims[3], 16);
|
||||
newDims[4] = RoundUp(newDims[4], 16);
|
||||
break;
|
||||
case os_is_zyx_osv32_isv16:
|
||||
newDims[3] = RoundUp(newDims[3], 16);
|
||||
newDims[4] = RoundUp(newDims[4], 32);
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -92,6 +92,7 @@ enum WeightsLayout {
|
||||
os_i_osv16__ai8,
|
||||
os_i_osv16,
|
||||
os_is_yx_osv16_isv16, // wieghts for int8 blocked conv
|
||||
os_is_zyx_osv16_isv16,
|
||||
os_is_zyx_osv32_isv16,
|
||||
os_is_zyx_osv64_isv16,
|
||||
i_yxs_os_yxsv2_osv16,
|
||||
@@ -142,6 +143,7 @@ enum WeightsLayout {
|
||||
g_os_is_zyx_isv8_osv16_isv2,
|
||||
g_os_is_yx_isv8_osv16_isv2,
|
||||
g_os_is_zyx_isv16_osv16,
|
||||
g_os_is_zyx_osv16_isv16,
|
||||
giy_xs_os_xsv2_osv16__ao32,
|
||||
giy_xs_os_xsv2_osv8__ao32,
|
||||
g_os_is_yx_isv16_osv16,
|
||||
|
||||
@@ -24,11 +24,13 @@ ParamsKey ConcatenationKernel_simple_Ref::GetSupportedKey() const {
|
||||
k.EnableInputDataType(Datatype::F16);
|
||||
k.EnableInputDataType(Datatype::F32);
|
||||
k.EnableInputDataType(Datatype::INT8);
|
||||
k.EnableInputDataType(Datatype::UINT8);
|
||||
k.EnableInputDataType(Datatype::INT32);
|
||||
k.EnableInputDataType(Datatype::INT64);
|
||||
k.EnableOutputDataType(Datatype::F16);
|
||||
k.EnableOutputDataType(Datatype::F32);
|
||||
k.EnableOutputDataType(Datatype::INT8);
|
||||
k.EnableOutputDataType(Datatype::UINT8);
|
||||
k.EnableOutputDataType(Datatype::INT32);
|
||||
k.EnableOutputDataType(Datatype::INT64);
|
||||
k.EnableInputLayout(DataLayout::bfyx);
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
|
||||
#include "convolution_kernel_b_fs_zyx_fsv16_imad.h"
|
||||
#include "kernel_selector_utils.h"
|
||||
#include "common_tools.h"
|
||||
#include <vector>
|
||||
@@ -63,8 +63,8 @@ static size_t getOutBlock_X(const size_t output_size_x, const size_t stride_x, c
|
||||
|
||||
namespace kernel_selector {
|
||||
|
||||
Convolution_kernel_b_fs_yx_fsv16_imad::BlockParams
|
||||
Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
|
||||
Convolution_kernel_b_fs_zyx_fsv16_imad::BlockParams
|
||||
Convolution_kernel_b_fs_zyx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
|
||||
constexpr float max_reg_pressure = 0.75f;
|
||||
|
||||
// TODO Investigate whether below algorithm for selecting optimal block params could be reduced to:
|
||||
@@ -85,9 +85,9 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
|
||||
size_t block_features = simd;
|
||||
{
|
||||
size_t tmp_block_features = simd * 2;
|
||||
auto block2_params = BlockParams{ block_width, 1, tmp_block_features, in_block_width, 1, 1 };
|
||||
auto block2_params = BlockParams{ block_width, 1, 1, tmp_block_features, in_block_width, 1, 1, 1 };
|
||||
|
||||
bool c_mul_f = params.output.Feature().v % tmp_block_features == 0;
|
||||
bool c_mul_f = params.weights.OFM().v % tmp_block_features == 0;
|
||||
bool c_reg_pressure = EstimateRegPressure(params, block2_params) <= max_reg_pressure;
|
||||
|
||||
if (c_mul_f && c_reg_pressure) {
|
||||
@@ -97,7 +97,9 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
|
||||
|
||||
// If not enough occupancy try to perform feature split or/and block reduction
|
||||
size_t feature_slm_split = 1;
|
||||
auto no_split_params = BlockParams{ block_width, 1, block_features, in_block_width, 1, 1 };
|
||||
|
||||
auto no_split_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 1 };
|
||||
|
||||
if (EstimateOccupancy(params, no_split_params) < 1.f) {
|
||||
// Temporary variables for possible reductions in block sizes
|
||||
bool update_block_params = false;
|
||||
@@ -107,7 +109,8 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
|
||||
|
||||
// Feature split requires extra registers, so check if it can be done with current block sizes
|
||||
bool can_split =
|
||||
EstimateRegPressure(params, BlockParams{ block_width, 1, block_features, in_block_width, 1, 2 }) <= max_reg_pressure;
|
||||
EstimateRegPressure(params, BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 2 }) <= max_reg_pressure;
|
||||
|
||||
// Has the occupancy reached sufficient level
|
||||
bool enough_occupancy = false;
|
||||
// Reductions to reduce register pressure
|
||||
@@ -116,7 +119,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
|
||||
// At most twice reduction in output block width is acceptable
|
||||
for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
|
||||
size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
|
||||
auto dummy_split_params = BlockParams{ w, 1, block_features, tmp_in_width, 1, 2 };
|
||||
auto dummy_split_params = BlockParams{ w, 1, 1, block_features, tmp_in_width, 1, 1, 2 };
|
||||
|
||||
bool c_reg_pressure = EstimateRegPressure(params, dummy_split_params) <= max_reg_pressure;
|
||||
bool c_mul_x = params.output.X().v % w == 0;
|
||||
@@ -139,7 +142,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
|
||||
}
|
||||
// Check if previous reductions haven't improved occupancy enough
|
||||
{
|
||||
auto reduced_params = BlockParams{ split_block_width, 1, split_block_features, split_in_block_width, 1, 1 };
|
||||
auto reduced_params = BlockParams{ split_block_width, 1, 1, split_block_features, split_in_block_width, 1, 1, 1 };
|
||||
enough_occupancy = EstimateOccupancy(params, reduced_params) >= 1.f;
|
||||
update_block_params = enough_occupancy;
|
||||
}
|
||||
@@ -147,7 +150,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
|
||||
if (can_split && !enough_occupancy) {
|
||||
// TODO Try other split sizes
|
||||
for (size_t split = 4; split < 5; ++split) {
|
||||
auto tmp_params = BlockParams{ block_width, 1, block_features, in_block_width, 1, split };
|
||||
auto tmp_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, split };
|
||||
|
||||
bool c_ifm_mul = CeilDiv(params.weights.IFM().v, fsv) % split == 0;
|
||||
bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
|
||||
@@ -172,7 +175,7 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
|
||||
// At most twice reduction in output block width is acceptable
|
||||
for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
|
||||
size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
|
||||
auto tmp_params = BlockParams{ w, 1, split_block_features, tmp_in_width, 1, feature_slm_split };
|
||||
auto tmp_params = BlockParams{ w, 1, 1, split_block_features, tmp_in_width, 1, 1, feature_slm_split };
|
||||
|
||||
bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
|
||||
bool c_mul_x = params.output.X().v % w == 0;
|
||||
@@ -194,44 +197,60 @@ Convolution_kernel_b_fs_yx_fsv16_imad::GetBlockParams(const convolution_params&
|
||||
}
|
||||
}
|
||||
|
||||
// Select biggest block height that fits into registers
|
||||
// Select biggest block height and depth that fits into registers
|
||||
size_t block_height = 1;
|
||||
size_t block_depth = 1;
|
||||
size_t in_block_height = 1;
|
||||
for (size_t h = 2; h < 16; ++h) {
|
||||
if (params.output.Y().v % h != 0)
|
||||
size_t in_block_depth = 1;
|
||||
|
||||
bool break_external_loop = false;
|
||||
|
||||
for (size_t d = 1; d < 16; ++d) {
|
||||
if (params.output.Z().v % d != 0)
|
||||
continue;
|
||||
for (size_t h = 2; h < 16; ++h) {
|
||||
if (params.output.Y().v % h != 0)
|
||||
continue;
|
||||
size_t tmp_in_block_depth = (d - 1) * params.stride.z + (params.filterSize.z - 1) * params.dilation.z + 1;
|
||||
size_t tmp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
|
||||
auto tmp_params = BlockParams{ block_width, h, d, block_features, in_block_width, tmp_in_block_height, tmp_in_block_depth, feature_slm_split };
|
||||
|
||||
size_t tmp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
|
||||
auto tmp_params = BlockParams{ block_width, h, block_features, in_block_width, tmp_in_block_height, feature_slm_split };
|
||||
bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
|
||||
bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
|
||||
bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
|
||||
|
||||
bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
|
||||
bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
|
||||
bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
|
||||
if (c_reg_pressure && c_occupancy && c_slm) {
|
||||
block_height = h;
|
||||
block_depth = d;
|
||||
in_block_height = tmp_in_block_height;
|
||||
in_block_depth = tmp_in_block_depth;
|
||||
} else {
|
||||
break_external_loop = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (c_reg_pressure && c_occupancy && c_slm) {
|
||||
block_height = h;
|
||||
in_block_height = tmp_in_block_height;
|
||||
} else {
|
||||
if (break_external_loop) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return BlockParams{ block_width, block_height, block_features, in_block_width, in_block_height, feature_slm_split };
|
||||
return BlockParams{ block_width, block_height, block_depth, block_features, in_block_width, in_block_height, in_block_depth, feature_slm_split };
|
||||
}
|
||||
|
||||
float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateRegPressure(const convolution_params& params, const BlockParams& block) const {
|
||||
float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateRegPressure(const convolution_params& params, const BlockParams& block) const {
|
||||
size_t bytes_used = 0;
|
||||
// accumulator
|
||||
size_t accumulator_elements = block.output_block_width * block.output_block_height * block.output_block_features;
|
||||
size_t accumulator_elements = block.output_block_width * block.output_block_height * block.output_block_depth * block.output_block_features;
|
||||
bytes_used += accumulator_elements * BytesPerElement(GetAccumulatorType(params));
|
||||
// input block
|
||||
size_t input_block_elements = block.input_block_height * Align(block.input_block_width, simd) * fsv;
|
||||
size_t input_block_elements = block.input_block_depth * block.input_block_height * Align(block.input_block_width, simd) * fsv;
|
||||
bytes_used += input_block_elements * BytesPerElement(params.inputs[0].GetDType());
|
||||
// weights block
|
||||
size_t weights_block_elements = block.output_block_features * fsv;
|
||||
bytes_used += weights_block_elements * BytesPerElement(params.weights.GetDType());
|
||||
|
||||
// Experimentally selected number of registers needed for extra variables (eg. out_x, out_y, filter_idx, etc.)
|
||||
// Experimentally selected number of registers needed for extra variables (eg. out_x, out_y, out_z, filter_idx, etc.)
|
||||
constexpr size_t experimental_extra_regs = 8 * 32;
|
||||
bytes_used += experimental_extra_regs;
|
||||
|
||||
@@ -248,13 +267,14 @@ float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateRegPressure(const convoluti
|
||||
return static_cast<float>(bytes_used) / static_cast<float>(max_reg_bytes);
|
||||
}
|
||||
|
||||
float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateOccupancy(const convolution_params& params, const BlockParams& block) const {
|
||||
float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateOccupancy(const convolution_params& params, const BlockParams& block) const {
|
||||
size_t blocks_w = CeilDiv(params.output.X().v, block.output_block_width);
|
||||
size_t blocks_h = CeilDiv(params.output.Y().v, block.output_block_height);
|
||||
size_t blocks_f = CeilDiv(params.output.Feature().v, block.output_block_features) * block.feature_slm_split;
|
||||
size_t blocks_d = CeilDiv(params.output.Z().v, block.output_block_depth);
|
||||
size_t blocks_f = CeilDiv(params.weights.OFM().v, block.output_block_features) * params.groups * block.feature_slm_split;
|
||||
size_t block_b = params.output.Batch().v;
|
||||
|
||||
auto threads = blocks_w * blocks_h * blocks_f * block_b;
|
||||
auto threads = blocks_w * blocks_h * blocks_d * blocks_f * block_b;
|
||||
constexpr size_t max_threads_per_cu = 7;
|
||||
size_t compute_units = params.engineInfo.computeUnitsCount;
|
||||
size_t max_threads = compute_units * max_threads_per_cu;
|
||||
@@ -262,17 +282,18 @@ float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateOccupancy(const convolution
|
||||
return static_cast<float>(threads) / static_cast<float>(max_threads);
|
||||
}
|
||||
|
||||
float Convolution_kernel_b_fs_yx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
|
||||
size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_features * (block.feature_slm_split - 1);
|
||||
float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
|
||||
size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_depth *
|
||||
block.output_block_features * (block.feature_slm_split - 1);
|
||||
size_t slm_bytes = slm_elements * BytesPerElement(GetAccumulatorType(params));
|
||||
|
||||
// TODO Actual maximum slm should also depend on number of work-groups, but this is device specific
|
||||
// TODO: Actual maximum slm should also depend on number of work-groups, but this is device specific
|
||||
size_t max_slm_bytes = params.engineInfo.maxLocalMemSize;
|
||||
|
||||
return static_cast<float>(slm_bytes) / static_cast<float>(max_slm_bytes);
|
||||
}
|
||||
|
||||
ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
|
||||
ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
|
||||
ParamsKey k;
|
||||
k.EnableInputDataType(Datatype::INT8);
|
||||
k.EnableInputDataType(Datatype::UINT8);
|
||||
@@ -284,6 +305,9 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
|
||||
|
||||
k.EnableInputWeightsType(WeightsType::INT8);
|
||||
|
||||
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
|
||||
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
|
||||
|
||||
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
|
||||
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
|
||||
|
||||
@@ -294,31 +318,36 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
|
||||
k.EnableBiasPerFeature();
|
||||
k.EnableNonBiasTerm();
|
||||
k.EnableBatching();
|
||||
k.EnableGroupedConvolution();
|
||||
k.EnableQuantization(QuantizationType::SYMMETRIC);
|
||||
k.EnableDilation();
|
||||
k.DisableTuning();
|
||||
return k;
|
||||
}
|
||||
|
||||
KernelsData Convolution_kernel_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params,
|
||||
KernelsData Convolution_kernel_b_fs_zyx_fsv16_imad::GetKernelsData(const Params& params,
|
||||
const optional_params& options) const {
|
||||
return GetCommonKernelsData(params, options);
|
||||
}
|
||||
|
||||
JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convolution_params& params,
|
||||
JitConstants Convolution_kernel_b_fs_zyx_fsv16_imad::GetJitConstants(const convolution_params& params,
|
||||
const DispatchData& kd) const {
|
||||
auto mem_consts = Parent::GetJitConstants(params, kd);
|
||||
|
||||
auto block_params = GetBlockParams(params);
|
||||
|
||||
bool unroll_filter_y = block_params.output_block_height != 1;
|
||||
bool unroll_filter_z = block_params.output_block_depth != 1;
|
||||
|
||||
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block_params.output_block_width));
|
||||
mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", block_params.input_block_width));
|
||||
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block_params.output_block_height));
|
||||
mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_HEIGHT", block_params.input_block_height));
|
||||
mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_DEPTH", block_params.output_block_depth));
|
||||
mem_consts.AddConstant(MakeJitConstant("IN_BLOCK_DEPTH", block_params.input_block_depth));
|
||||
mem_consts.AddConstant(MakeJitConstant("FILTER_SIZE_Y_UNROLL", unroll_filter_y ? params.filterSize.y : 1));
|
||||
mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", block_params.output_block_features / simd));
|
||||
mem_consts.AddConstant(MakeJitConstant("FILTER_SIZE_Z_UNROLL", unroll_filter_z ? params.filterSize.z : 1));
|
||||
mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", static_cast<int>(std::ceil(block_params.output_block_features / simd))));
|
||||
mem_consts.AddConstant(MakeJitConstant("OFM_SIZE_PER_SIMD", block_params.output_block_features));
|
||||
mem_consts.AddConstant(MakeJitConstant("FEATURE_SLM_SPLIT", block_params.feature_slm_split));
|
||||
mem_consts.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
|
||||
@@ -327,7 +356,20 @@ JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convol
|
||||
if (!params.fused_ops.empty()) {
|
||||
auto input_dt = GetActivationType(params);
|
||||
std::vector<std::string> idx_order = { "out_b", "(out_f + ofb * 16)", "(out_y + oh)", "(out_x + ow)" };
|
||||
if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
|
||||
idx_order = { "out_b", "(out_f + ofb * 16)", "(out_z + od)", "(out_y + oh)", "(out_x + ow)" };
|
||||
}
|
||||
|
||||
std::vector<Tensor::DataChannelName> loop_axes = { Tensor::DataChannelName::X };
|
||||
|
||||
if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
|
||||
if (block_params.output_block_depth != 1) {
|
||||
loop_axes.push_back(Tensor::DataChannelName::Z);
|
||||
} else {
|
||||
idx_order[idx_order.size() - 3] = "out_z";
|
||||
}
|
||||
}
|
||||
|
||||
if (block_params.output_block_height != 1) {
|
||||
loop_axes.push_back(Tensor::DataChannelName::Y);
|
||||
} else {
|
||||
@@ -349,15 +391,16 @@ JitConstants Convolution_kernel_b_fs_yx_fsv16_imad::GetJitConstants(const convol
|
||||
return mem_consts;
|
||||
} // GetJitConstants
|
||||
|
||||
ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad::SetDefault(const convolution_params& params,
|
||||
ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_zyx_fsv16_imad::SetDefault(const convolution_params& params,
|
||||
int) const {
|
||||
DispatchData kd;
|
||||
const auto& output = params.output;
|
||||
const auto& weights = params.weights;
|
||||
auto block_params = GetBlockParams(params);
|
||||
|
||||
kd.gws0 = CeilDiv(output.X().v, block_params.output_block_width);
|
||||
kd.gws1 = CeilDiv(output.Y().v, block_params.output_block_height);
|
||||
kd.gws2 = output.Batch().v * CeilDiv(output.Feature().v, block_params.output_block_features) * simd * block_params.feature_slm_split;
|
||||
kd.gws1 = CeilDiv(output.Y().v, block_params.output_block_height) * CeilDiv(output.Z().v, block_params.output_block_depth);
|
||||
kd.gws2 = output.Batch().v * CeilDiv(weights.OFM().v, block_params.output_block_features) * params.groups * simd * block_params.feature_slm_split;
|
||||
|
||||
kd.lws0 = 1;
|
||||
kd.lws1 = 1;
|
||||
@@ -367,16 +410,13 @@ ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad::SetDe
|
||||
kd.gemmStyle = {0, 0, 0, 0, 0, 0};
|
||||
|
||||
kd.efficiency = FORCE_PRIORITY_2;
|
||||
// TODO Optimize 1x1, because this kernel is better in most cases
|
||||
//if (params.filterSize.x == 1 && params.filterSize.y == 1)
|
||||
// kd.efficiency = FORCE_PRIORITY_1;
|
||||
if (static_cast<float>(params.weights.IFM().v) / static_cast<float>(Align(params.weights.IFM().v, fsv)) < 0.5f)
|
||||
kd.efficiency = FORCE_PRIORITY_4;
|
||||
|
||||
return kd;
|
||||
} // SetDefault
|
||||
|
||||
bool Convolution_kernel_b_fs_yx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
|
||||
bool Convolution_kernel_b_fs_zyx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
|
||||
if (!Parent::Validate(params, options)) {
|
||||
return false;
|
||||
}
|
||||
@@ -384,7 +424,7 @@ bool Convolution_kernel_b_fs_yx_fsv16_imad::Validate(const Params& params, const
|
||||
KernelData kd = KernelData::Default<convolution_params>(params);
|
||||
convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
|
||||
|
||||
if (newParams.groups != 1 || newParams.split != 1)
|
||||
if (newParams.split != 1)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
@@ -21,11 +21,11 @@
|
||||
|
||||
namespace kernel_selector {
|
||||
|
||||
class Convolution_kernel_b_fs_yx_fsv16_imad : public ConvolutionKernelBase {
|
||||
class Convolution_kernel_b_fs_zyx_fsv16_imad : public ConvolutionKernelBase {
|
||||
public:
|
||||
using Parent = ConvolutionKernelBase;
|
||||
Convolution_kernel_b_fs_yx_fsv16_imad() : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad") {}
|
||||
virtual ~Convolution_kernel_b_fs_yx_fsv16_imad() {}
|
||||
Convolution_kernel_b_fs_zyx_fsv16_imad() : ConvolutionKernelBase("convolution_gpu_b_fs_zyx_fsv16_imad") {}
|
||||
virtual ~Convolution_kernel_b_fs_zyx_fsv16_imad() {}
|
||||
|
||||
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
|
||||
ParamsKey GetSupportedKey() const override;
|
||||
@@ -35,8 +35,8 @@ protected:
|
||||
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
|
||||
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
|
||||
bool NeedPaddedInput() const override { return true; }
|
||||
WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
|
||||
return WeightsLayout::os_is_yx_osv16_isv16;
|
||||
WeightsLayout GetPreferredWeightsLayout(const convolution_params& p) const override {
|
||||
return p.groups > 1 ? WeightsLayout::g_os_is_zyx_osv16_isv16 : WeightsLayout::os_is_zyx_osv16_isv16;
|
||||
}
|
||||
|
||||
std::vector<FusedOpType> GetSupportedFusedOps() const override {
|
||||
@@ -49,10 +49,13 @@ protected:
|
||||
struct BlockParams {
|
||||
size_t output_block_width;
|
||||
size_t output_block_height;
|
||||
size_t output_block_depth;
|
||||
|
||||
size_t output_block_features;
|
||||
|
||||
size_t input_block_width;
|
||||
size_t input_block_height;
|
||||
size_t input_block_depth;
|
||||
|
||||
size_t feature_slm_split;
|
||||
};
|
||||
@@ -60,7 +60,7 @@
|
||||
#include "convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.h"
|
||||
#include "convolution_kernel_bfyx_to_bs_fs_yx_bsv16_fsv16.h"
|
||||
#include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
|
||||
#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
|
||||
#include "convolution_kernel_b_fs_zyx_fsv16_imad.h"
|
||||
#include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
|
||||
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1.h"
|
||||
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3.h"
|
||||
@@ -71,9 +71,9 @@ convolution_kernel_selector::convolution_kernel_selector() {
|
||||
Attach<ConvolutionKernel_Ref>();
|
||||
Attach<DeformableConvolutionKernel_bfyx_Ref>();
|
||||
|
||||
// b_fs_yx_fsv16 int8
|
||||
// b_fs_yx_fsv16 and b_fs_zyx_fsv16 int8
|
||||
Attach<Convolution_kernel_b_fs_yx_fsv16_imad_1x1>();
|
||||
Attach<Convolution_kernel_b_fs_yx_fsv16_imad>();
|
||||
Attach<Convolution_kernel_b_fs_zyx_fsv16_imad>();
|
||||
|
||||
// b_fs_yx_fsv16 and b_fs_zyx_fsv16
|
||||
Attach<ConvolutionKernel_b_fs_yx_fsv16_depthwise>();
|
||||
|
||||
@@ -12,13 +12,13 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
|
||||
#include "pooling_kernel_gpu_b_fs_zyx_fsv16_imad.h"
|
||||
#include "kernel_selector_utils.h"
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
|
||||
namespace kernel_selector {
|
||||
ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
|
||||
ParamsKey PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
|
||||
ParamsKey k;
|
||||
k.EnableInputDataType(Datatype::INT8);
|
||||
k.EnableInputDataType(Datatype::UINT8);
|
||||
@@ -27,6 +27,8 @@ ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
|
||||
k.EnableOutputDataType(Datatype::F32);
|
||||
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
|
||||
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
|
||||
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
|
||||
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
|
||||
k.EnableTensorOffset();
|
||||
k.EnableTensorPitches();
|
||||
k.EnableBatching();
|
||||
@@ -41,17 +43,18 @@ ParamsKey PoolingKernelGPU_b_fs_yx_fsv16_imad::GetSupportedKey() const {
|
||||
return k;
|
||||
}
|
||||
|
||||
PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_yx_fsv16_imad::SetDefault(const pooling_params& params) const {
|
||||
PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_zyx_fsv16_imad::SetDefault(const pooling_params& params) const {
|
||||
DispatchData runInfo = PoolingKernelBase::SetDefault(params);
|
||||
|
||||
const auto& out = params.output;
|
||||
auto x = out.X().v;
|
||||
auto y = out.Y().v;
|
||||
auto z = out.Z().v;
|
||||
auto f = out.Feature().v;
|
||||
auto b = out.Batch().v;
|
||||
|
||||
runInfo.gws0 = x;
|
||||
runInfo.gws1 = y;
|
||||
runInfo.gws1 = y * z;
|
||||
// we got b_fs_yx_fsv16 format, we process 16 features per workitem
|
||||
runInfo.gws2 = CeilDiv(f, FEATURE_SLICE_SIZE) * b;
|
||||
|
||||
@@ -64,19 +67,24 @@ PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_yx_fsv16_imad::SetDefault(
|
||||
return runInfo;
|
||||
}
|
||||
|
||||
JitConstants PoolingKernelGPU_b_fs_yx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData kd) const {
|
||||
JitConstants PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetJitConstants(const pooling_params& params, DispatchData kd) const {
|
||||
auto jit = PoolingKernelBase::GetJitConstants(params, kd);
|
||||
|
||||
const size_t in_x_pitch = FEATURE_SLICE_SIZE;
|
||||
const size_t in_y_pitch = FEATURE_SLICE_SIZE * params.inputs[0].X().LogicalDimPadded();
|
||||
const size_t in_z_pitch = FEATURE_SLICE_SIZE * params.inputs[0].Y().LogicalDimPadded() * params.inputs[0].X().LogicalDimPadded();
|
||||
jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
|
||||
jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
|
||||
jit.AddConstant(MakeJitConstant("IN_Z_PITCH", in_z_pitch));
|
||||
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
|
||||
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
|
||||
|
||||
if (!params.fused_ops.empty()) {
|
||||
auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
|
||||
FusedOpsConfiguration conf = {"", {"b", "(f+i)", "y", "x"}, "pool_result[i]", input_dt, 1};
|
||||
if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
|
||||
conf = {"", {"b", "(f+i)", "z", "y", "x"}, "pool_result[i]", input_dt, 1 };
|
||||
}
|
||||
conf.SetLoopAxes({ Tensor::DataChannelName::FEATURE }, true);
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
|
||||
}
|
||||
@@ -84,19 +92,11 @@ JitConstants PoolingKernelGPU_b_fs_yx_fsv16_imad::GetJitConstants(const pooling_
|
||||
return jit;
|
||||
}
|
||||
|
||||
KernelsData PoolingKernelGPU_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& options) const {
|
||||
KernelsData PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& options) const {
|
||||
return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
|
||||
}
|
||||
|
||||
bool PoolingKernelGPU_b_fs_yx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
|
||||
if (!PoolingKernelBase::Validate(params, options)) {
|
||||
return false;
|
||||
}
|
||||
auto p = dynamic_cast<const pooling_params&>(params);
|
||||
|
||||
if (p.inputs[0].Feature().v % FEATURE_SLICE_SIZE != 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
bool PoolingKernelGPU_b_fs_zyx_fsv16_imad::Validate(const Params& params, const optional_params& options) const {
|
||||
return PoolingKernelBase::Validate(params, options);
|
||||
}
|
||||
} // namespace kernel_selector
|
||||
@@ -19,10 +19,10 @@
|
||||
#include <vector>
|
||||
|
||||
namespace kernel_selector {
|
||||
class PoolingKernelGPU_b_fs_yx_fsv16_imad: public PoolingKernelBase{
|
||||
class PoolingKernelGPU_b_fs_zyx_fsv16_imad: public PoolingKernelBase{
|
||||
public:
|
||||
PoolingKernelGPU_b_fs_yx_fsv16_imad() : PoolingKernelBase("pooling_gpu_b_fs_yx_fsv16_imad") {}
|
||||
virtual ~PoolingKernelGPU_b_fs_yx_fsv16_imad() {}
|
||||
PoolingKernelGPU_b_fs_zyx_fsv16_imad() : PoolingKernelBase("pooling_gpu_b_fs_zyx_fsv16_imad") {}
|
||||
virtual ~PoolingKernelGPU_b_fs_zyx_fsv16_imad() {}
|
||||
|
||||
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
|
||||
ParamsKey GetSupportedKey() const override;
|
||||
@@ -24,7 +24,7 @@
|
||||
#include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
|
||||
#include "pooling_kernel_gpu_b_fs_yx_fsv16.h"
|
||||
#include "pooling_kernel_gpu_bsv16_fsv16.h"
|
||||
#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
|
||||
#include "pooling_kernel_gpu_b_fs_zyx_fsv16_imad.h"
|
||||
#include "pooling_kernel_gpu_bs_fs_yx_bsv16_fsv16.h"
|
||||
|
||||
namespace kernel_selector {
|
||||
@@ -40,7 +40,7 @@ pooling_kernel_selector::pooling_kernel_selector() {
|
||||
Attach<PoolingKerneGPU_fs_b_yx_fsv32>();
|
||||
Attach<PoolingKernel_b_fs_yx_fsv16>();
|
||||
Attach<PoolingKernel_bsv16_fsv16>();
|
||||
Attach<PoolingKernelGPU_b_fs_yx_fsv16_imad>();
|
||||
Attach<PoolingKernelGPU_b_fs_zyx_fsv16_imad>();
|
||||
Attach<Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16>();
|
||||
}
|
||||
|
||||
|
||||
@@ -1,390 +0,0 @@
|
||||
// Copyright (c) 2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "include/common.cl"
|
||||
#include "include/fetch.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/mmad.cl"
|
||||
#include "include/data_types.cl"
|
||||
|
||||
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
|
||||
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
|
||||
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
|
||||
|
||||
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
|
||||
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
|
||||
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
|
||||
|
||||
#define SIMD 16
|
||||
#define FSV 16
|
||||
|
||||
// int8 conv_input and weights data is packed to int32 "batches",
|
||||
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
|
||||
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad)(
|
||||
const __global INPUT0_TYPE *conv_input,
|
||||
__global OUTPUT_TYPE *output,
|
||||
const __global FILTER_TYPE *weights,
|
||||
#if BIAS_TERM
|
||||
const __global BIAS_TYPE *biases,
|
||||
#endif
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
FUSED_OPS_DECLS,
|
||||
#endif
|
||||
uint split_idx) {
|
||||
|
||||
#define LUT_VALUE_CLAMP(x) (( (IN_BLOCK_WIDTH % SIMD == 0) || ((x) < IN_BLOCK_WIDTH % SIMD) ) ? (x) : 0)
|
||||
const int tmp = LUT_VALUE_CLAMP(get_sub_group_local_id());
|
||||
#undef LUT_VALUE_CLAMP
|
||||
|
||||
const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
|
||||
const uint out_y = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT;
|
||||
const uint out_b = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) / ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
|
||||
uint out_fg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
|
||||
uint out_f = out_fg + get_sub_group_local_id();
|
||||
|
||||
const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
|
||||
const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
|
||||
|
||||
#if FEATURE_SLM_SPLIT == 1
|
||||
const uint k_start = 0;
|
||||
#else
|
||||
const uint k_start = get_sub_group_id() * FSV;
|
||||
#endif
|
||||
|
||||
uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, k_start, 0, 0);
|
||||
const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, 16) * FILTER_SIZE_X * FILTER_SIZE_Y * 16);
|
||||
|
||||
uint input_start_idx = INPUT0_GET_INDEX(out_b, k_start, input_y, input_x);
|
||||
|
||||
ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };
|
||||
uint4 input_val[IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];
|
||||
|
||||
__attribute__((opencl_unroll_hint(1)))
|
||||
for (uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16) / FEATURE_SLM_SPLIT; k++) {
|
||||
__attribute__((opencl_unroll_hint(1)))
|
||||
for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
|
||||
// Load input block IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
|
||||
uint input_idx = input_start_idx + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
|
||||
if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
|
||||
input_val[iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * 16));
|
||||
} else {
|
||||
input_val[iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * 16));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
|
||||
__attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
|
||||
for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
|
||||
|
||||
uint4 weights_val[OFM_BLOCKS_PER_SIMD];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ive = 0; ive < 4; ive++) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
|
||||
const uint ow_offset = ow + OUT_BLOCK_WIDTH;
|
||||
const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
|
||||
const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
|
||||
const uint shuffle_wi = x_block_idx % SIMD;
|
||||
const uint shuffle_idx = x_block_idx / SIMD;
|
||||
|
||||
dotProd[ofb][oh][ow] = TO_ACCUMULATOR_TYPE(
|
||||
IMAD(dotProd[ofb][oh][ow],
|
||||
AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[y_block_idx][shuffle_idx][ive], shuffle_wi)),
|
||||
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
filter_idx += FSV * FSV;
|
||||
}
|
||||
}
|
||||
input_start_idx += DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
|
||||
}
|
||||
input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL) * DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
|
||||
|
||||
filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * (FEATURE_SLM_SPLIT - 1);
|
||||
}
|
||||
|
||||
#if FEATURE_SLM_SPLIT != 1
|
||||
// Additional local memory reduction for feature split mode
|
||||
# if FEATURE_SLM_SPLIT < OFM_BLOCKS_PER_SIMD
|
||||
# error convolution_gpu_b_fs_yx_fsv16_imad.cl - OFM_BLOCKS_PER_SIMD must be less or equal to FEATURE_SLM_SPLIT
|
||||
# endif
|
||||
|
||||
const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH;
|
||||
__local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
|
||||
|
||||
uint sgid_start_idx = get_sub_group_id();
|
||||
sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
|
||||
__local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH
|
||||
+ get_sub_group_local_id();
|
||||
|
||||
if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
|
||||
if (get_sub_group_id() == wg) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < wg; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
oh * OUT_BLOCK_WIDTH * SIMD +
|
||||
ow * SIMD;
|
||||
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
dotProd[0][oh][ow] = dotProd[wg][oh][ow];
|
||||
}
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OFM_SIZE_PER_SIMD : 0) +
|
||||
ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
oh * OUT_BLOCK_WIDTH * SIMD +
|
||||
ow * SIMD;
|
||||
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
ofb * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
oh * OUT_BLOCK_WIDTH * SIMD +
|
||||
ow * SIMD;
|
||||
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (get_sub_group_id() >= OFM_BLOCKS_PER_SIMD)
|
||||
return;
|
||||
|
||||
partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * SIMD + get_sub_group_local_id();
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
|
||||
oh * OUT_BLOCK_WIDTH * SIMD +
|
||||
ow * SIMD;
|
||||
dotProd[0][oh][ow] += partial_acc_ptr[partial_acc_ptr_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if FEATURE_SLM_SPLIT == 1
|
||||
# define OFM_VALUES_PER_WI (OFM_BLOCKS_PER_SIMD)
|
||||
#else
|
||||
# define OFM_VALUES_PER_WI 1
|
||||
out_f += get_sub_group_id() * SIMD;
|
||||
out_fg += get_sub_group_id() * SIMD;
|
||||
#endif
|
||||
|
||||
#if BIAS_TERM
|
||||
BIAS_TYPE bias[OFM_VALUES_PER_WI];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
bias[ofb] = biases[out_f + ofb * SIMD];
|
||||
}
|
||||
#endif
|
||||
|
||||
ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
dequantized[ofb][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][oh][ow]);
|
||||
#if BIAS_TERM
|
||||
dequantized[ofb][oh][ow] += bias[ofb];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
|
||||
FUSED_OPS_PRELOAD_SCALAR;
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
ACTIVATION_TYPE dequantized_val = dequantized[ofb][oh][ow];
|
||||
#if HAS_FUSED_OPS
|
||||
# if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
|
||||
FUSED_OPS_CALC_SCALAR;
|
||||
# else
|
||||
FUSED_OPS_SCALAR;
|
||||
# endif
|
||||
result[ofb][oh][ow] = FUSED_OPS_RESULT_SCALAR;
|
||||
#else
|
||||
result[ofb][oh][ow] = TO_OUTPUT_TYPE(dequantized_val);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint dst_index = OUTPUT_GET_INDEX(out_b, out_fg, out_y, out_x);
|
||||
|
||||
if ((OUTPUT_SIZE_X % OUT_BLOCK_WIDTH == 0 || out_x + OUT_BLOCK_WIDTH <= OUTPUT_SIZE_X)
|
||||
&& (OUTPUT_FEATURE_NUM % OFM_BLOCKS_PER_SIMD == 0) ) {
|
||||
__attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
|
||||
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_fg + ofb * SIMD <= OUTPUT_FEATURE_NUM);
|
||||
if (good_of_block) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
|
||||
if (good_y) {
|
||||
uint ow = 0;
|
||||
#if OUTPUT_TYPE_SIZE == 1
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 8; ++i) {
|
||||
result_val[i] = result[ofb][oh][ow + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
|
||||
dst_index += 8 * SIMD;
|
||||
}
|
||||
#endif
|
||||
#if OUTPUT_TYPE_SIZE <= 2
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 4; ++i) {
|
||||
result_val[i] = result[ofb][oh][ow + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
|
||||
dst_index += 4 * SIMD;
|
||||
}
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 2; ++i) {
|
||||
result_val[i] = result[ofb][oh][ow + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
|
||||
dst_index += 2 * SIMD;
|
||||
}
|
||||
|
||||
if (OUT_BLOCK_WIDTH % 2 == 1) {
|
||||
OUTPUT_TYPE result_val = result[ofb][oh][ow];
|
||||
DT_OUTPUT_BLOCK_WRITE(output, dst_index, result_val);
|
||||
dst_index += 1 * SIMD;
|
||||
}
|
||||
} // if (good_y)
|
||||
dst_index += OUTPUT_Y_PITCH * FSV - OUT_BLOCK_WIDTH * FSV;
|
||||
} // for (OUT_BLOCK_HEIGHT)
|
||||
} // if (good_of_block)
|
||||
dst_index += OUTPUT_FEATURE_PITCH * FSV - OUTPUT_Y_PITCH * FSV * OUT_BLOCK_HEIGHT;
|
||||
} // for (OFM_VALUES_PER_WI)
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
|
||||
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_fg + ofb * SIMD <= OUTPUT_FEATURE_NUM);
|
||||
if (good_of_block) {
|
||||
const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_y, out_x);
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
|
||||
if (good_y) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
|
||||
|
||||
#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
|
||||
if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
|
||||
break;
|
||||
#endif
|
||||
|
||||
#if OUTPUT_FEATURE_NUM % SIMD != 0
|
||||
if (out_fg + (ofb + 1) * SIMD >= OUTPUT_FEATURE_NUM && get_sub_group_local_id() >= OUTPUT_FEATURE_NUM % SIMD)
|
||||
result[ofb][oh][ow] = (OUTPUT_TYPE)0;
|
||||
#endif
|
||||
output[dst_index + ow * FSV + oh * OUTPUT_Y_PITCH * FSV] = result[ofb][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef AS_INPUT0_TYPE_4
|
||||
#undef AS_TYPE_N
|
||||
#undef AS_TYPE_N_
|
||||
#undef AS_FILTER_TYPE_4
|
||||
|
||||
#undef CEIL_DIV
|
||||
#undef ALIGN
|
||||
|
||||
#undef SIMD
|
||||
#undef FSV
|
||||
#undef OFM_VALUES_PER_WI
|
||||
@@ -0,0 +1,516 @@
|
||||
// Copyright (c) 2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "include/common.cl"
|
||||
#include "include/fetch.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/mmad.cl"
|
||||
#include "include/data_types.cl"
|
||||
|
||||
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
|
||||
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
|
||||
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
|
||||
|
||||
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
|
||||
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
|
||||
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
|
||||
|
||||
#define SIMD 16
|
||||
#define FSV 16
|
||||
|
||||
// int8 conv_input and weights data is packed to int32 "batches",
|
||||
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
|
||||
KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
const __global INPUT0_TYPE *conv_input,
|
||||
__global OUTPUT_TYPE *output,
|
||||
const __global FILTER_TYPE *weights,
|
||||
#if BIAS_TERM
|
||||
const __global BIAS_TYPE *biases,
|
||||
#endif
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
FUSED_OPS_DECLS,
|
||||
#endif
|
||||
uint split_idx) {
|
||||
|
||||
#define LUT_VALUE_CLAMP(x) (( (IN_BLOCK_WIDTH % SIMD == 0) || ((x) < IN_BLOCK_WIDTH % SIMD) ) ? (x) : 0)
|
||||
const int tmp = LUT_VALUE_CLAMP(get_sub_group_local_id());
|
||||
#undef LUT_VALUE_CLAMP
|
||||
|
||||
const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
|
||||
const uint out_y = ((uint)get_global_id(1) / ALIGN(OUTPUT_SIZE_Z, OUT_BLOCK_DEPTH)) * OUT_BLOCK_HEIGHT;
|
||||
#if INPUT0_DIMS == 4
|
||||
const uint out_z = 0;
|
||||
#else
|
||||
const uint out_z = ((uint)get_global_id(1) % ALIGN(OUTPUT_SIZE_Z, OUT_BLOCK_DEPTH)) * OUT_BLOCK_DEPTH;
|
||||
#endif
|
||||
const uint out_b = (uint)(get_group_id(2) / CEIL_DIV(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD)) / FILTER_GROUPS_NUM;
|
||||
const uint g = (uint)(get_group_id(2) / CEIL_DIV(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD)) % FILTER_GROUPS_NUM;
|
||||
uint out_f_sg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % (ALIGN(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD) * FILTER_GROUPS_NUM);
|
||||
uint out_f = out_f_sg + get_sub_group_local_id();
|
||||
uint out_f_g = (out_f % ALIGN(FILTER_OFM_NUM, OFM_SIZE_PER_SIMD));
|
||||
#if FILTER_OFM_NUM % SIMD != 0
|
||||
out_f = out_f - (out_f / ALIGN(FILTER_OFM_NUM, SIMD)) * (SIMD - (FILTER_OFM_NUM % SIMD));
|
||||
#endif
|
||||
|
||||
const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
|
||||
const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
|
||||
const int input_z = out_z * STRIDE_SIZE_Z - PADDING_SIZE_Z;
|
||||
|
||||
#if FEATURE_SLM_SPLIT == 1
|
||||
const uint k_start = 0;
|
||||
#else
|
||||
const uint k_start = get_sub_group_id() * FSV;
|
||||
#endif
|
||||
|
||||
uint filter_idx = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, k_start, 0, 0, 0);
|
||||
const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * FSV);
|
||||
|
||||
#if INPUT0_DIMS == 4
|
||||
uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_y, input_x);
|
||||
#else
|
||||
uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_z, input_y, input_x);
|
||||
#endif
|
||||
|
||||
ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };
|
||||
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
|
||||
uint in_f_offset = (g * FILTER_IFM_NUM) % FSV;
|
||||
#endif
|
||||
|
||||
uint4 input_val[IN_BLOCK_DEPTH][IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];
|
||||
|
||||
__attribute__((opencl_unroll_hint(1)))
|
||||
for (uint k = 0; k < CEIL_DIV(FILTER_IFM_NUM, FSV) / FEATURE_SLM_SPLIT; k++) {
|
||||
__attribute__((opencl_unroll_hint(1)))
|
||||
for (uint fzn = 0; fzn < FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL; fzn++) {
|
||||
__attribute__((opencl_unroll_hint(1)))
|
||||
for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
|
||||
// Load input block IN_BLOCK_DEPTH x IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
|
||||
uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
|
||||
|
||||
if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
|
||||
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
|
||||
if (in_f_offset == 0) {
|
||||
input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + get_sub_group_local_id() * FSV));
|
||||
#else
|
||||
input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
|
||||
#endif
|
||||
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
|
||||
} else {
|
||||
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
|
||||
__attribute__((opencl_unroll_hint(FSV)))
|
||||
for (uint v = 0; v < FSV; v++) {
|
||||
if (v + in_f_offset < FSV) {
|
||||
input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
|
||||
} else {
|
||||
input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v +
|
||||
((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
|
||||
(INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
|
||||
(INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
|
||||
FSV];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
|
||||
if (in_f_offset == 0) {
|
||||
input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + tmp * FSV));
|
||||
#else
|
||||
input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
|
||||
#endif
|
||||
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
|
||||
} else {
|
||||
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
|
||||
__attribute__((opencl_unroll_hint(FSV)))
|
||||
for (uint v = 0; v < FSV; v++) {
|
||||
if (v + in_f_offset < FSV) {
|
||||
input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
|
||||
} else {
|
||||
input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v +
|
||||
((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
|
||||
(INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
|
||||
(INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
|
||||
FSV];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
|
||||
__attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
|
||||
for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
|
||||
|
||||
uint4 weights_val[OFM_BLOCKS_PER_SIMD];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ive = 0; ive < 4; ive++) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
|
||||
const uint ow_offset = ow + OUT_BLOCK_WIDTH;
|
||||
const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
|
||||
const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
|
||||
const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
|
||||
const uint shuffle_wi = x_block_idx % SIMD;
|
||||
const uint shuffle_idx = x_block_idx / SIMD;
|
||||
|
||||
dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
|
||||
IMAD(dotProd[ofb][od][oh][ow],
|
||||
AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
|
||||
shuffle_wi)),
|
||||
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
filter_idx += FSV * FSV;
|
||||
}
|
||||
}
|
||||
}
|
||||
input_start_idx += DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
|
||||
}
|
||||
input_start_idx += DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV - (FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL) * DILATION_SIZE_Y * INPUT0_Y_PITCH * FSV;
|
||||
}
|
||||
input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL) * DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV;
|
||||
|
||||
filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * (FEATURE_SLM_SPLIT - 1);
|
||||
}
|
||||
|
||||
#if FEATURE_SLM_SPLIT != 1
|
||||
// Additional local memory reduction for feature split mode
|
||||
# if FEATURE_SLM_SPLIT < OFM_BLOCKS_PER_SIMD
|
||||
# error convolution_gpu_b_fs_zyx_fsv16_imad.cl - OFM_BLOCKS_PER_SIMD must be less or equal to FEATURE_SLM_SPLIT
|
||||
# endif
|
||||
|
||||
const uint partial_acc_size = (FEATURE_SLM_SPLIT - 1) * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH;
|
||||
__local ACCUMULATOR_TYPE partial_acc[partial_acc_size];
|
||||
|
||||
uint sgid_start_idx = get_sub_group_id();
|
||||
sgid_start_idx = sgid_start_idx == 0 ? 0 : sgid_start_idx - 1;
|
||||
__local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
|
||||
get_sub_group_local_id();
|
||||
|
||||
if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
|
||||
if (get_sub_group_id() == wg) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < wg; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
oh * OUT_BLOCK_WIDTH * SIMD +
|
||||
ow * SIMD;
|
||||
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
dotProd[0][od][oh][ow] = dotProd[wg][od][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * OFM_SIZE_PER_SIMD : 0) +
|
||||
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
oh * OUT_BLOCK_WIDTH * SIMD +
|
||||
ow * SIMD;
|
||||
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
oh * OUT_BLOCK_WIDTH * SIMD +
|
||||
ow * SIMD;
|
||||
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][od][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (get_sub_group_id() >= OFM_BLOCKS_PER_SIMD)
|
||||
return;
|
||||
|
||||
partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD + get_sub_group_local_id();
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
|
||||
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
oh * OUT_BLOCK_WIDTH * SIMD +
|
||||
ow * SIMD;
|
||||
dotProd[0][od][oh][ow] += partial_acc_ptr[partial_acc_ptr_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if FEATURE_SLM_SPLIT == 1
|
||||
# define OFM_VALUES_PER_WI (OFM_BLOCKS_PER_SIMD)
|
||||
#else
|
||||
# define OFM_VALUES_PER_WI 1
|
||||
out_f += get_sub_group_id() * SIMD;
|
||||
out_f_sg += get_sub_group_id() * SIMD;
|
||||
#endif
|
||||
|
||||
#if BIAS_TERM
|
||||
BIAS_TYPE bias[OFM_VALUES_PER_WI];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
bias[ofb] = biases[out_f + ofb * SIMD];
|
||||
}
|
||||
#endif
|
||||
|
||||
ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
dequantized[ofb][od][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][od][oh][ow]);
|
||||
#if BIAS_TERM
|
||||
dequantized[ofb][od][oh][ow] += bias[ofb];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
|
||||
FUSED_OPS_PRELOAD_SCALAR;
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
ACTIVATION_TYPE dequantized_val = dequantized[ofb][od][oh][ow];
|
||||
#if HAS_FUSED_OPS
|
||||
# if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
|
||||
FUSED_OPS_CALC_SCALAR;
|
||||
# else
|
||||
FUSED_OPS_SCALAR;
|
||||
# endif
|
||||
result[ofb][od][oh][ow] = FUSED_OPS_RESULT_SCALAR;
|
||||
#else
|
||||
result[ofb][od][oh][ow] = TO_OUTPUT_TYPE(dequantized_val);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if OUTPUT_DIMS == 4
|
||||
uint dst_index = OUTPUT_GET_INDEX(out_b, out_f_sg, out_y, out_x);
|
||||
#else
|
||||
uint dst_index = OUTPUT_GET_INDEX(out_b, out_f_sg, out_z, out_y, out_x);
|
||||
#endif
|
||||
|
||||
#if ((FILTER_OFM_NUM % OFM_BLOCKS_PER_SIMD == 0) && ((FILTER_GROUPS_NUM == 1) || (FILTER_OFM_NUM % SIMD == 0)))
|
||||
if ((OUTPUT_SIZE_X % OUT_BLOCK_WIDTH == 0 || out_x + OUT_BLOCK_WIDTH <= OUTPUT_SIZE_X)) {
|
||||
__attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
|
||||
bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
|
||||
if (good_of_block) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
|
||||
if (good_z) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
|
||||
if (good_y) {
|
||||
uint ow = 0;
|
||||
#if OUTPUT_TYPE_SIZE == 1
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 8; ++i) {
|
||||
result_val[i] = result[ofb][od][oh][ow + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
|
||||
dst_index += 8 * SIMD;
|
||||
}
|
||||
#endif
|
||||
#if OUTPUT_TYPE_SIZE <= 2
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 4; ++i) {
|
||||
result_val[i] = result[ofb][od][oh][ow + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
|
||||
dst_index += 4 * SIMD;
|
||||
}
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 2; ++i) {
|
||||
result_val[i] = result[ofb][od][oh][ow + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
|
||||
dst_index += 2 * SIMD;
|
||||
}
|
||||
|
||||
if (OUT_BLOCK_WIDTH % 2 == 1) {
|
||||
OUTPUT_TYPE result_val = result[ofb][od][oh][ow];
|
||||
DT_OUTPUT_BLOCK_WRITE(output, dst_index, result_val);
|
||||
dst_index += 1 * SIMD;
|
||||
}
|
||||
} // if (good_y)
|
||||
dst_index += OUTPUT_Y_PITCH * FSV - OUT_BLOCK_WIDTH * FSV;
|
||||
} // for (OUT_BLOCK_HEIGHT)
|
||||
} // if (good_z)
|
||||
dst_index += OUTPUT_Z_PITCH * FSV - OUTPUT_Y_PITCH * OUT_BLOCK_HEIGHT * FSV;
|
||||
} // for (OUT_BLOCK_DEPTH)
|
||||
} // if (good_of_block)
|
||||
dst_index += OUTPUT_FEATURE_PITCH * FSV - OUTPUT_Z_PITCH * OUT_BLOCK_DEPTH * FSV;
|
||||
} // for (OFM_VALUES_PER_WI)
|
||||
} else {
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint(OFM_VALUES_PER_WI)))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
|
||||
bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
|
||||
if (good_of_block) {
|
||||
#if OUTPUT_DIMS == 4
|
||||
const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_y, out_x);
|
||||
#else
|
||||
const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_z, out_y, out_x);
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
|
||||
if (good_z) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
|
||||
if (good_y) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
|
||||
|
||||
#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
|
||||
if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
|
||||
break;
|
||||
#endif
|
||||
|
||||
if (out_f_g < FILTER_OFM_NUM) {
|
||||
output[dst_index + ow * FSV + oh * OUTPUT_Y_PITCH * FSV + od * OUTPUT_Z_PITCH * FSV] = result[ofb][od][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#if ((FILTER_OFM_NUM % OFM_BLOCKS_PER_SIMD == 0) && ((FILTER_GROUPS_NUM == 1) || (FILTER_OFM_NUM % SIMD == 0)))
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef AS_INPUT0_TYPE_4
|
||||
#undef AS_TYPE_N
|
||||
#undef AS_TYPE_N_
|
||||
#undef AS_FILTER_TYPE_4
|
||||
|
||||
#undef CEIL_DIV
|
||||
#undef ALIGN
|
||||
|
||||
#undef SIMD
|
||||
#undef FSV
|
||||
#undef OFM_VALUES_PER_WI
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -353,6 +353,45 @@ inline uint FUNC(get_os_is_zyx_osv_isv_index)(uint o, uint i, uint z, uint y, ui
|
||||
return output_offset;
|
||||
}
|
||||
|
||||
inline uint FUNC(get_g_os_is_zyx_osv_isv_index)(uint g, uint o, uint i, uint z, uint y, uint x,
|
||||
uint x_size, uint y_size, uint z_size, uint i_size, uint o_size, uint osv_size, uint isv_size)
|
||||
{
|
||||
const uint isv = i % isv_size;
|
||||
const uint osv = o % osv_size;
|
||||
const uint is = i / isv_size;
|
||||
const uint os = o / osv_size;
|
||||
|
||||
const uint x_pitch = osv_size * isv_size;
|
||||
const uint y_pitch = x_pitch * x_size;
|
||||
const uint z_pitch = y_pitch * y_size;
|
||||
const uint is_pitch = z_pitch * z_size;
|
||||
const uint os_pitch = is_pitch * ((i_size + isv_size - 1) / isv_size);
|
||||
const uint g_pitch = os_pitch * ((o_size + osv_size - 1) / osv_size);
|
||||
|
||||
const uint output_offset =
|
||||
isv +
|
||||
osv * isv_size +
|
||||
x * x_pitch +
|
||||
y * y_pitch +
|
||||
z * z_pitch +
|
||||
is * is_pitch +
|
||||
os * os_pitch +
|
||||
g * g_pitch;
|
||||
|
||||
return output_offset;
|
||||
}
|
||||
|
||||
#define GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(prefix, g, o, i, z, y, x) \
|
||||
FUNC_CALL(get_g_os_is_zyx_osv_isv_index)( \
|
||||
g, o, i, z, y, x, \
|
||||
CAT(prefix, _SIZE_X), \
|
||||
CAT(prefix, _SIZE_Y), \
|
||||
CAT(prefix, _SIZE_Z), \
|
||||
CAT(prefix, _IFM_NUM), \
|
||||
CAT(prefix, _OFM_NUM), \
|
||||
16, \
|
||||
16)
|
||||
|
||||
#define GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(prefix, o, i, y, x) \
|
||||
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
|
||||
o, i, 0, y, x, \
|
||||
@@ -364,6 +403,17 @@ inline uint FUNC(get_os_is_zyx_osv_isv_index)(uint o, uint i, uint z, uint y, ui
|
||||
16, \
|
||||
16)
|
||||
|
||||
#define GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(prefix, o, i, z, y, x) \
|
||||
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
|
||||
o, i, z, y, x, \
|
||||
CAT(prefix, _SIZE_X), \
|
||||
CAT(prefix, _SIZE_Y), \
|
||||
CAT(prefix, _SIZE_Z), \
|
||||
CAT(prefix, _IFM_NUM), \
|
||||
CAT(prefix, _OFM_NUM), \
|
||||
16, \
|
||||
16)
|
||||
|
||||
#define GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(prefix, o, i, z, y, x) \
|
||||
FUNC_CALL(get_os_is_zyx_osv_isv_index)( \
|
||||
o, i, z, y, x, \
|
||||
|
||||
@@ -1,206 +0,0 @@
|
||||
// Copyright (c) 2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "include/include_all.cl"
|
||||
#include "include/data_types.cl"
|
||||
|
||||
#define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple))
|
||||
|
||||
#define AS_TYPE(type, val) CAT(as_, type)(val)
|
||||
#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
|
||||
#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
|
||||
|
||||
#define ACTIVATION_VEC16 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 16)
|
||||
#define TO_ACTIVATION_VEC16 CAT(convert_, ACTIVATION_VEC16)
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
|
||||
#if MAX_POOLING
|
||||
#define INIT_VAL ACCUMULATOR_VAL_MIN
|
||||
#elif AVG_POOLING
|
||||
#define INIT_VAL ACCUMULATOR_VAL_ZERO
|
||||
#else
|
||||
#error
|
||||
#endif
|
||||
|
||||
|
||||
inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
|
||||
{
|
||||
#if MAX_POOLING
|
||||
return ACCUMULATOR_MAX_FUNC(tmp, in);
|
||||
#elif AVG_POOLING
|
||||
return tmp + in;
|
||||
#endif
|
||||
}
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
|
||||
KERNEL(pooling_gpu_b_fs_yx_fsv16)(
|
||||
const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
, FUSED_OPS_DECLS
|
||||
#endif
|
||||
)
|
||||
{
|
||||
const uint x = (uint)get_global_id(0);
|
||||
const uint y = (uint)get_global_id(1);
|
||||
const uint bf = (uint)get_global_id(2);
|
||||
const uint f = (bf * FEATURE_SLICE_SIZE) % ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
|
||||
const uint b = (bf * FEATURE_SLICE_SIZE) / ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
|
||||
|
||||
const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
|
||||
const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
|
||||
|
||||
ACCUMULATOR_TYPE result[FEATURE_SLICE_SIZE] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL,
|
||||
INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
|
||||
|
||||
#ifdef CHECK_BOUNDRY
|
||||
if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
|
||||
offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef DYNAMIC_KERNEL_DIVIDER
|
||||
uint num_elements = 0;
|
||||
#endif
|
||||
|
||||
const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0);
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
|
||||
for(uint j = 0; j < POOL_SIZE_Y; j++)
|
||||
{
|
||||
int input_offset_y = offset_y + j;
|
||||
bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
|
||||
if(!zero_y)
|
||||
{
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_X)))
|
||||
for(uint i = 0; i < POOL_SIZE_X; i++)
|
||||
{
|
||||
int input_offset_x = offset_x + i;
|
||||
bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
|
||||
if(!zero)
|
||||
{
|
||||
const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
|
||||
|
||||
int4 int_data = vload4(0, (__global int*)(input + input_idx));
|
||||
IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
|
||||
{
|
||||
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
|
||||
}
|
||||
|
||||
#ifdef DYNAMIC_KERNEL_DIVIDER
|
||||
num_elements++;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
|
||||
const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
|
||||
const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
|
||||
const uint num_elements = (hend - offset_y) * (wend - offset_x);
|
||||
#endif
|
||||
#else // !CHECK_BOUNDRY
|
||||
uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
|
||||
for(uint j = 0; j < POOL_SIZE_Y; j++)
|
||||
{
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_X)))
|
||||
for(uint i = 0; i < POOL_SIZE_X; i++)
|
||||
{
|
||||
int4 int_data = vload4(0, (__global int*)(input + input_idx));
|
||||
IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
|
||||
{
|
||||
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
|
||||
}
|
||||
|
||||
input_idx += IN_X_PITCH;
|
||||
}
|
||||
input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
|
||||
}
|
||||
|
||||
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
|
||||
const uint num_elements = POOL_SIZE_X*POOL_SIZE_Y;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
ACTIVATION_VEC16 pool_result;
|
||||
#if defined AVG_POOLING
|
||||
#if ENABLE_ROUND
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
|
||||
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
|
||||
pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
|
||||
#else
|
||||
pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
|
||||
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
|
||||
pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
|
||||
#else
|
||||
pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
|
||||
#endif
|
||||
}
|
||||
#endif // ENABLE_ROUND
|
||||
#else // AVG_POOLING
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
|
||||
pool_result[i] = result[i];
|
||||
}
|
||||
#endif // AVG_POOLING
|
||||
|
||||
OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
|
||||
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
|
||||
FUSED_OPS_PRELOAD;
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
|
||||
#if HAS_FUSED_OPS
|
||||
#if FUSED_OPS_CAN_USE_PRELOAD
|
||||
FUSED_OPS_CALC;
|
||||
#else
|
||||
FUSED_OPS;
|
||||
#endif
|
||||
final_result[i] = FUSED_OPS_RESULT;
|
||||
#else
|
||||
final_result[i] = TO_OUTPUT_TYPE(ACTIVATION(pool_result[i], ACTIVATION_PARAMS));
|
||||
#endif
|
||||
}
|
||||
|
||||
const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
|
||||
|
||||
#if OUTPUT_TYPE_SIZE == 1
|
||||
vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
|
||||
#else
|
||||
*((__global OUT_VEC16*)(output + output_pos)) = final_result;
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef ALIGN_TO
|
||||
#undef AS_TYPE
|
||||
#undef IN_VEC16
|
||||
#undef OUT_VEC16
|
||||
#undef ACTIVATION_VEC16
|
||||
#undef TO_ACTIVATION_VEC16
|
||||
#undef INIT_VAL
|
||||
#undef FEATURE_SLICE_SIZE
|
||||
311
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_zyx_fsv16_imad.cl
vendored
Normal file
311
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_zyx_fsv16_imad.cl
vendored
Normal file
@@ -0,0 +1,311 @@
|
||||
// Copyright (c) 2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "include/include_all.cl"
|
||||
#include "include/data_types.cl"
|
||||
|
||||
#define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple))
|
||||
|
||||
#define AS_TYPE(type, val) CAT(as_, type)(val)
|
||||
#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
|
||||
#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
|
||||
|
||||
#define ACTIVATION_VEC16 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 16)
|
||||
#define TO_ACTIVATION_VEC16 CAT(convert_, ACTIVATION_VEC16)
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
|
||||
#if MAX_POOLING
|
||||
#define INIT_VAL ACCUMULATOR_VAL_MIN
|
||||
#elif AVG_POOLING
|
||||
#define INIT_VAL ACCUMULATOR_VAL_ZERO
|
||||
#else
|
||||
#error
|
||||
#endif
|
||||
|
||||
inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
|
||||
{
|
||||
#if MAX_POOLING
|
||||
return ACCUMULATOR_MAX_FUNC(tmp, in);
|
||||
#elif AVG_POOLING
|
||||
return tmp + in;
|
||||
#endif
|
||||
}
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
|
||||
KERNEL(pooling_gpu_b_fs_zyx_fsv16)(
|
||||
const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
, FUSED_OPS_DECLS
|
||||
#endif
|
||||
)
|
||||
{
|
||||
const uint x = (uint)get_global_id(0);
|
||||
#if OUTPUT_DIMS == 4
|
||||
const uint y = (uint)get_global_id(1);
|
||||
const uint z = 0;
|
||||
#else
|
||||
const uint zy = (uint)get_global_id(1);
|
||||
const uint y = zy % OUTPUT_SIZE_Y;
|
||||
const uint z = zy / OUTPUT_SIZE_Y;
|
||||
#endif
|
||||
const uint bf = (uint)get_global_id(2);
|
||||
const uint f = (bf * FEATURE_SLICE_SIZE) % ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
|
||||
const uint b = (bf * FEATURE_SLICE_SIZE) / ALIGN_TO(INPUT0_FEATURE_NUM, FEATURE_SLICE_SIZE);
|
||||
|
||||
const bool last_in_f_group = (f == FEATURE_SLICE_SIZE * ((INPUT0_FEATURE_NUM - 1) / FEATURE_SLICE_SIZE));
|
||||
|
||||
const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
|
||||
const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
|
||||
const int offset_z = (int)z*STRIDE_SIZE_Z - PADDING_SIZE_Z;
|
||||
|
||||
ACCUMULATOR_TYPE result[FEATURE_SLICE_SIZE] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL,
|
||||
INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
|
||||
|
||||
#ifdef CHECK_BOUNDRY
|
||||
if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
|
||||
offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y ||
|
||||
offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef DYNAMIC_KERNEL_DIVIDER
|
||||
uint num_elements = 0;
|
||||
#endif
|
||||
|
||||
#if INPUT0_DIMS == 4
|
||||
const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0);
|
||||
#else
|
||||
const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0, 0);
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_Z)))
|
||||
for(uint pz = 0; pz < POOL_SIZE_Z; pz++)
|
||||
{
|
||||
int input_offset_z = offset_z + pz;
|
||||
bool zero_z = input_offset_z >= INPUT0_SIZE_Z || input_offset_z < 0;
|
||||
if(!zero_z)
|
||||
{
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
|
||||
for(uint py = 0; py < POOL_SIZE_Y; py++)
|
||||
{
|
||||
int input_offset_y = offset_y + py;
|
||||
bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
|
||||
if(!zero_y)
|
||||
{
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_X)))
|
||||
for(uint px = 0; px < POOL_SIZE_X; px++)
|
||||
{
|
||||
int input_offset_x = offset_x + px;
|
||||
bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
|
||||
if(!zero)
|
||||
{
|
||||
const uint input_idx = batch_and_feature_offset + input_offset_z*IN_Z_PITCH + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
|
||||
IN_VEC16 ch16_data;
|
||||
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
if (!last_in_f_group) {
|
||||
#endif
|
||||
ch16_data = AS_TYPE(IN_VEC16, vload4(0, (__global int*)(input + input_idx)));
|
||||
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
|
||||
ch16_data[k] = input[input_idx + k];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
if (!last_in_f_group) {
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
|
||||
{
|
||||
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
|
||||
}
|
||||
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++)
|
||||
{
|
||||
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_KERNEL_DIVIDER
|
||||
num_elements++;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
|
||||
const int dend = min(offset_z + POOL_SIZE_Z, INPUT0_SIZE_Z + PADDING_SIZE_Z);
|
||||
const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
|
||||
const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
|
||||
const uint num_elements = (dend - offset_z) * (hend - offset_y) * (wend - offset_x);
|
||||
#endif
|
||||
#else // !CHECK_BOUNDRY
|
||||
#if INPUT0_DIMS == 4
|
||||
uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
|
||||
#else
|
||||
uint input_idx = INPUT0_GET_INDEX(b, f, offset_z, offset_y, offset_x);
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_Z)))
|
||||
for(uint pz = 0; pz < POOL_SIZE_Z; pz++)
|
||||
{
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
|
||||
for(uint py = 0; py < POOL_SIZE_Y; py++)
|
||||
{
|
||||
__attribute__((opencl_unroll_hint(POOL_SIZE_X)))
|
||||
for(uint px = 0; px < POOL_SIZE_X; px++)
|
||||
{
|
||||
IN_VEC16 ch16_data;
|
||||
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
if (!last_in_f_group) {
|
||||
#endif
|
||||
ch16_data = AS_TYPE(IN_VEC16, vload4(0, (__global int*)(input + input_idx)));
|
||||
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
|
||||
ch16_data[k] = input[input_idx + k];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
if (!last_in_f_group) {
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < FEATURE_SLICE_SIZE; k++)
|
||||
{
|
||||
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
|
||||
}
|
||||
#if INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < INPUT0_FEATURE_NUM % FEATURE_SLICE_SIZE; k++)
|
||||
{
|
||||
result[k] = FUNC_CALL(apply_pooling)(result[k], ch16_data[k]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
input_idx += IN_X_PITCH;
|
||||
}
|
||||
input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
|
||||
}
|
||||
input_idx += (IN_Z_PITCH - POOL_SIZE_Y*IN_Y_PITCH);
|
||||
}
|
||||
|
||||
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
|
||||
const uint num_elements = POOL_SIZE_X*POOL_SIZE_Y*POOL_SIZE_Z;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
ACTIVATION_VEC16 pool_result;
|
||||
#if defined AVG_POOLING
|
||||
#if ENABLE_ROUND
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
|
||||
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
|
||||
pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
|
||||
#else
|
||||
pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X)));
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for(uint i = 0; i < FEATURE_SLICE_SIZE; i++) {
|
||||
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
|
||||
pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
|
||||
#else
|
||||
pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X);
|
||||
#endif
|
||||
}
|
||||
#endif // ENABLE_ROUND
|
||||
#else // AVG_POOLING
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
|
||||
pool_result[i] = result[i];
|
||||
}
|
||||
#endif // AVG_POOLING
|
||||
|
||||
OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
|
||||
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
|
||||
FUSED_OPS_PRELOAD;
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint(FEATURE_SLICE_SIZE)))
|
||||
for (uint i = 0; i < FEATURE_SLICE_SIZE; ++i) {
|
||||
#if HAS_FUSED_OPS
|
||||
#if FUSED_OPS_CAN_USE_PRELOAD
|
||||
FUSED_OPS_CALC;
|
||||
#else
|
||||
FUSED_OPS;
|
||||
#endif
|
||||
final_result[i] = FUSED_OPS_RESULT;
|
||||
#else
|
||||
final_result[i] = TO_OUTPUT_TYPE(ACTIVATION(pool_result[i], ACTIVATION_PARAMS));
|
||||
#endif
|
||||
}
|
||||
|
||||
#if OUTPUT_DIMS == 4
|
||||
const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
|
||||
#else
|
||||
const uint output_pos = OUTPUT_GET_INDEX(b, f, z, y, x);
|
||||
#endif
|
||||
|
||||
#if OUTPUT_TYPE_SIZE == 1
|
||||
#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
if (!last_in_f_group) {
|
||||
#endif
|
||||
vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
|
||||
#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
|
||||
output[output_pos + k] = final_result[k];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
if (!last_in_f_group) {
|
||||
#endif
|
||||
*((__global OUT_VEC16*)(output + output_pos)) = final_result;
|
||||
#if OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE != 0
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE)))
|
||||
for(uint k = 0; k < OUTPUT_FEATURE_NUM % FEATURE_SLICE_SIZE; k++) {
|
||||
output[output_pos + k] = final_result[k];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef ALIGN_TO
|
||||
#undef AS_TYPE
|
||||
#undef IN_VEC16
|
||||
#undef OUT_VEC16
|
||||
#undef ACTIVATION_VEC16
|
||||
#undef TO_ACTIVATION_VEC16
|
||||
#undef INIT_VAL
|
||||
#undef FEATURE_SLICE_SIZE
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -93,6 +93,10 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x
|
||||
return GET_FILTER_GOIYX(INPUT0, g, o, i, y, x);
|
||||
#elif defined INPUT0_LAYOUT_OS_IS_YX_OSV16_ISV16
|
||||
return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(INPUT0, o, i, y, x);
|
||||
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV16_ISV16
|
||||
return GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(INPUT0, o, i, z, y, x);
|
||||
#elif defined INPUT0_LAYOUT_G_OS_IS_ZYX_OSV16_ISV16
|
||||
return GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(INPUT0, g, o, i, z, y, x);
|
||||
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV32_ISV16
|
||||
return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(INPUT0, o, i, z, y, x);
|
||||
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_OSV64_ISV16
|
||||
@@ -224,6 +228,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
|
||||
return GET_FILTER_G_OS_IS_YX_ISV16_OSV16_INDEX(OUTPUT, g, o, i, y, x, SUB_GROUP_SIZE);
|
||||
#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSV16_ISV16
|
||||
return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(OUTPUT, o, i, y, x);
|
||||
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV16_ISV16
|
||||
return GET_FILTER_OS_IS_ZYX_OSV16_ISV16_INDEX(OUTPUT, o, i, z, y, x);
|
||||
#elif defined OUTPUT_LAYOUT_G_OS_IS_ZYX_OSV16_ISV16
|
||||
return GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(OUTPUT, g, o, i, z, y, x);
|
||||
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV32_ISV16
|
||||
return GET_FILTER_OS_IS_ZYX_OSV32_ISV16_INDEX(OUTPUT, o, i, z, y, x);
|
||||
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSV64_ISV16
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -306,6 +306,7 @@ std::string toString(WeightsLayout layout) {
|
||||
case WeightsLayout::yxio: return "YXIO";
|
||||
case WeightsLayout::os_is_yx_isv16_osv16: return "OS_IS_YX_ISV16_OSV16";
|
||||
case WeightsLayout::os_is_yx_osv16_isv16: return "OS_IS_YX_OSV16_ISV16";
|
||||
case WeightsLayout::os_is_zyx_osv16_isv16: return "OS_IS_ZYX_OSV16_ISV16";
|
||||
case WeightsLayout::os_is_zyx_osv32_isv16: return "OS_IS_ZYX_OSV32_ISV16";
|
||||
case WeightsLayout::os_is_zyx_osv64_isv16: return "OS_IS_ZYX_OSV64_ISV16";
|
||||
case WeightsLayout::os_iyx_osv16: return "OS_IYX_OSV16";
|
||||
@@ -371,6 +372,7 @@ std::string toString(WeightsLayout layout) {
|
||||
case WeightsLayout::gs_oi_yxs_gsv32_yxsv4: return "GS_OI_YXS_GSV32_YXSV4";
|
||||
case WeightsLayout::g_os_is_yx_isv16_osv16: return "G_OS_IS_YX_ISV16_OSV16";
|
||||
case WeightsLayout::g_os_is_yx_osv16_isv4: return "G_OS_IS_YX_OSV16_ISV4";
|
||||
case WeightsLayout::g_os_is_zyx_osv16_isv16: return "G_OS_IS_ZYX_OSV16_ISV16";
|
||||
case WeightsLayout::g_os_zyx_is_osv16_isv4: return "G_OS_ZYX_IS_OSV16_ISV4";
|
||||
case WeightsLayout::g_os_zyx_is_osv16_isv16: return "G_OS_ZYX_IS_OSV16_ISV16";
|
||||
case WeightsLayout::g_os_zyx_is_osv16_isv32: return "G_OS_ZYX_IS_OSV16_ISV32";
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
// Copyright (c) 2016-2018 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -118,7 +118,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
|
||||
// window size spatial Y", filter_size.spatial[1], "First convolution is outside of image. please reduce input
|
||||
// offset Y");
|
||||
|
||||
if (input_layout.format == format::bfzyx) {
|
||||
if (input_layout.format.spatial_num() == 3) {
|
||||
// convolution 3D
|
||||
CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
|
||||
"Stride spatial Z",
|
||||
|
||||
@@ -199,6 +199,8 @@ attach_convolution_gpu::attach_convolution_gpu() {
|
||||
// block i8 format
|
||||
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
|
||||
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
|
||||
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
|
||||
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
|
||||
// MMAD
|
||||
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
|
||||
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
|
||||
|
||||
@@ -110,6 +110,8 @@ attach_scale_gpu::attach_scale_gpu() {
|
||||
|
||||
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv16), val_fw);
|
||||
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv16), val_fw);
|
||||
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
|
||||
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
|
||||
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), val_fw);
|
||||
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), val_fw);
|
||||
implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw);
|
||||
|
||||
@@ -72,12 +72,7 @@ void pre_replace_deconv::run(program_impl& p) {
|
||||
!((_lo.get_optimization_attributes().b_fs_yx_fsv16_network || input_node.get_output_layout().format == format::b_fs_yx_fsv16) &&
|
||||
_lo.is_format_optimized(node->as<deconvolution>(), format::b_fs_yx_fsv16));
|
||||
// int8/uint8 input
|
||||
perform_opt |= (input_node.get_output_layout().data_type == data_types::i8 || input_node.get_output_layout().data_type == data_types::u8) &&
|
||||
// imad convolution kernel limitation for groups
|
||||
(groups == 1 || weights_node.get_output_layout().size.feature[0] % 4 == 0 ||
|
||||
groups == static_cast<uint32_t>(input_node.get_output_layout().size.feature[0])) &&
|
||||
// no uint8/int8 3D convolution support
|
||||
input_node.get_output_layout().format.dimension() == 4;
|
||||
perform_opt |= (input_node.get_output_layout().data_type == data_types::i8 || input_node.get_output_layout().data_type == data_types::u8);
|
||||
|
||||
if (!perform_opt)
|
||||
continue;
|
||||
|
||||
@@ -123,6 +123,7 @@ void prepare_padding::run(program_impl& p) {
|
||||
if (conv_layout.format != cldnn::format::bfyx &&
|
||||
conv_layout.format != cldnn::format::bf8_xy16 &&
|
||||
conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
|
||||
conv_layout.format != cldnn::format::b_fs_zyx_fsv16 &&
|
||||
conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
|
||||
conv_layout.format != cldnn::format::byxf_af32 &&
|
||||
conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
|
||||
@@ -154,11 +155,15 @@ void prepare_padding::run(program_impl& p) {
|
||||
(filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
|
||||
auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] +
|
||||
(filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
|
||||
auto input_limit_z = input_offset.spatial[2] + (conv_layout.size.spatial[2] - 1) * stride.spatial[2] +
|
||||
(filter_layout.size.spatial[2] - 1) * dilation.spatial[2] + 1;
|
||||
|
||||
auto left_padding = std::max(-input_offset.spatial[0], 0);
|
||||
auto top_padding = std::max(-input_offset.spatial[1], 0);
|
||||
auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
|
||||
auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
|
||||
auto padding_begin_x = std::max(-input_offset.spatial[0], 0);
|
||||
auto padding_begin_y = std::max(-input_offset.spatial[1], 0);
|
||||
auto padding_begin_z = std::max(-input_offset.spatial[2], 0);
|
||||
auto padding_end_x = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
|
||||
auto padding_end_y = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
|
||||
auto padding_end_z = std::max(input_limit_z - prev_prim_output_layout.size.spatial[2], 0);
|
||||
|
||||
// Adjust right padding, so entire buffer size in X dimension is properly aligned.
|
||||
// TODO: NOTE: Will be reenabled with next check-in once heuristic for line-aligned algorithm will be added.
|
||||
@@ -166,7 +171,7 @@ void prepare_padding::run(program_impl& p) {
|
||||
// round_up_to(left_padding + prev_prim_output_layout.size.spatial[0] + right_padding, 16));
|
||||
// right_padding = needed_buffer_size_x - left_padding - prev_prim_output_layout.size.spatial[0];
|
||||
|
||||
cldnn::padding needed_padding({0, 0, left_padding, top_padding}, {0, 0, right_padding, bottom_padding}, 0);
|
||||
cldnn::padding needed_padding({0, 0, padding_begin_x, padding_begin_y, padding_begin_z}, {0, 0, padding_end_x, padding_end_y, padding_end_z}, 0);
|
||||
needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
|
||||
p.apply_needed_padding(node, conv_input_node, needed_padding);
|
||||
}
|
||||
@@ -209,13 +214,17 @@ void prepare_padding::run(program_impl& p) {
|
||||
(filter_layout.size.spatial[0] - 1) * dilation.spatial[0] + 1;
|
||||
auto input_limit_y = input_offset.spatial[1] + (conv_layout.size.spatial[1] - 1) * stride.spatial[1] +
|
||||
(filter_layout.size.spatial[1] - 1) * dilation.spatial[1] + 1;
|
||||
auto input_limit_z = input_offset.spatial[2] + (conv_layout.size.spatial[2] - 1) * stride.spatial[2] +
|
||||
(filter_layout.size.spatial[2] - 1) * dilation.spatial[2] + 1;
|
||||
|
||||
auto left_padding = std::max(-input_offset.spatial[0], 0);
|
||||
auto top_padding = std::max(-input_offset.spatial[1], 0);
|
||||
auto right_padding = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
|
||||
auto bottom_padding = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
|
||||
auto padding_begin_x = std::max(-input_offset.spatial[0], 0);
|
||||
auto padding_begin_y = std::max(-input_offset.spatial[1], 0);
|
||||
auto padding_begin_z = std::max(-input_offset.spatial[2], 0);
|
||||
auto padding_end_x = std::max(input_limit_x - prev_prim_output_layout.size.spatial[0], 0);
|
||||
auto padding_end_y = std::max(input_limit_y - prev_prim_output_layout.size.spatial[1], 0);
|
||||
auto padding_end_z = std::max(input_limit_z - prev_prim_output_layout.size.spatial[2], 0);
|
||||
|
||||
cldnn::padding needed_padding({0, 0, left_padding, top_padding}, {0, 0, right_padding, bottom_padding}, 0);
|
||||
cldnn::padding needed_padding({0, 0, padding_begin_x, padding_begin_y, padding_begin_z}, {0, 0, padding_end_x, padding_end_y, padding_end_z}, 0);
|
||||
needed_padding = padding::max(prev_prim_output_layout.data_padding, needed_padding);
|
||||
|
||||
p.apply_needed_padding(node, conv_input_node, needed_padding);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
// Copyright (c) 2017-2019 Intel Corporation
|
||||
// Copyright (c) 2017-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -205,6 +205,8 @@ inline std::string fmt_to_str(format fmt) {
|
||||
return "g_os_is_zyx_isv16_osv16";
|
||||
case format::g_os_is_yx_osv16_isv4:
|
||||
return "g_os_is_yx_osv16_isv4";
|
||||
case format::g_os_is_zyx_osv16_isv16:
|
||||
return "g_os_is_zyx_osv16_isv16";
|
||||
case format::g_os_zyx_is_osv16_isv4:
|
||||
return "g_os_zyx_is_osv16_isv4";
|
||||
case format::g_os_zyx_is_osv16_isv16:
|
||||
|
||||
@@ -327,6 +327,10 @@ kernel_selector::weights_layout to_weights_layout(format f) {
|
||||
return kernel_selector::weights_layout::g_os_is_zyx_isv16_osv16;
|
||||
case format::g_os_is_yx_osv16_isv4:
|
||||
return kernel_selector::weights_layout::g_os_is_yx_osv16_isv4;
|
||||
case format::os_is_zyx_osv16_isv16:
|
||||
return kernel_selector::weights_layout::os_is_zyx_osv16_isv16;
|
||||
case format::g_os_is_zyx_osv16_isv16:
|
||||
return kernel_selector::weights_layout::g_os_is_zyx_osv16_isv16;
|
||||
case format::g_os_zyx_is_osv16_isv4:
|
||||
return kernel_selector::weights_layout::g_os_zyx_is_osv16_isv4;
|
||||
case format::g_os_zyx_is_osv16_isv16:
|
||||
@@ -450,6 +454,10 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
|
||||
return cldnn::format::g_os_is_zyx_isv16_osv16;
|
||||
case kernel_selector::weights_layout::os_is_yx_osv16_isv4:
|
||||
return cldnn::format::g_os_is_yx_osv16_isv4;
|
||||
case kernel_selector::weights_layout::os_is_zyx_osv16_isv16:
|
||||
return cldnn::format::os_is_zyx_osv16_isv16;
|
||||
case kernel_selector::weights_layout::g_os_is_zyx_osv16_isv16:
|
||||
return cldnn::format::g_os_is_zyx_osv16_isv16;
|
||||
case kernel_selector::weights_layout::g_os_zyx_is_osv16_isv4:
|
||||
return cldnn::format::g_os_zyx_is_osv16_isv4;
|
||||
case kernel_selector::weights_layout::g_os_zyx_is_osv16_isv16:
|
||||
|
||||
@@ -375,9 +375,14 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
|
||||
auto ks_x = weights_layout.size.spatial[0];
|
||||
auto ks_y = weights_layout.size.spatial[1];
|
||||
|
||||
size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
|
||||
size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
|
||||
if (weights_layout.format.group_num() > 0) {
|
||||
out_features_per_group = weights_layout.size.batch[0];
|
||||
}
|
||||
|
||||
// Check for non-grouped or depthwise convolution
|
||||
if (input_layout.size.spatial[2] == 1 &&
|
||||
input_layout.size.batch[0] < 16 &&
|
||||
((ks_x == 7 && ks_y == 7) || (ks_x == 3 && ks_y == 3) || (ks_x == 1 && ks_y == 1) || (ks_x == 5 && ks_y == 5)) &&
|
||||
weights_layout.size.batch[0] >= 16 &&
|
||||
((conv->groups == 1 && conv->split() == 1) ||
|
||||
@@ -388,12 +393,17 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
|
||||
return true;
|
||||
// Check for grouped convolution
|
||||
else if (input_layout.size.spatial[2] == 1 && input_layout.size.batch[0] < 16 &&
|
||||
weights_layout.size.batch[0] >= 16 &&
|
||||
((input_layout.size.feature[0] / conv->groups) % 4 == 0) &&
|
||||
((conv->dilation.spatial[0] + 1) * (ks_x - 1)) < 16 &&
|
||||
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
|
||||
return true;
|
||||
|
||||
out_features_per_group >= 16 &&
|
||||
// Need to extend imad fsv4 kernel to handle e.g. 3 input features per group
|
||||
(in_features_per_group % 4 == 0) &&
|
||||
((conv->dilation.spatial[0] + 1) * (ks_x - 1)) <= 16 &&
|
||||
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
|
||||
return true;
|
||||
// Check for fsv16 imad kernel
|
||||
else if ((input_layout.format.dimension() == 4) &&
|
||||
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
|
||||
(!((conv->groups > 1) && (in_features_per_group == 1) && (out_features_per_group == 1))))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
// A set of rules that define when b_fs_yx_fsv16 mem format can be used for fp16/fp32 case
|
||||
@@ -459,6 +469,20 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout
|
||||
(weights_layout.size.batch[0] % 16 == 0 || (weights_layout.size.batch[0] == 8 && conv->groups > 1)) &&
|
||||
conv->dilation == tensor(1))
|
||||
return true;
|
||||
|
||||
size_t in_features_per_group = input_layout.size.feature[0] / conv->groups;
|
||||
size_t out_features_per_group = weights_layout.size.batch[0] / conv->groups;
|
||||
if (weights_layout.format.group_num() > 0) {
|
||||
out_features_per_group = weights_layout.size.batch[0];
|
||||
}
|
||||
|
||||
// Check for fsv16 imad kernel
|
||||
if ((input_layout.format.dimension() == 5) &&
|
||||
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
|
||||
(input_layout.data_type == data_types::i8 || input_layout.data_type == data_types::u8) &&
|
||||
(weights_layout.data_type == data_types::i8 || weights_layout.data_type == data_types::u8) &&
|
||||
(!((conv->groups > 1) && (in_features_per_group == 1) && (out_features_per_group == 1))))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -650,6 +674,9 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
|
||||
} else if ((_optimization_attributes.b_fs_yx_fsv16_network &&
|
||||
convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
|
||||
expected_format = cldnn::format::b_fs_yx_fsv16;
|
||||
} else if ((_optimization_attributes.b_fs_zyx_fsv16_network &&
|
||||
convolution_b_fs_zyx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
|
||||
expected_format = cldnn::format::b_fs_zyx_fsv16;
|
||||
} else {
|
||||
expected_format = imad_case(node);
|
||||
}
|
||||
@@ -807,7 +834,8 @@ format layout_optimizer::get_preferred_format(program_node& node) {
|
||||
layout{ data_types::f32, format::bfyx, tensor{} }).format;
|
||||
} else if (node.is_type<quantize>()) {
|
||||
auto layout = node.get_output_layout();
|
||||
if ((layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
|
||||
if (layout.format.spatial_num() == 2 &&
|
||||
(layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
|
||||
layout.size.batch[0] % 16 == 0)
|
||||
expected = format::b_fs_yx_fsv4;
|
||||
} else if (node.is_type<reorder>() || node.is_type<input_layout>()) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -149,26 +149,32 @@ layout pooling_inst::calc_output_layout(parent::typed_node const& node) {
|
||||
0,
|
||||
"Input offset in batch is not supported");
|
||||
|
||||
if (input_layout.format == format::bfzyx) {
|
||||
if (input_layout.format.spatial_num() == 3) {
|
||||
// 3D
|
||||
CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
|
||||
"stride spatial Z",
|
||||
stride.spatial[1],
|
||||
"",
|
||||
0,
|
||||
"Stride spatial Z must be positive (>= 1)");
|
||||
"stride spatial Z",
|
||||
stride.spatial[1],
|
||||
"",
|
||||
0,
|
||||
"Stride spatial Z must be positive (>= 1)");
|
||||
CLDNN_ERROR_LESS_OR_EQUAL_THAN(node.id(),
|
||||
"window size spatial Z",
|
||||
window_size.spatial[2],
|
||||
"",
|
||||
0,
|
||||
"Size Z (of pooling window) must be positive (>= 1)");
|
||||
"window size spatial Z",
|
||||
window_size.spatial[2],
|
||||
"",
|
||||
0,
|
||||
"Size Z (of pooling window) must be positive (>= 1)");
|
||||
CLDNN_ERROR_GREATER_THAN(node.id(),
|
||||
"Input offset spatial Z",
|
||||
2 * input_offset.spatial[2],
|
||||
"input layout size spatial Z",
|
||||
input_layout.size.spatial[2],
|
||||
"Input offset is greater than input data range. There is no input data to process");
|
||||
"Input offset spatial Z",
|
||||
2 * input_offset.spatial[2],
|
||||
"input layout size spatial Z",
|
||||
input_layout.size.spatial[2],
|
||||
"Input offset is greater than input data range. There is no input data to process");
|
||||
CLDNN_ERROR_GREATER_THAN(node.id(),
|
||||
"Negate input offset spatial Z",
|
||||
-input_offset.spatial[2],
|
||||
"input window size spatial Z",
|
||||
window_size.spatial[2],
|
||||
"First pool is outside of image. please reduce input offset Z");
|
||||
}
|
||||
|
||||
if (desc->with_output_size) {
|
||||
|
||||
@@ -78,83 +78,169 @@ struct convolution_accumulator<uint8_t> {
|
||||
};
|
||||
|
||||
template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT, typename AccT = typename convolution_accumulator<InputT>::type>
|
||||
VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
|
||||
int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
|
||||
int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
|
||||
VVVF<OutputT> reference_convolve(VVVVF<InputT> &input, VVVVF<WeightsT> &filter,
|
||||
int stride_z, int stride_y, int stride_x,
|
||||
float bias,
|
||||
int dilation_z = 1, int dilation_y = 1, int dilation_x = 1,
|
||||
int input_padding_z = 0, int input_padding_y = 0, int input_padding_x = 0,
|
||||
int output_padding_z = 0, int output_padding_y = 0, int output_padding_x = 0,
|
||||
size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
|
||||
const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
|
||||
{
|
||||
size_t kernel_extent_y = dilation_y * (filter[0].size() - 1) + 1;
|
||||
size_t kernel_extent_x = dilation_x * (filter[0][0].size() - 1) + 1;
|
||||
size_t output_y = 1 + (input[0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
|
||||
size_t output_x = 1 + (input[0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
|
||||
size_t kernel_extent_z = dilation_z * (filter[0].size() - 1) + 1;
|
||||
size_t kernel_extent_y = dilation_y * (filter[0][0].size() - 1) + 1;
|
||||
size_t kernel_extent_x = dilation_x * (filter[0][0][0].size() - 1) + 1;
|
||||
|
||||
size_t output_z = 1 + (input[0].size() - kernel_extent_z + 2 * input_padding_z) / stride_z + 2 * output_padding_z;
|
||||
size_t output_y = 1 + (input[0][0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
|
||||
size_t output_x = 1 + (input[0][0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
|
||||
|
||||
bool asymm_data = !data_zp.empty();
|
||||
bool asymm_weights = weights_zp != static_cast<WeightsT>(0);
|
||||
VVF<OutputT> output(output_y, VF<OutputT>(output_x, 0));
|
||||
VVVF<OutputT> output(output_z, VVF<OutputT>(output_y, VF<OutputT>(output_x, 0)));
|
||||
size_t filter_begin = f_begin ? f_begin : 0;
|
||||
size_t filter_end = f_end ? f_end : filter.size();
|
||||
for (size_t f = filter_begin; f < filter_end; ++f) {
|
||||
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
|
||||
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
|
||||
VF<AccT> values;
|
||||
values.reserve(filter[0].size() * filter[0][0].size());
|
||||
for (size_t yf = 0; yf < filter[0].size(); ++yf) {
|
||||
int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
|
||||
bool yi_inside = yi >= 0 && (int)input[0].size() > yi;
|
||||
if (!yi_inside) continue;
|
||||
for (size_t xf = 0; xf < filter[0][0].size(); ++xf) {
|
||||
int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
|
||||
bool xi_inside = xi >= 0 && (int)input[0][0].size() > xi;
|
||||
if (!xi_inside) continue;
|
||||
for (size_t z = 0; z < (output_z - 2 * output_padding_z); ++z) {
|
||||
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
|
||||
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
|
||||
VF<AccT> values;
|
||||
values.reserve(filter[0].size() * filter[0][0].size() * filter[0][0][0].size());
|
||||
for (size_t zf = 0; zf < filter[0].size(); ++zf) {
|
||||
int zi = -input_padding_z + (int)zf * dilation_z + stride_z * (int)z;
|
||||
bool zi_inside = zi >= 0 && (int)input[0].size() > zi;
|
||||
if (!zi_inside) continue;
|
||||
for (size_t yf = 0; yf < filter[0][0].size(); ++yf) {
|
||||
int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
|
||||
bool yi_inside = yi >= 0 && (int)input[0][0].size() > yi;
|
||||
if (!yi_inside) continue;
|
||||
for (size_t xf = 0; xf < filter[0][0][0].size(); ++xf) {
|
||||
int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
|
||||
bool xi_inside = xi >= 0 && (int)input[0][0][0].size() > xi;
|
||||
if (!xi_inside) continue;
|
||||
|
||||
auto input_val = static_cast<AccT>(input[f][yi][xi]);
|
||||
auto input_val = static_cast<AccT>(input[f][zi][yi][xi]);
|
||||
|
||||
if (asymm_data) {
|
||||
input_val = input_val - static_cast<AccT>(data_zp[f]);
|
||||
if (asymm_data) {
|
||||
input_val = input_val - static_cast<AccT>(data_zp[f]);
|
||||
}
|
||||
|
||||
AccT weights_val;
|
||||
if (!depthwise && !grouped) {
|
||||
weights_val = static_cast<AccT>(filter[f][zf][yf][xf]);
|
||||
} else if (grouped) {
|
||||
weights_val = static_cast<AccT>(filter[f - filter_begin][zf][yf][xf]);
|
||||
}
|
||||
else {
|
||||
weights_val = static_cast<AccT>(filter[0][zf][yf][xf]);
|
||||
}
|
||||
|
||||
if (asymm_weights) {
|
||||
weights_val = weights_val - static_cast<AccT>(weights_zp);
|
||||
}
|
||||
|
||||
//std::cout << std::endl << "f=" << f << ", z=" << z << ", y=" << y << ", x=" << x << ", zf=" << zf << ", yf=" << yf << ", xf=" << xf << ": " << (int)input_val << " * " << (int)weights_val;
|
||||
|
||||
values.push_back(input_val * weights_val);
|
||||
}
|
||||
}
|
||||
|
||||
AccT weights_val;
|
||||
if (!depthwise && !grouped) {
|
||||
weights_val = static_cast<AccT>(filter[f][yf][xf]);
|
||||
} else if (grouped) {
|
||||
weights_val = static_cast<AccT>(filter[f - filter_begin][yf][xf]);
|
||||
}
|
||||
else {
|
||||
weights_val = static_cast<AccT>(filter[0][yf][xf]);
|
||||
}
|
||||
|
||||
if (asymm_weights) {
|
||||
weights_val = weights_val - static_cast<AccT>(weights_zp);
|
||||
}
|
||||
|
||||
values.push_back(input_val * weights_val);
|
||||
}
|
||||
output[z + output_padding_z][y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(kahan_summation<AccT>(values));
|
||||
}
|
||||
output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(kahan_summation<AccT>(values));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
|
||||
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
|
||||
output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(bias);
|
||||
for (size_t z = 0; z < (output_z - 2 * output_padding_z); ++z) {
|
||||
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
|
||||
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
|
||||
output[z + output_padding_z][y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(bias);
|
||||
}
|
||||
}
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT, typename AccT = typename convolution_accumulator<InputT>::type>
|
||||
VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
|
||||
int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
|
||||
int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
|
||||
const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
|
||||
{
|
||||
VVVVF<InputT> input_extended(input.size(), VVVF<InputT>(1, VVF<InputT>(input[0].size(), VF<InputT>(input[0][0].size(), 0))));
|
||||
for (size_t fi = 0; fi < input.size(); fi++) {
|
||||
for (size_t yi = 0; yi < input[0].size(); yi++) {
|
||||
for (size_t xi = 0; xi < input[0][0].size(); xi++) {
|
||||
input_extended[fi][0][yi][xi] = input[fi][yi][xi];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
VVVVF<WeightsT> filter_extended(filter.size(), VVVF<WeightsT>(1, VVF<WeightsT>(filter[0].size(), VF<WeightsT>(filter[0][0].size(), 0))));
|
||||
for (size_t fi = 0; fi < filter.size(); fi++) {
|
||||
for (size_t yi = 0; yi < filter[0].size(); yi++) {
|
||||
for (size_t xi = 0; xi < filter[0][0].size(); xi++) {
|
||||
filter_extended[fi][0][yi][xi] = filter[fi][yi][xi];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
VVVF<OutputT> output = reference_convolve<InputT, OutputT, WeightsT, AccT>(input_extended, filter_extended,
|
||||
1, stride_y, stride_x,
|
||||
bias,
|
||||
1, dilation_y, dilation_x,
|
||||
0, input_padding_y, input_padding_x,
|
||||
0, output_padding_y, output_padding_x,
|
||||
f_begin, f_end, depthwise, grouped,
|
||||
data_zp, weights_zp);
|
||||
|
||||
VVF<OutputT> output_shrinked(output[0].size(), VF<OutputT>(output[0][0].size(), 0));
|
||||
|
||||
for (size_t yi = 0; yi < output[0].size(); yi++) {
|
||||
for (size_t xi = 0; xi < output[0][0].size(); xi++) {
|
||||
output_shrinked[yi][xi] = output[0][yi][xi];
|
||||
}
|
||||
}
|
||||
|
||||
return output_shrinked;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
|
||||
VVVF<T> reference_scale_post_op(const VVVF<T>& input, const T& scale, const T& shift) {
|
||||
auto output = input;
|
||||
auto size_y = input.size();
|
||||
auto size_x = input[0].size();
|
||||
for (size_t yi = 0; yi < size_y; ++yi) {
|
||||
for (size_t xi = 0; xi < size_x; ++xi) {
|
||||
output[yi][xi] = output[yi][xi] * scale + shift;
|
||||
auto size_z = input.size();
|
||||
auto size_y = input[0].size();
|
||||
auto size_x = input[0][0].size();
|
||||
for (size_t zi = 0; zi < size_z; ++zi) {
|
||||
for (size_t yi = 0; yi < size_y; ++yi) {
|
||||
for (size_t xi = 0; xi < size_x; ++xi) {
|
||||
output[zi][yi][xi] = output[zi][yi][xi] * scale + shift;
|
||||
}
|
||||
}
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
|
||||
VVVF<T> input_extended(1, VVF<T>(input.size(), VF<T>(input[0].size(), 0)));
|
||||
for (size_t yi = 0; yi < input.size(); yi++) {
|
||||
for (size_t xi = 0; xi < input[0].size(); xi++) {
|
||||
input_extended[0][yi][xi] = input[yi][xi];
|
||||
}
|
||||
}
|
||||
VVVF<T> output = reference_scale_post_op<T>(input_extended, scale, shift);
|
||||
VVF<T> output_shrinked(output[0].size(), VF<T>(output[0][0].size(), 0));
|
||||
for (size_t yi = 0; yi < output[0].size(); yi++) {
|
||||
for (size_t xi = 0; xi < output[0][0].size(); xi++) {
|
||||
output_shrinked[yi][xi] = output[0][yi][xi];
|
||||
}
|
||||
}
|
||||
|
||||
return output_shrinked;
|
||||
}
|
||||
|
||||
void dump_buffer(memory const& mem, std::string const& name)
|
||||
{
|
||||
std::ofstream out(name);
|
||||
@@ -172,14 +258,17 @@ void dump_buffer(memory const& mem, std::string const& name)
|
||||
for (int f = 0; f < size.feature[0]; ++f)
|
||||
{
|
||||
out << "feature " << f << ":\n";
|
||||
for (int y = 0; y < size.spatial[1]; ++y)
|
||||
for (int z = 0; z < size.spatial[2]; ++z)
|
||||
{
|
||||
for (int x = 0; x < size.spatial[0]; ++x)
|
||||
for (int y = 0; y < size.spatial[1]; ++y)
|
||||
{
|
||||
size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + y * pitches.spatial[1] + x * pitches.spatial[0];
|
||||
out << ptr[idx] << " ";
|
||||
for (int x = 0; x < size.spatial[0]; ++x)
|
||||
{
|
||||
size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + z * pitches.spatial[2] + y * pitches.spatial[1] + x * pitches.spatial[0];
|
||||
out << ptr[idx] << " ";
|
||||
}
|
||||
out << "\n";
|
||||
}
|
||||
out << "\n";
|
||||
}
|
||||
|
||||
out << "\n";
|
||||
@@ -4798,15 +4887,18 @@ using TestParamType_convolution_depthwise_gpu = ::testing::tuple<int, // 0 - I
|
||||
bool>; // 6 - With bias
|
||||
|
||||
using TestParamType_grouped_convolution_gpu = ::testing::tuple< int, // 0 - Input X size
|
||||
int, // 1 - Input Y size
|
||||
int, // 2 - Input features
|
||||
int, // 3 - Output features
|
||||
int, // 4 - Kernel sizeX
|
||||
int, // 5 - Kernel sizeY
|
||||
int, // 6 - Groups number
|
||||
int, // 7 - Stride
|
||||
int, // 8 - Batch
|
||||
format>; // 9 - Input data format
|
||||
int, // 1 - Input Y size
|
||||
int, // 2 - Input Z size
|
||||
int, // 3 - Input features
|
||||
int, // 4 - Output features
|
||||
int, // 5 - Kernel sizeX
|
||||
int, // 6 - Kernel sizeY
|
||||
int, // 7 - Kernel sizeZ
|
||||
int, // 8 - Groups number
|
||||
int, // 9 - Stride
|
||||
int, // 10 - Batch
|
||||
format, // 11 - Input data format
|
||||
std::string>; // 12 - Implementation name
|
||||
|
||||
struct convolution_gpu : public ::testing::TestWithParam<TestParamType_convolution_gpu>
|
||||
{
|
||||
@@ -4880,16 +4972,24 @@ struct convolution_grouped_gpu : public ::testing::TestWithParam<TestParamType_g
|
||||
static std::string PrintToStringParamName(
|
||||
testing::TestParamInfo<TestParamType_grouped_convolution_gpu> param_info) {
|
||||
// construct a readable name
|
||||
return "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
|
||||
std::to_string(testing::get<1>(param_info.param)) + "y" +
|
||||
std::to_string(testing::get<2>(param_info.param)) + "f" +
|
||||
"_output" + std::to_string(testing::get<3>(param_info.param)) + "f" +
|
||||
"_filter" + std::to_string(testing::get<4>(param_info.param)) + "x" +
|
||||
std::to_string(testing::get<5>(param_info.param)) + "y" +
|
||||
"_groups" + std::to_string(testing::get<6>(param_info.param)) +
|
||||
"_stride" + std::to_string(testing::get<7>(param_info.param)) +
|
||||
"_batch" + std::to_string(testing::get<8>(param_info.param)) +
|
||||
"_format" + std::to_string(testing::get<9>(param_info.param));
|
||||
std::string res = "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
|
||||
std::to_string(testing::get<1>(param_info.param)) + "y" +
|
||||
std::to_string(testing::get<2>(param_info.param)) + "z" +
|
||||
std::to_string(testing::get<3>(param_info.param)) + "f" +
|
||||
"_output" + std::to_string(testing::get<4>(param_info.param)) + "f" +
|
||||
"_filter" + std::to_string(testing::get<5>(param_info.param)) + "x" +
|
||||
std::to_string(testing::get<6>(param_info.param)) + "y" +
|
||||
std::to_string(testing::get<7>(param_info.param)) + "z" +
|
||||
"_groups" + std::to_string(testing::get<8>(param_info.param)) +
|
||||
"_stride" + std::to_string(testing::get<9>(param_info.param)) +
|
||||
"_batch" + std::to_string(testing::get<10>(param_info.param)) +
|
||||
"_format" + std::to_string(testing::get<11>(param_info.param));
|
||||
|
||||
if (testing::get<12>(param_info.param) != "") {
|
||||
res += "_impl_" + testing::get<12>(param_info.param);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -6957,26 +7057,47 @@ INSTANTIATE_TEST_CASE_P(convolution_depthwise_gpu_bfyx,
|
||||
INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
|
||||
convolution_grouped_gpu,
|
||||
::testing::Values(
|
||||
// Input X size, Input Y size, Input features, Output features, Kernel size X, Kernel size Y,
|
||||
// Groups number, Stride, Output padding, Batch, Input data format
|
||||
// Input X size, Input Y size, Input Z size, Input features, Output features,
|
||||
// Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
|
||||
// Input data format, Implementation name
|
||||
|
||||
// Format: b_fs_yx_fsv4
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv4),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv4),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv4),
|
||||
TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv4),
|
||||
TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv4),
|
||||
TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv4),
|
||||
TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv4),
|
||||
TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv4),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv4, ""),
|
||||
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
|
||||
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv4, ""),
|
||||
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv4, ""),
|
||||
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv4, ""),
|
||||
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),
|
||||
|
||||
// Format: b_fs_yx_fsv16
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv16),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv16),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv16),
|
||||
TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv16),
|
||||
TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv16),
|
||||
TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv16),
|
||||
TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv16),
|
||||
TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv16)
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""),
|
||||
|
||||
// Format: b_fs_zyx_fsv16
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(4, 4, 4, 8, 4, 2, 2, 2, 2, 1, 4, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(8, 8, 8, 16, 16, 4, 4, 4, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(17, 17, 17, 32, 96, 3, 3, 3, 2, 2, 2, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(16, 16, 16, 8, 48, 2, 2, 2, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(3, 3, 3, 48, 96, 2, 2, 2, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(6, 6, 6, 8, 26, 3, 3, 3, 2, 4, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),
|
||||
TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, format::b_fs_zyx_fsv16, "")
|
||||
),
|
||||
convolution_grouped_gpu::PrintToStringParamName);
|
||||
|
||||
@@ -6985,42 +7106,66 @@ TEST_P(convolution_grouped_gpu, base) {
|
||||
|
||||
const int input_x = testing::get<0>(GetParam()),
|
||||
input_y = testing::get<1>(GetParam()),
|
||||
input_f = testing::get<2>(GetParam()),
|
||||
output_f = testing::get<3>(GetParam()),
|
||||
filter_x = testing::get<4>(GetParam()),
|
||||
filter_y = testing::get<5>(GetParam()),
|
||||
groups = testing::get<6>(GetParam()),
|
||||
stride = testing::get<7>(GetParam()),
|
||||
batch_num = testing::get<8>(GetParam()),
|
||||
input_z = testing::get<2>(GetParam()),
|
||||
input_f = testing::get<3>(GetParam()),
|
||||
output_f = testing::get<4>(GetParam()),
|
||||
filter_x = testing::get<5>(GetParam()),
|
||||
filter_y = testing::get<6>(GetParam()),
|
||||
filter_z = testing::get<7>(GetParam()),
|
||||
groups = testing::get<8>(GetParam()),
|
||||
stride = testing::get<9>(GetParam()),
|
||||
batch_num = testing::get<10>(GetParam()),
|
||||
output_padding = 0,
|
||||
input_offset_y = (filter_x - 1) / 2,
|
||||
input_offset_x = (filter_y - 1) / 2;
|
||||
auto input_data_format = testing::get<9>(GetParam());
|
||||
input_offset_z = (filter_z - 1) / 2,
|
||||
input_offset_y = (filter_y - 1) / 2,
|
||||
input_offset_x = (filter_x - 1) / 2;
|
||||
auto input_data_format = testing::get<11>(GetParam());
|
||||
auto impl_name = testing::get<12>(GetParam());
|
||||
|
||||
auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y));
|
||||
auto input_rnd = generate_random_4d<uint8_t>(batch_num, input_f, input_y, input_x, 0, 255);
|
||||
auto input_rnd_vec = flatten_4d<uint8_t>(format::bfyx, input_rnd);
|
||||
auto input = memory::allocate(engine, {data_types::u8, format::bfyx, input_size});
|
||||
set_values(input, input_rnd_vec);
|
||||
auto num_in_spatial_dims = input_data_format.spatial_num();
|
||||
|
||||
auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y));
|
||||
VVVVVF<int8_t> weights_rnd = generate_random_5d<int8_t>(groups, output_f / groups, input_f / groups, filter_y, filter_x, -127, 127);
|
||||
auto weights_lay = layout(data_types::i8, format::goiyx, weights_size);
|
||||
auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z));
|
||||
auto input_rnd = generate_random_5d<uint8_t>(batch_num, input_f, input_z, input_y, input_x, 0, 255);
|
||||
|
||||
auto input_lay = layout(data_types::u8, format::bfzyx, input_size);
|
||||
if (num_in_spatial_dims == 2) {
|
||||
input_lay = layout(data_types::u8, format::bfyx, input_size);
|
||||
}
|
||||
std::vector<uint8_t> input_flat(input_lay.get_linear_size());
|
||||
for (int b = 0; b < batch_num; b++)
|
||||
for (int f = 0; f < input_f; f++)
|
||||
for (int z = 0; z < input_z; z++)
|
||||
for (int y = 0; y < input_y; y++)
|
||||
for (int x = 0; x < input_x; x++) {
|
||||
tensor coords = tensor(batch(b), feature(f), spatial(x, y, z, 0));
|
||||
size_t offset = input_lay.get_linear_offset(coords);
|
||||
input_flat[offset] = input_rnd[b][f][z][y][x];
|
||||
}
|
||||
auto input = memory::allocate(engine, input_lay);
|
||||
set_values(input, input_flat);
|
||||
|
||||
auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y, filter_z));
|
||||
|
||||
VVVVVVF<int8_t> weights_rnd = generate_random_6d<int8_t>(groups, output_f / groups, input_f / groups, filter_z, filter_y, filter_x, -127, 127);
|
||||
auto weights_lay = layout(data_types::i8, format::goizyx, weights_size);
|
||||
if (num_in_spatial_dims == 2) {
|
||||
weights_lay = layout(data_types::i8, format::goiyx, weights_size);
|
||||
}
|
||||
std::vector<int8_t> weights_flat(weights_lay.get_linear_size());
|
||||
for (int gi = 0; gi < groups; ++gi)
|
||||
for (int ofi = 0; ofi < output_f / groups; ++ofi)
|
||||
for (int ifi = 0; ifi < input_f / groups; ++ifi)
|
||||
for (int kyi = 0; kyi < filter_y; ++kyi)
|
||||
for (int kxi = 0; kxi < filter_x; ++kxi) {
|
||||
tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, 0, 0));
|
||||
size_t offset = weights_lay.get_linear_offset(coords);
|
||||
weights_flat[offset] = weights_rnd[gi][ofi][ifi][kyi][kxi];
|
||||
}
|
||||
auto weights = memory::allocate(engine, {data_types::i8, format::goiyx, weights_size});
|
||||
for (int kzi = 0; kzi < filter_z; ++kzi)
|
||||
for (int kyi = 0; kyi < filter_y; ++kyi)
|
||||
for (int kxi = 0; kxi < filter_x; ++kxi) {
|
||||
tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, kzi, 0));
|
||||
size_t offset = weights_lay.get_linear_offset(coords);
|
||||
weights_flat[offset] = weights_rnd[gi][ofi][ifi][kzi][kyi][kxi];
|
||||
}
|
||||
auto weights = memory::allocate(engine, weights_lay);
|
||||
set_values(weights, weights_flat);
|
||||
|
||||
VVVVF<float> expected_result(batch_num, VVVF<float>(output_f));
|
||||
VVVVVF<float> expected_result(batch_num, VVVVF<float>(output_f));
|
||||
|
||||
// Calculate reference values without bias
|
||||
for (int bi = 0; bi < batch_num; ++bi)
|
||||
@@ -7031,15 +7176,15 @@ TEST_P(convolution_grouped_gpu, base) {
|
||||
int f_end = gi * input_f / groups + input_f / groups;
|
||||
|
||||
expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
|
||||
input_rnd[bi], weights_rnd[gi][ofi], // input, weights
|
||||
stride, stride, // strides
|
||||
0, // bias
|
||||
1, 1, // dilation
|
||||
input_offset_y, input_offset_x, // input padding
|
||||
0, 0, // output_padding
|
||||
f_begin, f_end, // f_begin, f_end
|
||||
false, // depthwise
|
||||
grouped); // grouped
|
||||
input_rnd[bi], weights_rnd[gi][ofi], // input, weights
|
||||
stride, stride, stride, // strides
|
||||
0, // bias
|
||||
1, 1, 1, // dilation
|
||||
input_offset_z, input_offset_y, input_offset_x, // input padding
|
||||
0, 0, 0, // output_padding
|
||||
f_begin, f_end, // f_begin, f_end
|
||||
false, // depthwise
|
||||
grouped); // grouped
|
||||
}
|
||||
|
||||
topology topology(input_layout("input", input.get_layout()),
|
||||
@@ -7049,14 +7194,14 @@ TEST_P(convolution_grouped_gpu, base) {
|
||||
"input_fsv",
|
||||
{"weights"},
|
||||
groups,
|
||||
{1, 1, stride, stride},
|
||||
{0, 0, -input_offset_x, -input_offset_y},
|
||||
{1, 1, 1, 1},
|
||||
padding({0, 0, output_padding, output_padding}, 0.f)));
|
||||
tensor(batch(1), feature(1), spatial(stride, stride, stride, 1)),
|
||||
tensor(batch(0), feature(0), spatial(-input_offset_x, -input_offset_y, -input_offset_z, 0)),
|
||||
tensor(batch(1), feature(1), spatial(1, 1, 1, 1)),
|
||||
padding({0, 0, output_padding, output_padding, output_padding}, 0.f)));
|
||||
|
||||
build_options options;
|
||||
options.set_option(build_option::optimize_data(true));
|
||||
implementation_desc conv_impl = {input_data_format, "fused_conv_eltwise_gpu_imad"};
|
||||
implementation_desc conv_impl = {input_data_format, impl_name};
|
||||
options.set_option(build_option::force_implementations({{"conv", conv_impl}}));
|
||||
|
||||
network network(engine, topology, options);
|
||||
@@ -7070,24 +7215,26 @@ TEST_P(convolution_grouped_gpu, base) {
|
||||
ASSERT_EQ(out_mem.get_layout().format, input_data_format);
|
||||
ASSERT_EQ(out_lay.size.batch[0], expected_result.size());
|
||||
ASSERT_EQ(out_lay.size.feature[0], expected_result[0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[2], expected_result[0][0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0][0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0][0].size());
|
||||
|
||||
for (int bi = 0; bi < batch_num; ++bi)
|
||||
for (int ofi = 0; ofi < output_f; ++ofi)
|
||||
for (int yi = 0; yi < (int)expected_result[0][0].size(); ++yi)
|
||||
for (int xi = 0; xi < (int)expected_result[0][0][0].size(); ++xi) {
|
||||
tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, 0, 0));
|
||||
auto offset = out_lay.get_linear_offset(coords);
|
||||
auto val = out_ptr[offset];
|
||||
auto val_ref = expected_result[bi][ofi][yi][xi];
|
||||
auto equal = are_equal(val_ref, val, 1e-2f);
|
||||
if (!equal) {
|
||||
std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
|
||||
std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
|
||||
for (int zi = 0; zi < (int)expected_result[0][0].size(); ++zi)
|
||||
for (int yi = 0; yi < (int)expected_result[0][0][0].size(); ++yi)
|
||||
for (int xi = 0; xi < (int)expected_result[0][0][0][0].size(); ++xi) {
|
||||
tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, zi, 0));
|
||||
auto offset = out_lay.get_linear_offset(coords);
|
||||
auto val = out_ptr[offset];
|
||||
auto val_ref = expected_result[bi][ofi][zi][yi][xi];
|
||||
auto equal = are_equal(val_ref, val, 1e-2f);
|
||||
if (!equal) {
|
||||
std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", z: " << zi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
|
||||
std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", z: " << zi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
|
||||
}
|
||||
EXPECT_TRUE(equal);
|
||||
}
|
||||
EXPECT_TRUE(equal);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputT, typename WeightsT, typename OutputT>
|
||||
|
||||
@@ -86,7 +86,7 @@ struct pooling_accumulator<InputT, pooling_mode::max> {
|
||||
_acc = max(_acc, val);
|
||||
}
|
||||
|
||||
output_t get(size_t /*pool_x*/, size_t /*pool_y*/) {
|
||||
output_t get(size_t /*pool_x*/, size_t /*pool_y*/, size_t /*pool_z*/) {
|
||||
return static_cast<output_t>(_acc);
|
||||
}
|
||||
|
||||
@@ -105,7 +105,7 @@ struct pooling_accumulator<InputT, pooling_mode::average_no_padding> {
|
||||
_acc += static_cast<output_t>(val);
|
||||
}
|
||||
|
||||
output_t get(size_t /*pool_x*/, size_t /*pool_y*/) {
|
||||
output_t get(size_t /*pool_x*/, size_t /*pool_y*/, size_t /*pool_z*/) {
|
||||
return _acc / _cnt;
|
||||
}
|
||||
|
||||
@@ -128,8 +128,8 @@ struct pooling_accumulator<InputT, pooling_mode::average> {
|
||||
_acc += static_cast<output_t>(val);
|
||||
}
|
||||
|
||||
output_t get(size_t pool_x, size_t pool_y) {
|
||||
return static_cast<output_t>(_acc / static_cast<InputT>(pool_x * pool_y));
|
||||
output_t get(size_t pool_x, size_t pool_y, size_t pool_z) {
|
||||
return static_cast<output_t>(_acc / static_cast<InputT>(pool_x * pool_y * pool_z));
|
||||
}
|
||||
|
||||
void reset() {
|
||||
@@ -140,46 +140,59 @@ struct pooling_accumulator<InputT, pooling_mode::average> {
|
||||
};
|
||||
|
||||
template <typename InputT, pooling_mode Mode>
|
||||
VVF<typename pooling_mode_output<InputT, Mode>::type> reference_pooling(const VVF<InputT>& input, size_t pool_x, size_t pool_y, int stride_x, int stride_y, int offset_x, int offset_y) {
|
||||
VVVF<typename pooling_mode_output<InputT, Mode>::type> reference_pooling(const VVVF<InputT>& input, size_t pool_x, size_t pool_y, size_t pool_z, int stride_x, int stride_y, int stride_z, int offset_x, int offset_y, int offset_z) {
|
||||
using output_t = typename pooling_mode_output<InputT, Mode>::type;
|
||||
VVF<output_t> result;
|
||||
auto size_x = input[0].size();
|
||||
auto size_y = input.size();
|
||||
VVVF<output_t> result;
|
||||
auto size_x = input[0][0].size();
|
||||
auto size_y = input[0].size();
|
||||
auto size_z = input.size();
|
||||
|
||||
auto accumulator = pooling_accumulator<InputT, Mode>();
|
||||
|
||||
for (int yi = offset_y; yi + static_cast<int>(pool_y) <= static_cast<int>(size_y) - offset_y; yi += stride_y) {
|
||||
VF<output_t> result_row;
|
||||
for (int xi = offset_x; xi + static_cast<int>(pool_x) <= static_cast<int>(size_x) - offset_x; xi += stride_x) {
|
||||
accumulator.reset();
|
||||
for (int fyi = 0; fyi < static_cast<int>(pool_y); ++fyi) {
|
||||
int index_y = yi + fyi;
|
||||
if (index_y < 0 || index_y >= static_cast<int>(size_y))
|
||||
continue;
|
||||
for (int fxi = 0; fxi < static_cast<int>(pool_x); ++fxi) {
|
||||
int index_x = xi + fxi;
|
||||
if (index_x < 0 || index_x >= static_cast<int>(size_x))
|
||||
for (int zi = offset_z; zi + static_cast<int>(pool_z) <= static_cast<int>(size_z) - offset_z; zi += stride_z) {
|
||||
VVF<output_t> result_matrix;
|
||||
for (int yi = offset_y; yi + static_cast<int>(pool_y) <= static_cast<int>(size_y) - offset_y; yi += stride_y) {
|
||||
VF<output_t> result_row;
|
||||
for (int xi = offset_x; xi + static_cast<int>(pool_x) <= static_cast<int>(size_x) - offset_x; xi += stride_x) {
|
||||
accumulator.reset();
|
||||
for (int fzi = 0; fzi < static_cast<int>(pool_z); ++fzi) {
|
||||
int index_z = zi + fzi;
|
||||
if (index_z < 0 || index_z >= static_cast<int>(size_z))
|
||||
continue;
|
||||
for (int fyi = 0; fyi < static_cast<int>(pool_y); ++fyi) {
|
||||
int index_y = yi + fyi;
|
||||
if (index_y < 0 || index_y >= static_cast<int>(size_y))
|
||||
continue;
|
||||
for (int fxi = 0; fxi < static_cast<int>(pool_x); ++fxi) {
|
||||
int index_x = xi + fxi;
|
||||
if (index_x < 0 || index_x >= static_cast<int>(size_x))
|
||||
continue;
|
||||
|
||||
auto input_val = input[static_cast<size_t>(index_y)][static_cast<size_t>(index_x)];
|
||||
accumulator.accumulate(input_val);
|
||||
auto input_val = input[static_cast<size_t>(index_z)][static_cast<size_t>(index_y)][static_cast<size_t>(index_x)];
|
||||
accumulator.accumulate(input_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
result_row.push_back(accumulator.get(pool_x, pool_y, pool_z));
|
||||
}
|
||||
result_row.push_back(accumulator.get(pool_x, pool_y));
|
||||
result_matrix.emplace_back(std::move(result_row));
|
||||
}
|
||||
result.emplace_back(std::move(result_row));
|
||||
result.emplace_back(std::move(result_matrix));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
|
||||
VVVF<T> reference_scale_post_op(const VVVF<T>& input, const T& scale, const T& shift) {
|
||||
auto output = input;
|
||||
auto size_y = input.size();
|
||||
auto size_x = input[0].size();
|
||||
for (size_t yi = 0; yi < size_y; ++yi) {
|
||||
for (size_t xi = 0; xi < size_x; ++xi) {
|
||||
output[yi][xi] = output[yi][xi] * scale + shift;
|
||||
auto size_z = input.size();
|
||||
auto size_y = input[0].size();
|
||||
auto size_x = input[0][0].size();
|
||||
for (size_t zi = 0; zi < size_z; ++zi) {
|
||||
for (size_t yi = 0; yi < size_y; ++yi) {
|
||||
for (size_t xi = 0; xi < size_x; ++xi) {
|
||||
output[zi][yi][xi] = output[zi][yi][xi] * scale + shift;
|
||||
}
|
||||
}
|
||||
}
|
||||
return output;
|
||||
@@ -2355,7 +2368,7 @@ public:
|
||||
using output_t = typename pooling_mode_output<InputT, Mode>::type;
|
||||
|
||||
virtual topology build_topology(const engine& /*eng*/) {
|
||||
auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y()));
|
||||
auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y(), input_z()));
|
||||
auto input_lay = layout(input_type(),
|
||||
input_format(),
|
||||
input_size);
|
||||
@@ -2365,9 +2378,9 @@ public:
|
||||
pooling("pool",
|
||||
"input",
|
||||
pool_mode(),
|
||||
tensor(batch(0), feature(0), spatial(pool_x(), pool_y())),
|
||||
tensor(batch(0), feature(0), spatial(stride_x(), stride_y())),
|
||||
tensor(batch(0), feature(0), spatial(offset_x(), offset_y())))
|
||||
tensor(batch(0), feature(0), spatial(pool_x(), pool_y(), pool_z())),
|
||||
tensor(batch(0), feature(0), spatial(stride_x(), stride_y(), stride_z())),
|
||||
tensor(batch(0), feature(0), spatial(offset_x(), offset_y(), offset_z())))
|
||||
);
|
||||
return topo;
|
||||
}
|
||||
@@ -2376,7 +2389,8 @@ public:
|
||||
return "pool";
|
||||
}
|
||||
|
||||
virtual void run_expect(const VVVVF<output_t>& expected) {
|
||||
virtual void run_expect(const VVVVVF<output_t>& expected) {
|
||||
|
||||
auto eng = get_test_engine();
|
||||
auto topo = build_topology(eng);
|
||||
auto opts = build_options(
|
||||
@@ -2384,7 +2398,7 @@ public:
|
||||
);
|
||||
auto net = network(eng, topo, opts);
|
||||
|
||||
auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y()));
|
||||
auto input_size = tensor(batch(batch_num()), feature(input_features()), spatial(input_x(), input_y(), input_z()));
|
||||
auto input_lay = layout(input_type(),
|
||||
input_format(),
|
||||
input_size);
|
||||
@@ -2392,12 +2406,13 @@ public:
|
||||
std::vector<InputT> input_flat(input_lay.get_linear_size(), static_cast<InputT>(0));
|
||||
for (size_t bi = 0; bi < batch_num(); ++bi)
|
||||
for (size_t fi = 0; fi < input_features(); ++fi)
|
||||
for (size_t yi = 0; yi < input_y(); ++yi)
|
||||
for (size_t xi = 0; xi < input_x(); ++xi) {
|
||||
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
|
||||
size_t offset = input_lay.get_linear_offset(coords);
|
||||
input_flat[offset] = _input[bi][fi][yi][xi];
|
||||
}
|
||||
for (size_t zi = 0; zi < input_z(); ++zi)
|
||||
for (size_t yi = 0; yi < input_y(); ++yi)
|
||||
for (size_t xi = 0; xi < input_x(); ++xi) {
|
||||
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, 0));
|
||||
size_t offset = input_lay.get_linear_offset(coords);
|
||||
input_flat[offset] = _input[bi][fi][zi][yi][xi];
|
||||
}
|
||||
set_values(input_mem, input_flat);
|
||||
|
||||
net.set_input_data("input", input_mem);
|
||||
@@ -2418,35 +2433,37 @@ public:
|
||||
ASSERT_EQ(out_lay.data_type, output_type());
|
||||
ASSERT_EQ(out_lay.size.batch[0], expected.size());
|
||||
ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[2], expected[0][0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[1], expected[0][0][0].size());
|
||||
ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0][0].size());
|
||||
|
||||
bool compare_with_tolerance = input_type() == data_types::f16;
|
||||
|
||||
for (size_t bi = 0; bi < batch_num(); ++bi)
|
||||
for (size_t fi = 0; fi < expected[0].size(); ++fi)
|
||||
for (size_t yi = 0; yi < expected[0][0].size(); ++yi)
|
||||
for (size_t xi = 0; xi < expected[0][0][0].size(); ++xi) {
|
||||
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
|
||||
size_t offset = out_lay.get_linear_offset(coords);
|
||||
auto ref_val = static_cast<float>(expected[bi][fi][yi][xi]);
|
||||
auto actual_val = static_cast<float>(out_ptr[offset]);
|
||||
if (compare_with_tolerance) {
|
||||
auto tolerance = 1;
|
||||
ASSERT_NEAR(ref_val, actual_val, tolerance)
|
||||
<< "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
|
||||
} else {
|
||||
EXPECT_TRUE(are_equal(ref_val, actual_val))
|
||||
<< "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
|
||||
for (size_t zi = 0; zi < expected[0][0].size(); ++zi)
|
||||
for (size_t yi = 0; yi < expected[0][0][0].size(); ++yi)
|
||||
for (size_t xi = 0; xi < expected[0][0][0][0].size(); ++xi) {
|
||||
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, 0));
|
||||
size_t offset = out_lay.get_linear_offset(coords);
|
||||
auto ref_val = static_cast<float>(expected[bi][fi][zi][yi][xi]);
|
||||
auto actual_val = static_cast<float>(out_ptr[offset]);
|
||||
if (compare_with_tolerance) {
|
||||
auto tolerance = 1;
|
||||
ASSERT_NEAR(ref_val, actual_val, tolerance)
|
||||
<< "at b= " << bi << ", f= " << fi << ", z= " << zi << ", y= " << yi << ", x= " << xi;
|
||||
} else {
|
||||
EXPECT_TRUE(are_equal(ref_val, actual_val))
|
||||
<< "at b= " << bi << ", f= " << fi << ", z= " << zi << ", y= " << yi << ", x= " << xi;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
size_t batch_num() { return _input.size(); }
|
||||
size_t input_features() { return _input[0].size(); }
|
||||
size_t input_x() { return _input[0][0][0].size(); }
|
||||
size_t input_y() { return _input[0][0].size(); }
|
||||
size_t input_x() { return _input[0][0][0][0].size(); }
|
||||
size_t input_y() { return _input[0][0][0].size(); }
|
||||
size_t input_z() { return _input[0][0].size(); }
|
||||
|
||||
format::type input_format() { return _input_fmt; }
|
||||
data_types input_type() {
|
||||
@@ -2460,46 +2477,52 @@ public:
|
||||
pooling_mode pool_mode() { return Mode; }
|
||||
size_t pool_x() { return _pool_x; }
|
||||
size_t pool_y() { return _pool_y; }
|
||||
size_t pool_z() { return _pool_z; }
|
||||
int stride_x() { return _stride_x; }
|
||||
int stride_y() { return _stride_y; }
|
||||
int stride_z() { return _stride_z; }
|
||||
int offset_x() { return _offset_x; }
|
||||
int offset_y() { return _offset_y; }
|
||||
int offset_z() { return _offset_z; }
|
||||
|
||||
void set_input(format::type input_fmt, VVVVF<InputT> input_data) {
|
||||
void set_input(format::type input_fmt, VVVVVF<InputT> input_data) {
|
||||
_input_fmt = input_fmt;
|
||||
_input = std::move(input_data);
|
||||
}
|
||||
|
||||
void set_pool_size(size_t x, size_t y) {
|
||||
void set_pool_size(size_t x, size_t y, size_t z) {
|
||||
_pool_x = x;
|
||||
_pool_y = y;
|
||||
_pool_z = z;
|
||||
}
|
||||
|
||||
void set_strides(int x, int y) {
|
||||
void set_strides(int x, int y, int z) {
|
||||
_stride_x = x;
|
||||
_stride_y = y;
|
||||
_stride_z = z;
|
||||
}
|
||||
|
||||
void set_offsets(int x, int y) {
|
||||
void set_offsets(int x, int y, int z) {
|
||||
_offset_x = x;
|
||||
_offset_y = y;
|
||||
_offset_z = z;
|
||||
}
|
||||
|
||||
VVVVF<InputT> _input;
|
||||
VVVVVF<InputT> _input;
|
||||
format::type _input_fmt;
|
||||
size_t _pool_x, _pool_y;
|
||||
int _stride_x, _stride_y;
|
||||
int _offset_x, _offset_y;
|
||||
size_t _pool_x, _pool_y, _pool_z;
|
||||
int _stride_x, _stride_y, _stride_z;
|
||||
int _offset_x, _offset_y, _offset_z;
|
||||
};
|
||||
|
||||
using pooling_random_test_params = std::tuple<
|
||||
size_t, // batch
|
||||
size_t, // features
|
||||
std::tuple<size_t, size_t>, // input x, y
|
||||
std::tuple<size_t, size_t>, // pool x, y
|
||||
std::tuple<int, int>, // stride x, y
|
||||
std::tuple<int, int>, // offset x, y
|
||||
format::type // input format
|
||||
size_t, // batch
|
||||
size_t, // features
|
||||
std::tuple<size_t, size_t, size_t>, // input x, y, z
|
||||
std::tuple<size_t, size_t, size_t>, // pool x, y, z
|
||||
std::tuple<int, int, int>, // stride x, y, z
|
||||
std::tuple<int, int, int>, // offset x, y, z
|
||||
format::type // input format
|
||||
>;
|
||||
|
||||
template <typename InputT, pooling_mode Mode>
|
||||
@@ -2508,44 +2531,47 @@ public:
|
||||
using parent = pooling_test_base<InputT, Mode>;
|
||||
using output_t = typename parent::output_t;
|
||||
|
||||
virtual VVVVF<output_t> calculate_reference() {
|
||||
VVVVF<output_t> reference(this->batch_num(), VVVF<output_t>(this->input_features()));
|
||||
virtual VVVVVF<output_t> calculate_reference() {
|
||||
VVVVVF<output_t> reference(this->batch_num(), VVVVF<output_t>(this->input_features()));
|
||||
for (size_t bi = 0; bi < this->batch_num(); ++bi) {
|
||||
for (size_t fi = 0; fi < this->input_features(); ++fi) {
|
||||
reference[bi][fi] = reference_pooling<InputT, Mode>(
|
||||
this->_input[bi][fi],
|
||||
this->pool_x(),
|
||||
this->pool_y(),
|
||||
this->pool_z(),
|
||||
this->stride_x(),
|
||||
this->stride_y(),
|
||||
this->stride_z(),
|
||||
this->offset_x(),
|
||||
this->offset_y());
|
||||
this->offset_y(),
|
||||
this->offset_z());
|
||||
}
|
||||
}
|
||||
return reference;
|
||||
}
|
||||
|
||||
virtual void param_set_up(const pooling_random_test_params& params) {
|
||||
size_t b, f, in_x, in_y, p_x, p_y;
|
||||
int s_x, s_y, o_x, o_y;
|
||||
size_t b, f, in_x, in_y, in_z, p_x, p_y, p_z;
|
||||
int s_x, s_y, s_z, o_x, o_y, o_z;
|
||||
format::type in_fmt;
|
||||
|
||||
std::forward_as_tuple(
|
||||
b,
|
||||
f,
|
||||
std::forward_as_tuple(in_x, in_y),
|
||||
std::forward_as_tuple(p_x, p_y),
|
||||
std::forward_as_tuple(s_x, s_y),
|
||||
std::forward_as_tuple(o_x, o_y),
|
||||
std::forward_as_tuple(in_x, in_y, in_z),
|
||||
std::forward_as_tuple(p_x, p_y, p_z),
|
||||
std::forward_as_tuple(s_x, s_y, s_z),
|
||||
std::forward_as_tuple(o_x, o_y, o_z),
|
||||
in_fmt
|
||||
) = params;
|
||||
|
||||
auto input_data = generate_random_4d<InputT>(b, f, in_y, in_x, -256, 256);
|
||||
auto input_data = generate_random_5d<InputT>(b, f, in_z, in_y, in_x, -256, 256);
|
||||
|
||||
this->set_input(in_fmt, std::move(input_data));
|
||||
this->set_pool_size(p_x, p_y);
|
||||
this->set_strides(s_x, s_y);
|
||||
this->set_offsets(o_x, o_y);
|
||||
this->set_pool_size(p_x, p_y, p_z);
|
||||
this->set_strides(s_x, s_y, s_z);
|
||||
this->set_offsets(o_x, o_y, o_z);
|
||||
}
|
||||
|
||||
void run_random(const pooling_random_test_params& params) {
|
||||
@@ -2583,14 +2609,14 @@ TEST_P(pooling_random_test, avg_u8) {
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_low_precision,
|
||||
smoke_low_precision_2d_spatial,
|
||||
pooling_random_test,
|
||||
testing::Combine(testing::Values(1, 2),
|
||||
testing::Values(3, 8, 64),
|
||||
testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
|
||||
testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
|
||||
testing::Values(std::tuple<int, int>(2, 2)),
|
||||
testing::Values(std::tuple<int, int>(0, 0)),
|
||||
testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 1), std::tuple<size_t, size_t, size_t>(24, 24, 1)),
|
||||
testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 1), std::tuple<size_t, size_t, size_t>(2, 2, 1)),
|
||||
testing::Values(std::tuple<int, int, int>(2, 2, 1)),
|
||||
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
|
||||
testing::Values(format::yxfb,
|
||||
format::bfyx,
|
||||
format::byxf_af32,
|
||||
@@ -2599,16 +2625,29 @@ INSTANTIATE_TEST_CASE_P(
|
||||
format::b_fs_yx_fsv32)),
|
||||
testing::internal::DefaultParamName<pooling_random_test_params>);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
smoke_low_precision_3d_spatial,
|
||||
pooling_random_test,
|
||||
testing::Combine(testing::Values(1, 2),
|
||||
testing::Values(3, 8, 64),
|
||||
testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 12), std::tuple<size_t, size_t, size_t>(24, 24, 24)),
|
||||
testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 4), std::tuple<size_t, size_t, size_t>(2, 2, 2)),
|
||||
testing::Values(std::tuple<int, int, int>(2, 2, 2)),
|
||||
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
|
||||
testing::Values(format::bfzyx,
|
||||
format::b_fs_zyx_fsv16)),
|
||||
testing::internal::DefaultParamName<pooling_random_test_params>);
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
batched_low_precision,
|
||||
pooling_random_test,
|
||||
testing::Combine(
|
||||
testing::Values(16),
|
||||
testing::Values(16, 32),
|
||||
testing::Values(std::tuple<size_t, size_t>(3, 3), std::tuple<size_t, size_t>(8, 8)),
|
||||
testing::Values(std::tuple<size_t, size_t>(1, 1), std::tuple<size_t, size_t>(3, 3)),
|
||||
testing::Values(std::tuple<int, int>(1, 1)),
|
||||
testing::Values(std::tuple<int, int>(0, 0)),
|
||||
testing::Values(std::tuple<size_t, size_t, size_t>(3, 3, 1), std::tuple<size_t, size_t, size_t>(8, 8, 1)),
|
||||
testing::Values(std::tuple<size_t, size_t, size_t>(1, 1, 1), std::tuple<size_t, size_t, size_t>(3, 3, 1)),
|
||||
testing::Values(std::tuple<int, int, int>(1, 1, 1)),
|
||||
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
|
||||
testing::Values(format::bs_fs_yx_bsv16_fsv16)
|
||||
),
|
||||
testing::internal::DefaultParamName<pooling_random_test_params>);
|
||||
@@ -2622,7 +2661,7 @@ public:
|
||||
topology build_topology(const engine& eng) override {
|
||||
topology topo = parent::build_topology(eng);
|
||||
|
||||
auto scale_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1)));
|
||||
auto scale_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1, 1, 1)));
|
||||
auto scale_mem = memory::allocate(eng, scale_lay);
|
||||
auto shift_mem = memory::allocate(eng, scale_lay);
|
||||
set_values(scale_mem, _scale);
|
||||
@@ -2640,7 +2679,7 @@ public:
|
||||
return "scale_wa_out";
|
||||
}
|
||||
|
||||
VVVVF<output_t> calculate_reference() override {
|
||||
VVVVVF<output_t> calculate_reference() override {
|
||||
auto expected = parent::calculate_reference();
|
||||
|
||||
for (size_t bi = 0; bi < this->batch_num(); ++bi)
|
||||
@@ -2688,10 +2727,10 @@ INSTANTIATE_TEST_CASE_P(
|
||||
pooling_random_test_fp16_fp32,
|
||||
testing::Combine(testing::Values(1, 2),
|
||||
testing::Values(3, 8),
|
||||
testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
|
||||
testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
|
||||
testing::Values(std::tuple<int, int>(2, 2)),
|
||||
testing::Values(std::tuple<int, int>(0, 0)),
|
||||
testing::Values(std::tuple<size_t, size_t, size_t>(12, 12, 1), std::tuple<size_t, size_t, size_t>(24, 24, 1)),
|
||||
testing::Values(std::tuple<size_t, size_t, size_t>(4, 4, 1), std::tuple<size_t, size_t, size_t>(2, 2, 1)),
|
||||
testing::Values(std::tuple<int, int, int>(2, 2, 1)),
|
||||
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
|
||||
testing::Values(format::yxfb,
|
||||
format::bfyx,
|
||||
format::byxf,
|
||||
|
||||
Reference in New Issue
Block a user