[IE CLDNN] Add asymmetric quantization support to fsv16 imad general convolution kernel (#2778)

This commit is contained in:
Jedrzej Hajduczenia 2020-11-04 15:31:40 +01:00 committed by GitHub
parent 9c509e5f41
commit fbae10a235
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 448 additions and 104 deletions

View File

@ -320,6 +320,9 @@ ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
k.EnableBatching();
k.EnableGroupedConvolution();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.EnableDilation();
k.DisableTuning();
return k;
@ -422,11 +425,31 @@ bool Convolution_kernel_b_fs_zyx_fsv16_imad::Validate(const Params& params, cons
}
KernelData kd = KernelData::Default<convolution_params>(params);
convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
convolution_params& conv_params = *static_cast<convolution_params*>(kd.params.get());
if (newParams.split != 1)
if (conv_params.split != 1)
return false;
if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS) {
if ((conv_params.activations_zero_points.empty() || conv_params.weights_zero_points.empty()) &&
(conv_params.compensation.empty()))
return false;
}
else if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA) {
if ((conv_params.activations_zero_points.empty()) &&
(conv_params.compensation.empty()))
return false;
}
else if (conv_params.quantization == QuantizationType::ASYMMETRIC_WEIGHTS) {
if (conv_params.weights_zero_points.empty())
return false;
} else {
if (!conv_params.activations_zero_points.empty() ||
!conv_params.weights_zero_points.empty() ||
!conv_params.compensation.empty())
return false;
}
return true;
}
} // namespace kernel_selector

View File

@ -18,10 +18,42 @@
#include "include/mmad.cl"
#include "include/data_types.cl"
// Token-pasting helpers. Each public macro forwards to an underscore-suffixed
// twin so that macro arguments are expanded before they are pasted together.
//   TYPE_N(base, width)           -> <base><width>
//   AS_TYPE_N(base, width, value) -> as_<base><width>(value)
#define TYPE_N_(base, width)             base##width
#define TYPE_N(base, width)              TYPE_N_(base, width)
#define AS_TYPE_N_(base, width, value)   as_##base##width(value)
#define AS_TYPE_N(base, width, value)    AS_TYPE_N_(base, width, value)
// 4-wide forms built from INPUT0_TYPE (not defined in this listing; presumably
// supplied by the kernel build environment - confirm).
#define INPUT0_TYPE_4                    TYPE_N(INPUT0_TYPE, 4)
#define AS_INPUT0_TYPE_4(value)          AS_TYPE_N(INPUT0_TYPE, 4, value)
// Feature flags derived from build-time definitions. Order matters: each flag
// below is computed from the ones defined before it.

// NON_ZERO_INPUT0_PAD_BEFORE: set when any of the X/Y/Z "pad before" sizes of
// input 0 is non-zero.
#if INPUT0_PAD_BEFORE_SIZE_X != 0 || \
INPUT0_PAD_BEFORE_SIZE_Y != 0 || \
INPUT0_PAD_BEFORE_SIZE_Z != 0
#define NON_ZERO_INPUT0_PAD_BEFORE
#endif
// SHOULD_BALANCE_COMPENSATION: set when there is no COMPENSATION_TERM at all, or
// when a COMPENSATION_TERM exists together with non-zero "pad before" sizes.
// NOTE(review): presumably the padded case is included because a precomputed
// compensation does not match what the kernel reads on padded positions - confirm.
#if !defined COMPENSATION_TERM || \
(defined COMPENSATION_TERM && defined NON_ZERO_INPUT0_PAD_BEFORE)
#define SHOULD_BALANCE_COMPENSATION
#endif
// SHOULD_USE_DATA_ZP: asymmetric data quantization plus the flag above. In the
// kernel body this guards the activations_zp loads and the substitution of the
// zero point for input positions that fall outside the input extents.
#if defined ASYMMETRIC_DATA_QUANTIZATION && defined SHOULD_BALANCE_COMPENSATION
#define SHOULD_USE_DATA_ZP
#endif
// SHOULD_USE_DATA_AND_WEIGHTS_ZP: same as SHOULD_USE_DATA_ZP but additionally
// requires asymmetric weights quantization. In the kernel body it guards the
// computation of the dotProdAZPxWZP products.
#if defined ASYMMETRIC_DATA_QUANTIZATION && \
defined ASYMMETRIC_WEIGHTS_QUANTIZATION && \
defined SHOULD_BALANCE_COMPENSATION
#define SHOULD_USE_DATA_AND_WEIGHTS_ZP
#endif
// 4-wide accumulator vector type; only defined when dotProdAZPxWZP is needed.
#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
#define ACCUMULATOR_TYPE_4 TYPE_N(ACCUMULATOR_TYPE, 4)
#endif
// 16-wide filter vector type; the kernel body casts a weights zero point to it
// before reinterpreting the result as uint4.
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
#define FILTER_TYPE_16 TYPE_N(FILTER_TYPE, 16)
#endif
// Expands, through AS_TYPE_N, to as_<FILTER_TYPE>4(value).
#define AS_FILTER_TYPE_4(value)   AS_TYPE_N(FILTER_TYPE, 4, value)
// Integer division of num by den rounded up (for non-negative num and positive den).
// Note: den appears twice in the expansion, so it is evaluated twice.
#define CEIL_DIV(num, den)        (((num) + (den) - 1)/(den))
@ -41,6 +73,15 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#if BIAS_TERM
const __global BIAS_TYPE *biases,
#endif
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
const __global WEIGHTS_ZERO_POINTS_TYPE *weights_zp,
#endif
#ifdef ASYMMETRIC_DATA_QUANTIZATION
const __global ACTIVATIONS_ZERO_POINTS_TYPE *activations_zp,
#endif
#ifdef COMPENSATION_TERM
const __global COMPENSATION_TYPE *compensation,
#endif
#if HAS_FUSED_OPS_DECLS
FUSED_OPS_DECLS,
#endif
@ -92,8 +133,67 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
uint4 input_val[IN_BLOCK_DEPTH][IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];
#ifdef SHOULD_USE_DATA_ZP
uint data_zp_idx = g * FILTER_IFM_NUM + in_f_start;
uint4 data_zp_val;
#endif
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
uint4 weights_zp_val[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
}
#if FILTER_IFM_NUM % FSV != 0
uint4 weights_zp_vec_partial[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
__attribute__((opencl_unroll_hint))
for (uint f = FILTER_IFM_NUM % FSV; f < FSV; f++) {
wzp_p[f] = 0;
}
}
#endif
#endif
__attribute__((opencl_unroll_hint(1)))
for (uint k = 0; k < CEIL_DIV(FILTER_IFM_NUM, FSV) / FEATURE_SLM_SPLIT; k++) {
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
#if FILTER_IFM_NUM % FSV != 0
if (in_f_start + (k + 1) * FSV >= ALIGN(FILTER_IFM_NUM, FSV)) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
}
}
#endif
#endif
#ifdef SHOULD_USE_DATA_ZP
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
data_zp_val = as_uint4(vload16(0, activations_zp + data_zp_idx));
#else
data_zp_val = vload4(0, (__global uint *)(activations_zp + data_zp_idx));
#endif
#endif
#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
dotProdAZPxWZP[ofb] = 0;
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ive++) {
dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAZPxWZP[ofb][ive],
AS_INPUT0_TYPE_4(data_zp_val[ive]),
AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
}
}
#endif
__attribute__((opencl_unroll_hint(1)))
for (uint fzn = 0; fzn < FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL; fzn++) {
__attribute__((opencl_unroll_hint(1)))
@ -106,48 +206,103 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
__attribute__((opencl_unroll_hint))
for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
#ifdef SHOULD_USE_DATA_ZP
const int y_idx = input_y + fyn * DILATION_SIZE_Y + iyb;
const int z_idx = input_z + fzn * DILATION_SIZE_Z + izb;
#endif
if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
#ifdef SHOULD_USE_DATA_ZP
const int x_idx = input_x + ixb * SIMD + get_sub_group_local_id();
const bool input_on_padding = (((x_idx < 0) || (x_idx >= INPUT0_SIZE_X)) ||
((y_idx < 0) || (y_idx >= INPUT0_SIZE_Y)) ||
((z_idx < 0) || (z_idx >= INPUT0_SIZE_Z)));
#endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
if (in_f_offset == 0) {
#endif
input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding) {
input_val[izb][iyb][ixb] = data_zp_val;
} else {
#endif
input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
#ifdef SHOULD_USE_DATA_ZP
}
#endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
} else {
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
#ifdef SHOULD_USE_DATA_ZP
INPUT0_TYPE* input_zp_int8_arr = (INPUT0_TYPE*) &data_zp_val;
#endif
__attribute__((opencl_unroll_hint(FSV)))
for (uint v = 0; v < FSV; v++) {
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
} else {
const uint addr = input_idx + get_sub_group_local_id() * FSV + v +
((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
input_int8_arr[v] = conv_input[addr];
}
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding) {
input_int8_arr[v] = input_zp_int8_arr[v];
} else {
#endif
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
} else {
const uint addr = input_idx + get_sub_group_local_id() * FSV + v +
((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
input_int8_arr[v] = conv_input[addr];
}
#ifdef SHOULD_USE_DATA_ZP
}
#endif
}
}
#endif
} else {
#ifdef SHOULD_USE_DATA_ZP
const int x_idx = input_x + ixb * SIMD + tmp;
const bool input_on_padding = (((x_idx < 0) || (x_idx >= INPUT0_SIZE_X)) ||
((y_idx < 0) || (y_idx >= INPUT0_SIZE_Y)) ||
((z_idx < 0) || (z_idx >= INPUT0_SIZE_Z)));
#endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
if (in_f_offset == 0) {
#endif
input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding) {
input_val[izb][iyb][ixb] = data_zp_val;
} else {
#endif
input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + tmp * FSV));
#ifdef SHOULD_USE_DATA_ZP
}
#endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
} else {
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
#ifdef SHOULD_USE_DATA_ZP
INPUT0_TYPE* input_zp_int8_arr = (INPUT0_TYPE*) &data_zp_val;
#endif
__attribute__((opencl_unroll_hint(FSV)))
for (uint v = 0; v < FSV; v++) {
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
} else {
const uint addr = input_idx + tmp * FSV + v +
((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
input_int8_arr[v] = conv_input[addr];
}
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding) {
input_int8_arr[v] = input_zp_int8_arr[v];
} else {
#endif
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
} else {
const uint addr = input_idx + tmp * FSV + v +
((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
input_int8_arr[v] = conv_input[addr];
}
#ifdef SHOULD_USE_DATA_ZP
}
#endif
}
}
#endif
@ -173,6 +328,14 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
for (uint ive = 0; ive < 4; ive++) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
#ifdef SHOULD_USE_DATA_ZP
ACCUMULATOR_TYPE dotProdAZPxW = 0;
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAZPxW,
AS_INPUT0_TYPE_4(data_zp_val[ive]),
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
#endif
__attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
@ -185,11 +348,32 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
const uint shuffle_wi = x_block_idx % SIMD;
const uint shuffle_idx = x_block_idx / SIMD;
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
shuffle_wi));
dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
IMAD(dotProd[ofb][od][oh][ow],
AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
shuffle_wi)),
inputs,
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
ACCUMULATOR_TYPE dotProdAxWZP = 0;
dotProdAxWZP = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAxWZP,
inputs,
AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
dotProd[ofb][od][oh][ow] -= dotProdAxWZP;
#endif
#if !defined COMPENSATION_TERM && defined ASYMMETRIC_DATA_QUANTIZATION
dotProd[ofb][od][oh][ow] -= dotProdAZPxW;
#endif
#if (!defined COMPENSATION_TERM && \
defined ASYMMETRIC_DATA_QUANTIZATION && \
defined ASYMMETRIC_WEIGHTS_QUANTIZATION)
dotProd[ofb][od][oh][ow] += dotProdAZPxWZP[ofb][ive];
#endif
}
}
}
@ -207,6 +391,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL) * DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV;
filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * (FEATURE_SLM_SPLIT - 1);
#ifdef SHOULD_USE_DATA_ZP
data_zp_idx += FSV;
#endif
}
#if FEATURE_SLM_SPLIT != 1
@ -339,6 +527,14 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
#endif
#ifdef COMPENSATION_TERM
COMPENSATION_TYPE comp[OFM_VALUES_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
comp[ofb] = compensation[out_f + ofb * SIMD];
}
#endif
ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
@ -351,6 +547,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
dequantized[ofb][od][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][od][oh][ow]);
#if BIAS_TERM
dequantized[ofb][od][oh][ow] += bias[ofb];
#endif
#ifdef COMPENSATION_TERM
dequantized[ofb][od][oh][ow] += comp[ofb];
#endif
}
}
@ -498,9 +697,38 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#endif
}
// Undefine the helper macros introduced at the top of this file.
// Each name is listed exactly once. #undef of a name that is not currently
// defined is ignored by the preprocessor, so the conditionally defined
// macros need no #ifdef guard.

// Type / reinterpretation helpers.
#undef TYPE_N_
#undef TYPE_N
#undef AS_TYPE_N
#undef AS_TYPE_N_
#undef INPUT0_TYPE_4
#undef AS_INPUT0_TYPE_4

// Quantization feature flags (only defined for some build configurations).
#undef NON_ZERO_INPUT0_PAD_BEFORE
#undef SHOULD_BALANCE_COMPENSATION
#undef SHOULD_USE_DATA_ZP
#undef SHOULD_USE_DATA_AND_WEIGHTS_ZP

// Vector types that exist only together with the flags above.
#undef ACCUMULATOR_TYPE_4
#undef FILTER_TYPE_16

#undef AS_FILTER_TYPE_4
#undef CEIL_DIV

View File

@ -193,7 +193,9 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
if (next.is_type<convolution>() &&
fmt_prev == format::bfyx &&
fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] <= 4)
fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] <= 4 &&
next.as<convolution>().get_primitive()->activations_zero_points.empty() &&
next.as<convolution>().get_primitive()->weights_zero_points.empty())
return true;
if (next.is_type<convolution>() &&
@ -366,9 +368,7 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
weights_layout.size.batch[0] >= 16 &&
((conv->groups == 1 && conv->split() == 1) ||
conv->groups == static_cast<uint32_t>(input_layout.size.feature[0]) ||
conv->split() == static_cast<int32_t>(input_layout.size.feature[0])) &&
((conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) ||
(input_layout.size.feature[0] <= 4))) // only bfyx -> fsv16 kernel supports asymmetric quantization in fsv16 format
conv->split() == static_cast<int32_t>(input_layout.size.feature[0])))
return true;
// Check for grouped convolution
else if (input_layout.format.dimension() == 4 && input_layout.size.batch[0] < 16 &&
@ -380,7 +380,6 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
return true;
// Check for fsv16 imad kernel
else if ((input_layout.format.dimension() == 4) &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
((in_features_per_group > 8) || (out_features_per_group >= 4)))
return true;
return false;
@ -447,7 +446,6 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout
// Check for fsv16 imad kernel
if ((input_layout.format.dimension() == 5) &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
(input_layout.data_type == data_types::i8 || input_layout.data_type == data_types::u8) &&
(weights_layout.data_type == data_types::i8 || weights_layout.data_type == data_types::u8) &&
((in_features_per_group > 8) || (out_features_per_group >= 4)))

View File

@ -4894,9 +4894,12 @@ using TestParamType_grouped_convolution_gpu = ::testing::tuple< int, // 0 -
int, // 7 - Kernel sizeZ
int, // 8 - Groups number
int, // 9 - Stride
int, // 10 - Batch
format, // 11 - Input data format
std::string>; // 12 - Implementation name
int, // 10 - Batch
bool, // 11 - Zero points for activations
bool, // 12 - Zero points for weights
bool, // 13 - Compensation
format, // 14 - Input data format
std::string>; // 15 - Implementation name
using TestParamType_general_convolution_gpu = ::testing::tuple< int, // 0 - Input X size
int, // 1 - Input Y size
@ -4996,10 +4999,13 @@ struct convolution_grouped_gpu : public ::testing::TestWithParam<TestParamType_g
"_groups" + std::to_string(testing::get<8>(param_info.param)) +
"_stride" + std::to_string(testing::get<9>(param_info.param)) +
"_batch" + std::to_string(testing::get<10>(param_info.param)) +
"_format" + std::to_string(testing::get<11>(param_info.param));
"_data_zp" + std::to_string(testing::get<11>(param_info.param)) +
"_weights_zp" + std::to_string(testing::get<12>(param_info.param)) +
"_comp" + std::to_string(testing::get<13>(param_info.param)) +
"_format" + std::to_string(testing::get<14>(param_info.param));
if (testing::get<12>(param_info.param) != "") {
res += "_impl_" + testing::get<12>(param_info.param);
if (testing::get<15>(param_info.param) != "") {
res += "_impl_" + testing::get<15>(param_info.param);
}
return res;
@ -7205,57 +7211,60 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
::testing::Values(
// Input X size, Input Y size, Input Z size, Input features, Output features,
// Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
// Activation zero points, Weights zero points, Compensation,
// Input data format, Implementation name
// Format: b_fs_yx_fsv4
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, false, false, false, format::b_fs_yx_fsv4, ""),
// Format: b_fs_yx_fsv16
TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, true, true, true, format::b_fs_yx_fsv16, ""),
// Format: b_fs_zyx_fsv16
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 4, 8, 4, 2, 2, 2, 2, 1, 4, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 8, 16, 16, 4, 4, 4, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(17, 17, 17, 32, 96, 3, 3, 3, 2, 2, 2, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(16, 16, 16, 8, 48, 2, 2, 2, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 3, 48, 96, 2, 2, 2, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(6, 6, 6, 8, 26, 3, 3, 3, 2, 4, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 3, 4, 3, 18, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32"),
TestParamType_grouped_convolution_gpu(79, 224, 224, 3, 64, 3, 3, 3, 1, 2, 1, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32")
TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, false, false, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, false, true, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, false, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, true, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, false, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, false, false, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, false, true, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, false, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, true, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, false, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 3, 4, 3, 18, 3, 3, 3, 1, 1, 1, false, false, false, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32"),
TestParamType_grouped_convolution_gpu(79, 224, 224, 3, 64, 3, 3, 3, 1, 2, 1, false, false, false, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32")
),
convolution_grouped_gpu::PrintToStringParamName);
@ -7273,23 +7282,28 @@ TEST_P(convolution_grouped_gpu, base) {
groups = testing::get<8>(GetParam()),
stride = testing::get<9>(GetParam()),
batch_num = testing::get<10>(GetParam()),
output_padding = 0,
input_offset_z = (filter_z - 1) / 2,
input_offset_y = (filter_y - 1) / 2,
input_offset_x = (filter_x - 1) / 2;
auto input_data_format = testing::get<11>(GetParam());
auto impl_name = testing::get<12>(GetParam());
const auto has_input_zp = testing::get<11>(GetParam());
const auto has_weights_zp = testing::get<12>(GetParam());
const auto has_comp = testing::get<13>(GetParam());
const auto input_data_format = testing::get<14>(GetParam());
const auto impl_name = testing::get<15>(GetParam());
// can use compensation term only if data zero points are available
ASSERT_TRUE(has_input_zp || !has_comp);
auto num_in_spatial_dims = input_data_format.spatial_num();
auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z));
auto input_rnd = generate_random_5d<uint8_t>(batch_num, input_f, input_z, input_y, input_x, 0, 255);
auto input_rnd = generate_random_5d<int8_t>(batch_num, input_f, input_z, input_y, input_x, -127, 127);
auto input_lay = layout(data_types::u8, format::bfzyx, input_size);
auto input_lay = layout(data_types::i8, format::bfzyx, input_size);
if (num_in_spatial_dims == 2) {
input_lay = layout(data_types::u8, format::bfyx, input_size);
input_lay = layout(data_types::i8, format::bfyx, input_size);
}
std::vector<uint8_t> input_flat(input_lay.get_linear_size());
std::vector<int8_t> input_flat(input_lay.get_linear_size());
for (int b = 0; b < batch_num; b++)
for (int f = 0; f < input_f; f++)
for (int z = 0; z < input_z; z++)
@ -7302,6 +7316,16 @@ TEST_P(convolution_grouped_gpu, base) {
auto input = memory::allocate(engine, input_lay);
set_values(input, input_flat);
auto input_zp_rnd = std::vector<int8_t>(input_f);
auto input_zp_prim_name = std::vector<primitive_id>(0);
if (has_input_zp) {
input_zp_rnd = generate_random_1d<int8_t>(input_f, -127, 127);
input_zp_prim_name = { "input_zp" };
}
auto input_zp_lay = layout(data_types::i8, format::bfyx, tensor(feature(input_f)));
auto input_zp = memory::allocate(engine, input_zp_lay);
set_values(input_zp, input_zp_rnd);
auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y, filter_z));
VVVVVVF<int8_t> weights_rnd = generate_random_6d<int8_t>(groups, output_f / groups, input_f / groups, filter_z, filter_y, filter_x, -127, 127);
@ -7323,6 +7347,16 @@ TEST_P(convolution_grouped_gpu, base) {
auto weights = memory::allocate(engine, weights_lay);
set_values(weights, weights_flat);
auto weights_zp_rnd = std::vector<int8_t>(output_f);
auto weights_zp_prim_name = std::vector<primitive_id>(0);
if (has_weights_zp) {
weights_zp_rnd = generate_random_1d<int8_t>(output_f, -127, 127);
weights_zp_prim_name = { "weights_zp" };
}
auto weights_zp_lay = layout(data_types::i8, format::bfyx, tensor(batch(output_f)));
auto weights_zp = memory::allocate(engine, weights_zp_lay);
set_values(weights_zp, weights_zp_rnd);
VVVVVF<float> expected_result(batch_num, VVVVF<float>(output_f));
// Calculate reference values without bias
@ -7333,36 +7367,94 @@ TEST_P(convolution_grouped_gpu, base) {
int f_begin = gi * input_f / groups;
int f_end = gi * input_f / groups + input_f / groups;
expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
input_rnd[bi], weights_rnd[gi][ofi], // input, weights
stride, stride, stride, // strides
0, // bias
1, 1, 1, // dilation
input_offset_z, input_offset_y, input_offset_x, // input padding
0, 0, 0, // output_padding
f_begin, f_end, // f_begin, f_end
false, // depthwise
grouped); // grouped
expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<int8_t, float, int8_t>(
input_rnd[bi], weights_rnd[gi][ofi], // input, weights
stride, stride, stride, // strides
0, // bias
1, 1, 1, // dilation
input_offset_z, input_offset_y, input_offset_x, // input padding
0, 0, 0, // output_padding
f_begin, f_end, // f_begin, f_end
false, // depthwise
grouped, // grouped
input_zp_rnd, // input zero points
weights_zp_rnd[gi * (int)weights_rnd[0].size() + ofi]); // weights zero points
}
auto ref_conv_out_size = tensor(batch(expected_result.size()),
feature(expected_result[0].size()),
spatial(expected_result[0][0][0][0].size(),
expected_result[0][0][0].size(),
expected_result[0][0].size()));
auto comp_val = std::vector<float>(output_f);
auto comp_prim_name = std::vector<primitive_id>(0);
if (has_comp) {
for (int g = 0; g < groups; g++) {
for (int oc = 0; oc < output_f / groups; oc++) {
float c = 0.f;
for (int ic = 0; ic < input_f / groups; ic++) {
for (int zi = 0; zi < filter_z; zi++) {
for (int yi = 0; yi < filter_y; yi++) {
for (int xi = 0; xi < filter_x; xi++) {
int azp_idx = g*(input_f / groups) + ic;
int wzp_idx = g*(output_f / groups) + oc;
c += weights_rnd[g][oc][ic][zi][yi][xi] * input_zp_rnd[azp_idx];
if (has_weights_zp) {
c -= input_zp_rnd[azp_idx] * weights_zp_rnd[wzp_idx];
}
}
}
}
}
comp_val[g*(output_f / groups) + oc] = -c;
}
}
comp_prim_name = { "compensation" };
}
auto comp_lay = layout(data_types::f32, format::bfyx, tensor(batch(output_f)));
auto comp = memory::allocate(engine, comp_lay);
set_values(comp, comp_val);
auto stride_tensor = tensor(batch(1), feature(1), spatial(stride, stride, stride, 1));
if (num_in_spatial_dims == 2) {
stride_tensor = tensor(batch(1), feature(1), spatial(stride, stride, 1, 1));
}
topology topology(input_layout("input", input.get_layout()),
data("weights", weights),
reorder("input_fsv", "input", {data_types::u8, input_data_format, input_size}),
reorder("input_fsv", "input", {data_types::i8, input_data_format, input_size}),
convolution("conv",
"input_fsv",
{"weights"},
std::vector<primitive_id>(0),
weights_zp_prim_name,
input_zp_prim_name,
comp_prim_name,
groups,
tensor(batch(1), feature(1), spatial(stride, stride, stride, 1)),
data_types::f32,
stride_tensor,
tensor(batch(0), feature(0), spatial(-input_offset_x, -input_offset_y, -input_offset_z, 0)),
tensor(batch(1), feature(1), spatial(1, 1, 1, 1)),
padding({0, 0, output_padding, output_padding, output_padding}, 0.f)));
ref_conv_out_size),
reorder("out", "conv", {data_types::f32, format::bfzyx, ref_conv_out_size}));
if (has_input_zp)
topology.add(data(input_zp_prim_name[0], input_zp));
if (has_weights_zp)
topology.add(data(weights_zp_prim_name[0], weights_zp));
if (has_comp)
topology.add(data(comp_prim_name[0], comp));
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = {input_data_format, impl_name};
options.set_option(build_option::force_implementations({{"conv", conv_impl}}));
network network(engine, topology, options);
cldnn::network network(engine, topology, options);
network.set_input_data("input", input);
network.execute();
@ -8231,8 +8323,11 @@ INSTANTIATE_TEST_CASE_P(
.smoke_test_params(format::b_fs_yx_fsv32, false, true)
.smoke_test_params(format::b_fs_yx_fsv32, true, false)
.smoke_test_params(format::b_fs_yx_fsv32, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16)
.smoke_test_params(format::b_fs_yx_fsv16, true, true)
.smoke_test_params(format::b_fs_yx_fsv16, false, true)
.smoke_test_params(format::b_fs_yx_fsv16, true, false)
.smoke_test_params(format::b_fs_yx_fsv16, false, false, true)
.bs_test_params(format::bs_fs_yx_bsv16_fsv16)
),
to_string_convolution_all_params