[IE CLDNN] Add asymmetric quantization support to fsv16 imad general convolution kernel (#2778)
parent 9c509e5f41
commit fbae10a235
@@ -320,6 +320,9 @@ ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
k.EnableBatching();
k.EnableGroupedConvolution();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.EnableDilation();
k.DisableTuning();
return k;
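The hunk above advertises asymmetric-quantization support in the kernel's ParamsKey. As a minimal scalar sketch of the identity the rest of this diff implements (illustrative only, not code from the repository; all names are made up): sum((a - azp) * (w - wzp)) = sum(a*w) - sum(a*wzp) - sum(azp*w) + sum(azp*wzp), where azp and wzp are the activation and weights zero points.

// Illustrative scalar sketch of the asymmetric-quantization identity.
#include <cstdint>
#include <vector>

int32_t asymmetric_dot(const std::vector<int8_t>& a, const std::vector<int8_t>& w,
                       int8_t azp, int8_t wzp) {
    int32_t acc = 0, a_wzp = 0, azp_w = 0, azp_wzp = 0;
    for (size_t i = 0; i < a.size(); ++i) {
        acc     += int32_t(a[i]) * int32_t(w[i]);   // plain IMAD-style accumulation
        a_wzp   += int32_t(a[i]) * int32_t(wzp);    // correction for asymmetric weights
        azp_w   += int32_t(azp)  * int32_t(w[i]);   // correction for asymmetric data
        azp_wzp += int32_t(azp)  * int32_t(wzp);    // combined zero-point term
    }
    // sum((a - azp) * (w - wzp)) expanded into four partial sums.
    return acc - a_wzp - azp_w + azp_wzp;
}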
@@ -422,11 +425,31 @@ bool Convolution_kernel_b_fs_zyx_fsv16_imad::Validate(const Params& params, cons
}

KernelData kd = KernelData::Default<convolution_params>(params);
convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
convolution_params& conv_params = *static_cast<convolution_params*>(kd.params.get());

if (newParams.split != 1)
if (conv_params.split != 1)
return false;

if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS) {
if ((conv_params.activations_zero_points.empty() || conv_params.weights_zero_points.empty()) &&
(conv_params.compensation.empty()))
return false;
}
else if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA) {
if ((conv_params.activations_zero_points.empty()) &&
(conv_params.compensation.empty()))
return false;
}
else if (conv_params.quantization == QuantizationType::ASYMMETRIC_WEIGHTS) {
if (conv_params.weights_zero_points.empty())
return false;
} else {
if (!conv_params.activations_zero_points.empty() ||
!conv_params.weights_zero_points.empty() ||
!conv_params.compensation.empty())
return false;
}

return true;
}
} // namespace kernel_selector
@@ -18,10 +18,42 @@
#include "include/mmad.cl"
#include "include/data_types.cl"

#define TYPE_N_(type, n) type##n
#define TYPE_N(type, n) TYPE_N_(type, n)
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define INPUT0_TYPE_4 TYPE_N(INPUT0_TYPE, 4)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)

#if INPUT0_PAD_BEFORE_SIZE_X != 0 || \
    INPUT0_PAD_BEFORE_SIZE_Y != 0 || \
    INPUT0_PAD_BEFORE_SIZE_Z != 0
#define NON_ZERO_INPUT0_PAD_BEFORE
#endif

#if !defined COMPENSATION_TERM || \
    (defined COMPENSATION_TERM && defined NON_ZERO_INPUT0_PAD_BEFORE)
#define SHOULD_BALANCE_COMPENSATION
#endif

#if defined ASYMMETRIC_DATA_QUANTIZATION && defined SHOULD_BALANCE_COMPENSATION
#define SHOULD_USE_DATA_ZP
#endif

#if defined ASYMMETRIC_DATA_QUANTIZATION && \
    defined ASYMMETRIC_WEIGHTS_QUANTIZATION && \
    defined SHOULD_BALANCE_COMPENSATION
#define SHOULD_USE_DATA_AND_WEIGHTS_ZP
#endif

#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
#define ACCUMULATOR_TYPE_4 TYPE_N(ACCUMULATOR_TYPE, 4)
#endif

#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
#define FILTER_TYPE_16 TYPE_N(FILTER_TYPE, 16)
#endif

#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)

#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
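The new macros above decide when the kernel must "balance" the correction terms itself: if there is no precomputed COMPENSATION_TERM, or there is one but the input has non-zero padding before the data, padded positions must read back the activation zero point so that (a - azp) evaluates to zero there. A minimal scalar sketch of that idea (illustrative names only, not the OpenCL code):

#include <cstdint>

int8_t load_with_zp(const int8_t* data, int x, int size_x, int8_t azp) {
    // Outside the real tensor the convolution must behave as if the dequantized
    // value (a - azp) were zero, i.e. as if a == azp, so padded positions return
    // the activation zero point instead of reading padded memory.
    const bool on_padding = (x < 0) || (x >= size_x);
    return on_padding ? azp : data[x];
}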
@@ -41,6 +73,15 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#if BIAS_TERM
    const __global BIAS_TYPE *biases,
#endif
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
    const __global WEIGHTS_ZERO_POINTS_TYPE *weights_zp,
#endif
#ifdef ASYMMETRIC_DATA_QUANTIZATION
    const __global ACTIVATIONS_ZERO_POINTS_TYPE *activations_zp,
#endif
#ifdef COMPENSATION_TERM
    const __global COMPENSATION_TYPE *compensation,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
@@ -92,8 +133,67 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(

uint4 input_val[IN_BLOCK_DEPTH][IN_BLOCK_HEIGHT][CEIL_DIV(IN_BLOCK_WIDTH, SIMD)];

#ifdef SHOULD_USE_DATA_ZP
uint data_zp_idx = g * FILTER_IFM_NUM + in_f_start;
uint4 data_zp_val;
#endif

#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
uint4 weights_zp_val[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
}
#if FILTER_IFM_NUM % FSV != 0
uint4 weights_zp_vec_partial[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
__attribute__((opencl_unroll_hint))
for (uint f = FILTER_IFM_NUM % FSV; f < FSV; f++) {
wzp_p[f] = 0;
}
}
#endif
#endif

__attribute__((opencl_unroll_hint(1)))
for (uint k = 0; k < CEIL_DIV(FILTER_IFM_NUM, FSV) / FEATURE_SLM_SPLIT; k++) {
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
#if FILTER_IFM_NUM % FSV != 0
if (in_f_start + (k + 1) * FSV >= ALIGN(FILTER_IFM_NUM, FSV)) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
}
}
#endif
#endif

#ifdef SHOULD_USE_DATA_ZP
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
data_zp_val = as_uint4(vload16(0, activations_zp + data_zp_idx));
#else
data_zp_val = vload4(0, (__global uint *)(activations_zp + data_zp_idx));
#endif
#endif

#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
dotProdAZPxWZP[ofb] = 0;
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ive++) {
dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAZPxWZP[ofb][ive],
AS_INPUT0_TYPE_4(data_zp_val[ive]),
AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
}
}
#endif

__attribute__((opencl_unroll_hint(1)))
for (uint fzn = 0; fzn < FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL; fzn++) {
__attribute__((opencl_unroll_hint(1)))
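In the hunk above the weights zero points for the current output-feature block are loaded once per SIMD lane, and when FILTER_IFM_NUM is not a multiple of FSV (16) a masked copy zeroes the tail lanes so that nonexistent input channels do not contribute to the azp*wzp correction. A scalar sketch of the masking (illustrative names only):

#include <array>
#include <cstdint>

constexpr int FSV = 16;  // feature slice size used by the b_fs_zyx_fsv16 layout

std::array<int8_t, FSV> mask_tail_zero_points(std::array<int8_t, FSV> wzp, int ifm_remainder) {
    // ifm_remainder corresponds to FILTER_IFM_NUM % FSV; channels at or beyond it
    // do not exist, so their zero points are forced to 0 before the azp*wzp
    // pre-accumulation.
    if (ifm_remainder != 0) {
        for (int f = ifm_remainder; f < FSV; ++f)
            wzp[f] = 0;
    }
    return wzp;
}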
@@ -106,48 +206,103 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
__attribute__((opencl_unroll_hint))
for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;

#ifdef SHOULD_USE_DATA_ZP
const int y_idx = input_y + fyn * DILATION_SIZE_Y + iyb;
const int z_idx = input_z + fzn * DILATION_SIZE_Z + izb;
#endif
if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
#ifdef SHOULD_USE_DATA_ZP
const int x_idx = input_x + ixb * SIMD + get_sub_group_local_id();
const bool input_on_padding = (((x_idx < 0) || (x_idx >= INPUT0_SIZE_X)) ||
((y_idx < 0) || (y_idx >= INPUT0_SIZE_Y)) ||
((z_idx < 0) || (z_idx >= INPUT0_SIZE_Z)));
#endif

#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
if (in_f_offset == 0) {
#endif
input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding) {
input_val[izb][iyb][ixb] = data_zp_val;
} else {
#endif
input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
#ifdef SHOULD_USE_DATA_ZP
}
#endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
} else {
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
#ifdef SHOULD_USE_DATA_ZP
INPUT0_TYPE* input_zp_int8_arr = (INPUT0_TYPE*) &data_zp_val;
#endif
__attribute__((opencl_unroll_hint(FSV)))
for (uint v = 0; v < FSV; v++) {
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
} else {
const uint addr = input_idx + get_sub_group_local_id() * FSV + v +
((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
input_int8_arr[v] = conv_input[addr];
}
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding) {
input_int8_arr[v] = input_zp_int8_arr[v];
} else {
#endif
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
} else {
const uint addr = input_idx + get_sub_group_local_id() * FSV + v +
((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
input_int8_arr[v] = conv_input[addr];
}
#ifdef SHOULD_USE_DATA_ZP
}
#endif
}
}
#endif
} else {
#ifdef SHOULD_USE_DATA_ZP
const int x_idx = input_x + ixb * SIMD + tmp;
const bool input_on_padding = (((x_idx < 0) || (x_idx >= INPUT0_SIZE_X)) ||
((y_idx < 0) || (y_idx >= INPUT0_SIZE_Y)) ||
((z_idx < 0) || (z_idx >= INPUT0_SIZE_Z)));
#endif

#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
if (in_f_offset == 0) {
#endif
input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding) {
input_val[izb][iyb][ixb] = data_zp_val;
} else {
#endif
input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + tmp * FSV));
#ifdef SHOULD_USE_DATA_ZP
}
#endif
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
} else {
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
#ifdef SHOULD_USE_DATA_ZP
INPUT0_TYPE* input_zp_int8_arr = (INPUT0_TYPE*) &data_zp_val;
#endif
__attribute__((opencl_unroll_hint(FSV)))
for (uint v = 0; v < FSV; v++) {
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
} else {
const uint addr = input_idx + tmp * FSV + v +
((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
input_int8_arr[v] = conv_input[addr];
}
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding) {
input_int8_arr[v] = input_zp_int8_arr[v];
} else {
#endif
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
} else {
const uint addr = input_idx + tmp * FSV + v +
((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
input_int8_arr[v] = conv_input[addr];
}
#ifdef SHOULD_USE_DATA_ZP
}
#endif
}
}
#endif
@@ -173,6 +328,14 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
for (uint ive = 0; ive < 4; ive++) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
#ifdef SHOULD_USE_DATA_ZP
ACCUMULATOR_TYPE dotProdAZPxW = 0;
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAZPxW,
AS_INPUT0_TYPE_4(data_zp_val[ive]),
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
#endif

__attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
@@ -185,11 +348,32 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
const uint shuffle_wi = x_block_idx % SIMD;
const uint shuffle_idx = x_block_idx / SIMD;

INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
shuffle_wi));

dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
IMAD(dotProd[ofb][od][oh][ow],
AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
shuffle_wi)),
inputs,
AS_FILTER_TYPE_4(weights_val[ofb][ive])));

#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
ACCUMULATOR_TYPE dotProdAxWZP = 0;
dotProdAxWZP = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAxWZP,
inputs,
AS_FILTER_TYPE_4(weights_zp_val[ofb][ive])));
dotProd[ofb][od][oh][ow] -= dotProdAxWZP;
#endif

#if !defined COMPENSATION_TERM && defined ASYMMETRIC_DATA_QUANTIZATION
dotProd[ofb][od][oh][ow] -= dotProdAZPxW;
#endif

#if (!defined COMPENSATION_TERM && \
defined ASYMMETRIC_DATA_QUANTIZATION && \
defined ASYMMETRIC_WEIGHTS_QUANTIZATION)
dotProd[ofb][od][oh][ow] += dotProdAZPxWZP[ofb][ive];
#endif
}
}
}
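The inner loop above applies the correction terms per four-element IMAD group: the a*wzp term (dotProdAxWZP) is always subtracted when weights are asymmetric, while the azp*w and azp*wzp terms are applied here only when no precomputed compensation tensor is attached; otherwise they are folded into the compensation value added later. A scalar sketch of that decision (illustrative names only, not kernel code):

#include <cstdint>

int32_t apply_zero_point_corrections(int32_t acc,
                                     int32_t a_dot_wzp,    // corresponds to dotProdAxWZP
                                     int32_t azp_dot_w,    // corresponds to dotProdAZPxW
                                     int32_t azp_dot_wzp,  // corresponds to dotProdAZPxWZP
                                     bool asymmetric_weights,
                                     bool asymmetric_data,
                                     bool has_compensation) {
    if (asymmetric_weights)
        acc -= a_dot_wzp;
    if (asymmetric_data && !has_compensation) {
        acc -= azp_dot_w;
        if (asymmetric_weights)
            acc += azp_dot_wzp;  // both zero points present: add the cross term back
    }
    return acc;
}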
@@ -207,6 +391,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
input_start_idx += INPUT0_FEATURE_PITCH * FSV * FEATURE_SLM_SPLIT - (FILTER_SIZE_Z / FILTER_SIZE_Z_UNROLL) * DILATION_SIZE_Z * INPUT0_Z_PITCH * FSV;

filter_idx += FSV * FSV * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * (FEATURE_SLM_SPLIT - 1);

#ifdef SHOULD_USE_DATA_ZP
data_zp_idx += FSV;
#endif
}

#if FEATURE_SLM_SPLIT != 1
@@ -339,6 +527,14 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
#endif

#ifdef COMPENSATION_TERM
COMPENSATION_TYPE comp[OFM_VALUES_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
comp[ofb] = compensation[out_f + ofb * SIMD];
}
#endif

ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
@@ -351,6 +547,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
dequantized[ofb][od][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][od][oh][ow]);
#if BIAS_TERM
dequantized[ofb][od][oh][ow] += bias[ofb];
#endif
#ifdef COMPENSATION_TERM
dequantized[ofb][od][oh][ow] += comp[ofb];
#endif
}
}
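When COMPENSATION_TERM is defined, the per-output-channel compensation loaded above replaces the azp*w and azp*wzp corrections; it can be precomputed on the host as comp = -(sum(w * azp) - sum(azp * wzp)) over all taps of the channel, which mirrors the reference computation in the test further below. A host-side sketch of that precomputation (illustrative names only, not code from the repository):

#include <cstdint>
#include <vector>

// w:   weights of one output channel, indexed [input_channel][tap]
// azp: per-input-channel activation zero points
// wzp: weights zero point of this output channel (0 if weights are symmetric)
float compensation_for_channel(const std::vector<std::vector<int8_t>>& w,
                               const std::vector<int8_t>& azp,
                               int8_t wzp) {
    float c = 0.f;
    for (size_t ic = 0; ic < w.size(); ++ic) {
        for (int8_t wv : w[ic]) {
            c += float(wv) * float(azp[ic]);   // accumulates sum(w * azp)
            c -= float(azp[ic]) * float(wzp);  // accumulates sum(azp * wzp)
        }
    }
    return -c;  // added to the accumulator by the COMPENSATION_TERM path
}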
@@ -498,9 +697,38 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#endif
}

#undef AS_INPUT0_TYPE_4
#undef TYPE_N_
#undef TYPE_N
#undef AS_TYPE_N
#undef AS_TYPE_N_

#undef INPUT0_TYPE_4
#undef AS_INPUT0_TYPE_4

#ifdef NON_ZERO_INPUT0_PAD_BEFORE
#undef NON_ZERO_INPUT0_PAD_BEFORE
#endif

#ifdef SHOULD_BALANCE_COMPENSATION
#undef SHOULD_BALANCE_COMPENSATION
#endif

#ifdef SHOULD_USE_DATA_ZP
#undef SHOULD_USE_DATA_ZP
#endif

#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
#undef SHOULD_USE_DATA_AND_WEIGHTS_ZP
#endif

#ifdef ACCUMULATOR_TYPE_4
#undef ACCUMULATOR_TYPE_4
#endif

#ifdef FILTER_TYPE_16
#undef FILTER_TYPE_16
#endif

#undef AS_FILTER_TYPE_4

#undef CEIL_DIV
@@ -193,7 +193,9 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,

if (next.is_type<convolution>() &&
fmt_prev == format::bfyx &&
fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] <= 4)
fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] <= 4 &&
next.as<convolution>().get_primitive()->activations_zero_points.empty() &&
next.as<convolution>().get_primitive()->weights_zero_points.empty())
return true;

if (next.is_type<convolution>() &&
@@ -366,9 +368,7 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
weights_layout.size.batch[0] >= 16 &&
((conv->groups == 1 && conv->split() == 1) ||
conv->groups == static_cast<uint32_t>(input_layout.size.feature[0]) ||
conv->split() == static_cast<int32_t>(input_layout.size.feature[0])) &&
((conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) ||
(input_layout.size.feature[0] <= 4))) // only bfyx -> fsv16 kernel supports asymmetric quantization in fsv16 format
conv->split() == static_cast<int32_t>(input_layout.size.feature[0])))
return true;
// Check for grouped convolution
else if (input_layout.format.dimension() == 4 && input_layout.size.batch[0] < 16 &&
@@ -380,7 +380,6 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
return true;
// Check for fsv16 imad kernel
else if ((input_layout.format.dimension() == 4) &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
((in_features_per_group > 8) || (out_features_per_group >= 4)))
return true;
return false;
@@ -447,7 +446,6 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout

// Check for fsv16 imad kernel
if ((input_layout.format.dimension() == 5) &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty()) &&
(input_layout.data_type == data_types::i8 || input_layout.data_type == data_types::u8) &&
(weights_layout.data_type == data_types::i8 || weights_layout.data_type == data_types::u8) &&
((in_features_per_group > 8) || (out_features_per_group >= 4)))
@@ -4894,9 +4894,12 @@ using TestParamType_grouped_convolution_gpu = ::testing::tuple< int, // 0 -
int,            // 7 - Kernel sizeZ
int,            // 8 - Groups number
int,            // 9 - Stride
int,            // 10 - Batch
format,         // 11 - Input data format
std::string>;   // 12 - Implementation name
int,            // 10 - Batch
bool,           // 11 - Zero points for activations
bool,           // 12 - Zero points for weights
bool,           // 13 - Compensation
format,         // 14 - Input data format
std::string>;   // 15 - Implementation name

using TestParamType_general_convolution_gpu = ::testing::tuple< int, // 0 - Input X size
int,            // 1 - Input Y size
@@ -4996,10 +4999,13 @@ struct convolution_grouped_gpu : public ::testing::TestWithParam<TestParamType_g
"_groups" + std::to_string(testing::get<8>(param_info.param)) +
"_stride" + std::to_string(testing::get<9>(param_info.param)) +
"_batch" + std::to_string(testing::get<10>(param_info.param)) +
"_format" + std::to_string(testing::get<11>(param_info.param));
"_data_zp" + std::to_string(testing::get<11>(param_info.param)) +
"_weights_zp" + std::to_string(testing::get<12>(param_info.param)) +
"_comp" + std::to_string(testing::get<13>(param_info.param)) +
"_format" + std::to_string(testing::get<14>(param_info.param));

if (testing::get<12>(param_info.param) != "") {
res += "_impl_" + testing::get<12>(param_info.param);
if (testing::get<15>(param_info.param) != "") {
res += "_impl_" + testing::get<15>(param_info.param);
}

return res;
@@ -7205,57 +7211,60 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
::testing::Values(
// Input X size, Input Y size, Input Z size, Input features, Output features,
// Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
// Activation zero points, Weights zero points, Compensation,
// Input data format, Implementation name

// Format: b_fs_yx_fsv4
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, false, false, false, format::b_fs_yx_fsv4, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, false, false, false, format::b_fs_yx_fsv4, ""),

// Format: b_fs_yx_fsv16
TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, true, true, true, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, true, true, true, format::b_fs_yx_fsv16, ""),

// Format: b_fs_zyx_fsv16
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 17, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 4, 16, 16, 3, 3, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 4, 8, 4, 2, 2, 2, 2, 1, 4, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 8, 8, 16, 16, 4, 4, 4, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(17, 17, 17, 32, 96, 3, 3, 3, 2, 2, 2, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(16, 16, 16, 8, 48, 2, 2, 2, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 3, 3, 48, 96, 2, 2, 2, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(6, 6, 6, 8, 26, 3, 3, 3, 2, 4, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 3, 4, 3, 18, 3, 3, 3, 1, 1, 1, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32"),
TestParamType_grouped_convolution_gpu(79, 224, 224, 3, 64, 3, 3, 3, 1, 2, 1, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32")
TestParamType_grouped_convolution_gpu(7, 5, 3, 51, 99, 3, 3, 3, 3, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, false, false, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, false, true, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, false, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, true, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, false, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, false, false, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, false, true, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, false, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, true, false, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, false, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(1, 3, 1, 18, 2, 1, 3, 1, 2, 1, 1, true, true, true, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(2, 3, 4, 3, 18, 3, 3, 3, 1, 1, 1, false, false, false, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32"),
TestParamType_grouped_convolution_gpu(79, 224, 224, 3, 64, 3, 3, 3, 1, 2, 1, false, false, false, format::b_fs_zyx_fsv16, "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32")
),
convolution_grouped_gpu::PrintToStringParamName);
@@ -7273,23 +7282,28 @@ TEST_P(convolution_grouped_gpu, base) {
groups = testing::get<8>(GetParam()),
stride = testing::get<9>(GetParam()),
batch_num = testing::get<10>(GetParam()),
output_padding = 0,
input_offset_z = (filter_z - 1) / 2,
input_offset_y = (filter_y - 1) / 2,
input_offset_x = (filter_x - 1) / 2;
auto input_data_format = testing::get<11>(GetParam());
auto impl_name = testing::get<12>(GetParam());
const auto has_input_zp = testing::get<11>(GetParam());
const auto has_weights_zp = testing::get<12>(GetParam());
const auto has_comp = testing::get<13>(GetParam());
const auto input_data_format = testing::get<14>(GetParam());
const auto impl_name = testing::get<15>(GetParam());

// can use compensation term only if data zero points are available
ASSERT_TRUE(has_input_zp || !has_comp);

auto num_in_spatial_dims = input_data_format.spatial_num();

auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z));
auto input_rnd = generate_random_5d<uint8_t>(batch_num, input_f, input_z, input_y, input_x, 0, 255);
auto input_rnd = generate_random_5d<int8_t>(batch_num, input_f, input_z, input_y, input_x, -127, 127);

auto input_lay = layout(data_types::u8, format::bfzyx, input_size);
auto input_lay = layout(data_types::i8, format::bfzyx, input_size);
if (num_in_spatial_dims == 2) {
input_lay = layout(data_types::u8, format::bfyx, input_size);
input_lay = layout(data_types::i8, format::bfyx, input_size);
}
std::vector<uint8_t> input_flat(input_lay.get_linear_size());
std::vector<int8_t> input_flat(input_lay.get_linear_size());
for (int b = 0; b < batch_num; b++)
for (int f = 0; f < input_f; f++)
for (int z = 0; z < input_z; z++)
@@ -7302,6 +7316,16 @@ TEST_P(convolution_grouped_gpu, base) {
auto input = memory::allocate(engine, input_lay);
set_values(input, input_flat);

auto input_zp_rnd = std::vector<int8_t>(input_f);
auto input_zp_prim_name = std::vector<primitive_id>(0);
if (has_input_zp) {
input_zp_rnd = generate_random_1d<int8_t>(input_f, -127, 127);
input_zp_prim_name = { "input_zp" };
}
auto input_zp_lay = layout(data_types::i8, format::bfyx, tensor(feature(input_f)));
auto input_zp = memory::allocate(engine, input_zp_lay);
set_values(input_zp, input_zp_rnd);

auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y, filter_z));

VVVVVVF<int8_t> weights_rnd = generate_random_6d<int8_t>(groups, output_f / groups, input_f / groups, filter_z, filter_y, filter_x, -127, 127);
@@ -7323,6 +7347,16 @@ TEST_P(convolution_grouped_gpu, base) {
auto weights = memory::allocate(engine, weights_lay);
set_values(weights, weights_flat);

auto weights_zp_rnd = std::vector<int8_t>(output_f);
auto weights_zp_prim_name = std::vector<primitive_id>(0);
if (has_weights_zp) {
weights_zp_rnd = generate_random_1d<int8_t>(output_f, -127, 127);
weights_zp_prim_name = { "weights_zp" };
}
auto weights_zp_lay = layout(data_types::i8, format::bfyx, tensor(batch(output_f)));
auto weights_zp = memory::allocate(engine, weights_zp_lay);
set_values(weights_zp, weights_zp_rnd);

VVVVVF<float> expected_result(batch_num, VVVVF<float>(output_f));

// Calculate reference values without bias
@@ -7333,36 +7367,94 @@ TEST_P(convolution_grouped_gpu, base) {
int f_begin = gi * input_f / groups;
int f_end = gi * input_f / groups + input_f / groups;

expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
input_rnd[bi], weights_rnd[gi][ofi], // input, weights
stride, stride, stride, // strides
0, // bias
1, 1, 1, // dilation
input_offset_z, input_offset_y, input_offset_x, // input padding
0, 0, 0, // output_padding
f_begin, f_end, // f_begin, f_end
false, // depthwise
grouped); // grouped
expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<int8_t, float, int8_t>(
input_rnd[bi], weights_rnd[gi][ofi], // input, weights
stride, stride, stride, // strides
0, // bias
1, 1, 1, // dilation
input_offset_z, input_offset_y, input_offset_x, // input padding
0, 0, 0, // output_padding
f_begin, f_end, // f_begin, f_end
false, // depthwise
grouped, // grouped
input_zp_rnd, // input zero points
weights_zp_rnd[gi * (int)weights_rnd[0].size() + ofi]); // weights zero points
}

auto ref_conv_out_size = tensor(batch(expected_result.size()),
feature(expected_result[0].size()),
spatial(expected_result[0][0][0][0].size(),
expected_result[0][0][0].size(),
expected_result[0][0].size()));

auto comp_val = std::vector<float>(output_f);
auto comp_prim_name = std::vector<primitive_id>(0);
if (has_comp) {
for (int g = 0; g < groups; g++) {
for (int oc = 0; oc < output_f / groups; oc++) {
float c = 0.f;
for (int ic = 0; ic < input_f / groups; ic++) {
for (int zi = 0; zi < filter_z; zi++) {
for (int yi = 0; yi < filter_y; yi++) {
for (int xi = 0; xi < filter_x; xi++) {
int azp_idx = g*(input_f / groups) + ic;
int wzp_idx = g*(output_f / groups) + oc;
c += weights_rnd[g][oc][ic][zi][yi][xi] * input_zp_rnd[azp_idx];
if (has_weights_zp) {
c -= input_zp_rnd[azp_idx] * weights_zp_rnd[wzp_idx];
}
}
}
}
}

comp_val[g*(output_f / groups) + oc] = -c;
}
}
comp_prim_name = { "compensation" };
}
auto comp_lay = layout(data_types::f32, format::bfyx, tensor(batch(output_f)));
auto comp = memory::allocate(engine, comp_lay);
set_values(comp, comp_val);

auto stride_tensor = tensor(batch(1), feature(1), spatial(stride, stride, stride, 1));
if (num_in_spatial_dims == 2) {
stride_tensor = tensor(batch(1), feature(1), spatial(stride, stride, 1, 1));
}

topology topology(input_layout("input", input.get_layout()),
data("weights", weights),
reorder("input_fsv", "input", {data_types::u8, input_data_format, input_size}),
reorder("input_fsv", "input", {data_types::i8, input_data_format, input_size}),
convolution("conv",
"input_fsv",
{"weights"},
std::vector<primitive_id>(0),
weights_zp_prim_name,
input_zp_prim_name,
comp_prim_name,
groups,
tensor(batch(1), feature(1), spatial(stride, stride, stride, 1)),
data_types::f32,
stride_tensor,
tensor(batch(0), feature(0), spatial(-input_offset_x, -input_offset_y, -input_offset_z, 0)),
tensor(batch(1), feature(1), spatial(1, 1, 1, 1)),
padding({0, 0, output_padding, output_padding, output_padding}, 0.f)));
ref_conv_out_size),
reorder("out", "conv", {data_types::f32, format::bfzyx, ref_conv_out_size}));

if (has_input_zp)
topology.add(data(input_zp_prim_name[0], input_zp));

if (has_weights_zp)
topology.add(data(weights_zp_prim_name[0], weights_zp));

if (has_comp)
topology.add(data(comp_prim_name[0], comp));

build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = {input_data_format, impl_name};
options.set_option(build_option::force_implementations({{"conv", conv_impl}}));

network network(engine, topology, options);
cldnn::network network(engine, topology, options);
network.set_input_data("input", input);
network.execute();

@@ -8231,8 +8323,11 @@ INSTANTIATE_TEST_CASE_P(
.smoke_test_params(format::b_fs_yx_fsv32, false, true)
.smoke_test_params(format::b_fs_yx_fsv32, true, false)
.smoke_test_params(format::b_fs_yx_fsv32, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16)
.smoke_test_params(format::b_fs_yx_fsv16, true, true)
.smoke_test_params(format::b_fs_yx_fsv16, false, true)
.smoke_test_params(format::b_fs_yx_fsv16, true, false)
.smoke_test_params(format::b_fs_yx_fsv16, false, false, true)
.bs_test_params(format::bs_fs_yx_bsv16_fsv16)
),
to_string_convolution_all_params