diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp
index 8283fc5b8b1..c1febaf4894 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp
@@ -41,7 +41,7 @@ static void getOutBlock_WH(size_t output_size,
     if (output_size % max_posible_tile_size == 0) {
         output_block_w = max_posible_tile_size;
     } else {
-        size_t min_horisontal_block_size = 2; // 4;
+        size_t min_horisontal_block_size = 2; // 4;

         size_t block_size = 0;

@@ -95,6 +95,9 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
     k.EnableNonBiasTerm();
     k.EnableBatching();
     k.EnableQuantization(QuantizationType::SYMMETRIC);
+    k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
+    k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
+    k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
     k.DisableTuning();
     return k;
 }
@@ -185,15 +188,33 @@ bool ConvolutionKernel_imad::Validate(const Params& params, const optional_params& options) const {
         return false;
     }

-    auto& newParams = static_cast<const convolution_params&>(params);
-    if (newParams.groups > 1 && newParams.weights.IFM().v % 4 != 0 &&
-        newParams.inputs[0].GetLayout() != DataLayout::b_fs_yx_fsv16)
+    auto& conv_params = static_cast<const convolution_params&>(params);
+    if (conv_params.groups > 1 && conv_params.weights.IFM().v % 4 != 0 &&
+        conv_params.inputs[0].GetLayout() != DataLayout::b_fs_yx_fsv16)
         return false;

-    size_t min_block_size_x = (newParams.weights.X().v - 1) * newParams.dilation.x + 1;
+    size_t min_block_size_x = (conv_params.weights.X().v - 1) * conv_params.dilation.x + 1;
     if (min_block_size_x > SIMD_SIZE)
         return false;

+    if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS) {
+        if ((conv_params.activations_zero_points.empty() || conv_params.weights_zero_points.empty()) &&
+            (conv_params.compensation.empty()))
+            return false;
+    } else if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA) {
+        if ((conv_params.activations_zero_points.empty()) &&
+            (conv_params.compensation.empty()))
+            return false;
+    } else if (conv_params.quantization == QuantizationType::ASYMMETRIC_WEIGHTS) {
+        if (conv_params.weights_zero_points.empty())
+            return false;
+    } else {
+        if (!conv_params.activations_zero_points.empty() ||
+            !conv_params.weights_zero_points.empty() ||
+            !conv_params.compensation.empty())
+            return false;
+    }
+
     return true;
 }
 }  // namespace kernel_selector
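The kernel changes that follow implement the standard zero-point expansion of an asymmetric int8 dot product. As a reference for the dotProd* accumulators introduced below, a minimal C++ sketch (not part of the patch; the kernel's per-channel activation zero points are collapsed to a single az per pack here for brevity):

    #include <cstdint>

    // sum_k (a[k]-az)*(w[k]-wz) split into the four terms the kernel tracks:
    //   axw     - plain IMAD accumulation of activations x weights
    //   axwzp   - dotProdAxWZP,   activations x weights zero point
    //   azpxw   - dotProdAZPxW,   activation zero point x weights
    //   azpxwzp - dotProdAZPxWZP, zero point x zero point
    int corrected_dot(const int8_t a[4], const int8_t w[4], int8_t az, int8_t wz) {
        int axw = 0, axwzp = 0, azpxw = 0, azpxwzp = 0;
        for (int k = 0; k < 4; ++k) {
            axw     += a[k] * w[k];
            axwzp   += a[k] * wz;
            azpxw   += az   * w[k];
            azpxwzp += az   * wz;
        }
        return axw - axwzp - azpxw + azpxwzp;
    }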
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl
index b4f39ea8922..759b1862bb3 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl
@@ -44,14 +44,40 @@
 // if we later add 4bit, then PACK would be 8.
 #define PACK 4
+#define TYPE_N_(type, n) type##n
+#define TYPE_N(type, n) TYPE_N_(type, n)
 #define AS_TYPE_N_(type, n, x) as_##type##n(x)
 #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define INPUT0_TYPE_4 TYPE_N(INPUT0_TYPE, 4)
 #define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
 #define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)

 #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
 #define ALIGN(a, b) ((a % b == 0) ? a : a - a % b + b)

+#if INPUT0_PAD_BEFORE_SIZE_X != 0 || INPUT0_PAD_BEFORE_SIZE_Y != 0
+    #define NON_ZERO_INPUT0_PAD_BEFORE
+#endif
+
+#if !defined COMPENSATION_TERM || \
+    (defined COMPENSATION_TERM && defined NON_ZERO_INPUT0_PAD_BEFORE)
+    #define SHOULD_BALANCE_COMPENSATION
+#endif
+
+#if defined ASYMMETRIC_DATA_QUANTIZATION && defined SHOULD_BALANCE_COMPENSATION
+    #define SHOULD_USE_DATA_ZP
+#endif
+
+#if defined ASYMMETRIC_DATA_QUANTIZATION && \
+    defined ASYMMETRIC_WEIGHTS_QUANTIZATION && \
+    defined SHOULD_BALANCE_COMPENSATION
+    #define SHOULD_USE_DATA_AND_WEIGHTS_ZP
+#endif
+
+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+    #define FILTER_TYPE_4 TYPE_N(FILTER_TYPE, 4)
+#endif
+
 // int8 conv_input and weights data is packed to int32 "batches",
 // int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
@@ -67,6 +93,15 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 #if BIAS_TERM
     const __global BIAS_TYPE *biases,
 #endif
+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+    const __global WEIGHTS_ZERO_POINTS_TYPE *weights_zp,
+#endif
+#ifdef ASYMMETRIC_DATA_QUANTIZATION
+    const __global ACTIVATIONS_ZERO_POINTS_TYPE *activations_zp,
+#endif
+#ifdef COMPENSATION_TERM
+    const __global COMPENSATION_TYPE *compensation,
+#endif
 #if HAS_FUSED_OPS_DECLS
     FUSED_OPS_DECLS,
 #endif
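The remaining hunks lean on the IMAD built-in for every partial product, including the zero-point cross terms. Assumed scalar semantics of a 4-wide IMAD accumulate, for readers without the extension spec at hand (imad_ref is a name invented for this sketch):

    #include <cstdint>

    // PACK == 4 int8 values travel in one int32, which is why the kernel
    // reinterprets packed words with AS_INPUT0_TYPE_4 / AS_FILTER_TYPE_4
    // before handing them to IMAD.
    int32_t imad_ref(int32_t acc, const int8_t a[4], const int8_t b[4]) {
        for (int k = 0; k < 4; ++k)
            acc += int32_t{a[k]} * int32_t{b[k]};
        return acc;
    }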
@@ -88,14 +123,36 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
     const uint f = fm % ALIGN(FILTER_OFM_NUM, SIMD_SIZE) + g * FILTER_OFM_NUM;
     const uint sglid = get_sub_group_local_id();

-    const int input_x = oc * STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int input_y = or * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+    const int input_x = oc * STRIDE_SIZE_X - INPUT0_PAD_BEFORE_SIZE_X;
+    const int input_y = or * STRIDE_SIZE_Y - INPUT0_PAD_BEFORE_SIZE_Y;

     PACKED_TYPE in[IN_BLOCK_HEIGHT];
+    #ifdef SHOULD_USE_DATA_ZP
+        #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+            uint data_zp_idx = g * FILTER_IFM_NUM;
+        #else
+            uint data_zp_idx = (g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK;
+        #endif
+        PACKED_TYPE data_zp_val;
+    #endif
     ACCUMULATOR_TYPE out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0 };  // this is the 32 bit signed accumulator that must be converted to 8 bits before final write.
 #define NUM_FILTERS (FILTER_SIZE_Y * FILTER_SIZE_X)
     int w[NUM_FILTERS];
+
+    #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+        int weights_zp_val = as_int((FILTER_TYPE_4)weights_zp[f]);
+        #if FILTER_IFM_NUM % PACK != 0
+            int weights_zp_vec_partial;
+            weights_zp_vec_partial = weights_zp_val;
+            FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial;
+            __attribute__((opencl_unroll_hint))
+            for (uint in_f = FILTER_IFM_NUM % PACK; in_f < PACK; in_f++) {
+                wzp_p[in_f] = 0;
+            }
+        #endif
+    #endif
+
     int in_addr;

 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
@@ -113,54 +170,112 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
     __attribute__((opencl_unroll_hint(1)))
     for(int kd = 0; kd < CEIL_DIV(FILTER_IFM_NUM, PACK); kd++)
     {
-#if INPUT0_LAYOUT_B_FS_YX_FSV16
-    #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
-        int feature_location = kd * PACK + g * FILTER_IFM_NUM;
-    #else
-        in_addr = INPUT0_GET_INDEX(batch, (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK, input_y, input_x + sglid);
-    #endif
-#else
-    #ifdef BLOCK_LOAD_INPUTS
-        in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x;
-    #else
-        in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x + sglid;
-    #endif
-        in_addr += batch * input_size;  // adjust for batching
-#endif
+        #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+            #if FILTER_IFM_NUM % PACK != 0
+                if ((kd + 1) * PACK >= ALIGN(FILTER_IFM_NUM, PACK)) {
+                    weights_zp_val = weights_zp_vec_partial;
+                }
+            #endif
+        #endif
+        #if INPUT0_LAYOUT_B_FS_YX_FSV16
+            #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+                int feature_location = kd * PACK + g * FILTER_IFM_NUM;
+            #else
+                in_addr = INPUT0_GET_INDEX(batch, (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK, input_y, input_x + sglid);
+            #endif
+        #else
+            #ifdef BLOCK_LOAD_INPUTS
+                in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x;
+            #else
+                in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x + sglid;
+            #endif
+            in_addr += batch * input_size;  // adjust for batching
+        #endif
+
+        #ifdef SHOULD_USE_DATA_ZP
+            #if INPUT0_LAYOUT_B_FS_YX_FSV16
+                #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+                    INPUT0_TYPE* input_zp_int8_arr = (INPUT0_TYPE*) &data_zp_val;
+                    for (uint v = 0; v < PACK; v++) {
+                        input_zp_int8_arr[v] = activations_zp[feature_location + v];
+                    }
+                #else
+                    data_zp_val = *(__global PACKED_TYPE*)(activations_zp + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK);
+                #endif
+            #else
+                data_zp_val = AS_PACKED_TYPE(*((__global PACKED_TYPE*)activations_zp + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK))));
+            #endif
+        #endif
+
+        #ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
+            ACCUMULATOR_TYPE dotProdAZPxWZP;
+            dotProdAZPxWZP = 0;
+            dotProdAZPxWZP = TO_ACCUMULATOR_TYPE(IMAD(dotProdAZPxWZP, AS_INPUT0_TYPE_4(data_zp_val), AS_FILTER_TYPE_4(weights_zp_val)));
+        #endif

         for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) {
-#if INPUT0_LAYOUT_B_FS_YX_FSV16
-    #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
-            INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &in[reg];
-            in_addr = in_start_addr + reg * INPUT0_Y_PITCH * FSV;
-            for (uint v = 0; v < PACK; v++) {
-                int f_addr = ((feature_location + v) / FSV + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * INPUT0_FEATURE_PITCH * FSV + (feature_location + v) % FSV;
-                input_int8_arr[v] = conv_input[in_addr + f_addr];
-            }
-    #else
-            in[reg] = *(__global PACKED_TYPE*)(conv_input + in_addr);
-            in_addr += (INPUT0_SIZE_X + IWPAD) * 16;
-    #endif
-#else
-    #ifdef BLOCK_LOAD_INPUTS
-            in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
-    #else
-            in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);// read SIMD_SIZE elements wide
-    #endif
-            in_addr += (INPUT0_SIZE_X + IWPAD);  // move to next row down
-#endif
+            #ifdef SHOULD_USE_DATA_ZP
+                const uint x_idx = input_x + sglid;
+                const uint y_idx = input_y + reg;
+
+                const bool input_on_padding = (((x_idx < 0) || (x_idx >= INPUT0_SIZE_X)) ||
+                                               ((y_idx < 0) || (y_idx >= INPUT0_SIZE_Y)));
+            #endif
+
+            #if INPUT0_LAYOUT_B_FS_YX_FSV16
+                #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+                    #ifdef SHOULD_USE_DATA_ZP
+                        if (input_on_padding) {
+                            in[reg] = data_zp_val;
+                        } else {
+                    #endif
+                        INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &in[reg];
+                        in_addr = in_start_addr + reg * INPUT0_Y_PITCH * FSV;
+                        for (uint v = 0; v < PACK; v++) {
+                            int f_addr = ((feature_location + v) / FSV + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * \
+                                          INPUT0_FEATURE_PITCH * FSV + (feature_location + v) % FSV;
+                            input_int8_arr[v] = conv_input[in_addr + f_addr];
+                        }
+                    #ifdef SHOULD_USE_DATA_ZP
+                        }
+                    #endif
+                #else
+                    #ifdef SHOULD_USE_DATA_ZP
+                        if (input_on_padding)
+                            in[reg] = data_zp_val;
+                        else
+                    #endif
+                        in[reg] = *(__global PACKED_TYPE*)(conv_input + in_addr);
+                    in_addr += (INPUT0_SIZE_X + IWPAD) * 16;
+                #endif
+            #else
+                #ifdef BLOCK_LOAD_INPUTS
+                    in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
+                    #ifdef SHOULD_USE_DATA_ZP
+                        if (input_on_padding)
+                            in[reg] = data_zp_val;
+                    #endif
+                #else
+                    #ifdef SHOULD_USE_DATA_ZP
+                        if (input_on_padding)
+                            in[reg] = data_zp_val;
+                        else
+                    #endif
+                        in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);  // read SIMD_SIZE elements wide
+                #endif
+                in_addr += (INPUT0_SIZE_X + IWPAD);  // move to next row down
+            #endif
         }

-#ifdef BLOCK_LOAD_WEIGHTS
-        *((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
-        w[8]= as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
-        weight_addr += SIMD_SIZE*NUM_FILTERS;
-#else
-        for(int pf=0; pf < NUM_FILTERS; pf++) {
-            w[pf] = weights[weight_addr];
-            weight_addr += SIMD_SIZE;
-        }
-#endif
+        #ifdef BLOCK_LOAD_WEIGHTS
+            *((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
+            w[8] = as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
+            weight_addr += SIMD_SIZE*NUM_FILTERS;
+        #else
+            for(int pf = 0; pf < NUM_FILTERS; pf++) {
+                w[pf] = weights[weight_addr];
+                weight_addr += SIMD_SIZE;
+            }
+        #endif

         int wi = 0;
         // This loop is temporarily not unrolled because the unroll causes TeamCity hangs.
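A subtlety in the input loads above: a precomputed compensation term accounts for every azp x w product, including positions that fall on the padded border, so the kernel must make border reads actually contribute azp rather than an arbitrary pad value. That is what SHOULD_BALANCE_COMPENSATION / SHOULD_USE_DATA_ZP arrange: out-of-bounds reads are replaced by the activation zero point. A host-side restatement of the rule (hypothetical helper, not from the patch):

    #include <cstdint>
    #include <vector>

    // Mirror of the input_on_padding substitution: reads outside [0, size_x)
    // yield the per-channel activation zero point instead of a pad value.
    int8_t load_balanced(const std::vector<int8_t>& row, int x, int size_x, int8_t azp) {
        const bool on_padding = (x < 0) || (x >= size_x);
        return on_padding ? azp : row[static_cast<size_t>(x)];
    }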
@@ -170,13 +285,35 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
         __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
         for (int kc = 0; kc < FILTER_SIZE_X; ++kc)  // kc = Kernel Column
         {
+            #ifdef SHOULD_USE_DATA_ZP
+                ACCUMULATOR_TYPE dotProdAZPxW = 0;
+                dotProdAZPxW = TO_ACCUMULATOR_TYPE(IMAD(dotProdAZPxW, AS_INPUT0_TYPE_4(data_zp_val), AS_FILTER_TYPE_4(w[wi])));
+            #endif
+
             __attribute__((opencl_unroll_hint))
             for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
                 __attribute__((opencl_unroll_hint))
                 for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
-                    PACKED_TYPE input = sub_group_broadcast(in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
+                    INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(sub_group_broadcast(in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y],
+                                                                                bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X));

-                    out[br * OUT_BLOCK_WIDTH + bc] = TO_ACCUMULATOR_TYPE(IMAD(out[br * OUT_BLOCK_WIDTH + bc], AS_INPUT0_TYPE_4(input), AS_FILTER_TYPE_4(w[wi])));
+                    out[br * OUT_BLOCK_WIDTH + bc] = TO_ACCUMULATOR_TYPE(IMAD(out[br * OUT_BLOCK_WIDTH + bc], inputs, AS_FILTER_TYPE_4(w[wi])));
+
+                    #ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+                        ACCUMULATOR_TYPE dotProdAxWZP = 0;
+                        dotProdAxWZP = TO_ACCUMULATOR_TYPE(IMAD(dotProdAxWZP, inputs, AS_FILTER_TYPE_4(weights_zp_val)));
+                        out[br * OUT_BLOCK_WIDTH + bc] -= dotProdAxWZP;
+                    #endif
+
+                    #if !defined COMPENSATION_TERM && defined ASYMMETRIC_DATA_QUANTIZATION
+                        out[br * OUT_BLOCK_WIDTH + bc] -= dotProdAZPxW;
+                    #endif
+
+                    #if (!defined COMPENSATION_TERM && \
+                         defined ASYMMETRIC_DATA_QUANTIZATION && \
+                         defined ASYMMETRIC_WEIGHTS_QUANTIZATION)
+                        out[br * OUT_BLOCK_WIDTH + bc] += dotProdAZPxWZP;
+                    #endif
                 }
             }
             wi++;
@@ -194,6 +331,10 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
     FUSED_OPS_PRELOAD;
 #endif

+    #ifdef COMPENSATION_TERM
+        COMPENSATION_TYPE comp = compensation[f];
+    #endif
+
     for (int r = 0; r < OUT_BLOCK_HEIGHT; r++)
     {
 #if OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT != 0
@@ -217,15 +358,19 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 #endif
                 ACCUMULATOR_TYPE dotProd = out[r * OUT_BLOCK_WIDTH + c];
+                ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
+
 #if BIAS_TERM
     #if BIAS_PER_OUTPUT
                 const uint bias_index = GET_DATA_INDEX(BIAS, batch, f, or + r, oc + c);
     #elif BIAS_PER_OFM
                 const uint bias_index = f;
     #endif
-                ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd) + TO_ACTIVATION_TYPE(biases[bias_index]);
-#else
-                ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
+                res += TO_ACTIVATION_TYPE(biases[bias_index]);
+#endif
+
+#ifdef COMPENSATION_TERM
+                res += comp;
 #endif

                 OUTPUT_TYPE final_result;
@@ -243,9 +388,9 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
                     if (fmg % CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE) != CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE) - 1 || sglid < FILTER_OFM_NUM % SIMD_SIZE)
 #endif
                         output[out_idx] = final_result;
-                }// if(!zero_c)
+                } // if(!zero_c)
             } // for (int c = 0; c < OUT_BLOCK_WIDTH; c++)
-            }// if(!zero_r)
+            } // if(!zero_r)
         } // for (int r = 0; r < OUT_BLOCK_HEIGHT; r++)
 }
@@ -257,9 +402,17 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 #undef IN_BLOCK_WIDTH
 #undef IN_BLOCK_HEIGHT
 #undef PACK
+#undef TYPE_N_
+#undef TYPE_N
 #undef AS_TYPE_N_
 #undef AS_TYPE_N
+#undef INPUT0_TYPE_4
 #undef AS_INPUT0_TYPE_4
+#undef NON_ZERO_INPUT0_PAD_BEFORE
+#undef SHOULD_BALANCE_COMPENSATION
+#undef SHOULD_USE_DATA_ZP
+#undef SHOULD_USE_DATA_AND_WEIGHTS_ZP
+#undef FILTER_TYPE_4
 #undef AS_FILTER_TYPE_4
 #undef NUM_FILTERS
 #undef CEIL_DIV
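With COMPENSATION_TERM defined, the kernel adds comp and skips the dotProdAZPxW / dotProdAZPxWZP corrections, so the host-side buffer must fold exactly those two terms together: comp[f] = sum_k azp[k] * (wzp[f] - w[f][k]). A sketch inferred from the kernel arithmetic above (illustrative, not the actual clDNN compensation pass):

    #include <cstdint>
    #include <vector>

    // comp[f] accumulates (+azp*wzp - azp*w) over the whole filter, so adding
    // it at the end reproduces the !COMPENSATION_TERM code path.
    std::vector<int32_t> make_compensation(const std::vector<std::vector<int8_t>>& w,
                                           const std::vector<int8_t>& azp,
                                           const std::vector<int8_t>& wzp) {
        std::vector<int32_t> comp(w.size(), 0);
        for (size_t f = 0; f < w.size(); ++f)
            for (size_t k = 0; k < w[f].size(); ++k)
                comp[f] += int32_t{azp[k]} * (int32_t{wzp[f]} - int32_t{w[f][k]});
        return comp;
    }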
diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
index 9252746024f..8285036935b 100644
--- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
+++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
@@ -375,8 +375,7 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
         out_features_per_group >= 16 &&
         // Need to extend imad fsv4 kernel to handle e.g. 3 input features per group
         (in_features_per_group % 4 == 0) &&
-        ((conv->dilation.spatial[0] + 1) * (ks_x - 1)) <= 16 &&
-        (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
+        ((conv->dilation.spatial[0] + 1) * (ks_x - 1)) <= 16)
         return true;
     // Check for fsv16 imad kernel
     else if ((input_layout.format.dimension() == 4) &&
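Each TestParamType_grouped_convolution_gpu tuple below carries three booleans before the format argument; judging by the fsv16 cases that already pass true/true/true, they appear to toggle asymmetric data quantization, asymmetric weights quantization, and the compensation input (an assumption; the field names are not visible in this hunk). The new 80-to-252-channel grouped cases sweep those combinations:

    // Assumed meaning of the three flags, restated for the five new cases:
    struct GroupedConvCase { bool asym_data_zp, asym_weights_zp, compensation; };
    const GroupedConvCase new_fsv4_cases[] = {
        {false, false, false},  // symmetric baseline
        {false, true,  false},  // weights zero points only
        {true,  false, false},  // data zero points only
        {true,  true,  false},  // both zero points, no compensation
        {true,  false, true },  // data zero points with compensation
    };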
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
index e96ccec376f..e8d56db2525 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
@@ -7214,14 +7214,19 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
                             // Input data format, Implementation name
                             // Format: b_fs_yx_fsv4
-                            TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
-                            TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
-                            TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, false, false, false, format::b_fs_yx_fsv4, ""),
-                            TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
-                            TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, false, false, false, format::b_fs_yx_fsv4, ""),
-                            TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, false, false, false, format::b_fs_yx_fsv4, ""),
-                            TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, false, false, false, format::b_fs_yx_fsv4, ""),
-                            TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, false, false, false, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, true, true, true, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, true, true, true, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, true, true, true, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, true, true, true, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, true, true, true, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, true, true, true, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, false, true, false, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, false, false, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, true, false, format::b_fs_yx_fsv4, ""),
+                            TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, false, true, format::b_fs_yx_fsv4, ""),

                             // Format: b_fs_yx_fsv16
                             TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
@@ -7233,7 +7238,6 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
                             TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
                             TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, true, true, true, format::b_fs_yx_fsv16, ""),
                             TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
-                            TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
                             TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, true, true, true, format::b_fs_yx_fsv16, ""),
                             TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
                             TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, true, true, true, format::b_fs_yx_fsv16, ""),
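Tying the pieces together, a self-checking C++ sketch verifying that the two formulations the patch implements (on-the-fly zero-point corrections and precomputed compensation) match the directly computed asymmetric dot product, on toy data with a single output feature:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
        const int K = 8;
        std::vector<int8_t> a  = {12, -3, 7, 0, 5, -9, 33, 2};  // activations
        std::vector<int8_t> w  = {1, -2, 3, -4, 5, -6, 7, -8};  // weights
        std::vector<int8_t> az = {4, 4, 4, 4, 4, 4, 4, 4};      // data zero points
        const int8_t wz = 3;                                    // weights zero point

        int ref = 0, axw = 0, axwzp = 0, azpxw = 0, azpxwzp = 0, comp = 0;
        for (int k = 0; k < K; ++k) {
            ref     += (a[k] - az[k]) * (w[k] - wz);  // true asymmetric dot product
            axw     += a[k] * w[k];
            axwzp   += a[k] * wz;
            azpxw   += az[k] * w[k];
            azpxwzp += az[k] * wz;
            comp    += az[k] * (wz - w[k]);           // host-side compensation
        }
        assert(ref == axw - axwzp - azpxw + azpxwzp);  // on-the-fly corrections
        assert(ref == axw - axwzp + comp);             // compensation path
        return 0;
    }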