[IE CLDNN] Add asymmetric quantization support to reference convolution imad kernel (#2994)
parent f633f0035c
commit f7a6aae5b0
@@ -41,7 +41,7 @@ static void getOutBlock_WH(size_t output_size,
     if (output_size % max_posible_tile_size == 0) {
         output_block_w = max_posible_tile_size;
     } else {
-        size_t min_horisontal_block_size = 2; // 4;
+        size_t min_horisontal_block_size = 2; // 4;

         size_t block_size = 0;
@@ -95,6 +95,9 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
     k.EnableNonBiasTerm();
     k.EnableBatching();
     k.EnableQuantization(QuantizationType::SYMMETRIC);
+    k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
+    k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
+    k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
     k.DisableTuning();
     return k;
 }
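For readers new to the feature: the three ASYMMETRIC_* modes enabled above correspond to non-zero zero points on the activations, the weights, or both. A minimal standalone sketch (illustrative C++, not part of the patch) of what one int8 convolution tap has to compute in each mode:

#include <cstdint>

// Real values are recovered as scale * (q - zero_point); the kernel works on
// the integer part only, so zero points surface as correction terms on the
// 32-bit accumulator rather than as per-element subtractions.
int32_t asymmetric_tap(int8_t a, int8_t azp,   // activation and its zero point
                       int8_t w, int8_t wzp) { // weight and its zero point
    // ASYMMETRIC_DATA_AND_WEIGHTS: both operands carry a zero point.
    // ASYMMETRIC_DATA is the special case wzp == 0;
    // ASYMMETRIC_WEIGHTS is the special case azp == 0.
    return (int32_t(a) - azp) * (int32_t(w) - wzp);
}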
@@ -185,15 +188,33 @@ bool ConvolutionKernel_imad::Validate(const Params& params, const optional_param
         return false;
     }

-    auto& newParams = static_cast<const convolution_params&>(params);
-    if (newParams.groups > 1 && newParams.weights.IFM().v % 4 != 0 &&
-        newParams.inputs[0].GetLayout() != DataLayout::b_fs_yx_fsv16)
+    auto& conv_params = static_cast<const convolution_params&>(params);
+    if (conv_params.groups > 1 && conv_params.weights.IFM().v % 4 != 0 &&
+        conv_params.inputs[0].GetLayout() != DataLayout::b_fs_yx_fsv16)
         return false;

-    size_t min_block_size_x = (newParams.weights.X().v - 1) * newParams.dilation.x + 1;
+    size_t min_block_size_x = (conv_params.weights.X().v - 1) * conv_params.dilation.x + 1;
     if (min_block_size_x > SIMD_SIZE)
         return false;

+    if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS) {
+        if ((conv_params.activations_zero_points.empty() || conv_params.weights_zero_points.empty()) &&
+            (conv_params.compensation.empty()))
+            return false;
+    } else if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA) {
+        if ((conv_params.activations_zero_points.empty()) &&
+            (conv_params.compensation.empty()))
+            return false;
+    } else if (conv_params.quantization == QuantizationType::ASYMMETRIC_WEIGHTS) {
+        if (conv_params.weights_zero_points.empty())
+            return false;
+    } else {
+        if (!conv_params.activations_zero_points.empty() ||
+            !conv_params.weights_zero_points.empty() ||
+            !conv_params.compensation.empty())
+            return false;
+    }
+
     return true;
 }
 }  // namespace kernel_selector
@@ -44,14 +44,40 @@
 // if we later add 4bit, then PACK would be 8.
 #define PACK 4

+#define TYPE_N_(type, n) type##n
+#define TYPE_N(type, n) TYPE_N_(type, n)
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define INPUT0_TYPE_4 TYPE_N(INPUT0_TYPE, 4)
+#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
+
 #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
 #define ALIGN(a, b) ((a % b == 0) ? a : a - a % b + b)

+#if INPUT0_PAD_BEFORE_SIZE_X != 0 || INPUT0_PAD_BEFORE_SIZE_Y != 0
+    #define NON_ZERO_INPUT0_PAD_BEFORE
+#endif
+
+#if !defined COMPENSATION_TERM || \
+    (defined COMPENSATION_TERM && defined NON_ZERO_INPUT0_PAD_BEFORE)
+    #define SHOULD_BALANCE_COMPENSATION
+#endif
+
+#if defined ASYMMETRIC_DATA_QUANTIZATION && defined SHOULD_BALANCE_COMPENSATION
+    #define SHOULD_USE_DATA_ZP
+#endif
+
+#if defined ASYMMETRIC_DATA_QUANTIZATION && \
+    defined ASYMMETRIC_WEIGHTS_QUANTIZATION && \
+    defined SHOULD_BALANCE_COMPENSATION
+    #define SHOULD_USE_DATA_AND_WEIGHTS_ZP
+#endif
+
+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+    #define FILTER_TYPE_4 TYPE_N(FILTER_TYPE, 4)
+#endif
+
 // int8 conv_input and weights data is packed to int32 "batches",
 // int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
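A side note on the TYPE_N_/TYPE_N pair added above: the second macro level exists so that INPUT0_TYPE expands to its actual value (e.g. char) before ## pastes the vector width on; pasting directly would produce the literal token INPUT0_TYPE4. A minimal C++ sketch of the same trick (char4 is defined locally here as a stand-in for OpenCL's built-in vector type):

#include <type_traits>

#define TYPE_N_(type, n) type##n      // does the paste
#define TYPE_N(type, n) TYPE_N_(type, n)  // forces argument expansion first

struct char4 { signed char s0, s1, s2, s3; }; // stand-in for OpenCL char4
#define INPUT0_TYPE char                      // what the JIT defines for the kernel

TYPE_N(INPUT0_TYPE, 4) packed;                // expands to: char4 packed;
static_assert(std::is_same<decltype(packed), char4>::value,
              "two-level token pasting resolves INPUT0_TYPE before ##");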
@@ -67,6 +93,15 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 #if BIAS_TERM
     const __global BIAS_TYPE *biases,
 #endif
+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+    const __global WEIGHTS_ZERO_POINTS_TYPE *weights_zp,
+#endif
+#ifdef ASYMMETRIC_DATA_QUANTIZATION
+    const __global ACTIVATIONS_ZERO_POINTS_TYPE *activations_zp,
+#endif
+#ifdef COMPENSATION_TERM
+    const __global COMPENSATION_TYPE *compensation,
+#endif
 #if HAS_FUSED_OPS_DECLS
     FUSED_OPS_DECLS,
 #endif
@@ -88,14 +123,36 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
     const uint f = fm % ALIGN(FILTER_OFM_NUM, SIMD_SIZE) + g * FILTER_OFM_NUM;
     const uint sglid = get_sub_group_local_id();

-    const int input_x = oc * STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int input_y = or * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+    const int input_x = oc * STRIDE_SIZE_X - INPUT0_PAD_BEFORE_SIZE_X;
+    const int input_y = or * STRIDE_SIZE_Y - INPUT0_PAD_BEFORE_SIZE_Y;

     PACKED_TYPE in[IN_BLOCK_HEIGHT];
+#ifdef SHOULD_USE_DATA_ZP
+    #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+        uint data_zp_idx = g * FILTER_IFM_NUM;
+    #else
+        uint data_zp_idx = (g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK;
+    #endif
+    PACKED_TYPE data_zp_val;
+#endif
     ACCUMULATOR_TYPE out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0 }; // this is the 32 bit signed accumulator that must be converted to 8 bits before final write.

 #define NUM_FILTERS (FILTER_SIZE_Y * FILTER_SIZE_X)
     int w[NUM_FILTERS];

+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+    int weights_zp_val = as_int((FILTER_TYPE_4)weights_zp[f]);
+    #if FILTER_IFM_NUM % PACK != 0
+        int weights_zp_vec_partial;
+        weights_zp_vec_partial = weights_zp_val;
+        FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial;
+        __attribute__((opencl_unroll_hint))
+        for (uint in_f = FILTER_IFM_NUM % PACK; in_f < PACK; in_f++) {
+            wzp_p[in_f] = 0;
+        }
+    #endif
+#endif
+
     int in_addr;

 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
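The weights_zp_vec_partial block above guards the tail of the input-channel dimension: when FILTER_IFM_NUM is not a multiple of PACK, the last packed dword contains padding lanes whose weights are stored as zero, so the broadcast weight zero point must be zeroed in those lanes too, or the a*wzp correction would charge channels that do not exist. A hypothetical host-side illustration (function name and layout are mine, not clDNN's):

#include <cstdint>
#include <cstring>

// With PACK = 4 and, say, ifm = 6, the last packed dword holds channels
// {4, 5, pad, pad}; the two pad lanes of the zero point are forced to 0.
int32_t make_partial_wzp(int8_t wzp, int ifm) {
    const int PACK = 4;
    int8_t lanes[4] = {wzp, wzp, wzp, wzp};
    if (ifm % PACK != 0)                 // kernel compiles this path only then
        for (int f = ifm % PACK; f < PACK; ++f)
            lanes[f] = 0;                // pad lanes must not contribute to a*wzp
    int32_t packed;
    std::memcpy(&packed, lanes, sizeof packed); // same reinterpret as wzp_p above
    return packed;
}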
@@ -113,54 +170,112 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
     __attribute__((opencl_unroll_hint(1)))
     for(int kd = 0; kd < CEIL_DIV(FILTER_IFM_NUM, PACK); kd++)
     {
-#if INPUT0_LAYOUT_B_FS_YX_FSV16
-    #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
-        int feature_location = kd * PACK + g * FILTER_IFM_NUM;
-    #else
-        in_addr = INPUT0_GET_INDEX(batch, (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK, input_y, input_x + sglid);
-    #endif
-#else
-    #ifdef BLOCK_LOAD_INPUTS
-        in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x;
-    #else
-        in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x + sglid;
-    #endif
-        in_addr += batch * input_size;  // adjust for batching
-#endif
+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+    #if FILTER_IFM_NUM % PACK != 0
+        if ((kd + 1) * PACK >= ALIGN(FILTER_IFM_NUM, PACK)) {
+            weights_zp_val = weights_zp_vec_partial;
+        }
+    #endif
+#endif
+#if INPUT0_LAYOUT_B_FS_YX_FSV16
+    #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+        int feature_location = kd * PACK + g * FILTER_IFM_NUM;
+    #else
+        in_addr = INPUT0_GET_INDEX(batch, (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK, input_y, input_x + sglid);
+    #endif
+#else
+    #ifdef BLOCK_LOAD_INPUTS
+        in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x;
+    #else
+        in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x + sglid;
+    #endif
+        in_addr += batch * input_size;  // adjust for batching
+#endif
+#ifdef SHOULD_USE_DATA_ZP
+    #if INPUT0_LAYOUT_B_FS_YX_FSV16
+        #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+            INPUT0_TYPE* input_zp_int8_arr = (INPUT0_TYPE*) &data_zp_val;
+            for (uint v = 0; v < PACK; v++) {
+                input_zp_int8_arr[v] = activations_zp[feature_location + v];
+            }
+        #else
+            data_zp_val = *(__global PACKED_TYPE*)(activations_zp + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK);
+        #endif
+    #else
+        data_zp_val = AS_PACKED_TYPE(*((__global PACKED_TYPE*)activations_zp + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK))));
+    #endif
+#endif
+
+#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
+        ACCUMULATOR_TYPE dotProdAZPxWZP;
+        dotProdAZPxWZP = 0;
+        dotProdAZPxWZP = TO_ACCUMULATOR_TYPE(IMAD(dotProdAZPxWZP, AS_INPUT0_TYPE_4(data_zp_val), AS_FILTER_TYPE_4(weights_zp_val)));
+#endif
+
         for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) {
-#if INPUT0_LAYOUT_B_FS_YX_FSV16
-    #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
-            INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &in[reg];
-            in_addr = in_start_addr + reg * INPUT0_Y_PITCH * FSV;
-            for (uint v = 0; v < PACK; v++) {
-                int f_addr = ((feature_location + v) / FSV + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * INPUT0_FEATURE_PITCH * FSV + (feature_location + v) % FSV;
-                input_int8_arr[v] = conv_input[in_addr + f_addr];
-            }
-    #else
-            in[reg] = *(__global PACKED_TYPE*)(conv_input + in_addr);
-            in_addr += (INPUT0_SIZE_X + IWPAD) * 16;
-    #endif
-#else
-    #ifdef BLOCK_LOAD_INPUTS
-            in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
-    #else
-            in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);  // read SIMD_SIZE elements wide
-    #endif
-            in_addr += (INPUT0_SIZE_X + IWPAD);  // move to next row down
-#endif
+#ifdef SHOULD_USE_DATA_ZP
+            const uint x_idx = input_x + sglid;
+            const uint y_idx = input_y + reg;
+
+            const bool input_on_padding = (((x_idx < 0) || (x_idx >= INPUT0_SIZE_X)) ||
+                                           ((y_idx < 0) || (y_idx >= INPUT0_SIZE_Y)));
+#endif
+
+#if INPUT0_LAYOUT_B_FS_YX_FSV16
+    #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % PACK != 0))
+        #ifdef SHOULD_USE_DATA_ZP
+            if (input_on_padding) {
+                in[reg] = data_zp_val;
+            } else {
+        #endif
+            INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &in[reg];
+            in_addr = in_start_addr + reg * INPUT0_Y_PITCH * FSV;
+            for (uint v = 0; v < PACK; v++) {
+                int f_addr = ((feature_location + v) / FSV + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * \
+                             INPUT0_FEATURE_PITCH * FSV + (feature_location + v) % FSV;
+                input_int8_arr[v] = conv_input[in_addr + f_addr];
+            }
+        #ifdef SHOULD_USE_DATA_ZP
+            }
+        #endif
+    #else
+        #ifdef SHOULD_USE_DATA_ZP
+            if (input_on_padding)
+                in[reg] = data_zp_val;
+            else
+        #endif
+            in[reg] = *(__global PACKED_TYPE*)(conv_input + in_addr);
+            in_addr += (INPUT0_SIZE_X + IWPAD) * 16;
+    #endif
+#else
+    #ifdef BLOCK_LOAD_INPUTS
+            in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
+        #ifdef SHOULD_USE_DATA_ZP
+            if (input_on_padding)
+                in[reg] = data_zp_val;
+        #endif
+    #else
+        #ifdef SHOULD_USE_DATA_ZP
+            if (input_on_padding)
+                in[reg] = data_zp_val;
+            else
+        #endif
+            in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);  // read SIMD_SIZE elements wide
+    #endif
+            in_addr += (INPUT0_SIZE_X + IWPAD);  // move to next row down
+#endif
         }

-#ifdef BLOCK_LOAD_WEIGHTS
-        *((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
-        w[8]= as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
-        weight_addr += SIMD_SIZE*NUM_FILTERS;
-#else
-        for(int pf=0; pf < NUM_FILTERS; pf++) {
-            w[pf] = weights[weight_addr];
-            weight_addr += SIMD_SIZE;
-        }
-#endif
+#ifdef BLOCK_LOAD_WEIGHTS
+        *((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
+        w[8] = as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
+        weight_addr += SIMD_SIZE*NUM_FILTERS;
+#else
+        for(int pf = 0; pf < NUM_FILTERS; pf++) {
+            w[pf] = weights[weight_addr];
+            weight_addr += SIMD_SIZE;
+        }
+#endif

         int wi = 0;
         // This loop is temporarily not unrolled because the unroll causes TeamCity hangs.
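The input_on_padding substitution above is the key asymmetric-data detail: a quantized activation represents scale * (q - azp), so an out-of-bounds pixel must read as the zero point azp rather than raw 0, which would dequantize to -scale * azp. The per-iteration corrections (or the precomputed compensation term) then cancel the azp contribution exactly. A minimal sketch of the idea (illustrative C++, hypothetical helper name):

#include <cstdint>

// Mirrors: if (input_on_padding) in[reg] = data_zp_val;
int8_t load_with_zp(const int8_t* row, int x, int size_x, int8_t azp) {
    const bool on_padding = (x < 0) || (x >= size_x);
    return on_padding ? azp : row[x];   // padded pixel == a true dequantized zero
}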
@@ -170,13 +285,35 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
         __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
         for (int kc = 0; kc < FILTER_SIZE_X; ++kc)  // kc = Kernel Column
         {
+#ifdef SHOULD_USE_DATA_ZP
+            ACCUMULATOR_TYPE dotProdAZPxW = 0;
+            dotProdAZPxW = TO_ACCUMULATOR_TYPE(IMAD(dotProdAZPxW, AS_INPUT0_TYPE_4(data_zp_val), AS_FILTER_TYPE_4(w[wi])));
+#endif
+
             __attribute__((opencl_unroll_hint))
             for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
                 __attribute__((opencl_unroll_hint))
                 for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
-                    PACKED_TYPE input = sub_group_broadcast(in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
+                    INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(sub_group_broadcast(in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y],
+                                                                                bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X));

-                    out[br * OUT_BLOCK_WIDTH + bc] = TO_ACCUMULATOR_TYPE(IMAD(out[br * OUT_BLOCK_WIDTH + bc], AS_INPUT0_TYPE_4(input), AS_FILTER_TYPE_4(w[wi])));
+                    out[br * OUT_BLOCK_WIDTH + bc] = TO_ACCUMULATOR_TYPE(IMAD(out[br * OUT_BLOCK_WIDTH + bc], inputs, AS_FILTER_TYPE_4(w[wi])));
+
+#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
+                    ACCUMULATOR_TYPE dotProdAxWZP = 0;
+                    dotProdAxWZP = TO_ACCUMULATOR_TYPE(IMAD(dotProdAxWZP, inputs, AS_FILTER_TYPE_4(weights_zp_val)));
+                    out[br * OUT_BLOCK_WIDTH + bc] -= dotProdAxWZP;
+#endif
+
+#if !defined COMPENSATION_TERM && defined ASYMMETRIC_DATA_QUANTIZATION
+                    out[br * OUT_BLOCK_WIDTH + bc] -= dotProdAZPxW;
+#endif
+
+#if (!defined COMPENSATION_TERM && \
+     defined ASYMMETRIC_DATA_QUANTIZATION && \
+     defined ASYMMETRIC_WEIGHTS_QUANTIZATION)
+                    out[br * OUT_BLOCK_WIDTH + bc] += dotProdAZPxWZP;
+#endif
                 }
             }
             wi++;
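The three correction accumulators above implement the expansion sum((a - azp) * (w - wzp)) = dot(a, w) - dot(a, wzp) - dot(azp, w) + dot(azp, wzp): the loop subtracts dotProdAxWZP and dotProdAZPxW and adds back dotProdAZPxWZP. A standalone C++ check of that identity for one packed group (dot4 is a scalar stand-in for the IMAD intrinsic, values arbitrary):

#include <cassert>
#include <cstdint>

int32_t dot4(const int8_t a[4], const int8_t b[4]) { // scalar stand-in for IMAD
    int32_t acc = 0;
    for (int i = 0; i < 4; ++i) acc += int32_t(a[i]) * b[i];
    return acc;
}

int main() {
    const int8_t a[4]   = {12, -7, 3, 100};
    const int8_t w[4]   = {-5, 9, 27, -1};
    const int8_t azp[4] = {8, 8, 8, 8};     // broadcast data_zp_val
    const int8_t wzp[4] = {-3, -3, -3, -3}; // broadcast weights_zp_val

    int32_t reference = 0;                  // what symmetric math on shifted values gives
    for (int i = 0; i < 4; ++i)
        reference += (int32_t(a[i]) - azp[i]) * (int32_t(w[i]) - wzp[i]);

    const int32_t corrected =               // what the kernel's corrections produce
        dot4(a, w) - dot4(a, wzp) - dot4(azp, w) + dot4(azp, wzp);
    assert(reference == corrected);
    return 0;
}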
@@ -194,6 +331,10 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
         FUSED_OPS_PRELOAD;
 #endif

+#ifdef COMPENSATION_TERM
+    COMPENSATION_TYPE comp = compensation[f];
+#endif
+
     for (int r = 0; r < OUT_BLOCK_HEIGHT; r++)
     {
 #if OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT != 0
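When COMPENSATION_TERM is defined, the two terms that do not depend on the actual activations, -dot(azp, w) + dot(azp, wzp), are folded into a precomputed per-output-channel compensation buffer and applied once as res += comp further below. A hedged sketch of what such a precompute has to produce for the algebra to balance (layout and names are illustrative, inferred from the kernel code above, not clDNN's actual graph pass):

#include <cstdint>
#include <vector>

// comp[f] = sum_k azp[k] * (wzp[f] - w[f][k]) = -dot(azp, w[f]) + dot(azp, wzp[f])
std::vector<int32_t> precompute_compensation(
        const std::vector<std::vector<int8_t>>& w,  // [ofm][ifm * ky * kx]
        const std::vector<int8_t>& azp,             // per input channel, tiled over ky*kx
        const std::vector<int8_t>& wzp) {           // per output channel
    std::vector<int32_t> comp(w.size(), 0);
    for (size_t f = 0; f < w.size(); ++f)
        for (size_t k = 0; k < w[f].size(); ++k)
            comp[f] += int32_t(azp[k]) * (int32_t(wzp[f]) - w[f][k]);
    return comp;
}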
@@ -217,15 +358,19 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 #endif
             ACCUMULATOR_TYPE dotProd = out[r * OUT_BLOCK_WIDTH + c];

-            ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
-
 #if BIAS_TERM
     #if BIAS_PER_OUTPUT
             const uint bias_index = GET_DATA_INDEX(BIAS, batch, f, or + r, oc + c);
     #elif BIAS_PER_OFM
             const uint bias_index = f;
     #endif
-            res += TO_ACTIVATION_TYPE(biases[bias_index]);
+            ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd) + TO_ACTIVATION_TYPE(biases[bias_index]);
+#else
+            ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
 #endif
+
+#ifdef COMPENSATION_TERM
+            res += comp;
+#endif

             OUTPUT_TYPE final_result;
@@ -243,9 +388,9 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
         if (fmg % CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE) != CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE) - 1 || sglid < FILTER_OFM_NUM % SIMD_SIZE)
 #endif
             output[out_idx] = final_result;
-        }// if(!zero_c)
+        } // if(!zero_c)
     } // for (int c = 0; c < OUT_BLOCK_WIDTH; c++)
-    }// if(!zero_r)
+    } // if(!zero_r)
     } // for (int r = 0; r < OUT_BLOCK_HEIGHT; r++)
 }
@@ -257,9 +402,17 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 #undef IN_BLOCK_WIDTH
 #undef IN_BLOCK_HEIGHT
 #undef PACK
+#undef TYPE_N_
+#undef TYPE_N
+#undef AS_TYPE_N_
+#undef AS_TYPE_N
+#undef INPUT0_TYPE_4
+#undef AS_INPUT0_TYPE_4
+#undef NON_ZERO_INPUT0_PAD_BEFORE
+#undef SHOULD_BALANCE_COMPENSATION
+#undef SHOULD_USE_DATA_ZP
+#undef SHOULD_USE_DATA_AND_WEIGHTS_ZP
+#undef FILTER_TYPE_4
+#undef AS_FILTER_TYPE_4
 #undef NUM_FILTERS
 #undef CEIL_DIV
@@ -375,8 +375,7 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
         out_features_per_group >= 16 &&
         // Need to extend imad fsv4 kernel to handle e.g. 3 input features per group
         (in_features_per_group % 4 == 0) &&
-        ((conv->dilation.spatial[0] + 1) * (ks_x - 1)) <= 16 &&
-        (conv->activations_zero_points.empty() && conv->weights_zero_points.empty()))
+        ((conv->dilation.spatial[0] + 1) * (ks_x - 1)) <= 16)
         return true;
     // Check for fsv16 imad kernel
     else if ((input_layout.format.dimension() == 4) &&
@@ -7214,14 +7214,19 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
         //  Input data format, Implementation name

         // Format: b_fs_yx_fsv4
         TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, false, false, false, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, false, false, false, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, false, false, false, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, false, false, false, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, false, false, false, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, true, true, true, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, true, true, true, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, true, true, true, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(16, 16, 1, 8, 48, 2, 2, 1, 2, 2, 1, true, true, true, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(3, 3, 1, 48, 96, 2, 2, 1, 2, 8, 1, true, true, true, format::b_fs_yx_fsv4, ""),
         TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, true, true, true, format::b_fs_yx_fsv4, ""),
+        TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""),
+        TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, false, true, false, format::b_fs_yx_fsv4, ""),
+        TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, false, false, format::b_fs_yx_fsv4, ""),
+        TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, true, false, format::b_fs_yx_fsv4, ""),
+        TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, false, true, format::b_fs_yx_fsv4, ""),

         // Format: b_fs_yx_fsv16
         TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),

@@ -7233,7 +7238,6 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
         TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, true, true, true, format::b_fs_yx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, true, true, true, format::b_fs_yx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(8, 8, 1, 16, 16, 4, 4, 1, 4, 1, 1, true, true, true, format::b_fs_yx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(17, 17, 1, 32, 96, 3, 3, 1, 2, 2, 2, true, true, true, format::b_fs_yx_fsv16, ""),