[IE CLDNN] Add resample improvements (#933)

This change:
- extends concat in-place optimization for resample on input
- adds resample primitive int8 support for bilinear mode
- fixes some potential issues with offset calculations with in8
This commit is contained in:
Konrad Dobros
2020-06-16 08:07:05 +02:00
committed by GitHub
parent e66e0cd893
commit db3dff36b9
10 changed files with 354 additions and 106 deletions

View File

@@ -89,7 +89,8 @@ bool ResampleKernelBase::Validate(const Params& p, const optional_params& o) con
const auto& input = params.inputs[0];
if ((input.GetDType() == Datatype::UINT8 || input.GetDType() == Datatype::INT8) &&
params.resampleType != ResampleType::NEAREST_NEIGHBOR)
params.resampleType != ResampleType::NEAREST_NEIGHBOR &&
params.resampleType != ResampleType::BILINEAR_INTERP)
return false;
return true;
@@ -154,6 +155,8 @@ JitConstants ResampleKernelBase::GetJitConstants(const resample_params& params)
}
}
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
return jit;
}
@@ -178,4 +181,26 @@ KernelsData ResampleKernelBase::GetCommonKernelsData(const Params& params, const
return {kd};
}
Datatype ResampleKernelBase::GetAccumulatorType(const resample_params& params) const {
auto in_dt = params.inputs[0].GetDType();
auto out_dt = params.output.GetDType();
if (params.resampleType == ResampleType::NEAREST_NEIGHBOR)
return in_dt;
auto smaller_fp_type = [](const Datatype& current, const Datatype& candidate) -> Datatype {
if (candidate != Datatype::F32 || candidate != Datatype::F16)
return current;
return BytesPerElement(candidate) < BytesPerElement(current) ? candidate : current;
};
Datatype fp_type = Datatype::F32;
fp_type = smaller_fp_type(fp_type, in_dt);
fp_type = smaller_fp_type(fp_type, out_dt);
return fp_type;
}
} // namespace kernel_selector

View File

@@ -58,5 +58,6 @@ protected:
virtual JitConstants GetJitConstants(const resample_params& params) const;
KernelsData GetCommonKernelsData(const Params& params, const optional_params& options) const;
size_t GetFeatureBlockSize(const resample_params& params) const;
virtual Datatype GetAccumulatorType(const resample_params& params) const;
};
} // namespace kernel_selector

View File

@@ -106,8 +106,8 @@ JitConstants ResampleKernelOpt::GetJitConstants(const resample_params &params) c
jit.AddConstant(MakeJitConstant("VEC_SIZE", vec_size));
if (!params.fused_ops.empty()) {
std::vector<std::string> idx_order = {"b", "feature_num", "y", "(x + out_x)"};
FusedOpsConfiguration conf = {"", idx_order, "res", params.inputs[0].GetDType(), vec_size, LoadType::LT_ALIGNED_READ};
std::vector<std::string> idx_order = {"b", "feature_block", "y", "(x + out_x)"};
FusedOpsConfiguration conf = {"", idx_order, "res", GetAccumulatorType(params), vec_size, LoadType::LT_ALIGNED_READ};
conf.SetVectorAxis(Tensor::DataChannelName::FEATURE);
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
}

View File

@@ -115,7 +115,7 @@ JitConstants ResampleKernelRef::GetJitConstants(const resample_params& params) c
idx_order = {"batch", "OF_ID", "oz", "oy", "ox"};
}
FusedOpsConfiguration conf = {"", idx_order, "interp_val", params.inputs[0].GetDType(), 1};
FusedOpsConfiguration conf = {"", idx_order, "interp_val", GetAccumulatorType(params), 1};
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
}

View File

@@ -15,17 +15,18 @@
#include "include/common.cl"
#include "include/data_types.cl"
#include "include/include_all.cl"
#include "include/unit_type.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#ifdef INPUT0_LAYOUT_FS_B_YX_FSV32
#define READ_FUNC(ptr, offset) CAT(UNIT_BLOCK_READ, VEC_SIZE)(ptr, offset)
#define WRITE_FUNC(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, VEC_SIZE)(ptr, offset, val)
#else
#define READ_FUNC(ptr, offset) UNIT_BLOCK_READ(ptr, offset)
#define WRITE_FUNC(ptr, offset, val) UNIT_BLOCK_WRITE(ptr, offset, val)
#endif
#define READ_FUNC(ptr, offset) BLOCK_READN(INPUT0_TYPE, VEC_SIZE, ptr, offset)
#define WRITE_FUNC(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, VEC_SIZE, ptr, offset, val)
#define IN_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE)
#define TO_IN_VEC_TYPE(x) CAT(convert_, IN_VEC_TYPE)(x)
#define ACC_VEC_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, VEC_SIZE)
#define TO_ACC_VEC_TYPE(x) CAT(convert_, ACC_VEC_TYPE)(x)
#define OUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)
#define TO_OUT_VEC_TYPE(x) CAT(convert_, OUT_VEC_TYPE)(x)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL (resample_opt)(__global INPUT0_TYPE* input,
@@ -41,11 +42,10 @@ KERNEL (resample_opt)(__global INPUT0_TYPE* input,
const int f_block = get_group_id(1);
const int b = get_global_id(2);
const int feature_num = f_block * FEATURE_SLICE_SIZE + get_sub_group_local_id();
#ifdef INPUT0_LAYOUT_FS_B_YX_FSV32
typedef MAKE_VECTOR_TYPE(UNIT_TYPE, VEC_SIZE) unit_t;
#else
typedef UNIT_TYPE unit_t;
#endif
const uint feature_block = f_block * FEATURE_SLICE_SIZE;
typedef IN_VEC_TYPE in_vec_t;
typedef ACC_VEC_TYPE acc_vec_t;
if (feature_num >= OUTPUT_FEATURE_NUM)
return;
@@ -55,46 +55,36 @@ KERNEL (resample_opt)(__global INPUT0_TYPE* input,
const int ix = floor((x + out_x) * X_RATIO);
const int iy = floor(y * Y_RATIO);
unit_t res = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_num, iy, ix));
in_vec_t res = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_block, iy, ix));
#else
const UNIT_TYPE ix = TO_UNIT_TYPE(X_RATIO) * (x + out_x);
const UNIT_TYPE iy = TO_UNIT_TYPE(Y_RATIO) * y;
const ACCUMULATOR_TYPE ix = TO_ACCUMULATOR_TYPE(X_RATIO) * (x + out_x);
const ACCUMULATOR_TYPE iy = TO_ACCUMULATOR_TYPE(Y_RATIO) * y;
const int top_y_index = (int)(floor(iy));
const int bottom_y_index = (int)(min(ceil(iy), TO_UNIT_TYPE(INPUT0_SIZE_Y) - 1));
const int bottom_y_index = min((int)ceil(iy), INPUT0_SIZE_Y - 1);
const int left_x_index = (int)(floor(ix));
const int right_x_index = (int)(min(ceil(ix), TO_UNIT_TYPE(INPUT0_SIZE_X) - 1));
const int right_x_index = min((int)ceil(ix), INPUT0_SIZE_X - 1);
const UNIT_TYPE dx = ix - left_x_index;
const UNIT_TYPE dy = iy - top_y_index;
const ACCUMULATOR_TYPE dx = ix - left_x_index;
const ACCUMULATOR_TYPE dy = iy - top_y_index;
const unit_t top_left = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_num, top_y_index, left_x_index));
const unit_t top_right = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_num, top_y_index, right_x_index));
const unit_t bottom_left = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_num, bottom_y_index, left_x_index));
const unit_t bottom_right = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_num, bottom_y_index, right_x_index));
const in_vec_t top_left = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_block, top_y_index, left_x_index));
const in_vec_t top_right = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_block, top_y_index, right_x_index));
const in_vec_t bottom_left = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_block, bottom_y_index, left_x_index));
const in_vec_t bottom_right = READ_FUNC(input, INPUT0_GET_INDEX(b, feature_block, bottom_y_index, right_x_index));
const unit_t top = top_left + (top_right - top_left) * dx;
const unit_t bottom = bottom_left + (bottom_right - bottom_left) * dx;
unit_t res = top + (bottom - top) * dy;
const acc_vec_t top = TO_ACC_VEC_TYPE(top_left) + (TO_ACC_VEC_TYPE(top_right) - TO_ACC_VEC_TYPE(top_left)) * dx;
const acc_vec_t bottom = TO_ACC_VEC_TYPE(bottom_left) + (TO_ACC_VEC_TYPE(bottom_right) - TO_ACC_VEC_TYPE(bottom_left)) * dx;
acc_vec_t res = top + (bottom - top) * dy;
#endif
#if HAS_FUSED_OPS
FUSED_OPS;
res = FUSED_OPS_RESULT;
OUT_VEC_TYPE out = FUSED_OPS_RESULT;
#else
res = ACTIVATION(res, ACTIVATION_PARAMS);
OUT_VEC_TYPE out = TO_OUT_VEC_TYPE(ACTIVATION(res, ACTIVATION_PARAMS));
#endif
#if OUTPUT_IS_FP
WRITE_FUNC(output, OUTPUT_GET_INDEX(b, feature_num, y, (x + out_x)), res);
#else
#if VEC_SIZE > 1
for (uint i = 0; i < VEC_SIZE; i++)
output[OUTPUT_GET_INDEX(b, feature_num + i*SUB_GROUP_SIZE, y, (x + out_x))] = res[i];
#else
output[OUTPUT_GET_INDEX(b, feature_num, y, (x + out_x))] = res;
#endif
#endif
WRITE_FUNC(output, OUTPUT_GET_INDEX(b, feature_block, y, (x + out_x)), out);
}
}

View File

@@ -39,7 +39,7 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint z, uint y, uint x)
}
#define TRIANGLE_COEFF(x) (INPUT0_MAX_FUNC(INPUT0_VAL_ZERO, INPUT0_VAL_ONE - INPUT0_ABS_FUNC(x)))
#define TRIANGLE_COEFF(x) (ACCUMULATOR_MAX_FUNC(ACCUMULATOR_VAL_ZERO, ACCUMULATOR_VAL_ONE - ACCUMULATOR_ABS_FUNC(x)))
#define unroll_for __attribute__((opencl_unroll_hint)) for
KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
@@ -54,10 +54,15 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, PACK_SIZE) out_pack_t;
const int ox = get_global_id(0);
const int oy = get_global_id(1) % OUTPUT_SIZE_Y;
const int oz = get_global_id(1) / OUTPUT_SIZE_Y;
const int feature = (get_global_id(2) * PACK_SIZE) % OUTPUT_FEATURE_NUM;
const int batch = (get_global_id(2) * PACK_SIZE) / OUTPUT_FEATURE_NUM;
#if OUTPUT_DIMS <= 4
const int oy = get_global_id(1);
const int oz = 0;
#else
const int oy = (int)get_global_id(1) % OUTPUT_SIZE_Y;
const int oz = (int)get_global_id(1) / OUTPUT_SIZE_Y;
#endif
const int feature = ((int)get_global_id(2) * PACK_SIZE) % OUTPUT_FEATURE_NUM;
const int batch = ((int)get_global_id(2) * PACK_SIZE) / OUTPUT_FEATURE_NUM;
const int ix = floor(ox * X_RATIO);
const int iy = floor(oy * Y_RATIO);
const int iz = floor(oz * Z_RATIO);
@@ -117,13 +122,13 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
return;
#endif
const int top_y_index = (int)(floor(iy));
const int bottom_y_index = (int)(min(TO_INPUT0_TYPE(ceil(iy)), TO_INPUT0_TYPE(INPUT0_SIZE_Y) - 1));
const int left_x_index = (int)(floor(ix));
const int right_x_index = (int)(min(TO_INPUT0_TYPE(ceil(ix)), TO_INPUT0_TYPE(INPUT0_SIZE_X) - 1));
const int top_y_index = (int)(floor(iy));
const int bottom_y_index = min((int)ceil(iy), INPUT0_SIZE_Y - 1);
const int left_x_index = (int)(floor(ix));
const int right_x_index = min((int)ceil(ix), INPUT0_SIZE_X - 1);
const INPUT0_TYPE dx = TO_INPUT0_TYPE(ix - left_x_index);
const INPUT0_TYPE dy = TO_INPUT0_TYPE(iy - top_y_index);
const ACCUMULATOR_TYPE dx = TO_ACCUMULATOR_TYPE(ix - left_x_index);
const ACCUMULATOR_TYPE dy = TO_ACCUMULATOR_TYPE(iy - top_y_index);
unroll_for(int in_f = 0; in_f < OUTPUT_FEATURE_NUM; in_f++) {
INPUT0_TYPE top_left = input[INPUT0_GET_INDEX(batch, in_f, top_y_index, left_x_index)];
@@ -131,17 +136,17 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
INPUT0_TYPE bottom_left = input[INPUT0_GET_INDEX(batch, in_f, bottom_y_index, left_x_index)];
INPUT0_TYPE bottom_right = input[INPUT0_GET_INDEX(batch, in_f, bottom_y_index, right_x_index)];
INPUT0_TYPE top = top_left + (top_right - top_left) * dx;
INPUT0_TYPE bottom = bottom_left + (bottom_right - bottom_left) * dx;
ACCUMULATOR_TYPE top = TO_ACCUMULATOR_TYPE(top_left) + (TO_ACCUMULATOR_TYPE(top_right) - TO_ACCUMULATOR_TYPE(top_left)) * dx;
ACCUMULATOR_TYPE bottom = TO_ACCUMULATOR_TYPE(bottom_left) + (TO_ACCUMULATOR_TYPE(bottom_right) - TO_ACCUMULATOR_TYPE(bottom_left)) * dx;
INPUT0_TYPE interp_val = top + (bottom - top) * dy;
ACCUMULATOR_TYPE interp_val = top + (bottom - top) * dy;
#if HAS_FUSED_OPS
#define OF_ID (in_f)
FUSED_OPS;
OUTPUT_TYPE res = FUSED_OPS_RESULT;
#else
OUTPUT_TYPE res = ACTIVATION(interp_val, ACTIVATION_PARAMS);
OUTPUT_TYPE res = TO_OUTPUT_TYPE(ACTIVATION(interp_val, ACTIVATION_PARAMS));
#endif
output[OUTPUT_GET_INDEX(batch, in_f, oy, ox)] = res;
}
@@ -158,32 +163,32 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
const int oz = (int)get_global_id(2) / OUTPUT_BATCH_NUM;
#endif
const INPUT0_TYPE ix = ox * X_RATIO + X_RATIO_HALF - 0.5f;
const INPUT0_TYPE iy = oy * Y_RATIO + Y_RATIO_HALF - 0.5f;
const INPUT0_TYPE iz = oz * Z_RATIO + Z_RATIO_HALF - 0.5f;
const ACCUMULATOR_TYPE ix = ox * X_RATIO + X_RATIO_HALF - 0.5f;
const ACCUMULATOR_TYPE iy = oy * Y_RATIO + Y_RATIO_HALF - 0.5f;
const ACCUMULATOR_TYPE iz = oz * Z_RATIO + Z_RATIO_HALF - 0.5f;
const int ix_r = (int)ix;
const int iy_r = (int)iy;
const int iz_r = (int)iz;
#if ANTIALIAS == 1
const INPUT0_TYPE ax = 1.0f / X_RATIO;
const INPUT0_TYPE ay = 1.0f / Y_RATIO;
const INPUT0_TYPE az = 1.0f / Z_RATIO;
const ACCUMULATOR_TYPE ax = 1.0f / X_RATIO;
const ACCUMULATOR_TYPE ay = 1.0f / Y_RATIO;
const ACCUMULATOR_TYPE az = 1.0f / Z_RATIO;
#else
const INPUT0_TYPE ax = 1.0f;
const INPUT0_TYPE ay = 1.0f;
const INPUT0_TYPE az = 1.0f;
const ACCUMULATOR_TYPE ax = 1.0f;
const ACCUMULATOR_TYPE ay = 1.0f;
const ACCUMULATOR_TYPE az = 1.0f;
#endif
const int rx = (X_RATIO < 1.0f) ? 2 : (int)ceil(TO_INPUT0_TYPE(KERNEL_W) / ax);
const int ry = (Y_RATIO < 1.0f) ? 2 : (int)ceil(TO_INPUT0_TYPE(KERNEL_W) / ay);
const int rz = (Z_RATIO < 1.0f) ? 2 : (int)ceil(TO_INPUT0_TYPE(KERNEL_W) / az);
const int rx = (X_RATIO < 1.0f) ? 2 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / ax);
const int ry = (Y_RATIO < 1.0f) ? 2 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / ay);
const int rz = (Z_RATIO < 1.0f) ? 2 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / az);
INPUT0_TYPE sum[FEATURE_BLOCK_SIZE];
ACCUMULATOR_TYPE sum[FEATURE_BLOCK_SIZE];
for (int i = 0; i < FEATURE_BLOCK_SIZE; i++)
sum[i] = 0;
INPUT0_TYPE wsum = 0;
ACCUMULATOR_TYPE wsum = 0;
int const y_init = max(0, iy_r - ry);
int const x_init = max(0, ix_r - rx);
@@ -195,13 +200,13 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
unroll_for(int z = z_init; z < z_max; z++) {
unroll_for(int y = y_init; y < y_max; y++) {
unroll_for(int x = x_init; x < x_max; x++) {
INPUT0_TYPE dx = ix - x;
INPUT0_TYPE dy = iy - y;
INPUT0_TYPE dz = iz - z;
ACCUMULATOR_TYPE dx = ix - x;
ACCUMULATOR_TYPE dy = iy - y;
ACCUMULATOR_TYPE dz = iz - z;
#if ANTIALIAS == 1
INPUT0_TYPE w = ax * TRIANGLE_COEFF(ax * dx) * ay * TRIANGLE_COEFF(ay * dy) * az * triangleCoeff(az * dz);
ACCUMULATOR_TYPE w = ax * TRIANGLE_COEFF(ax * dx) * ay * TRIANGLE_COEFF(ay * dy) * az * triangleCoeff(az * dz);
#else
INPUT0_TYPE w = TRIANGLE_COEFF(dx) * TRIANGLE_COEFF(dy) * TRIANGLE_COEFF(dz);
ACCUMULATOR_TYPE w = TRIANGLE_COEFF(dx) * TRIANGLE_COEFF(dy) * TRIANGLE_COEFF(dz);
#endif
#ifndef LEFTOVERS
@@ -211,7 +216,7 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
unroll_for(int f = 0; f < f_max; f++) {
#endif
if (w != 0)
sum[f] += w * input[FUNC_CALL(get_input_index)(batch, feature + f, z, y, x)];
sum[f] += w * TO_ACCUMULATOR_TYPE(input[FUNC_CALL(get_input_index)(batch, feature + f, z, y, x)]);
}
wsum += w;
}
@@ -224,13 +229,13 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
unroll_for (int f = 0; f < f_max; f++) {
#endif
INPUT0_TYPE interp_val = (wsum == 0) ? 0 : (sum[f] / wsum);
ACCUMULATOR_TYPE interp_val = (wsum == 0) ? 0 : (sum[f] / wsum);
#if HAS_FUSED_OPS
#define OF_ID (feature + f)
FUSED_OPS;
OUTPUT_TYPE res = FUSED_OPS_RESULT;
#else
OUTPUT_TYPE res = ACTIVATION(interp_val, ACTIVATION_PARAMS);
OUTPUT_TYPE res = TO_OUTPUT_TYPE(ACTIVATION(interp_val, ACTIVATION_PARAMS));
#endif
output[FUNC_CALL(get_output_index)(batch, feature + f, oz, oy, ox)] = res;
}

View File

@@ -27,6 +27,7 @@
#include "reshape_inst.h"
#include "scale_inst.h"
#include "depth_to_space_inst.h"
#include "resample_inst.h"
#include "pass_manager.h"
#include "program_helpers.h"
@@ -136,7 +137,8 @@ void prepare_buffer_fusing::run(program_impl& p) {
// todo: we need add padding support for all optimized kernels to remove this condition
if (!input->is_type<pooling>() && !input->is_type<convolution>() &&
!input->is_type<activation>() && !input->is_type<deconvolution>() &&
!input->is_type<concatenation>() && !input->is_type<crop>() && !input->is_type<scale>())
!input->is_type<concatenation>() && !input->is_type<crop>() && !input->is_type<scale>() &&
!input->is_type<resample>())
return;
// if an input is marked as network output, prevent optimizations

View File

@@ -32,6 +32,10 @@ layout resample_inst::calc_output_layout(resample_node const& node) {
auto input_layout = node.input().get_output_layout();
auto output_type = input_layout.data_type;
if ((input_layout.data_type == data_types::i8 || input_layout.data_type == data_types::u8)
&& desc->operation_type != resample_type::nearest) {
output_type = data_types::f32;
}
if (node.has_fused_primitives()) {
output_type = node.get_fused_output_layout().data_type;
}

View File

@@ -2332,6 +2332,16 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_int8_2in_act_scale_quantize_eltwise_i8
#define CASE_RESAMPLE_FP16_9 {1, 16, 4, 5}, {1, 16, 7, 8}, data_types::f16, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f16, format::bfyx
#define CASE_RESAMPLE_FP16_10 {2, 32, 4, 5}, {2, 32, 7, 8}, data_types::f16, format::fs_b_yx_fsv32, resample_type::bilinear, data_types::f16, format::bfyx
#define CASE_RESAMPLE_I8_1 {1, 16, 4, 5}, {1, 16, 2, 3}, data_types::i8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx
#define CASE_RESAMPLE_I8_2 {2, 32, 4, 5}, {2, 32, 2, 3}, data_types::i8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx
#define CASE_RESAMPLE_I8_3 {1, 16, 4, 5}, {1, 16, 2, 3}, data_types::i8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx
#define CASE_RESAMPLE_I8_4 {2, 32, 4, 5}, {2, 32, 2, 3}, data_types::i8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx
#define CASE_RESAMPLE_U8_1 {1, 16, 4, 5}, {1, 16, 2, 3}, data_types::u8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx
#define CASE_RESAMPLE_U8_2 {2, 32, 4, 5}, {2, 32, 2, 3}, data_types::u8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx
#define CASE_RESAMPLE_U8_3 {1, 16, 4, 5}, {1, 16, 2, 3}, data_types::u8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx
#define CASE_RESAMPLE_U8_4 {2, 32, 4, 5}, {2, 32, 2, 3}, data_types::u8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx
class resample_quantize : public ResamplePrimitiveFusingTest {};
TEST_P(resample_quantize, basic) {
auto p = GetParam();
@@ -2410,6 +2420,126 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_scale_activation,
resample_test_params{ CASE_RESAMPLE_FP16_8, 2, 4 },
resample_test_params{ CASE_RESAMPLE_FP16_9, 2, 4 },
resample_test_params{ CASE_RESAMPLE_FP16_10, 2, 4 },
resample_test_params{ CASE_RESAMPLE_I8_1, 2, 4 },
resample_test_params{ CASE_RESAMPLE_I8_2, 2, 4 },
resample_test_params{ CASE_RESAMPLE_I8_3, 2, 4 },
resample_test_params{ CASE_RESAMPLE_I8_4, 2, 4 },
resample_test_params{ CASE_RESAMPLE_U8_1, 2, 4 },
resample_test_params{ CASE_RESAMPLE_U8_2, 2, 4 },
resample_test_params{ CASE_RESAMPLE_U8_3, 2, 4 },
resample_test_params{ CASE_RESAMPLE_U8_4, 2, 4 },
}), );
class resample_quantize_concat : public ResamplePrimitiveFusingTest {};
TEST_P(resample_quantize_concat, along_f) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
resample("resample1", "input", p.out_shape, p.in_shape.feature[0], p.type),
data("in_lo_1", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi_1", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo_1", get_mem(get_single_element_layout(p), -128)),
data("out_hi_1", get_mem(get_single_element_layout(p), 127)),
quantize("quant1", "resample1", "in_lo_1", "in_hi_1", "out_lo_1", "out_hi_1", 256, data_types::i8),
resample("resample2", "input", p.out_shape, p.in_shape.feature[0], p.type),
data("in_lo_2", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi_2", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo_2", get_mem(get_single_element_layout(p), -127)),
data("out_hi_2", get_mem(get_single_element_layout(p), 127)),
quantize("quant2", "resample2", "in_lo_2", "in_hi_2", "out_lo_2", "out_hi_2", 255, data_types::i8),
concatenation("concat", { "quant1", "quant2" }, cldnn::concatenation::along_f),
reorder("reorder_bfyx", "concat", cldnn::format::bfyx, p.default_type)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_quantize_concat,
::testing::ValuesIn(std::vector<resample_test_params>{
resample_test_params{ CASE_RESAMPLE_FP32_1, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_2, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_3, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_4, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_5, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_6, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_7, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_8, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_9, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_1, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_2, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_3, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_4, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_5, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_6, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_7, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_8, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_9, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_10, 3, 6 },
resample_test_params{ CASE_RESAMPLE_I8_3, 3, 6 },
resample_test_params{ CASE_RESAMPLE_I8_4, 3, 6 },
resample_test_params{ CASE_RESAMPLE_U8_3, 3, 6 },
resample_test_params{ CASE_RESAMPLE_U8_4, 3, 6 },
}), );
class resample_scale_concat : public ResamplePrimitiveFusingTest {};
TEST_P(resample_scale_concat, along_f) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
resample("resample1", "input", p.out_shape, p.in_shape.feature[0], p.type),
data("scale1_scale", get_mem(get_per_channel_layout(p), -10, 10)),
data("scale1_shift", get_mem(get_per_channel_layout(p), -10, 10)),
scale("scale1", "resample1", "scale1_scale", "scale1_shift"),
resample("resample2", "input", p.out_shape, p.in_shape.feature[0], p.type),
data("scale2_scale", get_mem(get_per_channel_layout(p), -10, 10)),
data("scale2_shift", get_mem(get_per_channel_layout(p), -10, 10)),
scale("scale2", "resample2", "scale2_scale", "scale2_shift"),
concatenation("concat", { "scale1", "scale2" }, cldnn::concatenation::along_f),
reorder("reorder_bfyx", "concat", cldnn::format::bfyx, p.default_type)
);
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_scale_concat,
::testing::ValuesIn(std::vector<resample_test_params>{
resample_test_params{ CASE_RESAMPLE_FP32_1, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_2, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_3, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_4, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_5, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_6, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_7, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_8, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP32_9, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_1, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_2, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_3, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_4, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_5, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_6, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_7, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_8, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_9, 3, 6 },
resample_test_params{ CASE_RESAMPLE_FP16_10, 3, 6 },
resample_test_params{ CASE_RESAMPLE_I8_1, 3, 6},
resample_test_params{ CASE_RESAMPLE_I8_2, 3, 6},
resample_test_params{ CASE_RESAMPLE_I8_3, 3, 6},
resample_test_params{ CASE_RESAMPLE_I8_4, 3, 6},
resample_test_params{ CASE_RESAMPLE_U8_1, 3, 6 },
resample_test_params{ CASE_RESAMPLE_U8_2, 3, 6 },
resample_test_params{ CASE_RESAMPLE_U8_3, 3, 6 },
resample_test_params{ CASE_RESAMPLE_U8_4, 3, 6 },
}), );
/* ----------------------------------------------------------------------------------------------------- */

View File

@@ -530,20 +530,21 @@ struct resample_random_test_params {
tensor output_size;
uint32_t num_filter;
resample_type operation_type;
uint32_t align_corners;
format::type in_format;
format::type out_format;
};
struct resample_random_test : testing::TestWithParam<resample_random_test_params>{
template <typename T>
void fill_random_typed(memory& mem, int min, int max) {
void fill_random_typed(memory& mem, int min, int max, int k) {
auto size = mem.get_layout().size;
size_t b = size.batch[0];
size_t f = size.feature[0];
size_t x = size.spatial[0];
size_t y = size.spatial[1];
auto data = generate_random_4d<T>(b, f, y, x, min, max);
auto data = generate_random_4d<T>(b, f, y, x, min, max, k);
auto ptr = mem.pointer<T>();
for (size_t bi = 0; bi < b; ++bi) {
for (size_t fi = 0; fi < f; ++fi) {
@@ -562,16 +563,16 @@ struct resample_random_test : testing::TestWithParam<resample_random_test_params
auto dt = mem.get_layout().data_type;
switch (dt) {
case data_types::f32:
fill_random_typed<float>(mem, -127, 127);
fill_random_typed<float>(mem, -127, 127, 2);
break;
case data_types::f16:
fill_random_typed<FLOAT16>(mem, -127, 127);
fill_random_typed<FLOAT16>(mem, -127, 127, 2);
break;
case data_types::i8:
fill_random_typed<int8_t>(mem, -127, 127);
fill_random_typed<int8_t>(mem, -127, 127, 1);
break;
case data_types::u8:
fill_random_typed<uint8_t>(mem, 0, 255);
fill_random_typed<uint8_t>(mem, 0, 255, 1);
break;
default:
break;
@@ -579,14 +580,16 @@ struct resample_random_test : testing::TestWithParam<resample_random_test_params
}
template <typename T>
void compare_nearest_typed(const memory& input, const memory& output) {
void compare_nearest_typed(const memory& input, const memory& output, uint32_t align_corners) {
auto output_lay = output.get_layout();
size_t b = output_lay.size.batch[0];
size_t f = output_lay.size.feature[0];
size_t x = output_lay.size.spatial[0];
size_t y = output_lay.size.spatial[1];
float x_ratio = static_cast<float>(input.get_layout().size.spatial[0]) / static_cast<float>(x);
float y_ratio = static_cast<float>(input.get_layout().size.spatial[1]) / static_cast<float>(y);
size_t in_x = input.get_layout().size.spatial[0];
size_t in_y = input.get_layout().size.spatial[1];
float x_ratio = x > align_corners ? static_cast<float>(in_x - align_corners) / static_cast<float>(x - align_corners) : 0.f;
float y_ratio = y > align_corners ? static_cast<float>(in_y - align_corners) / static_cast<float>(y - align_corners) : 0.f;
auto in_ptr = input.pointer<T>();
auto out_ptr = output.pointer<T>();
@@ -609,17 +612,88 @@ struct resample_random_test : testing::TestWithParam<resample_random_test_params
}
}
void compare(const memory& input, const memory& output, resample_type operation) {
auto dt = output.get_layout().data_type;
template <typename InT, typename OutT>
void compare_bilinear_typed(const memory& input, const memory& output, uint32_t align_corners) {
auto output_lay = output.get_layout();
size_t b = output_lay.size.batch[0];
size_t f = output_lay.size.feature[0];
size_t x = output_lay.size.spatial[0];
size_t y = output_lay.size.spatial[1];
auto input_lay = input.get_layout();
size_t in_x = input_lay.size.spatial[0];
size_t in_y = input_lay.size.spatial[1];
float x_ratio = x > align_corners ? static_cast<float>(in_x - align_corners) / static_cast<float>(x - align_corners) : 0.f;
float y_ratio = y > align_corners ? static_cast<float>(in_y - align_corners) / static_cast<float>(y - align_corners) : 0.f;
auto in_ptr = input.pointer<InT>();
auto out_ptr = output.pointer<OutT>();
for (size_t bi = 0; bi < b; ++bi) {
for (size_t fi = 0; fi < f; ++fi) {
for (size_t yi = 0; yi < y; ++yi) {
for (size_t xi = 0; xi < x; ++xi) {
auto low_in_xi = static_cast<size_t>(floor(x_ratio * xi));
auto low_in_yi = static_cast<size_t>(floor(y_ratio * yi));
auto high_in_xi = static_cast<size_t>(ceil(x_ratio * xi));
auto high_in_yi = static_cast<size_t>(ceil(y_ratio * yi));
high_in_xi = std::min(high_in_xi, static_cast<size_t>(in_x - 1));
high_in_yi = std::min(high_in_yi, static_cast<size_t>(in_y - 1));
auto dx = x_ratio * xi - static_cast<float>(low_in_xi);
auto dy = y_ratio * yi - static_cast<float>(low_in_yi);
auto top_left_coords = tensor(batch(bi), feature(fi), spatial(low_in_xi, low_in_yi, 0, 0));
auto top_right_coords = tensor(batch(bi), feature(fi), spatial(high_in_xi, low_in_yi, 0, 0));
auto bottom_left_coords = tensor(batch(bi), feature(fi), spatial(low_in_xi, high_in_yi, 0, 0));
auto bottom_right_coords = tensor(batch(bi), feature(fi), spatial(high_in_xi, high_in_yi, 0, 0));
auto top_left_val = in_ptr[input_lay.get_linear_offset(top_left_coords)];
auto top_right_val = in_ptr[input_lay.get_linear_offset(top_right_coords)];
auto bottom_left_val = in_ptr[input_lay.get_linear_offset(bottom_left_coords)];
auto bottom_right_val = in_ptr[input_lay.get_linear_offset(bottom_right_coords)];
auto top_val = static_cast<float>(top_left_val)
+ (static_cast<float>(top_right_val) - static_cast<float>(top_left_val)) * dx;
auto bottom_val = static_cast<float>(bottom_left_val)
+ (static_cast<float>(bottom_right_val) - static_cast<float>(bottom_left_val)) * dx;
auto final_val = top_val + (bottom_val - top_val) * dy;
auto output_coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
auto output_val = out_ptr[output_lay.get_linear_offset(output_coords)];
EXPECT_NEAR(static_cast<float>(output_val), final_val, 1.e-1f)
<< " at bi=" << bi << ", fi=" << fi << ", xi=" << xi << ", yi=" << yi;
}
}
}
}
}
void compare(const memory& input, const memory& output, resample_type operation, uint32_t align_corners) {
auto dt = input.get_layout().data_type;
if (operation == resample_type::nearest) {
// Nearest resampling implicitly ignores align_corners
if (dt == data_types::f32) {
compare_nearest_typed<float>(input, output);
compare_nearest_typed<float>(input, output, 0);
} else if (dt == data_types::f16) {
compare_nearest_typed<FLOAT16>(input, output);
compare_nearest_typed<FLOAT16>(input, output, 0);
} else if (dt == data_types::i8) {
compare_nearest_typed<int8_t>(input, output);
compare_nearest_typed<int8_t>(input, output, 0);
} else if (dt == data_types::u8) {
compare_nearest_typed<uint8_t>(input, output);
compare_nearest_typed<uint8_t>(input, output, 0);
} else {
FAIL() << "Not supported data type: " << static_cast<size_t>(dt);
}
} else if (operation == resample_type::bilinear) {
if (dt == data_types::f32) {
compare_bilinear_typed<float, float>(input, output, align_corners);
} else if (dt == data_types::f16) {
compare_bilinear_typed<FLOAT16, FLOAT16>(input, output, align_corners);
} else if (dt == data_types::i8) {
compare_bilinear_typed<int8_t, float>(input, output, align_corners);
} else if (dt == data_types::u8) {
compare_bilinear_typed<uint8_t, float>(input, output, align_corners);
} else {
FAIL() << "Not supported data type: " << static_cast<size_t>(dt);
}
@@ -633,10 +707,11 @@ struct resample_random_test : testing::TestWithParam<resample_random_test_params
auto in_layout = layout(params.input_type, params.in_format, params.input_size);
auto topo = topology(
input_layout("in", in_layout),
resample("resample", "in", params.output_size, params.num_filter, params.operation_type)
);
cldnn::topology topo;
topo.add(input_layout("in", in_layout));
auto prim = resample("resample", "in", params.output_size, params.num_filter, params.operation_type);
prim.align_corners = params.align_corners;
topo.add(prim);
auto build_opts = build_options(
build_option::force_implementations({ {"resample", {params.out_format, ""}} })
@@ -650,7 +725,14 @@ struct resample_random_test : testing::TestWithParam<resample_random_test_params
auto result = net.execute();
auto output = result.at("resample").get_memory();
compare(in_mem, output, params.operation_type);
std::string kernel = "";
for (auto& info : net.get_primitives_info()) {
if (info.original_id == "resample")
kernel = info.kernel_id;
}
SCOPED_TRACE("kernel: " + kernel);
compare(in_mem, output, params.operation_type, params.align_corners);
}
};
@@ -665,10 +747,16 @@ struct resample_random_test_param_generator : std::vector<resample_random_test_p
}
resample_random_test_param_generator& smoke_params(data_types type, format::type input_format, format::type output_format) {
push_back(resample_random_test_params{ type, {1, 17, 5, 9}, {1, 17, 15, 18}, 1, resample_type::nearest, input_format, output_format });
push_back(resample_random_test_params{ type, {2, 17, 5, 9}, {2, 17, 15, 18}, 1, resample_type::nearest, input_format, output_format });
push_back(resample_random_test_params{ type, {1, 7, 10, 17}, {1, 7, 21, 35}, 1, resample_type::nearest, input_format, output_format });
push_back(resample_random_test_params{ type, {2, 7, 10, 17}, {2, 7, 21, 35}, 1, resample_type::nearest, input_format, output_format });
push_back(resample_random_test_params{ type, {1, 17, 5, 9}, {1, 17, 15, 18}, 1, resample_type::nearest, 1, input_format, output_format });
push_back(resample_random_test_params{ type, {2, 17, 5, 9}, {2, 17, 15, 18}, 1, resample_type::nearest, 1, input_format, output_format });
push_back(resample_random_test_params{ type, {1, 7, 10, 17}, {1, 7, 21, 35}, 1, resample_type::nearest, 1, input_format, output_format });
push_back(resample_random_test_params{ type, {2, 7, 10, 17}, {2, 7, 21, 35}, 1, resample_type::nearest, 1, input_format, output_format });
push_back(resample_random_test_params{ type, {1, 17, 5, 9}, {1, 17, 15, 18}, 1, resample_type::bilinear, 1, input_format, output_format });
push_back(resample_random_test_params{ type, {2, 17, 5, 9}, {2, 17, 15, 18}, 1, resample_type::bilinear, 1, input_format, output_format });
push_back(resample_random_test_params{ type, {1, 7, 10, 17}, {1, 7, 21, 35}, 1, resample_type::bilinear, 1, input_format, output_format });
push_back(resample_random_test_params{ type, {2, 7, 10, 17}, {2, 7, 21, 35}, 1, resample_type::bilinear, 1, input_format, output_format });
return *this;
}
@@ -684,4 +772,7 @@ INSTANTIATE_TEST_CASE_P(smoke,
.smoke_params(data_types::u8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
.smoke_params(data_types::i8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
.smoke_params(data_types::u8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
.smoke_params(data_types::f32, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
.smoke_params(data_types::f16, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
), );