[GPU] Support 8d tensors in activation and quantize primitives (#16947)
This commit is contained in:
parent
6663367183
commit
3bb0fb61f6
@ -160,6 +160,7 @@ public:
|
||||
const variables_state_info_map& GetVariablesStatesInfo() const { return m_variablesStateInfo; }
|
||||
|
||||
bool use_new_shape_infer() const { return allow_new_shape_infer; }
|
||||
bool requires_new_shape_infer(const ngraph::Node& op) const;
|
||||
|
||||
private:
|
||||
static factories_map_t factories_map;
|
||||
|
@ -72,7 +72,7 @@ struct activation_impl : typed_primitive_impl_ocl<activation> {
|
||||
namespace detail {
|
||||
|
||||
attach_activation_impl::attach_activation_impl() {
|
||||
auto dyn_types = {
|
||||
auto types = {
|
||||
data_types::f32,
|
||||
data_types::f16,
|
||||
data_types::i8,
|
||||
@ -83,76 +83,40 @@ attach_activation_impl::attach_activation_impl() {
|
||||
auto dyn_formats = {
|
||||
format::bfyx,
|
||||
format::bfzyx,
|
||||
format::bfwzyx
|
||||
format::bfwzyx,
|
||||
format::bfuwzyx,
|
||||
format::bfvuwzyx,
|
||||
};
|
||||
|
||||
auto static_formats = {
|
||||
format::yxfb,
|
||||
format::byxf,
|
||||
format::b_fs_yx_fsv16,
|
||||
format::b_fs_zyx_fsv16,
|
||||
format::bs_fs_zyx_bsv16_fsv16,
|
||||
format::bs_fs_yx_bsv16_fsv16,
|
||||
format::bs_fs_yx_bsv32_fsv16,
|
||||
format::bs_fs_yx_bsv32_fsv32,
|
||||
format::bfyx,
|
||||
format::bfzyx,
|
||||
format::bfwzyx,
|
||||
format::bfuwzyx,
|
||||
format::bfvuwzyx,
|
||||
};
|
||||
|
||||
auto keys = implementation_map<activation>::combine(types, static_formats);
|
||||
keys.emplace(data_types::f16, format::fs_b_yx_fsv32);
|
||||
|
||||
implementation_map<activation>::add(impl_types::ocl,
|
||||
shape_types::dynamic_shape,
|
||||
typed_primitive_impl_ocl<activation>::create<activation_impl>,
|
||||
dyn_types,
|
||||
types,
|
||||
dyn_formats);
|
||||
|
||||
implementation_map<activation>::add(impl_types::ocl, shape_types::static_shape, typed_primitive_impl_ocl<activation>::create<activation_impl>, {
|
||||
std::make_tuple(data_types::f32, format::yxfb),
|
||||
std::make_tuple(data_types::f16, format::yxfb),
|
||||
std::make_tuple(data_types::f32, format::bfyx),
|
||||
std::make_tuple(data_types::f16, format::bfyx),
|
||||
std::make_tuple(data_types::f32, format::byxf),
|
||||
std::make_tuple(data_types::f16, format::byxf),
|
||||
std::make_tuple(data_types::i8, format::yxfb),
|
||||
std::make_tuple(data_types::i8, format::bfyx),
|
||||
std::make_tuple(data_types::i8, format::byxf),
|
||||
std::make_tuple(data_types::u8, format::yxfb),
|
||||
std::make_tuple(data_types::u8, format::bfyx),
|
||||
std::make_tuple(data_types::u8, format::byxf),
|
||||
std::make_tuple(data_types::i32, format::bfyx),
|
||||
std::make_tuple(data_types::i32, format::byxf),
|
||||
std::make_tuple(data_types::i32, format::yxfb),
|
||||
// block f16 format
|
||||
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
|
||||
// 3D
|
||||
std::make_tuple(data_types::f32, format::bfzyx),
|
||||
std::make_tuple(data_types::f16, format::bfzyx),
|
||||
std::make_tuple(data_types::i8, format::bfzyx),
|
||||
std::make_tuple(data_types::i32, format::bfzyx),
|
||||
|
||||
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv16),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv16),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
|
||||
|
||||
// bfwzyx
|
||||
std::make_tuple(data_types::f32, format::bfwzyx),
|
||||
std::make_tuple(data_types::f16, format::bfwzyx),
|
||||
std::make_tuple(data_types::i32, format::bfwzyx),
|
||||
std::make_tuple(data_types::i8, format::bfwzyx),
|
||||
std::make_tuple(data_types::u8, format::bfwzyx),
|
||||
// fs_b_yx_fsv32
|
||||
std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
|
||||
});
|
||||
implementation_map<activation>::add(impl_types::ocl,
|
||||
shape_types::static_shape,
|
||||
typed_primitive_impl_ocl<activation>::create<activation_impl>,
|
||||
keys);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
@ -167,8 +167,6 @@ public:
|
||||
namespace detail {
|
||||
|
||||
attach_pooling_impl::attach_pooling_impl() {
|
||||
std::set<implementation_map<pooling>::key_type> keys;
|
||||
|
||||
auto types = { data_types::f16, data_types::f32, data_types::i8, data_types::u8 };
|
||||
auto formats = { format::bfyx,
|
||||
format::byxf,
|
||||
@ -189,12 +187,7 @@ attach_pooling_impl::attach_pooling_impl() {
|
||||
format::bs_fs_zyx_bsv32_fsv16,
|
||||
format::bs_fs_zyx_bsv32_fsv32 };
|
||||
|
||||
for (const auto type : types) {
|
||||
for (const auto format : formats) {
|
||||
keys.emplace(type, format);
|
||||
}
|
||||
}
|
||||
|
||||
auto keys = implementation_map<pooling>::combine(types, formats);
|
||||
keys.emplace(data_types::f16, format::fs_b_yx_fsv32);
|
||||
keys.emplace(data_types::f32, format::fs_b_yx_fsv32);
|
||||
|
||||
|
@ -98,8 +98,6 @@ public:
|
||||
namespace detail {
|
||||
|
||||
attach_quantize_impl::attach_quantize_impl() {
|
||||
std::set<implementation_map<quantize>::key_type> keys;
|
||||
|
||||
auto types = {
|
||||
data_types::f16,
|
||||
data_types::f32,
|
||||
@ -127,21 +125,20 @@ attach_quantize_impl::attach_quantize_impl() {
|
||||
format::bs_fs_zyx_bsv32_fsv16,
|
||||
format::bs_fs_zyx_bsv32_fsv32,
|
||||
|
||||
format::bfwzyx
|
||||
format::bfwzyx,
|
||||
format::bfuwzyx,
|
||||
format::bfvuwzyx,
|
||||
};
|
||||
|
||||
auto dyn_formats = {
|
||||
format::bfyx,
|
||||
format::bfzyx,
|
||||
format::bfwzyx
|
||||
format::bfwzyx,
|
||||
format::bfuwzyx,
|
||||
format::bfvuwzyx,
|
||||
};
|
||||
|
||||
for (const auto type : types) {
|
||||
for (const auto format : formats) {
|
||||
keys.emplace(type, format);
|
||||
}
|
||||
}
|
||||
|
||||
auto keys = implementation_map<quantize>::combine(types, formats);
|
||||
keys.emplace(data_types::f16, format::yxfb);
|
||||
keys.emplace(data_types::f32, format::yxfb);
|
||||
|
||||
|
@ -120,7 +120,6 @@ public:
|
||||
map_type::instance().insert({{impl_type, shape_type}, {keys, factory}});
|
||||
}
|
||||
|
||||
private:
|
||||
static std::set<key_type> combine(const std::vector<data_types>& types, const std::vector<format::type>& formats) {
|
||||
std::set<key_type> keys;
|
||||
for (const auto& type : types) {
|
||||
|
@ -23,16 +23,40 @@ KERNEL(activation)(
|
||||
#endif
|
||||
)
|
||||
{
|
||||
#if OUTPUT_DIMS == 5
|
||||
#if OUTPUT_DIMS == 8
|
||||
#define ORDER batch,feature,v,u,w,z,y,x
|
||||
#elif OUTPUT_DIMS == 7
|
||||
#define ORDER batch,feature,u,w,z,y,x
|
||||
#elif OUTPUT_DIMS == 6
|
||||
#define ORDER batch,feature,w,z,y,x
|
||||
#elif OUTPUT_DIMS == 5
|
||||
#define ORDER batch,feature,z,y,x
|
||||
#elif OUTPUT_DIMS == 4
|
||||
#define ORDER batch,feature,y,x
|
||||
#endif
|
||||
|
||||
#if OUTPUT_DIMS == 5
|
||||
const unsigned x = get_global_id(0);
|
||||
#if OUTPUT_DIMS >= 5
|
||||
|
||||
#if OUTPUT_DIMS == 8
|
||||
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
|
||||
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
|
||||
const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
|
||||
const uint u = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U;
|
||||
const uint v = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U;
|
||||
#elif OUTPUT_DIMS == 7
|
||||
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
|
||||
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
|
||||
const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
|
||||
const uint u = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W;
|
||||
#elif OUTPUT_DIMS == 6
|
||||
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
|
||||
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
|
||||
const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z;
|
||||
#elif OUTPUT_DIMS == 5
|
||||
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
|
||||
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y;
|
||||
#endif
|
||||
const unsigned x = get_global_id(0);
|
||||
#if OUTPUT_BATCH_NUM_CONST == 1
|
||||
const unsigned feature = (uint)get_global_id(2);
|
||||
const unsigned batch = 0;
|
||||
@ -40,7 +64,7 @@ KERNEL(activation)(
|
||||
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
|
||||
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
|
||||
#endif
|
||||
#else
|
||||
#elif OUTPUT_DIMS <= 4
|
||||
#if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16 || defined OUTPUT_LAYOUT_B_FS_YX_FSV32
|
||||
const unsigned x = (uint)get_global_id(1);
|
||||
const unsigned y = (uint)get_global_id(2);
|
||||
|
@ -35,6 +35,21 @@ KERNEL(quantize_ref)(
|
||||
const int y = (wzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
|
||||
const int z = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
|
||||
const int w = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z;
|
||||
#elif OUTPUT_DIMS == 7
|
||||
const int uwzyx = get_global_id(2);
|
||||
const int x = uwzyx % OUTPUT_SIZE_X;
|
||||
const int y = (uwzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
|
||||
const int z = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
|
||||
const int w = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
|
||||
const int u = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W;
|
||||
#elif OUTPUT_DIMS == 8
|
||||
const int vuwzyx = get_global_id(2);
|
||||
const int x = vuwzyx % OUTPUT_SIZE_X;
|
||||
const int y = (vuwzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
|
||||
const int z = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
|
||||
const int w = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
|
||||
const int u = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U;
|
||||
const int v = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U;
|
||||
#endif
|
||||
|
||||
#if PACKED_BINARY_OUTPUT
|
||||
@ -73,7 +88,11 @@ KERNEL(quantize_ref)(
|
||||
|
||||
#else
|
||||
|
||||
#if INPUT0_DIMS == 6
|
||||
#if INPUT0_DIMS == 8
|
||||
const int input_offset = INPUT0_GET_INDEX(b, of, v, u, w, z, y, x);
|
||||
#elif INPUT0_DIMS == 7
|
||||
const int input_offset = INPUT0_GET_INDEX(b, of, u, w, z, y, x);
|
||||
#elif INPUT0_DIMS == 6
|
||||
const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x);
|
||||
#elif INPUT0_DIMS == 5
|
||||
const int input_offset = INPUT0_GET_INDEX(b, of, z, y, x);
|
||||
@ -81,7 +100,11 @@ KERNEL(quantize_ref)(
|
||||
const int input_offset = INPUT0_GET_INDEX(b, of, y, x);
|
||||
#endif
|
||||
|
||||
#if OUTPUT_DIMS == 6
|
||||
#if OUTPUT_DIMS == 8
|
||||
const int output_offset = OUTPUT_GET_INDEX(b, of, v, u, w, z, y, x);
|
||||
#elif OUTPUT_DIMS == 7
|
||||
const int output_offset = OUTPUT_GET_INDEX(b, of, u, w, z, y, x);
|
||||
#elif OUTPUT_DIMS == 6
|
||||
const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x);
|
||||
#elif OUTPUT_DIMS == 5
|
||||
const int output_offset = OUTPUT_GET_INDEX(b, of, z, y, x);
|
||||
@ -89,7 +112,11 @@ KERNEL(quantize_ref)(
|
||||
const int output_offset = OUTPUT_GET_INDEX(b, of, y, x);
|
||||
#endif
|
||||
|
||||
#if INPUT1_DIMS == 6
|
||||
#if INPUT1_DIMS == 8
|
||||
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
|
||||
#elif INPUT1_DIMS == 7
|
||||
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, u, w, z, y, x);
|
||||
#elif INPUT1_DIMS == 6
|
||||
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, w, z, y, x);
|
||||
#elif INPUT1_DIMS == 5
|
||||
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, z, y, x);
|
||||
@ -97,7 +124,11 @@ KERNEL(quantize_ref)(
|
||||
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, y, x);
|
||||
#endif
|
||||
|
||||
#if INPUT2_DIMS == 6
|
||||
#if INPUT2_DIMS == 8
|
||||
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
|
||||
#elif INPUT2_DIMS == 7
|
||||
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, u, w, z, y, x);
|
||||
#elif INPUT2_DIMS == 6
|
||||
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, w, z, y, x);
|
||||
#elif INPUT2_DIMS == 5
|
||||
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, z, y, x);
|
||||
@ -105,7 +136,11 @@ KERNEL(quantize_ref)(
|
||||
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, y, x);
|
||||
#endif
|
||||
|
||||
#if INPUT3_DIMS == 6
|
||||
#if INPUT3_DIMS == 8
|
||||
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
|
||||
#elif INPUT3_DIMS == 7
|
||||
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, u, w, z, y, x);
|
||||
#elif INPUT3_DIMS == 6
|
||||
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, w, z, y, x);
|
||||
#elif INPUT3_DIMS == 5
|
||||
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, z, y, x);
|
||||
@ -113,7 +148,11 @@ KERNEL(quantize_ref)(
|
||||
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, y, x);
|
||||
#endif
|
||||
|
||||
#if INPUT4_DIMS == 6
|
||||
#if INPUT4_DIMS == 8
|
||||
const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
|
||||
#elif INPUT4_DIMS == 7
|
||||
const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, u, w, z, y, x);
|
||||
#elif INPUT4_DIMS == 6
|
||||
const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, w, z, y, x);
|
||||
#elif INPUT4_DIMS == 5
|
||||
const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, z, y, x);
|
||||
|
@ -53,6 +53,27 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
|
||||
const int w = wzyx_div_xy / OUTPUT_SIZE_Z;
|
||||
|
||||
const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x);
|
||||
#elif OUTPUT_DIMS == 7
|
||||
const int uwzyx = get_global_id(GWS_YX);
|
||||
|
||||
const int x = uwzyx % OUTPUT_SIZE_X;
|
||||
const int y = uwzyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y;
|
||||
const int z = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
|
||||
const int w = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
|
||||
const int u = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W;
|
||||
|
||||
const int output_offset = OUTPUT_GET_INDEX(b, of, u, w, z, y, x);
|
||||
#elif OUTPUT_DIMS == 8
|
||||
const int vuwzyx = get_global_id(GWS_YX);
|
||||
|
||||
const int x = vuwzyx % OUTPUT_SIZE_X;
|
||||
const int y = vuwzyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y;
|
||||
const int z = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
|
||||
const int w = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
|
||||
const int u = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U;
|
||||
const int v = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U;
|
||||
|
||||
const int output_offset = OUTPUT_GET_INDEX(b, of, v, u, w, z, y, x);
|
||||
#else
|
||||
# error quantize_gpu_scale_shift_opt.cl: output tensors with more than 6 dimensions are unsupported
|
||||
#endif
|
||||
@ -63,6 +84,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
|
||||
const int input_offset = INPUT0_GET_INDEX(b, of, z, y, x);
|
||||
#elif INPUT0_DIMS == 6
|
||||
const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x);
|
||||
#elif INPUT0_DIMS == 7
|
||||
const int input_offset = INPUT0_GET_INDEX(b, of, u, w, z, y, x);
|
||||
#elif INPUT0_DIMS == 8
|
||||
const int input_offset = INPUT0_GET_INDEX(b, of, v, u, w, z, y, x);
|
||||
#else
|
||||
# error quantize_gpu_scale_shift_opt.cl: input tensors with more than 6 dimensions are unsupported
|
||||
#endif
|
||||
@ -74,6 +99,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
|
||||
const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, z, y, x);
|
||||
#elif INPUT1_DIMS == 6
|
||||
const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, w, z, y, x);
|
||||
#elif INPUT1_DIMS == 7
|
||||
const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, u, w, z, y, x);
|
||||
#elif INPUT1_DIMS == 8
|
||||
const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
|
||||
#else
|
||||
# error quantize_gpu_scale_shift_opt.cl: unsupported INPUT1_DIMS size
|
||||
#endif
|
||||
@ -85,6 +114,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
|
||||
const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, z, y, x);
|
||||
#elif INPUT7_DIMS == 6
|
||||
const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, w, z, y, x);
|
||||
#elif INPUT7_DIMS == 7
|
||||
const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, u, w, z, y, x);
|
||||
#elif INPUT7_DIMS == 8
|
||||
const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
|
||||
#else
|
||||
# error quantize_gpu_scale_shift_opt.cl: unsupported INPUT7_DIMS size
|
||||
#endif
|
||||
|
@ -24,13 +24,14 @@ ActivationKernelBase::DispatchData ActivationKernelBase::SetDefault(const activa
|
||||
} else if (out_layout == DataLayout::b_fs_yx_fsv16 || out_layout == DataLayout::b_fs_yx_fsv32) {
|
||||
dispatchData.gws = {Align(out.Feature().v, 16) * out.Batch().v, out.X().v, out.Y().v};
|
||||
dispatchData.lws = {16, 1, 1};
|
||||
} else if (out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 || out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32) {
|
||||
} else if (out_layout == DataLayout::bs_fs_yx_bsv32_fsv16 || out_layout == DataLayout::bs_fs_yx_bsv32_fsv32) {
|
||||
dispatchData.gws = {out.X().v * out.Y().v, Align(out.Feature().v, 16), Align(out.Batch().v, 16)};
|
||||
dispatchData.lws = {1, 16, 16};
|
||||
} else {
|
||||
dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
|
||||
dispatchData.gws = {out.X().v, out.Y().v * out.Z().v * out.W().v * out.U().v * out.V().v, out.Feature().v * out.Batch().v};
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
|
||||
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z, Tensor::DataChannelName::W,
|
||||
Tensor::DataChannelName::U, Tensor::DataChannelName::V},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
}
|
||||
|
@ -81,6 +81,9 @@ bool ActivationKernelOpt::Validate(const Params& p, const optional_params& o) co
|
||||
return false;
|
||||
}
|
||||
|
||||
if (params.outputs[0].GetDims().size() > 5)
|
||||
return false;
|
||||
|
||||
if (params.outputs[0].GetLayout() != params.inputs[0].GetLayout())
|
||||
return false;
|
||||
|
||||
|
@ -45,7 +45,7 @@ CommonDispatchData QuantizeKernelRef::SetDefault(const quantize_params& params)
|
||||
} else {
|
||||
dispatchData.gws[0] = output.Batch().v;
|
||||
dispatchData.gws[1] = params.packed_binary_output ? CeilDiv(output.Feature().v, 32) : output.Feature().v;
|
||||
dispatchData.gws[2] = Align(output.X().v * output.Y().v * output.Z().v * output.W().v, 16);
|
||||
dispatchData.gws[2] = Align(output.X().v * output.Y().v * output.Z().v * output.W().v * output.U().v * output.V().v, 16);
|
||||
|
||||
dispatchData.lws[0] = 1;
|
||||
dispatchData.lws[1] = 1;
|
||||
|
@ -356,22 +356,11 @@ std::shared_ptr<cldnn::program> Program::BuildProgram(const std::vector<std::sha
|
||||
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Program::BuildProgram");
|
||||
|
||||
for (const auto& op : ops) {
|
||||
if (op->is_dynamic()) {
|
||||
if (requires_new_shape_infer(*op)) {
|
||||
allow_new_shape_infer = true;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < op->get_output_size(); i++) {
|
||||
if (op->get_output_partial_shape(i).size() > 6)
|
||||
allow_new_shape_infer = true;
|
||||
}
|
||||
for (size_t i = 0; i < op->get_input_size(); i++) {
|
||||
if (op->get_input_partial_shape(i).size() > 6)
|
||||
allow_new_shape_infer = true;
|
||||
}
|
||||
|
||||
if (allow_new_shape_infer)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
m_config.set_property(ov::intel_gpu::partial_build_program(partialBuild));
|
||||
m_config.set_property(ov::intel_gpu::optimize_data(true));
|
||||
@ -415,6 +404,7 @@ bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const st
|
||||
// 2. We also check parameters of each operation, which means we have more
|
||||
// reliable results of QueryNetwork call.
|
||||
PrepareBuild(network.getInputsInfo(), network.getOutputsInfo());
|
||||
allow_new_shape_infer = requires_new_shape_infer(*op);
|
||||
CreateSingleLayerPrimitive(topology, op);
|
||||
CleanupBuild();
|
||||
DisableQueryMode();
|
||||
@ -540,6 +530,24 @@ void Program::add_primitive(const ngraph::Node& op, std::shared_ptr<cldnn::primi
|
||||
m_topology->add_primitive(prim);
|
||||
}
|
||||
|
||||
bool Program::requires_new_shape_infer(const ngraph::Node& op) const {
|
||||
if (op.is_dynamic()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < op.get_output_size(); i++) {
|
||||
if (op.get_output_partial_shape(i).size() > 6)
|
||||
return true;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < op.get_input_size(); i++) {
|
||||
if (op.get_input_partial_shape(i).size() > 6)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: Does it make sense to add such method to ngraph core?
|
||||
bool IsNodeOnConstPath(const std::shared_ptr<ngraph::Node>& node) {
|
||||
std::set<std::shared_ptr<ngraph::Node>> nodes_processed = {};
|
||||
|
@ -17,7 +17,7 @@ using namespace ::tests;
|
||||
|
||||
namespace {
|
||||
struct activation_test_params {
|
||||
tensor input_size;
|
||||
ov::PartialShape input_size;
|
||||
data_types input_type;
|
||||
format input_format;
|
||||
data_types default_type;
|
||||
@ -46,11 +46,11 @@ public:
|
||||
}
|
||||
|
||||
layout get_input_layout(activation_test_params& p) {
|
||||
return layout{ p.input_type, p.input_format, p.input_size };
|
||||
return layout{ p.input_size, p.input_type, p.input_format, };
|
||||
}
|
||||
|
||||
layout get_per_channel_layout(activation_test_params& p) {
|
||||
return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } };
|
||||
return layout{ { 1, p.input_size[1], 1, 1 }, p.default_type, p.default_format };
|
||||
}
|
||||
|
||||
format get_input_format(activation_test_params &p) { return p.input_format; }
|
||||
@ -70,6 +70,7 @@ public:
|
||||
#define CASE_ACTIVATION_F32_5 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
|
||||
#define CASE_ACTIVATION_F32_6 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
|
||||
#define CASE_ACTIVATION_F32_7 { 1, 17, 31, 29 }, data_types::f32, format::fyxb, data_types::f32, format::bfyx
|
||||
#define CASE_ACTIVATION_F32_8 { 1, 2, 3, 4, 5, 3, 2, 3 }, data_types::f32, format::bfvuwzyx, data_types::f32, format::bfvuwzyx
|
||||
#define CASE_ACTIVATION_3D_F32_0 { 3, 16, 13, 13, 13 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
|
||||
#define CASE_ACTIVATION_3D_F32_1 { 2, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
|
||||
#define CASE_ACTIVATION_3D_F32_2 { 1, 16, 7, 7, 7 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx
|
||||
@ -221,6 +222,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_eltwise_activation_quantize_u8,
|
||||
activation_test_params{ CASE_ACTIVATION_3D_F32_0, 3, 5, "activation_ref" },
|
||||
activation_test_params{ CASE_ACTIVATION_3D_F32_1, 3, 5, "activation_ref" },
|
||||
activation_test_params{ CASE_ACTIVATION_3D_F32_2, 3, 5, "activation_ref" },
|
||||
activation_test_params{ CASE_ACTIVATION_F32_8, 3, 5, "activation_ref" },
|
||||
}));
|
||||
|
||||
class activation_eltwise_activation_quantize_u8_onendnn : public ActivationFusingTest {};
|
||||
|
@ -17,7 +17,7 @@ using namespace ::tests;
|
||||
|
||||
namespace {
|
||||
struct eltwise_test_params {
|
||||
tensor input_size;
|
||||
ov::PartialShape input_size;
|
||||
data_types input_type;
|
||||
data_types input_type2;
|
||||
format input_format;
|
||||
@ -49,15 +49,15 @@ public:
|
||||
}
|
||||
|
||||
layout get_input_layout(eltwise_test_params& p) {
|
||||
return layout{ p.input_type, p.input_format, p.input_size };
|
||||
return layout{ p.input_size, p.input_type, p.input_format };
|
||||
}
|
||||
|
||||
layout get_input_layout2(eltwise_test_params& p) {
|
||||
return layout{ p.input_type2, p.input_format, p.input_size };
|
||||
return layout{ p.input_size, p.input_type2, p.input_format };
|
||||
}
|
||||
|
||||
layout get_per_channel_layout(eltwise_test_params& p) {
|
||||
return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } };
|
||||
return layout{ { 1, p.input_size[1], 1, 1 }, p.default_type, p.default_format };
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
@ -105,6 +105,8 @@ public:
|
||||
#define CASE_ELTWISE_I8_4 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum
|
||||
#define CASE_ELTWISE_U8_4 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum
|
||||
|
||||
#define CASE_ELTWISE_FP16_7 { 3, 32, 2, 3, 3, 2, 1, 2 }, data_types::f16, data_types::f16, format::bfvuwzyx, data_types::f16, format::bfvuwzyx, eltwise_mode::sum
|
||||
|
||||
class eltwise_quantize : public EltwiseFusingTest {};
|
||||
TEST_P(eltwise_quantize, u8) {
|
||||
auto p = GetParam();
|
||||
@ -412,6 +414,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fused_prims, ::testing::Value
|
||||
eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 5 },
|
||||
eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 },
|
||||
eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 5 },
|
||||
eltwise_test_params{ CASE_ELTWISE_FP16_7, 3, 5 },
|
||||
eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 },
|
||||
eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 },
|
||||
eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 },
|
||||
|
@ -63,6 +63,16 @@ const std::map<ActivationTypes, std::vector<std::vector<float>>> activationTypes
|
||||
{SoftSign, {}},
|
||||
};
|
||||
|
||||
const std::map<ActivationTypes, std::vector<std::vector<float>>> big_rank_activation_types = {
|
||||
{Relu, {}},
|
||||
{Exp, {}},
|
||||
{Log, {}},
|
||||
{Abs, {}},
|
||||
{Clamp, {{-2.0f, 2.0f}}},
|
||||
{Ceiling, {}},
|
||||
{Swish, {{0.5f}}},
|
||||
};
|
||||
|
||||
const std::map<ActivationTypes, std::vector<std::vector<float>>> activationParamTypes = {
|
||||
{PReLu, {{-0.01f}}},
|
||||
{LeakyRelu, {{0.01f}}}
|
||||
@ -73,12 +83,19 @@ std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> basic = {
|
||||
{{1, 128}, {{}}},
|
||||
};
|
||||
|
||||
std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> big_ranks = {
|
||||
{{1, 2, 3, 4, 5, 3}, {{}}},
|
||||
{{1, 2, 3, 4, 1, 3, 2}, {{}}},
|
||||
{{1, 2, 3, 4, 3, 2, 1, 2}, {{}}},
|
||||
};
|
||||
|
||||
std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> preluBasic = {
|
||||
{{1, 10, 20}, {{10}, {20}, {10, 20}}},
|
||||
{{1, 128}, {{1}, {128}}},
|
||||
};
|
||||
|
||||
const auto basicCases = ::testing::Combine(
|
||||
const auto basicCases = []() {
|
||||
return ::testing::Combine(
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
@ -86,10 +103,11 @@ const auto basicCases = ::testing::Combine(
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(basic)),
|
||||
::testing::Values(CommonTestUtils::DEVICE_GPU)
|
||||
);
|
||||
::testing::Values(CommonTestUtils::DEVICE_GPU));
|
||||
};
|
||||
|
||||
const auto basicPreluCases = ::testing::Combine(
|
||||
const auto basicPreluCases = []() {
|
||||
return ::testing::Combine(
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(activationParamTypes)),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
@ -97,13 +115,27 @@ const auto basicPreluCases = ::testing::Combine(
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(preluBasic)),
|
||||
::testing::Values(CommonTestUtils::DEVICE_GPU)
|
||||
);
|
||||
::testing::Values(CommonTestUtils::DEVICE_GPU));
|
||||
};
|
||||
|
||||
const auto big_rank_cases = []() {
|
||||
return ::testing::Combine(
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(big_rank_activation_types)),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::Values(InferenceEngine::Layout::ANY),
|
||||
::testing::ValuesIn(CommonTestUtils::combineParams(big_ranks)),
|
||||
::testing::Values(CommonTestUtils::DEVICE_GPU));
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationLayerTest, basicCases, ActivationLayerTest::getTestCaseName);
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic_Prelu, ActivationLayerTest, basicPreluCases, ActivationLayerTest::getTestCaseName);
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationLayerTest, basicCases(), ActivationLayerTest::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationParamLayerTest, basicPreluCases, ActivationLayerTest::getTestCaseName);
|
||||
INSTANTIATE_TEST_SUITE_P(Activation_BigRanks, ActivationLayerTest, big_rank_cases(), ActivationLayerTest::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic_Prelu, ActivationLayerTest, basicPreluCases(), ActivationLayerTest::getTestCaseName);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationParamLayerTest, basicPreluCases(), ActivationLayerTest::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
|
@ -16,7 +16,7 @@ const std::vector<InferenceEngine::Precision> netPrecisions = {
|
||||
InferenceEngine::Precision::FP16
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}};
|
||||
const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}, {1, 2, 3, 4, 2, 3, 2, 2}};
|
||||
const std::vector<std::vector<size_t>> constShapes = {{1}};
|
||||
const std::vector<size_t> levels = {16, 255, 256};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user