[GPU] Support 8d tensors in activation and quantize primitives (#16947)

Vladimir Paramuzov 2023-04-25 09:02:54 +04:00 committed by GitHub
parent 6663367183
commit 3bb0fb61f6
18 changed files with 228 additions and 129 deletions

View File

@@ -160,6 +160,7 @@ public:
const variables_state_info_map& GetVariablesStatesInfo() const { return m_variablesStateInfo; }
bool use_new_shape_infer() const { return allow_new_shape_infer; }
bool requires_new_shape_infer(const ngraph::Node& op) const;
private:
static factories_map_t factories_map;

View File

@@ -72,7 +72,7 @@ struct activation_impl : typed_primitive_impl_ocl<activation> {
namespace detail {
attach_activation_impl::attach_activation_impl() {
auto dyn_types = {
auto types = {
data_types::f32,
data_types::f16,
data_types::i8,
@@ -83,76 +83,40 @@ attach_activation_impl::attach_activation_impl() {
auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
format::bfwzyx,
format::bfuwzyx,
format::bfvuwzyx,
};
auto static_formats = {
format::yxfb,
format::byxf,
format::b_fs_yx_fsv16,
format::b_fs_zyx_fsv16,
format::bs_fs_zyx_bsv16_fsv16,
format::bs_fs_yx_bsv16_fsv16,
format::bs_fs_yx_bsv32_fsv16,
format::bs_fs_yx_bsv32_fsv32,
format::bfyx,
format::bfzyx,
format::bfwzyx,
format::bfuwzyx,
format::bfvuwzyx,
};
auto keys = implementation_map<activation>::combine(types, static_formats);
keys.emplace(data_types::f16, format::fs_b_yx_fsv32);
implementation_map<activation>::add(impl_types::ocl,
shape_types::dynamic_shape,
typed_primitive_impl_ocl<activation>::create<activation_impl>,
dyn_types,
types,
dyn_formats);
implementation_map<activation>::add(impl_types::ocl, shape_types::static_shape, typed_primitive_impl_ocl<activation>::create<activation_impl>, {
std::make_tuple(data_types::f32, format::yxfb),
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::i8, format::yxfb),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::i8, format::byxf),
std::make_tuple(data_types::u8, format::yxfb),
std::make_tuple(data_types::u8, format::bfyx),
std::make_tuple(data_types::u8, format::byxf),
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i32, format::byxf),
std::make_tuple(data_types::i32, format::yxfb),
// block f16 format
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
// 3D
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
// bfwzyx
std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i8, format::bfwzyx),
std::make_tuple(data_types::u8, format::bfwzyx),
// fs_b_yx_fsv32
std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
});
implementation_map<activation>::add(impl_types::ocl,
shape_types::static_shape,
typed_primitive_impl_ocl<activation>::create<activation_impl>,
keys);
}
} // namespace detail
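The registration above swaps the long list of std::make_tuple(type, format) pairs for implementation_map<activation>::combine(types, static_formats), with a single emplace left for the f16 + fs_b_yx_fsv32 special case. Below is a minimal standalone sketch of what such a cross-product helper produces; the enums, key_type alias and combine signature here are illustrative stand-ins, not the real cldnn declarations (the actual static combine() is visible in the implementation_map hunk further down).

// Standalone sketch (assumed semantics, not the cldnn helper): build one
// (data type, format) key per combination, then hand-add special cases.
#include <set>
#include <tuple>
#include <vector>

enum class data_types { f16, f32, i8, u8, i32 };
enum class fmt { bfyx, bfzyx, bfwzyx, bfuwzyx, bfvuwzyx, fs_b_yx_fsv32 };
using key_type = std::tuple<data_types, fmt>;

std::set<key_type> combine(const std::vector<data_types>& types,
                           const std::vector<fmt>& formats) {
    std::set<key_type> keys;
    for (auto t : types)
        for (auto f : formats)
            keys.emplace(t, f);  // one key per (type, format) pair
    return keys;
}

int main() {
    auto keys = combine({data_types::f32, data_types::f16},
                        {fmt::bfwzyx, fmt::bfuwzyx, fmt::bfvuwzyx});
    // One-off combinations are still added by hand, as the activation and
    // pooling registrations do for fs_b_yx_fsv32:
    keys.emplace(data_types::f16, fmt::fs_b_yx_fsv32);
    return keys.size() == 7 ? 0 : 1;  // 2 * 3 combined keys + 1 emplaced
}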

View File

@@ -167,8 +167,6 @@ public:
namespace detail {
attach_pooling_impl::attach_pooling_impl() {
std::set<implementation_map<pooling>::key_type> keys;
auto types = { data_types::f16, data_types::f32, data_types::i8, data_types::u8 };
auto formats = { format::bfyx,
format::byxf,
@@ -189,12 +187,7 @@ attach_pooling_impl::attach_pooling_impl() {
format::bs_fs_zyx_bsv32_fsv16,
format::bs_fs_zyx_bsv32_fsv32 };
for (const auto type : types) {
for (const auto format : formats) {
keys.emplace(type, format);
}
}
auto keys = implementation_map<pooling>::combine(types, formats);
keys.emplace(data_types::f16, format::fs_b_yx_fsv32);
keys.emplace(data_types::f32, format::fs_b_yx_fsv32);

View File

@@ -98,8 +98,6 @@ public:
namespace detail {
attach_quantize_impl::attach_quantize_impl() {
std::set<implementation_map<quantize>::key_type> keys;
auto types = {
data_types::f16,
data_types::f32,
@@ -127,21 +125,20 @@ attach_quantize_impl::attach_quantize_impl() {
format::bs_fs_zyx_bsv32_fsv16,
format::bs_fs_zyx_bsv32_fsv32,
format::bfwzyx
format::bfwzyx,
format::bfuwzyx,
format::bfvuwzyx,
};
auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
format::bfwzyx,
format::bfuwzyx,
format::bfvuwzyx,
};
for (const auto type : types) {
for (const auto format : formats) {
keys.emplace(type, format);
}
}
auto keys = implementation_map<quantize>::combine(types, formats);
keys.emplace(data_types::f16, format::yxfb);
keys.emplace(data_types::f32, format::yxfb);

View File

@@ -120,7 +120,6 @@ public:
map_type::instance().insert({{impl_type, shape_type}, {keys, factory}});
}
private:
static std::set<key_type> combine(const std::vector<data_types>& types, const std::vector<format::type>& formats) {
std::set<key_type> keys;
for (const auto& type : types) {

View File

@@ -23,16 +23,40 @@ KERNEL(activation)(
#endif
)
{
#if OUTPUT_DIMS == 5
#if OUTPUT_DIMS == 8
#define ORDER batch,feature,v,u,w,z,y,x
#elif OUTPUT_DIMS == 7
#define ORDER batch,feature,u,w,z,y,x
#elif OUTPUT_DIMS == 6
#define ORDER batch,feature,w,z,y,x
#elif OUTPUT_DIMS == 5
#define ORDER batch,feature,z,y,x
#elif OUTPUT_DIMS == 4
#define ORDER batch,feature,y,x
#endif
#if OUTPUT_DIMS == 5
const unsigned x = get_global_id(0);
#if OUTPUT_DIMS >= 5
#if OUTPUT_DIMS == 8
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
const uint u = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U;
const uint v = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U;
#elif OUTPUT_DIMS == 7
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
const uint u = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W;
#elif OUTPUT_DIMS == 6
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z;
#elif OUTPUT_DIMS == 5
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y;
#endif
const unsigned x = get_global_id(0);
#if OUTPUT_BATCH_NUM_CONST == 1
const unsigned feature = (uint)get_global_id(2);
const unsigned batch = 0;
@@ -40,7 +64,7 @@ KERNEL(activation)(
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
#endif
#else
#elif OUTPUT_DIMS <= 4
#if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16 || defined OUTPUT_LAYOUT_B_FS_YX_FSV32
const unsigned x = (uint)get_global_id(1);
const unsigned y = (uint)get_global_id(2);
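In the reference activation kernel above, every spatial axis except X is folded into get_global_id(1) and recovered with a mod/div chain, with Y varying fastest. The host-side C++ sketch below checks that the 8-d unflattening round-trips; the sizes are arbitrary stand-ins for OUTPUT_SIZE_Y/Z/W/U/V, not values taken from the kernel.

// Host-side round-trip check of the 8-d unflattening used by the kernel:
// gid1 packs y, z, w, u, v with y as the fastest-changing axis.
#include <cassert>
#include <cstdint>

int main() {
    const uint32_t SY = 3, SZ = 4, SW = 2, SU = 5, SV = 2;  // made-up sizes
    for (uint32_t gid1 = 0; gid1 < SY * SZ * SW * SU * SV; ++gid1) {
        const uint32_t y = gid1 % SY;
        const uint32_t z = gid1 / SY % SZ;
        const uint32_t w = gid1 / SY / SZ % SW;
        const uint32_t u = gid1 / SY / SZ / SW % SU;
        const uint32_t v = gid1 / SY / SZ / SW / SU;
        // Re-flatten and verify the round trip is exact.
        const uint32_t back = ((((v * SU + u) * SW + w) * SZ + z) * SY + y);
        assert(back == gid1);
    }
    return 0;
}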

View File

@@ -35,6 +35,21 @@ KERNEL(quantize_ref)(
const int y = (wzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
const int z = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
const int w = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z;
#elif OUTPUT_DIMS == 7
const int uwzyx = get_global_id(2);
const int x = uwzyx % OUTPUT_SIZE_X;
const int y = (uwzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
const int z = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
const int w = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
const int u = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W;
#elif OUTPUT_DIMS == 8
const int vuwzyx = get_global_id(2);
const int x = vuwzyx % OUTPUT_SIZE_X;
const int y = (vuwzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
const int z = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
const int w = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
const int u = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U;
const int v = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U;
#endif
#if PACKED_BINARY_OUTPUT
@@ -73,7 +88,11 @@ KERNEL(quantize_ref)(
#else
#if INPUT0_DIMS == 6
#if INPUT0_DIMS == 8
const int input_offset = INPUT0_GET_INDEX(b, of, v, u, w, z, y, x);
#elif INPUT0_DIMS == 7
const int input_offset = INPUT0_GET_INDEX(b, of, u, w, z, y, x);
#elif INPUT0_DIMS == 6
const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x);
#elif INPUT0_DIMS == 5
const int input_offset = INPUT0_GET_INDEX(b, of, z, y, x);
@@ -81,7 +100,11 @@ KERNEL(quantize_ref)(
const int input_offset = INPUT0_GET_INDEX(b, of, y, x);
#endif
#if OUTPUT_DIMS == 6
#if OUTPUT_DIMS == 8
const int output_offset = OUTPUT_GET_INDEX(b, of, v, u, w, z, y, x);
#elif OUTPUT_DIMS == 7
const int output_offset = OUTPUT_GET_INDEX(b, of, u, w, z, y, x);
#elif OUTPUT_DIMS == 6
const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x);
#elif OUTPUT_DIMS == 5
const int output_offset = OUTPUT_GET_INDEX(b, of, z, y, x);
@@ -89,7 +112,11 @@ KERNEL(quantize_ref)(
const int output_offset = OUTPUT_GET_INDEX(b, of, y, x);
#endif
#if INPUT1_DIMS == 6
#if INPUT1_DIMS == 8
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
#elif INPUT1_DIMS == 7
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, u, w, z, y, x);
#elif INPUT1_DIMS == 6
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, w, z, y, x);
#elif INPUT1_DIMS == 5
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, z, y, x);
@@ -97,7 +124,11 @@ KERNEL(quantize_ref)(
const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, y, x);
#endif
#if INPUT2_DIMS == 6
#if INPUT2_DIMS == 8
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
#elif INPUT2_DIMS == 7
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, u, w, z, y, x);
#elif INPUT2_DIMS == 6
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, w, z, y, x);
#elif INPUT2_DIMS == 5
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, z, y, x);
@@ -105,7 +136,11 @@ KERNEL(quantize_ref)(
const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, y, x);
#endif
#if INPUT3_DIMS == 6
#if INPUT3_DIMS == 8
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
#elif INPUT3_DIMS == 7
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, u, w, z, y, x);
#elif INPUT3_DIMS == 6
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, w, z, y, x);
#elif INPUT3_DIMS == 5
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, z, y, x);
@@ -113,7 +148,11 @@ KERNEL(quantize_ref)(
const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, y, x);
#endif
#if INPUT4_DIMS == 6
#if INPUT4_DIMS == 8
const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
#elif INPUT4_DIMS == 7
const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, u, w, z, y, x);
#elif INPUT4_DIMS == 6
const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, w, z, y, x);
#elif INPUT4_DIMS == 5
const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, z, y, x);
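The recovered coordinates are then routed through rank-specific *_GET_INDEX / *_GET_INDEX_SAFE macros, as in the hunks above. For the simplest case, a dense bfvuwzyx tensor with no padding or blocking, such a macro reduces to a plain row-major offset; the sketch below restates that under this assumption in standalone C++ (the function name and sizes are illustrative, not the generated macros).

// Dense (unpadded, unblocked) bfvuwzyx offset, i.e. what a GET_INDEX-style
// macro boils down to under that simplifying assumption.
#include <array>
#include <cassert>
#include <cstddef>

// Sizes and coordinates ordered outermost to innermost: b, f, v, u, w, z, y, x.
size_t dense_offset(const std::array<size_t, 8>& size,
                    const std::array<size_t, 8>& coord) {
    size_t offset = 0;
    for (size_t i = 0; i < 8; ++i)
        offset = offset * size[i] + coord[i];  // row-major accumulation
    return offset;
}

int main() {
    const std::array<size_t, 8> size = {1, 2, 3, 4, 2, 3, 2, 2};
    // b=0, f=1, v=2, u=3, w=1, z=2, y=1, x=0 -> stride sum gives offset 574.
    assert(dense_offset(size, {0, 1, 2, 3, 1, 2, 1, 0}) == 574);
    return 0;
}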

View File

@@ -53,6 +53,27 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
const int w = wzyx_div_xy / OUTPUT_SIZE_Z;
const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x);
#elif OUTPUT_DIMS == 7
const int uwzyx = get_global_id(GWS_YX);
const int x = uwzyx % OUTPUT_SIZE_X;
const int y = uwzyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y;
const int z = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
const int w = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
const int u = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W;
const int output_offset = OUTPUT_GET_INDEX(b, of, u, w, z, y, x);
#elif OUTPUT_DIMS == 8
const int vuwzyx = get_global_id(GWS_YX);
const int x = vuwzyx % OUTPUT_SIZE_X;
const int y = vuwzyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y;
const int z = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
const int w = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
const int u = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U;
const int v = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U;
const int output_offset = OUTPUT_GET_INDEX(b, of, v, u, w, z, y, x);
#else
# error quantize_gpu_scale_shift_opt.cl: output tensors with more than 6 dimensions are unsupported
#endif
@@ -63,6 +84,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
const int input_offset = INPUT0_GET_INDEX(b, of, z, y, x);
#elif INPUT0_DIMS == 6
const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x);
#elif INPUT0_DIMS == 7
const int input_offset = INPUT0_GET_INDEX(b, of, u, w, z, y, x);
#elif INPUT0_DIMS == 8
const int input_offset = INPUT0_GET_INDEX(b, of, v, u, w, z, y, x);
#else
# error quantize_gpu_scale_shift_opt.cl: input tensors with more than 6 dimensions are unsupported
#endif
@@ -74,6 +99,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, z, y, x);
#elif INPUT1_DIMS == 6
const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, w, z, y, x);
#elif INPUT1_DIMS == 7
const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, u, w, z, y, x);
#elif INPUT1_DIMS == 8
const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
#else
# error quantize_gpu_scale_shift_opt.cl: unsupported INPUT1_DIMS size
#endif
@@ -85,6 +114,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, z, y, x);
#elif INPUT7_DIMS == 6
const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, w, z, y, x);
#elif INPUT7_DIMS == 7
const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, u, w, z, y, x);
#elif INPUT7_DIMS == 8
const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
#else
# error quantize_gpu_scale_shift_opt.cl: unsupported INPUT7_DIMS size
#endif

View File

@@ -24,13 +24,14 @@ ActivationKernelBase::DispatchData ActivationKernelBase::SetDefault(const activa
} else if (out_layout == DataLayout::b_fs_yx_fsv16 || out_layout == DataLayout::b_fs_yx_fsv32) {
dispatchData.gws = {Align(out.Feature().v, 16) * out.Batch().v, out.X().v, out.Y().v};
dispatchData.lws = {16, 1, 1};
} else if (out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 || out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32) {
} else if (out_layout == DataLayout::bs_fs_yx_bsv32_fsv16 || out_layout == DataLayout::bs_fs_yx_bsv32_fsv32) {
dispatchData.gws = {out.X().v * out.Y().v, Align(out.Feature().v, 16), Align(out.Batch().v, 16)};
dispatchData.lws = {1, 16, 16};
} else {
dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
dispatchData.gws = {out.X().v, out.Y().v * out.Z().v * out.W().v * out.U().v * out.V().v, out.Feature().v * out.Batch().v};
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z, Tensor::DataChannelName::W,
Tensor::DataChannelName::U, Tensor::DataChannelName::V},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
}
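The reference dispatch above now multiplies Y, Z, W, U and V into gws[1], matching the axes the activation kernel later unpacks from get_global_id(1); gws[0] stays X and gws[2] stays feature * batch. A toy computation of that GWS for a made-up 8-d output shape (the real extents come from the kernel_selector DataTensor):

// Illustrative GWS for an 8-d output on the reference path.
#include <array>
#include <cstddef>
#include <iostream>

int main() {
    // Extents ordered b, f, v, u, w, z, y, x (made-up values).
    const std::array<size_t, 8> out = {1, 2, 3, 4, 2, 3, 2, 2};
    const size_t b = out[0], f = out[1], v = out[2], u = out[3],
                 w = out[4], z = out[5], y = out[6], x = out[7];
    const std::array<size_t, 3> gws = {x, y * z * w * u * v, f * b};
    std::cout << gws[0] << " " << gws[1] << " " << gws[2] << "\n";  // 2 144 2
    return 0;
}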

View File

@@ -81,6 +81,9 @@ bool ActivationKernelOpt::Validate(const Params& p, const optional_params& o) co
return false;
}
if (params.outputs[0].GetDims().size() > 5)
return false;
if (params.outputs[0].GetLayout() != params.inputs[0].GetLayout())
return false;

View File

@@ -45,7 +45,7 @@ CommonDispatchData QuantizeKernelRef::SetDefault(const quantize_params& params)
} else {
dispatchData.gws[0] = output.Batch().v;
dispatchData.gws[1] = params.packed_binary_output ? CeilDiv(output.Feature().v, 32) : output.Feature().v;
dispatchData.gws[2] = Align(output.X().v * output.Y().v * output.Z().v * output.W().v, 16);
dispatchData.gws[2] = Align(output.X().v * output.Y().v * output.Z().v * output.W().v * output.U().v * output.V().v, 16);
dispatchData.lws[0] = 1;
dispatchData.lws[1] = 1;
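Here gws[2] becomes the product of all six spatial extents, rounded up to a multiple of 16. The Align and CeilDiv helpers are restated below under the usual round-up assumption; this is a sketch of their assumed semantics, not the kernel_selector implementations.

// Assumed semantics of the helpers used above, restated as plain functions.
#include <cassert>
#include <cstddef>

size_t CeilDiv(size_t a, size_t b) { return (a + b - 1) / b; }
size_t Align(size_t a, size_t b)   { return CeilDiv(a, b) * b; }

int main() {
    // An 8-d spatial volume of 2*3*2*4*3*2 = 288 is already a multiple of 16,
    // while a volume of 150 would be padded up to 160 work items.
    assert(Align(288, 16) == 288);
    assert(Align(150, 16) == 160);
    assert(CeilDiv(33, 32) == 2);  // packed_binary_output feature groups
    return 0;
}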

View File

@@ -356,22 +356,11 @@ std::shared_ptr<cldnn::program> Program::BuildProgram(const std::vector<std::sha
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Program::BuildProgram");
for (const auto& op : ops) {
if (op->is_dynamic()) {
if (requires_new_shape_infer(*op)) {
allow_new_shape_infer = true;
}
for (size_t i = 0; i < op->get_output_size(); i++) {
if (op->get_output_partial_shape(i).size() > 6)
allow_new_shape_infer = true;
}
for (size_t i = 0; i < op->get_input_size(); i++) {
if (op->get_input_partial_shape(i).size() > 6)
allow_new_shape_infer = true;
}
if (allow_new_shape_infer)
break;
}
}
m_config.set_property(ov::intel_gpu::partial_build_program(partialBuild));
m_config.set_property(ov::intel_gpu::optimize_data(true));
@@ -415,6 +404,7 @@ bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const st
// 2. We also check parameters of each operation, which means we have more
// reliable results of QueryNetwork call.
PrepareBuild(network.getInputsInfo(), network.getOutputsInfo());
allow_new_shape_infer = requires_new_shape_infer(*op);
CreateSingleLayerPrimitive(topology, op);
CleanupBuild();
DisableQueryMode();
@@ -540,6 +530,24 @@ void Program::add_primitive(const ngraph::Node& op, std::shared_ptr<cldnn::primi
m_topology->add_primitive(prim);
}
bool Program::requires_new_shape_infer(const ngraph::Node& op) const {
if (op.is_dynamic()) {
return true;
}
for (size_t i = 0; i < op.get_output_size(); i++) {
if (op.get_output_partial_shape(i).size() > 6)
return true;
}
for (size_t i = 0; i < op.get_input_size(); i++) {
if (op.get_input_partial_shape(i).size() > 6)
return true;
}
return false;
}
// TODO: Does it make sense to add such method to ngraph core?
bool IsNodeOnConstPath(const std::shared_ptr<ngraph::Node>& node) {
std::set<std::shared_ptr<ngraph::Node>> nodes_processed = {};
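With the requires_new_shape_infer helper added above, any operation that is dynamic or has an input or output of rank greater than 6 flips the plugin to the new shape-infer path. A hedged, self-contained example of that rank check against the public OpenVINO node API (Relu is just a convenient op here; the surrounding Program class is not reproduced):

// Sketch: an op with an 8-d output would now require new shape infer.
#include <memory>
#include <openvino/op/parameter.hpp>
#include <openvino/op/relu.hpp>

int main() {
    auto in = std::make_shared<ov::op::v0::Parameter>(
        ov::element::f32, ov::PartialShape{1, 2, 3, 4, 5, 3, 2, 3});
    auto relu = std::make_shared<ov::op::v0::Relu>(in);

    // Mirrors Program::requires_new_shape_infer: dynamic op or rank > 6
    // means the plugin must use the new shape-infer path.
    bool needs_new_shape_infer = relu->is_dynamic();
    for (size_t i = 0; i < relu->get_output_size(); i++)
        if (relu->get_output_partial_shape(i).size() > 6)
            needs_new_shape_infer = true;
    return needs_new_shape_infer ? 0 : 1;
}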

View File

@@ -17,7 +17,7 @@ using namespace ::tests;
namespace {
struct activation_test_params {
tensor input_size;
ov::PartialShape input_size;
data_types input_type;
format input_format;
data_types default_type;
@@ -46,11 +46,11 @@ public:
}
layout get_input_layout(activation_test_params& p) {
return layout{ p.input_type, p.input_format, p.input_size };
return layout{ p.input_size, p.input_type, p.input_format, };
}
layout get_per_channel_layout(activation_test_params& p) {
return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } };
return layout{ { 1, p.input_size[1], 1, 1 }, p.default_type, p.default_format };
}
format get_input_format(activation_test_params &p) { return p.input_format; }
@@ -70,6 +70,7 @@ public:
#define CASE_ACTIVATION_F32_5 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_6 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_7 { 1, 17, 31, 29 }, data_types::f32, format::fyxb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_8 { 1, 2, 3, 4, 5, 3, 2, 3 }, data_types::f32, format::bfvuwzyx, data_types::f32, format::bfvuwzyx
#define CASE_ACTIVATION_3D_F32_0 { 3, 16, 13, 13, 13 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_1 { 2, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_2 { 1, 16, 7, 7, 7 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx
@@ -221,6 +222,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_eltwise_activation_quantize_u8,
activation_test_params{ CASE_ACTIVATION_3D_F32_0, 3, 5, "activation_ref" },
activation_test_params{ CASE_ACTIVATION_3D_F32_1, 3, 5, "activation_ref" },
activation_test_params{ CASE_ACTIVATION_3D_F32_2, 3, 5, "activation_ref" },
activation_test_params{ CASE_ACTIVATION_F32_8, 3, 5, "activation_ref" },
}));
class activation_eltwise_activation_quantize_u8_onendnn : public ActivationFusingTest {};

View File

@@ -17,7 +17,7 @@ using namespace ::tests;
namespace {
struct eltwise_test_params {
tensor input_size;
ov::PartialShape input_size;
data_types input_type;
data_types input_type2;
format input_format;
@@ -49,15 +49,15 @@ public:
}
layout get_input_layout(eltwise_test_params& p) {
return layout{ p.input_type, p.input_format, p.input_size };
return layout{ p.input_size, p.input_type, p.input_format };
}
layout get_input_layout2(eltwise_test_params& p) {
return layout{ p.input_type2, p.input_format, p.input_size };
return layout{ p.input_size, p.input_type2, p.input_format };
}
layout get_per_channel_layout(eltwise_test_params& p) {
return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } };
return layout{ { 1, p.input_size[1], 1, 1 }, p.default_type, p.default_format };
}
};
} // namespace
@@ -105,6 +105,8 @@ public:
#define CASE_ELTWISE_I8_4 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum
#define CASE_ELTWISE_U8_4 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum
#define CASE_ELTWISE_FP16_7 { 3, 32, 2, 3, 3, 2, 1, 2 }, data_types::f16, data_types::f16, format::bfvuwzyx, data_types::f16, format::bfvuwzyx, eltwise_mode::sum
class eltwise_quantize : public EltwiseFusingTest {};
TEST_P(eltwise_quantize, u8) {
auto p = GetParam();
@@ -412,6 +414,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fused_prims, ::testing::Value
eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_7, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 },

View File

@@ -63,6 +63,16 @@ const std::map<ActivationTypes, std::vector<std::vector<float>>> activationTypes
{SoftSign, {}},
};
const std::map<ActivationTypes, std::vector<std::vector<float>>> big_rank_activation_types = {
{Relu, {}},
{Exp, {}},
{Log, {}},
{Abs, {}},
{Clamp, {{-2.0f, 2.0f}}},
{Ceiling, {}},
{Swish, {{0.5f}}},
};
const std::map<ActivationTypes, std::vector<std::vector<float>>> activationParamTypes = {
{PReLu, {{-0.01f}}},
{LeakyRelu, {{0.01f}}}
@@ -73,12 +83,19 @@ std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> basic = {
{{1, 128}, {{}}},
};
std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> big_ranks = {
{{1, 2, 3, 4, 5, 3}, {{}}},
{{1, 2, 3, 4, 1, 3, 2}, {{}}},
{{1, 2, 3, 4, 3, 2, 1, 2}, {{}}},
};
std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> preluBasic = {
{{1, 10, 20}, {{10}, {20}, {10, 20}}},
{{1, 128}, {{1}, {128}}},
};
const auto basicCases = ::testing::Combine(
const auto basicCases = []() {
return ::testing::Combine(
::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)),
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
@@ -86,10 +103,11 @@ const auto basicCases = ::testing::Combine(
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(CommonTestUtils::combineParams(basic)),
::testing::Values(CommonTestUtils::DEVICE_GPU)
);
::testing::Values(CommonTestUtils::DEVICE_GPU));
};
const auto basicPreluCases = ::testing::Combine(
const auto basicPreluCases = []() {
return ::testing::Combine(
::testing::ValuesIn(CommonTestUtils::combineParams(activationParamTypes)),
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
@@ -97,13 +115,27 @@ const auto basicPreluCases = ::testing::Combine(
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(CommonTestUtils::combineParams(preluBasic)),
::testing::Values(CommonTestUtils::DEVICE_GPU)
);
::testing::Values(CommonTestUtils::DEVICE_GPU));
};
const auto big_rank_cases = []() {
return ::testing::Combine(
::testing::ValuesIn(CommonTestUtils::combineParams(big_rank_activation_types)),
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(CommonTestUtils::combineParams(big_ranks)),
::testing::Values(CommonTestUtils::DEVICE_GPU));
};
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationLayerTest, basicCases, ActivationLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic_Prelu, ActivationLayerTest, basicPreluCases, ActivationLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationLayerTest, basicCases(), ActivationLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationParamLayerTest, basicPreluCases, ActivationLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(Activation_BigRanks, ActivationLayerTest, big_rank_cases(), ActivationLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic_Prelu, ActivationLayerTest, basicPreluCases(), ActivationLayerTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationParamLayerTest, basicPreluCases(), ActivationLayerTest::getTestCaseName);
} // namespace

View File

@@ -16,7 +16,7 @@ const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP16
};
const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}};
const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}, {1, 2, 3, 4, 2, 3, 2, 2}};
const std::vector<std::vector<size_t>> constShapes = {{1}};
const std::vector<size_t> levels = {16, 255, 256};