diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/program.hpp index 6ab54260f01..f2b75d3e539 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/program.hpp @@ -160,6 +160,7 @@ public: const variables_state_info_map& GetVariablesStatesInfo() const { return m_variablesStateInfo; } bool use_new_shape_infer() const { return allow_new_shape_infer; } + bool requires_new_shape_infer(const ngraph::Node& op) const; private: static factories_map_t factories_map; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp index 14d5958f307..acb547d8c9d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp @@ -72,7 +72,7 @@ struct activation_impl : typed_primitive_impl_ocl { namespace detail { attach_activation_impl::attach_activation_impl() { - auto dyn_types = { + auto types = { data_types::f32, data_types::f16, data_types::i8, @@ -83,76 +83,40 @@ attach_activation_impl::attach_activation_impl() { auto dyn_formats = { format::bfyx, format::bfzyx, - format::bfwzyx + format::bfwzyx, + format::bfuwzyx, + format::bfvuwzyx, }; + auto static_formats = { + format::yxfb, + format::byxf, + format::b_fs_yx_fsv16, + format::b_fs_zyx_fsv16, + format::bs_fs_zyx_bsv16_fsv16, + format::bs_fs_yx_bsv16_fsv16, + format::bs_fs_yx_bsv32_fsv16, + format::bs_fs_yx_bsv32_fsv32, + format::bfyx, + format::bfzyx, + format::bfwzyx, + format::bfuwzyx, + format::bfvuwzyx, + }; + + auto keys = implementation_map::combine(types, static_formats); + keys.emplace(data_types::f16, format::fs_b_yx_fsv32); + implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, typed_primitive_impl_ocl::create, - dyn_types, + types, dyn_formats); - implementation_map::add(impl_types::ocl, shape_types::static_shape, typed_primitive_impl_ocl::create, { - std::make_tuple(data_types::f32, format::yxfb), - std::make_tuple(data_types::f16, format::yxfb), - std::make_tuple(data_types::f32, format::bfyx), - std::make_tuple(data_types::f16, format::bfyx), - std::make_tuple(data_types::f32, format::byxf), - std::make_tuple(data_types::f16, format::byxf), - std::make_tuple(data_types::i8, format::yxfb), - std::make_tuple(data_types::i8, format::bfyx), - std::make_tuple(data_types::i8, format::byxf), - std::make_tuple(data_types::u8, format::yxfb), - std::make_tuple(data_types::u8, format::bfyx), - std::make_tuple(data_types::u8, format::byxf), - std::make_tuple(data_types::i32, format::bfyx), - std::make_tuple(data_types::i32, format::byxf), - std::make_tuple(data_types::i32, format::yxfb), - // block f16 format - std::make_tuple(data_types::f16, format::b_fs_yx_fsv16), - std::make_tuple(data_types::f32, format::b_fs_yx_fsv16), - std::make_tuple(data_types::i8, format::b_fs_yx_fsv16), - std::make_tuple(data_types::u8, format::b_fs_yx_fsv16), - // 3D - std::make_tuple(data_types::f32, format::bfzyx), - std::make_tuple(data_types::f16, format::bfzyx), - std::make_tuple(data_types::i8, format::bfzyx), - std::make_tuple(data_types::i32, format::bfzyx), - - std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16), - std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16), - std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16), - std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16), - - std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16), - std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16), - std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16), - std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16), - - std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16), - std::make_tuple(data_types::f16, format::bs_fs_yx_bsv16_fsv16), - std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16), - std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16), - - std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16), - std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16), - std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv16), - std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv16), - - std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32), - std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32), - std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32), - std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32), - - // bfwzyx - std::make_tuple(data_types::f32, format::bfwzyx), - std::make_tuple(data_types::f16, format::bfwzyx), - std::make_tuple(data_types::i32, format::bfwzyx), - std::make_tuple(data_types::i8, format::bfwzyx), - std::make_tuple(data_types::u8, format::bfwzyx), - // fs_b_yx_fsv32 - std::make_tuple(data_types::f16, format::fs_b_yx_fsv32), - }); + implementation_map::add(impl_types::ocl, + shape_types::static_shape, + typed_primitive_impl_ocl::create, + keys); } } // namespace detail diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp index d8bf96d8efb..64a092940a0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp @@ -167,8 +167,6 @@ public: namespace detail { attach_pooling_impl::attach_pooling_impl() { - std::set::key_type> keys; - auto types = { data_types::f16, data_types::f32, data_types::i8, data_types::u8 }; auto formats = { format::bfyx, format::byxf, @@ -189,12 +187,7 @@ attach_pooling_impl::attach_pooling_impl() { format::bs_fs_zyx_bsv32_fsv16, format::bs_fs_zyx_bsv32_fsv32 }; - for (const auto type : types) { - for (const auto format : formats) { - keys.emplace(type, format); - } - } - + auto keys = implementation_map::combine(types, formats); keys.emplace(data_types::f16, format::fs_b_yx_fsv32); keys.emplace(data_types::f32, format::fs_b_yx_fsv32); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp index 671622ff627..9ac7df3c681 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp @@ -98,8 +98,6 @@ public: namespace detail { attach_quantize_impl::attach_quantize_impl() { - std::set::key_type> keys; - auto types = { data_types::f16, data_types::f32, @@ -127,21 +125,20 @@ attach_quantize_impl::attach_quantize_impl() { format::bs_fs_zyx_bsv32_fsv16, format::bs_fs_zyx_bsv32_fsv32, - format::bfwzyx + format::bfwzyx, + format::bfuwzyx, + format::bfvuwzyx, }; auto dyn_formats = { format::bfyx, format::bfzyx, - format::bfwzyx + format::bfwzyx, + format::bfuwzyx, + format::bfvuwzyx, }; - for (const auto type : types) { - for (const auto format : formats) { - keys.emplace(type, format); - } - } - + auto keys = implementation_map::combine(types, formats); keys.emplace(data_types::f16, format::yxfb); keys.emplace(data_types::f32, format::yxfb); diff --git a/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp b/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp index 1ca0415ad96..6a79e4391ab 100644 --- a/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp +++ b/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp @@ -120,7 +120,6 @@ public: map_type::instance().insert({{impl_type, shape_type}, {keys, factory}}); } -private: static std::set combine(const std::vector& types, const std::vector& formats) { std::set keys; for (const auto& type : types) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl index 6e646e5a49a..967778d8cc1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl @@ -23,16 +23,40 @@ KERNEL(activation)( #endif ) { -#if OUTPUT_DIMS == 5 +#if OUTPUT_DIMS == 8 + #define ORDER batch,feature,v,u,w,z,y,x +#elif OUTPUT_DIMS == 7 + #define ORDER batch,feature,u,w,z,y,x +#elif OUTPUT_DIMS == 6 + #define ORDER batch,feature,w,z,y,x +#elif OUTPUT_DIMS == 5 #define ORDER batch,feature,z,y,x #elif OUTPUT_DIMS == 4 #define ORDER batch,feature,y,x #endif -#if OUTPUT_DIMS == 5 - const unsigned x = get_global_id(0); +#if OUTPUT_DIMS >= 5 + +#if OUTPUT_DIMS == 8 + const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; + const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z; + const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W; + const uint u = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U; + const uint v = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U; +#elif OUTPUT_DIMS == 7 + const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; + const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z; + const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W; + const uint u = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W; +#elif OUTPUT_DIMS == 6 + const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; + const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z; + const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z; +#elif OUTPUT_DIMS == 5 const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y; +#endif + const unsigned x = get_global_id(0); #if OUTPUT_BATCH_NUM_CONST == 1 const unsigned feature = (uint)get_global_id(2); const unsigned batch = 0; @@ -40,7 +64,7 @@ KERNEL(activation)( const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM; const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM; #endif -#else +#elif OUTPUT_DIMS <= 4 #if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16 || defined OUTPUT_LAYOUT_B_FS_YX_FSV32 const unsigned x = (uint)get_global_id(1); const unsigned y = (uint)get_global_id(2); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_ref.cl index 0a9b3d2cb01..8941cf83fb3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_ref.cl @@ -35,6 +35,21 @@ KERNEL(quantize_ref)( const int y = (wzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y; const int z = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z; const int w = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z; +#elif OUTPUT_DIMS == 7 + const int uwzyx = get_global_id(2); + const int x = uwzyx % OUTPUT_SIZE_X; + const int y = (uwzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y; + const int z = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z; + const int w = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z % OUTPUT_SIZE_W; + const int u = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W; +#elif OUTPUT_DIMS == 8 + const int vuwzyx = get_global_id(2); + const int x = vuwzyx % OUTPUT_SIZE_X; + const int y = (vuwzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y; + const int z = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z; + const int w = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z % OUTPUT_SIZE_W; + const int u = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U; + const int v = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U; #endif #if PACKED_BINARY_OUTPUT @@ -73,7 +88,11 @@ KERNEL(quantize_ref)( #else -#if INPUT0_DIMS == 6 +#if INPUT0_DIMS == 8 + const int input_offset = INPUT0_GET_INDEX(b, of, v, u, w, z, y, x); +#elif INPUT0_DIMS == 7 + const int input_offset = INPUT0_GET_INDEX(b, of, u, w, z, y, x); +#elif INPUT0_DIMS == 6 const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x); #elif INPUT0_DIMS == 5 const int input_offset = INPUT0_GET_INDEX(b, of, z, y, x); @@ -81,7 +100,11 @@ KERNEL(quantize_ref)( const int input_offset = INPUT0_GET_INDEX(b, of, y, x); #endif -#if OUTPUT_DIMS == 6 +#if OUTPUT_DIMS == 8 + const int output_offset = OUTPUT_GET_INDEX(b, of, v, u, w, z, y, x); +#elif OUTPUT_DIMS == 7 + const int output_offset = OUTPUT_GET_INDEX(b, of, u, w, z, y, x); +#elif OUTPUT_DIMS == 6 const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x); #elif OUTPUT_DIMS == 5 const int output_offset = OUTPUT_GET_INDEX(b, of, z, y, x); @@ -89,7 +112,11 @@ KERNEL(quantize_ref)( const int output_offset = OUTPUT_GET_INDEX(b, of, y, x); #endif -#if INPUT1_DIMS == 6 +#if INPUT1_DIMS == 8 + const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, v, u, w, z, y, x); +#elif INPUT1_DIMS == 7 + const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, u, w, z, y, x); +#elif INPUT1_DIMS == 6 const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, w, z, y, x); #elif INPUT1_DIMS == 5 const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, z, y, x); @@ -97,7 +124,11 @@ KERNEL(quantize_ref)( const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, y, x); #endif -#if INPUT2_DIMS == 6 +#if INPUT2_DIMS == 8 + const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, v, u, w, z, y, x); +#elif INPUT2_DIMS == 7 + const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, u, w, z, y, x); +#elif INPUT2_DIMS == 6 const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, w, z, y, x); #elif INPUT2_DIMS == 5 const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, z, y, x); @@ -105,7 +136,11 @@ KERNEL(quantize_ref)( const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, y, x); #endif -#if INPUT3_DIMS == 6 +#if INPUT3_DIMS == 8 + const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, v, u, w, z, y, x); +#elif INPUT3_DIMS == 7 + const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, u, w, z, y, x); +#elif INPUT3_DIMS == 6 const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, w, z, y, x); #elif INPUT3_DIMS == 5 const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, z, y, x); @@ -113,7 +148,11 @@ KERNEL(quantize_ref)( const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, y, x); #endif -#if INPUT4_DIMS == 6 +#if INPUT4_DIMS == 8 + const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, v, u, w, z, y, x); +#elif INPUT4_DIMS == 7 + const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, u, w, z, y, x); +#elif INPUT4_DIMS == 6 const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, w, z, y, x); #elif INPUT4_DIMS == 5 const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, z, y, x); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_opt.cl index e8cb7e584f4..ce460c91eaa 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_opt.cl @@ -53,6 +53,27 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input, const int w = wzyx_div_xy / OUTPUT_SIZE_Z; const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x); +#elif OUTPUT_DIMS == 7 + const int uwzyx = get_global_id(GWS_YX); + + const int x = uwzyx % OUTPUT_SIZE_X; + const int y = uwzyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y; + const int z = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z; + const int w = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W; + const int u = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W; + + const int output_offset = OUTPUT_GET_INDEX(b, of, u, w, z, y, x); +#elif OUTPUT_DIMS == 8 + const int vuwzyx = get_global_id(GWS_YX); + + const int x = vuwzyx % OUTPUT_SIZE_X; + const int y = vuwzyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y; + const int z = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z; + const int w = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W; + const int u = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U; + const int v = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U; + + const int output_offset = OUTPUT_GET_INDEX(b, of, v, u, w, z, y, x); #else # error quantize_gpu_scale_shift_opt.cl: output tensors with more than 6 dimensions are unsupported #endif @@ -63,6 +84,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input, const int input_offset = INPUT0_GET_INDEX(b, of, z, y, x); #elif INPUT0_DIMS == 6 const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x); +#elif INPUT0_DIMS == 7 + const int input_offset = INPUT0_GET_INDEX(b, of, u, w, z, y, x); +#elif INPUT0_DIMS == 8 + const int input_offset = INPUT0_GET_INDEX(b, of, v, u, w, z, y, x); #else # error quantize_gpu_scale_shift_opt.cl: input tensors with more than 6 dimensions are unsupported #endif @@ -74,6 +99,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input, const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, z, y, x); #elif INPUT1_DIMS == 6 const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, w, z, y, x); +#elif INPUT1_DIMS == 7 + const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, u, w, z, y, x); +#elif INPUT1_DIMS == 8 + const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, v, u, w, z, y, x); #else # error quantize_gpu_scale_shift_opt.cl: unsupported INPUT1_DIMS size #endif @@ -85,6 +114,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input, const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, z, y, x); #elif INPUT7_DIMS == 6 const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, w, z, y, x); +#elif INPUT7_DIMS == 7 + const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, u, w, z, y, x); +#elif INPUT7_DIMS == 8 + const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, v, u, w, z, y, x); #else # error quantize_gpu_scale_shift_opt.cl: unsupported INPUT7_DIMS size #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp index db267fe3af9..f27b2d189da 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp @@ -24,13 +24,14 @@ ActivationKernelBase::DispatchData ActivationKernelBase::SetDefault(const activa } else if (out_layout == DataLayout::b_fs_yx_fsv16 || out_layout == DataLayout::b_fs_yx_fsv32) { dispatchData.gws = {Align(out.Feature().v, 16) * out.Batch().v, out.X().v, out.Y().v}; dispatchData.lws = {16, 1, 1}; - } else if (out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 || out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32) { + } else if (out_layout == DataLayout::bs_fs_yx_bsv32_fsv16 || out_layout == DataLayout::bs_fs_yx_bsv32_fsv32) { dispatchData.gws = {out.X().v * out.Y().v, Align(out.Feature().v, 16), Align(out.Batch().v, 16)}; dispatchData.lws = {1, 16, 16}; } else { - dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v}; + dispatchData.gws = {out.X().v, out.Y().v * out.Z().v * out.W().v * out.U().v * out.V().v, out.Feature().v * out.Batch().v}; std::vector> dims_by_gws = {{Tensor::DataChannelName::X}, - {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z}, + {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z, Tensor::DataChannelName::W, + Tensor::DataChannelName::U, Tensor::DataChannelName::V}, {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_opt.cpp index d0b8f2861b6..800d645be9f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_opt.cpp @@ -81,6 +81,9 @@ bool ActivationKernelOpt::Validate(const Params& p, const optional_params& o) co return false; } + if (params.outputs[0].GetDims().size() > 5) + return false; + if (params.outputs[0].GetLayout() != params.inputs[0].GetLayout()) return false; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_ref.cpp index b8910974697..131cbb5f9e7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_ref.cpp @@ -45,7 +45,7 @@ CommonDispatchData QuantizeKernelRef::SetDefault(const quantize_params& params) } else { dispatchData.gws[0] = output.Batch().v; dispatchData.gws[1] = params.packed_binary_output ? CeilDiv(output.Feature().v, 32) : output.Feature().v; - dispatchData.gws[2] = Align(output.X().v * output.Y().v * output.Z().v * output.W().v, 16); + dispatchData.gws[2] = Align(output.X().v * output.Y().v * output.Z().v * output.W().v * output.U().v * output.V().v, 16); dispatchData.lws[0] = 1; dispatchData.lws[1] = 1; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_opt.cpp index 2c225b34aff..4c505c8cfda 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_opt.cpp @@ -38,7 +38,7 @@ CommonDispatchData QuantizeKernelScaleShift::SetDefault(const quantize_params& p if (output.GetLayout() == DataLayout::b_fs_yx_fsv16 || output.GetLayout() == DataLayout::b_fs_yx_fsv32 || output.GetLayout() == DataLayout::b_fs_zyx_fsv32) { - dispatchData.gws[0] = output.Z().v *output.Y().v * output.X().v; + dispatchData.gws[0] = output.Z().v * output.Y().v * output.X().v; dispatchData.gws[1] = Align(output.Feature().v, sub_group_size); dispatchData.gws[2] = output.Batch().v; diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 201eb3f311d..2639cc3bd35 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -132,7 +132,7 @@ InferenceEngine::CNNNetwork Plugin::clone_and_transform_model(const InferenceEng GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) { auto path_base = debug_config->dump_graphs + "/" + network.getName() + "_" + "transformed_func"; ov::pass::Serialize(path_base + ".xml", path_base + ".bin").run_on_model(nGraphFunc); - } + } } return clonedNetwork; } diff --git a/src/plugins/intel_gpu/src/plugin/program.cpp b/src/plugins/intel_gpu/src/plugin/program.cpp index ac85d4291d8..1d880f18065 100644 --- a/src/plugins/intel_gpu/src/plugin/program.cpp +++ b/src/plugins/intel_gpu/src/plugin/program.cpp @@ -356,21 +356,10 @@ std::shared_ptr Program::BuildProgram(const std::vectoris_dynamic()) { + if (requires_new_shape_infer(*op)) { allow_new_shape_infer = true; - } - - for (size_t i = 0; i < op->get_output_size(); i++) { - if (op->get_output_partial_shape(i).size() > 6) - allow_new_shape_infer = true; - } - for (size_t i = 0; i < op->get_input_size(); i++) { - if (op->get_input_partial_shape(i).size() > 6) - allow_new_shape_infer = true; - } - - if (allow_new_shape_infer) break; + } } m_config.set_property(ov::intel_gpu::partial_build_program(partialBuild)); @@ -415,6 +404,7 @@ bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const st // 2. We also check parameters of each operation, which means we have more // reliable results of QueryNetwork call. PrepareBuild(network.getInputsInfo(), network.getOutputsInfo()); + allow_new_shape_infer = requires_new_shape_infer(*op); CreateSingleLayerPrimitive(topology, op); CleanupBuild(); DisableQueryMode(); @@ -540,6 +530,24 @@ void Program::add_primitive(const ngraph::Node& op, std::shared_ptradd_primitive(prim); } +bool Program::requires_new_shape_infer(const ngraph::Node& op) const { + if (op.is_dynamic()) { + return true; + } + + for (size_t i = 0; i < op.get_output_size(); i++) { + if (op.get_output_partial_shape(i).size() > 6) + return true; + } + + for (size_t i = 0; i < op.get_input_size(); i++) { + if (op.get_input_partial_shape(i).size() > 6) + return true; + } + + return false; +} + // TODO: Does it make sense to add such method to ngraph core? bool IsNodeOnConstPath(const std::shared_ptr& node) { std::set> nodes_processed = {}; diff --git a/src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp index 8bcaabe3254..9f104f7fc44 100644 --- a/src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp @@ -17,7 +17,7 @@ using namespace ::tests; namespace { struct activation_test_params { - tensor input_size; + ov::PartialShape input_size; data_types input_type; format input_format; data_types default_type; @@ -46,11 +46,11 @@ public: } layout get_input_layout(activation_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; + return layout{ p.input_size, p.input_type, p.input_format, }; } layout get_per_channel_layout(activation_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } }; + return layout{ { 1, p.input_size[1], 1, 1 }, p.default_type, p.default_format }; } format get_input_format(activation_test_params &p) { return p.input_format; } @@ -70,6 +70,7 @@ public: #define CASE_ACTIVATION_F32_5 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx #define CASE_ACTIVATION_F32_6 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx #define CASE_ACTIVATION_F32_7 { 1, 17, 31, 29 }, data_types::f32, format::fyxb, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F32_8 { 1, 2, 3, 4, 5, 3, 2, 3 }, data_types::f32, format::bfvuwzyx, data_types::f32, format::bfvuwzyx #define CASE_ACTIVATION_3D_F32_0 { 3, 16, 13, 13, 13 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx #define CASE_ACTIVATION_3D_F32_1 { 2, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx #define CASE_ACTIVATION_3D_F32_2 { 1, 16, 7, 7, 7 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx @@ -221,6 +222,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_eltwise_activation_quantize_u8, activation_test_params{ CASE_ACTIVATION_3D_F32_0, 3, 5, "activation_ref" }, activation_test_params{ CASE_ACTIVATION_3D_F32_1, 3, 5, "activation_ref" }, activation_test_params{ CASE_ACTIVATION_3D_F32_2, 3, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_8, 3, 5, "activation_ref" }, })); class activation_eltwise_activation_quantize_u8_onendnn : public ActivationFusingTest {}; diff --git a/src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp index 46c06e9034c..50ddb9870e5 100644 --- a/src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp @@ -17,7 +17,7 @@ using namespace ::tests; namespace { struct eltwise_test_params { - tensor input_size; + ov::PartialShape input_size; data_types input_type; data_types input_type2; format input_format; @@ -49,15 +49,15 @@ public: } layout get_input_layout(eltwise_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; + return layout{ p.input_size, p.input_type, p.input_format }; } layout get_input_layout2(eltwise_test_params& p) { - return layout{ p.input_type2, p.input_format, p.input_size }; + return layout{ p.input_size, p.input_type2, p.input_format }; } layout get_per_channel_layout(eltwise_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } }; + return layout{ { 1, p.input_size[1], 1, 1 }, p.default_type, p.default_format }; } }; } // namespace @@ -105,6 +105,8 @@ public: #define CASE_ELTWISE_I8_4 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum #define CASE_ELTWISE_U8_4 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum +#define CASE_ELTWISE_FP16_7 { 3, 32, 2, 3, 3, 2, 1, 2 }, data_types::f16, data_types::f16, format::bfvuwzyx, data_types::f16, format::bfvuwzyx, eltwise_mode::sum + class eltwise_quantize : public EltwiseFusingTest {}; TEST_P(eltwise_quantize, u8) { auto p = GetParam(); @@ -412,6 +414,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fused_prims, ::testing::Value eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 5 }, eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 }, eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_7, 3, 5 }, eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 }, eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 }, eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 }, diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp index 128f046ac53..ee55f61ab2c 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp @@ -63,6 +63,16 @@ const std::map>> activationTypes {SoftSign, {}}, }; +const std::map>> big_rank_activation_types = { + {Relu, {}}, + {Exp, {}}, + {Log, {}}, + {Abs, {}}, + {Clamp, {{-2.0f, 2.0f}}}, + {Ceiling, {}}, + {Swish, {{0.5f}}}, +}; + const std::map>> activationParamTypes = { {PReLu, {{-0.01f}}}, {LeakyRelu, {{0.01f}}} @@ -73,12 +83,19 @@ std::map, std::vector>> basic = { {{1, 128}, {{}}}, }; +std::map, std::vector>> big_ranks = { + {{1, 2, 3, 4, 5, 3}, {{}}}, + {{1, 2, 3, 4, 1, 3, 2}, {{}}}, + {{1, 2, 3, 4, 3, 2, 1, 2}, {{}}}, +}; + std::map, std::vector>> preluBasic = { {{1, 10, 20}, {{10}, {20}, {10, 20}}}, {{1, 128}, {{1}, {128}}}, }; -const auto basicCases = ::testing::Combine( +const auto basicCases = []() { + return ::testing::Combine( ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)), ::testing::ValuesIn(netPrecisions), ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), @@ -86,10 +103,11 @@ const auto basicCases = ::testing::Combine( ::testing::Values(InferenceEngine::Layout::ANY), ::testing::Values(InferenceEngine::Layout::ANY), ::testing::ValuesIn(CommonTestUtils::combineParams(basic)), - ::testing::Values(CommonTestUtils::DEVICE_GPU) -); + ::testing::Values(CommonTestUtils::DEVICE_GPU)); +}; -const auto basicPreluCases = ::testing::Combine( +const auto basicPreluCases = []() { + return ::testing::Combine( ::testing::ValuesIn(CommonTestUtils::combineParams(activationParamTypes)), ::testing::ValuesIn(netPrecisions), ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), @@ -97,13 +115,27 @@ const auto basicPreluCases = ::testing::Combine( ::testing::Values(InferenceEngine::Layout::ANY), ::testing::Values(InferenceEngine::Layout::ANY), ::testing::ValuesIn(CommonTestUtils::combineParams(preluBasic)), - ::testing::Values(CommonTestUtils::DEVICE_GPU) -); + ::testing::Values(CommonTestUtils::DEVICE_GPU)); +}; +const auto big_rank_cases = []() { + return ::testing::Combine( + ::testing::ValuesIn(CommonTestUtils::combineParams(big_rank_activation_types)), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::ValuesIn(CommonTestUtils::combineParams(big_ranks)), + ::testing::Values(CommonTestUtils::DEVICE_GPU)); +}; -INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationLayerTest, basicCases, ActivationLayerTest::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic_Prelu, ActivationLayerTest, basicPreluCases, ActivationLayerTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationLayerTest, basicCases(), ActivationLayerTest::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationParamLayerTest, basicPreluCases, ActivationLayerTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(Activation_BigRanks, ActivationLayerTest, big_rank_cases(), ActivationLayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic_Prelu, ActivationLayerTest, basicPreluCases(), ActivationLayerTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationParamLayerTest, basicPreluCases(), ActivationLayerTest::getTestCaseName); } // namespace diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/fake_quantize.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/fake_quantize.cpp index 884c66b857a..a1166a1fd52 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/fake_quantize.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/fake_quantize.cpp @@ -16,7 +16,7 @@ const std::vector netPrecisions = { InferenceEngine::Precision::FP16 }; -const std::vector> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}}; +const std::vector> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}, {1, 2, 3, 4, 2, 3, 2, 2}}; const std::vector> constShapes = {{1}}; const std::vector levels = {16, 255, 256};