[GPU] Support 8d tensors in activation and quantize primitives (#16947)

2023-04-25 09:02:54 +04:00 · 2023-04-25 09:02:54 +04:00 · 3bb0fb61f6
commit 3bb0fb61f6
parent 6663367183
18 changed files with 228 additions and 129 deletions
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/program.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/program.hpp
@ -160,6 +160,7 @@ public:
    const variables_state_info_map& GetVariablesStatesInfo() const { return m_variablesStateInfo; }

    bool use_new_shape_infer() const { return allow_new_shape_infer; }
+    bool requires_new_shape_infer(const ngraph::Node& op) const;

 private:
    static factories_map_t factories_map;
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp
@ -72,7 +72,7 @@ struct activation_impl : typed_primitive_impl_ocl<activation> {
 namespace detail {

 attach_activation_impl::attach_activation_impl() {
-     auto dyn_types = {
+     auto types = {
        data_types::f32,
        data_types::f16,
        data_types::i8,
@ -83,76 +83,40 @@ attach_activation_impl::attach_activation_impl() {
    auto dyn_formats = {
        format::bfyx,
        format::bfzyx,
-        format::bfwzyx
+        format::bfwzyx,
+        format::bfuwzyx,
+        format::bfvuwzyx,
    };

+    auto static_formats = {
+        format::yxfb,
+        format::byxf,
+        format::b_fs_yx_fsv16,
+        format::b_fs_zyx_fsv16,
+        format::bs_fs_zyx_bsv16_fsv16,
+        format::bs_fs_yx_bsv16_fsv16,
+        format::bs_fs_yx_bsv32_fsv16,
+        format::bs_fs_yx_bsv32_fsv32,
+        format::bfyx,
+        format::bfzyx,
+        format::bfwzyx,
+        format::bfuwzyx,
+        format::bfvuwzyx,
+    };
+
+    auto keys = implementation_map<activation>::combine(types, static_formats);
+    keys.emplace(data_types::f16, format::fs_b_yx_fsv32);
+
    implementation_map<activation>::add(impl_types::ocl,
                                        shape_types::dynamic_shape,
                                        typed_primitive_impl_ocl<activation>::create<activation_impl>,
-                                        dyn_types,
+                                        types,
                                        dyn_formats);

-    implementation_map<activation>::add(impl_types::ocl, shape_types::static_shape, typed_primitive_impl_ocl<activation>::create<activation_impl>, {
-        std::make_tuple(data_types::f32, format::yxfb),
-        std::make_tuple(data_types::f16, format::yxfb),
-        std::make_tuple(data_types::f32, format::bfyx),
-        std::make_tuple(data_types::f16, format::bfyx),
-        std::make_tuple(data_types::f32, format::byxf),
-        std::make_tuple(data_types::f16, format::byxf),
-        std::make_tuple(data_types::i8, format::yxfb),
-        std::make_tuple(data_types::i8, format::bfyx),
-        std::make_tuple(data_types::i8, format::byxf),
-        std::make_tuple(data_types::u8, format::yxfb),
-        std::make_tuple(data_types::u8, format::bfyx),
-        std::make_tuple(data_types::u8, format::byxf),
-        std::make_tuple(data_types::i32, format::bfyx),
-        std::make_tuple(data_types::i32, format::byxf),
-        std::make_tuple(data_types::i32, format::yxfb),
-        // block f16 format
-        std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
-        std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
-        // 3D
-        std::make_tuple(data_types::f32, format::bfzyx),
-        std::make_tuple(data_types::f16, format::bfzyx),
-        std::make_tuple(data_types::i8, format::bfzyx),
-        std::make_tuple(data_types::i32, format::bfzyx),
-
-        std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
-        std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
-        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
-        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
-
-        std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
-        std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
-        std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
-        std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
-
-        std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
-        std::make_tuple(data_types::f16, format::bs_fs_yx_bsv16_fsv16),
-        std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
-        std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
-
-        std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16),
-        std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16),
-        std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv16),
-        std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv16),
-
-        std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
-        std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
-        std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
-        std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
-
-        // bfwzyx
-        std::make_tuple(data_types::f32, format::bfwzyx),
-        std::make_tuple(data_types::f16, format::bfwzyx),
-        std::make_tuple(data_types::i32, format::bfwzyx),
-        std::make_tuple(data_types::i8, format::bfwzyx),
-        std::make_tuple(data_types::u8, format::bfwzyx),
-        // fs_b_yx_fsv32
-        std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
-    });
+    implementation_map<activation>::add(impl_types::ocl,
+                                        shape_types::static_shape,
+                                        typed_primitive_impl_ocl<activation>::create<activation_impl>,
+                                        keys);
 }

 }  // namespace detail
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp
@ -167,8 +167,6 @@ public:
 namespace detail {

 attach_pooling_impl::attach_pooling_impl() {
-    std::set<implementation_map<pooling>::key_type> keys;
-
    auto types = { data_types::f16, data_types::f32, data_types::i8, data_types::u8 };
    auto formats = { format::bfyx,
                     format::byxf,
@ -189,12 +187,7 @@ attach_pooling_impl::attach_pooling_impl() {
                     format::bs_fs_zyx_bsv32_fsv16,
                     format::bs_fs_zyx_bsv32_fsv32 };

-    for (const auto type : types) {
-        for (const auto format : formats) {
-            keys.emplace(type, format);
-        }
-    }
-
+    auto keys = implementation_map<pooling>::combine(types, formats);
    keys.emplace(data_types::f16, format::fs_b_yx_fsv32);
    keys.emplace(data_types::f32, format::fs_b_yx_fsv32);

--- a/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp
@ -98,8 +98,6 @@ public:
 namespace detail {

 attach_quantize_impl::attach_quantize_impl() {
-    std::set<implementation_map<quantize>::key_type> keys;
-
    auto types = {
        data_types::f16,
        data_types::f32,
@ -127,21 +125,20 @@ attach_quantize_impl::attach_quantize_impl() {
        format::bs_fs_zyx_bsv32_fsv16,
        format::bs_fs_zyx_bsv32_fsv32,

-        format::bfwzyx
+        format::bfwzyx,
+        format::bfuwzyx,
+        format::bfvuwzyx,
    };

    auto dyn_formats = {
        format::bfyx,
        format::bfzyx,
-        format::bfwzyx
+        format::bfwzyx,
+        format::bfuwzyx,
+        format::bfvuwzyx,
    };

-    for (const auto type : types) {
-        for (const auto format : formats) {
-            keys.emplace(type, format);
-        }
-    }
-
+    auto keys = implementation_map<quantize>::combine(types, formats);
    keys.emplace(data_types::f16, format::yxfb);
    keys.emplace(data_types::f32, format::yxfb);

--- a/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp
+++ b/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp
@ -120,7 +120,6 @@ public:
        map_type::instance().insert({{impl_type, shape_type}, {keys, factory}});
    }

-private:
    static std::set<key_type> combine(const std::vector<data_types>& types, const std::vector<format::type>& formats) {
        std::set<key_type> keys;
        for (const auto& type : types) {
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl
@ -23,16 +23,40 @@ KERNEL(activation)(
 #endif
    )
 {
-#if OUTPUT_DIMS == 5
+#if OUTPUT_DIMS == 8
+    #define ORDER batch,feature,v,u,w,z,y,x
+#elif OUTPUT_DIMS == 7
+    #define ORDER batch,feature,u,w,z,y,x
+#elif OUTPUT_DIMS == 6
+    #define ORDER batch,feature,w,z,y,x
+#elif OUTPUT_DIMS == 5
    #define ORDER batch,feature,z,y,x
 #elif OUTPUT_DIMS == 4
    #define ORDER batch,feature,y,x
 #endif

-#if OUTPUT_DIMS == 5
-    const unsigned x = get_global_id(0);
+#if OUTPUT_DIMS >= 5
+
+#if OUTPUT_DIMS == 8
+    const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
+    const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
+    const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
+    const uint u = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U;
+    const uint v = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U;
+#elif OUTPUT_DIMS == 7
+    const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
+    const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
+    const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
+    const uint u = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W;
+#elif OUTPUT_DIMS == 6
+    const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
+    const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
+    const uint w = (uint)get_global_id(1) / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z;
+#elif OUTPUT_DIMS == 5
    const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
    const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y;
+#endif
+    const unsigned x = get_global_id(0);
    #if OUTPUT_BATCH_NUM_CONST == 1
        const unsigned feature = (uint)get_global_id(2);
        const unsigned batch = 0;
@ -40,7 +64,7 @@ KERNEL(activation)(
        const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
        const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
    #endif
-#else
+#elif OUTPUT_DIMS <= 4
    #if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16 || defined OUTPUT_LAYOUT_B_FS_YX_FSV32
        const unsigned x = (uint)get_global_id(1);
        const unsigned y = (uint)get_global_id(2);
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_ref.cl
@ -35,6 +35,21 @@ KERNEL(quantize_ref)(
    const int y = (wzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
    const int z = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
    const int w = ((wzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z;
+#elif OUTPUT_DIMS == 7
+    const int uwzyx = get_global_id(2);
+    const int x = uwzyx % OUTPUT_SIZE_X;
+    const int y = (uwzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
+    const int z = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
+    const int w = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
+    const int u = ((uwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W;
+#elif OUTPUT_DIMS == 8
+    const int vuwzyx = get_global_id(2);
+    const int x = vuwzyx % OUTPUT_SIZE_X;
+    const int y = (vuwzyx / OUTPUT_SIZE_X) % OUTPUT_SIZE_Y;
+    const int z = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z;
+    const int w = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
+    const int u = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U;
+    const int v = ((vuwzyx / OUTPUT_SIZE_X) / OUTPUT_SIZE_Y) / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U;
 #endif

 #if PACKED_BINARY_OUTPUT
@ -73,7 +88,11 @@ KERNEL(quantize_ref)(

 #else

-#if INPUT0_DIMS == 6
+#if INPUT0_DIMS == 8
+    const int input_offset = INPUT0_GET_INDEX(b, of, v, u, w, z, y, x);
+#elif INPUT0_DIMS == 7
+    const int input_offset = INPUT0_GET_INDEX(b, of, u, w, z, y, x);
+#elif INPUT0_DIMS == 6
    const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x);
 #elif INPUT0_DIMS == 5
    const int input_offset = INPUT0_GET_INDEX(b, of, z, y, x);
@ -81,7 +100,11 @@ KERNEL(quantize_ref)(
    const int input_offset = INPUT0_GET_INDEX(b, of, y, x);
 #endif

-#if OUTPUT_DIMS == 6
+#if OUTPUT_DIMS == 8
+    const int output_offset = OUTPUT_GET_INDEX(b, of, v, u, w, z, y, x);
+#elif OUTPUT_DIMS == 7
+    const int output_offset = OUTPUT_GET_INDEX(b, of, u, w, z, y, x);
+#elif OUTPUT_DIMS == 6
    const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x);
 #elif OUTPUT_DIMS == 5
    const int output_offset = OUTPUT_GET_INDEX(b, of, z, y, x);
@ -89,7 +112,11 @@ KERNEL(quantize_ref)(
    const int output_offset = OUTPUT_GET_INDEX(b, of, y, x);
 #endif

-#if INPUT1_DIMS == 6
+#if INPUT1_DIMS == 8
+    const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
+#elif INPUT1_DIMS == 7
+    const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, u, w, z, y, x);
+#elif INPUT1_DIMS == 6
    const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, w, z, y, x);
 #elif INPUT1_DIMS == 5
    const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, z, y, x);
@ -97,7 +124,11 @@ KERNEL(quantize_ref)(
    const int input_low_offset = INPUT1_GET_INDEX_SAFE(b, of, y, x);
 #endif

-#if INPUT2_DIMS == 6
+#if INPUT2_DIMS == 8
+    const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
+#elif INPUT2_DIMS == 7
+    const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, u, w, z, y, x);
+#elif INPUT2_DIMS == 6
    const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, w, z, y, x);
 #elif INPUT2_DIMS == 5
    const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, z, y, x);
@ -105,7 +136,11 @@ KERNEL(quantize_ref)(
    const int input_high_offset = INPUT2_GET_INDEX_SAFE(b, of, y, x);
 #endif

-#if INPUT3_DIMS == 6
+#if INPUT3_DIMS == 8
+    const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
+#elif INPUT3_DIMS == 7
+    const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, u, w, z, y, x);
+#elif INPUT3_DIMS == 6
    const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, w, z, y, x);
 #elif INPUT3_DIMS == 5
    const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, z, y, x);
@ -113,7 +148,11 @@ KERNEL(quantize_ref)(
    const int output_low_offset = INPUT3_GET_INDEX_SAFE(b, of, y, x);
 #endif

-#if INPUT4_DIMS == 6
+#if INPUT4_DIMS == 8
+    const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
+#elif INPUT4_DIMS == 7
+    const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, u, w, z, y, x);
+#elif INPUT4_DIMS == 6
    const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, w, z, y, x);
 #elif INPUT4_DIMS == 5
    const int output_high_offset = INPUT4_GET_INDEX_SAFE(b, of, z, y, x);
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_opt.cl
@ -53,6 +53,27 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
    const int w = wzyx_div_xy / OUTPUT_SIZE_Z;

    const int output_offset = OUTPUT_GET_INDEX(b, of, w, z, y, x);
+#elif OUTPUT_DIMS == 7
+    const int uwzyx = get_global_id(GWS_YX);
+
+    const int x = uwzyx % OUTPUT_SIZE_X;
+    const int y = uwzyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y;
+    const int z = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
+    const int w = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
+    const int u = uwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W;
+
+    const int output_offset = OUTPUT_GET_INDEX(b, of, u, w, z, y, x);
+#elif OUTPUT_DIMS == 8
+    const int vuwzyx = get_global_id(GWS_YX);
+
+    const int x = vuwzyx % OUTPUT_SIZE_X;
+    const int y = vuwzyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y;
+    const int z = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y % OUTPUT_SIZE_Z;
+    const int w = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z % OUTPUT_SIZE_W;
+    const int u = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W % OUTPUT_SIZE_U;
+    const int v = vuwzyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y / OUTPUT_SIZE_Z / OUTPUT_SIZE_W / OUTPUT_SIZE_U;
+
+    const int output_offset = OUTPUT_GET_INDEX(b, of, v, u, w, z, y, x);
 #else
 #   error quantize_gpu_scale_shift_opt.cl: output tensors with more than 6 dimensions are unsupported
 #endif
@ -63,6 +84,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
    const int input_offset = INPUT0_GET_INDEX(b, of, z, y, x);
 #elif INPUT0_DIMS == 6
    const int input_offset = INPUT0_GET_INDEX(b, of, w, z, y, x);
+#elif INPUT0_DIMS == 7
+    const int input_offset = INPUT0_GET_INDEX(b, of, u, w, z, y, x);
+#elif INPUT0_DIMS == 8
+    const int input_offset = INPUT0_GET_INDEX(b, of, v, u, w, z, y, x);
 #else
 #   error quantize_gpu_scale_shift_opt.cl: input tensors with more than 6 dimensions are unsupported
 #endif
@ -74,6 +99,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
    const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, z, y, x);
 #elif INPUT1_DIMS == 6
    const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, w, z, y, x);
+#elif INPUT1_DIMS == 7
+    const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, u, w, z, y, x);
+#elif INPUT1_DIMS == 8
+    const int in_range_offset = INPUT1_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
 #else
 #   error quantize_gpu_scale_shift_opt.cl: unsupported INPUT1_DIMS size
 #endif
@ -85,6 +114,10 @@ KERNEL(quantize_gpu_scale_shift_opt)(const __global INPUT0_TYPE* input,
    const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, z, y, x);
 #elif INPUT7_DIMS == 6
    const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, w, z, y, x);
+#elif INPUT7_DIMS == 7
+    const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, u, w, z, y, x);
+#elif INPUT7_DIMS == 8
+    const int scales_offset = INPUT7_GET_INDEX_SAFE(b, of, v, u, w, z, y, x);
 #else
 #   error quantize_gpu_scale_shift_opt.cl: unsupported INPUT7_DIMS size
 #endif
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp
@ -24,13 +24,14 @@ ActivationKernelBase::DispatchData ActivationKernelBase::SetDefault(const activa
    } else if (out_layout == DataLayout::b_fs_yx_fsv16 || out_layout == DataLayout::b_fs_yx_fsv32) {
        dispatchData.gws = {Align(out.Feature().v, 16) * out.Batch().v, out.X().v, out.Y().v};
        dispatchData.lws = {16, 1, 1};
-    } else if (out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 || out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32) {
+    } else if (out_layout == DataLayout::bs_fs_yx_bsv32_fsv16 || out_layout == DataLayout::bs_fs_yx_bsv32_fsv32) {
        dispatchData.gws = {out.X().v * out.Y().v, Align(out.Feature().v, 16), Align(out.Batch().v, 16)};
        dispatchData.lws = {1, 16, 16};
    } else {
-        dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
+        dispatchData.gws = {out.X().v, out.Y().v * out.Z().v * out.W().v * out.U().v * out.V().v, out.Feature().v * out.Batch().v};
        std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
-                                                                         {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
+                                                                         {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z, Tensor::DataChannelName::W,
+                                                                          Tensor::DataChannelName::U, Tensor::DataChannelName::V},
                                                                         {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
    }
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_opt.cpp
@ -81,6 +81,9 @@ bool ActivationKernelOpt::Validate(const Params& p, const optional_params& o) co
        return false;
    }

+    if (params.outputs[0].GetDims().size() > 5)
+        return false;
+
    if (params.outputs[0].GetLayout() != params.inputs[0].GetLayout())
        return false;

--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_ref.cpp
@ -45,7 +45,7 @@ CommonDispatchData QuantizeKernelRef::SetDefault(const quantize_params& params)
    } else {
        dispatchData.gws[0] = output.Batch().v;
        dispatchData.gws[1] = params.packed_binary_output ? CeilDiv(output.Feature().v, 32) : output.Feature().v;
-        dispatchData.gws[2] = Align(output.X().v * output.Y().v * output.Z().v * output.W().v, 16);
+        dispatchData.gws[2] = Align(output.X().v * output.Y().v * output.Z().v * output.W().v * output.U().v * output.V().v, 16);

        dispatchData.lws[0] = 1;
        dispatchData.lws[1] = 1;
--- a/src/plugins/intel_gpu/src/plugin/program.cpp
+++ b/src/plugins/intel_gpu/src/plugin/program.cpp
@ -356,22 +356,11 @@ std::shared_ptr<cldnn::program> Program::BuildProgram(const std::vector<std::sha
    OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Program::BuildProgram");

    for (const auto& op : ops) {
-        if (op->is_dynamic()) {
+        if (requires_new_shape_infer(*op)) {
            allow_new_shape_infer = true;
-        }
-
-        for (size_t i = 0; i < op->get_output_size(); i++) {
-            if (op->get_output_partial_shape(i).size() > 6)
-                allow_new_shape_infer = true;
-        }
-        for (size_t i = 0; i < op->get_input_size(); i++) {
-            if (op->get_input_partial_shape(i).size() > 6)
-                allow_new_shape_infer = true;
-        }
-
-        if (allow_new_shape_infer)
            break;
        }
+    }

    m_config.set_property(ov::intel_gpu::partial_build_program(partialBuild));
    m_config.set_property(ov::intel_gpu::optimize_data(true));
@ -415,6 +404,7 @@ bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const st
        // 2. We also check parameters of each operation, which means we have more
        //    reliable results of QueryNetwork call.
        PrepareBuild(network.getInputsInfo(), network.getOutputsInfo());
+        allow_new_shape_infer = requires_new_shape_infer(*op);
        CreateSingleLayerPrimitive(topology, op);
        CleanupBuild();
        DisableQueryMode();
@ -540,6 +530,24 @@ void Program::add_primitive(const ngraph::Node& op, std::shared_ptr<cldnn::primi
    m_topology->add_primitive(prim);
 }

+bool Program::requires_new_shape_infer(const ngraph::Node& op) const {
+    if (op.is_dynamic()) {
+        return true;
+    }
+
+    for (size_t i = 0; i < op.get_output_size(); i++) {
+        if (op.get_output_partial_shape(i).size() > 6)
+            return true;
+    }
+
+    for (size_t i = 0; i < op.get_input_size(); i++) {
+        if (op.get_input_partial_shape(i).size() > 6)
+            return true;
+    }
+
+    return false;
+}
+
 // TODO: Does it make sense to add such method to ngraph core?
 bool IsNodeOnConstPath(const std::shared_ptr<ngraph::Node>& node) {
    std::set<std::shared_ptr<ngraph::Node>> nodes_processed = {};
--- a/src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp
+++ b/src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp
@ -17,7 +17,7 @@ using namespace ::tests;

 namespace {
 struct activation_test_params {
-    tensor input_size;
+    ov::PartialShape input_size;
    data_types input_type;
    format input_format;
    data_types default_type;
@ -46,11 +46,11 @@ public:
    }

    layout get_input_layout(activation_test_params& p) {
-        return layout{ p.input_type, p.input_format, p.input_size };
+        return layout{ p.input_size, p.input_type, p.input_format, };
    }

    layout get_per_channel_layout(activation_test_params& p) {
-        return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } };
+        return layout{ { 1, p.input_size[1], 1, 1 }, p.default_type, p.default_format };
    }

    format get_input_format(activation_test_params &p) { return p.input_format; }
@ -70,6 +70,7 @@ public:
 #define CASE_ACTIVATION_F32_5 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F32_6 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
 #define CASE_ACTIVATION_F32_7 { 1, 17, 31, 29 }, data_types::f32, format::fyxb, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_8 { 1, 2, 3, 4, 5, 3, 2, 3 }, data_types::f32, format::bfvuwzyx, data_types::f32, format::bfvuwzyx
 #define CASE_ACTIVATION_3D_F32_0 { 3, 16, 13, 13, 13 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
 #define CASE_ACTIVATION_3D_F32_1 { 2, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
 #define CASE_ACTIVATION_3D_F32_2 { 1, 16, 7, 7, 7 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx
@ -221,6 +222,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_eltwise_activation_quantize_u8,
    activation_test_params{ CASE_ACTIVATION_3D_F32_0, 3, 5, "activation_ref" },
    activation_test_params{ CASE_ACTIVATION_3D_F32_1, 3, 5, "activation_ref" },
    activation_test_params{ CASE_ACTIVATION_3D_F32_2, 3, 5, "activation_ref" },
+    activation_test_params{ CASE_ACTIVATION_F32_8, 3, 5, "activation_ref" },
 }));

 class activation_eltwise_activation_quantize_u8_onendnn : public ActivationFusingTest {};
--- a/src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp
+++ b/src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp
@ -17,7 +17,7 @@ using namespace ::tests;

 namespace {
 struct eltwise_test_params {
-    tensor input_size;
+    ov::PartialShape input_size;
    data_types input_type;
    data_types input_type2;
    format input_format;
@ -49,15 +49,15 @@ public:
    }

    layout get_input_layout(eltwise_test_params& p) {
-        return layout{ p.input_type, p.input_format, p.input_size };
+        return layout{ p.input_size, p.input_type, p.input_format };
    }

    layout get_input_layout2(eltwise_test_params& p) {
-        return layout{ p.input_type2, p.input_format, p.input_size };
+        return layout{ p.input_size, p.input_type2, p.input_format };
    }

    layout get_per_channel_layout(eltwise_test_params& p) {
-        return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } };
+        return layout{ { 1, p.input_size[1], 1, 1 }, p.default_type, p.default_format  };
    }
 };
 }  // namespace
@ -105,6 +105,8 @@ public:
 #define CASE_ELTWISE_I8_4           { 2, 16, 4, 4 }, data_types::i8,  data_types::i8,  format::b_fs_yx_fsv4,  data_types::f32,  format::b_fs_yx_fsv4,    eltwise_mode::sum
 #define CASE_ELTWISE_U8_4           { 2, 16, 4, 4 }, data_types::u8,  data_types::u8,  format::b_fs_yx_fsv4,  data_types::f32,  format::b_fs_yx_fsv4,    eltwise_mode::sum

+#define CASE_ELTWISE_FP16_7         { 3, 32, 2, 3, 3, 2, 1, 2 }, data_types::f16, data_types::f16, format::bfvuwzyx,  data_types::f16,  format::bfvuwzyx,    eltwise_mode::sum
+
 class eltwise_quantize : public EltwiseFusingTest {};
 TEST_P(eltwise_quantize, u8) {
    auto p = GetParam();
@ -412,6 +414,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fused_prims, ::testing::Value
    eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 5 },
    eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 },
    eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP16_7, 3, 5 },
    eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 },
    eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 },
    eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 },
--- a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp
+++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp
@ -63,6 +63,16 @@ const std::map<ActivationTypes, std::vector<std::vector<float>>> activationTypes
        {SoftSign,              {}},
 };

+const std::map<ActivationTypes, std::vector<std::vector<float>>> big_rank_activation_types = {
+        {Relu,                  {}},
+        {Exp,                   {}},
+        {Log,                   {}},
+        {Abs,                   {}},
+        {Clamp,                 {{-2.0f, 2.0f}}},
+        {Ceiling,               {}},
+        {Swish,                 {{0.5f}}},
+};
+
 const std::map<ActivationTypes, std::vector<std::vector<float>>> activationParamTypes = {
    {PReLu, {{-0.01f}}},
    {LeakyRelu, {{0.01f}}}
@ -73,12 +83,19 @@ std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> basic = {
        {{1, 128}, {{}}},
 };

+std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> big_ranks = {
+        {{1, 2, 3, 4, 5, 3}, {{}}},
+        {{1, 2, 3, 4, 1, 3, 2}, {{}}},
+        {{1, 2, 3, 4, 3, 2, 1, 2}, {{}}},
+};
+
 std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> preluBasic = {
        {{1, 10, 20}, {{10}, {20}, {10, 20}}},
        {{1, 128}, {{1}, {128}}},
 };

-const auto basicCases = ::testing::Combine(
+const auto basicCases = []() {
+    return ::testing::Combine(
        ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)),
        ::testing::ValuesIn(netPrecisions),
        ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
@ -86,10 +103,11 @@ const auto basicCases = ::testing::Combine(
        ::testing::Values(InferenceEngine::Layout::ANY),
        ::testing::Values(InferenceEngine::Layout::ANY),
        ::testing::ValuesIn(CommonTestUtils::combineParams(basic)),
-        ::testing::Values(CommonTestUtils::DEVICE_GPU)
-);
+        ::testing::Values(CommonTestUtils::DEVICE_GPU));
+};

-const auto basicPreluCases = ::testing::Combine(
+const auto basicPreluCases = []() {
+    return ::testing::Combine(
        ::testing::ValuesIn(CommonTestUtils::combineParams(activationParamTypes)),
        ::testing::ValuesIn(netPrecisions),
        ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
@ -97,13 +115,27 @@ const auto basicPreluCases = ::testing::Combine(
        ::testing::Values(InferenceEngine::Layout::ANY),
        ::testing::Values(InferenceEngine::Layout::ANY),
        ::testing::ValuesIn(CommonTestUtils::combineParams(preluBasic)),
-        ::testing::Values(CommonTestUtils::DEVICE_GPU)
-);
+        ::testing::Values(CommonTestUtils::DEVICE_GPU));
+};

+const auto big_rank_cases = []() {
+    return ::testing::Combine(
+        ::testing::ValuesIn(CommonTestUtils::combineParams(big_rank_activation_types)),
+        ::testing::ValuesIn(netPrecisions),
+        ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+        ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+        ::testing::Values(InferenceEngine::Layout::ANY),
+        ::testing::Values(InferenceEngine::Layout::ANY),
+        ::testing::ValuesIn(CommonTestUtils::combineParams(big_ranks)),
+        ::testing::Values(CommonTestUtils::DEVICE_GPU));
+};

-INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationLayerTest, basicCases, ActivationLayerTest::getTestCaseName);
-INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic_Prelu, ActivationLayerTest, basicPreluCases, ActivationLayerTest::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationLayerTest, basicCases(), ActivationLayerTest::getTestCaseName);

-INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationParamLayerTest, basicPreluCases, ActivationLayerTest::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(Activation_BigRanks, ActivationLayerTest, big_rank_cases(), ActivationLayerTest::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic_Prelu, ActivationLayerTest, basicPreluCases(), ActivationLayerTest::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(smoke_Activation_Basic, ActivationParamLayerTest, basicPreluCases(), ActivationLayerTest::getTestCaseName);

 }  // namespace
--- a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/fake_quantize.cpp
+++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/fake_quantize.cpp
@ -16,7 +16,7 @@ const std::vector<InferenceEngine::Precision> netPrecisions = {
        InferenceEngine::Precision::FP16
 };

-const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}};
+const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}, {1, 2, 3, 4, 2, 3, 2, 2}};
 const std::vector<std::vector<size_t>> constShapes = {{1}};
 const std::vector<size_t> levels = {16, 255, 256};