[GPU] Better extension requirements checks in kernels. Subgroups basic emulation (#13926)

This commit is contained in:
Vladimir Paramuzov
2022-12-29 09:08:05 +04:00
committed by GitHub
parent 4831a9ead4
commit 13c8b4fdc7
398 changed files with 2706 additions and 3093 deletions

View File

@@ -55,12 +55,15 @@ struct device_info {
bool supports_fp16; ///< Does engine support FP16.
bool supports_fp64; ///< Does engine support FP64.
bool supports_fp16_denorms; ///< Does engine support denormalized FP16.
bool supports_subgroups; ///< Does engine support cl_intel_subgroups extension.
bool supports_subgroups_short; ///< Does engine support cl_intel_subgroups_short extension.
bool supports_subgroups_char; ///< Does engine support cl_intel_subgroups_char extension.
bool supports_local_block_io; ///< Does engine support cl_intel_subgroup_local_block_io extension. Check program build with this option.
bool supports_khr_subgroups; ///< Does engine support cl_khr_subgroups extension.
bool supports_intel_subgroups; ///< Does engine support cl_intel_subgroups extension.
bool supports_intel_subgroups_short; ///< Does engine support cl_intel_subgroups_short extension.
bool supports_intel_subgroups_char; ///< Does engine support cl_intel_subgroups_char extension.
bool supports_intel_required_subgroup_size; ///< Does engine support cl_intel_required_subgroup_size extension.
bool supports_local_block_io; ///< Does engine support cl_intel_subgroup_local_block_io extension.
bool supports_queue_families; ///< Does engine support cl_intel_command_queue_families extension.
bool supports_image; ///< Does engine support images (CL_DEVICE_IMAGE_SUPPORT cap).
bool supports_intel_planar_yuv; ///< Does engine support cl_intel_planar_yuv extension.
bool supports_imad; ///< Does engine support int8 mad.
bool supports_immad; ///< Does engine support int8 multi mad.

View File

@@ -591,6 +591,9 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
return true;
}
if (node.get_primitive()->deformable_mode)
return false;
// Since reorder inputs is called after this pass
// we have to check that blocked formats can be used in the network and layer is optimized for it.
if ((node.get_output_layout().format == format::b_fs_yx_fsv16 ||

View File

@@ -1016,14 +1016,18 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
const auto& device_info = program->get_engine().get_device_info();
params.uniqueID = std::to_string(param_info.unique_id);
params.engineInfo.bSubGroupSupport = device_info.supports_subgroups;
params.engineInfo.bSubGroupShortSupport = device_info.supports_subgroups_short;
params.engineInfo.bSubGroupCharSupport = device_info.supports_subgroups_char;
params.engineInfo.bFP16Support = device_info.supports_fp16;
params.engineInfo.bFP64Support = device_info.supports_fp64;
params.engineInfo.bIMADSupport = device_info.supports_imad != 0;
params.engineInfo.bIMMADSupport = device_info.supports_immad != 0;
params.engineInfo.bImageSupport = device_info.supports_image != 0;
params.engineInfo.supports_fp16 = device_info.supports_fp16;
params.engineInfo.supports_fp64 = device_info.supports_fp64;
params.engineInfo.supports_fp16_denorms = device_info.supports_fp16_denorms;
params.engineInfo.supports_khr_subgroups = device_info.supports_khr_subgroups;
params.engineInfo.supports_intel_subgroups = device_info.supports_intel_subgroups;
params.engineInfo.supports_intel_subgroups_short = device_info.supports_intel_subgroups_short;
params.engineInfo.supports_intel_subgroups_char = device_info.supports_intel_subgroups_char;
params.engineInfo.supports_intel_required_subgroup_size = device_info.supports_intel_required_subgroup_size;
params.engineInfo.supports_imad = device_info.supports_imad;
params.engineInfo.supports_immad = device_info.supports_immad;
params.engineInfo.enable_sub_groups_emulation = true;
params.engineInfo.bOptHintsSupport = false;
params.engineInfo.bLocalBlockIOSupport = device_info.supports_local_block_io && program->is_local_block_io_supported();
@@ -1038,6 +1042,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.deviceCache = program->get_tuning_cache();
params.engineInfo.driverVersion = device_info.driver_version;
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
params.engineInfo.vendor_id = device_info.vendor_id;
auto impl_forcing_bo = program->get_options().get<build_option_type::force_implementations>();
const auto& impl_forcing = impl_forcing_bo->forcing;

View File

@@ -1066,6 +1066,11 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
auto input_layout = node.get_dependency(0).get_output_layout();
auto output_layout = node.calc_output_layout();
if (prim->deformable_mode) {
output_layout.format = format::adjust_to_rank(format::bfyx, output_layout.get_partial_shape().size());
return output_layout;
}
if (input_layout.is_dynamic() || output_layout.is_dynamic()) {
if (input_layout.get_partial_shape().size() <= 4)
expected_format = format::b_fs_yx_fsv16;

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
KERNEL(activation)(
__global INPUT0_TYPE* input,

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#ifdef PARAMETERIZED

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#if MAX_POOLING

View File

@@ -2,8 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/fetch_utils.cl"
#ifdef BATCH_AXIS
#define VALUES_NUM INPUT0_BATCH_NUM
@@ -44,32 +43,6 @@
#define MINIMUM_NUMBER_FOR_PARTIAL_SORTING 100
#define unroll_for __attribute__((opencl_unroll_hint)) for
///////////////////////// Input offset /////////////////////////
inline uint FUNC(get_input_offset)(uint b, uint f, uint z, uint y, uint x)
{
#if INPUT0_DIMS < 5
return INPUT0_GET_INDEX(b, f, y, x);
#elif INPUT0_DIMS == 5
return INPUT0_GET_INDEX(b, f, z, y, x);
#else
#error arg_max_min_axis.cl: input format - not supported
#endif
}
///////////////////////// Output offset ////////////////////////
inline uint FUNC(get_output_offset)(uint b, uint f, uint z, uint y, uint x)
{
#if OUTPUT_DIMS < 5
return OUTPUT_GET_INDEX(b, f, y, x);
#elif OUTPUT_DIMS == 5
return OUTPUT_GET_INDEX(b, f, z, y, x);
#else
#error arg_max_min_axis.cl: output format - not supported
#endif
}
KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
,__global OUTPUT_TYPE* output
#ifdef SECOND_OUTPUT_EXIST
@@ -174,41 +147,41 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
indices[AXIS] = sort_idx;
iav_type result;
result.value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result.value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
result.index = sort_idx;
for (uint i = 0; i < sort_idx / 8; i++) {
uint index_offset = i * 8;
indices[AXIS] = index_offset;
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 1;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 2;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 3;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 4;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 5;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 6;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 7;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
if (sort_position >= TOP_K)
@@ -217,7 +190,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
for (uint i = (sort_idx / 8) * 8; i < sort_idx; i++) {
indices[AXIS] = i;
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
}
@@ -227,7 +200,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
for (uint i = sort_idx + 1; i < VALUES_NUM; i++) {
indices[AXIS] = i;
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_2 test_value)
sort_position++;
if (sort_position >= TOP_K)
@@ -236,7 +209,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
// Using simple sorting for sorting by indices and when TOP_K == 1
#elif TOP_K == 1
INPUT0_TYPE val = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE val = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
result[0].index = 0;
result[0].value = val;
bool already_exist = false;
@@ -255,7 +228,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
}
indices[AXIS] = i;
INPUT0_TYPE in_data = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE in_data = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (val COMPARE_SIGN in_data) {
result[top_k].index = i;
result[top_k].value = in_data;
@@ -270,26 +243,26 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
for (uint i = 0; i < VALUES_NUM / 8; i++) {
uint index_offset = i * 8;
indices[AXIS] = result[index_offset].index = index_offset;
result[index_offset].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 1].index = index_offset + 1;
result[index_offset + 1].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 1].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 2].index = index_offset + 2;
result[index_offset + 2].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 2].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 3].index = index_offset + 3;
result[index_offset + 3].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 3].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 4].index = index_offset + 4;
result[index_offset + 4].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 4].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 5].index = index_offset + 5;
result[index_offset + 5].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 5].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 6].index = index_offset + 6;
result[index_offset + 6].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 6].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 7].index = index_offset + 7;
result[index_offset + 7].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 7].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
}
for (uint i = (VALUES_NUM / 8) * 8; i < VALUES_NUM; i++) {
indices[AXIS] = result[i].index = i;
result[i].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[i].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
}
for (uint k = 1; k < VALUES_NUM; k *= 2) {
@@ -320,26 +293,26 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
for (uint i = 0; i < VALUES_NUM / 8; i++) {
uint index_offset = i * 8;
indices[AXIS] = temp_buf[index_offset].index = index_offset;
temp_buf[index_offset].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 1].index = index_offset + 1;
temp_buf[index_offset + 1].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 1].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 2].index = index_offset + 2;
temp_buf[index_offset + 2].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 2].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 3].index = index_offset + 3;
temp_buf[index_offset + 3].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 3].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 4].index = index_offset + 4;
temp_buf[index_offset + 4].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 4].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 5].index = index_offset + 5;
temp_buf[index_offset + 5].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 5].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 6].index = index_offset + 6;
temp_buf[index_offset + 6].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 6].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 7].index = index_offset + 7;
temp_buf[index_offset + 7].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 7].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
}
for (uint i = (VALUES_NUM / 8) * 8; i < VALUES_NUM; i++) {
indices[AXIS] = temp_buf[i].index = i;
temp_buf[i].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[i].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
}
for (uint group = 0; group < group_num - 1; group++) {
@@ -439,22 +412,22 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
#if SORT_BY_VALUE
indices[AXIS] = sort_position;
#ifdef TOP_K_ORDER
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.value);
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.value);
#else
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.index);
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.index);
#endif
#ifdef SECOND_OUTPUT_EXIST
#ifdef MULTIPLE_OUTPUTS
#ifdef TOP_K_ORDER
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.index);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.index);
#else
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.value);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.value);
#endif
#else
#ifdef TOP_K_ORDER
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.index);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.index);
#else
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.value);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.value);
#endif
#endif
#endif
@@ -472,22 +445,22 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
indices[AXIS] = out_position;
#ifdef TOP_K_ORDER
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].value);
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].value);
#else
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].index);
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].index);
#endif
#ifdef SECOND_OUTPUT_EXIST
#ifdef MULTIPLE_OUTPUTS
#ifdef TOP_K_ORDER
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].index);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].index);
#else
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].value);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].value);
#endif
#else
#ifdef TOP_K_ORDER
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].index);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].index);
#else
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].value);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].value);
#endif
#endif
#endif
@@ -504,4 +477,3 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
#undef AXIS
#undef VALUES_NUM
#undef MINIMUM_NUMBER_FOR_PARTIAL_SORTING
#undef unroll_for

View File

@@ -3,8 +3,7 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#define GLOBAL_SIZE 128
#define LOCAL_SIZE GLOBAL_SIZE
@@ -13,7 +12,7 @@
#define INPUT0_FILL_VAL INPUT0_VAL_MIN
#else
#define COMPARE_SIGN >
#define INPUT0_FILL_VAL INPUT0_VAL_MAX
#define INPUT0_FILL_VAL INPUT0_VAL_MAX
#endif
__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
@@ -39,8 +38,7 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
uint temp_index = global_index;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < TOP_K; i++){
unroll_for(uint i = 0; i < TOP_K; i++){
accumulator.index = global_index;
accumulator.value = input[global_index];
for (int j = 0; j < i; j++){
@@ -49,10 +47,10 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
}
global_index += GLOBAL_SIZE;
#ifdef INPUT0_LAYOUT_BFYX
while (global_index < size + batch_offset)
while (global_index < size + batch_offset)
#else
while (global_index < size)
#endif
#endif
{
iav_type element;
element.value = input[global_index];
@@ -72,7 +70,7 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
global_index += GLOBAL_SIZE * INPUT0_BATCH_NUM;
#endif
}
#ifdef INPUT0_LAYOUT_BFYX
if (local_index < size)
scratch[local_index] = accumulator;
@@ -84,14 +82,13 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
else
scratch[local_index].value = INPUT0_FILL_VAL;
#endif
barrier(CLK_LOCAL_MEM_FENCE);
__attribute__((opencl_unroll_hint))
for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
unroll_for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
{
if (local_index < offset)
if (local_index < offset)
{
iav_type other = scratch[local_index + offset];
iav_type mine = scratch[local_index];
@@ -103,16 +100,16 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#ifdef INPUT0_LAYOUT_BFYX
if (local_index == 0)
if (local_index == 0)
{
output[current_batch * TOP_K + i] = scratch[0].index % size;
}
global_index = temp_index;
results[i] = scratch[0].index % size;
#else
if (local_index == 0)
if (local_index == 0)
{
output[current_batch + i*INPUT0_BATCH_NUM] = scratch[0].index / INPUT0_BATCH_NUM;
}
@@ -123,4 +120,4 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
}
#undef COMPARE_SIGN
#undef INPUT0_FILL_VAL
#undef INPUT0_FILL_VAL

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#ifndef SG_SIZE
#define SG_SIZE 16
@@ -36,7 +35,7 @@
#endif
__attribute__((intel_reqd_sub_group_size(SG_SIZE)))
REQD_SUB_GROUP_SIZE(SG_SIZE)
__attribute__((reqd_work_group_size(SG_SIZE, 1, 1)))
KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
{
@@ -56,8 +55,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
// (gid + 1) <= input_size / (INB_ARRAY_SIZE * SG_SIZE) -> as gid is integral, the floor is not an issue
if (gid + 1 <= input_size / (INB_ARRAY_SIZE * SG_SIZE))
{
__attribute__((opencl_unroll_hint))
for (uint ai = 0; ai < INB_ARRAY_SIZE; ++ai)
unroll_for(uint ai = 0; ai < INB_ARRAY_SIZE; ++ai)
{
// Can be exchanged with sub-group block read to INB_ARRAY_SIZE-component vector.
input_blocks[ai] = input[gid * INB_ARRAY_SIZE * SG_SIZE + ai * SG_SIZE + lid];
@@ -69,8 +67,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
const uint last_gid = input_size / (INB_ARRAY_SIZE * SG_SIZE);
uint ai = 0;
__attribute__((opencl_unroll_hint))
for (uint last_base_off = last_gid * INB_ARRAY_SIZE * SG_SIZE; last_base_off + SG_SIZE <= input_size; last_base_off += SG_SIZE)
unroll_for(uint last_base_off = last_gid * INB_ARRAY_SIZE * SG_SIZE; last_base_off + SG_SIZE <= input_size; last_base_off += SG_SIZE)
{
// Can be exchanged with sub-group block read to scalar.
input_blocks[ai] = input[last_base_off + lid];
@@ -85,8 +82,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
indices[ai++] = lid < input_size - remainder_off ? remainder_off + lid : 0;
}
__attribute__((opencl_unroll_hint))
for (; ai < INB_ARRAY_SIZE; ++ai)
unroll_for(; ai < INB_ARRAY_SIZE; ++ai)
{
input_blocks[ai] = UNIT_FILL_VAL;
}
@@ -98,8 +94,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
UNIT_TYPE acc[minmax_acc_array_size];
uint result[minmax_acc_array_size];
__attribute__((opencl_unroll_hint))
for (uint ai = 0; ai < minmax_acc_array_size; ++ai)
unroll_for (uint ai = 0; ai < minmax_acc_array_size; ++ai)
{
acc[ai] = UNIT_FILL_VAL;
result[ai] = 0;
@@ -109,24 +104,22 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
__attribute__((opencl_unroll_hint(1)))
for (uint ii = 0; ii < INB_ARRAY_SIZE * SG_SIZE; ++ii)
{
UNIT_TYPE in_val = intel_sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
uint in_index = intel_sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
__attribute__((opencl_unroll_hint))
for (uint ai = 0; ai < minmax_acc_array_size; ++ai)
UNIT_TYPE in_val = _sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
uint in_index = _sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
unroll_for(uint ai = 0; ai < minmax_acc_array_size; ++ai)
{
bool insert_flag = (in_val OP_ARG_REL acc[ai]);
if (sub_group_any(insert_flag))
{
__attribute__((opencl_unroll_hint))
for (uint aj = minmax_acc_array_size; aj > ai + 1; --aj)
unroll_for(uint aj = minmax_acc_array_size; aj > ai + 1; --aj)
{
acc[aj - 1] = intel_sub_group_shuffle_up(acc[aj - 2], acc[aj - 1], 1);
result[aj - 1] = intel_sub_group_shuffle_up(result[aj - 2], acc[aj - 1], 1);
acc[aj - 1] = _sub_group_shuffle_up(acc[aj - 2], acc[aj - 1], 1);
result[aj - 1] = _sub_group_shuffle_up(result[aj - 2], acc[aj - 1], 1);
}
UNIT_TYPE in_val_acc_mask = select(in_val, acc[ai], insert_flag);
uint in_index_mask = select(in_index, result[ai], insert_flag);
acc[ai] = select(acc[ai], intel_sub_group_shuffle_up(in_val, in_val_acc_mask, 1), insert_flag);
result[ai] = select(result[ai], intel_sub_group_shuffle_up(in_index, in_index_mask, 1), insert_flag);
acc[ai] = select(acc[ai], _sub_group_shuffle_up(in_val, in_val_acc_mask, 1), insert_flag);
result[ai] = select(result[ai], _sub_group_shuffle_up(in_index, in_index_mask, 1), insert_flag);
break;
}
}
@@ -135,8 +128,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
// Write TOP_K sorted results.
uint ai = 0;
__attribute__((opencl_unroll_hint))
for (uint k_base_off = 0; k_base_off + SG_SIZE <= TOP_K; k_base_off += SG_SIZE)
unroll_for (uint k_base_off = 0; k_base_off + SG_SIZE <= TOP_K; k_base_off += SG_SIZE)
{
output[k_base_off + lid] = result[ai++] % input_size;
}
@@ -161,4 +153,4 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
#undef UNIT_FILL_VAL
#undef UNIT_FILL_VAL_NEEDSUNDEF_
#endif
#undef OP_ARG_REL
#undef OP_ARG_REL

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(average_unpooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(batch_to_space_ref)(const __global INPUT0_TYPE* input,

View File

@@ -2,17 +2,19 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define OC_BLOCK_SIZE 32
#define GET_WEI(data, id) intel_sub_group_shuffle(data, id)
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint(val))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
#define GET_WEI(data, id) _sub_group_shuffle(data, id)
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) _sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint(val))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL(binary_convolution_1x1)(const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,

View File

@@ -2,17 +2,19 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
#define OC_BLOCK_SIZE 16
#define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
#define GET_SRC(data, id) _sub_group_shuffle(data, id)
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL(binary_convolution_1x1_b_fs_yx_fsv16)(const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,

View File

@@ -2,13 +2,14 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define OC_BLOCK_SIZE 32
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
#if BINARY_PACKED_OUTPUT
#define BUFFER_TYPE UNIT_TYPE
@@ -16,7 +17,7 @@
#define BUFFER_TYPE OUTPUT_TYPE
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL(binary_convolution_generic)(const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -107,7 +108,7 @@ KERNEL(binary_convolution_generic)(const __global INPUT0_TYPE* input,
__attribute__((opencl_unroll_hint(SUB_GROUP_SIZE)))
for (int i = 0; i < SUB_GROUP_SIZE; i++)
{
INPUT0_TYPE src = intel_sub_group_shuffle(line_cache[(kw + i*STRIDE_SIZE_X) / SUB_GROUP_SIZE],
INPUT0_TYPE src = _sub_group_shuffle(line_cache[(kw + i*STRIDE_SIZE_X) / SUB_GROUP_SIZE],
(kw + i*STRIDE_SIZE_X) % SUB_GROUP_SIZE);
#if EXCLUDE_PAD
int compute = ((input_x + kw + i*STRIDE_SIZE_X >= 0) &&
@@ -149,7 +150,7 @@ KERNEL(binary_convolution_generic)(const __global INPUT0_TYPE* input,
for (int i = 0; i < SUB_GROUP_SIZE*2; i++)
{
#if EXCLUDE_PAD
CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM*intel_sub_group_shuffle(real_ks, i%SUB_GROUP_SIZE) - 2*dst_buf[i]);
CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM*_sub_group_shuffle(real_ks, i%SUB_GROUP_SIZE) - 2*dst_buf[i]);
#else
CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM*FILTER_SIZE_Y*FILTER_SIZE_X - 2*dst_buf[i]);
#endif

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(binary_convolution_ref)(const __global INPUT0_TYPE* input,

View File

@@ -2,34 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_DIMS < 5
return INPUT0_GET_INDEX(b, f, y, x);
#elif INPUT0_DIMS == 5
return INPUT0_GET_INDEX(b, f, z, y, x);
#elif INPUT0_DIMS == 6
return INPUT0_GET_INDEX(b, f, w, z, y, x);
#else
#error [clDNN border_gpu_ref.cl]: input format - not supported
#endif
}
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_DIMS < 5
return OUTPUT_GET_INDEX(b, f, y, x);
#elif OUTPUT_DIMS == 5
return OUTPUT_GET_INDEX(b, f, z, y, x);
#elif OUTPUT_DIMS == 6
return OUTPUT_GET_INDEX(b, f, w, z, y, x);
#else
#error [clDNN border_gpu_ref.cl]: output format - not supported
#endif
}
#include "include/fetch_utils.cl"
KERNEL(border_gpu_ref)(
const __global INPUT0_TYPE* input,

View File

@@ -2,8 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/fetch_utils.cl"
#define GET_UPDATES_INDEX(prefix, idx_order) CAT(prefix, _GET_INDEX)(idx_order)

View File

@@ -3,7 +3,8 @@
//
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#define WORK_GROUP_SIZE 16
#define IC_BLOCK 16
@@ -21,10 +22,8 @@
# define TILE_F 1
#endif
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
__attribute__((reqd_work_group_size(1, WORK_GROUP_SIZE, 1)))
__attribute__((intel_reqd_sub_group_size(WORK_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(WORK_GROUP_SIZE)
KERNEL (concatenation_gpu_blocked)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -52,8 +51,7 @@ KERNEL (concatenation_gpu_blocked)(
OUTPUT_BLOCK_WRITE(output, dst_index, res);
} else {
if (lid < INPUT0_FEATURE_NUM % IC_BLOCK) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_XY; ++tx) {
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
OUTPUT_TYPE res = TO_OUTPUT_TYPE(ACTIVATION(((INPUT0_TYPE*)&src)[tx], ACTIVATION_PARAMS));
output[dst_index + tx * IC_BLOCK + lid] = res;
}
@@ -78,12 +76,11 @@ KERNEL (concatenation_gpu_blocked)(
INPUT_VEC_TYPE src_al1 = 0;
INPUT_VEC_TYPE src_al2 = 0;
#endif
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_XY; ++tx) {
((INPUT0_TYPE*)&src_al0)[tx] = intel_sub_group_shuffle_down(((INPUT0_TYPE*)&src0)[tx], ((INPUT0_TYPE*)&src1)[tx], (IC_BLOCK - MISALIGNMENT));
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
((INPUT0_TYPE*)&src_al0)[tx] = _sub_group_shuffle_down(((INPUT0_TYPE*)&src0)[tx], ((INPUT0_TYPE*)&src1)[tx], (IC_BLOCK - MISALIGNMENT));
#if TILE_F == 4
((INPUT0_TYPE*)&src_al1)[tx] = intel_sub_group_shuffle_down(((INPUT0_TYPE*)&src1)[tx], ((INPUT0_TYPE*)&src2)[tx], (IC_BLOCK - MISALIGNMENT));
((INPUT0_TYPE*)&src_al2)[tx] = intel_sub_group_shuffle_down(((INPUT0_TYPE*)&src2)[tx], ((INPUT0_TYPE*)&src3)[tx], (IC_BLOCK - MISALIGNMENT));
((INPUT0_TYPE*)&src_al1)[tx] = _sub_group_shuffle_down(((INPUT0_TYPE*)&src1)[tx], ((INPUT0_TYPE*)&src2)[tx], (IC_BLOCK - MISALIGNMENT));
((INPUT0_TYPE*)&src_al2)[tx] = _sub_group_shuffle_down(((INPUT0_TYPE*)&src2)[tx], ((INPUT0_TYPE*)&src3)[tx], (IC_BLOCK - MISALIGNMENT));
#endif
}
OUTPUT_VEC_TYPE res_al0 = TO_OUTPUT_VEC_TYPE(ACTIVATION(src_al0, ACTIVATION_PARAMS));
@@ -105,8 +102,7 @@ KERNEL (concatenation_gpu_blocked)(
#endif
dst_index = OUTPUT_GET_INDEX(b, (f_block*IC_BLOCK + lid_f_offset + output_offset_in_concat_axis), y, x);
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_XY; ++tx) {
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
OUTPUT_TYPE res_unal = TO_OUTPUT_TYPE(ACTIVATION(((INPUT0_TYPE*)&src_unal)[tx], ACTIVATION_PARAMS));
output[dst_index + tx * IC_BLOCK] = res_unal;
}
@@ -115,15 +111,13 @@ KERNEL (concatenation_gpu_blocked)(
{
const uint dst_index = OUTPUT_GET_INDEX(b, (f_block*IC_BLOCK + lid + output_offset_in_concat_axis), y, x);
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < TILE_F; ++fw) {
unroll_for(uint fw = 0; fw < TILE_F; ++fw) {
if (TILE_F != 1 && CEIL_DIV(INPUT0_FEATURE_NUM, IC_BLOCK) % TILE_F != 0 && CEIL_DIV(INPUT0_FEATURE_NUM, IC_BLOCK) % TILE_F == fw)
break;
bool do_leftover_write = INPUT0_FEATURE_NUM % IC_BLOCK == 0 || f_block * IC_BLOCK + fw * IC_BLOCK + lid < INPUT0_FEATURE_NUM;
if (do_leftover_write) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_XY; ++tx) {
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
INPUT0_TYPE src = input[input_offset + lid + tx * IC_BLOCK + fw * INPUT0_FEATURE_PITCH * IC_BLOCK];
OUTPUT_TYPE res = TO_OUTPUT_TYPE(ACTIVATION(src, ACTIVATION_PARAMS));
output[dst_index + tx * IC_BLOCK + fw * OUTPUT_FEATURE_PITCH * IC_BLOCK] = res;
@@ -144,4 +138,3 @@ KERNEL (concatenation_gpu_blocked)(
#undef OUTPUT_BLOCK_WRITE
#undef TILE_F
#undef CEIL_DIV

View File

@@ -2,7 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
//
@@ -16,17 +17,9 @@
#define WORK_GROUP_SIZE 16
#define INPUT0_ELEMENTS_COUNT (INPUT0_LENGTH/INPUT0_BATCH_NUM)
#if FP16_UNIT_USED
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (byte_offset), as_ushort8(val))
#else
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (byte_offset), as_uint8(val))
#endif
__attribute__((reqd_work_group_size(1, WORK_GROUP_SIZE, 1)))
__attribute__((intel_reqd_sub_group_size(WORK_GROUP_SIZE)))
KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __global UNIT_TYPE* output, uint output_offset_in_concat_axis)
REQD_SUB_GROUP_SIZE(WORK_GROUP_SIZE)
KERNEL(concatenation_gpu_depth_bfyx_no_pitch)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
{
const uint batch_id = get_group_id(0);
@@ -41,7 +34,7 @@ KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __gl
const uint output_offset = OUTPUT_OFFSET + element_group_offset + output_batch_offset + output_offset_in_concat_axis*OUTPUT_PITCHES[CONCAT_AXIS_INDEX];
//Check if current group in batch starts from 16-byte aligned pos. If not then move block read to 16-byte aligned position.
//Requirement for intel_sub_group_block_write8.
//Requirement for _sub_group_block_write8.
uint align_offset = 0;
const uint group_start_pos = output_offset;
if(group_start_pos % WORK_GROUP_SIZE != 0)
@@ -52,8 +45,8 @@ KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __gl
if(element_group_offset + align_offset + WORK_GROUP_SIZE * ELEMENTS_PER_WORK_ITEM < INPUT0_ELEMENTS_COUNT)
{
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) in = ALIGNED_BLOCK_READ8(input, input_offset + align_offset);
ALIGNED_BLOCK_WRITE8(output, output_offset + align_offset, ACTIVATION(in, ACTIVATION_PARAMS));
MAKE_VECTOR_TYPE(INPUT0_TYPE, 8) in = DT_INPUT_BLOCK_READ8(input, input_offset + align_offset);
DT_OUTPUT_BLOCK_WRITE8(output, output_offset + align_offset, ACTIVATION(in, ACTIVATION_PARAMS));
//Fill the values that were missed upon adding align_offset
if((align_offset != 0) && (element_offset + output_batch_offset < group_start_pos + align_offset))

View File

@@ -2,12 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/unit_type.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
@@ -23,10 +20,10 @@
// must be equal FSV / SUB_GROUP_SIZE
// ======================================================================================
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global UNIT_TYPE* input,
__global UNIT_TYPE* output,
KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
uint output_offset_in_concat_axis
)
{
@@ -44,12 +41,12 @@ KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global UNIT_TYPE* input,
input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
input_offset += fs * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV * INPUT0_BATCH_NUM;
UNIT_TYPE2 in = UNIT_BLOCK_READ2(input, input_offset);
MAKE_VECTOR_TYPE(INPUT0_TYPE, 2) in = DT_INPUT_BLOCK_READ2(input, input_offset);
in = ACTIVATION(in, ACTIVATION_PARAMS);
#if ALIGNED
const uint dst_index = OUTPUT_GET_INDEX(b, output_offset_in_concat_axis + fs * FSV, y, x);
UNIT_BLOCK_WRITE2(output, dst_index, in);
DT_OUTPUT_BLOCK_WRITE2(output, dst_index, in);
#else
const uint dst_feature = fs * FSV + output_offset_in_concat_axis + sglid;
if (dst_feature + SUB_GROUP_SIZE < OUTPUT_FEATURE_NUM) {
@@ -63,8 +60,6 @@ KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global UNIT_TYPE* input,
#endif
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#define GET_INDEX(prefix, ORDER) CAT(prefix, _GET_INDEX)(ORDER)

View File

@@ -2,53 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
///////////////////////// Input Index /////////////////////////
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_SIMPLE && INPUT0_DIMS <= 4
return GET_DATA_INDEX(INPUT0, b, f, y, x);
#elif INPUT0_SIMPLE && INPUT0_DIMS == 5
return GET_DATA_INDEX_5D(INPUT0, b, f, z, y, x);
#elif INPUT0_SIMPLE && INPUT0_DIMS == 6
return GET_DATA_INDEX_6D(INPUT0, b, f, w, z, y, x);
#elif INPUT0_LAYOUT_B_FS_ZYX_FSV16
return GET_DATA_B_FS_ZYX_FSV16_INDEX(INPUT0, b, f, z, y, x);
#elif INPUT0_LAYOUT_BS_FS_ZYX_BSV16_FSV16
return GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(INPUT0, b, f, z, y, x);
#elif INPUT0_LAYOUT_BS_FS_YX_BSV16_FSV16
return GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, y, x);
#elif INPUT0_LAYOUT_BS_FS_YX_BSV32_FSV32
return GET_DATA_BS_FS_YX_BSV32_FSV32_INDEX(INPUT0, b, f, y, x);
#else
#error concatenation_gpu_simple_ref.cl: input format - not supported
#endif
}
///////////////////////// Output Index /////////////////////////
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_SIMPLE && OUTPUT_DIMS <= 4
return GET_DATA_INDEX(OUTPUT, b, f, y, x);
#elif OUTPUT_SIMPLE && OUTPUT_DIMS == 5
return GET_DATA_INDEX_5D(OUTPUT, b, f, z, y, x);
#elif OUTPUT_SIMPLE && OUTPUT_DIMS == 6
return GET_DATA_INDEX_6D(OUTPUT, b, f, w, z, y, x);
#elif OUTPUT_LAYOUT_B_FS_ZYX_FSV16
return GET_DATA_B_FS_ZYX_FSV16_INDEX(OUTPUT, b, f, z, y, x);
#elif OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16
return GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(OUTPUT, b, f, z, y, x);
#elif OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16
return GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, b, f, y, x);
#elif OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32
return GET_DATA_BS_FS_YX_BSV32_FSV32_INDEX(OUTPUT, b, f, y, x);
#else
#error concatenation_gpu_simple_ref.cl: output format - not supported
#endif
}
#include "include/fetch_utils.cl"
KERNEL (concatenation_gpu_ref)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
{

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/data_types.cl"
#if defined(CONVERT_FROM_NV12) || defined(CONVERT_FROM_I420)
#ifdef BUFFER_MEM

View File

@@ -4,8 +4,9 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#define TYPE_N_(type, n) type##n
#define TYPE_N(type, n) TYPE_N_(type, n)
@@ -60,13 +61,10 @@
#endif
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define FSV 16
#define SIMD 16
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(1, SIMD * FEATURE_SLM_SPLIT, 1)))
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
const __global INPUT0_TYPE *conv_input,
@@ -102,8 +100,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
const uint max_out_yx = OUTPUT_SIZE_X * OUTPUT_SIZE_Y;
uint max_local_yx = min(max_out_yx, out_yx_sg + OUT_BLOCK_SPATIAL);
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
uint out_yx_shuffle = out_yx_sg + sglid + os * SIMD;
uint out_yx_clamp = max_out_yx % OUT_BLOCK_SPATIAL == 0
? out_yx_shuffle
@@ -136,8 +133,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
uint input_y[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
#endif
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
#ifdef SHOULD_USE_DATA_ZP
input_x[os] = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
input_y[os] = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
@@ -158,18 +154,15 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
uint4 weights_zp_val[OUT_BLOCK_FEATURES];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
}
#if INPUT0_FEATURE_NUM % FSV != 0
uint4 weights_zp_vec_partial[OUT_BLOCK_FEATURES];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
__attribute__((opencl_unroll_hint))
for (uint f = INPUT0_FEATURE_NUM % FSV; f < FSV; f++) {
unroll_for(uint f = INPUT0_FEATURE_NUM % FSV; f < FSV; f++) {
wzp_p[f] = 0;
}
}
@@ -181,8 +174,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
#if INPUT0_FEATURE_NUM % FSV != 0
if (feature_offset + (k + 1) * FSV >= ALIGN(INPUT0_FEATURE_NUM, FSV)) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
}
}
@@ -199,11 +191,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OUT_BLOCK_FEATURES];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
dotProdAZPxWZP[ofb] = 0;
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ive++) {
unroll_for(uint ive = 0; ive < 4; ive++) {
dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAZPxWZP[ofb][ive],
AS_INPUT0_TYPE_4(data_zp_val[ive]),
@@ -213,14 +203,12 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#endif
uint4 weights_val[OUT_BLOCK_FEATURES] = { };
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
weights_val[ofb] = vload4(0, (__global uint*)(weights + filter_idx + ofb * WEIGHTS_FEATURE_BLOCK_PITCH));
}
uint4 input_val[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
#if defined ASYMMETRIC_DATA_QUANTIZATION && defined NON_ZERO_INPUT0_PAD_BEFORE
if (((input_x[os] < 0) || (input_x[os] >= INPUT0_SIZE_X)) ||
((input_y[os] < 0) || (input_y[os] >= INPUT0_SIZE_Y))) {
@@ -236,12 +224,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#if OUT_BLOCK_FEATURES > 1 && FEATURE_SLM_SPLIT != 1 && OUT_BLOCK_SPATIAL > 14
// For some cases compiler spills here due to loop order
// Use suboptimal order to avoid this at cost of instruction dispatch delays.
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ++ive) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint ive = 0; ive < 4; ++ive) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
#ifdef SHOULD_USE_DATA_ZP
ACCUMULATOR_TYPE dotProdAZPxW = 0;
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
@@ -250,10 +235,8 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
#endif
#else
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ++ive) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ive = 0; ive < 4; ++ive) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
#ifdef SHOULD_USE_DATA_ZP
ACCUMULATOR_TYPE dotProdAZPxW = 0;
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
@@ -261,10 +244,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
AS_INPUT0_TYPE_4(data_zp_val[ive]),
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
#endif
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#endif
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD));
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD));
dotProd[ofb][os] = IMAD(dotProd[ofb][os],
inputs,
@@ -293,8 +275,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
}
filter_idx += WEIGHTS_IS_PITCH;
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
input_idx[os] += INPUT0_FEATURE_PITCH * FSV;
}
@@ -317,27 +298,21 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
__local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL + sglid;
if (get_sub_group_id() < OUT_BLOCK_FEATURES) {
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
unroll_for(uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
if (get_sub_group_id() == wg) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < wg; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint ofb = 0; ofb < wg; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_SPATIAL * SIMD +
os * SIMD;
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
}
}
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
dotProd[0][os] = dotProd[wg][os];
}
__attribute__((opencl_unroll_hint))
for (uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
((wg != 0) ? OUT_BLOCK_SPATIAL * OUT_BLOCK_FEATURES * SIMD : 0) +
ofb * OUT_BLOCK_SPATIAL * SIMD +
@@ -348,10 +323,8 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
}
}
} else {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_SPATIAL * SIMD +
os * SIMD;
@@ -366,10 +339,8 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
return;
partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_SPATIAL * SIMD + sglid;
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
wg * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL +
os * SIMD;
@@ -399,18 +370,15 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#ifdef COMPENSATION_TERM
COMPENSATION_TYPE comp[FINAL_OUT_BLOCK_FEATURES];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
unroll_for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
comp[ofb] = compensation[out_f + ofb * SIMD];
}
#endif
// Convert accumulator type to activation type
ACTIVATION_TYPE dequantized[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
dequantized[ofb][os] = TO_ACTIVATION_TYPE(dotProd[ofb][os]);
#if BIAS_TERM
@@ -424,13 +392,11 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
// Fused ops/activation
OUTPUT_TYPE result[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
unroll_for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
FUSED_OPS_PRELOAD_SCALAR;
#endif
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
FUSED_OPS_CALC_SCALAR;
@@ -462,10 +428,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
if (can_use_full_block_write) {
uint output_idx = OUTPUT_GET_INDEX(out_b,
out_fg,
intel_sub_group_shuffle(out_y_shuffle[0], 0),
intel_sub_group_shuffle(out_x_shuffle[0], 0));
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
_sub_group_shuffle(out_y_shuffle[0], 0),
_sub_group_shuffle(out_x_shuffle[0], 0));
unroll_for(uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
|| (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
|| (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
@@ -474,8 +439,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#if OUTPUT_TYPE_SIZE == 1
for (; os + 8 <= OUT_BLOCK_SPATIAL; os += 8) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 8; ++i) {
unroll_for(uint i = 0; i < 8; ++i) {
result_val[i] = result[ofb][os + i];
}
DT_OUTPUT_BLOCK_WRITE8(output, output_idx, result_val);
@@ -485,8 +449,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#if OUTPUT_TYPE_SIZE <= 2
for (; os + 4 <= OUT_BLOCK_SPATIAL; os += 4) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 4; ++i) {
unroll_for(uint i = 0; i < 4; ++i) {
result_val[i] = result[ofb][os + i];
}
DT_OUTPUT_BLOCK_WRITE4(output, output_idx, result_val);
@@ -495,8 +458,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#endif
for (; os + 2 <= OUT_BLOCK_SPATIAL; os += 2) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 2; ++i) {
unroll_for(uint i = 0; i < 2; ++i) {
result_val[i] = result[ofb][os + i];
}
DT_OUTPUT_BLOCK_WRITE2(output, output_idx, result_val);
@@ -512,23 +474,20 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
}
} else {
uint output_idx_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
output_idx_shuffle[os] = OUTPUT_GET_INDEX(out_b, out_fg, out_y_shuffle[os], out_x_shuffle[os]);
}
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
|| (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
|| (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
if (good_of_block) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
bool good_os = (max_out_yx % OUT_BLOCK_SPATIAL == 0) || (out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL) || (os < max_out_yx % OUT_BLOCK_SPATIAL);
if (!good_os)
break;
uint output_idx = intel_sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
uint output_idx = _sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
bool good_of = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_f + ofb * SIMD < OUTPUT_FEATURE_NUM);
if (!good_of)
@@ -538,8 +497,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
}
}
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
output_idx_shuffle[os] += OUTPUT_FEATURE_PITCH * FSV;
}
}
@@ -582,8 +540,5 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#undef AS_FILTER_TYPE_4
#undef CEIL_DIV
#undef ALIGN
#undef SIMD
#undef FSV

View File

@@ -4,8 +4,7 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
// ======================================================================================
// Host side jit-constants:
@@ -23,8 +22,6 @@
// data prefetching; requires additional global barrier
// ======================================================================================
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define FSV 4
#define WEIGHTS_OSV 16
@@ -61,7 +58,7 @@
// WI: 1 x FEATURES_PER_WI x 1
// SG: 1 x FEATURES_PER_WI x SIMD
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(SIMD, 1, LWG_DEPTH)))
KERNEL(convolution)(
const __global uint *input,
@@ -134,7 +131,7 @@ KERNEL(convolution)(
weights_offset += WEIGHTS_IS_PITCH / FSV * LWG_DEPTH;
unroll_for (uint out_fi = 0; out_fi < FEATURES_PER_WI; ++out_fi) {
int wei_i = intel_sub_group_shuffle(wei_sg[out_fi / SIMD], out_fi % SIMD);
int wei_i = _sub_group_shuffle(wei_sg[out_fi / SIMD], out_fi % SIMD);
FILTER_TYPE4 wei_val = AS_FILTER_TYPE4(wei_i);
dotProd[out_fi] = IMAD(dotProd[out_fi], in_val, wei_val);
@@ -223,8 +220,6 @@ KERNEL(convolution)(
}
}
#undef unroll_for
#undef FSV
#undef WEIGHTS_OSV

View File

@@ -2,10 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
// ======================================================================================
// Host side jit-constants:
@@ -51,7 +51,6 @@
#define WEIGHTS_YXS_PITCH 4
#define FILTER_SPATIAL_SIZE (FILTER_SIZE_X * FILTER_SIZE_Y)
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#if FILTER_BLOCKED < FILTER_SPATIAL_SIZE && FILTER_BLOCKED % 4 != 0
# error convolution_gpu_b_fs_yx_fsv4_dw.cl - filter blocks must either cover whole spatial filter or be multiple of 4.
@@ -76,9 +75,9 @@
#endif
#if TILED
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
#endif
KERNEL(convolution)(
KERNEL(convolution_gpu_b_fs_yx_fsv4_dw)(
const __global INPUT_TYPE4 *input,
__global OUTPUT_TYPE4 *output,
const __global FILTER_TYPE4 *weights,
@@ -114,11 +113,9 @@ KERNEL(convolution)(
#if PRELOAD_INPUT || TILED
INPUT_TYPE4 in[FILTER_SIZE_Y * INPUT_LINE_SIZE];
__attribute__((opencl_unroll_hint))
for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) {
unroll_for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) {
// TODO Try to avoid loading last input line in padded situations
__attribute__((opencl_unroll_hint))
for (uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
unroll_for(uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
uint preload_offset = yi * INPUT_LINE_SIZE + xi;
uint input_x_offset = xi * (INPUT_X_PITCH / FSV);
uint input_y_offset = yi * (DILATION_SIZE_Y * INPUT_Y_PITCH / FSV);
@@ -135,10 +132,8 @@ KERNEL(convolution)(
#if PRELOAD_WEIGHTS
FILTER_TYPE4 wei[CEIL_DIV(FILTER_SPATIAL_SIZE, 4) * 4];
__attribute__((opencl_unroll_hint))
for (uint fsi = 0; fsi < FILTER_SPATIAL_SIZE; fsi += 4) {
__attribute__((opencl_unroll_hint))
for (uint ofi = 0; ofi < 4; ++ofi) {
unroll_for (uint fsi = 0; fsi < FILTER_SPATIAL_SIZE; fsi += 4) {
unroll_for(uint ofi = 0; ofi < 4; ++ofi) {
uint preload_offset = (fsi / 4) * 4 + ofi;
uint weights_idx = weights_offset + ofi * WEIGHTS_I_PITCH + (fsi / 4) * WEIGHTS_YXS_PITCH;
wei[preload_offset] = weights[weights_idx];
@@ -159,8 +154,7 @@ for (; y < tile_y_end; ++y) {
int acc[OUTPUT_BLOCK_X][4] = { };
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
unroll_for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
uint4 fis = (uint4)(fi, fi + 1, fi + 2, fi + 3);
uint4 fx = fis % FILTER_SIZE_X;
@@ -178,17 +172,16 @@ for (; y < tile_y_end; ++y) {
wei3 = weights[weights_offset + 3 * WEIGHTS_I_PITCH];
#endif
__attribute__((opencl_unroll_hint))
for (uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
unroll_for(uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
INPUT_TYPE4 in_trans0;
INPUT_TYPE4 in_trans1;
INPUT_TYPE4 in_trans2;
INPUT_TYPE4 in_trans3;
#if TILED
in_trans0 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans1 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans2 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans3 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s3]), (fx.s3 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans0 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans1 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans2 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans3 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s3]), (fx.s3 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
#elif PRELOAD_INPUT
uint4 input_x_offset = (fx * DILATION_SIZE_X + oxi * STRIDE_SIZE_X);
uint4 input_y_offset = fy * INPUT_LINE_SIZE;
@@ -243,19 +236,18 @@ for (; y < tile_y_end; ++y) {
wei3 = weights[weights_offset + 3 * WEIGHTS_I_PITCH];
# endif
__attribute__((opencl_unroll_hint))
for (uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
unroll_for(uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
INPUT_TYPE4 in_trans0;
INPUT_TYPE4 in_trans1;
INPUT_TYPE4 in_trans2;
INPUT_TYPE4 in_trans3;
#if TILED
in_trans0 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans0 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
# if FILTER_BLOCKED % 4 > 1
in_trans1 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans1 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
# endif
# if FILTER_BLOCKED % 4 > 2
in_trans2 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans2 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
# endif
#elif PRELOAD_INPUT
uint4 input_x_offset = (fx * DILATION_SIZE_X + oxi * STRIDE_SIZE_X);
@@ -317,16 +309,14 @@ for (; y < tile_y_end; ++y) {
wei3 = weights[weights_offset + 3 * WEIGHTS_I_PITCH];
# endif
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
unroll_for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
uint fx = (fi + FILTER_BLOCKED) % FILTER_SIZE_X;
uint fy = (fi + FILTER_BLOCKED) / FILTER_SIZE_X;
__attribute__((opencl_unroll_hint))
for (uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
unroll_for(uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
# if TILED
in0 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy]), (fx * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in0 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy]), (fx * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
# elif PRELOAD_INPUT
uint input_x_offset = (fx * DILATION_SIZE_X + oxi * STRIDE_SIZE_X);
uint input_y_offset = fy * INPUT_LINE_SIZE;
@@ -349,17 +339,14 @@ for (; y < tile_y_end; ++y) {
#endif
#if TILE_Y != 1
__attribute__((opencl_unroll_hint))
for (uint yi = 0; yi < FILTER_SIZE_Y - 1; ++yi) {
__attribute__((opencl_unroll_hint))
for (uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
unroll_for (uint yi = 0; yi < FILTER_SIZE_Y - 1; ++yi) {
unroll_for(uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
in[yi * INPUT_LINE_SIZE + xi] = in[(yi + 1) * INPUT_LINE_SIZE + xi];
}
}
{
uint yi = FILTER_SIZE_Y - 1;
__attribute__((opencl_unroll_hint))
for (uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
unroll_for(uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
in[yi * INPUT_LINE_SIZE + xi] = input[input_offset + xi * (INPUT_X_PITCH / FSV)];
}
input_offset += DILATION_SIZE_Y * INPUT_Y_PITCH / FSV;
@@ -456,4 +443,3 @@ for (; y < tile_y_end; ++y) {
#undef WEIGHTS_YXS_PITCH
#undef FILTER_SPATIAL_SIZE
#undef CEIL_DIV

View File

@@ -3,12 +3,11 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#include "include/imad.cl"
#include "include/batch_headers/imad.cl"
#define INPUT0_PACKED_TYPE uint
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_b_fs_yx_fsv4_int8)(
const __global INPUT0_PACKED_TYPE* input,

View File

@@ -2,8 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
@@ -146,10 +147,8 @@
# error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - internal error, CHECK_BOUNDARY_IN_SLM enabled without PRELOAD_INPUT_TO_SLM.
#endif
#define CEIL_DIV(a, b) ( ((a) + (b) - 1) / (b) )
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(LWS0, LWS1, SIMD)))
KERNEL(convolution)(
const __global INPUT0_TYPE *input,
@@ -209,8 +208,7 @@ KERNEL(convolution)(
#if ASYMMETRIC_DATA_QUANTIZATION && CHECK_BOUNDARY_IN_SLM
uint4 azp_uniform[FSV / iteration_preload_bytes];
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < FSV / iteration_preload_bytes; ++i) {
unroll_for(uint i = 0; i < FSV / iteration_preload_bytes; ++i) {
azp_uniform[i] = ((const __global uint4*)(activations_zp + (f + i * iteration_preload_bytes)))[0];
}
#endif
@@ -285,8 +283,7 @@ KERNEL(convolution)(
if (early_return)
return;
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
unroll_for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
// Loop over 4 filter spatials that match imad case
uint4 fis = (uint4)(fi, fi + 1, fi + 2, fi + 3);
@@ -307,8 +304,7 @@ KERNEL(convolution)(
uint4 input_idx = input_spatial_offset + input_offset;
uint tx = 0;
__attribute__((opencl_unroll_hint))
for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
unroll_for(; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
INPUT_TYPE16 tmp_in0 = INPUT_BLOCK_READN(16, input_ptr, input_idx.s0);
INPUT_TYPE16 tmp_in1 = INPUT_BLOCK_READN(16, input_ptr, input_idx.s1);
INPUT_TYPE16 tmp_in2 = INPUT_BLOCK_READN(16, input_ptr, input_idx.s2);
@@ -374,13 +370,11 @@ KERNEL(convolution)(
uint4 input_y_offset = fy * dilation_size_y * input_y_pitch;
uint4 input_spatial_offset = input_x_offset + input_y_offset;
uint4 input_start_offset = input_spatial_offset + input_offset;
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
uint4 input_idx = input_start_offset + tx * STRIDE_SIZE_X * input_x_pitch;
// Block reads along feature slice
uint fw = 0;
__attribute__((opencl_unroll_hint))
for (; fw + 4 <= F_PER_WI; fw += 4) {
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
INPUT_TYPE4 tmp_in0 = INPUT_BLOCK_READN(4, input_ptr, input_idx.s0);
INPUT_TYPE4 tmp_in1 = INPUT_BLOCK_READN(4, input_ptr, input_idx.s1);
INPUT_TYPE4 tmp_in2 = INPUT_BLOCK_READN(4, input_ptr, input_idx.s2);
@@ -417,14 +411,12 @@ KERNEL(convolution)(
#endif
// Weights loading:
FILTER_TYPE4 wei[F_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
wei[fw] = AS_FILTER_TYPE4(_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
}
#if CHECK_BOUNDARY && !CHECK_BOUNDARY_IN_SLM
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
int4 input_x = convert_int4(x * STRIDE_SIZE_X + tx * STRIDE_SIZE_X + fx * DILATION_SIZE_X) - PADDING_SIZE_X;
int4 input_y = convert_int4(y * STRIDE_SIZE_Y + fy * dilation_size_y) - PADDING_SIZE_Y;
int4 input_pad = input_x < 0 || input_x >= INPUT0_SIZE_X || input_y < 0 || input_y >= INPUT0_SIZE_Y;
@@ -433,20 +425,16 @@ KERNEL(convolution)(
#else
#define padding_value(fw) ((INPUT0_TYPE)0)
#endif
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans0[tx * F_PER_WI + fwp] = input_pad.s0 ? padding_value(fwp) : in_trans0[tx * F_PER_WI + fwp];
}
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans1[tx * F_PER_WI + fwp] = input_pad.s1 ? padding_value(fwp) : in_trans1[tx * F_PER_WI + fwp];
}
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans2[tx * F_PER_WI + fwp] = input_pad.s2 ? padding_value(fwp) : in_trans2[tx * F_PER_WI + fwp];
}
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans3[tx * F_PER_WI + fwp] = input_pad.s3 ? padding_value(fwp) : in_trans3[tx * F_PER_WI + fwp];
}
#undef padding_value
@@ -455,30 +443,24 @@ KERNEL(convolution)(
// Transpose input:
INPUT_TYPE4 in[TILE_X * F_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
uint in_offset = tx * F_PER_WI + fw;
in[in_offset] = (INPUT_TYPE4)(in_trans0[in_offset], in_trans1[in_offset], in_trans2[in_offset], in_trans3[in_offset]);
}
}
// IMAD:
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
acc[tx * F_PER_WI + fw] = IMAD(acc[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], wei[fw]);
}
}
#if ASYMMETRIC_WEIGHTS_QUANTIZATION
// Accumulate for input values for asymmetric weights:
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
src_sum[tx * F_PER_WI + fw] = IMAD(src_sum[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], (char4)(1, 1, 1, 1));
}
}
@@ -492,13 +474,11 @@ KERNEL(convolution)(
// Leftovers in filters spatial - use raw multiplication instead of imad
// Load inputs before loop to avoid byte scattered reads + there are at most 3 leftovers
FILTER_TYPE4 wei[F_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
unroll_for (uint fw = 0; fw < F_PER_WI; ++fw) {
wei[fw] = AS_FILTER_TYPE4(_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
}
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
unroll_for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
// Input loading:
uint fx = (fi + FILTER_BLOCKED) % FILTER_SIZE_X;
uint fy = (fi + FILTER_BLOCKED) / FILTER_SIZE_X;
@@ -511,8 +491,7 @@ KERNEL(convolution)(
uint input_idx = input_spatial_offset + input_offset;
uint tx = 0;
__attribute__((opencl_unroll_hint))
for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
unroll_for(; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
INPUT_TYPE16 tmp_in0 = INPUT_BLOCK_READN(16, input_ptr, input_idx);
VEC_TO_ARRAY_16(in_trans0, tmp_in0, tx);
input_idx += 16 * SIMD;
@@ -543,12 +522,10 @@ KERNEL(convolution)(
uint input_y_offset = fy * dilation_size_y * input_y_pitch;
uint input_spatial_offset = input_x_offset + input_y_offset;
uint input_start_offset = input_spatial_offset + input_offset;
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
uint input_idx = input_start_offset + tx * STRIDE_SIZE_X * input_x_pitch;
uint fw = 0;
__attribute__((opencl_unroll_hint))
for (; fw + 4 <= F_PER_WI; fw += 4) {
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
INPUT_TYPE4 tmp_in0 = INPUT_BLOCK_READN(4, input_ptr, input_idx);
VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx * F_PER_WI + fw);
input_idx += 4 * SIMD;
@@ -566,8 +543,7 @@ KERNEL(convolution)(
# endif
#if CHECK_BOUNDARY && !CHECK_BOUNDARY_IN_SLM
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
int input_x = (x + tx) * STRIDE_SIZE_X + fx * DILATION_SIZE_X - PADDING_SIZE_X;
int input_y = y * STRIDE_SIZE_Y + fy * dilation_size_y - PADDING_SIZE_Y;
int input_pad = input_x < 0 || input_x >= INPUT0_SIZE_X || input_y < 0 || input_y >= INPUT0_SIZE_Y;
@@ -576,8 +552,7 @@ KERNEL(convolution)(
#else
#define padding_value(fw) ((INPUT0_TYPE)0)
#endif
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans0[tx * F_PER_WI + fwp] = input_pad ? padding_value(fwp) : in_trans0[tx * F_PER_WI + fwp];
}
#undef padding_value
@@ -585,20 +560,16 @@ KERNEL(convolution)(
#endif
// Raw multiply accumulate:
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
acc[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw] * (int)wei[fw][fi];
}
}
#if ASYMMETRIC_WEIGHTS_QUANTIZATION
// Accumulate input values for asymmetric weights:
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
src_sum[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw];
}
}
@@ -614,18 +585,14 @@ KERNEL(convolution)(
#if BIAS_TERM
# if BIAS_PER_OFM
MAKE_VECTOR_TYPE(BIAS_TYPE, F_PER_WI) bias_val = BLOCK_READN(BIAS_TYPE, F_PER_WI, biases, f);
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(((BIAS_TYPE*)&bias_val)[fw]);
}
}
# elif BIAS_PER_OUTPUT
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
uint bias_offset = GET_BIAS_INDEX(b, f + fw * SIMD + get_sub_group_local_id(), y, x + tx);
BIAS_TYPE bias = biases[bias_offset];
dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(bias);
@@ -639,10 +606,8 @@ KERNEL(convolution)(
#if ASYMMETRIC_WEIGHTS_QUANTIZATION
{
MAKE_VECTOR_TYPE(WEIGHTS_ZERO_POINTS_TYPE, F_PER_WI) wzp = BLOCK_READN(WEIGHTS_ZERO_POINTS_TYPE, F_PER_WI, weights_zp, f);
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
dequantized[tx * F_PER_WI + fw] -= TO_DEQUANTIZED_TYPE(src_sum[tx * F_PER_WI + fw]) * TO_DEQUANTIZED_TYPE(((WEIGHTS_ZERO_POINTS_TYPE*)&wzp)[fw]);
}
}
@@ -652,10 +617,8 @@ KERNEL(convolution)(
#if COMPENSATION_TERM
{
MAKE_VECTOR_TYPE(COMPENSATION_TYPE, F_PER_WI) comp = BLOCK_READN(COMPENSATION_TYPE, F_PER_WI, compensation, f);
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(((COMPENSATION_TYPE*)&comp)[fw]);
}
}
@@ -664,14 +627,12 @@ KERNEL(convolution)(
OUTPUT_TYPE out[TILE_X * F_PER_WI];
// Fused ops and conversion to output type
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
#if HAS_FUSED_OPS
uint fused_ops_x = x + tx;
uint fused_ops_f = f;
uint fw = 0;
__attribute__((opencl_unroll_hint))
for (; fw + 4 <= F_PER_WI; fw += 4) {
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
DEQUANTIZED_TYPE4 fused_ops_in;
ARRAY_TO_VEC_4(fused_ops_in, dequantized, tx * F_PER_WI + fw);
FUSED_OPS_4;
@@ -693,8 +654,7 @@ KERNEL(convolution)(
out[tx * F_PER_WI + fw] = FUSED_OPS_RESULT_1;
}
#else
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
out[tx * F_PER_WI + fw] = TO_OUTPUT_TYPE(dequantized[tx * F_PER_WI + fw]);
}
#endif
@@ -702,10 +662,8 @@ KERNEL(convolution)(
// Fill results outside output in features with OUTPUT_PAD_VALUE.
if (OUTPUT_FEATURE_NUM % FSV != 0 && f + FSV > OUTPUT_FEATURE_NUM) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
const uint sglid = get_sub_group_local_id();
// Hint here can save some movs if features are divisible by SIMD and not by FSV
ASSUME_HINT(sglid < SIMD);
@@ -721,8 +679,7 @@ KERNEL(convolution)(
// Full output tile x write using block write ladder
uint tx = 0;
#if OUTPUT_TYPE_SIZE * 16 <= MAX_OPT_BLOCK_WRITE_BYTES
__attribute__((opencl_unroll_hint))
for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
unroll_for(; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
OUTPUT_TYPE16 tmp_write;
ARRAY_TO_VEC_16(tmp_write, out, tx);
DT_OUTPUT_BLOCK_WRITE16(output, output_offset, tmp_write);
@@ -730,8 +687,7 @@ KERNEL(convolution)(
}
#endif
#if OUTPUT_TYPE_SIZE * 8 <= MAX_OPT_BLOCK_WRITE_BYTES
__attribute__((opencl_unroll_hint))
for (; tx + 8 <= TILE_X * F_PER_WI; tx += 8) {
unroll_for(; tx + 8 <= TILE_X * F_PER_WI; tx += 8) {
OUTPUT_TYPE8 tmp_write;
ARRAY_TO_VEC_8(tmp_write, out, tx);
DT_OUTPUT_BLOCK_WRITE8(output, output_offset, tmp_write);
@@ -739,16 +695,14 @@ KERNEL(convolution)(
}
#endif
#if OUTPUT_TYPE_SIZE * 4 <= MAX_OPT_BLOCK_WRITE_BYTES
__attribute__((opencl_unroll_hint))
for (; tx + 4 <= TILE_X * F_PER_WI; tx += 4) {
unroll_for(; tx + 4 <= TILE_X * F_PER_WI; tx += 4) {
OUTPUT_TYPE4 tmp_write;
ARRAY_TO_VEC_4(tmp_write, out, tx);
DT_OUTPUT_BLOCK_WRITE4(output, output_offset, tmp_write);
output_offset += 4 * SIMD;
}
#endif
__attribute__((opencl_unroll_hint))
for (; tx + 2 <= TILE_X * F_PER_WI; tx += 2) {
unroll_for(; tx + 2 <= TILE_X * F_PER_WI; tx += 2) {
OUTPUT_TYPE2 tmp_write;
ARRAY_TO_VEC_2(tmp_write, out, tx);
DT_OUTPUT_BLOCK_WRITE2(output, output_offset, tmp_write);
@@ -759,20 +713,17 @@ KERNEL(convolution)(
}
} else {
// Leftovers write, block writes in f dimension only
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
if (tx < OUTPUT_SIZE_X % TILE_X) {
uint fw = 0;
#if OUTPUT_TYPE_SIZE * 4 <= MAX_OPT_BLOCK_WRITE_BYTES
__attribute__((opencl_unroll_hint))
for (; fw + 4 <= F_PER_WI; fw += 4) {
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
OUTPUT_TYPE4 tmp_write;
ARRAY_TO_VEC_4(tmp_write, out, tx * F_PER_WI + fw);
DT_OUTPUT_BLOCK_WRITE4(output, output_offset + fw * SIMD, tmp_write);
}
#endif
__attribute__((opencl_unroll_hint))
for (; fw + 2 <= F_PER_WI; fw += 2) {
unroll_for(; fw + 2 <= F_PER_WI; fw += 2) {
OUTPUT_TYPE2 tmp_write;
ARRAY_TO_VEC_2(tmp_write, out, tx * F_PER_WI + fw);
DT_OUTPUT_BLOCK_WRITE2(output, output_offset + fw * SIMD, tmp_write);

View File

@@ -4,8 +4,9 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#define TYPE_N_(type, n) type##n
#define TYPE_N(type, n) TYPE_N_(type, n)
@@ -45,15 +46,12 @@
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define SIMD 16
#define FSV 16
// int8 conv_input and weights data is packed to int32 "batches",
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
const __global INPUT0_TYPE *conv_input,
@@ -129,18 +127,15 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
uint4 weights_zp_val[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
}
#if FILTER_IFM_NUM % FSV != 0
uint4 weights_zp_vec_partial[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
__attribute__((opencl_unroll_hint))
for (uint f = FILTER_IFM_NUM % FSV; f < FSV; f++) {
unroll_for(uint f = FILTER_IFM_NUM % FSV; f < FSV; f++) {
wzp_p[f] = 0;
}
}
@@ -152,8 +147,7 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
#if FILTER_IFM_NUM % FSV != 0
if (in_f_start + (k + 1) * FSV >= ALIGN(FILTER_IFM_NUM, FSV)) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
}
}
@@ -170,11 +164,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
dotProdAZPxWZP[ofb] = 0;
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ive++) {
unroll_for(uint ive = 0; ive < 4; ive++) {
dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAZPxWZP[ofb][ive],
AS_INPUT0_TYPE_4(data_zp_val[ive]),
@@ -188,12 +180,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
__attribute__((opencl_unroll_hint(1)))
for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
// Load input block IN_BLOCK_DEPTH x IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
__attribute__((opencl_unroll_hint))
for (uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
__attribute__((opencl_unroll_hint))
for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
__attribute__((opencl_unroll_hint))
for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
unroll_for(uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
unroll_for(uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
unroll_for(uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
#ifdef SHOULD_USE_DATA_ZP
const int y_idx = input_y + fyn * DILATION_SIZE_Y + iyb;
@@ -300,23 +289,17 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
}
__attribute__((opencl_unroll_hint))
for (uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
__attribute__((opencl_unroll_hint))
for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
__attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
unroll_for(uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
unroll_for(uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
unroll_for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
uint4 weights_val[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
}
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ive++) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for (uint ive = 0; ive < 4; ive++) {
unroll_for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
#ifdef SHOULD_USE_DATA_ZP
ACCUMULATOR_TYPE dotProdAZPxW = 0;
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
@@ -325,19 +308,16 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
#endif
__attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
unroll_for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
const uint shuffle_wi = x_block_idx % SIMD;
const uint shuffle_idx = x_block_idx / SIMD;
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
shuffle_wi));
dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
@@ -401,17 +381,12 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
get_sub_group_local_id();
if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
unroll_for(uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
if (get_sub_group_id() == wg) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < wg; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint ofb = 0; ofb < wg; ++ofb) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
@@ -422,24 +397,17 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
}
}
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
dotProd[0][od][oh][ow] = dotProd[wg][od][oh][ow];
}
}
}
__attribute__((opencl_unroll_hint))
for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
const uint partial_acc_ptr_idx =
((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * OFM_SIZE_PER_SIMD : 0) +
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
@@ -454,14 +422,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
}
} else {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
@@ -480,14 +444,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
return;
partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD + get_sub_group_local_id();
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
const uint partial_acc_ptr_idx =
wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
@@ -510,29 +470,23 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#if BIAS_TERM
BIAS_TYPE bias[OFM_VALUES_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
bias[ofb] = biases[out_f + ofb * SIMD];
}
#endif
#ifdef COMPENSATION_TERM
COMPENSATION_TYPE comp[OFM_VALUES_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
comp[ofb] = compensation[out_f + ofb * SIMD];
}
#endif
ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
dequantized[ofb][od][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][od][oh][ow]);
#if BIAS_TERM
dequantized[ofb][od][oh][ow] += bias[ofb];
@@ -546,17 +500,13 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
FUSED_OPS_PRELOAD_SCALAR;
#endif
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
ACTIVATION_TYPE dequantized_val = dequantized[ofb][od][oh][ow];
#if HAS_FUSED_OPS
# if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
@@ -585,21 +535,17 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
if (good_of_block) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
if (good_z) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
if (good_y) {
uint ow = 0;
#if OUTPUT_TYPE_SIZE == 1
__attribute__((opencl_unroll_hint))
for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
unroll_for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 8; ++i) {
unroll_for (uint i = 0; i < 8; ++i) {
result_val[i] = result[ofb][od][oh][ow + i];
}
DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
@@ -607,11 +553,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
#endif
#if OUTPUT_TYPE_SIZE <= 2
__attribute__((opencl_unroll_hint))
for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
unroll_for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 4; ++i) {
unroll_for (uint i = 0; i < 4; ++i) {
result_val[i] = result[ofb][od][oh][ow + i];
}
DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
@@ -619,11 +563,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
#endif
__attribute__((opencl_unroll_hint))
for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
unroll_for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 2; ++i) {
unroll_for (uint i = 0; i < 2; ++i) {
result_val[i] = result[ofb][od][oh][ow + i];
}
DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
@@ -655,12 +597,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#else
const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_z, out_y, out_x);
#endif
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
if (good_z) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
if (good_y) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
@@ -720,9 +660,6 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#undef AS_FILTER_TYPE_4
#undef CEIL_DIV
#undef ALIGN
#undef SIMD
#undef FSV
#undef OFM_VALUES_PER_WI

View File

@@ -2,13 +2,13 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/sub_group.cl"
#if FP16_UNIT_USED
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
{ \
const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \
@@ -29,9 +29,6 @@
_result = fma( _blockB.s7, acol7, _result ); \
}
#else
// Block read - currently block is 4 bytes aligned.
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
{ \
const float16 acol0 = TRANSPOSE_BLOCK_16( _blockA.s0 ); \
@@ -53,7 +50,11 @@
}
#endif
__attribute__((intel_reqd_sub_group_size(16)))
#ifndef ACCUMULATOR_TYPE
#define ACCUMULATOR_TYPE INPUT0_TYPE
#endif
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_bfyx_1x1)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -63,14 +64,15 @@ KERNEL(convolution_bfyx_1x1)(
#endif
uint split_idx)
{
const uint xy = (uint)get_group_id(0) * 16 + get_sub_group_local_id();
const uint group_xy = (uint)get_group_id(0) * 16;
const uint xy = group_xy + get_sub_group_local_id();
const uint x = xy % OUTPUT_SIZE_X;
const uint y = xy / OUTPUT_SIZE_X;
const uint f = (uint)get_group_id(1) * 16 + get_sub_group_local_id();//get_global_id(1);
const uint b = (uint)get_global_id(2);
const uint group_f = (uint)get_group_id(1) * 16;
MAKE_VECTOR_TYPE(UNIT_TYPE, 16) blockC00 = UNIT_VAL_ZERO;
MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 16) blockC00 = INPUT0_VAL_ZERO;
#if BIAS_TERM
#if BIAS_PER_OUTPUT
@@ -80,7 +82,7 @@ KERNEL(convolution_bfyx_1x1)(
#endif
for(uint i = 0; i < 16; i++)
{
blockC00[i] = intel_sub_group_shuffle(biases[bias_index], i);
blockC00[i] = _sub_group_shuffle(biases[bias_index], i);
}
#endif
@@ -92,18 +94,18 @@ KERNEL(convolution_bfyx_1x1)(
const uint filter_offset = group_f * ((FILTER_OFM_PITCH + 8 - 1) / 8) * 8;//f*FILTER_OFM_PITCH;
const uint xy_block_num = (INPUT0_FEATURE_PITCH + 16 - 1) / 16;
const uint f_block_num = (INPUT0_FEATURE_NUM + 8 - 1) / 8;
const uint input_offset = in_split_offset + xy * 8 + b * xy_block_num * f_block_num * 128;//b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
const uint input_offset = in_split_offset + group_xy * 8 + b * xy_block_num * f_block_num * 128;//b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
for (uint k = 0; k < (FILTER_IFM_NUM + 8 - 1) / 8; ++k)
{
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00;
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00;
MAKE_VECTOR_TYPE(INPUT0_TYPE, 8) blockA00;
MAKE_VECTOR_TYPE(FILTER_TYPE, 8) blockB00;
uint input_idx = input_offset + k * 8 * xy_block_num * 16;
uint filter_idx = filter_offset + k * 8 * 16;
blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
blockB00 = ALIGNED_BLOCK_READ8(weights, filter_idx);
blockA00 = DT_INPUT_BLOCK_READ8(input, input_idx);
blockB00 = DT_FILTER_BLOCK_READ8(weights, filter_idx);
MULTIPLY_BLOCKS_16x8_8x16(blockC00, blockB00, blockA00);
}
@@ -128,3 +130,4 @@ KERNEL(convolution_bfyx_1x1)(
#undef CONCAT_TOKEN
#undef CONCAT_TOKEN_HANDLER1
#undef MULTIPLY_BLOCKS_16x16
#undef ACCUMULATOR_TYPE

View File

@@ -2,8 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/gemm_common.cl"
#define MULT(C_, A_, i_) \
@@ -13,7 +12,7 @@
DOT8i(C_, B24, A_, i_ + 3);
__attribute__((reqd_work_group_size(16, TY, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_bfyx_1x1_hgemm_buf_16x1)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -56,14 +55,14 @@ KERNEL(convolution_gpu_bfyx_1x1_hgemm_buf_16x1)(
// 512 MADs
half8 B0 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
half8 B0 = as_half8(_sub_group_block_read_us8(weights, coordB));
coordB.y += 8;
half8 B8 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
half8 B8 = as_half8(_sub_group_block_read_us8(weights, coordB));
coordB.y += 8;
half8 B16 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
half8 B16 = as_half8(_sub_group_block_read_us8(weights, coordB));
coordB.y += 8;
half8 B24 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
half8 B24 = as_half8(_sub_group_block_read_us8(weights, coordB));
coordB.y += 8;
half8 A0 = A_load[K8*0 + k8];

View File

@@ -2,11 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#define SIMD_SIZE 8
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
REQD_SUB_GROUP_SIZE(SIMD_SIZE)
KERNEL(convolution)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -58,18 +57,18 @@ KERNEL(convolution)(
}
#if OUT_BLOCK_DEPTH == 8
float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
float8 w = as_float8(_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
#elif OUT_BLOCK_DEPTH == 4
float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
float4 w = as_float4(_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
#elif OUT_BLOCK_DEPTH == 2
float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
float2 w = as_float2(_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
#endif
for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
{
for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
{
float _in = intel_sub_group_shuffle(in[br], bc);
float _in = _sub_group_shuffle(in[br], bc);
for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
{
dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd];

View File

@@ -2,18 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#if FP16_UNIT_USED
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_half(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (byte_offset), as_ushort8(val))
#else
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_float(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint8(val))
#endif
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
__attribute__((reqd_work_group_size(16, 1, 1)))
KERNEL(convolution_depthwise_weights_lwg)(
__global INPUT0_TYPE* input,
@@ -41,7 +34,7 @@ KERNEL(convolution_depthwise_weights_lwg)(
const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_group_offset;
#if FILTER_SIZE_Y * FILTER_SIZE_X % 16 == 0 && !FP16_UNIT_USED
UNIT_TYPE w = ALIGNED_BLOCK_READ(weights, filter_offset);
UNIT_TYPE w = DT_FILTER_BLOCK_READ(weights, filter_offset);
#elif FILTER_SIZE_X * FILTER_SIZE_Y > 16 && FILTER_SIZE_X * FILTER_SIZE_Y <= 25
const uint lid = get_local_id(0);
UNIT_TYPE w[2] = { UNIT_VAL_ZERO };
@@ -78,9 +71,9 @@ KERNEL(convolution_depthwise_weights_lwg)(
#if FILTER_SIZE_X * FILTER_SIZE_Y > 16 && FILTER_SIZE_X * FILTER_SIZE_Y <= 25
const uint id = (j*FILTER_Y_PITCH + i*FILTER_X_PITCH) / 16;
const uint idx = (j*FILTER_Y_PITCH + i*FILTER_X_PITCH) % 16;
UNIT_TYPE w1 = intel_sub_group_shuffle(w[id], idx);
UNIT_TYPE w1 = _sub_group_shuffle(w[id], idx);
#else
UNIT_TYPE w1 = intel_sub_group_shuffle(w, j*FILTER_Y_PITCH + i*FILTER_X_PITCH);
UNIT_TYPE w1 = _sub_group_shuffle(w, j*FILTER_Y_PITCH + i*FILTER_X_PITCH);
#endif
dotProd = mad(input[input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH],
w1, dotProd);

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
//////////////////////////////////////////////////////////////////////////////
@@ -16,7 +15,7 @@
#define TILE_X 12 // Width of tile loaded in input (src0)
#define TILE_Y 10 // Height of tile loaded in input (src0)
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_f16_10x12x16)(
const __global half *src0,
__global half *dst,
@@ -100,12 +99,12 @@ KERNEL(convolution_f16_10x12x16)(
unsigned interleaved_y = 0;
LOOP(KERNEL_SLICE_DIV2, interleaved_y,
{
p2BlockB[interleaved_y] = intel_sub_group_block_read_us2( (const __global ushort*)src1_read );
p2BlockB[interleaved_y] = _sub_group_block_read_us2( (const __global ushort*)src1_read );
src1_read += ALIGNED_OFM_PER_GROUP * 2;
} )
if ( kernel_slice_is_odd )
{
pBlockB[FILTER_SIZE_X * FILTER_SIZE_Y - 1] = intel_sub_group_block_read_us( (const __global ushort*)src1_read );
pBlockB[FILTER_SIZE_X * FILTER_SIZE_Y - 1] = _sub_group_block_read_us( (const __global ushort*)src1_read );
src1_read += ALIGNED_OFM_PER_GROUP * 2;
}

View File

@@ -1,159 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
//#include "include/cnn_common.cl"
//////////////////////////////////////////////////////////////////////////////
// Direct Convolution
#if defined(cl_intel_subgroups_short)
#define TILE_M DY // Height of tile in input patches (src0)
#define TILE_K DX // Width of tile in input patches (src0)
#define TILE_N 16 // Num filter channels per tile (src1)
#define TILE_X 8 // Width of tile loaded in input (src0)
#define TILE_Y 8 // Height of tile loaded in input (src0)
// Direct FP16 convolution: each sub-group computes a TILE_M x TILE_K patch of
// outputs for a group of 16 output channels (TILE_N == 16). Compiled only when
// the device exposes cl_intel_subgroups_short (see the enclosing #if), because
// the weight loads use 16-bit sub-group block reads.
//
// NOTE(review): ALIGNED_OFM, INPUT_*_PITCH, OUTPUT_*_PITCH, KERNEL_WIDTH/HEIGHT,
// KERNEL_SLICE_DIV2, STRIDE_X/Y, RIGHT_PARTIAL_TILE_K, LOOP, CAT,
// activation_function and ACTIVATION_PARAMS are build-time macros injected by
// the host compiler options -- they are not defined in this file.
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void convolution_f16_8x8x16(
const __global half *src0,     // input feature maps
__global half *dst,            // output feature maps
const __global half *src1,     // filter weights, partially interleaved (see comment in the loop)
const __global half *biases)   // one bias value per output feature map
{
// NDRange decomposition: dimension 2 enumerates (output channel, batch) pairs.
const unsigned global_x = (uint)get_global_id(0);
const unsigned global_y = (uint)get_global_id(1);
const unsigned global_z = (uint)get_global_id(2);
const unsigned out_fm = global_z % ALIGNED_OFM;    // output feature map index
const unsigned batch_id = global_z / ALIGNED_OFM;  // batch index
const unsigned group_x = get_group_id(0);
const unsigned group_z = get_group_id(2);
const unsigned max_group_x = get_num_groups(0);
const unsigned local_z = get_local_id(2);
// Accumulators: one running sum per output element of the TILE_M x TILE_K patch.
half blockC[TILE_M * TILE_K] = { 0 };
// Offset of this work-item's input tile (batch + y + x), before the feature dim.
uint src0_offset_tile =
batch_id * INPUT_BATCH_PITCH // batch offset
+ ( global_y * TILE_M * STRIDE_Y ) * INPUT_Y_PITCH // y offset
+ ( global_x * TILE_K * STRIDE_X ); // x offset
// Per-lane offset within the tile: each lane fetches a half4 slice, with
// local_z selecting its (row, 4-wide column group) inside the TILE_X-wide tile.
uint src0_offset = src0_offset_tile
+ ( local_z / ( TILE_X / 4 ) ) * INPUT_Y_PITCH // y tile offset
+ ( local_z % ( TILE_X / 4 ) ) * 4; // x tile offset
const __global half *src1_read = src1 + ( group_z * TILE_N % ALIGNED_OFM ) * 2;
unsigned patch_depth = 0;
// Accumulate over all input channels (one iteration per input feature map).
__attribute__((opencl_unroll_hint(3)))
do
{
// Load atile (input) and btile (filters).
// Kernel data is partially interleaved. Every 2 rows are interleaved at float16 granularity.
// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
// interleaved row is padded with zero to ensure same size as interleaved rows. This
// interleaving is done to increase consecutive data to fetch which reduces loads required.
// For example, this is how the kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
// (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
// (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...
// (0, 2) (8, 2) (16, 2) (24, 2) ... ...
// ...
// in case the data is not aligned to sizeof(T)*KERNEL_WIDTH we need to use vload or set the data in a loop
half4 blockA = vload4(0, src0 + src0_offset );
src0_offset += INPUT_FEATURE_PITCH;  // advance to the next input channel
half blockB[KERNEL_WIDTH * KERNEL_HEIGHT];
ushort2* p2BlockB = (ushort2*)blockB;
ushort* pBlockB = (ushort* )blockB;
const bool kernel_slice_is_odd = ( KERNEL_WIDTH * KERNEL_HEIGHT ) % 2 == 1;
unsigned interleaved_y = 0;
// Each block read pulls two interleaved weight rows for this lane's channel.
LOOP(KERNEL_SLICE_DIV2, interleaved_y,
{
p2BlockB[interleaved_y] = intel_sub_group_block_read_us2( (const __global ushort*)src1_read );
src1_read += ALIGNED_OFM * 2;
} )
// Trailing non-interleaved (zero-padded) weight row when the slice count is odd.
if ( kernel_slice_is_odd )
{
pBlockB[KERNEL_WIDTH * KERNEL_HEIGHT - 1] = intel_sub_group_block_read_us( (const __global ushort*)src1_read );
src1_read += ALIGNED_OFM * 2;
}
// Fetch input element n of the tile from the lane that loaded it:
// element n%4 of the half4 held by lane n/4.
#define BLOCK_A(n) sub_group_broadcast( blockA[(n)%4], (n)/4 )
// Perform MADs
// Loop through all patches in tile (patch_x/y)
// For each patch, sum values (x/y)
unsigned patch_y=0;
LOOP(TILE_M, patch_y,
{
unsigned patch_x=0;
LOOP(TILE_K, patch_x,
{
unsigned tile_idx = patch_y * TILE_X * STRIDE_Y + patch_x * STRIDE_X;
unsigned out_idx = patch_y * TILE_K + patch_x;
unsigned y=0;
LOOP(KERNEL_HEIGHT, y,
{
unsigned x=0;
LOOP(KERNEL_WIDTH, x,
{
unsigned offset_idx = y * TILE_X + x;
unsigned out_chan_idx = y * KERNEL_WIDTH + x;
blockC[out_idx] = mad( BLOCK_A( tile_idx + offset_idx ), blockB[out_chan_idx], blockC[out_idx] );
} )
} )
} )
} )
}
while ( ++patch_depth < INPUT_FEATURE_NUM );
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
// TILE_K x TILE_M x SIMD. Partial writes most likely generated if output padding used.
// Group stores into vectors to expedite writeback. One large write is faster than many
// small saves. Right-most column may be smaller if output width not divisible by tile width.
__global half *out = dst
+ batch_id * OUTPUT_BATCH_PITCH // batch offset
+ out_fm * OUTPUT_FEATURE_PITCH // channel offset
+ ( global_y * TILE_M ) * OUTPUT_Y_PITCH // y offset
+ ( global_x * TILE_K ); // x offset
// Skip lanes that map to padding channels / batches of the aligned NDRange.
if ( batch_id < OUTPUT_BATCH_NUM && out_fm < OUTPUT_FEATURE_NUM )
{
half bias = biases[out_fm];
// Full-width tiles: one TILE_K-wide vector store per output row.
if ( OUTPUT_SIZE_X % TILE_K == 0 ||
group_x < max_group_x - 1 )
{
typedef CAT( half, TILE_K ) half_t;
half bias = biases[out_fm];  // NOTE(review): shadows the identical outer 'bias'; redundant but harmless
for( unsigned y = 0; y < TILE_M; y++ )
{
if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
{
half_t vBlockC;
half *pvBlockC = (half*)&vBlockC;
for (unsigned i = 0; i < TILE_K; i++) pvBlockC[i] = activation_function(blockC[y * TILE_K + i] + bias, ACTIVATION_PARAMS);
*(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
}
}
}
else
{
// Right-most partial tile: store only RIGHT_PARTIAL_TILE_K elements per row.
typedef CAT( half, RIGHT_PARTIAL_TILE_K ) half_t;
for( unsigned y = 0; y < TILE_M; y++ )
{
if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
{
half_t vBlockC;
half *pvBlockC = (half*)&vBlockC;
for (unsigned i = 0; i < RIGHT_PARTIAL_TILE_K; i++) pvBlockC[i] = activation_function(blockC[y * TILE_K + i] + bias, ACTIVATION_PARAMS);
*(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
}
}
}
}
}
#endif // cl_intel_subgroups_short

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define INPUT_TYPE INPUT0_TYPE
@@ -19,59 +21,19 @@
#define AS_FILTER_TYPE8 CAT(as_, FILTER_TYPE8)
#if INPUT0_TYPE_SIZE == 2
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read_us((__global ushort*)(ptr) + (offset)))
# define INPUT_BLOCK_READ2(ptr, offset) AS_INPUT_TYPE2(intel_sub_group_block_read_us2((__global ushort*)(ptr) + (offset)))
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read_us4((__global ushort*)(ptr) + (offset)))
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read_us8((__global ushort*)(ptr) + (offset)))
#elif INPUT0_TYPE_SIZE == 4
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read((__global uint*)(ptr) + (offset)))
# define INPUT_BLOCK_READ2(ptr, offset) AS_INPUT_TYPE2(intel_sub_group_block_read2((__global uint*)(ptr) + (offset)))
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read4((__global uint*)(ptr) + (offset)))
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16.cl: unsupported input type
#endif
#if FILTER_TYPE_SIZE == 2
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read_us8((__global ushort*)(ptr) + (offset)))
#elif FILTER_TYPE_SIZE == 4
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16.cl: unsupported filter type
#endif
#if OUTPUT_FORMAT_BFYX
# define OUTPUTVTYPE(n) CAT(OUTPUT_TYPE, n)
# define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE(OUTPUT_X_BLOCK_SIZE))
# define VSTORE CAT(vstore, OUTPUT_X_BLOCK_SIZE)
#else
# if OUTPUT_TYPE_SIZE == 1
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val))
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITE_UC_2((__global uchar*)(ptr) + (offset), as_uchar2(val))
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr) + (offset), as_uchar4(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val))
# elif OUTPUT_TYPE_SIZE == 2
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val))
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write_us2((__global ushort*)(ptr) + (offset), as_ushort2(val))
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write_us4((__global ushort*)(ptr) + (offset), as_ushort4(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val))
# elif OUTPUT_TYPE_SIZE == 4
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val))
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write2((__global uint*)(ptr) + (offset), as_uint2(val))
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
# else
# error convolution_gpu_bfyx_f16.cl: unsupported output type
# endif
#endif // OUTPUT_FORMAT_BFYX
#if INPUT0_TYPE_SIZE == 2
# define AS_INPUT_SRC CAT(as_, MAKE_VECTOR_TYPE(INPUT_TYPE, OUTPUT_X_BLOCK_SIZE))
# define AS_US_SRC CAT(as_, MAKE_VECTOR_TYPE(ushort, OUTPUT_X_BLOCK_SIZE))
# define GET_SRC(data, id) AS_INPUT_SRC(intel_sub_group_shuffle(AS_US_SRC(data), id))
# define GET_SRC(data, id) AS_INPUT_SRC(_sub_group_shuffle(AS_US_SRC(data), id))
#else
# define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
# define GET_SRC(data, id) _sub_group_shuffle(data, id)
#endif
#define FEATURE_SLICE_SIZE 16
@@ -79,7 +41,7 @@
#define FILTER_OFM_NUM_ALIGNED (((FILTER_OFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
#define FILTER_IFM_NUM_ALIGNED (((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
KERNEL(convolution_bfyx_f16)(
__global INPUT0_TYPE* input,
@@ -169,12 +131,12 @@ KERNEL(convolution_bfyx_f16)(
#if BIAS_TERM
#if SLM_DIV_FACTOR == 1
vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
vec_t dst = (vec_t)(DT_INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
#else
vec_t dst;
if (feature_sub_block == 0) {
dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
dst = (vec_t)(DT_INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
} else {
dst = INPUT0_VAL_ZERO;
}
@@ -240,7 +202,7 @@ KERNEL(convolution_bfyx_f16)(
{
int xb = 0;
for (; xb + 8 <= INPUT_LINE_SIZE; xb += 8) {
INPUT_TYPE8 vv = INPUT_BLOCK_READ8(input, grouped_input_offset +
INPUT_TYPE8 vv = DT_INPUT_BLOCK_READ8(input, grouped_input_offset +
icb * input_fs_pitch +
kh * DILATION_SIZE_Y * input_y_pitch +
xb * input_x_pitch);
@@ -255,7 +217,7 @@ KERNEL(convolution_bfyx_f16)(
line_cache[xb + 7] = vv[7];
}
for (; xb + 4 <= INPUT_LINE_SIZE; xb += 4) {
INPUT_TYPE4 vv = INPUT_BLOCK_READ4(input, grouped_input_offset +
INPUT_TYPE4 vv = DT_INPUT_BLOCK_READ4(input, grouped_input_offset +
icb * input_fs_pitch +
kh * DILATION_SIZE_Y * input_y_pitch +
xb * input_x_pitch);
@@ -266,7 +228,7 @@ KERNEL(convolution_bfyx_f16)(
line_cache[xb + 3] = vv[3];
}
for (; xb < INPUT_LINE_SIZE; xb++) {
line_cache[xb] = INPUT_BLOCK_READ(input, grouped_input_offset +
line_cache[xb] = DT_INPUT_BLOCK_READ(input, grouped_input_offset +
icb * input_fs_pitch +
kh * DILATION_SIZE_Y * input_y_pitch +
xb * input_x_pitch);
@@ -333,11 +295,11 @@ KERNEL(convolution_bfyx_f16)(
# error convolution_gpu_bfyx_f16.cl: unsupported input feature size for multiple groups input preload
#endif // FILTER_IFM_NUM
#else
FILTER_TYPE8 wei0 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
FILTER_TYPE8 wei0 = DT_FILTER_BLOCK_READ8(weights, grouped_filter_offset +
icb * filter_is_pitch +
kh * filter_y_pitch +
kw * filter_x_pitch);
FILTER_TYPE8 wei1 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
FILTER_TYPE8 wei1 = DT_FILTER_BLOCK_READ8(weights, grouped_filter_offset +
icb * filter_is_pitch +
kh * filter_y_pitch +
kw * filter_x_pitch +
@@ -388,8 +350,7 @@ KERNEL(convolution_bfyx_f16)(
barrier(CLK_LOCAL_MEM_FENCE);
if (feature_sub_block == 0) {
__attribute__((opencl_unroll_hint))
for (int i = 1; i < SLM_DIV_FACTOR; i++)
unroll_for(int i = 1; i < SLM_DIV_FACTOR; i++)
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
#endif // SLM_DIV_FACTOR > 1
@@ -453,13 +414,13 @@ KERNEL(convolution_bfyx_f16)(
#endif
#else
#if OUTPUT_X_BLOCK_SIZE == 8
OUTPUT_BLOCK_WRITE8(output, output_offset, res);
DT_OUTPUT_BLOCK_WRITE8(output, output_offset, res);
#elif OUTPUT_X_BLOCK_SIZE == 4
OUTPUT_BLOCK_WRITE4(output, output_offset, res);
DT_OUTPUT_BLOCK_WRITE4(output, output_offset, res);
#elif OUTPUT_X_BLOCK_SIZE == 2
OUTPUT_BLOCK_WRITE2(output, output_offset, res);
DT_OUTPUT_BLOCK_WRITE2(output, output_offset, res);
#elif OUTPUT_X_BLOCK_SIZE == 1
OUTPUT_BLOCK_WRITE(output, output_offset, res);
DT_OUTPUT_BLOCK_WRITE(output, output_offset, res);
#else
# error convolution_gpu_bfyx_f16.cl: unsupported output x block size
#endif
@@ -480,7 +441,7 @@ KERNEL(convolution_bfyx_f16)(
#if OUTPUT_FORMAT_BFYX
output[output_offset + i] = res[i];
#else
OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]);
DT_OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]);
#endif
}
}
@@ -511,20 +472,8 @@ KERNEL(convolution_bfyx_f16)(
#undef AS_FILTER_TYPE8
#undef INPUT_BLOCK_READ
#undef INPUT_BLOCK_READ2
#undef INPUT_BLOCK_READ4
#undef INPUT_BLOCK_READ8
#undef FILTER_BLOCK_READ8
#if OUTPUT_FORMAT_BFYX
# undef OUTPUTVTYPE
# undef TO_OUTPUTVTYPE
# undef VSTORE
#else
# undef OUTPUT_BLOCK_WRITE
# undef OUTPUT_BLOCK_WRITE2
# undef OUTPUT_BLOCK_WRITE4
# undef OUTPUT_BLOCK_WRITE8
#endif // OUTPUT_FORMAT_BFYX

View File

@@ -2,17 +2,18 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/common.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
#if X_BLOCK_SIZE > 1
# define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE), \
intel_sub_group_shuffle( \
_sub_group_shuffle( \
AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data), \
id))
#else
# define GET_SRC(data, id) AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
# define GET_SRC(data, id) AS_TYPE(UNIT_TYPE, _sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
#endif
#define FEATURE_SLICE_SIZE 16
@@ -22,7 +23,7 @@
# define UNIT_BLOCK_WRITE_VEC(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, X_BLOCK_SIZE)(ptr, offset, val)
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
KERNEL(convolution_b_fs_yx_fsv16_1x1)(
__global INPUT0_TYPE* input,
@@ -211,8 +212,7 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
barrier(CLK_LOCAL_MEM_FENCE);
if (feature_sub_block == 0) {
__attribute__((opencl_unroll_hint))
for (int i = 1; i < SLM_DIV_FACTOR; i++)
unroll_for(int i = 1; i < SLM_DIV_FACTOR; i++)
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
#endif // SLM_DIV_FACTOR > 1

View File

@@ -3,9 +3,9 @@
//
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/data_types.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#define FEATURE_SLICE_SIZE 16
@@ -22,42 +22,9 @@
#define AS_FILTER_TYPE2 CAT(as_, FILTER_TYPE2)
#define TO_OUTPUT_TYPE8 CAT(convert_, OUTPUT_TYPE8)
#if INPUT0_TYPE_SIZE == 2
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read_us((__global ushort*)(ptr) + (offset)))
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read_us8((__global ushort*)(ptr) + (offset)))
#elif INPUT0_TYPE_SIZE == 4
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read((__global uint*)(ptr) + (offset)))
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16_depthwise.cl - unsupported input type.
#endif
#if FILTER_TYPE_SIZE == 2
# define FILTER_BLOCK_READ(ptr, offset) AS_FILTER_TYPE(intel_sub_group_block_read_us((__global ushort*)(ptr) + (offset)))
# define FILTER_BLOCK_READ2(ptr, offset) AS_FILTER_TYPE2(intel_sub_group_block_read_us2((__global ushort*)(ptr) + (offset)))
#elif FILTER_TYPE_SIZE == 4
# define FILTER_BLOCK_READ(ptr, offset) AS_FILTER_TYPE(intel_sub_group_block_read((__global uint*)(ptr) + (offset)))
# define FILTER_BLOCK_READ2(ptr, offset) AS_FILTER_TYPE2(intel_sub_group_block_read2((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16_depthwise.cl - unsupported filter type.
#endif
#if OUTPUT_TYPE_SIZE == 1
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val))
#elif OUTPUT_TYPE_SIZE == 2
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val))
#elif OUTPUT_TYPE_SIZE == 4
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
#else
# error convolution_gpu_bfyx_f16_depthwise.cl - unsupported output type.
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
KERNEL(convolution_depthwise)(
KERNEL(convolution_gpu_bfyx_f16_depthwise)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
@@ -96,32 +63,32 @@ KERNEL(convolution_depthwise)(
(f_block + input_fs_pad_before) * input_fs_pitch;
#if BIAS_TERM
INPUT_TYPE8 dst = (INPUT_TYPE8)(INPUT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
INPUT_TYPE8 dst = (INPUT_TYPE8)(DT_INPUT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
#else
INPUT_TYPE8 dst = (INPUT_TYPE8)(INPUT0_VAL_ZERO);
#endif
#if ((FILTER_SIZE_X == 3) && (FILTER_SIZE_Y == 3) && (STRIDE_SIZE_X == 1) && (DILATION_SIZE_X == 1) && (DILATION_SIZE_Y == 1))
FILTER_TYPE wei_00 = FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_01 = FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_02 = FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_10 = FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_11 = FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_12 = FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_20 = FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_21 = FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_22 = FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_00 = DT_FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_01 = DT_FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_02 = DT_FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_10 = DT_FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_11 = DT_FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_12 = DT_FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_20 = DT_FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_21 = DT_FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_22 = DT_FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
INPUT_TYPE8 src_block_0 = INPUT_BLOCK_READ8(input, input_offset + (input_y + 0) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE8 src_block_1 = INPUT_BLOCK_READ8(input, input_offset + (input_y + 1) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE8 src_block_2 = INPUT_BLOCK_READ8(input, input_offset + (input_y + 2) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE src_tail_00 = INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_01 = INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE src_tail_10 = INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_11 = INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE src_tail_20 = INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_21 = INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE8 src_block_0 = DT_INPUT_BLOCK_READ8(input, input_offset + (input_y + 0) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE8 src_block_1 = DT_INPUT_BLOCK_READ8(input, input_offset + (input_y + 1) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE8 src_block_2 = DT_INPUT_BLOCK_READ8(input, input_offset + (input_y + 2) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE src_tail_00 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_01 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE src_tail_10 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_11 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE src_tail_20 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_21 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 9) * input_x_pitch);
#if X_BLOCK_SIZE == 8
for (uint i = 0; i < X_BLOCK_SIZE - 2; i++)
@@ -185,12 +152,12 @@ KERNEL(convolution_depthwise)(
unroll_for (uint i = 0; i < FILTER_SIZE_Y; i++) {
unroll_for (uint j = 0; j < FILTER_SIZE_X_DIV_2; j++) {
wei_temp = FILTER_BLOCK_READ2(weights, filter_offset + i * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + j * 2 * FEATURE_SLICE_SIZE);
wei_temp = DT_FILTER_BLOCK_READ2(weights, filter_offset + i * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + j * 2 * FEATURE_SLICE_SIZE);
wei[i * FILTER_SIZE_X + j * 2] = wei_temp.s0;
wei[i * FILTER_SIZE_X + j * 2 + 1] = wei_temp.s1;
}
#if (FILTER_SIZE_X % 2)
wei[i * FILTER_SIZE_X + FILTER_SIZE_X - 1] = FILTER_BLOCK_READ(weights, filter_offset +
wei[i * FILTER_SIZE_X + FILTER_SIZE_X - 1] = DT_FILTER_BLOCK_READ(weights, filter_offset +
i * FILTER_Y_PITCH * FEATURE_SLICE_SIZE +
(FILTER_SIZE_X - 1) * FEATURE_SLICE_SIZE);
#endif // (FILTER_SIZE_X % 2)
@@ -201,7 +168,7 @@ KERNEL(convolution_depthwise)(
unroll_for (uint k = 0; k < X_BLOCK_SIZE; k++) {
unroll_for (uint i = 0; i < FILTER_SIZE_Y; i++) {
unroll_for (uint j = 0; j < FILTER_SIZE_X; j++) {
src[k * FILTER_SIZE_Y * FILTER_SIZE_X + i * FILTER_SIZE_X + j] = INPUT_BLOCK_READ(input, input_offset +
src[k * FILTER_SIZE_Y * FILTER_SIZE_X + i * FILTER_SIZE_X + j] = DT_INPUT_BLOCK_READ(input, input_offset +
(input_y + (i * DILATION_SIZE_Y)) * input_y_pitch +
(input_x + (j * DILATION_SIZE_X) + k * STRIDE_SIZE_X) * input_x_pitch);
}
@@ -260,7 +227,7 @@ KERNEL(convolution_depthwise)(
#else
res = TO_OUTPUT_TYPE8(dst);
#endif // HAS_FUSED_OPS
OUTPUT_BLOCK_WRITE8(output, output_offset + x * output_x_pitch, res);
DT_OUTPUT_BLOCK_WRITE8(output, output_offset + x * output_x_pitch, res);
}
else
{
@@ -271,7 +238,7 @@ KERNEL(convolution_depthwise)(
#else
res[i] = TO_OUTPUT_TYPE(dst[i]);
#endif // HAS_FUSED_OPS
OUTPUT_BLOCK_WRITE(output, output_offset + (x + i) * output_x_pitch, res[i]);
DT_OUTPUT_BLOCK_WRITE(output, output_offset + (x + i) * output_x_pitch, res[i]);
}
}
}
@@ -300,13 +267,11 @@ KERNEL(convolution_depthwise)(
#else
res = TO_OUTPUT_TYPE(dst[0]);
#endif // HAS_FUSED_OPS
OUTPUT_BLOCK_WRITE(output, output_offset + x * output_x_pitch, res);
DT_OUTPUT_BLOCK_WRITE(output, output_offset + x * output_x_pitch, res);
}
#endif
}
#undef unroll_for
#undef FEATURE_SLICE_SIZE
#undef X_BLOCK_SIZE
@@ -322,12 +287,3 @@ KERNEL(convolution_depthwise)(
#undef AS_FILTER_TYPE2
#undef TO_OUTPUT_TYPE8
#undef INPUT_BLOCK_READ
#undef INPUT_BLOCK_READ8
#undef FILTER_BLOCK_READ
#undef FILTER_BLOCK_READ2
#undef OUTPUT_BLOCK_WRITE
#undef OUTPUT_BLOCK_WRITE8

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#if defined(cl_intel_subgroups_short)
@@ -10,7 +9,7 @@
#define TILE_K FILTER_SIZE_X
#define TILE_N 32
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_f16)(
const __global half *src0,
__global half *dst,
@@ -207,12 +206,12 @@ KERNEL(convolution_f16)(
interleaved_y = 0;
LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
{
p4BlockB00[interleaved_y] = intel_sub_group_block_read_us4( (const __global ushort*)src1 + src1_read_offset );
p4BlockB00[interleaved_y] = _sub_group_block_read_us4( (const __global ushort*)src1 + src1_read_offset );
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
} )
if ( kernel_width_is_odd )
{
p2BlockB00[FILTER_SIZE_X - 1] = intel_sub_group_block_read_us2( (const __global ushort*)src1 + src1_read_offset );
p2BlockB00[FILTER_SIZE_X - 1] = _sub_group_block_read_us2( (const __global ushort*)src1 + src1_read_offset );
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
}

View File

@@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
@@ -10,7 +10,7 @@
#define TILE_K FILTER_SIZE_X
#define TILE_N 32
__attribute__((intel_reqd_sub_group_size(8)))
REQD_SUB_GROUP_SIZE(8)
KERNEL(convolution_f32)(
const __global float *src0,
__global float *dst,
@@ -149,12 +149,12 @@ KERNEL(convolution_f32)(
interleaved_y = 0;
LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
{
p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) );
p8BlockB00[interleaved_y] = as_float8( _sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) );
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
} )
if ( kernel_width_is_odd )
{
p4BlockB00[FILTER_SIZE_X - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) );
p4BlockB00[FILTER_SIZE_X - 1] = as_float4( _sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) );
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
}

View File

@@ -2,10 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_bfyx_iyxo_5x5)(
const __global UNIT_TYPE* input,

View File

@@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
@@ -53,7 +53,7 @@ if (_kernel_data.leftovers)
#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM)
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
const __global UNIT_TYPE* input,
@@ -173,10 +173,10 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
#if IN_BLOCK_WIDTH != SUB_GROUP_SIZE
//if we fix the programming model, then we could use a nice simple 2d array: val = in[br * STRIDE_SIZE_Y + kr][bc * STRIDE_SIZE_X + kc];
UNIT_TYPE val = intel_sub_group_shuffle( in[(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) / SUB_GROUP_SIZE],
UNIT_TYPE val = _sub_group_shuffle( in[(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) / SUB_GROUP_SIZE],
(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) % SUB_GROUP_SIZE);
#else
UNIT_TYPE val = intel_sub_group_shuffle( in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
UNIT_TYPE val = _sub_group_shuffle( in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
#endif
out[br * OUTPUT_BLOCK_WIDTH + bc] = mad(w[wi % PREFETCH], val, out[br * OUTPUT_BLOCK_WIDTH + bc]);

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define FEATURE_SLICE_SIZE 16
@@ -17,9 +19,9 @@
#error "convolution_gpu_bfyx_to_bfyx_f16: Filter and bias has different data type."
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
KERNEL(convolution_bfyx_to_bfyx_f16)(
KERNEL(convolution_gpu_bfyx_to_bfyx_f16)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
@@ -134,7 +136,7 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
INPUT0_TYPE src[INPUT0_FEATURE_NUM];
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++) {
src[ic] = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
src[ic] = _sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
dst[i] = mad(wei[ic], src[ic], dst[i]);
}
}

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
@@ -10,7 +12,7 @@
#define FEATURE_SLICE_SIZE 16
#define INPUT_FEATURE_NUM 3
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
KERNEL(convolution_gpu_bfyx_to_bs_fs_yx_bsv16_fsv16)(
__global INPUT0_TYPE* input,
@@ -142,7 +144,7 @@ KERNEL(convolution_gpu_bfyx_to_bs_fs_yx_bsv16_fsv16)(
__attribute__((opencl_unroll_hint(INPUT_FEATURE_NUM)))
for (int ic = 0; ic < INPUT_FEATURE_NUM; ic++) {
UNIT_TYPE src = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
UNIT_TYPE src = _sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
dst[i] = mad(w[ic], src, dst[i]);
}
}

View File

@@ -2,12 +2,12 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/unit_type.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
@@ -58,7 +58,7 @@
#define ALIGNED_IFM_NUM (((FILTER_IFM_NUM + FSV - 1) / FSV) * FSV)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
__global UNIT_TYPE* input,
@@ -164,7 +164,7 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
// With simd along x dimension:
// (out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) / SUB_GROUP_SIZE - element number in simd-lane;
// (out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) % SUB_GROUP_SIZE - simd-lane with that element.
UNIT_TYPE in_val = intel_sub_group_shuffle(
UNIT_TYPE in_val = _sub_group_shuffle(
in[(out_y * STRIDE_SIZE_Y + f_y * DILATION_SIZE_Y) * INPUT_BLOCK_WIDTH_EL_CNT +
(out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) / SUB_GROUP_SIZE],
(out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) % SUB_GROUP_SIZE);
@@ -299,8 +299,6 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
// ========================================================================
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING

View File

@@ -2,11 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
@@ -41,7 +41,7 @@
// ======================================================================================
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_fs_byx_fsv32)(
__global INPUT0_TYPE* input,
@@ -128,7 +128,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
{
unroll_for (uint out_f = 0; out_f < FSV_PER_THREAD; ++out_f)
{
INPUT0_TYPE in_val = intel_sub_group_shuffle(
INPUT0_TYPE in_val = _sub_group_shuffle(
in[(out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) * FSV_PER_THREAD + ifii / SUB_GROUP_SIZE],
ifii % SUB_GROUP_SIZE);
@@ -242,8 +242,6 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
// ========================================================================
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING
#undef INPUT0_SIZE_B_WITH_PADDING

View File

@@ -3,11 +3,11 @@
//
#include "include/unit_type.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
@@ -33,7 +33,7 @@
// OUTPUT_BLOCK_HEIGHT - [int] number of elements calculated in y dimension by one thread
// ======================================================================================
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
__global UNIT_TYPE* input,
@@ -109,7 +109,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
{
unroll_for (uint out_f = 0; out_f < FSV_PER_THREAD; ++out_f)
{
UNIT_TYPE in_val = intel_sub_group_shuffle(input_read[in_f / SUB_GROUP_SIZE], in_f % SUB_GROUP_SIZE);
UNIT_TYPE in_val = _sub_group_shuffle(input_read[in_f / SUB_GROUP_SIZE], in_f % SUB_GROUP_SIZE);
const uint out_idx = out_y * OUTPUT_BLOCK_WIDTH * FSV_PER_THREAD + out_x * FSV_PER_THREAD + out_f;
out[out_idx] = mad(w[in_f * FSV_PER_THREAD + out_f], in_val, out[out_idx]);
@@ -236,8 +236,6 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
// ========================================================================
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING
#undef INPUT0_SIZE_B_WITH_PADDING

View File

@@ -2,12 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/unit_type.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
@@ -32,9 +31,9 @@
// ======================================================================================
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_fs_byx_fsv32)(
KERNEL(convolution_gpu_fs_byx_fsv32_depthwise)(
__global UNIT_TYPE* input,
__global UNIT_TYPE* output,
__global UNIT_TYPE* weights,
@@ -226,8 +225,6 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
// ========================================================================
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING
#undef INPUT0_SIZE_B_WITH_PADDING

View File

@@ -3,7 +3,8 @@
//
#include "include/batch_headers/fetch_data.cl"
#include "include/imad.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/imad.cl"
#if QUANTIZATION_TERM
# define ACCUMULATOR_TYPE int
# define TO_ACCUMULATOR_TYPE(x) convert_int(x)
@@ -40,9 +41,6 @@
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define ALIGN(a, b) ((a % b == 0) ? a : a - a % b + b)
#if INPUT0_PAD_BEFORE_SIZE_X != 0 || INPUT0_PAD_BEFORE_SIZE_Y != 0
#define NON_ZERO_INPUT0_PAD_BEFORE
#endif
@@ -68,7 +66,7 @@
// int8 conv_input and weights data is packed to int32 "batches",
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
REQD_SUB_GROUP_SIZE(SIMD_SIZE)
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
KERNEL (fused_convolution_eltwise_gpu_imad)(
#if INPUT0_LAYOUT_B_FS_YX_FSV16
@@ -134,8 +132,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
int weights_zp_vec_partial;
weights_zp_vec_partial = weights_zp_val;
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial;
__attribute__((opencl_unroll_hint))
for (uint in_f = FILTER_IFM_NUM % PACK; in_f < PACK; in_f++) {
unroll_for (uint in_f = FILTER_IFM_NUM % PACK; in_f < PACK; in_f++) {
wzp_p[in_f] = 0;
}
#endif
@@ -237,7 +234,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
#endif
#else
#ifdef BLOCK_LOAD_INPUTS
in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*) &conv_input[in_addr]));
in[reg] = AS_PACKED_TYPE(_sub_group_block_read((const __global uint*) &conv_input[in_addr]));
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding)
in[reg] = data_zp_val;
@@ -255,8 +252,8 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
}
#ifdef BLOCK_LOAD_WEIGHTS
*((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
w[8] = as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
*((int8*)&w[0]) = as_int8(_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
w[8] = as_int(_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
weight_addr += SIMD_SIZE*NUM_FILTERS;
#else
for(int pf = 0; pf < NUM_FILTERS; pf++) {
@@ -278,10 +275,8 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
dotProdAZPxW = TO_ACCUMULATOR_TYPE(IMAD(dotProdAZPxW, AS_INPUT0_TYPE_4(data_zp_val), AS_FILTER_TYPE_4(w[wi])));
#endif
__attribute__((opencl_unroll_hint))
for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
__attribute__((opencl_unroll_hint))
for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
unroll_for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
unroll_for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(sub_group_broadcast(in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y],
bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X));
@@ -403,5 +398,3 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
#undef FILTER_TYPE_4
#undef AS_FILTER_TYPE_4
#undef NUM_FILTERS
#undef CEIL_DIV
#undef ALIGN

View File

@@ -4,7 +4,8 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/imad.cl"
#if QUANTIZATION_TERM
#define ACCUMULATOR_TYPE int
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
@@ -24,7 +25,7 @@
#define BATCH_SLICE_SIZE 16
#define FEATURE_SLICE_SIZE 16
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
const __global INPUT0_TYPE *conv_input,
__global OUTPUT_TYPE *output,
@@ -63,15 +64,15 @@ KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
__attribute__((opencl_unroll_hint(16)))
for (uint j = 0; j < 16; j++) {
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(_sub_group_shuffle(weights_val.s0, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(_sub_group_shuffle(weights_val.s1, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(_sub_group_shuffle(weights_val.s2, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(_sub_group_shuffle(weights_val.s3, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val2.s0, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val2.s1, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val2.s2, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val2.s3, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(_sub_group_shuffle(weights_val2.s0, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(_sub_group_shuffle(weights_val2.s1, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(_sub_group_shuffle(weights_val2.s2, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(_sub_group_shuffle(weights_val2.s3, j))));
}
filter_idx += weights_x_pitch;
filter_idx2 += weights_x_pitch;
@@ -94,7 +95,7 @@ KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
#if BIAS_TERM
dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i] + intel_sub_group_shuffle(bias, i);
dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i] + _sub_group_shuffle(bias, i);
#else
dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i];
#endif

View File

@@ -4,7 +4,8 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#if QUANTIZATION_TERM
#define ACCUMULATOR_TYPE int
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
@@ -25,7 +26,7 @@
// int8 conv_input and weights data is packed to int32 "batches",
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_3x3)(
const __global INPUT0_TYPE *conv_input,
__global OUTPUT_TYPE *output,
@@ -68,10 +69,10 @@ uint split_idx)
__attribute__((opencl_unroll_hint(16)))
for (uint j = 0; j < 16; j++) {
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(_sub_group_shuffle(weights_val.s0, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(_sub_group_shuffle(weights_val.s1, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(_sub_group_shuffle(weights_val.s2, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(_sub_group_shuffle(weights_val.s3, j))));
}
filter_idx += weights_x_pitch;
input_idx += input_x_pitch;
@@ -93,7 +94,7 @@ uint split_idx)
for (uint i = 0; i < 16; i++) {
ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
#if BIAS_TERM
dequantized = (ACTIVATION_TYPE)dotProd[i] + intel_sub_group_shuffle(bias, i);
dequantized = (ACTIVATION_TYPE)dotProd[i] + _sub_group_shuffle(bias, i);
#else
dequantized = (ACTIVATION_TYPE)dotProd[i];
#endif

View File

@@ -2,7 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/mmad.cl"
@@ -26,7 +27,7 @@
#define ACTIVATION_TYPE_VEC float8
#define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
#define MMAD MMAD_8x8
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write8((__global uint*)(ptr), as_uint8(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write8((__global uint*)(ptr), as_uint8(val));
#elif OUTPUT_X_BLOCK_SIZE == 4
#define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
#define ACCUMULATOR_TYPE_VEC int4
@@ -34,13 +35,13 @@
#define ACTIVATION_TYPE_VEC float4
#define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
#define MMAD MMAD_4x8
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write4((__global uint*)(ptr), as_uint4(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write4((__global uint*)(ptr), as_uint4(val));
#else
#error "convolution_gpu_mmad_b_fs_yx_fsv32: Unsupported block size"
#endif
__attribute__((reqd_work_group_size(8, OW_GROUP, 1)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL(convolution_mmad_b_fs_yx_fsv32)(
__global INPUT0_TYPE* input,
__global PACKED_OUT_TYPE* output,
@@ -145,7 +146,7 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
}
else
{
line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, intel_sub_group_block_read((const __global uint*)(input + in_addr +
line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, _sub_group_block_read((const __global uint*)(input + in_addr +
icb * input_fs_pitch +
kd * DILATION_SIZE_Z * input_z_pitch +
kh * DILATION_SIZE_Y * input_y_pitch +
@@ -166,10 +167,10 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
+ kh * ISV_SIZE * OSV_SIZE * FILTER_SIZE_X
+ kw * ISV_SIZE * OSV_SIZE;
int8 weights_data0 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 0*8*ISV_SIZE)));
int8 weights_data1 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 1*8*ISV_SIZE)));
int8 weights_data2 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 2*8*ISV_SIZE)));
int8 weights_data3 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 3*8*ISV_SIZE)));
int8 weights_data0 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 0*8*ISV_SIZE)));
int8 weights_data1 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 1*8*ISV_SIZE)));
int8 weights_data2 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 2*8*ISV_SIZE)));
int8 weights_data3 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 3*8*ISV_SIZE)));
acc[0] = MMAD(src, weights_data0, acc[0]); // 8 elements in 4*lid+0 out channel
acc[1] = MMAD(src, weights_data1, acc[1]); // 8 elements in 4*lid+1 out channel

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(convolution_mmad_b_fs_yx_fsv32_dw)(

View File

@@ -2,11 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/imad.cl"
#define CEIL_DIV(x, y) (1 + ((x) - 1) / (y))
#include "include/batch_headers/imad.cl"
#define ISV 4
@@ -30,9 +29,9 @@
#define ACTIVATION_TYPE_VEC float8
#define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
#if OUTPUT_LAYOUT_B_FS_YX_FSV32
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
#else // OUTPUT_LAYOUT_B_FS_YX_FSV32
#define BLOCK_WRITE(ptr, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr), as_uchar8(val))
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_uc8((__global uchar*)(ptr), as_uchar8(val))
#endif // OUTPUT_LAYOUT_B_FS_YX_FSV32
#elif OUTPUT_X_BLOCK_SIZE == 4
#define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
@@ -41,9 +40,9 @@
#define ACTIVATION_TYPE_VEC float4
#define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
#if OUTPUT_LAYOUT_B_FS_YX_FSV32
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
#else // OUTPUT_LAYOUT_B_FS_YX_FSV32
#define BLOCK_WRITE(ptr, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr), as_uchar4(val))
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_uc4((__global uchar*)(ptr), as_uchar4(val))
#endif // OUTPUT_LAYOUT_B_FS_YX_FSV32
#else
#error "convolution_gpu_mmad_bfyx_b_fs_yx_fsv32: Unsupported block size"
@@ -52,9 +51,8 @@
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(LWS0, LWS1, LWS2)))
KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
__global INPUT0_TYPE* input,
@@ -265,9 +263,9 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
+ kh * OSV * ISV * FILTER_SIZE_X
+ kw * OSV * ISV;
int weights_data0 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off)));
int weights_data0 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off)));
#if OUTPUT_FEATURE_NUM > 16
int weights_data1 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off + SUB_GROUP_SIZE*ISV)));
int weights_data1 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off + SUB_GROUP_SIZE*ISV)));
#endif
PACKED_TYPE_VEC src;
@@ -492,7 +490,6 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
#endif // OUTPUT_IS_FP
}
#undef CEIL_DIV
#undef PACKED_TYPE_VEC
#undef ACCUMULATOR_TYPE_VEC
#undef TO_ACCUMULATOR_TYPE_VEC

View File

@@ -2,11 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/imad.cl"
#define CEIL_DIV(x, y) (1 + ((x) - 1) / (y))
#include "include/batch_headers/imad.cl"
#ifdef ACCUMULATOR_TYPE
#undef ACCUMULATOR_TYPE
@@ -27,14 +26,14 @@
#define TO_ACCUMULATOR_TYPE_VEC(x) convert_int8(x)
#define ACTIVATION_TYPE_VEC float8
#define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
#elif OUTPUT_X_BLOCK_SIZE == 4
#define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
#define ACCUMULATOR_TYPE_VEC int4
#define TO_ACCUMULATOR_TYPE_VEC(x) convert_int4(x)
#define ACTIVATION_TYPE_VEC float4
#define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
#else
#error "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4: Unsupported block size"
#endif
@@ -43,7 +42,7 @@
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL(convolution_mmad_bfyx_b_fs_yx_fsv32)(
__global INPUT0_TYPE* input,
__global PACKED_OUT_TYPE* output,
@@ -129,8 +128,8 @@ KERNEL(convolution_mmad_bfyx_b_fs_yx_fsv32)(
+ kh * OSV * 4 * FILTER_SIZE_X
+ kw * OSV * 4;
int weights_data0 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off)));
int weights_data1 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off + 16*4)));
int weights_data0 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off)));
int weights_data1 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off + 16*4)));
PACKED_TYPE_VEC src;
@@ -223,7 +222,6 @@ KERNEL(convolution_mmad_bfyx_b_fs_yx_fsv32)(
#endif // OUTPUT_IS_FP
}
#undef CEIL_DIV
#undef PACKED_TYPE_VEC
#undef ACCUMULATOR_TYPE_VEC
#undef TO_ACCUMULATOR_TYPE_VEC

View File

@@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/sub_group_shuffle.cl"
// --------------------------------------------------------------------------------------------------------------------------------
// L3_SIMD_4x8
// Input matrices dimensions: M x K x N
@@ -35,7 +37,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
const int INPUT0_SIZE_Y_PITCH_UNIT_4 = INPUT0_PITCH_SIZE_Y / VEC_SIZE; //for bxyf -> INPUT0_PITCH_SIZE_Y is equal to input features count, since ifm % 32 == 0, division by VEC_SIZE is ok
const int OUTPUT_SIZE_Y_PITCH_UNIT_4 = OUTPUT_Y_PITCH / VEC_SIZE; //for bxyf -> OUTPUT_Y_PITCH is equal to output features count, since ofm % 32 == 0, division by VEC_SIZE is ok
const int WEIGHTS_FEATURE_PITCH_UNIT_4 = WEIGHTS_PITCH_FEATURE / VEC_SIZE; //for xyio -> WEIGHTS_PITCH_FEATURE is equal to the output features count
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int group_z = get_group_id(2);
@@ -59,10 +61,10 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
const int y_idx = tile_idx_y; //winograd tile height == 1
const int f_idx = group_x * TILE_N + local_x * VEC_SIZE;
const int b_idx = batch_idx;
const int in_tile_idx = (x_idx % WINOGRAD_TILE_WIDTH);
const int tile_idx_x = (x_idx / WINOGRAD_TILE_WIDTH);
// Result ctile is M rows x N columns
// M = 8, we have 1 rows of work-items, so we need 8/1 = 8 results down
// N = 32, we have 8 columns of work-items, so we need 32/8 = 4 results across = 1 float4s across
@@ -124,11 +126,11 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
const UNIT_TYPE_4 a6 = src0[6 * INPUT0_SIZE_Y_PITCH_UNIT_4];
const UNIT_TYPE_4 a7 = src0[7 * INPUT0_SIZE_Y_PITCH_UNIT_4];
#define DOT_PRODUCT( _i, _j ) { a = intel_sub_group_shuffle(a ## _i, _j); c ## _i = mad(a.x, b0, mad(a.y, b1, mad(a.z, b2, mad(a.w, b3, c ## _i)))); }
#define DOT_PRODUCT( _i, _j ) { a = _sub_group_shuffle(a ## _i, _j); c ## _i = mad(a.x, b0, mad(a.y, b1, mad(a.z, b2, mad(a.w, b3, c ## _i)))); }
//in one iteration load weights tile 1-width, 1-height, 4-depth from 4 different filters (ofms)
//SIMD reads are chained along b-axis (different ofms), resulting in 1-width, 1-height, 4-depth blocks from 4*8=32 different filters
//consecutive reads are chained along f-dim and overflows to y-dim, reading in total
//consecutive reads are chained along f-dim and overflows to y-dim, reading in total
#define ITERATION( _j ) \
{ \
const UNIT_TYPE_4 b0 = src1[0]; src1 += WEIGHTS_FEATURE_PITCH_UNIT_4; \
@@ -165,7 +167,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
src0 += TILE_K / VEC_SIZE;
}
dst[0] = c0; dst += OUTPUT_SIZE_Y_PITCH_UNIT_4;
dst[0] = c1; dst += OUTPUT_SIZE_Y_PITCH_UNIT_4;
dst[0] = c2; dst += OUTPUT_SIZE_Y_PITCH_UNIT_4;

View File

@@ -9,26 +9,27 @@
// --------------------------------------------------------------------------------------------------------------------------------
#include "include/batch_headers/common.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#define DOT4i0( _result, _A, _B, i) \
{ \
_result = mad(_A.s0, intel_sub_group_shuffle( _B.s0, (i)), _result); \
_result = mad(_A.s0, _sub_group_shuffle( _B.s0, (i)), _result); \
}
#define DOT4i1( _result, _A, _B, i) \
{ \
_result = mad(_A.s1, intel_sub_group_shuffle( _B.s1, (i)), _result); \
_result = mad(_A.s1, _sub_group_shuffle( _B.s1, (i)), _result); \
}
#define DOT4i2( _result, _A, _B, i) \
{ \
_result = mad(_A.s2, intel_sub_group_shuffle( _B.s2, (i)), _result); \
_result = mad(_A.s2, _sub_group_shuffle( _B.s2, (i)), _result); \
}
#define DOT4i3( _result, _A, _B, i) \
{ \
_result = mad(_A.s3, intel_sub_group_shuffle( _B.s3, (i)), _result); \
_result = mad(_A.s3, _sub_group_shuffle( _B.s3, (i)), _result); \
}
#define UNIT_TYPE_2 CAT(UNIT_TYPE, 2)
@@ -36,15 +37,15 @@
#define UNIT_TYPE_8 CAT(UNIT_TYPE, 8)
__attribute__((reqd_work_group_size(8, 2, 8)))
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_winograd_2x3_s1_fused)
(
__global INPUT0_TYPE* I,
__global OUTPUT_TYPE* O,
__global FILTER_TYPE* U,
__global INPUT0_TYPE* I,
__global OUTPUT_TYPE* O,
__global FILTER_TYPE* U,
#if BIAS_TERM
const __global UNIT_TYPE * bias,
#endif
#endif
uint split_idx)
{
// (DxC2)x(UxWx8c)
@@ -52,17 +53,17 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
__local UNIT_TYPE_4 V[slmSize*2]; // 8 KB
/* These constants are defined as precompiler macros during compilation. */
const uint WC = W*INPUT0_FEATURE_NUM;
const uint HW = H*W;
const uint HWC = H*WC;
const uint WC4 = WC >> 2;
const uint K16 = FILTER_OFM_NUM >> 4;
const uint C4 = INPUT0_FEATURE_NUM >> 2;
const uint K2 = FILTER_OFM_NUM >> 1;
const uint QK2 = Q*K2;
const uint QK = Q*FILTER_OFM_NUM;
const uint PQK = P*QK;
const uint WC = W*INPUT0_FEATURE_NUM;
const uint HW = H*W;
const uint HWC = H*WC;
const uint WC4 = WC >> 2;
const uint K16 = FILTER_OFM_NUM >> 4;
const uint C4 = INPUT0_FEATURE_NUM >> 2;
const uint K2 = FILTER_OFM_NUM >> 1;
const uint QK2 = Q*K2;
const uint QK = Q*FILTER_OFM_NUM;
const uint PQK = P*QK;
const uint upperHalf = get_local_id(1);
uint gx = get_group_id(0);
uint gy = (uint)get_group_id(1)*2+((uint)get_group_id(2)%2);
@@ -86,7 +87,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
int x = gx*14 + lz*2 + lxd4 - px;
int y = gy*4 - py;
uint k = gk*16 + lzd4*8;
// # x->
// # M0 M1 M2 M3 M4 M5 M6
// # +------------------------------------------
@@ -113,13 +114,13 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
uint lxm2 = lx % 2;
uint lxb1 = (lx & 2)/2;
uint2 coordU0;
coordU0.x = (lzm4*24 + k*12);
coordU0.y = 0;
uint slmPipeStage = 0;
__attribute__((opencl_unroll_hint(1)))
for (uint c = lxm4; c < C4_up16; c += 4) {
@@ -142,7 +143,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
bool y5_in = 0 <= (y + 5) && (y + 5) < H && x_in;
#if INPUT0_LAYOUT_BYXF
/* const UNIT_TYPE_4 I_load_0 = y0_in ? I_load[0*WC4+c] : (UNIT_TYPE_4)(UNIT_VAL_ZERO);
const UNIT_TYPE_4 I_load_1 = y1_in ? I_load[1*WC4+c] : (UNIT_TYPE_4)(UNIT_VAL_ZERO);
const UNIT_TYPE_4 I_load_2 = y2_in ? I_load[2*WC4+c] : (UNIT_TYPE_4)(UNIT_VAL_ZERO);
@@ -227,10 +228,10 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
//uint coordU_x = coordU0.x + get_sub_group_local_id()%8;
const uint flatA = coordU0.y*FILTER_OFM_NUM*KCOLSW*KROWSW + coordU0.x + get_sub_group_local_id()%8;
const UNIT_TYPE_4 f0 = (UNIT_TYPE_4)(
*(__global UNIT_TYPE *)(&U[flatA+0*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+1*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+2*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+3*FILTER_OFM_NUM*KCOLSW*KROWSW])); // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+0*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+1*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+2*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+3*FILTER_OFM_NUM*KCOLSW*KROWSW])); // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
// row 0
@@ -554,7 +555,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
DOT4i3(M6.s2, f1, V13, 2 + c4);
DOT4i3(M6.s3, f1, V13, 4 + c4);
//flatA += 8;
const UNIT_TYPE_4 f2 = (UNIT_TYPE_4)(
*(__global UNIT_TYPE *)(&U[flatA + 16 + 0 * FILTER_OFM_NUM*KCOLSW*KROWSW]),
@@ -563,7 +564,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
*(__global UNIT_TYPE *)(&U[flatA + 16 + 3 * FILTER_OFM_NUM*KCOLSW*KROWSW]));
coordU0.y += 4;
// f2[c4] x v[2 .. 16]
DOT4i0(M0.s0, f2, V00, 4 + c4);
DOT4i0(M0.s1, f2, V00, 6 + c4);
@@ -628,7 +629,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
// row 1
// f2 x v[2 .. 16]
DOT4i1(M0.s2, f2, V10, 4 + c4);
DOT4i1(M0.s3, f2, V10, 6 + c4);
@@ -649,7 +650,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
DOT4i1(M6.s3, f2, V13, 6 + c4);
// f2[c4] x v[2 .. 16]
DOT4i2(M0.s0, f2, V00, 4 + c4);
DOT4i2(M0.s1, f2, V00, 6 + c4);
@@ -759,11 +760,11 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
barrier(CLK_LOCAL_MEM_FENCE);
}
if (lz < 7)
if (lz < 7)
{
// Load multiplies from SLM.
__local const UNIT_TYPE_8 *M_read = (__local UNIT_TYPE_8*)&V[lz*8 + lxd4*224 + lxm4*2 + slmSize*upperHalf];
UNIT_TYPE_8 M0 = M_read[0*28];
UNIT_TYPE_8 M1 = M_read[1*28];
UNIT_TYPE_8 M2 = M_read[2*28];
@@ -821,7 +822,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write_0[0] = ACTIVATION(S0.s0, ACTIVATION_PARAMS);
O_write_0[0+Q*P] = ACTIVATION(S0.s4, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -829,7 +830,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[0*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s1 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S0.s5 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[0*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s1, ACTIVATION_PARAMS), ACTIVATION(S0.s5, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_0[1] = ACTIVATION(S0.s1 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -837,8 +838,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_0[1] = ACTIVATION(S0.s1, ACTIVATION_PARAMS);
O_write_0[1+Q*P] = ACTIVATION(S0.s5, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -850,7 +851,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[1*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s0 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s4 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[1*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s0, ACTIVATION_PARAMS), ACTIVATION(S1.s4, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_1[0] = ACTIVATION(S1.s0 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -858,8 +859,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_1[0] = ACTIVATION(S1.s0, ACTIVATION_PARAMS);
O_write_1[0+Q*P] = ACTIVATION(S1.s4, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -867,7 +868,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[1*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s1 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s5 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[1*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s1, ACTIVATION_PARAMS), ACTIVATION(S1.s5, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_1[1] = ACTIVATION(S1.s1 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -875,8 +876,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_1[1] = ACTIVATION(S1.s1, ACTIVATION_PARAMS);
O_write_1[1+Q*P] = ACTIVATION(S1.s5, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -888,7 +889,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[2*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s2 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S0.s6 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[2*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s2, ACTIVATION_PARAMS), ACTIVATION(S0.s6, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_2[0] = ACTIVATION(S0.s2 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -896,8 +897,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_2[0] = ACTIVATION(S0.s2, ACTIVATION_PARAMS);
O_write_2[0+Q*P] = ACTIVATION(S0.s6, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -905,7 +906,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[2*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s3 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S0.s7 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[2*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s3, ACTIVATION_PARAMS), ACTIVATION(S0.s7, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_2[1] = ACTIVATION(S0.s3 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -913,8 +914,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_2[1] = ACTIVATION(S0.s3, ACTIVATION_PARAMS);
O_write_2[1+Q*P] = ACTIVATION(S0.s7, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -926,7 +927,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[3*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s2 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s6 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[3*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s2, ACTIVATION_PARAMS), ACTIVATION(S1.s6, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_3[0] = ACTIVATION(S1.s2 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -934,7 +935,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_3[0] = ACTIVATION(S1.s2, ACTIVATION_PARAMS);
O_write_3[0+Q*P] = ACTIVATION(S1.s6, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
@@ -943,7 +944,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[3*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s3 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s7 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[3*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s3, ACTIVATION_PARAMS), ACTIVATION(S1.s7, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_3[1] = ACTIVATION(S1.s3 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -951,8 +952,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_3[1] = ACTIVATION(S1.s3, ACTIVATION_PARAMS);
O_write_3[1+Q*P] = ACTIVATION(S1.s7, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
}

View File

@@ -9,7 +9,7 @@
// --------------------------------------------------------------------------------------------------------------------------------
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#define DOT8i_0( _result, _A, _B, i) \
@@ -63,7 +63,7 @@
__attribute__((reqd_work_group_size(16, 1, 8)))
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_winograd_6x3_s1_fused)
(
__global INPUT0_TYPE* I,
@@ -75,7 +75,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
#endif
#if BIAS_TERM
const __global UNIT_TYPE * bias,
#endif
#endif
uint split_idx)
{
// (DxC2)x(UxWx8c)
@@ -100,7 +100,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
uint gx = get_group_id(0);
uint gy = get_group_id(1);
uint gz = get_group_id(2);
uint gz = get_group_id(2);
uint gk = gz % K16;
uint gn = gz / K16;
@@ -266,7 +266,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
__local const UNIT_TYPE_8 *V_read_c16 = V_read;
__attribute__((opencl_unroll_hint(1)))
for (uint c16 = 0; c16 < 2
for (uint c16 = 0; c16 < 2
#ifndef FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB
&& coordU0.y < last_coord_y
#endif
@@ -297,17 +297,17 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
// Fetch 8 channels of Winograd components from f(k,s)
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
const UNIT_TYPE_8 f00 = as_half8(intel_sub_group_block_read_us8(U, (int2)(coordU0.x, coordU0.y)));
const UNIT_TYPE_8 f00 = as_half8(_sub_group_block_read_us8(U, (int2)(coordU0.x, coordU0.y)));
#else
const UNIT_TYPE_8 f00 = (UNIT_TYPE_8)(
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 0 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 1 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 2 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 3 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 4 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 5 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 6 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 7 * WEIGHTWIDTH])));
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 0 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 1 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 2 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 3 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 4 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 5 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 6 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 7 * WEIGHTWIDTH])));
#endif
@@ -467,17 +467,17 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
DOT8i_7(M6.s1, f00, V8, 10 + c8);
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
const UNIT_TYPE_8 f01 = as_half8(intel_sub_group_block_read_us8(U, (int2)(coordU0.x + 16 * sizeof(UNIT_TYPE), coordU0.y)));
const UNIT_TYPE_8 f01 = as_half8(_sub_group_block_read_us8(U, (int2)(coordU0.x + 16 * sizeof(UNIT_TYPE), coordU0.y)));
#else
const UNIT_TYPE_8 f01 = (UNIT_TYPE_8)(
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 0 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 1 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 2 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 3 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 4 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 5 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 6 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 7 * WEIGHTWIDTH])));
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 0 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 1 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 2 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 3 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 4 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 5 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 6 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 7 * WEIGHTWIDTH])));
#endif
// f1[c8] x v[1 .. 15]
@@ -637,17 +637,17 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
DOT8i_7(M6.s1, f01, V8, 12 + c8);
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
const UNIT_TYPE_8 f02 = as_half8(intel_sub_group_block_read_us8(U, (int2)(coordU0.x + 32 * sizeof(UNIT_TYPE), coordU0.y)));
const UNIT_TYPE_8 f02 = as_half8(_sub_group_block_read_us8(U, (int2)(coordU0.x + 32 * sizeof(UNIT_TYPE), coordU0.y)));
#else
const UNIT_TYPE_8 f02 = (UNIT_TYPE_8)(
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 0 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 1 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 2 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 3 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 4 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 5 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 6 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 7 * WEIGHTWIDTH])));
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 0 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 1 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 2 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 3 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 4 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 5 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 6 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 7 * WEIGHTWIDTH])));
#endif
coordU0.y += 8;
@@ -919,7 +919,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
#else
O_write_0[0] = ACTIVATION(S0.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -927,14 +927,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[0 * QK + 1 * K] = ACTIVATION(S0.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[0 * QK + 1 * K] = ACTIVATION(S0.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_0[1] = ACTIVATION(S0.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_0[1] = ACTIVATION(S0.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -946,14 +946,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[1 * QK + 0 * K] = ACTIVATION(S1.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[1 * QK + 0 * K] = ACTIVATION(S1.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_1[0] = ACTIVATION(S1.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_1[0] = ACTIVATION(S1.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -961,14 +961,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[1 * QK + 1 * K] = ACTIVATION(S1.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[1 * QK + 1 * K] = ACTIVATION(S1.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_1[1] = ACTIVATION(S1.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_1[1] = ACTIVATION(S1.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -980,14 +980,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[2 * QK + 0 * K] = ACTIVATION(S2.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[2 * QK + 0 * K] = ACTIVATION(S2.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_2[0] = ACTIVATION(S2.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_2[0] = ACTIVATION(S2.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -995,14 +995,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[2 * QK + 1 * K] = ACTIVATION(S2.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[2 * QK + 1 * K] = ACTIVATION(S2.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_2[1] = ACTIVATION(S2.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_2[1] = ACTIVATION(S2.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -1014,13 +1014,13 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[3 * QK + 0 * K] = ACTIVATION(S3.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[3 * QK + 0 * K] = ACTIVATION(S3.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_3[0] = ACTIVATION(S3.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_3[0] = ACTIVATION(S3.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
@@ -1029,14 +1029,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[3 * QK + 1 * K] = ACTIVATION(S3.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[3 * QK + 1 * K] = ACTIVATION(S3.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_3[1] = ACTIVATION(S3.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_3[1] = ACTIVATION(S3.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
}
@@ -1049,13 +1049,13 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[4 * QK + 0 * K] = ACTIVATION(S4.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[4 * QK + 0 * K] = ACTIVATION(S4.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_4[0] = ACTIVATION(S4.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_4[0] = ACTIVATION(S4.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
@@ -1064,14 +1064,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[4 * QK + 1 * K] = ACTIVATION(S4.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[4 * QK + 1 * K] = ACTIVATION(S4.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_4[1] = ACTIVATION(S4.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_4[1] = ACTIVATION(S4.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -1083,13 +1083,13 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[5 * QK + 0 * K] = ACTIVATION(S5.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[5 * QK + 0 * K] = ACTIVATION(S5.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_5[0] = ACTIVATION(S5.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_5[0] = ACTIVATION(S5.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
@@ -1098,14 +1098,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[5 * QK + 1 * K] = ACTIVATION(S5.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[5 * QK + 1 * K] = ACTIVATION(S5.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_5[1] = ACTIVATION(S5.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_5[1] = ACTIVATION(S5.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
}
@@ -1113,4 +1113,4 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
}
#undef UNIT_TYPE_2
#undef UNIT_TYPE_4
#undef UNIT_TYPE_8
#undef UNIT_TYPE_8

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(convolution_gpu_yxfb_ref)(

View File

@@ -2,11 +2,13 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
__attribute__((reqd_work_group_size(16, 1, 1)))
KERNEL(convolution_gpu_yxfb_yxio_b16)(
const __global UNIT_TYPE* input,
@@ -94,7 +96,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)(
for (uint h = 0; h < FILTER_IFM_NUM; h++)
{
#if defined(USE_BLOCK_READ_2)
half4 _input = as_half4(intel_sub_group_block_read2((const __global uint*)(input + input_idx)));
half4 _input = as_half4(_sub_group_block_read2((const __global uint*)(input + input_idx)));
uint filter_val_pair = *(const __global uint*)(filter + filter_idx);
half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair);
_data[0] = fma(_input.s0, filter_transp, _data[0]);
@@ -103,7 +105,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)(
_data[3] = fma(_input.s3, filter_transp, _data[3]);
input_idx += INPUT0_FEATURE_PITCH;
#elif defined(USE_BLOCK_READ_1)
half2 _input = as_half2(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
half2 _input = as_half2(_sub_group_block_read((const __global uint*)(input + input_idx)));
uint filter_val_pair = *(const __global uint*)(filter + filter_idx);
half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair);
_data[0] = fma(_input.s0, filter_transp, _data[0]);

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
@@ -93,7 +95,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)(
for (uint h = 0; h < FILTER_IFM_NUM; h++)
{
#ifdef USE_BLOCK_READ_2
float2 _input = as_float2(intel_sub_group_block_read2((const __global uint*)input + input_idx));
float2 _input = as_float2(_sub_group_block_read2((const __global uint*)input + input_idx));
float8 filter_transp = TRANSPOSE_BLOCK_8(filter[filter_idx]);
_data[0] = fma(_input.s0, filter_transp, _data[0]);
_data[1] = fma(_input.s1, filter_transp, _data[1]);

View File

@@ -2,7 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
@@ -18,23 +19,23 @@ KERNEL(convolution_gpu_yxfb_yxio_b1_block_multiple_x)(
{
#if USE_VECTOR == 8
#define VECTOR_FLOAT float8
#define BLOCK_READ(IN) as_float8(intel_sub_group_block_read8((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
#define BLOCK_READ(IN) as_float8(_sub_group_block_read8((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
#endif
#if USE_VECTOR == 4
#define VECTOR_FLOAT float4
#define BLOCK_READ(IN) as_float4(intel_sub_group_block_read4((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
#define BLOCK_READ(IN) as_float4(_sub_group_block_read4((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
#endif
#if USE_VECTOR == 2
#define VECTOR_FLOAT float2
#define BLOCK_READ(IN) as_float2(intel_sub_group_block_read2((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
#define BLOCK_READ(IN) as_float2(_sub_group_block_read2((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
#endif
#if USE_VECTOR == 1
#define VECTOR_FLOAT float
#define BLOCK_READ(IN) as_float(intel_sub_group_block_read((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write((__global uint*)OUT, as_uint(DATA));
#define BLOCK_READ(IN) as_float(_sub_group_block_read((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write((__global uint*)OUT, as_uint(DATA));
#endif
const uint batch_num = INPUT0_BATCH_NUM;
@@ -99,7 +100,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b1_block_multiple_x)(
float _in[X_PER_WORK_ITEM];
for(uint a = 0; a < X_PER_WORK_ITEM; a++)
{
_in[a] = as_float(intel_sub_group_block_read((const __global uint*)input + (input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X)));
_in[a] = as_float(_sub_group_block_read((const __global uint*)input + (input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X)));
}
float8 _input[X_PER_WORK_ITEM];
for(uint a = 0; a < X_PER_WORK_ITEM; a++)

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
@@ -65,7 +67,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b8)(
#endif
for (uint h = 0; h < FILTER_IFM_NUM / 8; h++)
{
float8 _input = as_float8(intel_sub_group_block_read8((const __global uint*)input + input_idx));
float8 _input = as_float8(_sub_group_block_read8((const __global uint*)input + input_idx));
DOT_PRODUCT_8(_data0, _input.s0, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
#if OFM_PER_WORK_ITEM == 16
@@ -128,8 +130,8 @@ KERNEL(convolution_gpu_yxfb_yxio_b8)(
#endif
const uint _out_id = OUTPUT_OFFSET + out_id;
intel_sub_group_block_write8((__global uint*)output + _out_id, as_uint8(_data0));
_sub_group_block_write8((__global uint*)output + _out_id, as_uint8(_data0));
#if OFM_PER_WORK_ITEM == 16
intel_sub_group_block_write8((__global uint*)output + _out_id + 8 * INPUT0_FEATURE_PITCH, as_uint8(_data1));
_sub_group_block_write8((__global uint*)output + _out_id + 8 * INPUT0_FEATURE_PITCH, as_uint8(_data1));
#endif
}

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(ctc_greedy_decoder_ref)(const __global INPUT0_TYPE* probabilities

View File

@@ -2,36 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
///////////////////////// Input Index /////////////////////////
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_DIMS < 5
return INPUT0_GET_INDEX(b, f, y, x);
#elif INPUT0_DIMS == 5
return INPUT0_GET_INDEX(b, f, z, y, x);
#elif INPUT0_DIMS == 6
return INPUT0_GET_INDEX(b, f, w, z, y, x);
#else
#error cum_sum_ref.cl: input format - not supported
#endif
}
///////////////////////// Output Index /////////////////////////
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_DIMS < 5
return OUTPUT_GET_INDEX(b, f, y, x);
#elif OUTPUT_DIMS == 5
return OUTPUT_GET_INDEX(b, f, z, y, x);
#elif OUTPUT_DIMS == 6
return OUTPUT_GET_INDEX(b, f, w, z, y, x);
#else
#error cum_sum_ref.cl: output format - not supported
#endif
}
#include "include/fetch_utils.cl"
inline void FUNC(get_indices)(int *axes)
{
@@ -87,8 +58,6 @@ inline void FUNC(get_indices)(int *axes)
#endif
}
#define unroll_for __attribute__((opencl_unroll_hint)) for
#if CUM_SUM_PARTIAL_SUM
inline uint FUNC(get_current_index)(int axis, int i)
{
@@ -99,7 +68,7 @@ inline uint FUNC(get_current_index)(int axis, int i)
#endif
}
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(LWS, 1, 1)))
KERNEL(cum_sum_partial_sum)(
const __global INPUT0_TYPE* input,
@@ -160,7 +129,7 @@ inline uint FUNC(get_current_index)(int i)
}
// main
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(LWS, 1, 1)))
KERNEL(cum_sum_final)(
const __global PARTIAL_TYPE* partial,

View File

@@ -2,36 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
///////////////////////// Input Index /////////////////////////
inline uint FUNC(get_input_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_DIMS < 5
return INPUT0_GET_INDEX(b, f, y, x);
#elif INPUT0_DIMS == 5
return INPUT0_GET_INDEX(b, f, z, y, x);
#elif INPUT0_DIMS == 6
return INPUT0_GET_INDEX(b, f, w, z, y, x);
#else
#error cum_sum_ref.cl: input format - not supported
#endif
}
///////////////////////// Output Index /////////////////////////
inline uint FUNC(get_output_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_DIMS < 5
return OUTPUT_GET_INDEX(b, f, y, x);
#elif OUTPUT_DIMS == 5
return OUTPUT_GET_INDEX(b, f, z, y, x);
#elif OUTPUT_DIMS == 6
return OUTPUT_GET_INDEX(b, f, w, z, y, x);
#else
#error cum_sum_ref.cl: output format - not supported
#endif
}
#include "include/fetch_utils.cl"
KERNEL(cum_sum_ref)(
OPTIONAL_SHAPE_INFO_ARG

View File

@@ -2,13 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/data_types.cl"
#include "deconvolution_gpu_imad_common.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define FEATURE_SLICE_SIZE 16
#if X_BLOCK_SIZE == 1
@@ -54,7 +52,7 @@ DECLARE_READ_BLOCK_8(preload_weights, FILTER_TYPE)
# endif
#endif
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE))) // attr:no-format
REQD_SUB_GROUP_SIZE(FEATURE_SLICE_SIZE) // attr:no-format
__attribute__((reqd_work_group_size(1, FEATURE_SLICE_SIZE, 1)))
KERNEL(deconvolution_gpu_b_fs_zyx_fsv16_dw)(
const __global INPUT0_TYPE *input,
@@ -272,7 +270,6 @@ KERNEL(deconvolution_gpu_b_fs_zyx_fsv16_dw)(
}
}
#undef unroll_for
#undef FEATURE_SLICE_SIZE
#undef GET_VEC_ELEM

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#define WORK_GROUP_GROUP_SIZE 16

View File

@@ -4,8 +4,10 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "deconvolution_gpu_imad_common.cl"
@@ -31,7 +33,7 @@ DECLARE_STORE_BLOCK_4(store_output, OUTPUT_TYPE)
#define WEIGHTS_IN_TILE_OFM_PITCH (TILE_IFM * SIMD)
__attribute__((reqd_work_group_size(1, SIMD, 1)))
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
KERNEL(deconvolution_gpu_imad_ref)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* restrict output,
@@ -127,8 +129,7 @@ KERNEL(deconvolution_gpu_imad_ref)(
for (uint fi = 0; fi < FILTER_IFM_NUM; fi += TILE_IFM) {
// Load weights [TILE_OFM, TILE_IFM, 1, 1]
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
uint weights_idx = weights_offset + of * WEIGHTS_IN_TILE_OFM_PITCH / 4;
FUNC_CALL(load_weights_ui)(weights_ui, weights_idx, TILE_IFM / 4, wei[of]);
}
@@ -142,8 +143,7 @@ KERNEL(deconvolution_gpu_imad_ref)(
uint input_offset = INPUT0_GET_INDEX(out_b, if_start + fi, fixed_in_z, fixed_in_y, fixed_in_x) / 4;
# endif
#endif
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
uint input_idx = input_offset + ob * INPUT_IN_TILE_B_PITCH / 4;
FUNC_CALL(load_input_ui)(input_ui, input_idx, TILE_IFM / 4, in[ob]);
}
@@ -151,24 +151,18 @@ KERNEL(deconvolution_gpu_imad_ref)(
input_offset += INPUT_TILE_IFM_PITCH / 4;
#endif
if (zero_x) {
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint ifp = 0; ifp < TILE_IFM / 4; ++ifp) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for(uint ifp = 0; ifp < TILE_IFM / 4; ++ifp) {
in[ob][ifp] = 0;
}
}
}
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint imad_it = 0; imad_it < TILE_IFM / 4; ++imad_it) {
uint in_val = intel_sub_group_shuffle(in[ob][imad_it], tx);
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint imad_it = 0; imad_it < TILE_IFM / 4; ++imad_it) {
uint in_val = _sub_group_shuffle(in[ob][imad_it], tx);
acc[ob][of][tx] = IMAD(acc[ob][of][tx], AS_INPUT_TYPE4(in_val), AS_FILTER_TYPE4(wei[of][imad_it]));
}
}
@@ -180,25 +174,19 @@ KERNEL(deconvolution_gpu_imad_ref)(
}
ACTIVATION_TYPE dequantized[TILE_B][TILE_OFM][TILE_X];
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
dequantized[ob][of][tx] = TO_ACTIVATION_TYPE(acc[ob][of][tx]);
}
}
}
#if BIAS_TERM
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
BIAS_TYPE bias_val = bias[out_f + of * SIMD];
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint ob = 0; ob < TILE_B; ++ob) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
dequantized[ob][of][tx] += TO_ACTIVATION_TYPE(bias_val);
}
}
@@ -206,15 +194,12 @@ KERNEL(deconvolution_gpu_imad_ref)(
#endif
OUTPUT_TYPE result[TILE_B][TILE_OFM][TILE_X];
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
#if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_PRELOAD;
#endif
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint ob = 0; ob < TILE_B; ++ob) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
#if HAS_FUSED_OPS
# if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_CALC;
@@ -233,12 +218,9 @@ KERNEL(deconvolution_gpu_imad_ref)(
bool leftovers_f = OUTPUT_FEATURE_NUM % SIMD != 0 && out_f + SIMD >= OUTPUT_FEATURE_NUM;
#if OUTPUT_NAIVE_STORE
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
if ((leftovers_x && tx >= OUTPUT_SIZE_X % TILE_X) ||
(leftovers_f && out_f + of * SIMD >= OUTPUT_FEATURE_NUM))
break;
@@ -252,10 +234,8 @@ KERNEL(deconvolution_gpu_imad_ref)(
}
}
#elif OUTPUT_BLOCK_X_STORE
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
#if OUTPUT_DIMS <= 4
uint output_idx = OUTPUT_GET_INDEX(out_b + ob, out_fg + of * SIMD, out_y, out_x);
#elif OUTPUT_DIMS == 5
@@ -266,8 +246,7 @@ KERNEL(deconvolution_gpu_imad_ref)(
} else if (!leftovers_f) {
FUNC_CALL(store_output)(output, output_idx, OUTPUT_SIZE_X % TILE_X, result[ob][of]);
} else {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
if (out_f + of * SIMD < OUTPUT_FEATURE_NUM && out_x + tx < OUTPUT_SIZE_X) {
output[output_idx + sglid + tx * SIMD] = result[ob][of][tx];
}

View File

@@ -2,10 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#define CEIL_DIV(a, b) (((a) + ((b) - 1)) / (b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#define VEC_TO_ARR_1(var, arr, idx) \
arr[idx] = var

View File

@@ -4,8 +4,7 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "deconvolution_gpu_imad_common.cl"

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(deconvolution_gpu_yxfb_ref)(

View File

@@ -2,15 +2,16 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
#define FEATURE_SLICE_SIZE 16
#define GET_WEI(filter, id) AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, filter), id))
#define GET_WEI(filter, id) AS_TYPE(UNIT_TYPE, _sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, filter), id))
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(deformable_convolution_gpu_bfyx_conv)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,

View File

@@ -2,10 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(deformable_convolution_gpu_bfyx_interp)(
const __global INPUT0_TYPE* data,
const __global INPUT1_TYPE* trans,
@@ -29,7 +28,7 @@ KERNEL(deformable_convolution_gpu_bfyx_interp)(
const int input_offset_x = input_x + kw * DILATION_SIZE_X;
const int input_offset_y = input_y + kh * DILATION_SIZE_Y;
#if DEFORMABLE_MASK_ENABLED
const int dg_size = dg * FILTER_SIZE_Y * FILTER_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_X;
const int trans_offset = b * INPUT1_BATCH_PITCH + 2 * dg_size;

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(depth_to_space_block2_opt)(const __global half* input, __global half* output)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(depth_to_space_ref)(const __global INPUT0_TYPE* input,

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/common.cl"
#include "include/detection_output_common.cl"
@@ -56,7 +55,6 @@
// LOCAL_BATCHES_NUM - number of batch that can be process per work-group
// =================================================================================================================
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define NUM_CLASSES_ACC (NUM_CLASSES + 2)
typedef struct __attribute__((__packed__)) {

View File

@@ -2,11 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#define FEATURE_SLICE_SIZE 16
#define unroll_for __attribute__((opencl_unroll_hint())) for
#define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE)
#define TO_TYPE(type, val) CAT(convert_, type)(val)
@@ -25,7 +25,7 @@
#define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX)(idx_order)
#endif
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
REQD_SUB_GROUP_SIZE(FEATURE_SLICE_SIZE)
KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
@@ -107,7 +107,6 @@ KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
}
#undef FEATURE_SLICE_SIZE
#undef unroll_for
#undef OUTPUT_TYPE_BLOCK
#undef TO_TYPE
#undef READ_FUNC

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(eltwise_fs_b_yx_fsv32)(

View File

@@ -2,18 +2,19 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
// Kernel works only for sub_group size of 16 with 32 features slice size and process 2 features per WI
#define REQD_SUB_GROUP_SIZE 16
#define SUB_GROUP_SIZE 16
#define REQD_FEATURE_SLICE_SIZE 32
#define REQD_FEATURES_PER_WORK_ITEM 2
//inputs_decls -> __global unit_type * input0, __global unit_type * input1
__attribute__((intel_reqd_sub_group_size(REQD_SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL(eltwise_mixed_byxf_and_fs_b_yx_fsv32)(
INPUTS_DECLS
__global UNIT_TYPE* output)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(eltwise_gpu_vload8)(INPUTS_DECLS

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#ifdef PACKED_SUM

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#define INPUT_TYPE INPUT0_TYPE
#define INPUT_TYPE2 MAKE_VECTOR_TYPE(INPUT0_TYPE, 2)

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
inline int FUNC(get_pyramid_level_index)(uint level, uint c, uint y, uint x) {
uint idx = 0;
@@ -64,9 +63,10 @@ KERNEL(experimental_detectron_roi_feature_extractor_ref)(const __global INPUT0_T
const uint roi_bin_grid_w = (SAMPLING_RATIO > 0) ? SAMPLING_RATIO : (uint)ceil(roi_width / POOLED_WIDTH);
const uint roi_bin_grid_h = (SAMPLING_RATIO > 0) ? SAMPLING_RATIO : (uint)ceil(roi_height / POOLED_HEIGHT);
const uint level_h = LEVEL_SIZES[3 * level];
const uint level_w = LEVEL_SIZES[3 * level + 1];
const uint level_offset = LEVEL_SIZES[3 * level + 2];
size_t level_sizes_arr[3*NUM_PYRAMID_LEVELS] = LEVEL_SIZES;
const uint level_h = level_sizes_arr[3 * level];
const uint level_w = level_sizes_arr[3 * level + 1];
const uint level_offset = level_sizes_arr[3 * level + 2];
INPUT0_TYPE output_val = 0.0;
INPUT0_TYPE current_bin_start_h = roi_start_h + y * bin_height;

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
KERNEL(experimental_detectron_topk_rois_ref)(const __global INPUT0_TYPE* input_rois,
const __global INPUT1_TYPE* topk_indices, __global OUTPUT_TYPE* output_rois)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(extract_image_patches_ref)(const __global INPUT0_TYPE* input,

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/mmad.cl"
@@ -12,12 +11,12 @@
#define INPUT_PACKED_TYPE_VEC CAT(INPUT_PACKED_TYPE, SUB_GROUP_SIZE)
#define FILTER_PACKED_TYPE_VEC CAT(FILTER_PACKED_TYPE, SUB_GROUP_SIZE)
#define BLOCK_READ(ptr) intel_sub_group_block_read((const __global uint*)(ptr))
#define BLOCK_READ_8(ptr) intel_sub_group_block_read8((const __global uint*)(ptr))
#define BLOCK_READ(ptr) _sub_group_block_read((const __global uint*)(ptr))
#define BLOCK_READ_8(ptr) _sub_group_block_read8((const __global uint*)(ptr))
#define MMAD CAT(MMAD_, SUB_GROUP_SIZE)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL(fully_connected_gpu_MMAD)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -133,8 +132,7 @@ KERNEL(fully_connected_gpu_MMAD)(
INPUT_PACKED_TYPE input_data[UNROLL_FACTOR];
FILTER_PACKED_TYPE_VEC weights_data[UNROLL_FACTOR];
__attribute__((opencl_unroll_hint))
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
unroll_for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, BLOCK_READ(input + input_idx + kb * MMAD_INPUT_FBLOCK_PITCH));
#if SUB_GROUP_SIZE == 8
weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
@@ -144,8 +142,7 @@ KERNEL(fully_connected_gpu_MMAD)(
#endif // SUB_GROUP_SIZE
}
__attribute__((opencl_unroll_hint))
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
unroll_for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
INPUT_PACKED_TYPE_VEC in;
in.s0 = sub_group_broadcast(input_data[kb], 0);
@@ -177,8 +174,7 @@ KERNEL(fully_connected_gpu_MMAD)(
barrier(CLK_LOCAL_MEM_FENCE);
if (feature_block == 0) {
__attribute__((opencl_unroll_hint))
for (uint i = 1; i < SLM_DIV_FACTOR; i++)
unroll_for(uint i = 1; i < SLM_DIV_FACTOR; i++)
dotProd += partial_summ[lid0 % feature_per_wg + i * feature_per_wg];
#endif // SLM_DIV_FACTOR > 1

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#if defined(__fc_f16)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
// Required JIT constants:
@@ -18,7 +17,7 @@
#define ACC_TYPE float
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL (fully_connected_gpu_bf_io_input_spatial)(
const __global UNIT_TYPE* input,
__global UNIT_TYPE* output,
@@ -47,8 +46,8 @@ KERNEL (fully_connected_gpu_bf_io_input_spatial)(
uint it_w_addr = _inG == UNIT_VAL_ZERO ? weight_idx_base : s_w_idx;
for (uint j = 0; j < 16; j++)
{
UNIT_TYPE _in = intel_sub_group_shuffle(_inG, j);
uint wi_w_addr = intel_sub_group_shuffle(it_w_addr, j);
UNIT_TYPE _in = _sub_group_shuffle(_inG, j);
uint wi_w_addr = _sub_group_shuffle(it_w_addr, j);
wi_w_addr += MULTIPLY_OFFSET(UNIT_TYPE, get_sub_group_local_id());
UNIT_TYPE _w = *OFFSET_GLOBAL_PTR(UNIT_TYPE, weight, wi_w_addr);
result += _in * _w;

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
// Required JIT constants:

View File

@@ -3,7 +3,9 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
// JIT Parameters:
// SIMD - sub-group size/simd width, one of {8, 16};
@@ -51,11 +53,6 @@
#define BIAS_BLOCK_READ(ptr, offset) BLOCK_READN(BIAS_TYPE, TILE_OFM, ptr, offset)
#define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, TILE_OFM, ptr, offset, val)
// Utility math macros.
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
// Check alignment restrictions for using block writes on output.
#define USE_BLOCK_WRITE ((OUTPUT_TYPE_SIZE * TILE_OUT_B_PITCH) % 16 == 0 && (OUTPUT_TYPE_SIZE * OUTPUT_OFFSET) % 16 == 0)
@@ -80,7 +77,7 @@
# define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
#endif
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
KERNEL(fc)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -122,9 +119,8 @@ KERNEL(fc)(
INPUT0_TYPE tmp_input = input[input_offset + get_sub_group_local_id() % TILE_B * TILE_IN_B_PITCH];
MAKE_VECTOR_TYPE(FILTER_TYPE, TILE_OFM) tmp_wei = BLOCK_READN(FILTER_TYPE, TILE_OFM, weights, weights_offset);
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
acc[bi] = intel_sub_group_shuffle(tmp_input, bi) * tmp_wei;
unroll_for(uint bi = 0; bi < TILE_B; ++bi) {
acc[bi] = _sub_group_shuffle(tmp_input, bi) * tmp_wei;
}
weights_offset += TILE_OFM * SIMD;
@@ -148,19 +144,15 @@ KERNEL(fc)(
// NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes,
// but significantly degrades readability and generality of code.
// It doesn't also show noticable performance improvement on tested configurations.
__attribute__((opencl_unroll_hint))
for (uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) {
unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) {
wei = FILTER_BLOCK_READ(weights, weights_offset);
weights_offset += TILE_K_OFM * SIMD;
__attribute__((opencl_unroll_hint))
for (uint kii = 0; kii < TILE_K; ++kii) {
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < TILE_OFM; ++fi) {
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint kii = 0; kii < TILE_K; ++kii) {
unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
const uint total_k = ki * TILE_K + kii;
INPUT0_TYPE in_val = intel_sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((FILTER_TYPE*)(&wei))[kii * TILE_OFM + fi];
}
}
@@ -181,20 +173,16 @@ KERNEL(fc)(
CONST_LOOP(TILE_B, LOAD_IN_0);
#undef LOAD_IN_0
input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * TILE_B;
__attribute__((opencl_unroll_hint))
for (uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) {
unroll_for(uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) {
wei = FILTER_BLOCK_READ(weights, weights_offset);
weights_offset += TILE_K_OFM * SIMD;
__attribute__((opencl_unroll_hint))
for (uint kii = 0; kii < TILE_K; ++kii) {
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < TILE_OFM; ++fi) {
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint kii = 0; kii < TILE_K; ++kii) {
unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
const uint total_k = ki * TILE_K + kii;
if (total_k < LEFTOVER_IFM) {
INPUT0_TYPE in_val = intel_sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((FILTER_TYPE*)(&wei))[kii * TILE_OFM + fi];
}
}
@@ -216,24 +204,20 @@ KERNEL(fc)(
BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f);
#else
BIAS_VEC_TYPE bias = 0;
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
((BIAS_TYPE*)(&bias))[fi] = biases[out_f + sglid + fi * SIMD];
}
#endif
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
activated[bi] += TO_ACTIVATION_VEC_TYPE(bias);
}
#endif
OUTPUT_VEC_TYPE result[TILE_B] = { };
#if HAS_FUSED_OPS
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
#if TILE_OFM > 1
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
FUSED_OPS_VEC;
result[bi][fi] = FUSED_OPS_RESULT_VEC;
}
@@ -243,8 +227,7 @@ KERNEL(fc)(
#endif // TILE_OFM > 1
}
#else
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
result[bi] = TO_OUTPUT_VEC_TYPE(ACTIVATION_TYPED(activated[bi], ACTIVATION_PARAMS_TYPED));
}
#endif
@@ -314,10 +297,6 @@ KERNEL(fc)(
#undef BIAS_BLOCK_READ
#undef OUTPUT_BLOCK_WRITE
#undef CEIL_DIV
#undef MIN
#undef MAX
#undef USE_BLOCK_WRITE
#undef MAIN_LOOP_ELEMENTS_COUNT

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"

View File

@@ -2,12 +2,13 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
// Block read - currently block is 4 bytes aligned.
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_16x8(_result, _blockA, _blockB) \
{ \
@@ -32,8 +33,8 @@
#define SUB_GROUP_SIZE 16
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv16_vload)(
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL (fully_connected_gpu_bs_f_bsv16_af8_vload)(
const __global UNIT_TYPE* input,
__global UNIT_TYPE* output,
const __global UNIT_TYPE* weight

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
// ---------------------------------------------------------------------------------------------------------------------
@@ -93,7 +95,7 @@
// Extracts one scalar element of UNIT_TYPE from sub-group chunk;
// chunk - name of chunk variable, idx - 0-based index of element.
#define SG_UNIT_SELECT(chunk, idx) CHUNK_UNIT_SELECT(intel_sub_group_shuffle(chunk, (idx) / UNITS_PER_CHUNK), (idx) % UNITS_PER_CHUNK)
#define SG_UNIT_SELECT(chunk, idx) CHUNK_UNIT_SELECT(_sub_group_shuffle(chunk, (idx) / UNITS_PER_CHUNK), (idx) % UNITS_PER_CHUNK)
// ---------------------------------------------------------------------------------------------------------------------
// Reads / Writes:
@@ -118,10 +120,10 @@
(array)[(idx) + 6] = chunk_vec.s6, (array)[(idx) + 7] = chunk_vec.s7))
// Currently block read is 4 bytes aligned.
#define ALIGNED_BLOCK_READ1(ptr, byte_offset) intel_sub_group_block_read((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) intel_sub_group_block_read2((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ4(ptr, byte_offset) intel_sub_group_block_read4((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) intel_sub_group_block_read8((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ1(ptr, byte_offset) _sub_group_block_read((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) _sub_group_block_read2((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ4(ptr, byte_offset) _sub_group_block_read4((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) _sub_group_block_read8((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
// Currently read is 4 bytes aligned.
#define ALIGNED_READ1(ptr, byte_offset) (*(const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
@@ -130,10 +132,10 @@
#define ALIGNED_READ8(ptr, byte_offset) vload8(0, (const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
// Currently block write is 16 bytes aligned.
#define ALIGNED_BLOCK_WRITE1(ptr, byte_offset, val) intel_sub_group_block_write((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE2(ptr, byte_offset, val) intel_sub_group_block_write2((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE4(ptr, byte_offset, val) intel_sub_group_block_write4((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write8((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE1(ptr, byte_offset, val) _sub_group_block_write((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE2(ptr, byte_offset, val) _sub_group_block_write2((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE4(ptr, byte_offset, val) _sub_group_block_write4((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) _sub_group_block_write8((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
// Currently block write is 4 bytes aligned.
#define ALIGNED_WRITE1(ptr, byte_offset, val) ((void)(*(__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)) = (val)))
@@ -156,7 +158,7 @@
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
const __global UNIT_TYPE* input,
@@ -210,32 +212,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
CHUNK_TYPE input_val[IN_CHUNK_PREFETCH_SIZE];
#if IN_CHUNK_PREFETCH_SIZE % 8 == 0
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 8)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 8)
{
CHUNK_VEC8_TYPE input_vals = ALIGNED_BLOCK_READ8(input, input_offset + 8 * sg_elem_offset);
input_offset += 8 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#elif IN_CHUNK_PREFETCH_SIZE % 4 == 0
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 4)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 4)
{
CHUNK_VEC4_TYPE input_vals = ALIGNED_BLOCK_READ4(input, input_offset + 4 * sg_elem_offset);
input_offset += 4 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#elif IN_CHUNK_PREFETCH_SIZE % 2 == 0
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 2)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 2)
{
CHUNK_VEC2_TYPE input_vals = ALIGNED_BLOCK_READ2(input, input_offset + 2 * sg_elem_offset);
input_offset += 2 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#else
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 1)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 1)
{
CHUNK_VEC1_TYPE input_vals = ALIGNED_BLOCK_READ1(input, input_offset + sg_elem_offset);
input_offset += BYTES_PER_SG_READ;
@@ -243,8 +241,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
}
#endif
__attribute__((opencl_unroll_hint))
for (uint elem_base_idx = 0; elem_base_idx < IN_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
unroll_for(uint elem_base_idx = 0; elem_base_idx < IN_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
{
// Contains group of weights for RESPONSES_PER_SG_EXEC responses and for (FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC) spatial points.
// Currently for floats:
@@ -264,32 +261,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
CHUNK_TYPE filter_val[FILTER_CHUNK_PREFETCH_SIZE];
#if FILTER_CHUNK_PREFETCH_SIZE % 8 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 8)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 8)
{
CHUNK_VEC8_TYPE filter_vals = ALIGNED_BLOCK_READ8(weight, filter_offset + 8 * sg_elem_offset);
filter_offset += 8 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#elif FILTER_CHUNK_PREFETCH_SIZE % 4 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 4)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 4)
{
CHUNK_VEC4_TYPE filter_vals = ALIGNED_BLOCK_READ4(weight, filter_offset + 4 * sg_elem_offset);
filter_offset += 4 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#elif FILTER_CHUNK_PREFETCH_SIZE % 2 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 2)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 2)
{
CHUNK_VEC2_TYPE filter_vals = ALIGNED_BLOCK_READ2(weight, filter_offset + 2 * sg_elem_offset);
filter_offset += 2 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#else
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 1)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 1)
{
CHUNK_VEC1_TYPE filter_vals = ALIGNED_BLOCK_READ1(weight, filter_offset + sg_elem_offset);
filter_offset += BYTES_PER_SG_READ;
@@ -298,8 +291,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
#endif
// Processing of cached filter chunks.
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; ++filter_val_idx)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; ++filter_val_idx)
{
const uint input_base_elem_idx = elem_base_idx + filter_val_idx * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC;
@@ -338,32 +330,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
CHUNK_TYPE input_val[IN_CHUNK_PREFETCH_SIZE];
#if IN_CHUNK_PREFETCH_SIZE % 8 == 0 && (IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE % 8 == 0 || IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE >= 16)
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 8)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 8)
{
CHUNK_VEC8_TYPE input_vals = ALIGNED_BLOCK_READ8(input, input_offset + 8 * sg_elem_offset);
input_offset += 8 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#elif IN_CHUNK_PREFETCH_SIZE % 4 == 0 && (IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE % 4 == 0 || IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE >= 8)
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 4)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 4)
{
CHUNK_VEC4_TYPE input_vals = ALIGNED_BLOCK_READ4(input, input_offset + 4 * sg_elem_offset);
input_offset += 4 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#elif IN_CHUNK_PREFETCH_SIZE % 2 == 0 && (IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE % 2 == 0 || IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE >= 4)
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 2)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 2)
{
CHUNK_VEC2_TYPE input_vals = ALIGNED_BLOCK_READ2(input, input_offset + 2 * sg_elem_offset);
input_offset += 2 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#else
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 1)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 1)
{
CHUNK_VEC1_TYPE input_vals = ALIGNED_BLOCK_READ1(input, input_offset + sg_elem_offset);
input_offset += BYTES_PER_SG_READ;
@@ -371,8 +359,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
}
#endif
__attribute__((opencl_unroll_hint))
for (uint elem_base_idx = 0; elem_base_idx < INPUT0_ELEMENTS_REMAINDER; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
unroll_for(uint elem_base_idx = 0; elem_base_idx < INPUT0_ELEMENTS_REMAINDER; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
{
// Size of array of CHUNK_TYPE needed to contain filter elements for input elements in range [elem_base_idx; INPUT0_ELEMENTS_REMAINDER).
const uint filter_chunk_remainder_size = ((INPUT0_ELEMENTS_REMAINDER - elem_base_idx) * RESPONSES_PER_SG_EXEC + UNITS_PER_SG_READ - 1) / UNITS_PER_SG_READ;
@@ -381,32 +368,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
CHUNK_TYPE filter_val[FILTER_CHUNK_PREFETCH_SIZE];
#if FILTER_CHUNK_PREFETCH_SIZE % 8 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 8)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 8)
{
CHUNK_VEC8_TYPE filter_vals = ALIGNED_BLOCK_READ8(weight, filter_offset + 8 * sg_elem_offset);
filter_offset += 8 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#elif FILTER_CHUNK_PREFETCH_SIZE % 4 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 4)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 4)
{
CHUNK_VEC4_TYPE filter_vals = ALIGNED_BLOCK_READ4(weight, filter_offset + 4 * sg_elem_offset);
filter_offset += 4 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#elif FILTER_CHUNK_PREFETCH_SIZE % 2 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 2)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 2)
{
CHUNK_VEC2_TYPE filter_vals = ALIGNED_BLOCK_READ2(weight, filter_offset + 2 * sg_elem_offset);
filter_offset += 2 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#else
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 1)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 1)
{
CHUNK_VEC1_TYPE filter_vals = ALIGNED_BLOCK_READ1(weight, filter_offset + sg_elem_offset);
filter_offset += BYTES_PER_SG_READ;
@@ -415,8 +398,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
#endif
// Processing of cached filter chunks.
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; ++filter_val_idx)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; ++filter_val_idx)
{
const uint input_base_elem_idx = elem_base_idx + filter_val_idx * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC;
@@ -458,15 +440,14 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
sg_reduce_offset < SUB_GROUP_SIZE;
sg_reduce_offset += SUB_GROUP_SIZE * RESPONSES_PER_SG_EXEC / UNITS_PER_SG_READ)
{
reduced_acc = AS_CHUNK(AS_UNITS(reduced_acc) + AS_UNITS(intel_sub_group_shuffle_down(acc, zero, sg_reduce_offset)));
reduced_acc = AS_CHUNK(AS_UNITS(reduced_acc) + AS_UNITS(_sub_group_shuffle_down(acc, zero, sg_reduce_offset)));
}
// Expand accumulator chunks to units.
const uint expanded_acc_size = (RESPONSES_PER_SG_EXEC + SUB_GROUP_SIZE - 1) / SUB_GROUP_SIZE;
__attribute__((opencl_unroll_hint))
for (uint expanded_acc_idx = 0; expanded_acc_idx < expanded_acc_size; ++expanded_acc_idx)
unroll_for (uint expanded_acc_idx = 0; expanded_acc_idx < expanded_acc_size; ++expanded_acc_idx)
{
const uint output_id = output_base_id + expanded_acc_idx * SUB_GROUP_SIZE;
#if BIAS_TERM

View File

@@ -2,13 +2,12 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
#if FP16_UNIT_USED
// Block read - currently block is 4 bytes aligned.
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
{ \
@@ -31,7 +30,7 @@
}
#else
// Block read - currently block is 4 bytes aligned.
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
{ \
@@ -57,7 +56,7 @@
#define SUB_GROUP_SIZE 8
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv8_vload)(
const __global UNIT_TYPE* input,
__global UNIT_TYPE* output,

View File

@@ -1,64 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
// Fully-connected primitive for float input in xb layout producing xb output.
// Each work-item accumulates NEURONS_PER_WORK_ITEM output neurons (in one or
// two float8 accumulators) for a single batch element. Requires a work-group
// of exactly 8 work-items (one sub-group) so each intel_sub_group_block_write8
// at the end stores 8 consecutive neurons' results.
__attribute__((reqd_work_group_size(8, 1, 1)))
KERNEL (fully_connected_gpu_xb_xb_b8_x8)(
const __global float* input,   // activations, xb layout: [INPUT0_ELEMENTS_COUNT x INPUT0_BATCH_NUM]
__global float* output,        // results, xb layout
const __global float* weight   // weights, stride FILTER_OFM_NUM per input element
#if BIAS_TERM
, __global UNIT_TYPE* bias)    // one bias value per output neuron (only when BIAS_TERM)
#else
)
#endif
{
const uint global_id = get_global_id(0);
const int x = get_global_id(0);
// Batch element handled by this work-item (input is batch-minor, xb layout).
const uint batch_id = x % INPUT0_BATCH_NUM;
// First output neuron index for this work-item's block of neurons.
uint neuronIdx = (x / INPUT0_BATCH_NUM) * NEURONS_PER_WORK_ITEM;
const uint sub_group_id = get_local_id(0);
const uint batch_num = INPUT0_BATCH_NUM;
// Offset of this work-item's first output value (xb layout: neuron-major,
// batch-minor, NEURONS_PER_WORK_ITEM neurons per work-item).
const int out_id = (global_id / batch_num) * NEURONS_PER_WORK_ITEM * batch_num + batch_id;
// NOTE(review): ofm_offset is computed but never used in this kernel body.
const int ofm_offset = (global_id * NEURONS_PER_WORK_ITEM) / batch_num;
// Accumulators: one float8 per group of 8 neurons.
float8 _data0 = 0.f;
#if NEURONS_PER_WORK_ITEM > 8
float8 _data1 = 0.f;
#endif
uint weight_offset = sub_group_id + neuronIdx;
// Multiply-accumulate over all input features. DOT_PRODUCT_8 is a project
// macro (presumably from include/sub_group.cl — confirm) that folds one
// input scalar against 8 weights into the float8 accumulator.
for (uint h = 0; h < INPUT0_ELEMENTS_COUNT; h++)
{
DOT_PRODUCT_8(_data0, input[h * batch_num + batch_id], weight[weight_offset])
#if NEURONS_PER_WORK_ITEM > 8
DOT_PRODUCT_8(_data1, input[h * batch_num + batch_id], weight[weight_offset + 8])
#endif
// Advance to the same neurons' weights for the next input element.
weight_offset += FILTER_OFM_NUM;
}
#if BIAS_TERM
// Add per-neuron bias to each accumulated block.
ADD_BIAS_8(_data0, bias[neuronIdx + sub_group_id]);
#if NEURONS_PER_WORK_ITEM > 8
ADD_BIAS_8(_data1, bias[neuronIdx + sub_group_id + 8]);
#endif
#endif
// Apply the fused activation before writing results out.
_data0 = ACTIVATION(_data0, ACTIVATION_PARAMS);
#if NEURONS_PER_WORK_ITEM > 8
_data1 = ACTIVATION(_data1, ACTIVATION_PARAMS);
#endif
// Sub-group block write: each call stores 8 neurons' float results
// (bit-cast to uint8 as required by the block-write builtin).
intel_sub_group_block_write8((__global uint*)output + out_id, as_uint8(_data0));
#if NEURONS_PER_WORK_ITEM > 8
intel_sub_group_block_write8((__global uint*)output + out_id + 8 * batch_num, as_uint8(_data1));
#endif
}

Some files were not shown because too many files have changed in this diff Show More