[GPU] Permute f and y axes (#13561)

* [GPU] Permute f and y axes

It supports cases where the y and f (and x, if it is not equal to 1) axis sizes are divisible by 4, 8 or 16.
added kernel to switch f and y axes in 4d model blocked and planar formats
added tests

* Added subgroup read/write to THREE_DIM_TRANSPOSE kernel case.

* Better checking of whether the SIMD size is supported.

* Added support for long type to subgroup read/write.

* Added subgroup read/write support to 2d permute.

* Fixed win build issue.

* Changed f and y indexes in iteration.

* Added vector read/write.

* Fixed j_times calculation.

* Better naming.

* Rollback test logic.

* Fixed fusion logic.

* Accept only supported blocked layouts and SIMD sizes.

---------

Co-authored-by: Mykhailo Hnap <mykhailo.hnap@capgemini.com>
Co-authored-by: Wilson Seok <wilson.seok@intel.com>
This commit is contained in:
OlehKravchyshyn 2023-07-18 21:48:25 +03:00 committed by GitHub
parent 568d3371d7
commit f0abd468a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 706 additions and 2 deletions

View File

@ -52,6 +52,8 @@
#define TYPE_SIZE_int 4
#define TYPE_SIZE_uint 4
#define TYPE_SIZE_float 4
#define TYPE_SIZE_ulong 8
#define TYPE_SIZE_long 8
#define TYPE_SIZE(type) CAT(TYPE_SIZE_, type)
#ifdef cl_intel_required_subgroup_size

View File

@ -34,17 +34,20 @@
#define BLOCK_READ_TYPE_size1 uchar
#define BLOCK_READ_TYPE_size2 ushort
#define BLOCK_READ_TYPE_size4 uint
#define BLOCK_READ_TYPE_size8 ulong
#define BLOCK_READ_TYPE(type_size) CAT(BLOCK_READ_TYPE_size, type_size)
#define BLOCK_READ_FUNC_size1 _sub_group_block_read_uc
#define BLOCK_READ_FUNC_size2 _sub_group_block_read_us
#define BLOCK_READ_FUNC_size4 _sub_group_block_read
#define BLOCK_READ_FUNC_size8 _sub_group_block_read_ul
#define BLOCK_READ_FUNC(type_size) CAT(BLOCK_READ_FUNC_size, type_size)
#define BLOCK_READN_FUNC_SIZE_DEF(type_size, vector_size) MAKE_VECTOR_TYPE(BLOCK_READ_FUNC(type_size), vector_size)
#define BLOCK_READN_FUNC_size1(vector_size) BLOCK_READN_FUNC_SIZE_DEF(1, vector_size)
#define BLOCK_READN_FUNC_size2(vector_size) BLOCK_READN_FUNC_SIZE_DEF(2, vector_size)
#define BLOCK_READN_FUNC_size4(vector_size) BLOCK_READN_FUNC_SIZE_DEF(4, vector_size)
#define BLOCK_READN_FUNC_size8(vector_size) BLOCK_READN_FUNC_SIZE_DEF(8, vector_size)
#define BLOCK_READN_FUNC(type_size, vector_size) CAT(BLOCK_READN_FUNC_size, type_size)(vector_size)
#define BLOCK_READN_RAW(type_size, vector_size, addr_space, ptr, offset) \
@ -151,3 +154,15 @@
DECLARE_BLOCK_READ_EMULATION(1, 8)
DECLARE_BLOCK_READ_EMULATION(1, 16)
#endif
#if defined(cl_intel_subgroups_long)
#define _sub_group_block_read_ul(ptr) intel_sub_group_block_read_ul(ptr)
#define _sub_group_block_read_ul2(ptr) intel_sub_group_block_read_ul2(ptr)
#define _sub_group_block_read_ul4(ptr) intel_sub_group_block_read_ul4(ptr)
#define _sub_group_block_read_ul8(ptr) intel_sub_group_block_read_ul8(ptr)
#elif (__OPENCL_C_VERSION__ >= 200)
DECLARE_BLOCK_READ_EMULATION(8, 1)
DECLARE_BLOCK_READ_EMULATION(8, 2)
DECLARE_BLOCK_READ_EMULATION(8, 4)
DECLARE_BLOCK_READ_EMULATION(8, 8)
#endif

View File

@ -32,17 +32,20 @@
#define BLOCK_WRITE_TYPE_size1 uchar
#define BLOCK_WRITE_TYPE_size2 ushort
#define BLOCK_WRITE_TYPE_size4 uint
#define BLOCK_WRITE_TYPE_size8 ulong
#define BLOCK_WRITE_TYPE(type_size) CAT(BLOCK_WRITE_TYPE_size, type_size)
#define BLOCK_WRITE_FUNC_size1 _sub_group_block_write_uc
#define BLOCK_WRITE_FUNC_size2 _sub_group_block_write_us
#define BLOCK_WRITE_FUNC_size4 _sub_group_block_write
#define BLOCK_WRITE_FUNC_size8 _sub_group_block_write_ul
#define BLOCK_WRITE_FUNC(type_size) CAT(BLOCK_WRITE_FUNC_size, type_size)
#define BLOCK_WRITEN_FUNC_SIZE_DEF(type_size, vector_size) MAKE_VECTOR_TYPE(BLOCK_WRITE_FUNC(type_size), vector_size)
#define BLOCK_WRITEN_FUNC_size1(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(1, vector_size)
#define BLOCK_WRITEN_FUNC_size2(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(2, vector_size)
#define BLOCK_WRITEN_FUNC_size4(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(4, vector_size)
#define BLOCK_WRITEN_FUNC_size8(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(8, vector_size)
#define BLOCK_WRITEN_FUNC(type_size, vector_size) CAT(BLOCK_WRITEN_FUNC_size, type_size)(vector_size)
#define BLOCK_WRITEN_RAW(type_size, vector_size, addr_space, ptr, offset, val) \
@ -133,3 +136,15 @@
DECLARE_BLOCK_WRITE_EMULATION(1, 8)
DECLARE_BLOCK_WRITE_EMULATION(1, 16)
#endif
#if defined(cl_intel_subgroups_long)
#define _sub_group_block_write_ul(ptr, v) intel_sub_group_block_write_ul(ptr, v)
#define _sub_group_block_write_ul2(ptr, v) intel_sub_group_block_write_ul2(ptr, v)
#define _sub_group_block_write_ul4(ptr, v) intel_sub_group_block_write_ul4(ptr, v)
#define _sub_group_block_write_ul8(ptr, v) intel_sub_group_block_write_ul8(ptr, v)
#elif (__OPENCL_C_VERSION__ >= 200)
DECLARE_BLOCK_WRITE_EMULATION(8, 1)
DECLARE_BLOCK_WRITE_EMULATION(8, 2)
DECLARE_BLOCK_WRITE_EMULATION(8, 4)
DECLARE_BLOCK_WRITE_EMULATION(8, 8)
#endif

View File

@ -0,0 +1,182 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// permute_f_y_axes: swaps the feature (f) and y axes of a 4D tensor
// (b, f, y, x -> b, y, f, x). Exactly one of three variants is compiled,
// selected by host-side JIT macros (see PermuteKernel_f_y_axes::GetJitConstants):
//   * PERMUTE_SIMPLE_MEM_COPY - planar bfyx input with X > 1: x stays the
//     innermost axis on both sides, so each work-item vector-copies runs of
//     x elements to the f/y-swapped output offset.
//   * THREE_DIM_TRANSPOSE     - blocked layout with X > 1: tiles are staged
//     through local memory using subgroup block reads/writes.
//   * (default)               - 2D f<->y transpose through a
//     TILE_SIZE x TILE_SIZE local-memory tile (used when X == 1).
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
// vloadN/vstoreN helpers with N == VEC_SIZE.
#define READ_VEC(offset, ptr) CAT(vload, VEC_SIZE)(offset, ptr)
#define WRITE_VEC(val, offset, ptr) CAT(vstore, VEC_SIZE)(val, offset, ptr)
// Vector types (width VEC_SIZE) for input, accumulator and output elements.
#define IN_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE)
#define TO_IN_VEC_TYPE(x) CAT(convert_, IN_VEC_TYPE)(x)
#define ACC_VEC_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, VEC_SIZE)
#define TO_ACC_VEC_TYPE(x) CAT(convert_, ACC_VEC_TYPE)(x)
#define OUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)
#define TO_OUT_VEC_TYPE(x) CAT(convert_, OUT_VEC_TYPE)(x)
#if defined (PERMUTE_SIMPLE_MEM_COPY)
// Planar bfyx, X > 1: copy BLOCK_SIZE x-elements per work-item in VEC_SIZE
// chunks, exchanging only the f and y coordinates in the output index.
KERNEL (permute_f_y_axes)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
// gws = (x / BLOCK_SIZE, y, b * f); see GetGWS() on the host side.
const int bf = get_global_id(2);
const int f_idx = bf % INPUT0_FEATURE_NUM;
const int b_idx = bf / INPUT0_FEATURE_NUM;
const int x_start = get_global_id(0) * BLOCK_SIZE;
const int y_idx = get_global_id(1);
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int x_idx = x_start + j * VEC_SIZE;
IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
#if HAS_FUSED_OPS
FUSED_OPS_VEC;
OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
#else
OUT_VEC_TYPE result = ACTIVATION(res, ACTIVATION_PARAMS);
#endif
// f and y are swapped relative to the input index.
const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f_idx, x_idx);
WRITE_VEC(result, 0, &output[output_idx]);
}
}
#elif defined (THREE_DIM_TRANSPOSE)
// Blocked layout, X > 1: one work-group (a single subgroup of SUB_GROUP_SIZE
// work-items along dim 2) transposes a
// FEATURE_BLOCK_SIZE x FEATURE_BLOCK_SIZE x TILE_SIZE (f, y, x) tile
// through local memory.
#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
#endif
KERNEL (permute_f_y_axes)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
// First two indices are the f-tile/y-tile coordinates; they are exchanged
// between the load phase and the store phase below.
__local OUTPUT_TYPE transpose_buf[FEATURE_BLOCK_SIZE][FEATURE_BLOCK_SIZE][TILE_SIZE];
const int bf = get_global_id(2);
const int f_idx = bf % INPUT0_FEATURE_NUM;
const int b_idx = bf / INPUT0_FEATURE_NUM;
int bf_local = get_local_id(2);
// NOTE(review): y_local is unused in this kernel.
int y_local = get_local_id(1);
const int x_begin = get_global_id(0) * TILE_SIZE;
const int y_begin = get_global_id(1) * FEATURE_BLOCK_SIZE;
const int f_begin = get_local_size(2) * get_group_id(2);
// Phase 1: block-read the input tile into local memory. The per-lane index
// is rebased by get_sub_group_local_id() because the subgroup block read
// adds the lane offset itself.
__attribute__((opencl_unroll_hint(FEATURE_BLOCK_SIZE)))
for (int j = 0; j < FEATURE_BLOCK_SIZE; ++j) {
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int i = 0; i < TILE_SIZE; ++i) {
const int x_idx = x_begin + i;
const int y_idx = y_begin + j;
const uint input_offset = INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx) - get_sub_group_local_id();
INPUT0_TYPE res = DT_INPUT_BLOCK_READ(input, input_offset);
#if HAS_FUSED_OPS
FUSED_OPS;
transpose_buf[bf_local][j][i] = FUSED_OPS_RESULT;
#else
transpose_buf[bf_local][j][i] = ACTIVATION(res, ACTIVATION_PARAMS);
#endif
}
}
// Phase 2: block-write the tile back with f and y exchanged.
// NOTE(review): no barrier between the two phases; this appears to rely on
// the work-group being a single subgroup in lockstep -- confirm whether a
// sub_group_barrier(CLK_LOCAL_MEM_FENCE) is required.
__attribute__((opencl_unroll_hint(FEATURE_BLOCK_SIZE)))
for (int j = 0; j < FEATURE_BLOCK_SIZE; ++j) {
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int i = 0; i < TILE_SIZE; ++i) {
const int x_idx = x_begin + i;
const int f = f_begin + j;
const int y_idx = y_begin + bf_local;
const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local][i]);
}
}
}
#else
// Default variant (X == 1): 2D f<->y transpose through a TILE_SIZE x
// TILE_SIZE local tile. The +1 column is presumably padding to avoid
// local-memory bank conflicts -- TODO confirm.
#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
#endif
KERNEL (permute_f_y_axes)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
__local OUTPUT_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1];
const int bf = get_global_id(2);
const int b_idx = bf / INPUT0_FEATURE_NUM;
const int f_idx = bf % INPUT0_FEATURE_NUM;
const int bf_local = get_local_id(2);
const int x_idx = get_global_id(0);
const int y_begin = get_global_id(1) * TILE_SIZE;
const int f_begin = get_local_size(2) * get_group_id(2);
#if INPUT0_SIMPLE == 1
// Planar input: vector loads along y, scattered stores into the tile.
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int j_vec = j * VEC_SIZE;
const int y_idx = y_begin + j_vec;
IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
#if HAS_FUSED_OPS
FUSED_OPS_VEC;
OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
for (int k = 0; k < VEC_SIZE; ++k) {
transpose_buf[j_vec + k][bf_local] = result[k];
}
#else
for (int k = 0; k < VEC_SIZE; ++k) {
transpose_buf[j_vec + k][bf_local] = ACTIVATION(res[k], ACTIVATION_PARAMS);
}
#endif
}
// Write rows of the transposed tile back as vectors.
// NOTE(review): as in THREE_DIM_TRANSPOSE, no barrier separates the phases.
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int j_vec = j * VEC_SIZE;
const int f = f_begin + j_vec;
const int y_idx = y_begin + bf_local;
const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx);
WRITE_VEC(READ_VEC(0, &transpose_buf[bf_local][j_vec]), 0, &output[output_idx]);
}
#else
// Blocked input: subgroup block reads/writes, rebased to lane 0 as above.
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int j = 0; j < TILE_SIZE; ++j) {
const int y_idx = y_begin + j;
const uint input_offset = INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx) - get_sub_group_local_id();
INPUT0_TYPE res = DT_INPUT_BLOCK_READ(input, input_offset);
#if HAS_FUSED_OPS
FUSED_OPS;
transpose_buf[bf_local][j] = FUSED_OPS_RESULT;
#else
transpose_buf[bf_local][j] = ACTIVATION(res, ACTIVATION_PARAMS);
#endif
}
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int j = 0; j < TILE_SIZE; ++j) {
const int f = f_begin + j;
const int y_idx = y_begin + bf_local;
const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local]);
}
#endif
}
#endif // #if defined (PERMUTE_SIMPLE_MEM_COPY)

View File

@ -0,0 +1,246 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "permute_kernel_f_y_axes.h"
#include <algorithm>
#include <cmath>
#include <string>
#include "common_tools.h"
#include "kernel_selector_utils.h"
namespace kernel_selector {
namespace {
constexpr size_t cSimpleMemCopyOpDivider = 4UL;
constexpr size_t c3DTransposeBufHeight = 4UL;
// Returns the largest supported divisor of input_size, chosen from
// {16, 8, 4}; falls back to 1 (meaning "no supported tile size") otherwise.
size_t GetDivisor(const size_t input_size) {
    const size_t candidates[] = {16, 8, 4};
    for (const auto candidate : candidates) {
        if (input_size % candidate == 0) {
            return candidate;
        }
    }
    return 1;
}
// True for planar bfyx input with X > 1: the f<->y swap then reduces to
// copying whole x-rows (PERMUTE_SIMPLE_MEM_COPY kernel variant).
bool IsSimpleMemCopyOperation(const permute_params& params) {
return params.inputs[0].X().v > 1 && params.inputs[0].GetLayout() == DataLayout::bfyx;
}
// True for a blocked (non-bfyx) layout with X > 1; handled by the
// THREE_DIM_TRANSPOSE kernel variant.
bool Is3DTranspose(const permute_params& params) {
return params.inputs[0].X().v > 1 && params.inputs[0].GetLayout() != DataLayout::bfyx;
}
// Largest supported divisor (16/8/4/1) common to both the feature and y sizes.
size_t GetFeatureBlockSize(const permute_params& params) {
return std::min(GetDivisor(params.inputs[0].Feature().v), GetDivisor(params.inputs[0].Y().v));
}
// Tile height: the smallest supported divisor over all participating axes
// (f and y always; x as well when x > 1), additionally capped at
// c3DTransposeBufHeight for the 3D-transpose variant.
size_t GetTileHeight(const permute_params& params) {
    const auto& input = params.inputs[0];
    size_t min_divisor = std::min(GetDivisor(input.Feature().v), GetDivisor(input.Y().v));
    if (input.X().v != 1) {
        min_divisor = std::min(min_divisor, GetDivisor(input.X().v));
    }
    return Is3DTranspose(params) ? std::min(min_divisor, c3DTransposeBufHeight) : min_divisor;
}
// Tile width in elements for one work-item, scaled by element size so that
// tiles cover a comparable number of bytes across data types.
size_t GetTileWidth(const permute_params& params) {
const Datatype input_type = params.inputs[0].GetDType();
const Datatype output_type = params.outputs[0].GetDType();
size_t min_divisor = GetTileHeight(params);
if (IsSimpleMemCopyOperation(params)) {
min_divisor = std::min(min_divisor, cSimpleMemCopyOpDivider);
}
// i64 only supports tile size 4
// NOTE(review): the halving below applies to whatever divisor was computed
// above rather than clamping to a fixed 4 -- confirm that matches the comment.
if ((input_type == Datatype::INT64) || (output_type == Datatype::INT64)) {
min_divisor = min_divisor / 2;
}
// Wider tiles for narrower elements: x2 for fp16, x4 for 8-bit types.
if (input_type == Datatype::F16) {
min_divisor = min_divisor * 2;
}
if (input_type == Datatype::INT8 || input_type == Datatype::UINT8) {
min_divisor = min_divisor * 4;
}
// When X == 1 the tile's "width" axis is y instead of x.
if (params.inputs[0].X().v == 1) {
return std::min(params.inputs[0].Y().v, min_divisor);
}
return std::min(params.inputs[0].X().v, min_divisor);
}
// Final tile edge length: the smaller of tile height and tile width.
size_t GetTileSize(const permute_params& params) {
return std::min(GetTileHeight(params), GetTileWidth(params));
}
} // namespace
// Declares the data types, layouts and tensor features this kernel can handle.
// NOTE(review): b_fs_yx_fsv4 is enabled here, but Validate() only accepts
// fsv16/fsv32 among blocked layouts -- confirm whether fsv4 should be listed.
ParamsKey PermuteKernel_f_y_axes::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableDifferentTypes();
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
return k;
}
// Emits the JIT macros consumed by permute_f_y_axes.cl: the variant selector
// (PERMUTE_SIMPLE_MEM_COPY / THREE_DIM_TRANSPOSE / neither), BLOCK_SIZE,
// VEC_SIZE (vload/vstore width), J_TIMES (kernel loop trip count), TILE_SIZE,
// FEATURE_BLOCK_SIZE and, for blocked layouts, SUB_GROUP_SIZE.
JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& params,
const CommonDispatchData& dispatchData) const {
auto jit = Parent::GetJitConstants(params, dispatchData);
// Variant selection (matches the #if branches in the kernel source).
if (params.inputs[0].X().v != 1) {
if (IsSimpleMemCopyOperation(params)) {
jit.AddConstant(MakeJitConstant("PERMUTE_SIMPLE_MEM_COPY", ""));
}
if (Is3DTranspose(params)) {
jit.AddConstant(MakeJitConstant("THREE_DIM_TRANSPOSE", ""));
}
}
const size_t tile_width = GetTileWidth(params);
// Vector width is capped at 4 elements.
const size_t vector_size = std::min(tile_width, static_cast<size_t>(4));
const size_t tile_size = GetTileSize(params);
const size_t j_times = tile_size / vector_size;
const size_t feature_block_size = GetFeatureBlockSize(params);
jit.AddConstant(MakeJitConstant("BLOCK_SIZE", tile_width));
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
jit.AddConstant(MakeJitConstant("J_TIMES", j_times));
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
jit.AddConstant(MakeJitConstant("FEATURE_BLOCK_SIZE", feature_block_size));
// Blocked layouts use subgroup block reads/writes and require an explicit
// subgroup size, matching the reqd_work_group_size in the kernel.
const auto layout = params.inputs.front().GetLayout();
if (!SimpleLayout(layout)) {
const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
}
if (!params.fused_ops.empty()) {
// Fused ops index with the input (bfyx) coordinate order; both a scalar
// and a VEC_SIZE-wide configuration are generated.
const std::vector<std::string> original_output_order = {"b_idx", "f_idx", "y_idx", "x_idx"};
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
const FusedOpsConfiguration conf_vec = {"_VEC", original_output_order, "res", params.inputs[0].GetDType(), vector_size};
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar, conf_vec}));
}
return jit;
}
// Global work sizes: dim0 walks x, dim1 walks y, dim2 walks b*f. Each tiled
// dimension is divided by its block size (Validate() guarantees divisibility).
static inline std::vector<size_t> GetGWS(const permute_params& params) {
const auto& in = params.inputs[0];
std::vector<size_t> gws;
auto block_size = IsSimpleMemCopyOperation(params) ? GetTileWidth(params) : GetTileSize(params);
if (params.inputs[0].X().v == 1) {
// 2D f<->y transpose: tile along y.
gws = {in.X().v, in.Y().v / block_size, (in.Batch().v * in.Feature().v)};
} else {
if (Is3DTranspose(params)) {
gws = {in.X().v / block_size, in.Y().v / GetFeatureBlockSize(params), (in.Batch().v * in.Feature().v)};
} else {
gws = {in.X().v / block_size, in.Y().v, (in.Batch().v * in.Feature().v)};
}
}
return gws;
}
// Dispatch setup: GWS from GetGWS(); LWS is optimizer-chosen for the plain
// memcpy variant, otherwise a single subgroup along the b*f dimension to
// match the kernel's reqd_work_group_size(1, 1, SUB_GROUP_SIZE).
CommonDispatchData PermuteKernel_f_y_axes::SetDefault(const permute_params& params) const {
CommonDispatchData dispatchData;
dispatchData.gws = GetGWS(params);
if (IsSimpleMemCopyOperation(params)) {
auto in_layout = params.inputs[0].GetLayout();
auto out_layout = params.outputs[0].GetLayout();
// Maps each gws dimension to tensor channels for the LWS optimizer.
const std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {
{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
dispatchData.lws =
GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
} else if (Is3DTranspose(params)) {
dispatchData.lws = {1, 1, GetFeatureBlockSize(params)};
} else {
dispatchData.lws = {1, 1, GetTileSize(params)};
}
return dispatchData;
}
// Accepts only an exact f<->y swap whose axis sizes admit a supported tile
// size, and (for blocked layouts) only fsv16/fsv32 with a SIMD size the
// device supports.
bool PermuteKernel_f_y_axes::Validate(const Params& p, const optional_params& o) const {
if (!Parent::Validate(p, o)) {
return false;
}
const auto is_swapping_f_with_y = [](const std::vector<uint16_t>& order) {
// Target transform: Swap feature with y
// IE order: 0 2 1 3 => bfyx -> byfx
// cldnn order: 0 3 2 1 => bfxy -> byxf
if (order.size() != 4) {
return false;
}
if (order[0] != 0 || order[1] != 3 || order[2] != 2 || order[3] != 1) {
return false;
}
return true;
};
const auto& params = dynamic_cast<const permute_params&>(p);
const auto& in = params.inputs[0];
const auto in_layout = in.GetLayout();
// f and y must each be divisible by 4, 8 or 16 ...
const auto feature_div = GetDivisor(in.Feature().v);
const auto y_div = GetDivisor(in.Y().v);
if (feature_div == 1 || y_div == 1) {
return false;
}
// ... and so must x whenever it participates (x > 1).
if (in.X().v > 1 && GetDivisor(in.X().v) == 1) {
return false;
}
if (!is_swapping_f_with_y(params.order)) {
return false;
}
// Accept only supported blocked layouts and SIMD sizes.
if (!SimpleLayout(in_layout)) {
const auto feature_block_size = GetFeatureBlockSize(params);
const auto tile_size = GetTileSize(params);
const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
if (!(IsSIMDSizeSupported(params.engineInfo, subgroup_size) &&
(in_layout == DataLayout::b_fs_yx_fsv32 || in_layout == DataLayout::b_fs_yx_fsv16))) {
return false;
}
}
return true;
}
// Fixed priority: preferred over the generic permute implementations whenever
// Validate() accepts the case.
KernelsPriority PermuteKernel_f_y_axes::GetKernelsPriority(const Params& /*params*/,
const optional_params& /*options*/) const {
return FORCE_PRIORITY_3;
}
} // namespace kernel_selector

View File

@ -0,0 +1,29 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "permute_kernel_base.h"
namespace kernel_selector {
// GPU permute kernel specialized for swapping the feature and y axes of a 4D
// tensor (bfyx -> byfx). Tiling heuristics and dispatch setup live in
// permute_kernel_f_y_axes.cpp; device code in permute_f_y_axes.cl.
class PermuteKernel_f_y_axes : public PermuteKernelBase {
public:
using Parent = PermuteKernelBase;
using Parent::Parent;
PermuteKernel_f_y_axes() : PermuteKernelBase("permute_f_y_axes") {}
~PermuteKernel_f_y_axes() override = default;
// Accepts only an exact f<->y swap with tile-friendly axis sizes.
bool Validate(const Params& p, const optional_params& o) const override;
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
JitConstants GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const override;
CommonDispatchData SetDefault(const permute_params& params) const override;
// Ops that may be fused into this kernel.
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return {FusedOpType::ACTIVATION, FusedOpType::QUANTIZE, FusedOpType::ELTWISE};
}
};
} // namespace kernel_selector

View File

@ -7,6 +7,7 @@
#include "permute_kernel_tile_8x8_4x4.h"
#include "permute_kernel_tile_8x8_4x4_fsv.h"
#include "permute_kernel_bfzyx_to_bfyxz.h"
#include "permute_kernel_f_y_axes.h"
namespace kernel_selector {
@ -15,6 +16,7 @@ permute_kernel_selector::permute_kernel_selector() {
Attach<PermuteKernel_tile_8x8_4x4>();
Attach<PermuteKernel_tile_8x8_4x4_fsv>();
Attach<PermuteKernel_bfzyx_to_bfyxz>();
Attach<PermuteKernel_f_y_axes>();
}
KernelsData permute_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {

View File

@ -170,6 +170,9 @@ public:
// permute_bfzyx_to_bfyxz
#define CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0 { 1, 8, 8, 2, 2 }, { 1, 8, 2, 8, 2 }, { 0, 1, 3, 4, 2 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
// permute_f_y_axes
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_0 { 1, 8, 4, 2 }, { 1, 2, 4, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
class permute_activation_scale_eltwise: public PermuteFusingTest {};
TEST_P(permute_activation_scale_eltwise, basic) {
auto p = GetParam();
@ -265,6 +268,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_activation_scale_eltwise, ::testin
// Fusing tests for permute_bfzyx_to_bfyxz
permute_params{ CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0, 2, 5 },
// Fusing tests for permute_f_y_axes
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_0, 2, 5 },
}));
class permute_quant_u8: public PermuteFusingTest {};
@ -470,6 +476,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv, ::t
// Fusing tests for permute_bfzyx_to_bfyxz
permute_params{ CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0, 2, 7 },
// Fusing tests for permute_f_y_axes
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_0, 2, 7 },
}));
/* ------------------------------------------------------------------------------------------------------------ */

View File

@ -24,6 +24,23 @@ using namespace cldnn;
using namespace ::tests;
using namespace testing;
namespace {
// TODO: Move somewhere
// Formats a vector as "(a.b.c)": elements joined with '.' inside parentheses;
// an empty vector yields "()". Used to build readable gtest parameter names.
template <class vecElementType>
std::string vec2str(const std::vector<vecElementType>& vec) {
    std::ostringstream out;
    out << "(";
    for (size_t i = 0; i < vec.size(); ++i) {
        if (i != 0) {
            out << ".";
        }
        out << vec[i];
    }
    out << ")";
    return out.str();
}
} // namespace
TEST(permute_gpu_f32, output_ordering_test)
{
auto& engine = get_test_engine();
@ -1871,6 +1888,13 @@ public:
void run_test(const std::vector<cldnn::tensor::value_type>& sizes, cldnn::format format_fsv,
const std::string & permute_opt = "permute_tile_8x8_4x4_fsv",
std::vector<uint16_t> permute_order = {}, bool is_caching_test = false);
// Builds a readable gtest parameter name from the test parameters, e.g.
// "InputShape=(1.32.256.512)_Format=b_fs_yx_fsv32".
static std::string PrintToStringParamName(const testing::TestParamInfo<TiledPermuteParam>& info) {
std::ostringstream result;
result << "InputShape=" << vec2str(info.param.sizes) << "_";
result << "Format=" << fmt_to_str(info.param.format_fsv);
return result.str();
}
};
template<>
@ -2220,3 +2244,180 @@ TEST_P(permute_tile_fsv_5d, i64_cached) {
auto p = GetParam();
run_test<cldnn::data_types::i64>(p.sizes, p.format_fsv, "permute_tile_8x8_4x4_fsv", {}, true);
}
// Functional coverage for the permute_f_y_axes kernel; each shape/format entry
// targets one kernel variant (noted in the trailing comments).
class permute_f_y_axes_tile: public TiledPermuteTest {};
INSTANTIATE_TEST_SUITE_P(smoke_permute_f_y_axes_tile,
permute_f_y_axes_tile,
::testing::ValuesIn(std::vector<TiledPermuteParam>{
{{1, 4, 8, 1}, format::bfyx}, // permute_f_y_axes
{{1, 64, 32, 1}, format::bfyx}, // permute_f_y_axes
{{1, 32, 256, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE
{{1, 32, 256, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY
{{1, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes
{{1, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE
}),
TiledPermuteTest::PrintToStringParamName);
// Order {0, 2, 1, 3} swaps f and y (bfyx -> byfx) for every supported dtype.
TEST_P(permute_f_y_axes_tile, combined) {
auto p = GetParam();
run_test<cldnn::data_types::f32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::f16>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::u8>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::i8>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::i32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::i64>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
}
// Manual performance-comparison harness: builds two identical reorder+permute
// networks, forces permute_ref on one and the kernel under test on the other,
// then prints averaged device execution times for both.
struct TiledPerformancePermuteTest : TiledPermuteTest
{
// Returns the device "executing" time (microseconds) of primitive_id from
// the last network run.
static double get_exectime(const std::map<cldnn::primitive_id, cldnn::network_output>& outputs,
const std::string& primitive_id)
{
using namespace std::chrono;
std::shared_ptr<event> e = outputs.at(primitive_id).get_event();
e->wait(); // should ensure execution completion, if not segfault will occur
double avg_time = 0.0;
auto intervals = e->get_profiling_info();
for (const auto& q : intervals)
{
// Only the "executing" profiling stage is of interest.
if (q.stage != instrumentation::profiling_stage::executing) {
continue;
}
avg_time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
break;
}
return avg_time;
}
// Debug helper: dumps one profiling interval per primitive of the last run.
static void print_all_perf(std::map<primitive_id, network_output> outputs)
{
std::cout << "Print last run time" << std::endl;
using namespace std::chrono;
for( const auto &n : outputs ) {
std::shared_ptr<event> e = n.second.get_event();
auto intervals = e->get_profiling_info();
double time = 0.0;
for (const auto& q : intervals)
{
// NOTE(review): this condition is inverted relative to get_exectime --
// it skips the "executing" stage and reports the first other interval.
// Confirm whether `!=` was intended here.
if (q.stage == instrumentation::profiling_stage::executing) {
continue;
}
time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
break;
}
std::cout << n.first << ":" << time << std::endl;
}
std::cout << std::endl;
}
// Runs 10 warm-up iterations on both networks, then averages the "output"
// primitive's execution time over 100 measured runs each and prints both.
template<data_types Data_Type>
void execute_perf_test(const std::vector<cldnn::tensor::value_type>& sizes, cldnn::format format_fsv,
const std::string & kernel_name, std::vector<uint16_t> permute_order)
{
auto& engine = get_test_engine();
// convert half_t to FLOAT16
using type_ = typename data_type_to_type<Data_Type>::type;
using type = typename std::conditional<std::is_same<type_, half_t>::value, FLOAT16, type_>::type;
// Swap the last two spatial sizes -- presumably to match cldnn::tensor's
// internal dimension order; TODO confirm against TiledPermuteTest::run_test.
std::vector<cldnn::tensor::value_type> internal_sizes(sizes);
std::swap(internal_sizes.at(2), internal_sizes.back());
cldnn::tensor tensor(internal_sizes);
cldnn::format format = sizes.size() == 4 ? cldnn::format::bfyx : cldnn::format::bfzyx;
// Default order (when none is given) rotates the feature axis to the back.
std::vector<uint16_t> order = {0};
if (permute_order.empty()) {
for (uint16_t i = 1; i < (sizes.size() - 1); ++i) {
order.push_back(i+1);
}
order.push_back(1);
} else {
std::swap(order, permute_order);
}
auto input_ref = engine.allocate_memory({Data_Type, format, tensor});
set_random_values<type>(input_ref);
topology topology_ref = topology(
input_layout("input", input_ref->get_layout()),
reorder("reorder", input_info("input"), {Data_Type, format_fsv, tensor}),
permute("output", input_info("reorder"), order)
);
// run with permute_ref
ExecutionConfig config_ref(ov::enable_profiling(true));
ov::intel_gpu::ImplementationDesc permute_ref = {format_fsv, "permute_ref"};
config_ref.set_property(
ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"output", permute_ref}}));
cldnn::network network_ref(engine, topology_ref, config_ref);
network_ref.set_input_data("input", input_ref);
// run with optimized kernel, e.g. permute_tile_8x8_4x4_fsv16
auto input_opt = engine.allocate_memory({Data_Type, format, tensor});
set_random_values<type>(input_opt);
topology topology_opt = topology(
input_layout("input", input_opt->get_layout()),
reorder("reorder", input_info("input"), {Data_Type, format_fsv, tensor}),
permute("output", input_info("reorder"), order)
);
ExecutionConfig config_tile(ov::enable_profiling(true));
ov::intel_gpu::ImplementationDesc permute_tile_opt = {format_fsv, kernel_name};
config_tile.set_property(
ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"output", permute_tile_opt}}));
cldnn::network network_tile(engine, topology_opt, config_tile);
network_tile.set_input_data("input", input_opt);
// first execution of opt
std::map<primitive_id, network_output> output_permute_ref;
std::map<primitive_id, network_output> output_permute_opt;
for (int i = 0; i < 10; ++i) {
output_permute_ref = network_ref.execute();
output_permute_opt = network_tile.execute();
}
auto r = 100;
double exectime_ref = 0.f;
double exectime_opt = 0.f;
for (int i = 0; i < r; ++i) {
output_permute_opt = network_tile.execute();
auto t_opt = get_exectime(output_permute_opt, "output");
exectime_opt += t_opt;
output_permute_ref = network_ref.execute();
auto t_ref = get_exectime(output_permute_ref, "output");
exectime_ref += t_ref;
}
exectime_ref /= r;
exectime_opt /= r;
std::cout << std::endl;
auto output_layout_ref = network_ref.get_program()->get_node("output").get_output_layout();
auto output_layout_opt = network_tile.get_program()->get_node("output").get_output_layout();
std::string frm_str = cldnn::format(format).to_string();
std::string input_type = data_type_traits::name(Data_Type);
// NOTE(review): "Exectued" typo below is runtime output text, left as-is here.
std::cout << "Exectued time " << " " << "permute_ref" << " " << " input(" << tensor.to_string()
<< ") output(" << output_layout_ref.to_string() << ") "
<< frm_str << " " << input_type << " " << exectime_ref << std::endl;
std::cout << "Exectued time " << " " << kernel_name << " " << " input(" << tensor.to_string()
<< ") output(" << output_layout_opt.to_string() << ") "
<< frm_str << " " << input_type << " " << exectime_opt << std::endl;
}
};
// No need to run performance tests on CI
// DISABLED_ prefix keeps this out of normal runs; execute manually with
// --gtest_also_run_disabled_tests to collect timings.
TEST_P(TiledPerformancePermuteTest, DISABLED_f32) {
auto p = GetParam();
execute_perf_test<cldnn::data_types::f32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
}
// Representative large shapes: 2D transpose cases (X == 1) and 3D cases, in
// both planar and blocked formats.
INSTANTIATE_TEST_SUITE_P(, TiledPerformancePermuteTest,
::testing::ValuesIn(std::vector<TiledPermuteParam> {
// b_fs_zy_fsv16
// normal cases
{{1, 512, 16384, 1}, format::bfyx},
{{1, 512, 16384, 1}, format::b_fs_yx_fsv16},
{{1, 256, 128, 256}, format::bfyx},
{{1, 256, 256, 128}, format::b_fs_yx_fsv16},
}));

View File

@ -273,12 +273,15 @@ void set_random_values(cldnn::memory::ptr mem, bool sign = false, unsigned signi
// Fills an integral-typed GPU buffer with uniformly distributed random values
// spanning the full range of T.
// std::uniform_int_distribution is undefined for 8-bit types, so int8_t is
// widened to int and uint8_t to unsigned int for the distribution, and each
// drawn value is narrowed back to T on assignment.
// Note: this block as shown contained both the pre- and post-change lines of
// the diff (duplicate `uid` declaration and duplicate `*it` assignment); only
// the widened-type versions are kept.
template<class T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
void set_random_values(cldnn::memory::ptr mem)
{
using T1 = typename std::conditional<std::is_same<int8_t, T>::value, int, T>::type;
using T2 = typename std::conditional<std::is_same<uint8_t, T1>::value, unsigned int, T1>::type;
cldnn::mem_lock<T> ptr(mem, get_test_stream());
std::mt19937 gen;  // default-seeded: values are deterministic across runs
static std::uniform_int_distribution<T2> uid(std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
for (auto it = ptr.begin(); it != ptr.end(); ++it) {
*it = static_cast<T>(uid(gen));
}
}