[GPU] Align permute axis format with IE (#11379)

This commit is contained in:
Vladimir Paramuzov
2022-04-04 10:28:51 +03:00
committed by GitHub
parent d879e34363
commit afdaa7cf89
14 changed files with 225 additions and 232 deletions

View File

@@ -169,31 +169,6 @@ inline cldnn::format DefaultFormatForDims(size_t dimensions) {
return cldnn::format::bfyx; // Should not get here
}
// Converts a permute order given in IE dimension order (b, f, y, x, ...) into
// the cldnn dimension order (b, f, x, y, ... - spatial dims counted backwards).
// @param ie_order permute order with axes numbered in IE (bfyx) convention
// @param rank     minimum rank to pad the order to (clamped to at least 4)
// @return         equivalent permute order in cldnn (bfxy) convention
inline std::vector<uint16_t> ConvertPermuteOrder(const std::vector<uint16_t>& ie_order, size_t rank = 0) {
    std::vector<uint16_t> ie_order_aligned = ie_order;
    // cldnn orders are at least 4-dimensional - pad the tail with identity axes
    rank = std::max(rank, (size_t)4);
    for (auto o = ie_order_aligned.size(); o < rank; o++)
        ie_order_aligned.push_back((uint16_t)o);

    std::vector<uint16_t> cldnn_order;
    cldnn_order.reserve(ie_order_aligned.size());
    // 1. Remap spatial axis values: IE numbers spatial dims forward, cldnn
    //    numbers them backward, so axis o (o >= 2) maps to size + 1 - o.
    //    Explicit cast documents the intentional size_t -> uint16_t narrowing.
    for (auto const& o : ie_order_aligned) {
        if (o >= 2)
            cldnn_order.push_back(static_cast<uint16_t>(1 + ie_order_aligned.size() - o));
        else
            cldnn_order.push_back(o);
    }
    // 2. Mirror the spatial positions themselves (batch/feature slots stay put).
    //    size_t index fixes the signed/unsigned comparison of the previous int loop.
    for (size_t i = 0; i < (cldnn_order.size() - 2) / 2; i++) {
        std::swap(cldnn_order[2 + i], cldnn_order[1 + cldnn_order.size() - (2 + i)]);
    }
    return cldnn_order;
}
inline InferenceEngine::Layout InferenceEngineLayoutFromOVLayout(ov::Layout l) {
if (l == ov::Layout("C")) return InferenceEngine::Layout::C;
if (l == ov::Layout("CN")) return InferenceEngine::Layout::CN;

View File

@@ -35,7 +35,7 @@ struct permute : public primitive_base<permute> {
const std::vector<uint16_t>& permute_order = {},
const primitive_id& ext_prim_id = "",
const padding& output_padding = padding())
: primitive_base(id, {input}, ext_prim_id, output_padding), permute_order(permute_order) {}
: primitive_base(id, {input}, ext_prim_id, output_padding), permute_order(permute_order) { }
/// @brief Array of permuted output order in bfyx format.
std::vector<uint16_t> permute_order;

View File

@@ -13,7 +13,6 @@
#include "lstm_inst.h"
#include "reshape_inst.h"
#include "resample_inst.h"
#include "permute_inst.h"
#include "depth_to_space_inst.h"
#include "lstm_dynamic_inst.h"
#include "lstm_dynamic_input_inst.h"

View File

@@ -15,6 +15,33 @@ using namespace cldnn;
namespace cldnn {
namespace ocl {
namespace {
// Translates a permute order expressed in IE dimension order (bfyx...) into
// the cldnn dimension order (bfxy...), first padding the order up to `rank`
// (but never below 4) with identity axes.
inline std::vector<uint16_t> convert_permute_order(const std::vector<uint16_t>& ie_order, size_t rank = 0) {
    std::vector<uint16_t> aligned(ie_order);
    // cldnn works with at least 4 dimensions - extend shorter orders with a plain copy
    const size_t target_rank = std::max(rank, (size_t)4);
    while (aligned.size() < target_rank)
        aligned.push_back(static_cast<uint16_t>(aligned.size()));

    const size_t n = aligned.size();
    std::vector<uint16_t> result;
    result.reserve(n);
    // Spatial axis values (>= 2) are counted from the opposite end in cldnn.
    for (auto axis : aligned)
        result.push_back(axis < 2 ? axis : static_cast<uint16_t>(1 + n - axis));
    // The spatial slots themselves are mirrored as well; batch/feature stay in place.
    for (size_t lo = 2, hi = n - 1; lo < hi; ++lo, --hi)
        std::swap(result[lo], result[hi]);
    return result;
}
} // namespace
struct permute_impl : typed_primitive_impl_ocl<permute> {
using parent = typed_primitive_impl_ocl<permute>;
using parent::parent;
@@ -28,7 +55,8 @@ struct permute_impl : typed_primitive_impl_ocl<permute> {
auto permute_optional_params =
get_default_optional_params<kernel_selector::permute_optional_params>(arg.get_program());
const auto& permute_order = arg.get_primitive()->permute_order;
auto in_rank = arg.get_dependency(0).get_output_layout().get_rank();
auto permute_order = convert_permute_order(arg.get_primitive()->permute_order, in_rank);
permute_params.order = permute_order;
auto& kernel_selector = kernel_selector::permute_kernel_selector::Instance();
auto best_kernels = kernel_selector.GetBestKernels(permute_params, permute_optional_params);

View File

@@ -376,7 +376,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
auto& permute_order = next.as<permute>().get_primitive()->permute_order;
if ((fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::b_fs_yx_fsv32 || fmt_prev == format::b_fs_zyx_fsv32 ||
fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_zyx_fsv16 || fmt_prev == format::bs_fs_yx_bsv16_fsv16)
&& permute_order[1] == 2
&& permute_order.back() != 1
&& (!next.as<permute>().is_rotating_except_batch())) {
return false;
}
@@ -428,7 +428,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
auto& permute_order = prev.as<permute>().get_primitive()->permute_order;
if ((fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::b_fs_yx_fsv32 || fmt_prev == format::b_fs_zyx_fsv32 ||
fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_zyx_fsv16 || fmt_prev == format::bs_fs_yx_bsv16_fsv16)
&& permute_order[1] == 2
&& permute_order.back() != 1
&& (!prev.as<permute>().is_rotating_except_batch())) {
return false;
}

View File

@@ -24,20 +24,26 @@ layout permute_inst::calc_output_layout(permute_node const& node) {
"Output data type forcing is not supported for permute_node!");
auto input_layout = node.input().get_output_layout();
auto permute_order = node.get_primitive()->permute_order;
std::vector<tensor::value_type> output_sizes;
std::vector<tensor::value_type> output_shape;
auto input_shape = input_layout.get_dims();
for (size_t x = 0; x < permute_order.size(); x++) {
output_sizes.push_back(input_layout.size.raw[permute_order[x]]);
output_shape.push_back(input_shape[permute_order[x]]);
}
auto input_size = tensor(output_sizes);
for (size_t i = output_shape.size(); i < 4; i++) {
output_shape.push_back(1);
}
auto output_size = tensor(format::get_default_format(input_layout.get_rank()), output_shape);
auto op = node.get_primitive()->output_padding;
if (node.has_fused_primitives()) {
input_layout.data_type = node.get_fused_output_layout().data_type;
}
return layout(input_layout.data_type, input_layout.format, input_size, op);
return layout(input_layout.data_type, input_layout.format, output_size, op);
}
std::string permute_inst::to_string(permute_node const& node) {
@@ -67,13 +73,6 @@ std::string permute_inst::to_string(permute_node const& node) {
permute_inst::typed_primitive_inst(network& network, permute_node const& node) : parent(network, node) {
auto permute_order = argument.permute_order;
CLDNN_ERROR_LESS_THAN(node.id(),
"Permute order size",
permute_order.size(),
"minimum order size",
4,
"Permute order size needs to be at least 4.");
auto required_order_values_size = static_cast<uint32_t>(permute_order.size());
for (decltype(required_order_values_size) i = 0; i < required_order_values_size; i++) {

View File

@@ -127,7 +127,7 @@ static void CreateConvolutionBackpropDataOp(Program& p, const std::shared_ptr<ng
std::swap(permute_order[1], permute_order[0]);
auto permutePrim = cldnn::permute(permuteName,
weightsName,
ConvertPermuteOrder(permute_order, weights_rank),
permute_order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);
@@ -191,7 +191,7 @@ static void CreateGroupConvolutionBackpropDataOp(Program& p, const std::shared_p
std::swap(permute_order[2], permute_order[1]);
auto permutePrim = cldnn::permute(permuteName,
weightsName,
ConvertPermuteOrder(permute_order, weights_rank),
permute_order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);

View File

@@ -87,11 +87,10 @@ static void CreateMatMulOp(Program& p, const std::shared_ptr<ngraph::op::v0::Mat
for (auto o = transpose_order.size(); o < 4; o++)
transpose_order.push_back((uint16_t)o);
std::vector<uint16_t> cldnn_permute_order = ConvertPermuteOrder(transpose_order);
auto permuteName = op->get_friendly_name() + "/transpose_b";
auto permutePrim = cldnn::permute(permuteName,
weightsName,
cldnn_permute_order,
transpose_order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);
p.AddInnerPrimitiveToProfiler(permuteName, layerName, op);
@@ -107,11 +106,10 @@ static void CreateMatMulOp(Program& p, const std::shared_ptr<ngraph::op::v0::Mat
for (auto o = transpose_order.size(); o < 4; o++)
transpose_order.push_back((uint16_t)o);
std::vector<uint16_t> cldnn_permute_order = ConvertPermuteOrder(transpose_order);
auto permuteName = op->get_friendly_name() + "/transpose_a";
auto permutePrim = cldnn::permute(permuteName,
inputName,
cldnn_permute_order,
transpose_order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);
p.AddInnerPrimitiveToProfiler(permuteName, layerName, op);

View File

@@ -20,13 +20,13 @@ static void CreateTransposeOp(Program& p, const std::shared_ptr<ngraph::op::v1::
auto inputPrimitives = p.GetInputPrimitiveIDs(op);
std::string layerName = layer_type_name_ID(op);
std::vector<uint16_t> ie_order;
std::vector<uint16_t> order;
if (op->get_input_size() == 2) {
auto order_constant = std::dynamic_pointer_cast<ngraph::op::Constant>(op->get_input_node_shared_ptr(1));
if (!order_constant) {
IE_THROW() << "Unsupported parameter nodes type in " << op->get_friendly_name() << " (" << op->get_type_name() << ")";
}
ie_order = order_constant->cast_vector<uint16_t>();
order = order_constant->cast_vector<uint16_t>();
}
auto is_convert_color_type = [](const std::shared_ptr<ov::Node> &node) {
@@ -40,7 +40,7 @@ static void CreateTransposeOp(Program& p, const std::shared_ptr<ngraph::op::v1::
// In case of ConvertColor operation we have NHWC (byxf) input format which should be converted to
// NCHW (bfyx) by this Permute, so we replace Permute with Reorder (to bfyx) primitive
auto input = op->input(0).get_source_output().get_node_shared_ptr();
if (is_convert_color_type(input) && ie_order == std::vector<uint16_t>{0, 3, 1, 2}) {
if (is_convert_color_type(input) && order == std::vector<uint16_t>{0, 3, 1, 2}) {
auto precision = input->get_element_type();
p.AddPrimitive(cldnn::reorder(layerName,
inputPrimitives[0],
@@ -54,17 +54,15 @@ static void CreateTransposeOp(Program& p, const std::shared_ptr<ngraph::op::v1::
}
int rank = std::max(4, static_cast<int>(op->get_input_shape(0).size()));
if (ie_order.empty()) {
if (order.empty()) {
// if order size is less than 4 - fill the rest with just copy
for (int o = rank - 1; o >= 0; o--)
ie_order.push_back((uint16_t)o);
order.push_back((uint16_t)o);
}
std::vector<uint16_t> cldnn_permute_order = ConvertPermuteOrder(ie_order, rank);
auto permutePrim = cldnn::permute(layerName,
inputPrimitives[0],
cldnn_permute_order,
order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);

View File

@@ -90,8 +90,8 @@ TEST_P(permute_eltwise_loop, basic) {
execute(p);
}
#define CASE_LOOP_F32_1 3, { 1, 8, 3, 2 }, { 1, 2, 8, 3 }, { 1, 2, 8, 1 }, { 0, 3, 1, 2 }, data_types::f32, format::bfyx, data_types::f32
#define CASE_LOOP_F16_0 4, { 1, 12, 4, 2 }, { 1, 2, 12, 4 }, { 1, 2, 12, 1 }, { 0, 3, 1, 2 }, data_types::f16, format::bfyx, data_types::f16
#define CASE_LOOP_F32_1 3, { 1, 8, 3, 2 }, { 1, 2, 8, 3 }, { 1, 2, 8, 1 }, { 0, 2, 3, 1 }, data_types::f32, format::bfyx, data_types::f32
#define CASE_LOOP_F16_0 4, { 1, 12, 4, 2 }, { 1, 2, 12, 4 }, { 1, 2, 12, 1 }, { 0, 2, 3, 1 }, data_types::f16, format::bfyx, data_types::f16
INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_eltwise_loop, ::testing::ValuesIn(std::vector<loop_params>{
loop_params{ CASE_LOOP_F32_1, 3, 5 },

View File

@@ -87,75 +87,75 @@ public:
/* ------------------------------------------------------------------------------------------------------------ */
#define CASE_PERMUTE_F32_0 { 1, 16, 2, 2 }, { 1, 16, 2, 2 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_1 { 1, 15, 16, 16 }, { 1, 15, 16, 16 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_2 { 1, 8, 16, 16 }, { 16, 16, 8, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_3 { 1, 1, 3, 4 }, { 1, 3, 4, 1 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_2 { 1, 8, 16, 16 }, { 16, 16, 8, 1 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_3 { 1, 1, 3, 4 }, { 1, 3, 4, 1 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_4 { 2, 16, 16, 16 }, { 2, 16, 16, 16 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_5 { 1, 32, 4, 5 }, { 32, 4, 5, 1 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_6 { 1, 16, 4, 5 }, { 5, 16, 4, 1 }, { 3, 1, 2, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_7 { 1, 16, 1, 1 }, { 1, 1, 1, 16 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_5 { 1, 32, 4, 5 }, { 32, 4, 5, 1 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_6 { 1, 16, 4, 5 }, { 5, 16, 4, 1 }, { 2, 1, 0, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_7 { 1, 16, 1, 1 }, { 1, 1, 1, 16 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_0 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_1 { 2, 16, 4, 5 }, { 16, 4, 5, 2 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_2 { 1, 32, 2, 3 }, { 2, 3, 32, 1 }, { 2, 3, 1, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_3 { 3, 16, 1, 1 }, { 1, 1, 16, 3 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_4 { 2, 15, 4, 5 }, { 4, 2, 5, 15 }, { 2, 0, 3, 1 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_5 { 1, 15, 1, 2 }, { 15, 2, 1, 1 }, { 1, 3, 2, 0 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_6 { 1, 15, 4, 4 }, { 4, 4, 1, 15 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_1 { 2, 16, 4, 5 }, { 16, 4, 5, 2 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_2 { 1, 32, 2, 3 }, { 2, 3, 32, 1 }, { 3, 2, 0, 1 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_3 { 3, 16, 1, 1 }, { 1, 1, 16, 3 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_4 { 2, 15, 4, 5 }, { 4, 2, 5, 15 }, { 3, 0, 1, 2 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_5 { 1, 15, 1, 2 }, { 15, 2, 1, 1 }, { 1, 2, 0, 3 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_6 { 1, 15, 4, 4 }, { 4, 4, 1, 15 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_0 { 1, 15, 4, 5 }, { 1, 15, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_1 { 1, 15, 4, 5 }, { 5, 4, 15, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_2 { 1, 16, 1, 2 }, { 1, 1, 16, 2 }, { 2, 0, 1, 3 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_3 { 1, 16, 2, 2 }, { 2, 2, 16, 1 }, { 2, 3, 1, 0 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_0 { 1, 15, 4, 5 }, { 15, 5, 1, 4 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_1 { 1, 15, 16, 16 }, { 15, 16, 1, 16 }, { 1, 2, 0, 3 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_1 { 1, 15, 4, 5 }, { 5, 4, 15, 1 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_2 { 1, 16, 1, 2 }, { 1, 1, 16, 2 }, { 3, 0, 2, 1 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_3 { 1, 16, 2, 2 }, { 2, 2, 16, 1 }, { 3, 2, 0, 1 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_0 { 1, 15, 4, 5 }, { 15, 5, 1, 4 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_1 { 1, 15, 16, 16 }, { 15, 16, 1, 16 }, { 1, 3, 2, 0 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_2 { 1, 32, 5, 4 }, { 1, 32, 5, 4 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_3 { 1, 16, 4, 5 }, { 5, 4, 16, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_3 { 1, 16, 4, 5 }, { 5, 4, 16, 1 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
// 3d
#define CASE_PERMUTE_F32_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_1 { 2, 15, 2, 3, 4 }, { 15, 2, 3, 4, 2 }, { 1, 2, 3, 4, 0 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_2 { 2, 16, 4, 4, 5 }, { 4, 2, 4, 5, 16 }, { 3, 0, 2, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_3 { 1, 32, 4, 2, 2 }, { 2, 2, 32, 1, 4 }, { 4, 3, 1, 0, 2 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_4 { 1, 16, 1, 1, 1 }, { 1, 1, 1, 16, 1 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_1 { 2, 15, 2, 3, 4 }, { 15, 2, 3, 4, 2 }, { 1, 4, 0, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_2 { 2, 16, 4, 4, 5 }, { 4, 2, 4, 5, 16 }, { 3, 0, 1, 2, 4 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_3 { 1, 32, 4, 2, 2 }, { 2, 2, 32, 1, 4 }, { 2, 3, 4, 0, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_4 { 1, 16, 1, 1, 1 }, { 1, 1, 1, 16, 1 }, { 4, 2, 3, 1, 0 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 2, 15, 3 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 2, 15, 3 }, { 4, 2, 3, 1, 0 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_2 { 2, 16, 4, 4, 3 }, { 2, 4, 3, 16, 4 }, { 0, 3, 4, 1, 2 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 2, 4, 0 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_4 { 16, 16, 1, 1, 1 },{ 1, 16, 1, 1, 16 },{ 4, 0, 3, 2, 1 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 0, 2, 4 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_4 { 16, 16, 1, 1, 1 },{ 1, 16, 1, 1, 16 },{ 2, 0, 1, 4, 3 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 15, 2, 3 }, { 4, 2, 1, 0, 3 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 15, 2, 3 }, { 2, 4, 3, 0, 1 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_2 { 2, 16, 4, 4, 3 }, { 2, 4, 3, 16, 4 }, { 0, 3, 4, 1, 2 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 2, 4, 0 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_0 { 16, 16, 1, 1, 1 }, { 1, 1, 16, 16, 1 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_1 { 16, 16, 1, 1, 1 }, { 1, 1, 1, 16, 16 }, { 4, 3, 2, 1, 0 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_2 { 2, 16, 4, 4, 3 }, { 4, 2, 4, 3, 16 }, { 3, 0, 2, 4, 1 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_3 { 1, 32, 4, 2, 1 }, { 1, 2, 32, 1, 4 }, { 4, 3, 1, 0, 2 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 0, 2, 4 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_0 { 16, 16, 1, 1, 1 }, { 1, 1, 16, 16, 1 }, { 4, 2, 3, 1, 0 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_1 { 16, 16, 1, 1, 1 }, { 1, 1, 1, 16, 16 }, { 2, 3, 0, 1, 4 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_2 { 2, 16, 4, 4, 3 }, { 4, 2, 4, 3, 16 }, { 3, 0, 1, 2, 4 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_3 { 1, 32, 4, 2, 1 }, { 1, 2, 32, 1, 4 }, { 2, 3, 4, 0, 1 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
// permute_tile_8x8_4x4
#define CASE_PERMUTE_TILE_8x8_4x4_4D_0 { 1, 8, 8, 2 }, { 1, 2, 8, 8 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_1 { 1, 5, 8, 2 }, { 1, 2, 5, 8 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_2 { 1, 8, 5, 2 }, { 1, 2, 8, 5 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_3 { 1, 5, 5, 2 }, { 1, 2, 5, 5 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_0 { 1, 8, 8, 2, 2 }, { 1, 2, 8, 8, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_1 { 1, 5, 8, 2, 2 }, { 1, 2, 5, 8, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_2 { 1, 8, 5, 2, 2 }, { 1, 2, 8, 5, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_3 { 1, 5, 5, 2, 2 }, { 1, 2, 5, 5, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_0 { 1, 8, 8, 2, 2, 2 }, { 1, 2, 8, 8, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_1 { 1, 5, 8, 2, 2, 2 }, { 1, 2, 5, 8, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_2 { 1, 8, 5, 2, 2, 2 }, { 1, 2, 8, 5, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_3 { 1, 5, 5, 2, 2, 2 }, { 1, 2, 5, 5, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_0 { 1, 8, 8, 2 }, { 1, 2, 8, 8 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_1 { 1, 5, 8, 2 }, { 1, 2, 5, 8 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_2 { 1, 8, 5, 2 }, { 1, 2, 8, 5 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_3 { 1, 5, 5, 2 }, { 1, 2, 5, 5 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_0 { 1, 8, 8, 2, 2 }, { 1, 2, 8, 8, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_1 { 1, 5, 8, 2, 2 }, { 1, 2, 5, 8, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_2 { 1, 8, 5, 2, 2 }, { 1, 2, 8, 5, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_3 { 1, 5, 5, 2, 2 }, { 1, 2, 5, 5, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_0 { 1, 8, 8, 2, 2, 2 }, { 1, 2, 8, 8, 2, 2 }, { 0, 2, 3, 4, 5, 1 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_1 { 1, 5, 8, 2, 2, 2 }, { 1, 2, 5, 8, 2, 2 }, { 0, 2, 3, 4, 5, 1 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_2 { 1, 8, 5, 2, 2, 2 }, { 1, 2, 8, 5, 2, 2 }, { 0, 2, 3, 4, 5, 1 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_3 { 1, 5, 5, 2, 2, 2 }, { 1, 2, 5, 5, 2, 2 }, { 0, 2, 3, 4, 5, 1 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
// permute_tile_8x8_4x4_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0 { 1, 16, 16, 2 }, { 1, 2, 16, 16 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1 { 1, 15, 16, 2 }, { 1, 2, 15, 16 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2 { 1, 16, 3, 2 }, { 1, 2, 16, 3 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3 { 1, 5, 7, 2 }, { 1, 2, 5, 7 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0 { 1, 16, 16, 2, 2 }, { 1, 2, 16, 16, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1 { 1, 15, 16, 2, 2 }, { 1, 2, 15, 16, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2 { 1, 16, 3, 2, 2 }, { 1, 2, 16, 3, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3 { 1, 5, 7, 2, 2 }, { 1, 2, 5, 7, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0 { 1, 16, 16, 2 }, { 1, 2, 16, 16 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1 { 1, 15, 16, 2 }, { 1, 2, 15, 16 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2 { 1, 16, 3, 2 }, { 1, 2, 16, 3 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3 { 1, 5, 7, 2 }, { 1, 2, 5, 7 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0 { 1, 16, 16, 2, 2 }, { 1, 2, 16, 16, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1 { 1, 15, 16, 2, 2 }, { 1, 2, 15, 16, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2 { 1, 16, 3, 2, 2 }, { 1, 2, 16, 3, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3 { 1, 5, 7, 2, 2 }, { 1, 2, 5, 7, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
class permute_activation_scale_eltwise: public PermuteFusingTest {};
TEST_P(permute_activation_scale_eltwise, basic) {
@@ -455,49 +455,49 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv, ::t
/* ---------------------------- PERMUTE FUSE REDUNDANT REORDER cases ------------------------------------------ */
/* ------------------------------------------------------------------------------------------------------------ */
#define CASE_PERMUTE_REORDER_F32_0 { 1, 16, 32, 2 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F32_1 { 2, 7, 9, 27 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_F32_2 { 1, 16, 4, 5, 16 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_0 { 1, 16, 2, 4 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F16_1 { 1, 16, 4, 5, 16 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_0 { 1, 16, 32, 2 }, { 0, 2, 1, 3 }, { 0, 2, 1, 3 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F32_1 { 2, 7, 9, 27 }, { 0, 2, 1, 3 }, { 0, 2, 1, 3 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_F32_2 { 1, 16, 4, 5, 16 }, { 0, 2, 3, 4, 1 }, { 0, 2, 3, 4, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_0 { 1, 16, 2, 4 }, { 0, 2, 1, 3 }, { 0, 2, 1, 3 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F16_1 { 1, 16, 4, 5, 16 }, { 0, 2, 1, 3, 4 }, { 0, 1, 2, 3, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_2 { 1, 5, 1, 2, 14 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
// type change
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 { 1, 15, 4, 5 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 { 1, 15, 4, 5 }, { 0, 2, 1, 3 }, { 0, 2, 1, 3 }, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_S8_TO_F32_1 { 1, 2, 15, 4, 5 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 { 1, 5, 1, 2, 14 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 { 1, 17, 1, 2, 7 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 { 1, 5, 1, 2, 14 }, { 0, 2, 1, 3, 4 }, { 0, 1, 2, 3, 4 }, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 { 1, 17, 1, 2, 7 }, { 0, 2, 1, 3, 4 }, { 0, 1, 2, 3, 4 }, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
// dim change
#define CASE_PERMUTE_REORDER_4D_TO_5D_F32_0 { 1, 16, 8, 16 }, { 1, 2, 0, 3 }, { 0, 3, 1, 4, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_4D_TO_6D_F32_1 { 1, 16, 8, 16 }, { 0, 2, 3, 1 }, { 0, 3, 5, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfwzyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_0 { 1, 16, 4, 5, 18 },{ 0, 4, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_1 { 1, 16, 4, 5, 16 },{ 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_6D_F32_2 { 1, 16, 8, 4, 16 }, { 0, 4, 2, 3, 1 }, { 0, 3, 5, 4, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
#define CASE_PERMUTE_REORDER_6D_TO_4D_F32_0 { 1, 16, 4, 5, 4, 16 }, { 0, 2, 5, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_6D_TO_5D_F32_1 { 1, 16, 4, 5, 4, 16 }, { 0, 2, 5, 3, 4, 1 }, { 0, 3, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfzyx
#define CASE_PERMUTE_REORDER_4D_TO_5D_F32_0 { 1, 16, 8, 16 }, { 1, 3, 2, 0 }, { 0, 3, 4, 2, 1 }, data_types::f32, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_4D_TO_6D_F32_1 { 1, 16, 8, 16 }, { 0, 3, 1, 2 }, { 0, 4, 5, 1, 3, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfwzyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_0 { 1, 16, 4, 5, 18 },{ 0, 2, 1, 3, 4 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_1 { 1, 16, 4, 5, 16 },{ 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_6D_F32_2 { 1, 16, 8, 4, 16 }, { 0, 2, 1, 3, 4 }, { 0, 4, 5, 1, 3, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
#define CASE_PERMUTE_REORDER_6D_TO_4D_F32_0 { 1, 16, 4, 5, 4, 16 }, { 0, 5, 1, 4, 3, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_6D_TO_5D_F32_1 { 1, 16, 4, 5, 4, 16 }, { 0, 5, 1, 4, 3, 2 }, { 0, 3, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfzyx
// permute_opt for blocked format
#define CASE_PERMUTE_REORDER_TILED_F32_0 { 1, 256, 2, 64 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_1 { 1, 78, 2, 259 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_2 { 1, 48, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 4, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F32_0 { 1, 256, 2, 64 }, { 0, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_1 { 1, 78, 2, 259 }, { 0, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_2 { 1, 48, 1, 3, 259 }, { 0, 2, 3, 4, 1 }, { 0, 4, 1, 2, 3 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
// permute_opt for blocked format => reorder to different dim
#define CASE_PERMUTE_REORDER_TILED_F32_3 { 1, 45, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_4 { 2, 273, 19, 19 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_5 { 2, 546, 2, 2 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_3 { 1, 45, 1, 3, 259 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_4 { 2, 273, 19, 19 }, { 0, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_5 { 2, 546, 2, 2 }, { 0, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
// permute opt for blocked format => reorder to different dim/type
#define CASE_PERMUTE_REORDER_TILED_I8_4 { 1, 45, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_5 { 1, 48, 3, 256 }, { 0, 3, 1, 2 }, { 0, 2, 4, 3, 1 }, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_6 { 1, 48, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 5, 4, 3, 1 }, data_types::f16, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
#define CASE_PERMUTE_REORDER_TILED_I8_4 { 1, 45, 1, 3, 259 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_5 { 1, 48, 3, 256 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2 }, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_6 { 1, 48, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 5, 1, 4, 3, 2 }, data_types::f16, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
// permute opt for non_blocked format => reorder to different dim/type
#define CASE_PERMUTE_REORDER_TILED_F16_7 { 1, 48, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_8 { 1, 28, 2, 2, 3, 256 }, { 0, 5, 1, 2, 3, 4 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 3, 1, 2 }, { 0, 2, 4, 3, 1 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 3, 1, 2 }, { 0, 2, 4, 5, 3, 1 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx
#define CASE_PERMUTE_REORDER_TILED_F16_7 { 1, 48, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_8 { 1, 28, 2, 2, 3, 256 }, { 0, 2, 3, 4, 5, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 2, 3, 1 }, { 0, 5, 1, 4, 2, 3 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx
class permute_redundant_reorder : public PermuteReorderFusingTest {};
TEST_P(permute_redundant_reorder, basic) {
@@ -514,21 +514,21 @@ TEST_P(permute_redundant_reorder, basic) {
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_redundant_reorder, ::testing::ValuesIn(std::vector<permute_reorder_params>{
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_2, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_U8_TO_F16_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_1, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_TO_F16_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_U8_TO_F16_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_0, 3, 4 },
@@ -563,17 +563,17 @@ TEST_P(permute_act_reorder, basic) {
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_act_reorder, ::testing::ValuesIn(std::vector<permute_reorder_params>{
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_2, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_0, 3, 5 },

View File

@@ -53,7 +53,7 @@ TEST(depth_to_space_fp16_gpu, d1411_bs2) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
ASSERT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
}
}
@@ -95,7 +95,7 @@ TEST(depth_to_space_fp16_gpu, d1421_bs2) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
ASSERT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
}
}
@@ -158,7 +158,7 @@ TEST(depth_to_space_fp16_gpu, d1933_bs3) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
ASSERT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
}
}
@@ -197,7 +197,7 @@ TEST(depth_to_space_fp32_gpu, d1411_bs2) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}
@@ -231,7 +231,7 @@ TEST(depth_to_space_fp32_gpu, d112960540_bs2) {
auto output = outputs.at("depth_to_space").get_memory();
cldnn::mem_lock<FLOAT16> output_ptr (output, get_test_stream());
std::vector<uint16_t> perm = { 0,4,5,2,1,3 };
std::vector<uint16_t> perm = { 0,3,4,1,5,2 };
topology topology_ref;
topology_ref.add(input_layout("Input0", input1->get_layout()));
@@ -259,7 +259,7 @@ TEST(depth_to_space_fp32_gpu, d112960540_bs2) {
cldnn::mem_lock<FLOAT16> output_ptr_ref(output_ref, get_test_stream());
for (size_t i = 0; i < output->get_layout().count(); ++i) {
EXPECT_EQ(output_ptr_ref[i], output_ptr[i]);
ASSERT_EQ(output_ptr_ref[i], output_ptr[i]);
}
}
@@ -314,7 +314,7 @@ TEST(depth_to_space_fp32_gpu, d1933_bs3) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}
@@ -364,7 +364,7 @@ TEST(depth_to_space_fp32_gpu, d1822_bs2_blocks_first) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}
@@ -414,6 +414,6 @@ TEST(depth_to_space_fp32_gpu, d1822_bs2_depth_first) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}

View File

@@ -26,37 +26,32 @@ TEST(permute_gpu_f32, output_ordering_test)
{
auto& engine = get_test_engine();
std::vector<std::vector<int32_t>> input_tensors =
{
{ 10, 5, 15, 2 },{ 2, 4, 6, 8 },{ 2, 2, 3, 2 },{ 9, 8, 7, 4 }
std::vector<std::vector<int32_t>> input_tensors = {
{ 10, 5, 15, 2 },
{ 2, 4, 6, 8 },
{ 2, 2, 3, 2 },
{ 9, 8, 7, 4 }
};
std::vector<std::vector<uint16_t>> permutations =
{
std::vector<std::vector<uint16_t>> permutations = {
{ 0, 1, 2, 3 }, //do nothing
{ 0, 1, 3, 2 }, //replace x with y
{ 1, 0, 3, 2 }, //replace b with f
{ 0, 2, 3, 1 } //big permutation
{ 0, 1, 3, 2 }, //replace x with y
{ 1, 0, 3, 2 }, //replace b with f
{ 0, 2, 3, 1 } //big permutation
};
std::vector<format> input_formats = { format::bfyx, format::yxfb };
auto get_permutation = [&](const std::vector<int32_t>& inp1, const std::vector<uint16_t>& order)
{
auto get_permutation = [&](const std::vector<int32_t>& inp1, const std::vector<uint16_t>& order) {
EXPECT_EQ(inp1.size(), order.size());
std::vector<int32_t> output;
for (auto const& o : order)
{
for (auto const& o : order) {
output.push_back(inp1.at(o));
}
return output;
};
for (auto const& fr : input_formats)
{
for (auto const& inp_t : input_tensors)
{
for (auto const& perm : permutations)
{
for (auto const& fr : input_formats) {
for (auto const& inp_t : input_tensors) {
for (auto const& perm : permutations) {
auto input = engine.allocate_memory({ data_types::f32, fr, tensor(inp_t) });
topology topology(
input_layout("input", input->get_layout()),
@@ -68,12 +63,12 @@ TEST(permute_gpu_f32, output_ordering_test)
auto output = outputs.at("permute");
auto output_mem = output.get_memory();
EXPECT_EQ(outputs.size(), size_t(1));
auto ref_tensor = get_permutation(inp_t, perm);
auto out_tensor = output_mem->get_layout().size;
EXPECT_EQ(out_tensor.batch[0], ref_tensor[0]);
EXPECT_EQ(out_tensor.feature[0], ref_tensor[1]);
EXPECT_EQ(out_tensor.spatial[0], ref_tensor[2]);
EXPECT_EQ(out_tensor.spatial[1], ref_tensor[3]);
auto ref_tensor = get_permutation(input->get_layout().get_dims(), perm);
auto out_tensor = output_mem->get_layout().get_dims();
EXPECT_EQ(out_tensor[0], ref_tensor[0]);
EXPECT_EQ(out_tensor[1], ref_tensor[1]);
EXPECT_EQ(out_tensor[2], ref_tensor[2]);
EXPECT_EQ(out_tensor[3], ref_tensor[3]);
}
}
}
@@ -552,9 +547,9 @@ TEST(permute_fuse_reorder_gpu_f32, basic_b_fs_yx_fsv4_permute_1_8_16_1)
topology topology_unfused(
input_layout("input", input->get_layout()),
reorder("reorder1", "input", format::b_fs_yx_fsv4, data_types::f32),
permute("permute", "reorder1", { 0, 3, 1, 2}),
permute("permute", "reorder1", { 0, 2, 3, 1}),
reorder("reorder2", "permute", format::bfyx, data_types::f32),
permute("out", "reorder2", { 0, 2, 3, 1}));
permute("out", "reorder2", { 0, 3, 1, 2}));
cldnn::build_options options_unfused;
options_unfused.set_option(cldnn::build_option::optimize_data(false));
@@ -567,9 +562,9 @@ TEST(permute_fuse_reorder_gpu_f32, basic_b_fs_yx_fsv4_permute_1_8_16_1)
topology topology_fused(
input_layout("input", input->get_layout()),
reorder("reorder1", "input", format::b_fs_yx_fsv4, data_types::f32),
permute("permute", "reorder1", { 0, 3, 1, 2}),
permute("permute", "reorder1", { 0, 2, 3, 1}),
reorder("reorder2", "permute", format::bfyx, data_types::f32), // to be fused to previous permute
permute("out", "reorder2", { 0, 2, 3, 1})); // return to original value
permute("out", "reorder2", { 0, 3, 1, 2})); // return to original value
cldnn::build_options options_fused;
options_fused.set_option(cldnn::build_option::optimize_data(true));
@@ -794,7 +789,7 @@ TEST(permute_gpu_f32, 6D_reshape_permute_reshape)
const int w_reshape = 2;
const int z_reshape = 2;
std::vector<uint16_t> permute_order = { 0, 1, 5, 4, 2, 3 };
std::vector<uint16_t> permute_order = { 0, 1, 4, 5, 3, 2 };
auto input_size = cldnn::tensor(batch(b), feature(f), spatial(x, y));
auto input_mem = engine.allocate_memory({ data_types::f32, format::bfyx, input_size });
@@ -839,10 +834,10 @@ TEST(permute_gpu_f32, 6D_reshape_permute_reshape)
EXPECT_EQ(expected_out[i], output_ptr[i]);
}
}
TEST(permute_gpu_f32, basic_bfzyx_permute_0_2_3_4_1)
TEST(permute_gpu_f32, basic_bfzyx_permute_0_4_1_2_3)
{
// Input : bfzyx:2x2x2x2x3
// Permute order : { 0,2,3,4,1 }
// Permute order : { 0,4,1,2,3 }
auto& engine = get_test_engine();
@@ -872,7 +867,7 @@ TEST(permute_gpu_f32, basic_bfzyx_permute_0_2_3_4_1)
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 2, 3, 4, 1 }));
permute("permute", "input", { 0, 4, 1, 2, 3 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -918,9 +913,9 @@ TEST(permute_gpu_f32, basic_bfzyx_permute_0_2_3_4_1)
* Test cases for permute_tile_8x8_4x4 kernel
*
* These test cases are enabled only when the batch axis moves to the last position.
* i.e permute order is 0,3,1,2 or 0,4,1,2,3 or 0,5,1,2,3,4
* i.e permute order is 0,2,3,1 or 0,4,1,2,3 or 0,5,1,2,3,4
*/
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfyx_0_3_1_2) {
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfyx_0_2_3_1) {
// Input : bfyx:2x8x2x8
// Permute order : { 0,3,1,2 }
@@ -932,14 +927,14 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfyx_0_3_1_2) {
std::vector<float> input_data;
input_data.reserve(array_size);
for (size_t i=0 ; i < array_size; ++i)
for (size_t i = 0; i < array_size; ++i)
input_data.push_back(static_cast<float>(i));
set_values(input, input_data);
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 3, 1, 2 }));
permute("permute", "input", { 0, 2, 3, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -976,7 +971,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfyx_0_3_1_2) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfyx_0_3_1_2) {
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfyx_0_2_3_1) {
// Input : bfyx:2x5x2x8
// Permute order : { 0,3,1,2 }
@@ -995,7 +990,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfyx_0_3_1_2) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 3, 1, 2 }));
permute("permute", "input", { 0, 2, 3, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1026,7 +1021,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfyx_0_3_1_2) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfyx_0_3_1_2) {
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfyx_0_2_3_1) {
// Input : bfyx:2x8x2x5
// Permute order : { 0,3,1,2 }
@@ -1051,7 +1046,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfyx_0_3_1_2) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 3, 1, 2 }));
permute("permute", "input", { 0, 2, 3, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1082,7 +1077,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfyx_0_3_1_2) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfyx_0_3_1_2) {
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfyx_0_2_3_1) {
// Input : bfyx:2x5x2x5
// Permute order : { 0,3,1,2 }
@@ -1101,7 +1096,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfyx_0_3_1_2) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 3, 1, 2 }));
permute("permute", "input", { 0, 2, 3, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1132,7 +1127,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfyx_0_3_1_2) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfzyx_0_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfzyx_0_2_3_4_1) {
// Input : bfzyx:2x8x2x2x8
// Permute order : { 0,4,1,2,3 }
@@ -1151,7 +1146,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfzyx_0_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 4, 1, 2, 3 }));
permute("permute", "input", { 0, 2, 3, 4, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1194,7 +1189,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfzyx_0_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfzyx_0_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfzyx_0_2_3_4_1) {
// Input : bfzyx:2x5x2x2x8
// Permute order : { 0,4,1,2,3 }
@@ -1213,7 +1208,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfzyx_0_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 4, 1, 2, 3 }));
permute("permute", "input", { 0, 2, 3, 4, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1248,7 +1243,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfzyx_0_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfzyx_0_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfzyx_0_2_3_4_1) {
// Input : bfzyx:2x8x2x2x5
// Permute order : { 0,4,1,2,3 }
@@ -1267,7 +1262,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfzyx_0_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 4, 1, 2, 3 }));
permute("permute", "input", { 0, 2, 3, 4, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1302,7 +1297,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfzyx_0_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfzyx_0_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfzyx_0_2_3_4_1) {
// Input : bfzyx:2x5x2x2x5
// Permute order : { 0,4,1,2,3 }
@@ -1321,7 +1316,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfzyx_0_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 4, 1, 2, 3 }));
permute("permute", "input", { 0, 2, 3, 4, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1356,9 +1351,9 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfzyx_0_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfwzyx_0_5_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfwzyx_0_2_3_4_5_1) {
// Input : bfwzyx:2x8x2x2x2x8
// Permute order : { 0,5,1,2,3,4 }
// Permute order : { 0,2,3,4,5,1 }
constexpr size_t array_size = 1024;
@@ -1375,7 +1370,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfwzyx_0_5_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 5, 1, 2, 3, 4 }));
permute("permute", "input", { 0, 2, 3, 4, 5, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1428,9 +1423,9 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfwzyx_0_5_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfwzyx_0_5_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfwzyx_0_2_3_4_5_1) {
// Input : bfwzyx:2x5x2x2x2x8
// Permute order : { 0,5,1,2,3,4 }
// Permute order : { 0,2,3,4,5,1 }
constexpr size_t array_size = 640;
@@ -1447,7 +1442,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfwzyx_0_5_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 5, 1, 2, 3, 4 }));
permute("permute", "input", { 0, 2, 3, 4, 5, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1488,9 +1483,9 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfwzyx_0_5_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfwzyx_0_5_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfwzyx_0_2_3_4_5_1) {
// Input : bfwzyx:2x8x2x2x2x5
// Permute order : { 0,5,1,2,3,4 }
// Permute order : { 0,2,3,4,5,1 }
constexpr size_t array_size = 640;
@@ -1507,7 +1502,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfwzyx_0_5_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 5, 1, 2, 3, 4 }));
permute("permute", "input", { 0, 2, 3, 4, 5, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1548,9 +1543,9 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfwzyx_0_5_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfwzyx_0_5_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfwzyx_0_2_3_4_5_1) {
// Input : bfwzyx:2x5x2x2x2x5
// Permute order : { 0,5,1,2,3,4 }
// Permute order : { 0,2,3,4,5,1 }
constexpr size_t array_size = 400;
@@ -1567,7 +1562,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfwzyx_0_5_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 5, 1, 2, 3, 4 }));
permute("permute", "input", { 0, 2, 3, 4, 5, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1670,12 +1665,13 @@ void TiledPermuteTest::run_test(const std::vector<cldnn::tensor::value_type>& si
std::swap(internal_sizes.at(2), internal_sizes.back());
cldnn::tensor tensor(internal_sizes);
cldnn::format format = sizes.size() == 4?cldnn::format::bfyx:cldnn::format::bfzyx;
cldnn::format format = sizes.size() == 4 ? cldnn::format::bfyx : cldnn::format::bfzyx;
std::vector<uint16_t> order{0, static_cast<uint16_t>(sizes.size()-1)};
for (uint16_t i = 1; i<(sizes.size()-1); ++i) {
order.push_back(i);
std::vector<uint16_t> order = {0};
for (uint16_t i = 1; i < (sizes.size() - 1); ++i) {
order.push_back(i+1);
}
order.push_back(1);
auto input = engine.allocate_memory({Data_Type, format, tensor});
set_random_values<type>(input);

View File

@@ -21,8 +21,8 @@ TEST(test_device_mem_usage_estimation, basic) {
topology topology(
input_layout("input1", input1->get_layout()),
input_layout("input2", input2->get_layout()),
permute("permute1", "input1", { 0, 2, 3, 1 }),
permute("permute2", "input2", { 0, 3, 2, 1 }),
permute("permute1", "input1", { 0, 3, 1, 2 }),
permute("permute2", "input2", { 0, 2, 1, 3 }),
eltwise("eltw", {"permute1", "permute2"}, eltwise_mode::sum, data_types::f16),
reorder("output", "eltw", format::bfyx, data_types::f32)
);