[IE CLDNN] Enabled fusing all types of reorders to permute (#5101)

* [IE CLDNN] Enabled fusing all types of reorders to permute - Fusing reorder to differnt dims (e.g., 4D=>5D, 6D=>4D, etc) - Fusing reorder to permute_opt kernel for blocked formats - Fixed bug for activation_opt kernel for blocked formats * [IE CLDNN] Enabled fusing all types of reorders to permute Refactoring for permute fsv opt kernel * [IE CLDNN] Permute reorder fusing Support reordering to different dims from optimized permute kernel for blocked format
2021-04-21 23:50:53 +09:00 · 2021-04-21 23:50:53 +09:00 · 6c46f26a3c
commit 6c46f26a3c
parent ad6a0e7e6d
8 changed files with 442 additions and 98 deletions
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp
@ -27,12 +27,35 @@ ParamsKey ActivationKernelOpt::GetSupportedKey() const {
    return k;
 }

+static size_t GetTotalSize(const activation_params& params) {
+    const auto input = params.inputs[0];
+    size_t totalSize = input.LogicalSize();
+    switch (params.inputs[0].GetLayout()) {
+        case DataLayout::b_fs_yx_fsv4:
+            totalSize = (totalSize / input.Feature().v) * Align(input.Feature().v, 4);
+            break;
+        case DataLayout::b_fs_yx_fsv16:
+        case DataLayout::b_fs_zyx_fsv16:
+            totalSize = (totalSize / input.Feature().v) * Align(input.Feature().v, 16);
+            break;
+        case DataLayout::b_fs_yx_fsv32:
+        case DataLayout::b_fs_zyx_fsv32:
+        case DataLayout::fs_b_yx_fsv32:
+            totalSize = (totalSize / input.Feature().v) * Align(input.Feature().v, 32);
+            break;
+        case DataLayout::bs_fs_zyx_bsv16_fsv16:
+        case DataLayout::bs_fs_yx_bsv16_fsv16:
+            totalSize = (totalSize / (input.Feature().v * input.Batch().v)) * Align(input.Feature().v, 16) * Align(input.Batch().v, 16);
+            break;
+        default: break;
+    }
+    return totalSize;
+}
+
 ActivationKernelOpt::Parent::DispatchData ActivationKernelOpt::SetDefault(const activation_params& params) const {
    auto dispatchData = Parent::SetDefault(params);

-    const auto totalSize = params.inputs[0].LogicalSize();
-
-    dispatchData.gws = { totalSize / NUM_COLS_WI, 1, 1 };
+    dispatchData.gws = { GetTotalSize(params) / NUM_COLS_WI, 1, 1 };
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);

    return dispatchData;
@ -50,17 +73,13 @@ bool ActivationKernelOpt::Validate(const Params& p, const optional_params& o) co

    const activation_params& params = static_cast<const activation_params&>(p);

-    if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && params.output.Feature().v % 16 != 0)
-        return false;
-
-    const auto totalSize = params.inputs[0].LogicalSize();
+    const auto totalSize = GetTotalSize(params);
    if ((totalSize % NUM_COLS_WI) != 0 ||
        (params.inputs[0].GetFirstElementOffset() % NUM_COLS_WI) != 0 ||
        (params.output.GetFirstElementOffset() % NUM_COLS_WI) != 0) {
        return false;
    }

-
    if (params.output.GetLayout() != params.inputs[0].GetLayout())
        return false;

--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
@ -5,7 +5,6 @@
 #include "permute_kernel_ref.h"
 #include "kernel_selector_utils.h"
 #include <string>
-
 namespace kernel_selector {
 ParamsKey PermuteKernelRef::GetSupportedKey() const {
    ParamsKey k;
@ -42,20 +41,95 @@ CommonDispatchData PermuteKernelRef::SetDefault(const permute_params& params) co
 bool PermuteKernelRef::Validate(const Params& p, const optional_params& o) const {
    if (!Parent::Validate(p, o)) return false;

-    const permute_params& params = static_cast<const permute_params&>(p);
+    return true;
+}

-    // currently reorder fusing is supported only for format change, not the layout change
-    if (DataTensor::ChannelsCount(params.inputs[0].GetLayout())
-        != DataTensor::ChannelsCount(params.output.GetLayout())) {
-        return false;
+static void GetOrderVector(std::string s, std::vector<std::string>* res) {
+    size_t pos_start = 0, pos_end;
+    std::string token;
+    while ((pos_end = s.find(",", pos_start)) != std::string::npos) {
+        token = s.substr(pos_start, pos_end - pos_start);
+        pos_start = pos_end + 1;
+        res->push_back(token);
    }

-    return true;
+    res->push_back(s.substr(pos_start));
+    return;
+}
+
+static std::string GetReorderedOutputOrder(const permute_params& params, const std::vector<std::string>& permute_out_idx,
+                                            const std::pair<size_t, size_t>& dim_change) {
+    std::map<std::string, std::string> size_str_map = {
+        {"b", "INPUT0_BATCH_NUM"},
+        {"f", "INPUT0_FEATURE_NUM"},
+        {"w", "INPUT0_SIZE_W"},
+        {"z", "INPUT0_SIZE_Z"},
+        {"y", "INPUT0_SIZE_Y"},
+        {"x", "INPUT0_SIZE_X"}
+    };
+
+    int32_t dim_diff = static_cast<int32_t>(dim_change.first) - static_cast<int32_t>(dim_change.second);
+
+    std::string reordered_order = permute_out_idx[0] + "," + permute_out_idx[1] + ",";
+    if (dim_diff > 0) {
+        // dim is shrinked
+        std::vector<std::string> merged_indices;
+        if (dim_diff == 2) merged_indices.push_back(permute_out_idx[dim_change.first - 3]);
+        merged_indices.push_back(permute_out_idx[dim_change.first - 2]);
+        merged_indices.push_back(permute_out_idx[dim_change.first - 1]);
+        std::string pitches = "1";
+        for (size_t i = 0 ; i < merged_indices.size(); ++i) {
+            if (i > 0) reordered_order += "+";
+            reordered_order +=  (merged_indices[i] + "*" + pitches);
+            pitches = size_str_map[merged_indices[i]] + "*" + pitches;
+        }
+        for (size_t i = dim_change.first - 1 - merged_indices.size(); i > 1; --i) {
+            reordered_order += ((", " + permute_out_idx[i]));
+        }
+    } else {
+        // dim is expanded
+        if (dim_change.first == 4 && dim_change.second == 5) {
+            reordered_order += (permute_out_idx.back() + "/" + std::to_string(params.output.Y().v)
+                                 + ", " + permute_out_idx.back() + "%" + std::to_string(params.output.Y().v)
+                                 + ", " + permute_out_idx[2]);
+        } else if (dim_change.first == 4 && dim_change.second == 6) {
+            reordered_order += (permute_out_idx.back() + "/ (" + std::to_string(params.output.Y().v)
+                                 + " * " + std::to_string(params.output.Z().v) + ")"
+                                 + ", " + permute_out_idx.back() + "/" + std::to_string(params.output.Y().v)
+                                 + ", " + permute_out_idx.back() + "%" + std::to_string(params.output.Y().v)
+                                 + ", " + permute_out_idx[2]);
+        } else if (dim_change.first == 5 && dim_change.second == 6) {
+            reordered_order += (permute_out_idx.back() + "/" + std::to_string(params.output.Z().v)
+                                 + ", " + permute_out_idx.back() + "%" + std::to_string(params.output.Z().v)
+                                 + ", " + permute_out_idx[3]
+                                 + ", " + permute_out_idx[2]);
+        }
+    }
+    return reordered_order;
 }
 JitConstants PermuteKernelRef::GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const {
    auto jit = Parent::GetJitConstants(params, dispatchData);
    std::vector<std::string> in_idx;
-    std::vector<std::string> out_idx;
+    std::vector<std::string> permute_out_idx;
+
+    std::map<std::string, std::string> size_str_map = {
+        {"b", "INPUT0_BATCH_NUM"},
+        {"f", "INPUT0_FEATURE_NUM"},
+        {"w", "INPUT0_SIZE_W"},
+        {"z", "INPUT0_SIZE_Z"},
+        {"y", "INPUT0_SIZE_Y"},
+        {"x", "INPUT0_SIZE_X"}
+    };
+
+    std::pair<size_t, size_t> dim_change;
+    bool reorder_to_different_dim = false;
+    std::vector<std::string> reordered_out_idx;
+    if (DataTensor::ChannelsCount(params.inputs[0].GetLayout()) != DataTensor::ChannelsCount(params.output.GetLayout())) {
+        // subsequent reorder to differnt dimension is fused
+        dim_change = {params.inputs[0].GetDims().size(), params.output.GetDims().size()};
+        reorder_to_different_dim = true;
+    }
+
    switch (DataTensor::ChannelsCount(params.inputs[0].GetLayout())) {
        case 6: in_idx = {"b", "f", "x", "y", "z", "w" }; break;
        case 5: in_idx = {"b", "f", "x", "y", "z" }; break;
@ -64,32 +138,46 @@ JitConstants PermuteKernelRef::GetJitConstants(const permute_params& params, con

    assert(params.order.size() == in_idx.size());
    for (auto& o : params.order) {
-        out_idx.push_back(in_idx[o]);
+        permute_out_idx.push_back(in_idx[o]);
    }

    std::string input_order = in_idx[0] + "," + in_idx[1];
-    std::string output_order = out_idx[0] + "," + out_idx[1];

    for (size_t i = in_idx.size() - 1; i > 1; i--) {
        input_order += "," + in_idx[i];
-        output_order += "," + out_idx[i];
    }

    jit.AddConstant(MakeJitConstant("IN_IDX", "INPUT0_GET_INDEX(" + input_order + ")"));
-    jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + output_order + ")"));
+
+    if (reorder_to_different_dim) {
+        auto reordered_order = GetReorderedOutputOrder(params, permute_out_idx, dim_change);
+        jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + reordered_order + ")"));
+        GetOrderVector(reordered_order, &reordered_out_idx);
+    } else {
+        std::string output_order = permute_out_idx[0] + "," + permute_out_idx[1];
+        for (size_t i = in_idx.size() - 1; i > 1; i--) {
+           output_order += "," + permute_out_idx[i];
+        }
+        jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + output_order + ")"));
+    }

    if (!params.fused_ops.empty()) {
-        if (out_idx.size() == 4) {
-            std::swap(out_idx[2], out_idx[3]);
-        } else if (out_idx.size() == 5) {
-            std::swap(out_idx[2], out_idx[4]);
-        } else if (out_idx.size() == 6) {
-            std::swap(out_idx[2], out_idx[5]);
-            std::swap(out_idx[3], out_idx[4]);
+        if (permute_out_idx.size() == 4) {
+            std::swap(permute_out_idx[2], permute_out_idx[3]);
+        } else if (permute_out_idx.size() == 5) {
+            std::swap(permute_out_idx[2], permute_out_idx[4]);
+        } else if (permute_out_idx.size() == 6) {
+            std::swap(permute_out_idx[2], permute_out_idx[5]);
+            std::swap(permute_out_idx[3], permute_out_idx[4]);
        }

-        FusedOpsConfiguration conf = {"", out_idx, "input_var", params.inputs[0].GetDType(), 1};
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+        if (reorder_to_different_dim) {
+            FusedOpsConfiguration conf = {"", reordered_out_idx, "input_var", params.inputs[0].GetDType(), 1};
+            jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+        } else {
+            FusedOpsConfiguration conf = {"", permute_out_idx, "input_var", params.inputs[0].GetDType(), 1};
+            jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+        }
    }
    return jit;
 }
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_tile_8x8_4x4.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_tile_8x8_4x4.cpp
@ -70,19 +70,54 @@ static inline std::vector<std::string> GetFusedOpOrderVector(size_t size) {
    return res;
 }

-static inline std::string GetTiledOutputOrder(size_t size) {
+static inline std::string GetTiledOutputOrder(const permute_params& params) {
+    std::pair<size_t, size_t> dim_change = {params.inputs[0].GetDims().size(), params.output.GetDims().size()};
+
    std::string order_str = "";
-    switch (size) {
-        case 4 :
-            order_str = "b, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
-            break;
-        case 5 :
-            order_str = "b, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
-            break;
-        case 6 :
-            order_str = "b, w, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
-            break;
-        default : throw std::runtime_error("Unsupported combination\n");
+    int32_t dim_diff = static_cast<int32_t>(dim_change.first) - static_cast<int32_t>(dim_change.second);
+
+    if (dim_diff == 0) {
+        switch (dim_change.first) {
+            case 4 :
+                order_str = "b, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
+                break;
+            case 5 :
+                order_str = "b, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
+                break;
+            case 6 :
+                order_str = "b, w, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
+                break;
+            default : throw std::runtime_error("Unsupported combination\n");
+        }
+    } else if (dim_diff > 0) {
+        // dim is shrinked
+        order_str = "b, z + lh, y * INPUT0_SIZE_X + x, f";
+        if (dim_change.first == 5 && dim_change.second == 4) {
+            order_str = "b, z, y * INPUT0_SIZE_X + (x * TILE_SIZE + lh), (f*TILE_SIZE)";
+        } else if (dim_change.first == 6 && dim_change.second == 4) {
+            order_str = "b, w, z * INPUT0_SIZE_Y * INPUT0_SIZE_X +  y * INPUT0_SIZE_X + (x * TILE_SIZE + lh), (f * TILE_SIZE)";
+        } else if (dim_change.first == 6 && dim_change.second == 5) {
+            order_str = "b, w, z * INPUT0_SIZE_Y + y, x * TILE_SIZE + lh, (f * TILE_SIZE)";
+        }
+    } else {
+        // dim is expanded
+        if (dim_change.first == 4 && dim_change.second == 5) {
+            order_str = ("b, y,  (x * TILE_SIZE + lh) / " + std::to_string(params.output.Y().v)
+                                 + ", (x * TILE_SIZE +lh) % " + std::to_string(params.output.Y().v)
+                                 + ", (f * TILE_SIZE)");
+        } else if (dim_change.first == 4 && dim_change.second == 6) {
+            order_str = ("b, y, (x * TILE_SIZE + lh) / (" + std::to_string(params.output.Y().v)
+                                 + " * " + std::to_string(params.output.Z().v) + ")"
+                                 + ", (x * TILE_SIZE + lh) / " + std::to_string(params.output.Y().v)
+                                 + ", (x * TILE_SIZE + lh) % " + std::to_string(params.output.Y().v)
+                                 + ", (f * TILE_SIZE)");
+        } else if (dim_change.first == 5 && dim_change.second == 6) {
+            order_str = ("b, z, y /" + std::to_string(params.output.Z().v)
+                                 + ", y % " + std::to_string(params.output.Z().v)
+                                 + ", (x * TILE_SIZE + lh), (f * TILE_SIZE)");
+        } else {
+            throw std::runtime_error("Unsupported combination\n");
+        }
    }
    return order_str;
 }
@ -104,7 +139,6 @@ static inline std::string GetTiledInputOrder(size_t size) {
    return order_str;
 }

-
 JitConstants PermuteKernel_tile_8x8_4x4::GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const {
    auto jit = Parent::GetJitConstants(params, dispatchData);
    size_t tile_size = GetTileSize(params);
@ -113,7 +147,7 @@ JitConstants PermuteKernel_tile_8x8_4x4::GetJitConstants(const permute_params& p
    uint64_t total_lws = dispatchData.lws[0] * dispatchData.lws[1] * dispatchData.lws[2];
    jit.AddConstant(MakeJitConstant("VEC_WIDTH", vector_width));
    jit.AddConstant(MakeJitConstant("INPUT0_TILED_ORDER", GetTiledInputOrder(params.inputs[0].GetDims().size())));
-    jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(params.output.GetDims().size())));
+    jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(params)));
    jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
    jit.AddConstant(MakeJitConstant("N_VECTORS_IN_TILE", tile_size / vector_width));
    jit.AddConstant(MakeJitConstant("LWS", total_lws));
@ -227,15 +261,6 @@ bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params

    const permute_params& params = static_cast<const permute_params&>(p);

-    if (params.inputs[0].GetDims().size() != params.output.GetDims().size()) {
-        return false;
-    }
-
-    if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
-        // Reorder cannot be fused
-        return false;
-    }
-
    if (!is_rotating_except_batch(params.order)) {
        return false;
    }
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_tile_8x8_4x4_fsv.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_tile_8x8_4x4_fsv.cpp
@ -40,6 +40,9 @@ ParamsKey PermuteKernel_tile_8x8_4x4_fsv::GetSupportedKey() const {
    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
+    k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfzyx);
+    k.EnableOutputLayout(DataLayout::bfwzyx);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
@ -74,6 +77,47 @@ static inline std::string GetTiledOutputOrder(size_t size) {
    return order_str;
 }

+static inline std::string GetReorderedTiledOutputOrder(const permute_params& params) {
+    std::pair<size_t, size_t> dim_change = {params.inputs[0].GetDims().size(), params.output.GetDims().size()};
+
+    std::string order_str = "";
+    int32_t dim_diff = static_cast<int32_t>(dim_change.first) - static_cast<int32_t>(dim_change.second);
+    if (dim_diff == 0) {
+        switch (params.output.GetDims().size()) {
+           case 4 :
+                order_str = "b, y + lh, x, f";
+                break;
+           case 5 :
+                order_str = "b, z + lh, y, x, f";
+                break;
+           default : throw std::runtime_error("Unsupported combination\n");
+        }
+    } else if (dim_diff > 0) {
+        // dim is shrinked (5 -> 4 only)
+        order_str = "b, z + lh, y * INPUT0_SIZE_X + x, f";
+    } else {
+        // dim is expanded
+        if (dim_change.first == 4 && dim_change.second == 5) {
+            order_str = ("b, y + lh, x / " + std::to_string(params.output.Y().v)
+                                 + ", x % " + std::to_string(params.output.Y().v)
+                                 + ", f");
+        } else if (dim_change.first == 4 && dim_change.second == 6) {
+            order_str = ("b, y + lh, x / (" + std::to_string(params.output.Y().v)
+                                 + " * " + std::to_string(params.output.Z().v) + ")"
+                                 + ", x / " + std::to_string(params.output.Y().v)
+                                 + ", x % " + std::to_string(params.output.Y().v)
+                                 + ", f");
+        } else if (dim_change.first == 5 && dim_change.second == 6) {
+            order_str = ("b, z + lh, y /" + std::to_string(params.output.Z().v)
+                                 + ", y % " + std::to_string(params.output.Z().v)
+                                 + ", x, f");
+        } else {
+            throw std::runtime_error("Unsupported combination\n");
+        }
+    }
+    return order_str;
+}
+
 static inline std::string GetTiledInputOrder(size_t size) {
    std::string order_str = "";
    switch (size) {
@ -145,12 +189,17 @@ JitConstants PermuteKernel_tile_8x8_4x4_fsv::GetJitConstants(const permute_param
    const size_t fsv_alignment = GetFsvAlignment(params);

    jit.AddConstant(MakeJitConstant("INPUT0_TILED_ORDER", GetTiledInputOrder(input_ndims)));
-    jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(output_ndims)));
    jit.AddConstant(MakeJitConstant("INPUT0_FEATURE_SLICE_NUM", CeilDiv(f, fsv_alignment)));
    jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
    jit.AddConstant(MakeJitConstant("FSV_ALIGNMENT", fsv_alignment));
    jit.AddConstant(MakeJitConstant("TRANS_BUF_SIZE", tile_size * total_lws));

+    if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
+        jit.AddConstant(MakeJitConstant("REORDERED_OUTPUT_TILED_ORDER", GetReorderedTiledOutputOrder(params)));
+    } else {
+        jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(output_ndims)));
+    }
+
    // whether F is tile_size-aligned
    if (f % tile_size != 0) {
        jit.AddConstant(MakeJitConstant("F_REMAINDER_SIZE", f % tile_size));
@ -172,8 +221,8 @@ JitConstants PermuteKernel_tile_8x8_4x4_fsv::GetJitConstants(const permute_param
    }

    if (!params.fused_ops.empty()) {
-        std::vector<std::string> output_order = GetFusedOpOrderVector(output_ndims);
-        FusedOpsConfiguration conf = {"", output_order, "input_var", params.inputs[0].GetDType(), 1};
+        std::vector<std::string> original_output_order = GetFusedOpOrderVector(input_ndims);
+        FusedOpsConfiguration conf = {"", original_output_order, "input_var", params.inputs[0].GetDType(), 1};
        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
    }
    return jit;
@ -258,11 +307,26 @@ bool PermuteKernel_tile_8x8_4x4_fsv::Validate(const Params& p, const optional_pa
    };

    const permute_params& params = static_cast<const permute_params&>(p);
-
-    if (params.inputs[0].GetDims().size() != params.output.GetDims().size()) {
-        return false;
+    // blocked format => blocked format is not supported
+    if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
+        if ((params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4) ||
+            (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16) ||
+            (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32)) {
+            if (params.output.GetLayout() != DataLayout::bfyx
+                && params.output.GetLayout() != DataLayout::bfzyx
+                && params.output.GetLayout() != DataLayout::bfwzyx)
+                return false;
+        } else if ((params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16) ||
+                   (params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32)) {
+            if (params.output.GetLayout() != DataLayout::bfyx
+                && params.output.GetLayout() != DataLayout::bfzyx
+                && params.output.GetLayout() != DataLayout::bfwzyx) {
+                return false;
+            }
+        } else {
+            return false;
+        }
    }
-
    if (!is_rotating_except_batch(params.order)) {
        return false;
    }
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/permute_ref.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/permute_ref.cl
@ -14,9 +14,9 @@ KERNEL (permute_ref)(
 {
    //gws(x, y * z * w, b*f)
    const uint gid_0 = get_global_id(1);
-#if INPUT0_DIMS == 4 && OUTPUT0_DIMS == 4
+#if INPUT0_DIMS == 4
    const uint y = gid_0;
-#elif INPUT0_DIMS == 5 && OUTPUT0_DIMS == 5
+#elif INPUT0_DIMS == 5
    const uint z = gid_0 / INPUT0_SIZE_Y;
    const uint y = gid_0 % INPUT0_SIZE_Y;   
 #else
@ -24,11 +24,11 @@ KERNEL (permute_ref)(
    const uint z = gid_0 / INPUT0_SIZE_Y % INPUT0_SIZE_Z;
    const uint y = gid_0 % INPUT0_SIZE_Y;
 #endif
-    
+
    const uint x = get_global_id(0);
    const uint f = (uint)get_global_id(2) % INPUT0_FEATURE_NUM;
    const uint b = (uint)get_global_id(2) / INPUT0_FEATURE_NUM;
-    
+
    INPUT0_TYPE input_var = input[IN_IDX];

 #if HAS_FUSED_OPS
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/permute_tile_8x8_4x4_fsv.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/permute_tile_8x8_4x4_fsv.cl
@ -18,6 +18,8 @@
 #define VLOAD CAT(vload, TILE_SIZE)
 #define VSTORE CAT(vstore, TILE_SIZE)
 #define AS_INPUTVTYPE CAT(as_, INPUTVTYPE)
+#define AS_OUTPUTVTYPE CAT(as_, OUTPUTVTYPE)
+#define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE)

 #define GET_GLOBAL_ID(IDX) ((uint)get_global_id(IDX))
 #define GET_LOCAL_ID(IDX) ((uint)get_local_id(IDX))
@ -51,6 +53,49 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
                    + GET_LOCAL_ID(2);
    const uint local_buf_offset = local_id * TILE_SIZE;

+#ifdef REORDERED_OUTPUT_TILED_ORDER
+    if (F_NO_REMAINDER_CONDITION) {
+        unroll_for (uint lh = 0; lh < TILE_SIZE/*8*/; ++lh) {
+            // read
+            const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER);
+            INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx));
+            // write to ddr
+          #if HAS_FUSED_OPS
+            OUTPUTVTYPE out_data;
+            unroll_for (uint lw = 0; lw < TILE_SIZE; ++lw) {
+                INPUT0_TYPE input_var = read_data[lw];
+                FUSED_OPS;
+                out_data[lw] = FUSED_OPS_RESULT;
+            }
+            const uint output_idx = OUTPUT_GET_TILED_INDEX(REORDERED_OUTPUT_TILED_ORDER);
+            VSTORE(out_data, 0, output + output_idx);
+          #else
+            const uint output_idx = OUTPUT_GET_TILED_INDEX(REORDERED_OUTPUT_TILED_ORDER);
+            VSTORE(ACTIVATION(TO_OUTPUTVTYPE(read_data), ACTIVATION_PARAMS), 0, output + output_idx);
+          #endif
+        }
+    }
+#ifdef F_REMAINDER_CONDITION
+    else if (F_REMAINDER_CONDITION) {
+        unroll_for (uint lh = 0; lh < TILE_SIZE/*8*/; ++lh) {
+            unroll_for (uint lw = 0; lw < F_REMAINDER_SIZE; ++lw) {
+                // read
+                const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER);
+                INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx));
+                // write to ddr
+                const uint output_idx = OUTPUT_GET_TILED_INDEX(REORDERED_OUTPUT_TILED_ORDER);
+              #if HAS_FUSED_OPS
+                INPUT0_TYPE input_var = read_data[lw];
+                FUSED_OPS;
+                output[output_idx + lw] = FUSED_OPS_RESULT;
+              #else
+                output[output_idx + lw] = TO_OUTPUT_TYPE(read_data[lw]);
+              #endif
+            }
+        }
+    }
+#endif // F_REMAINDER_CONDITION
+#else // !REORDERED_OUTPUT_TILED_ORDER
    if (F_NO_REMAINDER_CONDITION) {
        // read and transpose
        unroll_for (uint lh = 0; lh < TILE_SIZE; ++lh) {
@ -59,16 +104,15 @@ KERNEL (permute_tile_8x8_4x4_fsv)(

            unroll_for (uint lw = 0; lw < TILE_SIZE; ++lw) {
                const uint dst = local_buf_offset + lw;
-#if HAS_FUSED_OPS
+          #if HAS_FUSED_OPS
                INPUT0_TYPE input_var = read_data[lw];
                FUSED_OPS;
                transpose_buf[dst][lh] = FUSED_OPS_RESULT;
-#else
+          #else
                transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS);
-#endif
+          #endif
            }
        }
-        // write to ddr
 #ifdef YZ_REMAINDER_CONDITION
        if (YZ_REMAINDER_LESS_THAN_TILE_SIZE) {
            // copy one by one when z % TILE_SIZE < TILE_SIZE/2
@ -94,12 +138,13 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
                VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
            }
        }
-#else
+#else // YZ_REMAINDER_CONDITION
+        // write to ddr
        unroll_for (uint lw = 0; lw < TILE_SIZE; ++lw) {
            const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER);
            VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
        }
-#endif
+#endif //YZ_REMAINDER_CONDITION
    }
 #ifdef F_REMAINDER_CONDITION
    else if (F_REMAINDER_CONDITION) {
@ -109,13 +154,13 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
            INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx));
            unroll_for (uint lw = 0; lw < F_REMAINDER_SIZE; ++lw) {
                uint dst = local_buf_offset + lw;
-    #if HAS_FUSED_OPS
-                    INPUT0_TYPE input_var = read_data[lw];
-                    FUSED_OPS;
-                    transpose_buf[dst][lh] = FUSED_OPS_RESULT;
-    #else
-                    transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS);
-    #endif
+        #if HAS_FUSED_OPS
+                INPUT0_TYPE input_var = read_data[lw];
+                FUSED_OPS;
+                transpose_buf[dst][lh] = FUSED_OPS_RESULT;
+        #else
+                transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS);
+        #endif
            }
        }
        // write to ddr
@ -145,12 +190,30 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
                VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
            }
        }
-#else
+#else //  !YZ_REMAINDER_CONDITION
        unroll_for (uint lw = 0; lw < F_REMAINDER_SIZE; ++lw) {
            const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER);
            VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
        }
-#endif
+#endif // YZ_REMAINDER_CONDITION
    }
-#endif
+#endif // F_REMAINDER_CONDITION
+#endif // REORDERED_OUTPUT)TILED_ORDER
 }
+
+#undef unroll_for
+#undef CEIL_DIV(A, B)
+#undef INPUT0_GET_TILED_INDEX(ORDER)
+#undef OUTPUT_GET_TILED_INDEX(ORDER)
+#undef YZ_REMAINDER_LESS_THAN_TILE_SIZE
+#undef YZ_REMAINDER_MORE_THAN_TILE_SIZE
+#undef INPUTVTYPE
+#undef OUTPUTVTYPE
+#undef VLOAD
+#undef VSTORE
+#undef AS_INPUTVTYPE
+#undef AS_OUTPUTVTYPE
+#undef TO_OUTPUTVTYPE
+#undef GET_GLOBAL_ID(IDX)
+#undef GET_LOCAL_ID(IDX)
+#undef GET_LOCAL_SIZE(IDX)
--- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
+++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
@ -236,7 +236,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
         fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_zyx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
        return true;

-    if (prev.is_type<permute>() && fmt_prev.dimension() == fmt_next.dimension()) {
+    if (prev.is_type<permute>()) {
        return true;
    }
    return false;
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
@ -157,7 +157,6 @@ public:
    void compare(network& not_fused, network& fused, T& p, bool count_reorder = false) {
        auto outputs_ref = not_fused.execute();
        auto outputs_fused = fused.execute();
-
        auto get_reorders_count = [](network& net) -> size_t {
            size_t count = 0;
            for (auto& pi : net.get_primitives_info()) {
@ -187,7 +186,6 @@ public:
            description << "  " << i.original_id << " " << i.kernel_id << std::endl;
        }
        SCOPED_TRACE(description.str());
-
        // Subtract reorders count to handle execution in different layouts when input/output reorders can be added in the graph
        ASSERT_EQ(fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_fused), p.expected_fused_primitives);
        ASSERT_EQ(not_fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_not_fused), p.expected_not_fused_primitives);
@ -6409,7 +6407,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv,

 struct permute_reorder_params {
    tensor in_shape;
-    tensor out_shape;
    std::vector<uint16_t> permute_order1;
    std::vector<uint16_t> permute_order2;
    data_types permute_type;
@ -6420,20 +6417,47 @@ struct permute_reorder_params {
    size_t expected_not_fused_primitives;
 };

-#define CASE_PERMUTE_REORDER_F32_0 {1, 16, 32, 2},    {1, 16, 32, 2},    {0, 3, 2, 1},    {0, 3, 2, 1},    data_types::f32, data_types::f32, format::b_fs_yx_fsv16,  format::bfyx
-#define CASE_PERMUTE_REORDER_F32_1 {2, 16, 16, 16},   {2, 16, 16, 16},   {0, 3, 2, 1},    {0, 3, 2, 1},    data_types::f32, data_types::f32, format::b_fs_yx_fsv4,   format::bfyx
-#define CASE_PERMUTE_REORDER_F32_2 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
+#define CASE_PERMUTE_REORDER_F32_0 {1, 16, 32, 2},   {0, 3, 2, 1},    {0, 3, 2, 1},    data_types::f32, data_types::f32, format::b_fs_yx_fsv16,  format::bfyx
+#define CASE_PERMUTE_REORDER_F32_1 {2, 7, 9, 27},  {0, 3, 2, 1},    {0, 3, 2, 1},    data_types::f32, data_types::f32, format::b_fs_yx_fsv4,   format::bfyx
+#define CASE_PERMUTE_REORDER_F32_2 {1, 16, 4, 5, 16},{0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
+#define CASE_PERMUTE_REORDER_F16_0 {1, 16, 2, 4},     {0, 2, 1, 3},    {0, 2, 1, 3},    data_types::f16, data_types::f16, format::b_fs_yx_fsv16,  format::bfyx
+#define CASE_PERMUTE_REORDER_F16_1 {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
+#define CASE_PERMUTE_REORDER_F16_2 {1, 5, 1, 2, 14},  {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx

-#define CASE_PERMUTE_REORDER_F16_0 {1, 16, 2, 4},     {1, 16, 2, 4},     {0, 2, 1, 3},    {0, 2, 1, 3},    data_types::f16, data_types::f16, format::b_fs_yx_fsv16,  format::bfyx
-#define CASE_PERMUTE_REORDER_F16_1 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
-#define CASE_PERMUTE_REORDER_F16_2 {1, 5, 1, 2, 14},  {1, 5, 1, 2, 14},  {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
+// type change
+#define CASE_PERMUTE_REORDER_S8_TO_F32_0 {1, 15, 4, 5},    {0, 2, 3, 1},    {0, 3, 1, 2},    data_types::i8, data_types::f32, format::b_fs_yx_fsv4,   format::bfyx
+#define CASE_PERMUTE_REORDER_S8_TO_F32_1 {1, 2, 15, 4, 5}, {0, 2, 4, 1, 3}, {0, 3, 1, 4, 2}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
+#define CASE_PERMUTE_REORDER_F32_TO_F16_0 {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
+#define CASE_PERMUTE_REORDER_U8_TO_F16_0 {1, 17, 1, 2, 7},  {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx

-#define CASE_PERMUTE_REORDER_S8_0 {1, 15, 4, 5},    {1, 15, 4, 5},    {0, 2, 3, 1},    {0, 3, 1, 2},    data_types::i8, data_types::f32, format::b_fs_yx_fsv4,   format::bfyx
-#define CASE_PERMUTE_REORDER_S8_1 {1, 2, 15, 4, 5}, {1, 2, 15, 4, 5}, {0, 2, 4, 1, 3}, {0, 3, 1, 4, 2}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
+// dim change
+#define CASE_PERMUTE_REORDER_4D_TO_5D_F32_0 {1, 16, 8, 16}, {1, 2, 0, 3}, {0, 3, 1, 4, 2}, data_types::f32, data_types::f32, format::bfyx, format::bfzyx
+#define CASE_PERMUTE_REORDER_4D_TO_6D_F32_1 {1, 16, 8, 16}, {0, 2, 3, 1}, {0, 3, 5, 4, 1, 2}, data_types::f32, data_types::f32, format::bfyx, format::bfwzyx
+#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_0 {1, 16, 4, 5, 18},{0, 2, 3, 4, 1}, {0, 3, 1, 2}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
+#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_1 {1, 16, 4, 5, 16},{0, 2, 3, 4, 1}, {0, 3, 1, 2}, data_types::f32, data_types::f32, format::bfzyx, format::bfyx
+#define CASE_PERMUTE_REORDER_5D_TO_6D_F32_2 {1, 16, 8, 4, 16}, {0, 4, 2, 3, 1}, {0, 3, 5, 4, 1, 2}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
+#define CASE_PERMUTE_REORDER_6D_TO_4D_F32_0 {1, 16, 4, 5, 4, 16}, {0, 2, 5, 3, 4, 1}, {0, 3, 1, 2}, data_types::f32, data_types::f32, format::bfwzyx, format::bfyx
+#define CASE_PERMUTE_REORDER_6D_TO_5D_F32_1 {1, 16, 4, 5, 4, 16}, {0, 2, 5, 3, 4, 1}, {0, 3, 4, 1, 2}, data_types::f32, data_types::f32, format::bfwzyx, format::bfzyx

-#define CASE_PERMUTE_REORDER_F32_TO_F16_0 {1, 5, 1, 2, 14},  {1, 5, 1, 2, 14},  {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
-#define CASE_PERMUTE_REORDER_S8_TO_F32_0 {1, 18, 1, 2, 2},  {1, 5, 1, 2, 14},  {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
-#define CASE_PERMUTE_REORDER_U8_TO_F16_0 {1, 17, 1, 2, 7},  {1, 5, 1, 2, 14},  {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
+// permute_opt for blocked format
+#define CASE_PERMUTE_REORDER_TILED_F32_0 {1, 256, 2, 64}, {0, 3, 1, 2}, {0, 2, 3, 1},  data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
+#define CASE_PERMUTE_REORDER_TILED_F32_1 {1, 78, 2, 259}, {0, 3, 1, 2}, {0, 2, 3, 1},  data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
+#define CASE_PERMUTE_REORDER_TILED_F32_2 {1, 48, 1, 3, 259}, {0, 4, 1, 2, 3}, {0, 2, 3, 4, 1},  data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
+
+// permute_opt for blocked format => reorder to differnt dim
+#define CASE_PERMUTE_REORDER_TILED_F32_3 {1, 45, 1, 3, 259}, {0, 4, 1, 2, 3}, {0, 2, 3, 1},  data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
+
+// permute opt for blocked format => reorder to different dim/type
+#define CASE_PERMUTE_REORDER_TILED_I8_4 {1, 45, 1, 3, 259}, {0, 4, 1, 2, 3}, {0, 2, 3, 1},  data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
+#define CASE_PERMUTE_REORDER_TILED_F16_5 {1, 48, 3, 256}, {0, 3, 1, 2}, {0, 2, 4, 3, 1},  data_types::f16, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx
+#define CASE_PERMUTE_REORDER_TILED_F16_6 {1, 48, 2, 3, 256}, {0, 4, 1, 2, 3}, {0, 2, 5, 4, 3, 1},  data_types::f16, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
+
+// permute opt for non_blocked format => reorder to differnt dim/type
+#define CASE_PERMUTE_REORDER_TILED_F16_7 {1, 48, 2, 3, 256}, {0, 4, 1, 2, 3}, {0, 2, 3, 1},  data_types::f16, data_types::f32, format::bfzyx, format::bfyx
+#define CASE_PERMUTE_REORDER_TILED_F16_8 {1, 28, 2, 2, 3, 256}, {0, 5, 1, 2, 3, 4}, {0, 2, 3, 1},  data_types::f16, data_types::f32, format::bfwzyx, format::bfyx
+#define CASE_PERMUTE_REORDER_TILED_F16_9 {1, 24, 2, 3, 256}, {0, 4, 1, 2, 3}, {0, 2, 3, 1},  data_types::f16, data_types::f32, format::bfzyx, format::bfyx
+#define CASE_PERMUTE_REORDER_TILED_F16_10 {1, 35, 3, 253}, {0, 3, 1, 2}, {0, 2, 4, 3, 1},  data_types::f16, data_types::f32, format::bfyx, format::bfzyx
+#define CASE_PERMUTE_REORDER_TILED_F16_11 {1, 32, 3, 253}, {0, 3, 1, 2}, {0, 2, 4, 5, 3, 1},  data_types::f16, data_types::f32, format::bfyx, format::bfwzyx

 class PermuteReorderFusingTest : public ::BaseFusingTest<permute_reorder_params> {
 public:
@ -6460,7 +6484,7 @@ TEST_P(permute_redundant_reorder, basic) {
        input_layout("input", get_input_layout(p)),
        permute("permute1", "input",    p.permute_order1),
        reorder("reorder1", "permute1", p.output_format, p.output_type), // to be fused
-        permute("permute2", "reorder1", p.permute_order2)                // dummy last op to make reorder n
+        permute("permute2", "reorder1", p.permute_order2)                  // dummy last op to make reorder fused
    );
    tolerance = 1e-5f;
    execute(p);
@ -6474,11 +6498,72 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_redundant_reorder,
                                permute_reorder_params{CASE_PERMUTE_REORDER_F16_0, 3, 4},
                                permute_reorder_params{CASE_PERMUTE_REORDER_F16_1, 3, 4},
                                permute_reorder_params{CASE_PERMUTE_REORDER_F16_2, 3, 4},
-                                permute_reorder_params{CASE_PERMUTE_REORDER_S8_0, 3, 4},
-                                permute_reorder_params{CASE_PERMUTE_REORDER_S8_1, 3, 4},
-                                permute_reorder_params{CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4},
                                permute_reorder_params{CASE_PERMUTE_REORDER_S8_TO_F32_0, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_S8_TO_F32_1, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4},
                                permute_reorder_params{CASE_PERMUTE_REORDER_U8_TO_F16_0, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 3},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 3},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 3},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 3},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 3},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_0, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_1, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_2, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_3, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_I8_4, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_5, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_6, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_7, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_8, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_9, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_10, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_11, 3, 4},
+                                }),);
+
+class permute_act_reorder : public PermuteReorderFusingTest {};
+
+TEST_P(permute_act_reorder, basic) {
+    auto p = GetParam();
+    create_topologies(
+        input_layout("input", get_input_layout(p)),
+        permute("permute1", "input",    p.permute_order1),
+        activation("activation", "permute1", activation_func::abs),
+        reorder("reorder1", "activation", p.output_format, p.output_type),    // to be fused
+        permute("permute2", "reorder1", p.permute_order2)                // dummy last op to make reorder fused
+    );
+    tolerance = 1e-5f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_act_reorder,
+                            ::testing::ValuesIn(std::vector<permute_reorder_params> {
+                                permute_reorder_params{CASE_PERMUTE_REORDER_F32_0, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_F32_1, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_F32_2, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_F16_0, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_F16_1, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_F16_2, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 4},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_0, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_1, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_2, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_3, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_5, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_6, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_7, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_8, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_9, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5},
+                                permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5},
                                }),);

 class NormalizeFusingTest : public ::BaseFusingTest<normalize_test_params> {