[IE CLDNN] Enabled fusing all types of reorders to permute (#5101)

* [IE CLDNN] Enabled fusing all types of reorders to permute
- Fusing reorder to differnt dims (e.g., 4D=>5D, 6D=>4D, etc)
- Fusing reorder to permute_opt kernel for blocked formats
- Fixed bug for activation_opt kernel for blocked formats

* [IE CLDNN] Enabled fusing all types of reorders to permute
Refactoring for permute fsv opt kernel

* [IE CLDNN] Permute reorder fusing
Support reordering to different dims from optimized permute kernel for blocked format
This commit is contained in:
Taylor Yeonbok Lee 2021-04-21 23:50:53 +09:00 committed by GitHub
parent ad6a0e7e6d
commit 6c46f26a3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 442 additions and 98 deletions

View File

@ -27,12 +27,35 @@ ParamsKey ActivationKernelOpt::GetSupportedKey() const {
return k;
}
static size_t GetTotalSize(const activation_params& params) {
const auto input = params.inputs[0];
size_t totalSize = input.LogicalSize();
switch (params.inputs[0].GetLayout()) {
case DataLayout::b_fs_yx_fsv4:
totalSize = (totalSize / input.Feature().v) * Align(input.Feature().v, 4);
break;
case DataLayout::b_fs_yx_fsv16:
case DataLayout::b_fs_zyx_fsv16:
totalSize = (totalSize / input.Feature().v) * Align(input.Feature().v, 16);
break;
case DataLayout::b_fs_yx_fsv32:
case DataLayout::b_fs_zyx_fsv32:
case DataLayout::fs_b_yx_fsv32:
totalSize = (totalSize / input.Feature().v) * Align(input.Feature().v, 32);
break;
case DataLayout::bs_fs_zyx_bsv16_fsv16:
case DataLayout::bs_fs_yx_bsv16_fsv16:
totalSize = (totalSize / (input.Feature().v * input.Batch().v)) * Align(input.Feature().v, 16) * Align(input.Batch().v, 16);
break;
default: break;
}
return totalSize;
}
ActivationKernelOpt::Parent::DispatchData ActivationKernelOpt::SetDefault(const activation_params& params) const {
auto dispatchData = Parent::SetDefault(params);
const auto totalSize = params.inputs[0].LogicalSize();
dispatchData.gws = { totalSize / NUM_COLS_WI, 1, 1 };
dispatchData.gws = { GetTotalSize(params) / NUM_COLS_WI, 1, 1 };
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
return dispatchData;
@ -50,17 +73,13 @@ bool ActivationKernelOpt::Validate(const Params& p, const optional_params& o) co
const activation_params& params = static_cast<const activation_params&>(p);
if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && params.output.Feature().v % 16 != 0)
return false;
const auto totalSize = params.inputs[0].LogicalSize();
const auto totalSize = GetTotalSize(params);
if ((totalSize % NUM_COLS_WI) != 0 ||
(params.inputs[0].GetFirstElementOffset() % NUM_COLS_WI) != 0 ||
(params.output.GetFirstElementOffset() % NUM_COLS_WI) != 0) {
return false;
}
if (params.output.GetLayout() != params.inputs[0].GetLayout())
return false;

View File

@ -5,7 +5,6 @@
#include "permute_kernel_ref.h"
#include "kernel_selector_utils.h"
#include <string>
namespace kernel_selector {
ParamsKey PermuteKernelRef::GetSupportedKey() const {
ParamsKey k;
@ -42,20 +41,95 @@ CommonDispatchData PermuteKernelRef::SetDefault(const permute_params& params) co
bool PermuteKernelRef::Validate(const Params& p, const optional_params& o) const {
if (!Parent::Validate(p, o)) return false;
const permute_params& params = static_cast<const permute_params&>(p);
return true;
}
// currently reorder fusing is supported only for format change, not the layout change
if (DataTensor::ChannelsCount(params.inputs[0].GetLayout())
!= DataTensor::ChannelsCount(params.output.GetLayout())) {
return false;
static void GetOrderVector(std::string s, std::vector<std::string>* res) {
size_t pos_start = 0, pos_end;
std::string token;
while ((pos_end = s.find(",", pos_start)) != std::string::npos) {
token = s.substr(pos_start, pos_end - pos_start);
pos_start = pos_end + 1;
res->push_back(token);
}
return true;
res->push_back(s.substr(pos_start));
return;
}
static std::string GetReorderedOutputOrder(const permute_params& params, const std::vector<std::string>& permute_out_idx,
const std::pair<size_t, size_t>& dim_change) {
std::map<std::string, std::string> size_str_map = {
{"b", "INPUT0_BATCH_NUM"},
{"f", "INPUT0_FEATURE_NUM"},
{"w", "INPUT0_SIZE_W"},
{"z", "INPUT0_SIZE_Z"},
{"y", "INPUT0_SIZE_Y"},
{"x", "INPUT0_SIZE_X"}
};
int32_t dim_diff = static_cast<int32_t>(dim_change.first) - static_cast<int32_t>(dim_change.second);
std::string reordered_order = permute_out_idx[0] + "," + permute_out_idx[1] + ",";
if (dim_diff > 0) {
// dim is shrinked
std::vector<std::string> merged_indices;
if (dim_diff == 2) merged_indices.push_back(permute_out_idx[dim_change.first - 3]);
merged_indices.push_back(permute_out_idx[dim_change.first - 2]);
merged_indices.push_back(permute_out_idx[dim_change.first - 1]);
std::string pitches = "1";
for (size_t i = 0 ; i < merged_indices.size(); ++i) {
if (i > 0) reordered_order += "+";
reordered_order += (merged_indices[i] + "*" + pitches);
pitches = size_str_map[merged_indices[i]] + "*" + pitches;
}
for (size_t i = dim_change.first - 1 - merged_indices.size(); i > 1; --i) {
reordered_order += ((", " + permute_out_idx[i]));
}
} else {
// dim is expanded
if (dim_change.first == 4 && dim_change.second == 5) {
reordered_order += (permute_out_idx.back() + "/" + std::to_string(params.output.Y().v)
+ ", " + permute_out_idx.back() + "%" + std::to_string(params.output.Y().v)
+ ", " + permute_out_idx[2]);
} else if (dim_change.first == 4 && dim_change.second == 6) {
reordered_order += (permute_out_idx.back() + "/ (" + std::to_string(params.output.Y().v)
+ " * " + std::to_string(params.output.Z().v) + ")"
+ ", " + permute_out_idx.back() + "/" + std::to_string(params.output.Y().v)
+ ", " + permute_out_idx.back() + "%" + std::to_string(params.output.Y().v)
+ ", " + permute_out_idx[2]);
} else if (dim_change.first == 5 && dim_change.second == 6) {
reordered_order += (permute_out_idx.back() + "/" + std::to_string(params.output.Z().v)
+ ", " + permute_out_idx.back() + "%" + std::to_string(params.output.Z().v)
+ ", " + permute_out_idx[3]
+ ", " + permute_out_idx[2]);
}
}
return reordered_order;
}
JitConstants PermuteKernelRef::GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const {
auto jit = Parent::GetJitConstants(params, dispatchData);
std::vector<std::string> in_idx;
std::vector<std::string> out_idx;
std::vector<std::string> permute_out_idx;
std::map<std::string, std::string> size_str_map = {
{"b", "INPUT0_BATCH_NUM"},
{"f", "INPUT0_FEATURE_NUM"},
{"w", "INPUT0_SIZE_W"},
{"z", "INPUT0_SIZE_Z"},
{"y", "INPUT0_SIZE_Y"},
{"x", "INPUT0_SIZE_X"}
};
std::pair<size_t, size_t> dim_change;
bool reorder_to_different_dim = false;
std::vector<std::string> reordered_out_idx;
if (DataTensor::ChannelsCount(params.inputs[0].GetLayout()) != DataTensor::ChannelsCount(params.output.GetLayout())) {
// subsequent reorder to differnt dimension is fused
dim_change = {params.inputs[0].GetDims().size(), params.output.GetDims().size()};
reorder_to_different_dim = true;
}
switch (DataTensor::ChannelsCount(params.inputs[0].GetLayout())) {
case 6: in_idx = {"b", "f", "x", "y", "z", "w" }; break;
case 5: in_idx = {"b", "f", "x", "y", "z" }; break;
@ -64,32 +138,46 @@ JitConstants PermuteKernelRef::GetJitConstants(const permute_params& params, con
assert(params.order.size() == in_idx.size());
for (auto& o : params.order) {
out_idx.push_back(in_idx[o]);
permute_out_idx.push_back(in_idx[o]);
}
std::string input_order = in_idx[0] + "," + in_idx[1];
std::string output_order = out_idx[0] + "," + out_idx[1];
for (size_t i = in_idx.size() - 1; i > 1; i--) {
input_order += "," + in_idx[i];
output_order += "," + out_idx[i];
}
jit.AddConstant(MakeJitConstant("IN_IDX", "INPUT0_GET_INDEX(" + input_order + ")"));
jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + output_order + ")"));
if (reorder_to_different_dim) {
auto reordered_order = GetReorderedOutputOrder(params, permute_out_idx, dim_change);
jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + reordered_order + ")"));
GetOrderVector(reordered_order, &reordered_out_idx);
} else {
std::string output_order = permute_out_idx[0] + "," + permute_out_idx[1];
for (size_t i = in_idx.size() - 1; i > 1; i--) {
output_order += "," + permute_out_idx[i];
}
jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + output_order + ")"));
}
if (!params.fused_ops.empty()) {
if (out_idx.size() == 4) {
std::swap(out_idx[2], out_idx[3]);
} else if (out_idx.size() == 5) {
std::swap(out_idx[2], out_idx[4]);
} else if (out_idx.size() == 6) {
std::swap(out_idx[2], out_idx[5]);
std::swap(out_idx[3], out_idx[4]);
if (permute_out_idx.size() == 4) {
std::swap(permute_out_idx[2], permute_out_idx[3]);
} else if (permute_out_idx.size() == 5) {
std::swap(permute_out_idx[2], permute_out_idx[4]);
} else if (permute_out_idx.size() == 6) {
std::swap(permute_out_idx[2], permute_out_idx[5]);
std::swap(permute_out_idx[3], permute_out_idx[4]);
}
FusedOpsConfiguration conf = {"", out_idx, "input_var", params.inputs[0].GetDType(), 1};
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
if (reorder_to_different_dim) {
FusedOpsConfiguration conf = {"", reordered_out_idx, "input_var", params.inputs[0].GetDType(), 1};
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
} else {
FusedOpsConfiguration conf = {"", permute_out_idx, "input_var", params.inputs[0].GetDType(), 1};
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
}
}
return jit;
}

View File

@ -70,19 +70,54 @@ static inline std::vector<std::string> GetFusedOpOrderVector(size_t size) {
return res;
}
static inline std::string GetTiledOutputOrder(size_t size) {
static inline std::string GetTiledOutputOrder(const permute_params& params) {
std::pair<size_t, size_t> dim_change = {params.inputs[0].GetDims().size(), params.output.GetDims().size()};
std::string order_str = "";
switch (size) {
case 4 :
order_str = "b, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
break;
case 5 :
order_str = "b, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
break;
case 6 :
order_str = "b, w, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
break;
default : throw std::runtime_error("Unsupported combination\n");
int32_t dim_diff = static_cast<int32_t>(dim_change.first) - static_cast<int32_t>(dim_change.second);
if (dim_diff == 0) {
switch (dim_change.first) {
case 4 :
order_str = "b, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
break;
case 5 :
order_str = "b, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
break;
case 6 :
order_str = "b, w, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
break;
default : throw std::runtime_error("Unsupported combination\n");
}
} else if (dim_diff > 0) {
// dim is shrinked
order_str = "b, z + lh, y * INPUT0_SIZE_X + x, f";
if (dim_change.first == 5 && dim_change.second == 4) {
order_str = "b, z, y * INPUT0_SIZE_X + (x * TILE_SIZE + lh), (f*TILE_SIZE)";
} else if (dim_change.first == 6 && dim_change.second == 4) {
order_str = "b, w, z * INPUT0_SIZE_Y * INPUT0_SIZE_X + y * INPUT0_SIZE_X + (x * TILE_SIZE + lh), (f * TILE_SIZE)";
} else if (dim_change.first == 6 && dim_change.second == 5) {
order_str = "b, w, z * INPUT0_SIZE_Y + y, x * TILE_SIZE + lh, (f * TILE_SIZE)";
}
} else {
// dim is expanded
if (dim_change.first == 4 && dim_change.second == 5) {
order_str = ("b, y, (x * TILE_SIZE + lh) / " + std::to_string(params.output.Y().v)
+ ", (x * TILE_SIZE +lh) % " + std::to_string(params.output.Y().v)
+ ", (f * TILE_SIZE)");
} else if (dim_change.first == 4 && dim_change.second == 6) {
order_str = ("b, y, (x * TILE_SIZE + lh) / (" + std::to_string(params.output.Y().v)
+ " * " + std::to_string(params.output.Z().v) + ")"
+ ", (x * TILE_SIZE + lh) / " + std::to_string(params.output.Y().v)
+ ", (x * TILE_SIZE + lh) % " + std::to_string(params.output.Y().v)
+ ", (f * TILE_SIZE)");
} else if (dim_change.first == 5 && dim_change.second == 6) {
order_str = ("b, z, y /" + std::to_string(params.output.Z().v)
+ ", y % " + std::to_string(params.output.Z().v)
+ ", (x * TILE_SIZE + lh), (f * TILE_SIZE)");
} else {
throw std::runtime_error("Unsupported combination\n");
}
}
return order_str;
}
@ -104,7 +139,6 @@ static inline std::string GetTiledInputOrder(size_t size) {
return order_str;
}
JitConstants PermuteKernel_tile_8x8_4x4::GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const {
auto jit = Parent::GetJitConstants(params, dispatchData);
size_t tile_size = GetTileSize(params);
@ -113,7 +147,7 @@ JitConstants PermuteKernel_tile_8x8_4x4::GetJitConstants(const permute_params& p
uint64_t total_lws = dispatchData.lws[0] * dispatchData.lws[1] * dispatchData.lws[2];
jit.AddConstant(MakeJitConstant("VEC_WIDTH", vector_width));
jit.AddConstant(MakeJitConstant("INPUT0_TILED_ORDER", GetTiledInputOrder(params.inputs[0].GetDims().size())));
jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(params.output.GetDims().size())));
jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(params)));
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
jit.AddConstant(MakeJitConstant("N_VECTORS_IN_TILE", tile_size / vector_width));
jit.AddConstant(MakeJitConstant("LWS", total_lws));
@ -227,15 +261,6 @@ bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params
const permute_params& params = static_cast<const permute_params&>(p);
if (params.inputs[0].GetDims().size() != params.output.GetDims().size()) {
return false;
}
if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
// Reorder cannot be fused
return false;
}
if (!is_rotating_except_batch(params.order)) {
return false;
}

View File

@ -40,6 +40,9 @@ ParamsKey PermuteKernel_tile_8x8_4x4_fsv::GetSupportedKey() const {
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::bfwzyx);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
@ -74,6 +77,47 @@ static inline std::string GetTiledOutputOrder(size_t size) {
return order_str;
}
static inline std::string GetReorderedTiledOutputOrder(const permute_params& params) {
std::pair<size_t, size_t> dim_change = {params.inputs[0].GetDims().size(), params.output.GetDims().size()};
std::string order_str = "";
int32_t dim_diff = static_cast<int32_t>(dim_change.first) - static_cast<int32_t>(dim_change.second);
if (dim_diff == 0) {
switch (params.output.GetDims().size()) {
case 4 :
order_str = "b, y + lh, x, f";
break;
case 5 :
order_str = "b, z + lh, y, x, f";
break;
default : throw std::runtime_error("Unsupported combination\n");
}
} else if (dim_diff > 0) {
// dim is shrinked (5 -> 4 only)
order_str = "b, z + lh, y * INPUT0_SIZE_X + x, f";
} else {
// dim is expanded
if (dim_change.first == 4 && dim_change.second == 5) {
order_str = ("b, y + lh, x / " + std::to_string(params.output.Y().v)
+ ", x % " + std::to_string(params.output.Y().v)
+ ", f");
} else if (dim_change.first == 4 && dim_change.second == 6) {
order_str = ("b, y + lh, x / (" + std::to_string(params.output.Y().v)
+ " * " + std::to_string(params.output.Z().v) + ")"
+ ", x / " + std::to_string(params.output.Y().v)
+ ", x % " + std::to_string(params.output.Y().v)
+ ", f");
} else if (dim_change.first == 5 && dim_change.second == 6) {
order_str = ("b, z + lh, y /" + std::to_string(params.output.Z().v)
+ ", y % " + std::to_string(params.output.Z().v)
+ ", x, f");
} else {
throw std::runtime_error("Unsupported combination\n");
}
}
return order_str;
}
static inline std::string GetTiledInputOrder(size_t size) {
std::string order_str = "";
switch (size) {
@ -145,12 +189,17 @@ JitConstants PermuteKernel_tile_8x8_4x4_fsv::GetJitConstants(const permute_param
const size_t fsv_alignment = GetFsvAlignment(params);
jit.AddConstant(MakeJitConstant("INPUT0_TILED_ORDER", GetTiledInputOrder(input_ndims)));
jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(output_ndims)));
jit.AddConstant(MakeJitConstant("INPUT0_FEATURE_SLICE_NUM", CeilDiv(f, fsv_alignment)));
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
jit.AddConstant(MakeJitConstant("FSV_ALIGNMENT", fsv_alignment));
jit.AddConstant(MakeJitConstant("TRANS_BUF_SIZE", tile_size * total_lws));
if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
jit.AddConstant(MakeJitConstant("REORDERED_OUTPUT_TILED_ORDER", GetReorderedTiledOutputOrder(params)));
} else {
jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(output_ndims)));
}
// whether F is tile_size-aligned
if (f % tile_size != 0) {
jit.AddConstant(MakeJitConstant("F_REMAINDER_SIZE", f % tile_size));
@ -172,8 +221,8 @@ JitConstants PermuteKernel_tile_8x8_4x4_fsv::GetJitConstants(const permute_param
}
if (!params.fused_ops.empty()) {
std::vector<std::string> output_order = GetFusedOpOrderVector(output_ndims);
FusedOpsConfiguration conf = {"", output_order, "input_var", params.inputs[0].GetDType(), 1};
std::vector<std::string> original_output_order = GetFusedOpOrderVector(input_ndims);
FusedOpsConfiguration conf = {"", original_output_order, "input_var", params.inputs[0].GetDType(), 1};
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
}
return jit;
@ -258,11 +307,26 @@ bool PermuteKernel_tile_8x8_4x4_fsv::Validate(const Params& p, const optional_pa
};
const permute_params& params = static_cast<const permute_params&>(p);
if (params.inputs[0].GetDims().size() != params.output.GetDims().size()) {
return false;
// blocked format => blocked format is not supported
if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
if ((params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4) ||
(params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16) ||
(params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32)) {
if (params.output.GetLayout() != DataLayout::bfyx
&& params.output.GetLayout() != DataLayout::bfzyx
&& params.output.GetLayout() != DataLayout::bfwzyx)
return false;
} else if ((params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16) ||
(params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32)) {
if (params.output.GetLayout() != DataLayout::bfyx
&& params.output.GetLayout() != DataLayout::bfzyx
&& params.output.GetLayout() != DataLayout::bfwzyx) {
return false;
}
} else {
return false;
}
}
if (!is_rotating_except_batch(params.order)) {
return false;
}

View File

@ -14,9 +14,9 @@ KERNEL (permute_ref)(
{
//gws(x, y * z * w, b*f)
const uint gid_0 = get_global_id(1);
#if INPUT0_DIMS == 4 && OUTPUT0_DIMS == 4
#if INPUT0_DIMS == 4
const uint y = gid_0;
#elif INPUT0_DIMS == 5 && OUTPUT0_DIMS == 5
#elif INPUT0_DIMS == 5
const uint z = gid_0 / INPUT0_SIZE_Y;
const uint y = gid_0 % INPUT0_SIZE_Y;
#else
@ -24,11 +24,11 @@ KERNEL (permute_ref)(
const uint z = gid_0 / INPUT0_SIZE_Y % INPUT0_SIZE_Z;
const uint y = gid_0 % INPUT0_SIZE_Y;
#endif
const uint x = get_global_id(0);
const uint f = (uint)get_global_id(2) % INPUT0_FEATURE_NUM;
const uint b = (uint)get_global_id(2) / INPUT0_FEATURE_NUM;
INPUT0_TYPE input_var = input[IN_IDX];
#if HAS_FUSED_OPS

View File

@ -18,6 +18,8 @@
#define VLOAD CAT(vload, TILE_SIZE)
#define VSTORE CAT(vstore, TILE_SIZE)
#define AS_INPUTVTYPE CAT(as_, INPUTVTYPE)
#define AS_OUTPUTVTYPE CAT(as_, OUTPUTVTYPE)
#define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE)
#define GET_GLOBAL_ID(IDX) ((uint)get_global_id(IDX))
#define GET_LOCAL_ID(IDX) ((uint)get_local_id(IDX))
@ -51,6 +53,49 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
+ GET_LOCAL_ID(2);
const uint local_buf_offset = local_id * TILE_SIZE;
#ifdef REORDERED_OUTPUT_TILED_ORDER
if (F_NO_REMAINDER_CONDITION) {
unroll_for (uint lh = 0; lh < TILE_SIZE/*8*/; ++lh) {
// read
const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER);
INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx));
// write to ddr
#if HAS_FUSED_OPS
OUTPUTVTYPE out_data;
unroll_for (uint lw = 0; lw < TILE_SIZE; ++lw) {
INPUT0_TYPE input_var = read_data[lw];
FUSED_OPS;
out_data[lw] = FUSED_OPS_RESULT;
}
const uint output_idx = OUTPUT_GET_TILED_INDEX(REORDERED_OUTPUT_TILED_ORDER);
VSTORE(out_data, 0, output + output_idx);
#else
const uint output_idx = OUTPUT_GET_TILED_INDEX(REORDERED_OUTPUT_TILED_ORDER);
VSTORE(ACTIVATION(TO_OUTPUTVTYPE(read_data), ACTIVATION_PARAMS), 0, output + output_idx);
#endif
}
}
#ifdef F_REMAINDER_CONDITION
else if (F_REMAINDER_CONDITION) {
unroll_for (uint lh = 0; lh < TILE_SIZE/*8*/; ++lh) {
unroll_for (uint lw = 0; lw < F_REMAINDER_SIZE; ++lw) {
// read
const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER);
INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx));
// write to ddr
const uint output_idx = OUTPUT_GET_TILED_INDEX(REORDERED_OUTPUT_TILED_ORDER);
#if HAS_FUSED_OPS
INPUT0_TYPE input_var = read_data[lw];
FUSED_OPS;
output[output_idx + lw] = FUSED_OPS_RESULT;
#else
output[output_idx + lw] = TO_OUTPUT_TYPE(read_data[lw]);
#endif
}
}
}
#endif // F_REMAINDER_CONDITION
#else // !REORDERED_OUTPUT_TILED_ORDER
if (F_NO_REMAINDER_CONDITION) {
// read and transpose
unroll_for (uint lh = 0; lh < TILE_SIZE; ++lh) {
@ -59,16 +104,15 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
unroll_for (uint lw = 0; lw < TILE_SIZE; ++lw) {
const uint dst = local_buf_offset + lw;
#if HAS_FUSED_OPS
#if HAS_FUSED_OPS
INPUT0_TYPE input_var = read_data[lw];
FUSED_OPS;
transpose_buf[dst][lh] = FUSED_OPS_RESULT;
#else
#else
transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS);
#endif
#endif
}
}
// write to ddr
#ifdef YZ_REMAINDER_CONDITION
if (YZ_REMAINDER_LESS_THAN_TILE_SIZE) {
// copy one by one when z % TILE_SIZE < TILE_SIZE/2
@ -94,12 +138,13 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
}
}
#else
#else // YZ_REMAINDER_CONDITION
// write to ddr
unroll_for (uint lw = 0; lw < TILE_SIZE; ++lw) {
const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER);
VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
}
#endif
#endif //YZ_REMAINDER_CONDITION
}
#ifdef F_REMAINDER_CONDITION
else if (F_REMAINDER_CONDITION) {
@ -109,13 +154,13 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx));
unroll_for (uint lw = 0; lw < F_REMAINDER_SIZE; ++lw) {
uint dst = local_buf_offset + lw;
#if HAS_FUSED_OPS
INPUT0_TYPE input_var = read_data[lw];
FUSED_OPS;
transpose_buf[dst][lh] = FUSED_OPS_RESULT;
#else
transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS);
#endif
#if HAS_FUSED_OPS
INPUT0_TYPE input_var = read_data[lw];
FUSED_OPS;
transpose_buf[dst][lh] = FUSED_OPS_RESULT;
#else
transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS);
#endif
}
}
// write to ddr
@ -145,12 +190,30 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
}
}
#else
#else // !YZ_REMAINDER_CONDITION
unroll_for (uint lw = 0; lw < F_REMAINDER_SIZE; ++lw) {
const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER);
VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
}
#endif
#endif // YZ_REMAINDER_CONDITION
}
#endif
#endif // F_REMAINDER_CONDITION
#endif // REORDERED_OUTPUT)TILED_ORDER
}
#undef unroll_for
#undef CEIL_DIV(A, B)
#undef INPUT0_GET_TILED_INDEX(ORDER)
#undef OUTPUT_GET_TILED_INDEX(ORDER)
#undef YZ_REMAINDER_LESS_THAN_TILE_SIZE
#undef YZ_REMAINDER_MORE_THAN_TILE_SIZE
#undef INPUTVTYPE
#undef OUTPUTVTYPE
#undef VLOAD
#undef VSTORE
#undef AS_INPUTVTYPE
#undef AS_OUTPUTVTYPE
#undef TO_OUTPUTVTYPE
#undef GET_GLOBAL_ID(IDX)
#undef GET_LOCAL_ID(IDX)
#undef GET_LOCAL_SIZE(IDX)

View File

@ -236,7 +236,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_zyx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;
if (prev.is_type<permute>() && fmt_prev.dimension() == fmt_next.dimension()) {
if (prev.is_type<permute>()) {
return true;
}
return false;

View File

@ -157,7 +157,6 @@ public:
void compare(network& not_fused, network& fused, T& p, bool count_reorder = false) {
auto outputs_ref = not_fused.execute();
auto outputs_fused = fused.execute();
auto get_reorders_count = [](network& net) -> size_t {
size_t count = 0;
for (auto& pi : net.get_primitives_info()) {
@ -187,7 +186,6 @@ public:
description << " " << i.original_id << " " << i.kernel_id << std::endl;
}
SCOPED_TRACE(description.str());
// Subtract reorders count to handle execution in different layouts when input/output reorders can be added in the graph
ASSERT_EQ(fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_fused), p.expected_fused_primitives);
ASSERT_EQ(not_fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_not_fused), p.expected_not_fused_primitives);
@ -6409,7 +6407,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv,
struct permute_reorder_params {
tensor in_shape;
tensor out_shape;
std::vector<uint16_t> permute_order1;
std::vector<uint16_t> permute_order2;
data_types permute_type;
@ -6420,20 +6417,47 @@ struct permute_reorder_params {
size_t expected_not_fused_primitives;
};
#define CASE_PERMUTE_REORDER_F32_0 {1, 16, 32, 2}, {1, 16, 32, 2}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F32_1 {2, 16, 16, 16}, {2, 16, 16, 16}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_F32_2 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_0 {1, 16, 32, 2}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F32_1 {2, 7, 9, 27}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_F32_2 {1, 16, 4, 5, 16},{0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_0 {1, 16, 2, 4}, {0, 2, 1, 3}, {0, 2, 1, 3}, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F16_1 {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_2 {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_0 {1, 16, 2, 4}, {1, 16, 2, 4}, {0, 2, 1, 3}, {0, 2, 1, 3}, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F16_1 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_2 {1, 5, 1, 2, 14}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
// type change
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 {1, 15, 4, 5}, {0, 2, 3, 1}, {0, 3, 1, 2}, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_S8_TO_F32_1 {1, 2, 15, 4, 5}, {0, 2, 4, 1, 3}, {0, 3, 1, 4, 2}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 {1, 17, 1, 2, 7}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_S8_0 {1, 15, 4, 5}, {1, 15, 4, 5}, {0, 2, 3, 1}, {0, 3, 1, 2}, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_S8_1 {1, 2, 15, 4, 5}, {1, 2, 15, 4, 5}, {0, 2, 4, 1, 3}, {0, 3, 1, 4, 2}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
// dim change
#define CASE_PERMUTE_REORDER_4D_TO_5D_F32_0 {1, 16, 8, 16}, {1, 2, 0, 3}, {0, 3, 1, 4, 2}, data_types::f32, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_4D_TO_6D_F32_1 {1, 16, 8, 16}, {0, 2, 3, 1}, {0, 3, 5, 4, 1, 2}, data_types::f32, data_types::f32, format::bfyx, format::bfwzyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_0 {1, 16, 4, 5, 18},{0, 2, 3, 4, 1}, {0, 3, 1, 2}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_1 {1, 16, 4, 5, 16},{0, 2, 3, 4, 1}, {0, 3, 1, 2}, data_types::f32, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_6D_F32_2 {1, 16, 8, 4, 16}, {0, 4, 2, 3, 1}, {0, 3, 5, 4, 1, 2}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
#define CASE_PERMUTE_REORDER_6D_TO_4D_F32_0 {1, 16, 4, 5, 4, 16}, {0, 2, 5, 3, 4, 1}, {0, 3, 1, 2}, data_types::f32, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_6D_TO_5D_F32_1 {1, 16, 4, 5, 4, 16}, {0, 2, 5, 3, 4, 1}, {0, 3, 4, 1, 2}, data_types::f32, data_types::f32, format::bfwzyx, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 {1, 5, 1, 2, 14}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 {1, 18, 1, 2, 2}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 {1, 17, 1, 2, 7}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
// permute_opt for blocked format
#define CASE_PERMUTE_REORDER_TILED_F32_0 {1, 256, 2, 64}, {0, 3, 1, 2}, {0, 2, 3, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_1 {1, 78, 2, 259}, {0, 3, 1, 2}, {0, 2, 3, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_2 {1, 48, 1, 3, 259}, {0, 4, 1, 2, 3}, {0, 2, 3, 4, 1}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
// permute_opt for blocked format => reorder to differnt dim
#define CASE_PERMUTE_REORDER_TILED_F32_3 {1, 45, 1, 3, 259}, {0, 4, 1, 2, 3}, {0, 2, 3, 1}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
// permute opt for blocked format => reorder to different dim/type
#define CASE_PERMUTE_REORDER_TILED_I8_4 {1, 45, 1, 3, 259}, {0, 4, 1, 2, 3}, {0, 2, 3, 1}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_5 {1, 48, 3, 256}, {0, 3, 1, 2}, {0, 2, 4, 3, 1}, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_6 {1, 48, 2, 3, 256}, {0, 4, 1, 2, 3}, {0, 2, 5, 4, 3, 1}, data_types::f16, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
// permute opt for non_blocked format => reorder to differnt dim/type
#define CASE_PERMUTE_REORDER_TILED_F16_7 {1, 48, 2, 3, 256}, {0, 4, 1, 2, 3}, {0, 2, 3, 1}, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_8 {1, 28, 2, 2, 3, 256}, {0, 5, 1, 2, 3, 4}, {0, 2, 3, 1}, data_types::f16, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_9 {1, 24, 2, 3, 256}, {0, 4, 1, 2, 3}, {0, 2, 3, 1}, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_10 {1, 35, 3, 253}, {0, 3, 1, 2}, {0, 2, 4, 3, 1}, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_11 {1, 32, 3, 253}, {0, 3, 1, 2}, {0, 2, 4, 5, 3, 1}, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx
class PermuteReorderFusingTest : public ::BaseFusingTest<permute_reorder_params> {
public:
@ -6460,7 +6484,7 @@ TEST_P(permute_redundant_reorder, basic) {
input_layout("input", get_input_layout(p)),
permute("permute1", "input", p.permute_order1),
reorder("reorder1", "permute1", p.output_format, p.output_type), // to be fused
permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder n
permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder fused
);
tolerance = 1e-5f;
execute(p);
@ -6474,11 +6498,72 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_redundant_reorder,
permute_reorder_params{CASE_PERMUTE_REORDER_F16_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F16_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F16_2, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_S8_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_S8_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_S8_TO_F32_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_S8_TO_F32_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_U8_TO_F16_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 3},
permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 3},
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 3},
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 3},
permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 3},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_2, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_3, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_I8_4, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_5, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_6, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_7, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_8, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_9, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_10, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_11, 3, 4},
}),);
class permute_act_reorder : public PermuteReorderFusingTest {};
TEST_P(permute_act_reorder, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
permute("permute1", "input", p.permute_order1),
activation("activation", "permute1", activation_func::abs),
reorder("reorder1", "activation", p.output_format, p.output_type), // to be fused
permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder fused
);
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_act_reorder,
::testing::ValuesIn(std::vector<permute_reorder_params> {
permute_reorder_params{CASE_PERMUTE_REORDER_F32_0, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_F32_1, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_F32_2, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_F16_0, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_F16_1, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_F16_2, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 4},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_0, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_1, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_2, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_3, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_5, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_6, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_7, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_8, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_9, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5},
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5},
}),);
class NormalizeFusingTest : public ::BaseFusingTest<normalize_test_params> {