[IE CLDNN] Enabled fusing all types of reorders to permute (#5101)
* [IE CLDNN] Enabled fusing all types of reorders to permute - Fusing reorder to differnt dims (e.g., 4D=>5D, 6D=>4D, etc) - Fusing reorder to permute_opt kernel for blocked formats - Fixed bug for activation_opt kernel for blocked formats * [IE CLDNN] Enabled fusing all types of reorders to permute Refactoring for permute fsv opt kernel * [IE CLDNN] Permute reorder fusing Support reordering to different dims from optimized permute kernel for blocked format
This commit is contained in:
parent
ad6a0e7e6d
commit
6c46f26a3c
@ -27,12 +27,35 @@ ParamsKey ActivationKernelOpt::GetSupportedKey() const {
|
||||
return k;
|
||||
}
|
||||
|
||||
static size_t GetTotalSize(const activation_params& params) {
|
||||
const auto input = params.inputs[0];
|
||||
size_t totalSize = input.LogicalSize();
|
||||
switch (params.inputs[0].GetLayout()) {
|
||||
case DataLayout::b_fs_yx_fsv4:
|
||||
totalSize = (totalSize / input.Feature().v) * Align(input.Feature().v, 4);
|
||||
break;
|
||||
case DataLayout::b_fs_yx_fsv16:
|
||||
case DataLayout::b_fs_zyx_fsv16:
|
||||
totalSize = (totalSize / input.Feature().v) * Align(input.Feature().v, 16);
|
||||
break;
|
||||
case DataLayout::b_fs_yx_fsv32:
|
||||
case DataLayout::b_fs_zyx_fsv32:
|
||||
case DataLayout::fs_b_yx_fsv32:
|
||||
totalSize = (totalSize / input.Feature().v) * Align(input.Feature().v, 32);
|
||||
break;
|
||||
case DataLayout::bs_fs_zyx_bsv16_fsv16:
|
||||
case DataLayout::bs_fs_yx_bsv16_fsv16:
|
||||
totalSize = (totalSize / (input.Feature().v * input.Batch().v)) * Align(input.Feature().v, 16) * Align(input.Batch().v, 16);
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
return totalSize;
|
||||
}
|
||||
|
||||
ActivationKernelOpt::Parent::DispatchData ActivationKernelOpt::SetDefault(const activation_params& params) const {
|
||||
auto dispatchData = Parent::SetDefault(params);
|
||||
|
||||
const auto totalSize = params.inputs[0].LogicalSize();
|
||||
|
||||
dispatchData.gws = { totalSize / NUM_COLS_WI, 1, 1 };
|
||||
dispatchData.gws = { GetTotalSize(params) / NUM_COLS_WI, 1, 1 };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
|
||||
return dispatchData;
|
||||
@ -50,17 +73,13 @@ bool ActivationKernelOpt::Validate(const Params& p, const optional_params& o) co
|
||||
|
||||
const activation_params& params = static_cast<const activation_params&>(p);
|
||||
|
||||
if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && params.output.Feature().v % 16 != 0)
|
||||
return false;
|
||||
|
||||
const auto totalSize = params.inputs[0].LogicalSize();
|
||||
const auto totalSize = GetTotalSize(params);
|
||||
if ((totalSize % NUM_COLS_WI) != 0 ||
|
||||
(params.inputs[0].GetFirstElementOffset() % NUM_COLS_WI) != 0 ||
|
||||
(params.output.GetFirstElementOffset() % NUM_COLS_WI) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
if (params.output.GetLayout() != params.inputs[0].GetLayout())
|
||||
return false;
|
||||
|
||||
|
@ -5,7 +5,6 @@
|
||||
#include "permute_kernel_ref.h"
|
||||
#include "kernel_selector_utils.h"
|
||||
#include <string>
|
||||
|
||||
namespace kernel_selector {
|
||||
ParamsKey PermuteKernelRef::GetSupportedKey() const {
|
||||
ParamsKey k;
|
||||
@ -42,20 +41,95 @@ CommonDispatchData PermuteKernelRef::SetDefault(const permute_params& params) co
|
||||
bool PermuteKernelRef::Validate(const Params& p, const optional_params& o) const {
|
||||
if (!Parent::Validate(p, o)) return false;
|
||||
|
||||
const permute_params& params = static_cast<const permute_params&>(p);
|
||||
return true;
|
||||
}
|
||||
|
||||
// currently reorder fusing is supported only for format change, not the layout change
|
||||
if (DataTensor::ChannelsCount(params.inputs[0].GetLayout())
|
||||
!= DataTensor::ChannelsCount(params.output.GetLayout())) {
|
||||
return false;
|
||||
static void GetOrderVector(std::string s, std::vector<std::string>* res) {
|
||||
size_t pos_start = 0, pos_end;
|
||||
std::string token;
|
||||
while ((pos_end = s.find(",", pos_start)) != std::string::npos) {
|
||||
token = s.substr(pos_start, pos_end - pos_start);
|
||||
pos_start = pos_end + 1;
|
||||
res->push_back(token);
|
||||
}
|
||||
|
||||
return true;
|
||||
res->push_back(s.substr(pos_start));
|
||||
return;
|
||||
}
|
||||
|
||||
static std::string GetReorderedOutputOrder(const permute_params& params, const std::vector<std::string>& permute_out_idx,
|
||||
const std::pair<size_t, size_t>& dim_change) {
|
||||
std::map<std::string, std::string> size_str_map = {
|
||||
{"b", "INPUT0_BATCH_NUM"},
|
||||
{"f", "INPUT0_FEATURE_NUM"},
|
||||
{"w", "INPUT0_SIZE_W"},
|
||||
{"z", "INPUT0_SIZE_Z"},
|
||||
{"y", "INPUT0_SIZE_Y"},
|
||||
{"x", "INPUT0_SIZE_X"}
|
||||
};
|
||||
|
||||
int32_t dim_diff = static_cast<int32_t>(dim_change.first) - static_cast<int32_t>(dim_change.second);
|
||||
|
||||
std::string reordered_order = permute_out_idx[0] + "," + permute_out_idx[1] + ",";
|
||||
if (dim_diff > 0) {
|
||||
// dim is shrinked
|
||||
std::vector<std::string> merged_indices;
|
||||
if (dim_diff == 2) merged_indices.push_back(permute_out_idx[dim_change.first - 3]);
|
||||
merged_indices.push_back(permute_out_idx[dim_change.first - 2]);
|
||||
merged_indices.push_back(permute_out_idx[dim_change.first - 1]);
|
||||
std::string pitches = "1";
|
||||
for (size_t i = 0 ; i < merged_indices.size(); ++i) {
|
||||
if (i > 0) reordered_order += "+";
|
||||
reordered_order += (merged_indices[i] + "*" + pitches);
|
||||
pitches = size_str_map[merged_indices[i]] + "*" + pitches;
|
||||
}
|
||||
for (size_t i = dim_change.first - 1 - merged_indices.size(); i > 1; --i) {
|
||||
reordered_order += ((", " + permute_out_idx[i]));
|
||||
}
|
||||
} else {
|
||||
// dim is expanded
|
||||
if (dim_change.first == 4 && dim_change.second == 5) {
|
||||
reordered_order += (permute_out_idx.back() + "/" + std::to_string(params.output.Y().v)
|
||||
+ ", " + permute_out_idx.back() + "%" + std::to_string(params.output.Y().v)
|
||||
+ ", " + permute_out_idx[2]);
|
||||
} else if (dim_change.first == 4 && dim_change.second == 6) {
|
||||
reordered_order += (permute_out_idx.back() + "/ (" + std::to_string(params.output.Y().v)
|
||||
+ " * " + std::to_string(params.output.Z().v) + ")"
|
||||
+ ", " + permute_out_idx.back() + "/" + std::to_string(params.output.Y().v)
|
||||
+ ", " + permute_out_idx.back() + "%" + std::to_string(params.output.Y().v)
|
||||
+ ", " + permute_out_idx[2]);
|
||||
} else if (dim_change.first == 5 && dim_change.second == 6) {
|
||||
reordered_order += (permute_out_idx.back() + "/" + std::to_string(params.output.Z().v)
|
||||
+ ", " + permute_out_idx.back() + "%" + std::to_string(params.output.Z().v)
|
||||
+ ", " + permute_out_idx[3]
|
||||
+ ", " + permute_out_idx[2]);
|
||||
}
|
||||
}
|
||||
return reordered_order;
|
||||
}
|
||||
JitConstants PermuteKernelRef::GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const {
|
||||
auto jit = Parent::GetJitConstants(params, dispatchData);
|
||||
std::vector<std::string> in_idx;
|
||||
std::vector<std::string> out_idx;
|
||||
std::vector<std::string> permute_out_idx;
|
||||
|
||||
std::map<std::string, std::string> size_str_map = {
|
||||
{"b", "INPUT0_BATCH_NUM"},
|
||||
{"f", "INPUT0_FEATURE_NUM"},
|
||||
{"w", "INPUT0_SIZE_W"},
|
||||
{"z", "INPUT0_SIZE_Z"},
|
||||
{"y", "INPUT0_SIZE_Y"},
|
||||
{"x", "INPUT0_SIZE_X"}
|
||||
};
|
||||
|
||||
std::pair<size_t, size_t> dim_change;
|
||||
bool reorder_to_different_dim = false;
|
||||
std::vector<std::string> reordered_out_idx;
|
||||
if (DataTensor::ChannelsCount(params.inputs[0].GetLayout()) != DataTensor::ChannelsCount(params.output.GetLayout())) {
|
||||
// subsequent reorder to differnt dimension is fused
|
||||
dim_change = {params.inputs[0].GetDims().size(), params.output.GetDims().size()};
|
||||
reorder_to_different_dim = true;
|
||||
}
|
||||
|
||||
switch (DataTensor::ChannelsCount(params.inputs[0].GetLayout())) {
|
||||
case 6: in_idx = {"b", "f", "x", "y", "z", "w" }; break;
|
||||
case 5: in_idx = {"b", "f", "x", "y", "z" }; break;
|
||||
@ -64,32 +138,46 @@ JitConstants PermuteKernelRef::GetJitConstants(const permute_params& params, con
|
||||
|
||||
assert(params.order.size() == in_idx.size());
|
||||
for (auto& o : params.order) {
|
||||
out_idx.push_back(in_idx[o]);
|
||||
permute_out_idx.push_back(in_idx[o]);
|
||||
}
|
||||
|
||||
std::string input_order = in_idx[0] + "," + in_idx[1];
|
||||
std::string output_order = out_idx[0] + "," + out_idx[1];
|
||||
|
||||
for (size_t i = in_idx.size() - 1; i > 1; i--) {
|
||||
input_order += "," + in_idx[i];
|
||||
output_order += "," + out_idx[i];
|
||||
}
|
||||
|
||||
jit.AddConstant(MakeJitConstant("IN_IDX", "INPUT0_GET_INDEX(" + input_order + ")"));
|
||||
jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + output_order + ")"));
|
||||
|
||||
if (reorder_to_different_dim) {
|
||||
auto reordered_order = GetReorderedOutputOrder(params, permute_out_idx, dim_change);
|
||||
jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + reordered_order + ")"));
|
||||
GetOrderVector(reordered_order, &reordered_out_idx);
|
||||
} else {
|
||||
std::string output_order = permute_out_idx[0] + "," + permute_out_idx[1];
|
||||
for (size_t i = in_idx.size() - 1; i > 1; i--) {
|
||||
output_order += "," + permute_out_idx[i];
|
||||
}
|
||||
jit.AddConstant(MakeJitConstant("OUT_IDX", "OUTPUT_GET_INDEX(" + output_order + ")"));
|
||||
}
|
||||
|
||||
if (!params.fused_ops.empty()) {
|
||||
if (out_idx.size() == 4) {
|
||||
std::swap(out_idx[2], out_idx[3]);
|
||||
} else if (out_idx.size() == 5) {
|
||||
std::swap(out_idx[2], out_idx[4]);
|
||||
} else if (out_idx.size() == 6) {
|
||||
std::swap(out_idx[2], out_idx[5]);
|
||||
std::swap(out_idx[3], out_idx[4]);
|
||||
if (permute_out_idx.size() == 4) {
|
||||
std::swap(permute_out_idx[2], permute_out_idx[3]);
|
||||
} else if (permute_out_idx.size() == 5) {
|
||||
std::swap(permute_out_idx[2], permute_out_idx[4]);
|
||||
} else if (permute_out_idx.size() == 6) {
|
||||
std::swap(permute_out_idx[2], permute_out_idx[5]);
|
||||
std::swap(permute_out_idx[3], permute_out_idx[4]);
|
||||
}
|
||||
|
||||
FusedOpsConfiguration conf = {"", out_idx, "input_var", params.inputs[0].GetDType(), 1};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
|
||||
if (reorder_to_different_dim) {
|
||||
FusedOpsConfiguration conf = {"", reordered_out_idx, "input_var", params.inputs[0].GetDType(), 1};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
|
||||
} else {
|
||||
FusedOpsConfiguration conf = {"", permute_out_idx, "input_var", params.inputs[0].GetDType(), 1};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
|
||||
}
|
||||
}
|
||||
return jit;
|
||||
}
|
||||
|
@ -70,19 +70,54 @@ static inline std::vector<std::string> GetFusedOpOrderVector(size_t size) {
|
||||
return res;
|
||||
}
|
||||
|
||||
static inline std::string GetTiledOutputOrder(size_t size) {
|
||||
static inline std::string GetTiledOutputOrder(const permute_params& params) {
|
||||
std::pair<size_t, size_t> dim_change = {params.inputs[0].GetDims().size(), params.output.GetDims().size()};
|
||||
|
||||
std::string order_str = "";
|
||||
switch (size) {
|
||||
case 4 :
|
||||
order_str = "b, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
|
||||
break;
|
||||
case 5 :
|
||||
order_str = "b, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
|
||||
break;
|
||||
case 6 :
|
||||
order_str = "b, w, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
|
||||
break;
|
||||
default : throw std::runtime_error("Unsupported combination\n");
|
||||
int32_t dim_diff = static_cast<int32_t>(dim_change.first) - static_cast<int32_t>(dim_change.second);
|
||||
|
||||
if (dim_diff == 0) {
|
||||
switch (dim_change.first) {
|
||||
case 4 :
|
||||
order_str = "b, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
|
||||
break;
|
||||
case 5 :
|
||||
order_str = "b, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
|
||||
break;
|
||||
case 6 :
|
||||
order_str = "b, w, z, y, (x * TILE_SIZE + lh), (f * TILE_SIZE)";
|
||||
break;
|
||||
default : throw std::runtime_error("Unsupported combination\n");
|
||||
}
|
||||
} else if (dim_diff > 0) {
|
||||
// dim is shrinked
|
||||
order_str = "b, z + lh, y * INPUT0_SIZE_X + x, f";
|
||||
if (dim_change.first == 5 && dim_change.second == 4) {
|
||||
order_str = "b, z, y * INPUT0_SIZE_X + (x * TILE_SIZE + lh), (f*TILE_SIZE)";
|
||||
} else if (dim_change.first == 6 && dim_change.second == 4) {
|
||||
order_str = "b, w, z * INPUT0_SIZE_Y * INPUT0_SIZE_X + y * INPUT0_SIZE_X + (x * TILE_SIZE + lh), (f * TILE_SIZE)";
|
||||
} else if (dim_change.first == 6 && dim_change.second == 5) {
|
||||
order_str = "b, w, z * INPUT0_SIZE_Y + y, x * TILE_SIZE + lh, (f * TILE_SIZE)";
|
||||
}
|
||||
} else {
|
||||
// dim is expanded
|
||||
if (dim_change.first == 4 && dim_change.second == 5) {
|
||||
order_str = ("b, y, (x * TILE_SIZE + lh) / " + std::to_string(params.output.Y().v)
|
||||
+ ", (x * TILE_SIZE +lh) % " + std::to_string(params.output.Y().v)
|
||||
+ ", (f * TILE_SIZE)");
|
||||
} else if (dim_change.first == 4 && dim_change.second == 6) {
|
||||
order_str = ("b, y, (x * TILE_SIZE + lh) / (" + std::to_string(params.output.Y().v)
|
||||
+ " * " + std::to_string(params.output.Z().v) + ")"
|
||||
+ ", (x * TILE_SIZE + lh) / " + std::to_string(params.output.Y().v)
|
||||
+ ", (x * TILE_SIZE + lh) % " + std::to_string(params.output.Y().v)
|
||||
+ ", (f * TILE_SIZE)");
|
||||
} else if (dim_change.first == 5 && dim_change.second == 6) {
|
||||
order_str = ("b, z, y /" + std::to_string(params.output.Z().v)
|
||||
+ ", y % " + std::to_string(params.output.Z().v)
|
||||
+ ", (x * TILE_SIZE + lh), (f * TILE_SIZE)");
|
||||
} else {
|
||||
throw std::runtime_error("Unsupported combination\n");
|
||||
}
|
||||
}
|
||||
return order_str;
|
||||
}
|
||||
@ -104,7 +139,6 @@ static inline std::string GetTiledInputOrder(size_t size) {
|
||||
return order_str;
|
||||
}
|
||||
|
||||
|
||||
JitConstants PermuteKernel_tile_8x8_4x4::GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const {
|
||||
auto jit = Parent::GetJitConstants(params, dispatchData);
|
||||
size_t tile_size = GetTileSize(params);
|
||||
@ -113,7 +147,7 @@ JitConstants PermuteKernel_tile_8x8_4x4::GetJitConstants(const permute_params& p
|
||||
uint64_t total_lws = dispatchData.lws[0] * dispatchData.lws[1] * dispatchData.lws[2];
|
||||
jit.AddConstant(MakeJitConstant("VEC_WIDTH", vector_width));
|
||||
jit.AddConstant(MakeJitConstant("INPUT0_TILED_ORDER", GetTiledInputOrder(params.inputs[0].GetDims().size())));
|
||||
jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(params.output.GetDims().size())));
|
||||
jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(params)));
|
||||
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
|
||||
jit.AddConstant(MakeJitConstant("N_VECTORS_IN_TILE", tile_size / vector_width));
|
||||
jit.AddConstant(MakeJitConstant("LWS", total_lws));
|
||||
@ -227,15 +261,6 @@ bool PermuteKernel_tile_8x8_4x4::Validate(const Params& p, const optional_params
|
||||
|
||||
const permute_params& params = static_cast<const permute_params&>(p);
|
||||
|
||||
if (params.inputs[0].GetDims().size() != params.output.GetDims().size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
|
||||
// Reorder cannot be fused
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!is_rotating_except_batch(params.order)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -40,6 +40,9 @@ ParamsKey PermuteKernel_tile_8x8_4x4_fsv::GetSupportedKey() const {
|
||||
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
|
||||
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
|
||||
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
|
||||
k.EnableOutputLayout(DataLayout::bfyx);
|
||||
k.EnableOutputLayout(DataLayout::bfzyx);
|
||||
k.EnableOutputLayout(DataLayout::bfwzyx);
|
||||
k.EnableTensorOffset();
|
||||
k.EnableTensorPitches();
|
||||
k.EnableBatching();
|
||||
@ -74,6 +77,47 @@ static inline std::string GetTiledOutputOrder(size_t size) {
|
||||
return order_str;
|
||||
}
|
||||
|
||||
static inline std::string GetReorderedTiledOutputOrder(const permute_params& params) {
|
||||
std::pair<size_t, size_t> dim_change = {params.inputs[0].GetDims().size(), params.output.GetDims().size()};
|
||||
|
||||
std::string order_str = "";
|
||||
int32_t dim_diff = static_cast<int32_t>(dim_change.first) - static_cast<int32_t>(dim_change.second);
|
||||
if (dim_diff == 0) {
|
||||
switch (params.output.GetDims().size()) {
|
||||
case 4 :
|
||||
order_str = "b, y + lh, x, f";
|
||||
break;
|
||||
case 5 :
|
||||
order_str = "b, z + lh, y, x, f";
|
||||
break;
|
||||
default : throw std::runtime_error("Unsupported combination\n");
|
||||
}
|
||||
} else if (dim_diff > 0) {
|
||||
// dim is shrinked (5 -> 4 only)
|
||||
order_str = "b, z + lh, y * INPUT0_SIZE_X + x, f";
|
||||
} else {
|
||||
// dim is expanded
|
||||
if (dim_change.first == 4 && dim_change.second == 5) {
|
||||
order_str = ("b, y + lh, x / " + std::to_string(params.output.Y().v)
|
||||
+ ", x % " + std::to_string(params.output.Y().v)
|
||||
+ ", f");
|
||||
} else if (dim_change.first == 4 && dim_change.second == 6) {
|
||||
order_str = ("b, y + lh, x / (" + std::to_string(params.output.Y().v)
|
||||
+ " * " + std::to_string(params.output.Z().v) + ")"
|
||||
+ ", x / " + std::to_string(params.output.Y().v)
|
||||
+ ", x % " + std::to_string(params.output.Y().v)
|
||||
+ ", f");
|
||||
} else if (dim_change.first == 5 && dim_change.second == 6) {
|
||||
order_str = ("b, z + lh, y /" + std::to_string(params.output.Z().v)
|
||||
+ ", y % " + std::to_string(params.output.Z().v)
|
||||
+ ", x, f");
|
||||
} else {
|
||||
throw std::runtime_error("Unsupported combination\n");
|
||||
}
|
||||
}
|
||||
return order_str;
|
||||
}
|
||||
|
||||
static inline std::string GetTiledInputOrder(size_t size) {
|
||||
std::string order_str = "";
|
||||
switch (size) {
|
||||
@ -145,12 +189,17 @@ JitConstants PermuteKernel_tile_8x8_4x4_fsv::GetJitConstants(const permute_param
|
||||
const size_t fsv_alignment = GetFsvAlignment(params);
|
||||
|
||||
jit.AddConstant(MakeJitConstant("INPUT0_TILED_ORDER", GetTiledInputOrder(input_ndims)));
|
||||
jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(output_ndims)));
|
||||
jit.AddConstant(MakeJitConstant("INPUT0_FEATURE_SLICE_NUM", CeilDiv(f, fsv_alignment)));
|
||||
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
|
||||
jit.AddConstant(MakeJitConstant("FSV_ALIGNMENT", fsv_alignment));
|
||||
jit.AddConstant(MakeJitConstant("TRANS_BUF_SIZE", tile_size * total_lws));
|
||||
|
||||
if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
|
||||
jit.AddConstant(MakeJitConstant("REORDERED_OUTPUT_TILED_ORDER", GetReorderedTiledOutputOrder(params)));
|
||||
} else {
|
||||
jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(output_ndims)));
|
||||
}
|
||||
|
||||
// whether F is tile_size-aligned
|
||||
if (f % tile_size != 0) {
|
||||
jit.AddConstant(MakeJitConstant("F_REMAINDER_SIZE", f % tile_size));
|
||||
@ -172,8 +221,8 @@ JitConstants PermuteKernel_tile_8x8_4x4_fsv::GetJitConstants(const permute_param
|
||||
}
|
||||
|
||||
if (!params.fused_ops.empty()) {
|
||||
std::vector<std::string> output_order = GetFusedOpOrderVector(output_ndims);
|
||||
FusedOpsConfiguration conf = {"", output_order, "input_var", params.inputs[0].GetDType(), 1};
|
||||
std::vector<std::string> original_output_order = GetFusedOpOrderVector(input_ndims);
|
||||
FusedOpsConfiguration conf = {"", original_output_order, "input_var", params.inputs[0].GetDType(), 1};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
|
||||
}
|
||||
return jit;
|
||||
@ -258,11 +307,26 @@ bool PermuteKernel_tile_8x8_4x4_fsv::Validate(const Params& p, const optional_pa
|
||||
};
|
||||
|
||||
const permute_params& params = static_cast<const permute_params&>(p);
|
||||
|
||||
if (params.inputs[0].GetDims().size() != params.output.GetDims().size()) {
|
||||
return false;
|
||||
// blocked format => blocked format is not supported
|
||||
if (params.inputs[0].GetLayout() != params.output.GetLayout()) {
|
||||
if ((params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4) ||
|
||||
(params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16) ||
|
||||
(params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32)) {
|
||||
if (params.output.GetLayout() != DataLayout::bfyx
|
||||
&& params.output.GetLayout() != DataLayout::bfzyx
|
||||
&& params.output.GetLayout() != DataLayout::bfwzyx)
|
||||
return false;
|
||||
} else if ((params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16) ||
|
||||
(params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32)) {
|
||||
if (params.output.GetLayout() != DataLayout::bfyx
|
||||
&& params.output.GetLayout() != DataLayout::bfzyx
|
||||
&& params.output.GetLayout() != DataLayout::bfwzyx) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_rotating_except_batch(params.order)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -14,9 +14,9 @@ KERNEL (permute_ref)(
|
||||
{
|
||||
//gws(x, y * z * w, b*f)
|
||||
const uint gid_0 = get_global_id(1);
|
||||
#if INPUT0_DIMS == 4 && OUTPUT0_DIMS == 4
|
||||
#if INPUT0_DIMS == 4
|
||||
const uint y = gid_0;
|
||||
#elif INPUT0_DIMS == 5 && OUTPUT0_DIMS == 5
|
||||
#elif INPUT0_DIMS == 5
|
||||
const uint z = gid_0 / INPUT0_SIZE_Y;
|
||||
const uint y = gid_0 % INPUT0_SIZE_Y;
|
||||
#else
|
||||
@ -24,11 +24,11 @@ KERNEL (permute_ref)(
|
||||
const uint z = gid_0 / INPUT0_SIZE_Y % INPUT0_SIZE_Z;
|
||||
const uint y = gid_0 % INPUT0_SIZE_Y;
|
||||
#endif
|
||||
|
||||
|
||||
const uint x = get_global_id(0);
|
||||
const uint f = (uint)get_global_id(2) % INPUT0_FEATURE_NUM;
|
||||
const uint b = (uint)get_global_id(2) / INPUT0_FEATURE_NUM;
|
||||
|
||||
|
||||
INPUT0_TYPE input_var = input[IN_IDX];
|
||||
|
||||
#if HAS_FUSED_OPS
|
||||
|
@ -18,6 +18,8 @@
|
||||
#define VLOAD CAT(vload, TILE_SIZE)
|
||||
#define VSTORE CAT(vstore, TILE_SIZE)
|
||||
#define AS_INPUTVTYPE CAT(as_, INPUTVTYPE)
|
||||
#define AS_OUTPUTVTYPE CAT(as_, OUTPUTVTYPE)
|
||||
#define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE)
|
||||
|
||||
#define GET_GLOBAL_ID(IDX) ((uint)get_global_id(IDX))
|
||||
#define GET_LOCAL_ID(IDX) ((uint)get_local_id(IDX))
|
||||
@ -51,6 +53,49 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
|
||||
+ GET_LOCAL_ID(2);
|
||||
const uint local_buf_offset = local_id * TILE_SIZE;
|
||||
|
||||
#ifdef REORDERED_OUTPUT_TILED_ORDER
|
||||
if (F_NO_REMAINDER_CONDITION) {
|
||||
unroll_for (uint lh = 0; lh < TILE_SIZE/*8*/; ++lh) {
|
||||
// read
|
||||
const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER);
|
||||
INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx));
|
||||
// write to ddr
|
||||
#if HAS_FUSED_OPS
|
||||
OUTPUTVTYPE out_data;
|
||||
unroll_for (uint lw = 0; lw < TILE_SIZE; ++lw) {
|
||||
INPUT0_TYPE input_var = read_data[lw];
|
||||
FUSED_OPS;
|
||||
out_data[lw] = FUSED_OPS_RESULT;
|
||||
}
|
||||
const uint output_idx = OUTPUT_GET_TILED_INDEX(REORDERED_OUTPUT_TILED_ORDER);
|
||||
VSTORE(out_data, 0, output + output_idx);
|
||||
#else
|
||||
const uint output_idx = OUTPUT_GET_TILED_INDEX(REORDERED_OUTPUT_TILED_ORDER);
|
||||
VSTORE(ACTIVATION(TO_OUTPUTVTYPE(read_data), ACTIVATION_PARAMS), 0, output + output_idx);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#ifdef F_REMAINDER_CONDITION
|
||||
else if (F_REMAINDER_CONDITION) {
|
||||
unroll_for (uint lh = 0; lh < TILE_SIZE/*8*/; ++lh) {
|
||||
unroll_for (uint lw = 0; lw < F_REMAINDER_SIZE; ++lw) {
|
||||
// read
|
||||
const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER);
|
||||
INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx));
|
||||
// write to ddr
|
||||
const uint output_idx = OUTPUT_GET_TILED_INDEX(REORDERED_OUTPUT_TILED_ORDER);
|
||||
#if HAS_FUSED_OPS
|
||||
INPUT0_TYPE input_var = read_data[lw];
|
||||
FUSED_OPS;
|
||||
output[output_idx + lw] = FUSED_OPS_RESULT;
|
||||
#else
|
||||
output[output_idx + lw] = TO_OUTPUT_TYPE(read_data[lw]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // F_REMAINDER_CONDITION
|
||||
#else // !REORDERED_OUTPUT_TILED_ORDER
|
||||
if (F_NO_REMAINDER_CONDITION) {
|
||||
// read and transpose
|
||||
unroll_for (uint lh = 0; lh < TILE_SIZE; ++lh) {
|
||||
@ -59,16 +104,15 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
|
||||
|
||||
unroll_for (uint lw = 0; lw < TILE_SIZE; ++lw) {
|
||||
const uint dst = local_buf_offset + lw;
|
||||
#if HAS_FUSED_OPS
|
||||
#if HAS_FUSED_OPS
|
||||
INPUT0_TYPE input_var = read_data[lw];
|
||||
FUSED_OPS;
|
||||
transpose_buf[dst][lh] = FUSED_OPS_RESULT;
|
||||
#else
|
||||
#else
|
||||
transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
// write to ddr
|
||||
#ifdef YZ_REMAINDER_CONDITION
|
||||
if (YZ_REMAINDER_LESS_THAN_TILE_SIZE) {
|
||||
// copy one by one when z % TILE_SIZE < TILE_SIZE/2
|
||||
@ -94,12 +138,13 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
|
||||
VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
|
||||
}
|
||||
}
|
||||
#else
|
||||
#else // YZ_REMAINDER_CONDITION
|
||||
// write to ddr
|
||||
unroll_for (uint lw = 0; lw < TILE_SIZE; ++lw) {
|
||||
const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER);
|
||||
VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
|
||||
}
|
||||
#endif
|
||||
#endif //YZ_REMAINDER_CONDITION
|
||||
}
|
||||
#ifdef F_REMAINDER_CONDITION
|
||||
else if (F_REMAINDER_CONDITION) {
|
||||
@ -109,13 +154,13 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
|
||||
INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx));
|
||||
unroll_for (uint lw = 0; lw < F_REMAINDER_SIZE; ++lw) {
|
||||
uint dst = local_buf_offset + lw;
|
||||
#if HAS_FUSED_OPS
|
||||
INPUT0_TYPE input_var = read_data[lw];
|
||||
FUSED_OPS;
|
||||
transpose_buf[dst][lh] = FUSED_OPS_RESULT;
|
||||
#else
|
||||
transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#if HAS_FUSED_OPS
|
||||
INPUT0_TYPE input_var = read_data[lw];
|
||||
FUSED_OPS;
|
||||
transpose_buf[dst][lh] = FUSED_OPS_RESULT;
|
||||
#else
|
||||
transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
// write to ddr
|
||||
@ -145,12 +190,30 @@ KERNEL (permute_tile_8x8_4x4_fsv)(
|
||||
VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
|
||||
}
|
||||
}
|
||||
#else
|
||||
#else // !YZ_REMAINDER_CONDITION
|
||||
unroll_for (uint lw = 0; lw < F_REMAINDER_SIZE; ++lw) {
|
||||
const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER);
|
||||
VSTORE(transpose_buf[local_buf_offset + lw], 0, output + output_idx);
|
||||
}
|
||||
#endif
|
||||
#endif // YZ_REMAINDER_CONDITION
|
||||
}
|
||||
#endif
|
||||
#endif // F_REMAINDER_CONDITION
|
||||
#endif // REORDERED_OUTPUT)TILED_ORDER
|
||||
}
|
||||
|
||||
#undef unroll_for
|
||||
#undef CEIL_DIV(A, B)
|
||||
#undef INPUT0_GET_TILED_INDEX(ORDER)
|
||||
#undef OUTPUT_GET_TILED_INDEX(ORDER)
|
||||
#undef YZ_REMAINDER_LESS_THAN_TILE_SIZE
|
||||
#undef YZ_REMAINDER_MORE_THAN_TILE_SIZE
|
||||
#undef INPUTVTYPE
|
||||
#undef OUTPUTVTYPE
|
||||
#undef VLOAD
|
||||
#undef VSTORE
|
||||
#undef AS_INPUTVTYPE
|
||||
#undef AS_OUTPUTVTYPE
|
||||
#undef TO_OUTPUTVTYPE
|
||||
#undef GET_GLOBAL_ID(IDX)
|
||||
#undef GET_LOCAL_ID(IDX)
|
||||
#undef GET_LOCAL_SIZE(IDX)
|
||||
|
@ -236,7 +236,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
|
||||
fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_zyx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
|
||||
return true;
|
||||
|
||||
if (prev.is_type<permute>() && fmt_prev.dimension() == fmt_next.dimension()) {
|
||||
if (prev.is_type<permute>()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@ -157,7 +157,6 @@ public:
|
||||
void compare(network& not_fused, network& fused, T& p, bool count_reorder = false) {
|
||||
auto outputs_ref = not_fused.execute();
|
||||
auto outputs_fused = fused.execute();
|
||||
|
||||
auto get_reorders_count = [](network& net) -> size_t {
|
||||
size_t count = 0;
|
||||
for (auto& pi : net.get_primitives_info()) {
|
||||
@ -187,7 +186,6 @@ public:
|
||||
description << " " << i.original_id << " " << i.kernel_id << std::endl;
|
||||
}
|
||||
SCOPED_TRACE(description.str());
|
||||
|
||||
// Subtract reorders count to handle execution in different layouts when input/output reorders can be added in the graph
|
||||
ASSERT_EQ(fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_fused), p.expected_fused_primitives);
|
||||
ASSERT_EQ(not_fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_not_fused), p.expected_not_fused_primitives);
|
||||
@ -6409,7 +6407,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv,
|
||||
|
||||
struct permute_reorder_params {
|
||||
tensor in_shape;
|
||||
tensor out_shape;
|
||||
std::vector<uint16_t> permute_order1;
|
||||
std::vector<uint16_t> permute_order2;
|
||||
data_types permute_type;
|
||||
@ -6420,20 +6417,47 @@ struct permute_reorder_params {
|
||||
size_t expected_not_fused_primitives;
|
||||
};
|
||||
|
||||
#define CASE_PERMUTE_REORDER_F32_0 {1, 16, 32, 2}, {1, 16, 32, 2}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_F32_1 {2, 16, 16, 16}, {2, 16, 16, 16}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_F32_2 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_F32_0 {1, 16, 32, 2}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_F32_1 {2, 7, 9, 27}, {0, 3, 2, 1}, {0, 3, 2, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_F32_2 {1, 16, 4, 5, 16},{0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_F16_0 {1, 16, 2, 4}, {0, 2, 1, 3}, {0, 2, 1, 3}, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_F16_1 {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_F16_2 {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
|
||||
#define CASE_PERMUTE_REORDER_F16_0 {1, 16, 2, 4}, {1, 16, 2, 4}, {0, 2, 1, 3}, {0, 2, 1, 3}, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_F16_1 {1, 16, 4, 5, 16}, {1, 16, 4, 5, 16}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_F16_2 {1, 5, 1, 2, 14}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
// type change
|
||||
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 {1, 15, 4, 5}, {0, 2, 3, 1}, {0, 3, 1, 2}, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_S8_TO_F32_1 {1, 2, 15, 4, 5}, {0, 2, 4, 1, 3}, {0, 3, 1, 4, 2}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 {1, 17, 1, 2, 7}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
|
||||
#define CASE_PERMUTE_REORDER_S8_0 {1, 15, 4, 5}, {1, 15, 4, 5}, {0, 2, 3, 1}, {0, 3, 1, 2}, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_S8_1 {1, 2, 15, 4, 5}, {1, 2, 15, 4, 5}, {0, 2, 4, 1, 3}, {0, 3, 1, 4, 2}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
// dim change
|
||||
#define CASE_PERMUTE_REORDER_4D_TO_5D_F32_0 {1, 16, 8, 16}, {1, 2, 0, 3}, {0, 3, 1, 4, 2}, data_types::f32, data_types::f32, format::bfyx, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_4D_TO_6D_F32_1 {1, 16, 8, 16}, {0, 2, 3, 1}, {0, 3, 5, 4, 1, 2}, data_types::f32, data_types::f32, format::bfyx, format::bfwzyx
|
||||
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_0 {1, 16, 4, 5, 18},{0, 2, 3, 4, 1}, {0, 3, 1, 2}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_1 {1, 16, 4, 5, 16},{0, 2, 3, 4, 1}, {0, 3, 1, 2}, data_types::f32, data_types::f32, format::bfzyx, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_5D_TO_6D_F32_2 {1, 16, 8, 4, 16}, {0, 4, 2, 3, 1}, {0, 3, 5, 4, 1, 2}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
|
||||
#define CASE_PERMUTE_REORDER_6D_TO_4D_F32_0 {1, 16, 4, 5, 4, 16}, {0, 2, 5, 3, 4, 1}, {0, 3, 1, 2}, data_types::f32, data_types::f32, format::bfwzyx, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_6D_TO_5D_F32_1 {1, 16, 4, 5, 4, 16}, {0, 2, 5, 3, 4, 1}, {0, 3, 4, 1, 2}, data_types::f32, data_types::f32, format::bfwzyx, format::bfzyx
|
||||
|
||||
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 {1, 5, 1, 2, 14}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 {1, 18, 1, 2, 2}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 {1, 17, 1, 2, 7}, {1, 5, 1, 2, 14}, {0, 2, 3, 4, 1}, {0, 4, 1, 2, 3}, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
// permute_opt for blocked format
|
||||
#define CASE_PERMUTE_REORDER_TILED_F32_0 {1, 256, 2, 64}, {0, 3, 1, 2}, {0, 2, 3, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_TILED_F32_1 {1, 78, 2, 259}, {0, 3, 1, 2}, {0, 2, 3, 1}, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_TILED_F32_2 {1, 48, 1, 3, 259}, {0, 4, 1, 2, 3}, {0, 2, 3, 4, 1}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
|
||||
|
||||
// permute_opt for blocked format => reorder to differnt dim
|
||||
#define CASE_PERMUTE_REORDER_TILED_F32_3 {1, 45, 1, 3, 259}, {0, 4, 1, 2, 3}, {0, 2, 3, 1}, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
|
||||
|
||||
// permute opt for blocked format => reorder to different dim/type
|
||||
#define CASE_PERMUTE_REORDER_TILED_I8_4 {1, 45, 1, 3, 259}, {0, 4, 1, 2, 3}, {0, 2, 3, 1}, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_TILED_F16_5 {1, 48, 3, 256}, {0, 3, 1, 2}, {0, 2, 4, 3, 1}, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_TILED_F16_6 {1, 48, 2, 3, 256}, {0, 4, 1, 2, 3}, {0, 2, 5, 4, 3, 1}, data_types::f16, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
|
||||
|
||||
// permute opt for non_blocked format => reorder to differnt dim/type
|
||||
#define CASE_PERMUTE_REORDER_TILED_F16_7 {1, 48, 2, 3, 256}, {0, 4, 1, 2, 3}, {0, 2, 3, 1}, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_TILED_F16_8 {1, 28, 2, 2, 3, 256}, {0, 5, 1, 2, 3, 4}, {0, 2, 3, 1}, data_types::f16, data_types::f32, format::bfwzyx, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_TILED_F16_9 {1, 24, 2, 3, 256}, {0, 4, 1, 2, 3}, {0, 2, 3, 1}, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
|
||||
#define CASE_PERMUTE_REORDER_TILED_F16_10 {1, 35, 3, 253}, {0, 3, 1, 2}, {0, 2, 4, 3, 1}, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
|
||||
#define CASE_PERMUTE_REORDER_TILED_F16_11 {1, 32, 3, 253}, {0, 3, 1, 2}, {0, 2, 4, 5, 3, 1}, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx
|
||||
|
||||
class PermuteReorderFusingTest : public ::BaseFusingTest<permute_reorder_params> {
|
||||
public:
|
||||
@ -6460,7 +6484,7 @@ TEST_P(permute_redundant_reorder, basic) {
|
||||
input_layout("input", get_input_layout(p)),
|
||||
permute("permute1", "input", p.permute_order1),
|
||||
reorder("reorder1", "permute1", p.output_format, p.output_type), // to be fused
|
||||
permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder n
|
||||
permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder fused
|
||||
);
|
||||
tolerance = 1e-5f;
|
||||
execute(p);
|
||||
@ -6474,11 +6498,72 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_redundant_reorder,
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F16_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F16_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F16_2, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_S8_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_S8_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_S8_TO_F32_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_S8_TO_F32_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_U8_TO_F16_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 3},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 3},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 3},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 3},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 3},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_2, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_3, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_I8_4, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_5, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_6, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_7, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_8, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_9, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_10, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_11, 3, 4},
|
||||
}),);
|
||||
|
||||
class permute_act_reorder : public PermuteReorderFusingTest {};
|
||||
|
||||
TEST_P(permute_act_reorder, basic) {
|
||||
auto p = GetParam();
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
permute("permute1", "input", p.permute_order1),
|
||||
activation("activation", "permute1", activation_func::abs),
|
||||
reorder("reorder1", "activation", p.output_format, p.output_type), // to be fused
|
||||
permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder fused
|
||||
);
|
||||
tolerance = 1e-5f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(fusings_gpu, permute_act_reorder,
|
||||
::testing::ValuesIn(std::vector<permute_reorder_params> {
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F32_0, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F32_1, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F32_2, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F16_0, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F16_1, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_F16_2, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 4},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_0, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_1, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_2, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F32_3, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_5, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_6, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_7, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_8, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_9, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5},
|
||||
permute_reorder_params{CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5},
|
||||
}),);
|
||||
|
||||
class NormalizeFusingTest : public ::BaseFusingTest<normalize_test_params> {
|
||||
|
Loading…
Reference in New Issue
Block a user