[GPU] Align permute axis format with IE (#11379)

This commit is contained in:
Vladimir Paramuzov
2022-04-04 10:28:51 +03:00
committed by GitHub
parent d879e34363
commit afdaa7cf89
14 changed files with 225 additions and 232 deletions

View File

@@ -169,31 +169,6 @@ inline cldnn::format DefaultFormatForDims(size_t dimensions) {
return cldnn::format::bfyx; // Should not get here
}
// Converts a permute order given in IE dimension order (b, f, y, x, ...) into
// the cldnn dimension order (b, f, x, y, ... - spatial dims counted backwards).
// @param ie_order permute order with axes numbered in IE (bfyx) convention
// @param rank     minimum rank to pad the order to (clamped to at least 4)
// @return         equivalent permute order in cldnn (bfxy) convention
inline std::vector<uint16_t> ConvertPermuteOrder(const std::vector<uint16_t>& ie_order, size_t rank = 0) {
    std::vector<uint16_t> ie_order_aligned = ie_order;
    // cldnn orders are at least 4-dimensional - pad the tail with identity axes
    rank = std::max(rank, (size_t)4);
    for (auto o = ie_order_aligned.size(); o < rank; o++)
        ie_order_aligned.push_back((uint16_t)o);

    std::vector<uint16_t> cldnn_order;
    cldnn_order.reserve(ie_order_aligned.size());
    // 1. Remap spatial axis values: IE numbers spatial dims forward, cldnn
    //    numbers them backward, so axis o (o >= 2) maps to size + 1 - o.
    //    Explicit cast documents the intentional size_t -> uint16_t narrowing.
    for (auto const& o : ie_order_aligned) {
        if (o >= 2)
            cldnn_order.push_back(static_cast<uint16_t>(1 + ie_order_aligned.size() - o));
        else
            cldnn_order.push_back(o);
    }
    // 2. Mirror the spatial positions themselves (batch/feature slots stay put).
    //    size_t index fixes the signed/unsigned comparison of the previous int loop.
    for (size_t i = 0; i < (cldnn_order.size() - 2) / 2; i++) {
        std::swap(cldnn_order[2 + i], cldnn_order[1 + cldnn_order.size() - (2 + i)]);
    }
    return cldnn_order;
}
inline InferenceEngine::Layout InferenceEngineLayoutFromOVLayout(ov::Layout l) {
if (l == ov::Layout("C")) return InferenceEngine::Layout::C;
if (l == ov::Layout("CN")) return InferenceEngine::Layout::CN;

View File

@@ -35,7 +35,7 @@ struct permute : public primitive_base<permute> {
const std::vector<uint16_t>& permute_order = {},
const primitive_id& ext_prim_id = "",
const padding& output_padding = padding())
: primitive_base(id, {input}, ext_prim_id, output_padding), permute_order(permute_order) {}
: primitive_base(id, {input}, ext_prim_id, output_padding), permute_order(permute_order) { }
/// @brief Array of permuted output order in bfyx format.
std::vector<uint16_t> permute_order;

View File

@@ -13,7 +13,6 @@
#include "lstm_inst.h"
#include "reshape_inst.h"
#include "resample_inst.h"
#include "permute_inst.h"
#include "depth_to_space_inst.h"
#include "lstm_dynamic_inst.h"
#include "lstm_dynamic_input_inst.h"

View File

@@ -15,6 +15,33 @@ using namespace cldnn;
namespace cldnn {
namespace ocl {
namespace {
// Translates a permute order expressed in IE dimension order (bfyx...) into
// the cldnn dimension order (bfxy...), first padding the order up to `rank`
// (but never below 4) with identity axes.
inline std::vector<uint16_t> convert_permute_order(const std::vector<uint16_t>& ie_order, size_t rank = 0) {
    std::vector<uint16_t> aligned(ie_order);
    // cldnn works with at least 4 dimensions - extend shorter orders with a plain copy
    const size_t target_rank = std::max(rank, (size_t)4);
    while (aligned.size() < target_rank)
        aligned.push_back(static_cast<uint16_t>(aligned.size()));

    const size_t n = aligned.size();
    std::vector<uint16_t> result;
    result.reserve(n);
    // Spatial axis values (>= 2) are counted from the opposite end in cldnn.
    for (auto axis : aligned)
        result.push_back(axis < 2 ? axis : static_cast<uint16_t>(1 + n - axis));
    // The spatial slots themselves are mirrored as well; batch/feature stay in place.
    for (size_t lo = 2, hi = n - 1; lo < hi; ++lo, --hi)
        std::swap(result[lo], result[hi]);
    return result;
}
} // namespace
struct permute_impl : typed_primitive_impl_ocl<permute> {
using parent = typed_primitive_impl_ocl<permute>;
using parent::parent;
@@ -28,7 +55,8 @@ struct permute_impl : typed_primitive_impl_ocl<permute> {
auto permute_optional_params =
get_default_optional_params<kernel_selector::permute_optional_params>(arg.get_program());
const auto& permute_order = arg.get_primitive()->permute_order;
auto in_rank = arg.get_dependency(0).get_output_layout().get_rank();
auto permute_order = convert_permute_order(arg.get_primitive()->permute_order, in_rank);
permute_params.order = permute_order;
auto& kernel_selector = kernel_selector::permute_kernel_selector::Instance();
auto best_kernels = kernel_selector.GetBestKernels(permute_params, permute_optional_params);

View File

@@ -376,7 +376,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
auto& permute_order = next.as<permute>().get_primitive()->permute_order;
if ((fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::b_fs_yx_fsv32 || fmt_prev == format::b_fs_zyx_fsv32 ||
fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_zyx_fsv16 || fmt_prev == format::bs_fs_yx_bsv16_fsv16)
&& permute_order[1] == 2
&& permute_order.back() != 1
&& (!next.as<permute>().is_rotating_except_batch())) {
return false;
}
@@ -428,7 +428,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
auto& permute_order = prev.as<permute>().get_primitive()->permute_order;
if ((fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::b_fs_yx_fsv32 || fmt_prev == format::b_fs_zyx_fsv32 ||
fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_zyx_fsv16 || fmt_prev == format::bs_fs_yx_bsv16_fsv16)
&& permute_order[1] == 2
&& permute_order.back() != 1
&& (!prev.as<permute>().is_rotating_except_batch())) {
return false;
}

View File

@@ -24,20 +24,26 @@ layout permute_inst::calc_output_layout(permute_node const& node) {
"Output data type forcing is not supported for permute_node!");
auto input_layout = node.input().get_output_layout();
auto permute_order = node.get_primitive()->permute_order;
std::vector<tensor::value_type> output_sizes;
std::vector<tensor::value_type> output_shape;
auto input_shape = input_layout.get_dims();
for (size_t x = 0; x < permute_order.size(); x++) {
output_sizes.push_back(input_layout.size.raw[permute_order[x]]);
output_shape.push_back(input_shape[permute_order[x]]);
}
auto input_size = tensor(output_sizes);
for (size_t i = output_shape.size(); i < 4; i++) {
output_shape.push_back(1);
}
auto output_size = tensor(format::get_default_format(input_layout.get_rank()), output_shape);
auto op = node.get_primitive()->output_padding;
if (node.has_fused_primitives()) {
input_layout.data_type = node.get_fused_output_layout().data_type;
}
return layout(input_layout.data_type, input_layout.format, input_size, op);
return layout(input_layout.data_type, input_layout.format, output_size, op);
}
std::string permute_inst::to_string(permute_node const& node) {
@@ -67,13 +73,6 @@ std::string permute_inst::to_string(permute_node const& node) {
permute_inst::typed_primitive_inst(network& network, permute_node const& node) : parent(network, node) {
auto permute_order = argument.permute_order;
CLDNN_ERROR_LESS_THAN(node.id(),
"Permute order size",
permute_order.size(),
"minimum order size",
4,
"Permute order size needs to be at least 4.");
auto required_order_values_size = static_cast<uint32_t>(permute_order.size());
for (decltype(required_order_values_size) i = 0; i < required_order_values_size; i++) {

View File

@@ -127,7 +127,7 @@ static void CreateConvolutionBackpropDataOp(Program& p, const std::shared_ptr<ng
std::swap(permute_order[1], permute_order[0]);
auto permutePrim = cldnn::permute(permuteName,
weightsName,
ConvertPermuteOrder(permute_order, weights_rank),
permute_order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);
@@ -191,7 +191,7 @@ static void CreateGroupConvolutionBackpropDataOp(Program& p, const std::shared_p
std::swap(permute_order[2], permute_order[1]);
auto permutePrim = cldnn::permute(permuteName,
weightsName,
ConvertPermuteOrder(permute_order, weights_rank),
permute_order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);

View File

@@ -87,11 +87,10 @@ static void CreateMatMulOp(Program& p, const std::shared_ptr<ngraph::op::v0::Mat
for (auto o = transpose_order.size(); o < 4; o++)
transpose_order.push_back((uint16_t)o);
std::vector<uint16_t> cldnn_permute_order = ConvertPermuteOrder(transpose_order);
auto permuteName = op->get_friendly_name() + "/transpose_b";
auto permutePrim = cldnn::permute(permuteName,
weightsName,
cldnn_permute_order,
transpose_order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);
p.AddInnerPrimitiveToProfiler(permuteName, layerName, op);
@@ -107,11 +106,10 @@ static void CreateMatMulOp(Program& p, const std::shared_ptr<ngraph::op::v0::Mat
for (auto o = transpose_order.size(); o < 4; o++)
transpose_order.push_back((uint16_t)o);
std::vector<uint16_t> cldnn_permute_order = ConvertPermuteOrder(transpose_order);
auto permuteName = op->get_friendly_name() + "/transpose_a";
auto permutePrim = cldnn::permute(permuteName,
inputName,
cldnn_permute_order,
transpose_order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);
p.AddInnerPrimitiveToProfiler(permuteName, layerName, op);

View File

@@ -20,13 +20,13 @@ static void CreateTransposeOp(Program& p, const std::shared_ptr<ngraph::op::v1::
auto inputPrimitives = p.GetInputPrimitiveIDs(op);
std::string layerName = layer_type_name_ID(op);
std::vector<uint16_t> ie_order;
std::vector<uint16_t> order;
if (op->get_input_size() == 2) {
auto order_constant = std::dynamic_pointer_cast<ngraph::op::Constant>(op->get_input_node_shared_ptr(1));
if (!order_constant) {
IE_THROW() << "Unsupported parameter nodes type in " << op->get_friendly_name() << " (" << op->get_type_name() << ")";
}
ie_order = order_constant->cast_vector<uint16_t>();
order = order_constant->cast_vector<uint16_t>();
}
auto is_convert_color_type = [](const std::shared_ptr<ov::Node> &node) {
@@ -40,7 +40,7 @@ static void CreateTransposeOp(Program& p, const std::shared_ptr<ngraph::op::v1::
// In case of ConvertColor operation we have NHWC (byxf) input format which should be converted to
// NCHW (bfyx) by this Permute, so we replace Permute with Reorder (to bfyx) primitive
auto input = op->input(0).get_source_output().get_node_shared_ptr();
if (is_convert_color_type(input) && ie_order == std::vector<uint16_t>{0, 3, 1, 2}) {
if (is_convert_color_type(input) && order == std::vector<uint16_t>{0, 3, 1, 2}) {
auto precision = input->get_element_type();
p.AddPrimitive(cldnn::reorder(layerName,
inputPrimitives[0],
@@ -54,17 +54,15 @@ static void CreateTransposeOp(Program& p, const std::shared_ptr<ngraph::op::v1::
}
int rank = std::max(4, static_cast<int>(op->get_input_shape(0).size()));
if (ie_order.empty()) {
if (order.empty()) {
// if order size is less than 4 - fill the rest with just copy
for (int o = rank - 1; o >= 0; o--)
ie_order.push_back((uint16_t)o);
order.push_back((uint16_t)o);
}
std::vector<uint16_t> cldnn_permute_order = ConvertPermuteOrder(ie_order, rank);
auto permutePrim = cldnn::permute(layerName,
inputPrimitives[0],
cldnn_permute_order,
order,
op->get_friendly_name());
p.AddPrimitive(permutePrim);

View File

@@ -90,8 +90,8 @@ TEST_P(permute_eltwise_loop, basic) {
execute(p);
}
#define CASE_LOOP_F32_1 3, { 1, 8, 3, 2 }, { 1, 2, 8, 3 }, { 1, 2, 8, 1 }, { 0, 3, 1, 2 }, data_types::f32, format::bfyx, data_types::f32
#define CASE_LOOP_F16_0 4, { 1, 12, 4, 2 }, { 1, 2, 12, 4 }, { 1, 2, 12, 1 }, { 0, 3, 1, 2 }, data_types::f16, format::bfyx, data_types::f16
#define CASE_LOOP_F32_1 3, { 1, 8, 3, 2 }, { 1, 2, 8, 3 }, { 1, 2, 8, 1 }, { 0, 2, 3, 1 }, data_types::f32, format::bfyx, data_types::f32
#define CASE_LOOP_F16_0 4, { 1, 12, 4, 2 }, { 1, 2, 12, 4 }, { 1, 2, 12, 1 }, { 0, 2, 3, 1 }, data_types::f16, format::bfyx, data_types::f16
INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_eltwise_loop, ::testing::ValuesIn(std::vector<loop_params>{
loop_params{ CASE_LOOP_F32_1, 3, 5 },

View File

@@ -87,75 +87,75 @@ public:
/* ------------------------------------------------------------------------------------------------------------ */
#define CASE_PERMUTE_F32_0 { 1, 16, 2, 2 }, { 1, 16, 2, 2 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_1 { 1, 15, 16, 16 }, { 1, 15, 16, 16 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_2 { 1, 8, 16, 16 }, { 16, 16, 8, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_3 { 1, 1, 3, 4 }, { 1, 3, 4, 1 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_2 { 1, 8, 16, 16 }, { 16, 16, 8, 1 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_3 { 1, 1, 3, 4 }, { 1, 3, 4, 1 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_4 { 2, 16, 16, 16 }, { 2, 16, 16, 16 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_5 { 1, 32, 4, 5 }, { 32, 4, 5, 1 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_6 { 1, 16, 4, 5 }, { 5, 16, 4, 1 }, { 3, 1, 2, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_7 { 1, 16, 1, 1 }, { 1, 1, 1, 16 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_5 { 1, 32, 4, 5 }, { 32, 4, 5, 1 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_6 { 1, 16, 4, 5 }, { 5, 16, 4, 1 }, { 2, 1, 0, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F32_7 { 1, 16, 1, 1 }, { 1, 1, 1, 16 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_0 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_1 { 2, 16, 4, 5 }, { 16, 4, 5, 2 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_2 { 1, 32, 2, 3 }, { 2, 3, 32, 1 }, { 2, 3, 1, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_3 { 3, 16, 1, 1 }, { 1, 1, 16, 3 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_4 { 2, 15, 4, 5 }, { 4, 2, 5, 15 }, { 2, 0, 3, 1 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_5 { 1, 15, 1, 2 }, { 15, 2, 1, 1 }, { 1, 3, 2, 0 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_6 { 1, 15, 4, 4 }, { 4, 4, 1, 15 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_1 { 2, 16, 4, 5 }, { 16, 4, 5, 2 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_2 { 1, 32, 2, 3 }, { 2, 3, 32, 1 }, { 3, 2, 0, 1 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_3 { 3, 16, 1, 1 }, { 1, 1, 16, 3 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_4 { 2, 15, 4, 5 }, { 4, 2, 5, 15 }, { 3, 0, 1, 2 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_5 { 1, 15, 1, 2 }, { 15, 2, 1, 1 }, { 1, 2, 0, 3 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_F16_6 { 1, 15, 4, 4 }, { 4, 4, 1, 15 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_0 { 1, 15, 4, 5 }, { 1, 15, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_1 { 1, 15, 4, 5 }, { 5, 4, 15, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_2 { 1, 16, 1, 2 }, { 1, 1, 16, 2 }, { 2, 0, 1, 3 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_3 { 1, 16, 2, 2 }, { 2, 2, 16, 1 }, { 2, 3, 1, 0 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_0 { 1, 15, 4, 5 }, { 15, 5, 1, 4 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_1 { 1, 15, 16, 16 }, { 15, 16, 1, 16 }, { 1, 2, 0, 3 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_1 { 1, 15, 4, 5 }, { 5, 4, 15, 1 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_2 { 1, 16, 1, 2 }, { 1, 1, 16, 2 }, { 3, 0, 2, 1 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_S8_3 { 1, 16, 2, 2 }, { 2, 2, 16, 1 }, { 3, 2, 0, 1 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_0 { 1, 15, 4, 5 }, { 15, 5, 1, 4 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_1 { 1, 15, 16, 16 }, { 15, 16, 1, 16 }, { 1, 3, 2, 0 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_2 { 1, 32, 5, 4 }, { 1, 32, 5, 4 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_3 { 1, 16, 4, 5 }, { 5, 4, 16, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_PERMUTE_U8_3 { 1, 16, 4, 5 }, { 5, 4, 16, 1 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
// 3d
#define CASE_PERMUTE_F32_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_1 { 2, 15, 2, 3, 4 }, { 15, 2, 3, 4, 2 }, { 1, 2, 3, 4, 0 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_2 { 2, 16, 4, 4, 5 }, { 4, 2, 4, 5, 16 }, { 3, 0, 2, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_3 { 1, 32, 4, 2, 2 }, { 2, 2, 32, 1, 4 }, { 4, 3, 1, 0, 2 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_4 { 1, 16, 1, 1, 1 }, { 1, 1, 1, 16, 1 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_1 { 2, 15, 2, 3, 4 }, { 15, 2, 3, 4, 2 }, { 1, 4, 0, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_2 { 2, 16, 4, 4, 5 }, { 4, 2, 4, 5, 16 }, { 3, 0, 1, 2, 4 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_3 { 1, 32, 4, 2, 2 }, { 2, 2, 32, 1, 4 }, { 2, 3, 4, 0, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F32_3D_4 { 1, 16, 1, 1, 1 }, { 1, 1, 1, 16, 1 }, { 4, 2, 3, 1, 0 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 2, 15, 3 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 2, 15, 3 }, { 4, 2, 3, 1, 0 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_2 { 2, 16, 4, 4, 3 }, { 2, 4, 3, 16, 4 }, { 0, 3, 4, 1, 2 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 2, 4, 0 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_4 { 16, 16, 1, 1, 1 },{ 1, 16, 1, 1, 16 },{ 4, 0, 3, 2, 1 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 0, 2, 4 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_F16_3D_4 { 16, 16, 1, 1, 1 },{ 1, 16, 1, 1, 16 },{ 2, 0, 1, 4, 3 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 15, 2, 3 }, { 4, 2, 1, 0, 3 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 15, 2, 3 }, { 2, 4, 3, 0, 1 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_2 { 2, 16, 4, 4, 3 }, { 2, 4, 3, 16, 4 }, { 0, 3, 4, 1, 2 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 2, 4, 0 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_0 { 16, 16, 1, 1, 1 }, { 1, 1, 16, 16, 1 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_1 { 16, 16, 1, 1, 1 }, { 1, 1, 1, 16, 16 }, { 4, 3, 2, 1, 0 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_2 { 2, 16, 4, 4, 3 }, { 4, 2, 4, 3, 16 }, { 3, 0, 2, 4, 1 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_3 { 1, 32, 4, 2, 1 }, { 1, 2, 32, 1, 4 }, { 4, 3, 1, 0, 2 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_S8_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 0, 2, 4 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_0 { 16, 16, 1, 1, 1 }, { 1, 1, 16, 16, 1 }, { 4, 2, 3, 1, 0 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_1 { 16, 16, 1, 1, 1 }, { 1, 1, 1, 16, 16 }, { 2, 3, 0, 1, 4 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_2 { 2, 16, 4, 4, 3 }, { 4, 2, 4, 3, 16 }, { 3, 0, 1, 2, 4 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_U8_3D_3 { 1, 32, 4, 2, 1 }, { 1, 2, 32, 1, 4 }, { 2, 3, 4, 0, 1 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx
// permute_tile_8x8_4x4
#define CASE_PERMUTE_TILE_8x8_4x4_4D_0 { 1, 8, 8, 2 }, { 1, 2, 8, 8 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_1 { 1, 5, 8, 2 }, { 1, 2, 5, 8 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_2 { 1, 8, 5, 2 }, { 1, 2, 8, 5 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_3 { 1, 5, 5, 2 }, { 1, 2, 5, 5 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_0 { 1, 8, 8, 2, 2 }, { 1, 2, 8, 8, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_1 { 1, 5, 8, 2, 2 }, { 1, 2, 5, 8, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_2 { 1, 8, 5, 2, 2 }, { 1, 2, 8, 5, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_3 { 1, 5, 5, 2, 2 }, { 1, 2, 5, 5, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_0 { 1, 8, 8, 2, 2, 2 }, { 1, 2, 8, 8, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_1 { 1, 5, 8, 2, 2, 2 }, { 1, 2, 5, 8, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_2 { 1, 8, 5, 2, 2, 2 }, { 1, 2, 8, 5, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_3 { 1, 5, 5, 2, 2, 2 }, { 1, 2, 5, 5, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_0 { 1, 8, 8, 2 }, { 1, 2, 8, 8 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_1 { 1, 5, 8, 2 }, { 1, 2, 5, 8 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_2 { 1, 8, 5, 2 }, { 1, 2, 8, 5 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_4D_3 { 1, 5, 5, 2 }, { 1, 2, 5, 5 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_0 { 1, 8, 8, 2, 2 }, { 1, 2, 8, 8, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_1 { 1, 5, 8, 2, 2 }, { 1, 2, 5, 8, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_2 { 1, 8, 5, 2, 2 }, { 1, 2, 8, 5, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_5D_3 { 1, 5, 5, 2, 2 }, { 1, 2, 5, 5, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_0 { 1, 8, 8, 2, 2, 2 }, { 1, 2, 8, 8, 2, 2 }, { 0, 2, 3, 4, 5, 1 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_1 { 1, 5, 8, 2, 2, 2 }, { 1, 2, 5, 8, 2, 2 }, { 0, 2, 3, 4, 5, 1 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_2 { 1, 8, 5, 2, 2, 2 }, { 1, 2, 8, 5, 2, 2 }, { 0, 2, 3, 4, 5, 1 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
#define CASE_PERMUTE_TILE_8x8_4x4_6D_3 { 1, 5, 5, 2, 2, 2 }, { 1, 2, 5, 5, 2, 2 }, { 0, 2, 3, 4, 5, 1 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx
// permute_tile_8x8_4x4_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0 { 1, 16, 16, 2 }, { 1, 2, 16, 16 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1 { 1, 15, 16, 2 }, { 1, 2, 15, 16 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2 { 1, 16, 3, 2 }, { 1, 2, 16, 3 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3 { 1, 5, 7, 2 }, { 1, 2, 5, 7 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0 { 1, 16, 16, 2, 2 }, { 1, 2, 16, 16, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1 { 1, 15, 16, 2, 2 }, { 1, 2, 15, 16, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2 { 1, 16, 3, 2, 2 }, { 1, 2, 16, 3, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3 { 1, 5, 7, 2, 2 }, { 1, 2, 5, 7, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0 { 1, 16, 16, 2 }, { 1, 2, 16, 16 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1 { 1, 15, 16, 2 }, { 1, 2, 15, 16 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2 { 1, 16, 3, 2 }, { 1, 2, 16, 3 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3 { 1, 5, 7, 2 }, { 1, 2, 5, 7 }, { 0, 2, 3, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0 { 1, 16, 16, 2, 2 }, { 1, 2, 16, 16, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1 { 1, 15, 16, 2, 2 }, { 1, 2, 15, 16, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2 { 1, 16, 3, 2, 2 }, { 1, 2, 16, 3, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3 { 1, 5, 7, 2, 2 }, { 1, 2, 5, 7, 2 }, { 0, 2, 3, 4, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16
class permute_activation_scale_eltwise: public PermuteFusingTest {};
TEST_P(permute_activation_scale_eltwise, basic) {
@@ -455,49 +455,49 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv, ::t
/* ---------------------------- PERMUTE FUSE REDUNDANT REORDER cases ------------------------------------------ */
/* ------------------------------------------------------------------------------------------------------------ */
#define CASE_PERMUTE_REORDER_F32_0 { 1, 16, 32, 2 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F32_1 { 2, 7, 9, 27 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_F32_2 { 1, 16, 4, 5, 16 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_0 { 1, 16, 2, 4 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F16_1 { 1, 16, 4, 5, 16 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_0 { 1, 16, 32, 2 }, { 0, 2, 1, 3 }, { 0, 2, 1, 3 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F32_1 { 2, 7, 9, 27 }, { 0, 2, 1, 3 }, { 0, 2, 1, 3 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_F32_2 { 1, 16, 4, 5, 16 }, { 0, 2, 3, 4, 1 }, { 0, 2, 3, 4, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_0 { 1, 16, 2, 4 }, { 0, 2, 1, 3 }, { 0, 2, 1, 3 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_F16_1 { 1, 16, 4, 5, 16 }, { 0, 2, 1, 3, 4 }, { 0, 1, 2, 3, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F16_2 { 1, 5, 1, 2, 14 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
// type change
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 { 1, 15, 4, 5 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_S8_TO_F32_0 { 1, 15, 4, 5 }, { 0, 2, 1, 3 }, { 0, 2, 1, 3 }, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx
#define CASE_PERMUTE_REORDER_S8_TO_F32_1 { 1, 2, 15, 4, 5 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 { 1, 5, 1, 2, 14 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 { 1, 17, 1, 2, 7 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_F32_TO_F16_0 { 1, 5, 1, 2, 14 }, { 0, 2, 1, 3, 4 }, { 0, 1, 2, 3, 4 }, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_U8_TO_F16_0 { 1, 17, 1, 2, 7 }, { 0, 2, 1, 3, 4 }, { 0, 1, 2, 3, 4 }, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx
// dim change
#define CASE_PERMUTE_REORDER_4D_TO_5D_F32_0 { 1, 16, 8, 16 }, { 1, 2, 0, 3 }, { 0, 3, 1, 4, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_4D_TO_6D_F32_1 { 1, 16, 8, 16 }, { 0, 2, 3, 1 }, { 0, 3, 5, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfwzyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_0 { 1, 16, 4, 5, 18 },{ 0, 4, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_1 { 1, 16, 4, 5, 16 },{ 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_6D_F32_2 { 1, 16, 8, 4, 16 }, { 0, 4, 2, 3, 1 }, { 0, 3, 5, 4, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
#define CASE_PERMUTE_REORDER_6D_TO_4D_F32_0 { 1, 16, 4, 5, 4, 16 }, { 0, 2, 5, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_6D_TO_5D_F32_1 { 1, 16, 4, 5, 4, 16 }, { 0, 2, 5, 3, 4, 1 }, { 0, 3, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfzyx
#define CASE_PERMUTE_REORDER_4D_TO_5D_F32_0 { 1, 16, 8, 16 }, { 1, 3, 2, 0 }, { 0, 3, 4, 2, 1 }, data_types::f32, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_4D_TO_6D_F32_1 { 1, 16, 8, 16 }, { 0, 3, 1, 2 }, { 0, 4, 5, 1, 3, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfwzyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_0 { 1, 16, 4, 5, 18 },{ 0, 2, 1, 3, 4 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_1 { 1, 16, 4, 5, 16 },{ 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_5D_TO_6D_F32_2 { 1, 16, 8, 4, 16 }, { 0, 2, 1, 3, 4 }, { 0, 4, 5, 1, 3, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
#define CASE_PERMUTE_REORDER_6D_TO_4D_F32_0 { 1, 16, 4, 5, 4, 16 }, { 0, 5, 1, 4, 3, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_6D_TO_5D_F32_1 { 1, 16, 4, 5, 4, 16 }, { 0, 5, 1, 4, 3, 2 }, { 0, 3, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfzyx
// permute_opt for blocked format
#define CASE_PERMUTE_REORDER_TILED_F32_0 { 1, 256, 2, 64 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_1 { 1, 78, 2, 259 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_2 { 1, 48, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 4, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F32_0 { 1, 256, 2, 64 }, { 0, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_1 { 1, 78, 2, 259 }, { 0, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_2 { 1, 48, 1, 3, 259 }, { 0, 2, 3, 4, 1 }, { 0, 4, 1, 2, 3 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx
// permute_opt for blocked format => reorder to different dim
#define CASE_PERMUTE_REORDER_TILED_F32_3 { 1, 45, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_4 { 2, 273, 19, 19 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_5 { 2, 546, 2, 2 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_3 { 1, 45, 1, 3, 259 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_4 { 2, 273, 19, 19 }, { 0, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F32_5 { 2, 546, 2, 2 }, { 0, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx
// permute opt for blocked format => reorder to different dim/type
#define CASE_PERMUTE_REORDER_TILED_I8_4 { 1, 45, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_5 { 1, 48, 3, 256 }, { 0, 3, 1, 2 }, { 0, 2, 4, 3, 1 }, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_6 { 1, 48, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 5, 4, 3, 1 }, data_types::f16, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
#define CASE_PERMUTE_REORDER_TILED_I8_4 { 1, 45, 1, 3, 259 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_5 { 1, 48, 3, 256 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2 }, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_6 { 1, 48, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 5, 1, 4, 3, 2 }, data_types::f16, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx
// permute opt for non_blocked format => reorder to different dim/type
#define CASE_PERMUTE_REORDER_TILED_F16_7 { 1, 48, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_8 { 1, 28, 2, 2, 3, 256 }, { 0, 5, 1, 2, 3, 4 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 3, 1, 2 }, { 0, 2, 4, 3, 1 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 3, 1, 2 }, { 0, 2, 4, 5, 3, 1 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx
#define CASE_PERMUTE_REORDER_TILED_F16_7 { 1, 48, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_8 { 1, 28, 2, 2, 3, 256 }, { 0, 2, 3, 4, 5, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfwzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx
#define CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 2, 3, 1 }, { 0, 4, 1, 3, 2 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx
#define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 2, 3, 1 }, { 0, 5, 1, 4, 2, 3 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx
class permute_redundant_reorder : public PermuteReorderFusingTest {};
TEST_P(permute_redundant_reorder, basic) {
@@ -514,21 +514,21 @@ TEST_P(permute_redundant_reorder, basic) {
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_redundant_reorder, ::testing::ValuesIn(std::vector<permute_reorder_params>{
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_2, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_U8_TO_F16_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_1, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_TO_F16_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_U8_TO_F16_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 4, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 3 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_0, 3, 4 },
@@ -563,17 +563,17 @@ TEST_P(permute_act_reorder, basic) {
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_act_reorder, ::testing::ValuesIn(std::vector<permute_reorder_params>{
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F32_2, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 4, 5 },
permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 4 },
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_0, 3, 5 },

View File

@@ -53,7 +53,7 @@ TEST(depth_to_space_fp16_gpu, d1411_bs2) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
ASSERT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
}
}
@@ -95,7 +95,7 @@ TEST(depth_to_space_fp16_gpu, d1421_bs2) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
ASSERT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
}
}
@@ -158,7 +158,7 @@ TEST(depth_to_space_fp16_gpu, d1933_bs3) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
ASSERT_EQ(expected_results[i], float16_to_float32(output_ptr[i]));
}
}
@@ -197,7 +197,7 @@ TEST(depth_to_space_fp32_gpu, d1411_bs2) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}
@@ -231,7 +231,7 @@ TEST(depth_to_space_fp32_gpu, d112960540_bs2) {
auto output = outputs.at("depth_to_space").get_memory();
cldnn::mem_lock<FLOAT16> output_ptr (output, get_test_stream());
std::vector<uint16_t> perm = { 0,4,5,2,1,3 };
std::vector<uint16_t> perm = { 0,3,4,1,5,2 };
topology topology_ref;
topology_ref.add(input_layout("Input0", input1->get_layout()));
@@ -259,7 +259,7 @@ TEST(depth_to_space_fp32_gpu, d112960540_bs2) {
cldnn::mem_lock<FLOAT16> output_ptr_ref(output_ref, get_test_stream());
for (size_t i = 0; i < output->get_layout().count(); ++i) {
EXPECT_EQ(output_ptr_ref[i], output_ptr[i]);
ASSERT_EQ(output_ptr_ref[i], output_ptr[i]);
}
}
@@ -314,7 +314,7 @@ TEST(depth_to_space_fp32_gpu, d1933_bs3) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}
@@ -364,7 +364,7 @@ TEST(depth_to_space_fp32_gpu, d1822_bs2_blocks_first) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}
@@ -414,6 +414,6 @@ TEST(depth_to_space_fp32_gpu, d1822_bs2_depth_first) {
};
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}

View File

@@ -26,37 +26,32 @@ TEST(permute_gpu_f32, output_ordering_test)
{
auto& engine = get_test_engine();
std::vector<std::vector<int32_t>> input_tensors =
{
{ 10, 5, 15, 2 },{ 2, 4, 6, 8 },{ 2, 2, 3, 2 },{ 9, 8, 7, 4 }
std::vector<std::vector<int32_t>> input_tensors = {
{ 10, 5, 15, 2 },
{ 2, 4, 6, 8 },
{ 2, 2, 3, 2 },
{ 9, 8, 7, 4 }
};
std::vector<std::vector<uint16_t>> permutations =
{
std::vector<std::vector<uint16_t>> permutations = {
{ 0, 1, 2, 3 }, //do nothing
{ 0, 1, 3, 2 }, //replace x with y
{ 1, 0, 3, 2 }, //replace b with f
{ 0, 2, 3, 1 } //big permutation
{ 0, 1, 3, 2 }, //replace x with y
{ 1, 0, 3, 2 }, //replace b with f
{ 0, 2, 3, 1 } //big permutation
};
std::vector<format> input_formats = { format::bfyx, format::yxfb };
auto get_permutation = [&](const std::vector<int32_t>& inp1, const std::vector<uint16_t>& order)
{
auto get_permutation = [&](const std::vector<int32_t>& inp1, const std::vector<uint16_t>& order) {
EXPECT_EQ(inp1.size(), order.size());
std::vector<int32_t> output;
for (auto const& o : order)
{
for (auto const& o : order) {
output.push_back(inp1.at(o));
}
return output;
};
for (auto const& fr : input_formats)
{
for (auto const& inp_t : input_tensors)
{
for (auto const& perm : permutations)
{
for (auto const& fr : input_formats) {
for (auto const& inp_t : input_tensors) {
for (auto const& perm : permutations) {
auto input = engine.allocate_memory({ data_types::f32, fr, tensor(inp_t) });
topology topology(
input_layout("input", input->get_layout()),
@@ -68,12 +63,12 @@ TEST(permute_gpu_f32, output_ordering_test)
auto output = outputs.at("permute");
auto output_mem = output.get_memory();
EXPECT_EQ(outputs.size(), size_t(1));
auto ref_tensor = get_permutation(inp_t, perm);
auto out_tensor = output_mem->get_layout().size;
EXPECT_EQ(out_tensor.batch[0], ref_tensor[0]);
EXPECT_EQ(out_tensor.feature[0], ref_tensor[1]);
EXPECT_EQ(out_tensor.spatial[0], ref_tensor[2]);
EXPECT_EQ(out_tensor.spatial[1], ref_tensor[3]);
auto ref_tensor = get_permutation(input->get_layout().get_dims(), perm);
auto out_tensor = output_mem->get_layout().get_dims();
EXPECT_EQ(out_tensor[0], ref_tensor[0]);
EXPECT_EQ(out_tensor[1], ref_tensor[1]);
EXPECT_EQ(out_tensor[2], ref_tensor[2]);
EXPECT_EQ(out_tensor[3], ref_tensor[3]);
}
}
}
@@ -552,9 +547,9 @@ TEST(permute_fuse_reorder_gpu_f32, basic_b_fs_yx_fsv4_permute_1_8_16_1)
topology topology_unfused(
input_layout("input", input->get_layout()),
reorder("reorder1", "input", format::b_fs_yx_fsv4, data_types::f32),
permute("permute", "reorder1", { 0, 3, 1, 2}),
permute("permute", "reorder1", { 0, 2, 3, 1}),
reorder("reorder2", "permute", format::bfyx, data_types::f32),
permute("out", "reorder2", { 0, 2, 3, 1}));
permute("out", "reorder2", { 0, 3, 1, 2}));
cldnn::build_options options_unfused;
options_unfused.set_option(cldnn::build_option::optimize_data(false));
@@ -567,9 +562,9 @@ TEST(permute_fuse_reorder_gpu_f32, basic_b_fs_yx_fsv4_permute_1_8_16_1)
topology topology_fused(
input_layout("input", input->get_layout()),
reorder("reorder1", "input", format::b_fs_yx_fsv4, data_types::f32),
permute("permute", "reorder1", { 0, 3, 1, 2}),
permute("permute", "reorder1", { 0, 2, 3, 1}),
reorder("reorder2", "permute", format::bfyx, data_types::f32), // to be fused to previous permute
permute("out", "reorder2", { 0, 2, 3, 1})); // return to original value
permute("out", "reorder2", { 0, 3, 1, 2})); // return to original value
cldnn::build_options options_fused;
options_fused.set_option(cldnn::build_option::optimize_data(true));
@@ -794,7 +789,7 @@ TEST(permute_gpu_f32, 6D_reshape_permute_reshape)
const int w_reshape = 2;
const int z_reshape = 2;
std::vector<uint16_t> permute_order = { 0, 1, 5, 4, 2, 3 };
std::vector<uint16_t> permute_order = { 0, 1, 4, 5, 3, 2 };
auto input_size = cldnn::tensor(batch(b), feature(f), spatial(x, y));
auto input_mem = engine.allocate_memory({ data_types::f32, format::bfyx, input_size });
@@ -839,10 +834,10 @@ TEST(permute_gpu_f32, 6D_reshape_permute_reshape)
EXPECT_EQ(expected_out[i], output_ptr[i]);
}
}
TEST(permute_gpu_f32, basic_bfzyx_permute_0_2_3_4_1)
TEST(permute_gpu_f32, basic_bfzyx_permute_0_4_1_2_3)
{
// Input : bfzyx:2x2x2x2x3
// Permute order : { 0,2,3,4,1 }
// Permute order : { 0,4,1,2,3 }
auto& engine = get_test_engine();
@@ -872,7 +867,7 @@ TEST(permute_gpu_f32, basic_bfzyx_permute_0_2_3_4_1)
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 2, 3, 4, 1 }));
permute("permute", "input", { 0, 4, 1, 2, 3 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -918,9 +913,9 @@ TEST(permute_gpu_f32, basic_bfzyx_permute_0_2_3_4_1)
* Test cases for permute_tile_8x8_4x4 kernel
*
* These test cases are enabled only when the batch axis moves to the last position.
* i.e permute order is 0,3,1,2 or 0,4,1,2,3 or 0,5,1,2,3,4
* i.e permute order is 0,2,3,1 or 0,4,1,2,3 or 0,5,1,2,3,4
*/
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfyx_0_3_1_2) {
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfyx_0_2_3_1) {
// Input : bfyx:2x8x2x8
// Permute order : { 0,3,1,2 }
@@ -932,14 +927,14 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfyx_0_3_1_2) {
std::vector<float> input_data;
input_data.reserve(array_size);
for (size_t i=0 ; i < array_size; ++i)
for (size_t i = 0; i < array_size; ++i)
input_data.push_back(static_cast<float>(i));
set_values(input, input_data);
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 3, 1, 2 }));
permute("permute", "input", { 0, 2, 3, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -976,7 +971,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfyx_0_3_1_2) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfyx_0_3_1_2) {
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfyx_0_2_3_1) {
// Input : bfyx:2x5x2x8
// Permute order : { 0,3,1,2 }
@@ -995,7 +990,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfyx_0_3_1_2) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 3, 1, 2 }));
permute("permute", "input", { 0, 2, 3, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1026,7 +1021,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfyx_0_3_1_2) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfyx_0_3_1_2) {
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfyx_0_2_3_1) {
// Input : bfyx:2x8x2x5
// Permute order : { 0,3,1,2 }
@@ -1051,7 +1046,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfyx_0_3_1_2) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 3, 1, 2 }));
permute("permute", "input", { 0, 2, 3, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1082,7 +1077,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfyx_0_3_1_2) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfyx_0_3_1_2) {
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfyx_0_2_3_1) {
// Input : bfyx:2x5x2x5
// Permute order : { 0,3,1,2 }
@@ -1101,7 +1096,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfyx_0_3_1_2) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 3, 1, 2 }));
permute("permute", "input", { 0, 2, 3, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1132,7 +1127,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfyx_0_3_1_2) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfzyx_0_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfzyx_0_2_3_4_1) {
// Input : bfzyx:2x8x2x2x8
// Permute order : { 0,4,1,2,3 }
@@ -1151,7 +1146,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfzyx_0_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 4, 1, 2, 3 }));
permute("permute", "input", { 0, 2, 3, 4, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1194,7 +1189,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfzyx_0_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfzyx_0_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfzyx_0_2_3_4_1) {
// Input : bfzyx:2x5x2x2x8
// Permute order : { 0,4,1,2,3 }
@@ -1213,7 +1208,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfzyx_0_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 4, 1, 2, 3 }));
permute("permute", "input", { 0, 2, 3, 4, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1248,7 +1243,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfzyx_0_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfzyx_0_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfzyx_0_2_3_4_1) {
// Input : bfzyx:2x8x2x2x5
// Permute order : { 0,4,1,2,3 }
@@ -1267,7 +1262,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfzyx_0_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 4, 1, 2, 3 }));
permute("permute", "input", { 0, 2, 3, 4, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1302,7 +1297,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfzyx_0_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfzyx_0_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfzyx_0_2_3_4_1) {
// Input : bfzyx:2x5x2x2x5
// Permute order : { 0,4,1,2,3 }
@@ -1321,7 +1316,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfzyx_0_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 4, 1, 2, 3 }));
permute("permute", "input", { 0, 2, 3, 4, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1356,9 +1351,9 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfzyx_0_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfwzyx_0_5_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfwzyx_0_2_3_4_5_1) {
// Input : bfwzyx:2x8x2x2x2x8
// Permute order : { 0,5,1,2,3,4 }
// Permute order : { 0,2,3,4,5,1 }
constexpr size_t array_size = 1024;
@@ -1375,7 +1370,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfwzyx_0_5_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 5, 1, 2, 3, 4 }));
permute("permute", "input", { 0, 2, 3, 4, 5, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1428,9 +1423,9 @@ TEST(permute_gpu_f32_tile_8x8_4x4, normal_bfwzyx_0_5_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfwzyx_0_5_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfwzyx_0_2_3_4_5_1) {
// Input : bfwzyx:2x5x2x2x2x8
// Permute order : { 0,5,1,2,3,4 }
// Permute order : { 0,2,3,4,5,1 }
constexpr size_t array_size = 640;
@@ -1447,7 +1442,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfwzyx_0_5_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 5, 1, 2, 3, 4 }));
permute("permute", "input", { 0, 2, 3, 4, 5, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1488,9 +1483,9 @@ TEST(permute_gpu_f32_tile_8x8_4x4, f_remainder_bfwzyx_0_5_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfwzyx_0_5_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfwzyx_0_2_3_4_5_1) {
// Input : bfwzyx:2x8x2x2x2x5
// Permute order : { 0,5,1,2,3,4 }
// Permute order : { 0,2,3,4,5,1 }
constexpr size_t array_size = 640;
@@ -1507,7 +1502,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfwzyx_0_5_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 5, 1, 2, 3, 4 }));
permute("permute", "input", { 0, 2, 3, 4, 5, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1548,9 +1543,9 @@ TEST(permute_gpu_f32_tile_8x8_4x4, x_remainder_bfwzyx_0_5_4_1_2_3) {
}
}
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfwzyx_0_5_4_1_2_3) {
TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfwzyx_0_2_3_4_5_1) {
// Input : bfwzyx:2x5x2x2x2x5
// Permute order : { 0,5,1,2,3,4 }
// Permute order : { 0,2,3,4,5,1 }
constexpr size_t array_size = 400;
@@ -1567,7 +1562,7 @@ TEST(permute_gpu_f32_tile_8x8_4x4, xf_remainder_bfwzyx_0_5_4_1_2_3) {
topology topology(
input_layout("input", input->get_layout()),
permute("permute", "input", { 0, 5, 1, 2, 3, 4 }));
permute("permute", "input", { 0, 2, 3, 4, 5, 1 }));
network network(engine, topology);
network.set_input_data("input", input);
@@ -1670,12 +1665,13 @@ void TiledPermuteTest::run_test(const std::vector<cldnn::tensor::value_type>& si
std::swap(internal_sizes.at(2), internal_sizes.back());
cldnn::tensor tensor(internal_sizes);
cldnn::format format = sizes.size() == 4?cldnn::format::bfyx:cldnn::format::bfzyx;
cldnn::format format = sizes.size() == 4 ? cldnn::format::bfyx : cldnn::format::bfzyx;
std::vector<uint16_t> order{0, static_cast<uint16_t>(sizes.size()-1)};
for (uint16_t i = 1; i<(sizes.size()-1); ++i) {
order.push_back(i);
std::vector<uint16_t> order = {0};
for (uint16_t i = 1; i < (sizes.size() - 1); ++i) {
order.push_back(i+1);
}
order.push_back(1);
auto input = engine.allocate_memory({Data_Type, format, tensor});
set_random_values<type>(input);

View File

@@ -21,8 +21,8 @@ TEST(test_device_mem_usage_estimation, basic) {
topology topology(
input_layout("input1", input1->get_layout()),
input_layout("input2", input2->get_layout()),
permute("permute1", "input1", { 0, 2, 3, 1 }),
permute("permute2", "input2", { 0, 3, 2, 1 }),
permute("permute1", "input1", { 0, 3, 1, 2 }),
permute("permute2", "input2", { 0, 2, 1, 3 }),
eltwise("eltw", {"permute1", "permute2"}, eltwise_mode::sum, data_types::f16),
reorder("output", "eltw", format::bfyx, data_types::f32)
);