[GPU] Permute f and y axes (#13561)

* [GPU] Permute f and y axes

It supports cases where the y and f (and x, if it is not equal to 1) axis sizes are divisible by 4, 8 or 16.
added kernel to switch f and y axes in 4d model blocked and planar formats
added tests

* Added subgroup read/write to THREE_DIM_TRANSPOSE kernel case.

* Better checking of whether the SIMD size is supported.

* Added support for long type to subgroup read/write.

* Added subgroup read/write support to 2d permute.

* Fixed win build issue.

* Changed f and y indexes in iteration.

* Added vector read/write.

* Fixed j_times calculation.

* Better naming.

* Rollback test logic.

* Fixed fusion logic.

* Accept only supported blocked layouts and SIMD sizes.

---------

Co-authored-by: Mykhailo Hnap <mykhailo.hnap@capgemini.com>
Co-authored-by: Wilson Seok <wilson.seok@intel.com>
This commit is contained in:
OlehKravchyshyn 2023-07-18 21:48:25 +03:00 committed by GitHub
parent 568d3371d7
commit f0abd468a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 706 additions and 2 deletions

View File

@ -52,6 +52,8 @@
#define TYPE_SIZE_int 4
#define TYPE_SIZE_uint 4
#define TYPE_SIZE_float 4
#define TYPE_SIZE_ulong 8
#define TYPE_SIZE_long 8
#define TYPE_SIZE(type) CAT(TYPE_SIZE_, type)
#ifdef cl_intel_required_subgroup_size

View File

@ -34,17 +34,20 @@
#define BLOCK_READ_TYPE_size1 uchar
#define BLOCK_READ_TYPE_size2 ushort
#define BLOCK_READ_TYPE_size4 uint
#define BLOCK_READ_TYPE_size8 ulong
#define BLOCK_READ_TYPE(type_size) CAT(BLOCK_READ_TYPE_size, type_size)
#define BLOCK_READ_FUNC_size1 _sub_group_block_read_uc
#define BLOCK_READ_FUNC_size2 _sub_group_block_read_us
#define BLOCK_READ_FUNC_size4 _sub_group_block_read
#define BLOCK_READ_FUNC_size8 _sub_group_block_read_ul
#define BLOCK_READ_FUNC(type_size) CAT(BLOCK_READ_FUNC_size, type_size)
#define BLOCK_READN_FUNC_SIZE_DEF(type_size, vector_size) MAKE_VECTOR_TYPE(BLOCK_READ_FUNC(type_size), vector_size)
#define BLOCK_READN_FUNC_size1(vector_size) BLOCK_READN_FUNC_SIZE_DEF(1, vector_size)
#define BLOCK_READN_FUNC_size2(vector_size) BLOCK_READN_FUNC_SIZE_DEF(2, vector_size)
#define BLOCK_READN_FUNC_size4(vector_size) BLOCK_READN_FUNC_SIZE_DEF(4, vector_size)
#define BLOCK_READN_FUNC_size8(vector_size) BLOCK_READN_FUNC_SIZE_DEF(8, vector_size)
#define BLOCK_READN_FUNC(type_size, vector_size) CAT(BLOCK_READN_FUNC_size, type_size)(vector_size)
#define BLOCK_READN_RAW(type_size, vector_size, addr_space, ptr, offset) \
@ -151,3 +154,15 @@
DECLARE_BLOCK_READ_EMULATION(1, 8)
DECLARE_BLOCK_READ_EMULATION(1, 16)
#endif
#if defined(cl_intel_subgroups_long)
#define _sub_group_block_read_ul(ptr) intel_sub_group_block_read_ul(ptr)
#define _sub_group_block_read_ul2(ptr) intel_sub_group_block_read_ul2(ptr)
#define _sub_group_block_read_ul4(ptr) intel_sub_group_block_read_ul4(ptr)
#define _sub_group_block_read_ul8(ptr) intel_sub_group_block_read_ul8(ptr)
#elif (__OPENCL_C_VERSION__ >= 200)
DECLARE_BLOCK_READ_EMULATION(8, 1)
DECLARE_BLOCK_READ_EMULATION(8, 2)
DECLARE_BLOCK_READ_EMULATION(8, 4)
DECLARE_BLOCK_READ_EMULATION(8, 8)
#endif

View File

@ -32,17 +32,20 @@
#define BLOCK_WRITE_TYPE_size1 uchar
#define BLOCK_WRITE_TYPE_size2 ushort
#define BLOCK_WRITE_TYPE_size4 uint
#define BLOCK_WRITE_TYPE_size8 ulong
#define BLOCK_WRITE_TYPE(type_size) CAT(BLOCK_WRITE_TYPE_size, type_size)
#define BLOCK_WRITE_FUNC_size1 _sub_group_block_write_uc
#define BLOCK_WRITE_FUNC_size2 _sub_group_block_write_us
#define BLOCK_WRITE_FUNC_size4 _sub_group_block_write
#define BLOCK_WRITE_FUNC_size8 _sub_group_block_write_ul
#define BLOCK_WRITE_FUNC(type_size) CAT(BLOCK_WRITE_FUNC_size, type_size)
#define BLOCK_WRITEN_FUNC_SIZE_DEF(type_size, vector_size) MAKE_VECTOR_TYPE(BLOCK_WRITE_FUNC(type_size), vector_size)
#define BLOCK_WRITEN_FUNC_size1(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(1, vector_size)
#define BLOCK_WRITEN_FUNC_size2(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(2, vector_size)
#define BLOCK_WRITEN_FUNC_size4(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(4, vector_size)
#define BLOCK_WRITEN_FUNC_size8(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(8, vector_size)
#define BLOCK_WRITEN_FUNC(type_size, vector_size) CAT(BLOCK_WRITEN_FUNC_size, type_size)(vector_size)
#define BLOCK_WRITEN_RAW(type_size, vector_size, addr_space, ptr, offset, val) \
@ -133,3 +136,15 @@
DECLARE_BLOCK_WRITE_EMULATION(1, 8)
DECLARE_BLOCK_WRITE_EMULATION(1, 16)
#endif
#if defined(cl_intel_subgroups_long)
#define _sub_group_block_write_ul(ptr, v) intel_sub_group_block_write_ul(ptr, v)
#define _sub_group_block_write_ul2(ptr, v) intel_sub_group_block_write_ul2(ptr, v)
#define _sub_group_block_write_ul4(ptr, v) intel_sub_group_block_write_ul4(ptr, v)
#define _sub_group_block_write_ul8(ptr, v) intel_sub_group_block_write_ul8(ptr, v)
#elif (__OPENCL_C_VERSION__ >= 200)
DECLARE_BLOCK_WRITE_EMULATION(8, 1)
DECLARE_BLOCK_WRITE_EMULATION(8, 2)
DECLARE_BLOCK_WRITE_EMULATION(8, 4)
DECLARE_BLOCK_WRITE_EMULATION(8, 8)
#endif

View File

@ -0,0 +1,182 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// permute_f_y_axes: swaps the feature (f) and y axes of a 4D tensor
// (b, f, y, x -> b, y, f, x). Exactly one of three variants is compiled,
// selected by host-side JIT macros (see PermuteKernel_f_y_axes::GetJitConstants):
//   * PERMUTE_SIMPLE_MEM_COPY - planar bfyx input with X > 1: x stays the
//     innermost axis on both sides, so each work-item vector-copies runs of
//     x elements to the f/y-swapped output offset.
//   * THREE_DIM_TRANSPOSE     - blocked layout with X > 1: tiles are staged
//     through local memory using subgroup block reads/writes.
//   * (default)               - 2D f<->y transpose through a
//     TILE_SIZE x TILE_SIZE local-memory tile (used when X == 1).
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
// vloadN/vstoreN helpers with N == VEC_SIZE.
#define READ_VEC(offset, ptr) CAT(vload, VEC_SIZE)(offset, ptr)
#define WRITE_VEC(val, offset, ptr) CAT(vstore, VEC_SIZE)(val, offset, ptr)
// Vector types (width VEC_SIZE) for input, accumulator and output elements.
#define IN_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE)
#define TO_IN_VEC_TYPE(x) CAT(convert_, IN_VEC_TYPE)(x)
#define ACC_VEC_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, VEC_SIZE)
#define TO_ACC_VEC_TYPE(x) CAT(convert_, ACC_VEC_TYPE)(x)
#define OUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)
#define TO_OUT_VEC_TYPE(x) CAT(convert_, OUT_VEC_TYPE)(x)
#if defined (PERMUTE_SIMPLE_MEM_COPY)
// Planar bfyx, X > 1: copy BLOCK_SIZE x-elements per work-item in VEC_SIZE
// chunks, exchanging only the f and y coordinates in the output index.
KERNEL (permute_f_y_axes)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
// gws = (x / BLOCK_SIZE, y, b * f); see GetGWS() on the host side.
const int bf = get_global_id(2);
const int f_idx = bf % INPUT0_FEATURE_NUM;
const int b_idx = bf / INPUT0_FEATURE_NUM;
const int x_start = get_global_id(0) * BLOCK_SIZE;
const int y_idx = get_global_id(1);
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int x_idx = x_start + j * VEC_SIZE;
IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
#if HAS_FUSED_OPS
FUSED_OPS_VEC;
OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
#else
OUT_VEC_TYPE result = ACTIVATION(res, ACTIVATION_PARAMS);
#endif
// f and y are swapped relative to the input index.
const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f_idx, x_idx);
WRITE_VEC(result, 0, &output[output_idx]);
}
}
#elif defined (THREE_DIM_TRANSPOSE)
// Blocked layout, X > 1: one work-group (a single subgroup of SUB_GROUP_SIZE
// work-items along dim 2) transposes a
// FEATURE_BLOCK_SIZE x FEATURE_BLOCK_SIZE x TILE_SIZE (f, y, x) tile
// through local memory.
#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
#endif
KERNEL (permute_f_y_axes)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
// First two indices are the f-tile/y-tile coordinates; they are exchanged
// between the load phase and the store phase below.
__local OUTPUT_TYPE transpose_buf[FEATURE_BLOCK_SIZE][FEATURE_BLOCK_SIZE][TILE_SIZE];
const int bf = get_global_id(2);
const int f_idx = bf % INPUT0_FEATURE_NUM;
const int b_idx = bf / INPUT0_FEATURE_NUM;
int bf_local = get_local_id(2);
// NOTE(review): y_local is unused in this kernel.
int y_local = get_local_id(1);
const int x_begin = get_global_id(0) * TILE_SIZE;
const int y_begin = get_global_id(1) * FEATURE_BLOCK_SIZE;
const int f_begin = get_local_size(2) * get_group_id(2);
// Phase 1: block-read the input tile into local memory. The per-lane index
// is rebased by get_sub_group_local_id() because the subgroup block read
// adds the lane offset itself.
__attribute__((opencl_unroll_hint(FEATURE_BLOCK_SIZE)))
for (int j = 0; j < FEATURE_BLOCK_SIZE; ++j) {
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int i = 0; i < TILE_SIZE; ++i) {
const int x_idx = x_begin + i;
const int y_idx = y_begin + j;
const uint input_offset = INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx) - get_sub_group_local_id();
INPUT0_TYPE res = DT_INPUT_BLOCK_READ(input, input_offset);
#if HAS_FUSED_OPS
FUSED_OPS;
transpose_buf[bf_local][j][i] = FUSED_OPS_RESULT;
#else
transpose_buf[bf_local][j][i] = ACTIVATION(res, ACTIVATION_PARAMS);
#endif
}
}
// Phase 2: block-write the tile back with f and y exchanged.
// NOTE(review): no barrier between the two phases; this appears to rely on
// the work-group being a single subgroup in lockstep -- confirm whether a
// sub_group_barrier(CLK_LOCAL_MEM_FENCE) is required.
__attribute__((opencl_unroll_hint(FEATURE_BLOCK_SIZE)))
for (int j = 0; j < FEATURE_BLOCK_SIZE; ++j) {
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int i = 0; i < TILE_SIZE; ++i) {
const int x_idx = x_begin + i;
const int f = f_begin + j;
const int y_idx = y_begin + bf_local;
const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local][i]);
}
}
}
#else
// Default variant (X == 1): 2D f<->y transpose through a TILE_SIZE x
// TILE_SIZE local tile. The +1 column is presumably padding to avoid
// local-memory bank conflicts -- TODO confirm.
#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
#endif
KERNEL (permute_f_y_axes)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
__local OUTPUT_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1];
const int bf = get_global_id(2);
const int b_idx = bf / INPUT0_FEATURE_NUM;
const int f_idx = bf % INPUT0_FEATURE_NUM;
const int bf_local = get_local_id(2);
const int x_idx = get_global_id(0);
const int y_begin = get_global_id(1) * TILE_SIZE;
const int f_begin = get_local_size(2) * get_group_id(2);
#if INPUT0_SIMPLE == 1
// Planar input: vector loads along y, scattered stores into the tile.
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int j_vec = j * VEC_SIZE;
const int y_idx = y_begin + j_vec;
IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
#if HAS_FUSED_OPS
FUSED_OPS_VEC;
OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
for (int k = 0; k < VEC_SIZE; ++k) {
transpose_buf[j_vec + k][bf_local] = result[k];
}
#else
for (int k = 0; k < VEC_SIZE; ++k) {
transpose_buf[j_vec + k][bf_local] = ACTIVATION(res[k], ACTIVATION_PARAMS);
}
#endif
}
// Write rows of the transposed tile back as vectors.
// NOTE(review): as in THREE_DIM_TRANSPOSE, no barrier separates the phases.
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int j_vec = j * VEC_SIZE;
const int f = f_begin + j_vec;
const int y_idx = y_begin + bf_local;
const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx);
WRITE_VEC(READ_VEC(0, &transpose_buf[bf_local][j_vec]), 0, &output[output_idx]);
}
#else
// Blocked input: subgroup block reads/writes, rebased to lane 0 as above.
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int j = 0; j < TILE_SIZE; ++j) {
const int y_idx = y_begin + j;
const uint input_offset = INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx) - get_sub_group_local_id();
INPUT0_TYPE res = DT_INPUT_BLOCK_READ(input, input_offset);
#if HAS_FUSED_OPS
FUSED_OPS;
transpose_buf[bf_local][j] = FUSED_OPS_RESULT;
#else
transpose_buf[bf_local][j] = ACTIVATION(res, ACTIVATION_PARAMS);
#endif
}
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int j = 0; j < TILE_SIZE; ++j) {
const int f = f_begin + j;
const int y_idx = y_begin + bf_local;
const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local]);
}
#endif
}
#endif // #if defined (PERMUTE_SIMPLE_MEM_COPY)

View File

@ -0,0 +1,246 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "permute_kernel_f_y_axes.h"
#include <algorithm>
#include <cmath>
#include <string>
#include "common_tools.h"
#include "kernel_selector_utils.h"
namespace kernel_selector {
namespace {
constexpr size_t cSimpleMemCopyOpDivider = 4UL;
constexpr size_t c3DTransposeBufHeight = 4UL;
// Returns the largest supported divisor of input_size, chosen from
// {16, 8, 4}; falls back to 1 (meaning "no supported tile size") otherwise.
size_t GetDivisor(const size_t input_size) {
    const size_t candidates[] = {16, 8, 4};
    for (const auto candidate : candidates) {
        if (input_size % candidate == 0) {
            return candidate;
        }
    }
    return 1;
}
// True for planar bfyx input with X > 1: the f<->y swap then reduces to
// copying whole x-rows (PERMUTE_SIMPLE_MEM_COPY kernel variant).
bool IsSimpleMemCopyOperation(const permute_params& params) {
return params.inputs[0].X().v > 1 && params.inputs[0].GetLayout() == DataLayout::bfyx;
}
// True for a blocked (non-bfyx) layout with X > 1; handled by the
// THREE_DIM_TRANSPOSE kernel variant.
bool Is3DTranspose(const permute_params& params) {
return params.inputs[0].X().v > 1 && params.inputs[0].GetLayout() != DataLayout::bfyx;
}
// Largest supported divisor (16/8/4/1) common to both the feature and y sizes.
size_t GetFeatureBlockSize(const permute_params& params) {
return std::min(GetDivisor(params.inputs[0].Feature().v), GetDivisor(params.inputs[0].Y().v));
}
// Tile height: the smallest supported divisor over all participating axes
// (f and y always; x as well when x > 1), additionally capped at
// c3DTransposeBufHeight for the 3D-transpose variant.
size_t GetTileHeight(const permute_params& params) {
    const auto& input = params.inputs[0];
    size_t min_divisor = std::min(GetDivisor(input.Feature().v), GetDivisor(input.Y().v));
    if (input.X().v != 1) {
        min_divisor = std::min(min_divisor, GetDivisor(input.X().v));
    }
    return Is3DTranspose(params) ? std::min(min_divisor, c3DTransposeBufHeight) : min_divisor;
}
// Tile width in elements for one work-item, scaled by element size so that
// tiles cover a comparable number of bytes across data types.
size_t GetTileWidth(const permute_params& params) {
const Datatype input_type = params.inputs[0].GetDType();
const Datatype output_type = params.outputs[0].GetDType();
size_t min_divisor = GetTileHeight(params);
if (IsSimpleMemCopyOperation(params)) {
min_divisor = std::min(min_divisor, cSimpleMemCopyOpDivider);
}
// i64 only supports tile size 4
// NOTE(review): the halving below applies to whatever divisor was computed
// above rather than clamping to a fixed 4 -- confirm that matches the comment.
if ((input_type == Datatype::INT64) || (output_type == Datatype::INT64)) {
min_divisor = min_divisor / 2;
}
// Wider tiles for narrower elements: x2 for fp16, x4 for 8-bit types.
if (input_type == Datatype::F16) {
min_divisor = min_divisor * 2;
}
if (input_type == Datatype::INT8 || input_type == Datatype::UINT8) {
min_divisor = min_divisor * 4;
}
// When X == 1 the tile's "width" axis is y instead of x.
if (params.inputs[0].X().v == 1) {
return std::min(params.inputs[0].Y().v, min_divisor);
}
return std::min(params.inputs[0].X().v, min_divisor);
}
// Final tile edge length: the smaller of tile height and tile width.
size_t GetTileSize(const permute_params& params) {
return std::min(GetTileHeight(params), GetTileWidth(params));
}
} // namespace
// Declares the data types, layouts and tensor features this kernel can handle.
// NOTE(review): b_fs_yx_fsv4 is enabled here, but Validate() only accepts
// fsv16/fsv32 among blocked layouts -- confirm whether fsv4 should be listed.
ParamsKey PermuteKernel_f_y_axes::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableDifferentTypes();
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
return k;
}
// Emits the JIT macros consumed by permute_f_y_axes.cl: the variant selector
// (PERMUTE_SIMPLE_MEM_COPY / THREE_DIM_TRANSPOSE / neither), BLOCK_SIZE,
// VEC_SIZE (vload/vstore width), J_TIMES (kernel loop trip count), TILE_SIZE,
// FEATURE_BLOCK_SIZE and, for blocked layouts, SUB_GROUP_SIZE.
JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& params,
const CommonDispatchData& dispatchData) const {
auto jit = Parent::GetJitConstants(params, dispatchData);
// Variant selection (matches the #if branches in the kernel source).
if (params.inputs[0].X().v != 1) {
if (IsSimpleMemCopyOperation(params)) {
jit.AddConstant(MakeJitConstant("PERMUTE_SIMPLE_MEM_COPY", ""));
}
if (Is3DTranspose(params)) {
jit.AddConstant(MakeJitConstant("THREE_DIM_TRANSPOSE", ""));
}
}
const size_t tile_width = GetTileWidth(params);
// Vector width is capped at 4 elements.
const size_t vector_size = std::min(tile_width, static_cast<size_t>(4));
const size_t tile_size = GetTileSize(params);
const size_t j_times = tile_size / vector_size;
const size_t feature_block_size = GetFeatureBlockSize(params);
jit.AddConstant(MakeJitConstant("BLOCK_SIZE", tile_width));
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
jit.AddConstant(MakeJitConstant("J_TIMES", j_times));
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
jit.AddConstant(MakeJitConstant("FEATURE_BLOCK_SIZE", feature_block_size));
// Blocked layouts use subgroup block reads/writes and require an explicit
// subgroup size, matching the reqd_work_group_size in the kernel.
const auto layout = params.inputs.front().GetLayout();
if (!SimpleLayout(layout)) {
const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
}
if (!params.fused_ops.empty()) {
// Fused ops index with the input (bfyx) coordinate order; both a scalar
// and a VEC_SIZE-wide configuration are generated.
const std::vector<std::string> original_output_order = {"b_idx", "f_idx", "y_idx", "x_idx"};
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
const FusedOpsConfiguration conf_vec = {"_VEC", original_output_order, "res", params.inputs[0].GetDType(), vector_size};
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar, conf_vec}));
}
return jit;
}
// Global work sizes: dim0 walks x, dim1 walks y, dim2 walks b*f. Each tiled
// dimension is divided by its block size (Validate() guarantees divisibility).
static inline std::vector<size_t> GetGWS(const permute_params& params) {
const auto& in = params.inputs[0];
std::vector<size_t> gws;
auto block_size = IsSimpleMemCopyOperation(params) ? GetTileWidth(params) : GetTileSize(params);
if (params.inputs[0].X().v == 1) {
// 2D f<->y transpose: tile along y.
gws = {in.X().v, in.Y().v / block_size, (in.Batch().v * in.Feature().v)};
} else {
if (Is3DTranspose(params)) {
gws = {in.X().v / block_size, in.Y().v / GetFeatureBlockSize(params), (in.Batch().v * in.Feature().v)};
} else {
gws = {in.X().v / block_size, in.Y().v, (in.Batch().v * in.Feature().v)};
}
}
return gws;
}
// Dispatch setup: GWS from GetGWS(); LWS is optimizer-chosen for the plain
// memcpy variant, otherwise a single subgroup along the b*f dimension to
// match the kernel's reqd_work_group_size(1, 1, SUB_GROUP_SIZE).
CommonDispatchData PermuteKernel_f_y_axes::SetDefault(const permute_params& params) const {
CommonDispatchData dispatchData;
dispatchData.gws = GetGWS(params);
if (IsSimpleMemCopyOperation(params)) {
auto in_layout = params.inputs[0].GetLayout();
auto out_layout = params.outputs[0].GetLayout();
// Maps each gws dimension to tensor channels for the LWS optimizer.
const std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {
{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
dispatchData.lws =
GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
} else if (Is3DTranspose(params)) {
dispatchData.lws = {1, 1, GetFeatureBlockSize(params)};
} else {
dispatchData.lws = {1, 1, GetTileSize(params)};
}
return dispatchData;
}
// Accepts only an exact f<->y swap whose axis sizes admit a supported tile
// size, and (for blocked layouts) only fsv16/fsv32 with a SIMD size the
// device supports.
bool PermuteKernel_f_y_axes::Validate(const Params& p, const optional_params& o) const {
if (!Parent::Validate(p, o)) {
return false;
}
const auto is_swapping_f_with_y = [](const std::vector<uint16_t>& order) {
// Target transform: Swap feature with y
// IE order: 0 2 1 3 => bfyx -> byfx
// cldnn order: 0 3 2 1 => bfxy -> byxf
if (order.size() != 4) {
return false;
}
if (order[0] != 0 || order[1] != 3 || order[2] != 2 || order[3] != 1) {
return false;
}
return true;
};
const auto& params = dynamic_cast<const permute_params&>(p);
const auto& in = params.inputs[0];
const auto in_layout = in.GetLayout();
// f and y must each be divisible by 4, 8 or 16 ...
const auto feature_div = GetDivisor(in.Feature().v);
const auto y_div = GetDivisor(in.Y().v);
if (feature_div == 1 || y_div == 1) {
return false;
}
// ... and so must x whenever it participates (x > 1).
if (in.X().v > 1 && GetDivisor(in.X().v) == 1) {
return false;
}
if (!is_swapping_f_with_y(params.order)) {
return false;
}
// Accept only supported blocked layouts and SIMD sizes.
if (!SimpleLayout(in_layout)) {
const auto feature_block_size = GetFeatureBlockSize(params);
const auto tile_size = GetTileSize(params);
const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
if (!(IsSIMDSizeSupported(params.engineInfo, subgroup_size) &&
(in_layout == DataLayout::b_fs_yx_fsv32 || in_layout == DataLayout::b_fs_yx_fsv16))) {
return false;
}
}
return true;
}
// Fixed priority: preferred over the generic permute implementations whenever
// Validate() accepts the case.
KernelsPriority PermuteKernel_f_y_axes::GetKernelsPriority(const Params& /*params*/,
const optional_params& /*options*/) const {
return FORCE_PRIORITY_3;
}
} // namespace kernel_selector

View File

@ -0,0 +1,29 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "permute_kernel_base.h"
namespace kernel_selector {
// GPU permute kernel specialized for swapping the feature and y axes of a 4D
// tensor (bfyx -> byfx). Tiling heuristics and dispatch setup live in
// permute_kernel_f_y_axes.cpp; device code in permute_f_y_axes.cl.
class PermuteKernel_f_y_axes : public PermuteKernelBase {
public:
using Parent = PermuteKernelBase;
using Parent::Parent;
PermuteKernel_f_y_axes() : PermuteKernelBase("permute_f_y_axes") {}
~PermuteKernel_f_y_axes() override = default;
// Accepts only an exact f<->y swap with tile-friendly axis sizes.
bool Validate(const Params& p, const optional_params& o) const override;
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
JitConstants GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const override;
CommonDispatchData SetDefault(const permute_params& params) const override;
// Ops that may be fused into this kernel.
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return {FusedOpType::ACTIVATION, FusedOpType::QUANTIZE, FusedOpType::ELTWISE};
}
};
} // namespace kernel_selector

View File

@ -7,6 +7,7 @@
#include "permute_kernel_tile_8x8_4x4.h"
#include "permute_kernel_tile_8x8_4x4_fsv.h"
#include "permute_kernel_bfzyx_to_bfyxz.h"
#include "permute_kernel_f_y_axes.h"
namespace kernel_selector {
@ -15,6 +16,7 @@ permute_kernel_selector::permute_kernel_selector() {
Attach<PermuteKernel_tile_8x8_4x4>();
Attach<PermuteKernel_tile_8x8_4x4_fsv>();
Attach<PermuteKernel_bfzyx_to_bfyxz>();
Attach<PermuteKernel_f_y_axes>();
}
KernelsData permute_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {

View File

@ -170,6 +170,9 @@ public:
// permute_bfzyx_to_bfyxz
#define CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0 { 1, 8, 8, 2, 2 }, { 1, 8, 2, 8, 2 }, { 0, 1, 3, 4, 2 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
// permute_f_y_axes
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_0 { 1, 8, 4, 2 }, { 1, 2, 4, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
class permute_activation_scale_eltwise: public PermuteFusingTest {};
TEST_P(permute_activation_scale_eltwise, basic) {
auto p = GetParam();
@ -265,6 +268,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_activation_scale_eltwise, ::testin
// Fusing tests for permute_bfzyx_to_bfyxz
permute_params{ CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0, 2, 5 },
// Fusing tests for permute_f_y_axes
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_0, 2, 5 },
}));
class permute_quant_u8: public PermuteFusingTest {};
@ -470,6 +476,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv, ::t
// Fusing tests for permute_bfzyx_to_bfyxz
permute_params{ CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0, 2, 7 },
// Fusing tests for permute_f_y_axes
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_0, 2, 7 },
}));
/* ------------------------------------------------------------------------------------------------------------ */

View File

@ -24,6 +24,23 @@ using namespace cldnn;
using namespace ::tests;
using namespace testing;
namespace {
// TODO: Move somewhere
// Formats a vector as "(a.b.c)": elements joined with '.' inside parentheses;
// an empty vector yields "()". Used to build readable gtest parameter names.
template <class vecElementType>
std::string vec2str(const std::vector<vecElementType>& vec) {
    std::ostringstream out;
    out << "(";
    for (size_t i = 0; i < vec.size(); ++i) {
        if (i != 0) {
            out << ".";
        }
        out << vec[i];
    }
    out << ")";
    return out.str();
}
} // namespace
TEST(permute_gpu_f32, output_ordering_test)
{
auto& engine = get_test_engine();
@ -1871,6 +1888,13 @@ public:
void run_test(const std::vector<cldnn::tensor::value_type>& sizes, cldnn::format format_fsv,
const std::string & permute_opt = "permute_tile_8x8_4x4_fsv",
std::vector<uint16_t> permute_order = {}, bool is_caching_test = false);
// Builds a readable gtest parameter name from the test parameters, e.g.
// "InputShape=(1.32.256.512)_Format=b_fs_yx_fsv32".
static std::string PrintToStringParamName(const testing::TestParamInfo<TiledPermuteParam>& info) {
std::ostringstream result;
result << "InputShape=" << vec2str(info.param.sizes) << "_";
result << "Format=" << fmt_to_str(info.param.format_fsv);
return result.str();
}
};
template<>
@ -2220,3 +2244,180 @@ TEST_P(permute_tile_fsv_5d, i64_cached) {
auto p = GetParam();
run_test<cldnn::data_types::i64>(p.sizes, p.format_fsv, "permute_tile_8x8_4x4_fsv", {}, true);
}
// Functional coverage for the permute_f_y_axes kernel; each shape/format entry
// targets one kernel variant (noted in the trailing comments).
class permute_f_y_axes_tile: public TiledPermuteTest {};
INSTANTIATE_TEST_SUITE_P(smoke_permute_f_y_axes_tile,
permute_f_y_axes_tile,
::testing::ValuesIn(std::vector<TiledPermuteParam>{
{{1, 4, 8, 1}, format::bfyx}, // permute_f_y_axes
{{1, 64, 32, 1}, format::bfyx}, // permute_f_y_axes
{{1, 32, 256, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE
{{1, 32, 256, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY
{{1, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes
{{1, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE
}),
TiledPermuteTest::PrintToStringParamName);
// Order {0, 2, 1, 3} swaps f and y (bfyx -> byfx) for every supported dtype.
TEST_P(permute_f_y_axes_tile, combined) {
auto p = GetParam();
run_test<cldnn::data_types::f32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::f16>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::u8>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::i8>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::i32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
run_test<cldnn::data_types::i64>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
}
// Manual performance-comparison harness: builds two identical reorder+permute
// networks, forces permute_ref on one and the kernel under test on the other,
// then prints averaged device execution times for both.
struct TiledPerformancePermuteTest : TiledPermuteTest
{
// Returns the device "executing" time (microseconds) of primitive_id from
// the last network run.
static double get_exectime(const std::map<cldnn::primitive_id, cldnn::network_output>& outputs,
const std::string& primitive_id)
{
using namespace std::chrono;
std::shared_ptr<event> e = outputs.at(primitive_id).get_event();
e->wait(); // should ensure execution completion, if not segfault will occur
double avg_time = 0.0;
auto intervals = e->get_profiling_info();
for (const auto& q : intervals)
{
// Only the "executing" profiling stage is of interest.
if (q.stage != instrumentation::profiling_stage::executing) {
continue;
}
avg_time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
break;
}
return avg_time;
}
// Debug helper: dumps one profiling interval per primitive of the last run.
static void print_all_perf(std::map<primitive_id, network_output> outputs)
{
std::cout << "Print last run time" << std::endl;
using namespace std::chrono;
for( const auto &n : outputs ) {
std::shared_ptr<event> e = n.second.get_event();
auto intervals = e->get_profiling_info();
double time = 0.0;
for (const auto& q : intervals)
{
// NOTE(review): this condition is inverted relative to get_exectime --
// it skips the "executing" stage and reports the first other interval.
// Confirm whether `!=` was intended here.
if (q.stage == instrumentation::profiling_stage::executing) {
continue;
}
time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
break;
}
std::cout << n.first << ":" << time << std::endl;
}
std::cout << std::endl;
}
// Runs 10 warm-up iterations on both networks, then averages the "output"
// primitive's execution time over 100 measured runs each and prints both.
template<data_types Data_Type>
void execute_perf_test(const std::vector<cldnn::tensor::value_type>& sizes, cldnn::format format_fsv,
const std::string & kernel_name, std::vector<uint16_t> permute_order)
{
auto& engine = get_test_engine();
// convert half_t to FLOAT16
using type_ = typename data_type_to_type<Data_Type>::type;
using type = typename std::conditional<std::is_same<type_, half_t>::value, FLOAT16, type_>::type;
// Swap the last two spatial sizes -- presumably to match cldnn::tensor's
// internal dimension order; TODO confirm against TiledPermuteTest::run_test.
std::vector<cldnn::tensor::value_type> internal_sizes(sizes);
std::swap(internal_sizes.at(2), internal_sizes.back());
cldnn::tensor tensor(internal_sizes);
cldnn::format format = sizes.size() == 4 ? cldnn::format::bfyx : cldnn::format::bfzyx;
// Default order (when none is given) rotates the feature axis to the back.
std::vector<uint16_t> order = {0};
if (permute_order.empty()) {
for (uint16_t i = 1; i < (sizes.size() - 1); ++i) {
order.push_back(i+1);
}
order.push_back(1);
} else {
std::swap(order, permute_order);
}
auto input_ref = engine.allocate_memory({Data_Type, format, tensor});
set_random_values<type>(input_ref);
topology topology_ref = topology(
input_layout("input", input_ref->get_layout()),
reorder("reorder", input_info("input"), {Data_Type, format_fsv, tensor}),
permute("output", input_info("reorder"), order)
);
// run with permute_ref
ExecutionConfig config_ref(ov::enable_profiling(true));
ov::intel_gpu::ImplementationDesc permute_ref = {format_fsv, "permute_ref"};
config_ref.set_property(
ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"output", permute_ref}}));
cldnn::network network_ref(engine, topology_ref, config_ref);
network_ref.set_input_data("input", input_ref);
// run with optimized kernel, e.g. permute_tile_8x8_4x4_fsv16
auto input_opt = engine.allocate_memory({Data_Type, format, tensor});
set_random_values<type>(input_opt);
topology topology_opt = topology(
input_layout("input", input_opt->get_layout()),
reorder("reorder", input_info("input"), {Data_Type, format_fsv, tensor}),
permute("output", input_info("reorder"), order)
);
ExecutionConfig config_tile(ov::enable_profiling(true));
ov::intel_gpu::ImplementationDesc permute_tile_opt = {format_fsv, kernel_name};
config_tile.set_property(
ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"output", permute_tile_opt}}));
cldnn::network network_tile(engine, topology_opt, config_tile);
network_tile.set_input_data("input", input_opt);
// first execution of opt
std::map<primitive_id, network_output> output_permute_ref;
std::map<primitive_id, network_output> output_permute_opt;
for (int i = 0; i < 10; ++i) {
output_permute_ref = network_ref.execute();
output_permute_opt = network_tile.execute();
}
auto r = 100;
double exectime_ref = 0.f;
double exectime_opt = 0.f;
for (int i = 0; i < r; ++i) {
output_permute_opt = network_tile.execute();
auto t_opt = get_exectime(output_permute_opt, "output");
exectime_opt += t_opt;
output_permute_ref = network_ref.execute();
auto t_ref = get_exectime(output_permute_ref, "output");
exectime_ref += t_ref;
}
exectime_ref /= r;
exectime_opt /= r;
std::cout << std::endl;
auto output_layout_ref = network_ref.get_program()->get_node("output").get_output_layout();
auto output_layout_opt = network_tile.get_program()->get_node("output").get_output_layout();
std::string frm_str = cldnn::format(format).to_string();
std::string input_type = data_type_traits::name(Data_Type);
// NOTE(review): "Exectued" typo below is runtime output text, left as-is here.
std::cout << "Exectued time " << " " << "permute_ref" << " " << " input(" << tensor.to_string()
<< ") output(" << output_layout_ref.to_string() << ") "
<< frm_str << " " << input_type << " " << exectime_ref << std::endl;
std::cout << "Exectued time " << " " << kernel_name << " " << " input(" << tensor.to_string()
<< ") output(" << output_layout_opt.to_string() << ") "
<< frm_str << " " << input_type << " " << exectime_opt << std::endl;
}
};
// No need to run performance tests on CI
// DISABLED_ prefix keeps this out of normal runs; execute manually with
// --gtest_also_run_disabled_tests to collect timings.
TEST_P(TiledPerformancePermuteTest, DISABLED_f32) {
auto p = GetParam();
execute_perf_test<cldnn::data_types::f32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
}
// Representative large shapes: 2D transpose cases (X == 1) and 3D cases, in
// both planar and blocked formats.
INSTANTIATE_TEST_SUITE_P(, TiledPerformancePermuteTest,
::testing::ValuesIn(std::vector<TiledPermuteParam> {
// b_fs_zy_fsv16
// normal cases
{{1, 512, 16384, 1}, format::bfyx},
{{1, 512, 16384, 1}, format::b_fs_yx_fsv16},
{{1, 256, 128, 256}, format::bfyx},
{{1, 256, 256, 128}, format::b_fs_yx_fsv16},
}));

View File

@ -273,12 +273,15 @@ void set_random_values(cldnn::memory::ptr mem, bool sign = false, unsigned signi
// Fills an integral-typed GPU buffer with uniformly distributed random values
// spanning the full range of T.
// std::uniform_int_distribution is undefined for 8-bit types, so int8_t is
// widened to int and uint8_t to unsigned int for the distribution, and each
// drawn value is narrowed back to T on assignment.
// Note: this block as shown contained both the pre- and post-change lines of
// the diff (duplicate `uid` declaration and duplicate `*it` assignment); only
// the widened-type versions are kept.
template<class T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
void set_random_values(cldnn::memory::ptr mem)
{
using T1 = typename std::conditional<std::is_same<int8_t, T>::value, int, T>::type;
using T2 = typename std::conditional<std::is_same<uint8_t, T1>::value, unsigned int, T1>::type;
cldnn::mem_lock<T> ptr(mem, get_test_stream());
std::mt19937 gen;  // default-seeded: values are deterministic across runs
static std::uniform_int_distribution<T2> uid(std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
for (auto it = ptr.begin(); it != ptr.end(); ++it) {
*it = static_cast<T>(uid(gen));
}
}