[GPU] Permute f and y axes (#13561)
* [GPU] Permute f and y axes Supported cases are those where the y and f (and X, if it is not equal to 1) axis sizes are divisible by 4, 8 or 16. Added a kernel to switch the f and y axes in 4D models for blocked and planar formats; added tests. * Added subgroup read/write to THREE_DIM_TRANSPOSE kernel case. * Better checking of whether the SIMD size is supported. * Added support for long type to subgroup read/write. * Added subgroup read/write support to 2d permute. * Fixed win build issue. * Changed f and y indexes in iteration. * Added vector read/write. * Fixed j_times calculation. * Better naming. * Rollback test logic. * Fixed fusion logic. * Accept only supported blocked layouts and SIMD sizes. --------- Co-authored-by: Mykhailo Hnap <mykhailo.hnap@capgemini.com> Co-authored-by: Wilson Seok <wilson.seok@intel.com>
This commit is contained in:
parent
568d3371d7
commit
f0abd468a2
@ -52,6 +52,8 @@
|
||||
#define TYPE_SIZE_int 4
|
||||
#define TYPE_SIZE_uint 4
|
||||
#define TYPE_SIZE_float 4
|
||||
#define TYPE_SIZE_ulong 8
|
||||
#define TYPE_SIZE_long 8
|
||||
#define TYPE_SIZE(type) CAT(TYPE_SIZE_, type)
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
|
@ -34,17 +34,20 @@
|
||||
#define BLOCK_READ_TYPE_size1 uchar
|
||||
#define BLOCK_READ_TYPE_size2 ushort
|
||||
#define BLOCK_READ_TYPE_size4 uint
|
||||
#define BLOCK_READ_TYPE_size8 ulong
|
||||
#define BLOCK_READ_TYPE(type_size) CAT(BLOCK_READ_TYPE_size, type_size)
|
||||
|
||||
#define BLOCK_READ_FUNC_size1 _sub_group_block_read_uc
|
||||
#define BLOCK_READ_FUNC_size2 _sub_group_block_read_us
|
||||
#define BLOCK_READ_FUNC_size4 _sub_group_block_read
|
||||
#define BLOCK_READ_FUNC_size8 _sub_group_block_read_ul
|
||||
#define BLOCK_READ_FUNC(type_size) CAT(BLOCK_READ_FUNC_size, type_size)
|
||||
|
||||
#define BLOCK_READN_FUNC_SIZE_DEF(type_size, vector_size) MAKE_VECTOR_TYPE(BLOCK_READ_FUNC(type_size), vector_size)
|
||||
#define BLOCK_READN_FUNC_size1(vector_size) BLOCK_READN_FUNC_SIZE_DEF(1, vector_size)
|
||||
#define BLOCK_READN_FUNC_size2(vector_size) BLOCK_READN_FUNC_SIZE_DEF(2, vector_size)
|
||||
#define BLOCK_READN_FUNC_size4(vector_size) BLOCK_READN_FUNC_SIZE_DEF(4, vector_size)
|
||||
#define BLOCK_READN_FUNC_size8(vector_size) BLOCK_READN_FUNC_SIZE_DEF(8, vector_size)
|
||||
#define BLOCK_READN_FUNC(type_size, vector_size) CAT(BLOCK_READN_FUNC_size, type_size)(vector_size)
|
||||
|
||||
#define BLOCK_READN_RAW(type_size, vector_size, addr_space, ptr, offset) \
|
||||
@ -151,3 +154,15 @@
|
||||
DECLARE_BLOCK_READ_EMULATION(1, 8)
|
||||
DECLARE_BLOCK_READ_EMULATION(1, 16)
|
||||
#endif
|
||||
|
||||
#if defined(cl_intel_subgroups_long)
|
||||
#define _sub_group_block_read_ul(ptr) intel_sub_group_block_read_ul(ptr)
|
||||
#define _sub_group_block_read_ul2(ptr) intel_sub_group_block_read_ul2(ptr)
|
||||
#define _sub_group_block_read_ul4(ptr) intel_sub_group_block_read_ul4(ptr)
|
||||
#define _sub_group_block_read_ul8(ptr) intel_sub_group_block_read_ul8(ptr)
|
||||
#elif (__OPENCL_C_VERSION__ >= 200)
|
||||
DECLARE_BLOCK_READ_EMULATION(8, 1)
|
||||
DECLARE_BLOCK_READ_EMULATION(8, 2)
|
||||
DECLARE_BLOCK_READ_EMULATION(8, 4)
|
||||
DECLARE_BLOCK_READ_EMULATION(8, 8)
|
||||
#endif
|
||||
|
@ -32,17 +32,20 @@
|
||||
#define BLOCK_WRITE_TYPE_size1 uchar
|
||||
#define BLOCK_WRITE_TYPE_size2 ushort
|
||||
#define BLOCK_WRITE_TYPE_size4 uint
|
||||
#define BLOCK_WRITE_TYPE_size8 ulong
|
||||
#define BLOCK_WRITE_TYPE(type_size) CAT(BLOCK_WRITE_TYPE_size, type_size)
|
||||
|
||||
#define BLOCK_WRITE_FUNC_size1 _sub_group_block_write_uc
|
||||
#define BLOCK_WRITE_FUNC_size2 _sub_group_block_write_us
|
||||
#define BLOCK_WRITE_FUNC_size4 _sub_group_block_write
|
||||
#define BLOCK_WRITE_FUNC_size8 _sub_group_block_write_ul
|
||||
#define BLOCK_WRITE_FUNC(type_size) CAT(BLOCK_WRITE_FUNC_size, type_size)
|
||||
|
||||
#define BLOCK_WRITEN_FUNC_SIZE_DEF(type_size, vector_size) MAKE_VECTOR_TYPE(BLOCK_WRITE_FUNC(type_size), vector_size)
|
||||
#define BLOCK_WRITEN_FUNC_size1(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(1, vector_size)
|
||||
#define BLOCK_WRITEN_FUNC_size2(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(2, vector_size)
|
||||
#define BLOCK_WRITEN_FUNC_size4(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(4, vector_size)
|
||||
#define BLOCK_WRITEN_FUNC_size8(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(8, vector_size)
|
||||
#define BLOCK_WRITEN_FUNC(type_size, vector_size) CAT(BLOCK_WRITEN_FUNC_size, type_size)(vector_size)
|
||||
|
||||
#define BLOCK_WRITEN_RAW(type_size, vector_size, addr_space, ptr, offset, val) \
|
||||
@ -133,3 +136,15 @@
|
||||
DECLARE_BLOCK_WRITE_EMULATION(1, 8)
|
||||
DECLARE_BLOCK_WRITE_EMULATION(1, 16)
|
||||
#endif
|
||||
|
||||
#if defined(cl_intel_subgroups_long)
|
||||
#define _sub_group_block_write_ul(ptr, v) intel_sub_group_block_write_ul(ptr, v)
|
||||
#define _sub_group_block_write_ul2(ptr, v) intel_sub_group_block_write_ul2(ptr, v)
|
||||
#define _sub_group_block_write_ul4(ptr, v) intel_sub_group_block_write_ul4(ptr, v)
|
||||
#define _sub_group_block_write_ul8(ptr, v) intel_sub_group_block_write_ul8(ptr, v)
|
||||
#elif (__OPENCL_C_VERSION__ >= 200)
|
||||
DECLARE_BLOCK_WRITE_EMULATION(8, 1)
|
||||
DECLARE_BLOCK_WRITE_EMULATION(8, 2)
|
||||
DECLARE_BLOCK_WRITE_EMULATION(8, 4)
|
||||
DECLARE_BLOCK_WRITE_EMULATION(8, 8)
|
||||
#endif
|
||||
|
@ -0,0 +1,182 @@
|
||||
// Copyright (C) 2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
|
||||
#define READ_VEC(offset, ptr) CAT(vload, VEC_SIZE)(offset, ptr)
|
||||
#define WRITE_VEC(val, offset, ptr) CAT(vstore, VEC_SIZE)(val, offset, ptr)
|
||||
|
||||
#define IN_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE)
|
||||
#define TO_IN_VEC_TYPE(x) CAT(convert_, IN_VEC_TYPE)(x)
|
||||
#define ACC_VEC_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, VEC_SIZE)
|
||||
#define TO_ACC_VEC_TYPE(x) CAT(convert_, ACC_VEC_TYPE)(x)
|
||||
#define OUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)
|
||||
#define TO_OUT_VEC_TYPE(x) CAT(convert_, OUT_VEC_TYPE)(x)
|
||||
|
||||
#if defined (PERMUTE_SIMPLE_MEM_COPY)
|
||||
// Simple-copy variant (planar bfyx, x > 1): f<->y swap only changes the
// linear offset, so each work-item copies BLOCK_SIZE x-elements of one
// (b, f, y) row with vector loads/stores of VEC_SIZE elements.
KERNEL (permute_f_y_axes)(
    const __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
    , FUSED_OPS_DECLS
#endif
)
{
    // gws[2] enumerates the flattened batch*feature range.
    const int bf = get_global_id(2);
    const int f_idx = bf % INPUT0_FEATURE_NUM;
    const int b_idx = bf / INPUT0_FEATURE_NUM;
    const int x_start = get_global_id(0) * BLOCK_SIZE;
    const int y_idx = get_global_id(1);

    // BLOCK_SIZE = J_TIMES * VEC_SIZE x-elements per work-item.
    __attribute__((opencl_unroll_hint(J_TIMES)))
    for (int j = 0; j < J_TIMES; ++j) {
        const int x_idx = x_start + j * VEC_SIZE;
        IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
#if HAS_FUSED_OPS
        FUSED_OPS_VEC;
        OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
#else
        OUT_VEC_TYPE result = ACTIVATION(res, ACTIVATION_PARAMS);
#endif
        // Output index has f and y swapped relative to the input index.
        const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f_idx, x_idx);
        WRITE_VEC(result, 0, &output[output_idx]);
    }
}
|
||||
|
||||
#elif defined (THREE_DIM_TRANSPOSE)
|
||||
|
||||
#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
#endif
// Blocked-layout variant with x > 1: transposes an f x y x x tile through a
// local buffer, using subgroup block reads/writes for the global accesses.
KERNEL (permute_f_y_axes)(
    const __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
    , FUSED_OPS_DECLS
#endif
)
{
    __local OUTPUT_TYPE transpose_buf[FEATURE_BLOCK_SIZE][FEATURE_BLOCK_SIZE][TILE_SIZE];

    const int bf = get_global_id(2);
    const int f_idx = bf % INPUT0_FEATURE_NUM;
    const int b_idx = bf / INPUT0_FEATURE_NUM;

    int bf_local = get_local_id(2);
    int y_local = get_local_id(1);  // NOTE(review): appears unused below

    const int x_begin = get_global_id(0) * TILE_SIZE;
    const int y_begin = get_global_id(1) * FEATURE_BLOCK_SIZE;
    const int f_begin = get_local_size(2) * get_group_id(2);

    // Stage 1: each lane loads its feature's tile slice into local memory.
    __attribute__((opencl_unroll_hint(FEATURE_BLOCK_SIZE)))
    for (int j = 0; j < FEATURE_BLOCK_SIZE; ++j) {
        __attribute__((opencl_unroll_hint(TILE_SIZE)))
        for (int i = 0; i < TILE_SIZE; ++i) {
            const int x_idx = x_begin + i;
            const int y_idx = y_begin + j;
            // Block reads address the subgroup base element, so the per-lane
            // offset is removed before the read.
            const uint input_offset = INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx) - get_sub_group_local_id();
            INPUT0_TYPE res = DT_INPUT_BLOCK_READ(input, input_offset);
#if HAS_FUSED_OPS
            FUSED_OPS;
            transpose_buf[bf_local][j][i] = FUSED_OPS_RESULT;
#else
            transpose_buf[bf_local][j][i] = ACTIVATION(res, ACTIVATION_PARAMS);
#endif
        }
    }

    // Stage 2: write the tile back with f and y swapped ([j] and [bf_local]
    // are exchanged on the buffer indices).
    // NOTE(review): no barrier(CLK_LOCAL_MEM_FENCE) between the stages even
    // though lanes read slices written by other lanes; presumably relies on
    // the work-group being a single subgroup — confirm.
    __attribute__((opencl_unroll_hint(FEATURE_BLOCK_SIZE)))
    for (int j = 0; j < FEATURE_BLOCK_SIZE; ++j) {
        __attribute__((opencl_unroll_hint(TILE_SIZE)))
        for (int i = 0; i < TILE_SIZE; ++i) {
            const int x_idx = x_begin + i;
            const int f = f_begin + j;
            const int y_idx = y_begin + bf_local;
            const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
            DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local][i]);
        }
    }
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
#endif
// Default variant (used when neither PERMUTE_SIMPLE_MEM_COPY nor
// THREE_DIM_TRANSPOSE applies, i.e. x == 1): transposes a TILE_SIZE x
// TILE_SIZE f/y tile per work-group through a padded local buffer.
KERNEL (permute_f_y_axes)(
    const __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
    , FUSED_OPS_DECLS
#endif
)
{
    // The +1 on the inner dimension is the classic padding to keep
    // column-wise accesses off the same local-memory banks (presumed intent).
    __local OUTPUT_TYPE transpose_buf[TILE_SIZE][TILE_SIZE+1];

    const int bf = get_global_id(2);
    const int b_idx = bf / INPUT0_FEATURE_NUM;
    const int f_idx = bf % INPUT0_FEATURE_NUM;
    const int bf_local = get_local_id(2);
    const int x_idx = get_global_id(0);
    const int y_begin = get_global_id(1) * TILE_SIZE;
    const int f_begin = get_local_size(2) * get_group_id(2);

#if INPUT0_SIMPLE == 1
    // Planar layout: vectorized loads along y, scalar scatter into the tile.
    __attribute__((opencl_unroll_hint(J_TIMES)))
    for (int j = 0; j < J_TIMES; ++j) {
        const int j_vec = j * VEC_SIZE;
        const int y_idx = y_begin + j_vec;
        IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
#if HAS_FUSED_OPS
        FUSED_OPS_VEC;
        OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
        for (int k = 0; k < VEC_SIZE; ++k) {
            transpose_buf[j_vec + k][bf_local] = result[k];
        }
#else
        for (int k = 0; k < VEC_SIZE; ++k) {
            transpose_buf[j_vec + k][bf_local] = ACTIVATION(res[k], ACTIVATION_PARAMS);
        }
#endif
    }

    // Read back transposed rows and store them with vector writes.
    // NOTE(review): lanes read buffer rows written by other work-items and
    // there is no barrier(CLK_LOCAL_MEM_FENCE) in between — confirm the
    // dispatch guarantees (lws dim 2 == TILE_SIZE) make this safe.
    __attribute__((opencl_unroll_hint(J_TIMES)))
    for (int j = 0; j < J_TIMES; ++j) {
        const int j_vec = j * VEC_SIZE;
        const int f = f_begin + j_vec;
        const int y_idx = y_begin + bf_local;
        const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx);
        WRITE_VEC(READ_VEC(0, &transpose_buf[bf_local][j_vec]), 0, &output[output_idx]);
    }


#else
    // Blocked layout: subgroup block reads/writes, one element per lane.
    __attribute__((opencl_unroll_hint(TILE_SIZE)))
    for (int j = 0; j < TILE_SIZE; ++j) {
        const int y_idx = y_begin + j;
        // Block ops address the subgroup base, so subtract the lane id.
        const uint input_offset = INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx) - get_sub_group_local_id();
        INPUT0_TYPE res = DT_INPUT_BLOCK_READ(input, input_offset);
#if HAS_FUSED_OPS
        FUSED_OPS;
        transpose_buf[bf_local][j] = FUSED_OPS_RESULT;
#else
        transpose_buf[bf_local][j] = ACTIVATION(res, ACTIVATION_PARAMS);
#endif
    }
    __attribute__((opencl_unroll_hint(TILE_SIZE)))
    for (int j = 0; j < TILE_SIZE; ++j) {
        const int f = f_begin + j;
        const int y_idx = y_begin + bf_local;
        const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
        DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local]);
    }
#endif
}
|
||||
|
||||
#endif // #if defined (PERMUTE_SIMPLE_MEM_COPY)
|
@ -0,0 +1,246 @@
|
||||
// Copyright (C) 2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "permute_kernel_f_y_axes.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
|
||||
#include "common_tools.h"
|
||||
#include "kernel_selector_utils.h"
|
||||
|
||||
namespace kernel_selector {
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr size_t cSimpleMemCopyOpDivider = 4UL;
|
||||
constexpr size_t c3DTransposeBufHeight = 4UL;
|
||||
|
||||
// Returns the largest supported divisor of input_size among {16, 8, 4},
// falling back to 1 when none of them divides it evenly.
size_t GetDivisor(const size_t input_size) {
    const size_t candidates[] = {/*32,*/ 16, 8, 4 /*, 2*/};
    for (const size_t d : candidates) {
        if (input_size % d == 0) {
            return d;
        }
    }
    return 1;
}
|
||||
|
||||
bool IsSimpleMemCopyOperation(const permute_params& params) {
|
||||
return params.inputs[0].X().v > 1 && params.inputs[0].GetLayout() == DataLayout::bfyx;
|
||||
}
|
||||
|
||||
bool Is3DTranspose(const permute_params& params) {
|
||||
return params.inputs[0].X().v > 1 && params.inputs[0].GetLayout() != DataLayout::bfyx;
|
||||
}
|
||||
|
||||
size_t GetFeatureBlockSize(const permute_params& params) {
|
||||
return std::min(GetDivisor(params.inputs[0].Feature().v), GetDivisor(params.inputs[0].Y().v));
|
||||
}
|
||||
|
||||
// Tile height: the largest supported divisor (16/8/4, else 1) common to the
// f and y dimensions — and x too, when x > 1. For the 3D-transpose path it
// is additionally capped by the local transpose-buffer height.
size_t GetTileHeight(const permute_params& params) {
    size_t min_divisor{};
    if (params.inputs[0].X().v == 1) {
        min_divisor = std::min(GetDivisor(params.inputs[0].Feature().v), GetDivisor(params.inputs[0].Y().v));
    } else {
        min_divisor = std::min({GetDivisor(params.inputs[0].Feature().v),
                                GetDivisor(params.inputs[0].Y().v),
                                GetDivisor(params.inputs[0].X().v)});
    }
    if (Is3DTranspose(params)) {
        // Cap by c3DTransposeBufHeight to bound local-memory usage.
        return std::min(min_divisor, c3DTransposeBufHeight);
    }
    return min_divisor;
}
|
||||
|
||||
// Tile width in elements. Starts from the tile height and scales it by the
// element size (halved for 8-byte types, doubled for f16, quadrupled for
// 8-bit types) — presumably to keep the tile row at a similar byte width
// across data types; finally clamped by the dimension the tile spans.
size_t GetTileWidth(const permute_params& params) {
    const Datatype input_type = params.inputs[0].GetDType();
    const Datatype output_type = params.outputs[0].GetDType();

    size_t min_divisor = GetTileHeight(params);
    if (IsSimpleMemCopyOperation(params)) {
        // The copy path is limited to cSimpleMemCopyOpDivider-wide blocks.
        min_divisor = std::min(min_divisor, cSimpleMemCopyOpDivider);
    }

    // i64 only supports tile size 4
    if ((input_type == Datatype::INT64) || (output_type == Datatype::INT64)) {
        min_divisor = min_divisor / 2;
    }
    if (input_type == Datatype::F16) {
        min_divisor = min_divisor * 2;
    }
    if (input_type == Datatype::INT8 || input_type == Datatype::UINT8) {
        min_divisor = min_divisor * 4;
    }

    // When x == 1 the tile spans y; otherwise it spans x.
    if (params.inputs[0].X().v == 1) {
        return std::min(params.inputs[0].Y().v, min_divisor);
    }
    return std::min(params.inputs[0].X().v, min_divisor);
}
|
||||
|
||||
size_t GetTileSize(const permute_params& params) {
|
||||
return std::min(GetTileHeight(params), GetTileWidth(params));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Advertises the data types and 4D layouts this kernel supports: all common
// integer/float types, planar bfyx plus the feature-blocked fsv4/16/32
// variants, with mixed input/output types allowed.
ParamsKey PermuteKernel_f_y_axes::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);
    k.EnableInputDataType(Datatype::INT32);
    k.EnableInputDataType(Datatype::INT64);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::INT32);
    k.EnableOutputDataType(Datatype::INT64);
    k.EnableDifferentTypes();
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableInputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::bfyx);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
    return k;
}
|
||||
|
||||
// Builds the JIT macro set that selects one of the three kernel variants
// (PERMUTE_SIMPLE_MEM_COPY / THREE_DIM_TRANSPOSE / default) and passes the
// tiling parameters to the OpenCL source.
JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& params,
                                                     const CommonDispatchData& dispatchData) const {
    auto jit = Parent::GetJitConstants(params, dispatchData);

    // Variant selection only matters when x > 1; with x == 1 the kernel
    // falls through to its default branch.
    if (params.inputs[0].X().v != 1) {
        if (IsSimpleMemCopyOperation(params)) {
            jit.AddConstant(MakeJitConstant("PERMUTE_SIMPLE_MEM_COPY", ""));
        }
        if (Is3DTranspose(params)) {
            jit.AddConstant(MakeJitConstant("THREE_DIM_TRANSPOSE", ""));
        }
    }

    const size_t tile_width = GetTileWidth(params);
    // Vector loads/stores are capped at 4 elements per access.
    const size_t vector_size = std::min(tile_width, static_cast<size_t>(4));
    const size_t tile_size = GetTileSize(params);
    const size_t j_times = tile_size / vector_size;
    const size_t feature_block_size = GetFeatureBlockSize(params);
    jit.AddConstant(MakeJitConstant("BLOCK_SIZE", tile_width));
    jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
    jit.AddConstant(MakeJitConstant("J_TIMES", j_times));
    jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
    jit.AddConstant(MakeJitConstant("FEATURE_BLOCK_SIZE", feature_block_size));

    // Blocked layouts use subgroup block reads/writes; the required subgroup
    // size must match the loop extent of the chosen variant (and SetDefault's
    // lws dim 2).
    const auto layout = params.inputs.front().GetLayout();
    if (!SimpleLayout(layout)) {
        const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
    }

    if (!params.fused_ops.empty()) {
        // NOTE(review): fused-op indexing follows the input-side
        // (b, f, y, x) order, matching the index names used in the kernels.
        const std::vector<std::string> original_output_order = {"b_idx", "f_idx", "y_idx", "x_idx"};
        const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
        const FusedOpsConfiguration conf_vec = {"_VEC", original_output_order, "res", params.inputs[0].GetDType(), vector_size};
        jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar, conf_vec}));
    }
    return jit;
}
|
||||
|
||||
// Global work sizes: gws[0] walks x, gws[1] walks y, gws[2] covers the
// flattened batch*feature range. Whichever axis a tile/block spans is
// divided by the corresponding block size so one work-item handles a block.
static inline std::vector<size_t> GetGWS(const permute_params& params) {
    const auto& in = params.inputs[0];
    std::vector<size_t> gws;
    auto block_size = IsSimpleMemCopyOperation(params) ? GetTileWidth(params) : GetTileSize(params);
    if (params.inputs[0].X().v == 1) {
        // Tile spans y (2D transpose path).
        gws = {in.X().v, in.Y().v / block_size, (in.Batch().v * in.Feature().v)};
    } else {
        if (Is3DTranspose(params)) {
            gws = {in.X().v / block_size, in.Y().v / GetFeatureBlockSize(params), (in.Batch().v * in.Feature().v)};
        } else {
            gws = {in.X().v / block_size, in.Y().v, (in.Batch().v * in.Feature().v)};
        }
    }
    return gws;
}
|
||||
|
||||
// Chooses global/local work sizes. The transpose variants share a local
// buffer across the work-group, so their lws dim 2 must equal the loop
// extent used by the kernel (feature block / tile size).
CommonDispatchData PermuteKernel_f_y_axes::SetDefault(const permute_params& params) const {
    CommonDispatchData dispatchData;
    dispatchData.gws = GetGWS(params);
    if (IsSimpleMemCopyOperation(params)) {
        // Plain copy path: no cross-work-item cooperation, let the generic
        // heuristic pick the local sizes.
        auto in_layout = params.inputs[0].GetLayout();
        auto out_layout = params.outputs[0].GetLayout();
        const std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {
            {Tensor::DataChannelName::X},
            {Tensor::DataChannelName::Y},
            {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
        dispatchData.lws =
            GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
    } else if (Is3DTranspose(params)) {
        dispatchData.lws = {1, 1, GetFeatureBlockSize(params)};
    } else {
        dispatchData.lws = {1, 1, GetTileSize(params)};
    }
    return dispatchData;
}
|
||||
|
||||
// Accepts only the exact f<->y swap on 4D tensors whose f and y (and x,
// when x > 1) sizes are divisible by 4/8/16, restricted to layouts and
// subgroup sizes the generated kernels can actually execute.
bool PermuteKernel_f_y_axes::Validate(const Params& p, const optional_params& o) const {
    if (!Parent::Validate(p, o)) {
        return false;
    }

    const auto is_swapping_f_with_y = [](const std::vector<uint16_t>& order) {
        // Target transform: Swap feature with y
        // IE order: 0 2 1 3 => bfyx -> byfx
        // cldnn order: 0 3 2 1 => bfxy -> byxf
        if (order.size() != 4) {
            return false;
        }
        if (order[0] != 0 || order[1] != 3 || order[2] != 2 || order[3] != 1) {
            return false;
        }
        return true;
    };

    const auto& params = dynamic_cast<const permute_params&>(p);
    const auto& in = params.inputs[0];
    const auto in_layout = in.GetLayout();

    // GetDivisor returning 1 means the axis is not divisible by 4/8/16, so
    // the tiled kernels could not cover it without remainder handling.
    const auto feature_div = GetDivisor(in.Feature().v);
    const auto y_div = GetDivisor(in.Y().v);
    if (feature_div == 1 || y_div == 1) {
        return false;
    }
    if (in.X().v > 1 && GetDivisor(in.X().v) == 1) {
        return false;
    }
    if (!is_swapping_f_with_y(params.order)) {
        return false;
    }

    // Accept only supported blocked layouts and SIMD sizes.
    if (!SimpleLayout(in_layout)) {
        const auto feature_block_size = GetFeatureBlockSize(params);
        const auto tile_size = GetTileSize(params);
        // Must mirror the SUB_GROUP_SIZE choice made in GetJitConstants.
        const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
        if (!(IsSIMDSizeSupported(params.engineInfo, subgroup_size) &&
              (in_layout == DataLayout::b_fs_yx_fsv32 || in_layout == DataLayout::b_fs_yx_fsv16))) {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
// Fixed selection priority for this kernel; the relative ordering against
// the other permute implementations is defined by the FORCE_PRIORITY_*
// constants in the kernel selector framework.
KernelsPriority PermuteKernel_f_y_axes::GetKernelsPriority(const Params& /*params*/,
                                                           const optional_params& /*options*/) const {
    return FORCE_PRIORITY_3;
}
|
||||
|
||||
} // namespace kernel_selector
|
@ -0,0 +1,29 @@
|
||||
// Copyright (C) 2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "permute_kernel_base.h"
|
||||
|
||||
namespace kernel_selector {
|
||||
|
||||
// Optimized permute kernel that swaps the feature and y axes of a 4D tensor
// (bfyx -> byfx). Supports planar bfyx and the feature-blocked fsv4/16/32
// layouts; see Validate() for the divisibility constraints it enforces.
class PermuteKernel_f_y_axes : public PermuteKernelBase {
public:
    using Parent = PermuteKernelBase;
    using Parent::Parent;
    PermuteKernel_f_y_axes() : PermuteKernelBase("permute_f_y_axes") {}
    ~PermuteKernel_f_y_axes() override = default;

    bool Validate(const Params& p, const optional_params& o) const override;
    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const override;
    CommonDispatchData SetDefault(const permute_params& params) const override;
    // Ops that can be fused into the permute instead of running separately.
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return {FusedOpType::ACTIVATION, FusedOpType::QUANTIZE, FusedOpType::ELTWISE};
    }
};
|
||||
} // namespace kernel_selector
|
@ -7,6 +7,7 @@
|
||||
#include "permute_kernel_tile_8x8_4x4.h"
|
||||
#include "permute_kernel_tile_8x8_4x4_fsv.h"
|
||||
#include "permute_kernel_bfzyx_to_bfyxz.h"
|
||||
#include "permute_kernel_f_y_axes.h"
|
||||
|
||||
namespace kernel_selector {
|
||||
|
||||
@ -15,6 +16,7 @@ permute_kernel_selector::permute_kernel_selector() {
|
||||
Attach<PermuteKernel_tile_8x8_4x4>();
|
||||
Attach<PermuteKernel_tile_8x8_4x4_fsv>();
|
||||
Attach<PermuteKernel_bfzyx_to_bfyxz>();
|
||||
Attach<PermuteKernel_f_y_axes>();
|
||||
}
|
||||
|
||||
KernelsData permute_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
|
||||
|
@ -170,6 +170,9 @@ public:
|
||||
// permute_bfzyx_to_bfyxz
|
||||
#define CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0 { 1, 8, 8, 2, 2 }, { 1, 8, 2, 8, 2 }, { 0, 1, 3, 4, 2 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
|
||||
|
||||
// permute_f_y_axes
|
||||
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_0 { 1, 8, 4, 2 }, { 1, 2, 4, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
|
||||
|
||||
class permute_activation_scale_eltwise: public PermuteFusingTest {};
|
||||
TEST_P(permute_activation_scale_eltwise, basic) {
|
||||
auto p = GetParam();
|
||||
@ -265,6 +268,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_activation_scale_eltwise, ::testin
|
||||
|
||||
// Fusing tests for permute_bfzyx_to_bfyxz
|
||||
permute_params{ CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0, 2, 5 },
|
||||
|
||||
// Fusing tests for permute_f_y_axes
|
||||
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_0, 2, 5 },
|
||||
}));
|
||||
|
||||
class permute_quant_u8: public PermuteFusingTest {};
|
||||
@ -470,6 +476,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv, ::t
|
||||
|
||||
// Fusing tests for permute_bfzyx_to_bfyxz
|
||||
permute_params{ CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0, 2, 7 },
|
||||
|
||||
// Fusing tests for permute_f_y_axes
|
||||
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_0, 2, 7 },
|
||||
}));
|
||||
|
||||
/* ------------------------------------------------------------------------------------------------------------ */
|
||||
|
@ -24,6 +24,23 @@ using namespace cldnn;
|
||||
using namespace ::tests;
|
||||
using namespace testing;
|
||||
|
||||
namespace {
|
||||
|
||||
// TODO: Move somewhere
|
||||
// Renders a vector as "(a.b.c)" with '.' between elements; an empty vector
// becomes "()". Used to build gtest parameterized-test names, where commas
// and spaces are not allowed.
template <class vecElementType>
std::string vec2str(const std::vector<vecElementType>& vec) {
    if (vec.empty()) {
        return "()";
    }
    std::ostringstream out;
    out << "(";
    for (size_t i = 0; i + 1 < vec.size(); ++i) {
        out << vec[i] << ".";
    }
    out << vec.back() << ")";
    return out.str();
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST(permute_gpu_f32, output_ordering_test)
|
||||
{
|
||||
auto& engine = get_test_engine();
|
||||
@ -1871,6 +1888,13 @@ public:
|
||||
void run_test(const std::vector<cldnn::tensor::value_type>& sizes, cldnn::format format_fsv,
|
||||
const std::string & permute_opt = "permute_tile_8x8_4x4_fsv",
|
||||
std::vector<uint16_t> permute_order = {}, bool is_caching_test = false);
|
||||
|
||||
    // gtest name generator: formats a param set as
    // "InputShape=(a.b.c.d)_Format=<fmt>" for readable test names.
    static std::string PrintToStringParamName(const testing::TestParamInfo<TiledPermuteParam>& info) {
        std::ostringstream result;
        result << "InputShape=" << vec2str(info.param.sizes) << "_";
        result << "Format=" << fmt_to_str(info.param.format_fsv);
        return result.str();
    }
||||
};
|
||||
|
||||
template<>
|
||||
@ -2220,3 +2244,180 @@ TEST_P(permute_tile_fsv_5d, i64_cached) {
|
||||
auto p = GetParam();
|
||||
run_test<cldnn::data_types::i64>(p.sizes, p.format_fsv, "permute_tile_8x8_4x4_fsv", {}, true);
|
||||
}
|
||||
|
||||
class permute_f_y_axes_tile: public TiledPermuteTest {};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_permute_f_y_axes_tile,
|
||||
permute_f_y_axes_tile,
|
||||
::testing::ValuesIn(std::vector<TiledPermuteParam>{
|
||||
{{1, 4, 8, 1}, format::bfyx}, // permute_f_y_axes
|
||||
{{1, 64, 32, 1}, format::bfyx}, // permute_f_y_axes
|
||||
{{1, 32, 256, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE
|
||||
{{1, 32, 256, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY
|
||||
{{1, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes
|
||||
{{1, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE
|
||||
}),
|
||||
TiledPermuteTest::PrintToStringParamName);
|
||||
|
||||
// Runs the permute_f_y_axes kernel with the f<->y order {0, 2, 1, 3} on the
// parameterized shape/format for every supported data type.
TEST_P(permute_f_y_axes_tile, combined) {
    auto p = GetParam();
    run_test<cldnn::data_types::f32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
    run_test<cldnn::data_types::f16>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
    run_test<cldnn::data_types::u8>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
    run_test<cldnn::data_types::i8>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
    run_test<cldnn::data_types::i32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
    run_test<cldnn::data_types::i64>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
}
|
||||
|
||||
struct TiledPerformancePermuteTest : TiledPermuteTest
|
||||
{
|
||||
    // Returns the profiled execution time (microseconds) of `primitive_id`
    // from `outputs`, taken from the 'executing' profiling stage. Waits on
    // the event first so profiling data is available.
    static double get_exectime(const std::map<cldnn::primitive_id, cldnn::network_output>& outputs,
                               const std::string& primitive_id)
    {
        using namespace std::chrono;
        std::shared_ptr<event> e = outputs.at(primitive_id).get_event();
        e->wait(); // should ensure execution completion, if not segfault will occur
        double avg_time = 0.0;
        auto intervals = e->get_profiling_info();
        for (const auto& q : intervals)
        {
            // Only the 'executing' stage carries the kernel run time.
            if (q.stage != instrumentation::profiling_stage::executing) {
                continue;
            }
            avg_time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
            break;
        }
        return avg_time;
    }
|
||||
|
||||
static void print_all_perf(std::map<primitive_id, network_output> outputs)
|
||||
{
|
||||
std::cout << "Print last run time" << std::endl;
|
||||
using namespace std::chrono;
|
||||
for( const auto &n : outputs ) {
|
||||
std::shared_ptr<event> e = n.second.get_event();
|
||||
auto intervals = e->get_profiling_info();
|
||||
double time = 0.0;
|
||||
for (const auto& q : intervals)
|
||||
{
|
||||
if (q.stage == instrumentation::profiling_stage::executing) {
|
||||
continue;
|
||||
}
|
||||
time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
|
||||
break;
|
||||
}
|
||||
std::cout << n.first << ":" << time << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
template<data_types Data_Type>
|
||||
void execute_perf_test(const std::vector<cldnn::tensor::value_type>& sizes, cldnn::format format_fsv,
|
||||
const std::string & kernel_name, std::vector<uint16_t> permute_order)
|
||||
{
|
||||
auto& engine = get_test_engine();
|
||||
// convert half_t to FLOAT16
|
||||
using type_ = typename data_type_to_type<Data_Type>::type;
|
||||
using type = typename std::conditional<std::is_same<type_, half_t>::value, FLOAT16, type_>::type;
|
||||
|
||||
std::vector<cldnn::tensor::value_type> internal_sizes(sizes);
|
||||
std::swap(internal_sizes.at(2), internal_sizes.back());
|
||||
cldnn::tensor tensor(internal_sizes);
|
||||
|
||||
cldnn::format format = sizes.size() == 4 ? cldnn::format::bfyx : cldnn::format::bfzyx;
|
||||
|
||||
std::vector<uint16_t> order = {0};
|
||||
if (permute_order.empty()) {
|
||||
for (uint16_t i = 1; i < (sizes.size() - 1); ++i) {
|
||||
order.push_back(i+1);
|
||||
}
|
||||
order.push_back(1);
|
||||
} else {
|
||||
std::swap(order, permute_order);
|
||||
}
|
||||
|
||||
auto input_ref = engine.allocate_memory({Data_Type, format, tensor});
|
||||
set_random_values<type>(input_ref);
|
||||
topology topology_ref = topology(
|
||||
input_layout("input", input_ref->get_layout()),
|
||||
reorder("reorder", input_info("input"), {Data_Type, format_fsv, tensor}),
|
||||
permute("output", input_info("reorder"), order)
|
||||
);
|
||||
// run with permute_ref
|
||||
ExecutionConfig config_ref(ov::enable_profiling(true));
|
||||
ov::intel_gpu::ImplementationDesc permute_ref = {format_fsv, "permute_ref"};
|
||||
config_ref.set_property(
|
||||
ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"output", permute_ref}}));
|
||||
cldnn::network network_ref(engine, topology_ref, config_ref);
|
||||
network_ref.set_input_data("input", input_ref);
|
||||
|
||||
// run with optimized kernel, e.g. permute_tile_8x8_4x4_fsv16
|
||||
auto input_opt = engine.allocate_memory({Data_Type, format, tensor});
|
||||
set_random_values<type>(input_opt);
|
||||
topology topology_opt = topology(
|
||||
input_layout("input", input_opt->get_layout()),
|
||||
reorder("reorder", input_info("input"), {Data_Type, format_fsv, tensor}),
|
||||
permute("output", input_info("reorder"), order)
|
||||
);
|
||||
ExecutionConfig config_tile(ov::enable_profiling(true));
|
||||
ov::intel_gpu::ImplementationDesc permute_tile_opt = {format_fsv, kernel_name};
|
||||
config_tile.set_property(
|
||||
ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"output", permute_tile_opt}}));
|
||||
cldnn::network network_tile(engine, topology_opt, config_tile);
|
||||
network_tile.set_input_data("input", input_opt);
|
||||
|
||||
// first execution of opt
|
||||
std::map<primitive_id, network_output> output_permute_ref;
|
||||
std::map<primitive_id, network_output> output_permute_opt;
|
||||
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
output_permute_ref = network_ref.execute();
|
||||
output_permute_opt = network_tile.execute();
|
||||
}
|
||||
|
||||
auto r = 100;
|
||||
double exectime_ref = 0.f;
|
||||
double exectime_opt = 0.f;
|
||||
for (int i = 0; i < r; ++i) {
|
||||
output_permute_opt = network_tile.execute();
|
||||
auto t_opt = get_exectime(output_permute_opt, "output");
|
||||
exectime_opt += t_opt;
|
||||
|
||||
output_permute_ref = network_ref.execute();
|
||||
auto t_ref = get_exectime(output_permute_ref, "output");
|
||||
exectime_ref += t_ref;
|
||||
}
|
||||
exectime_ref /= r;
|
||||
exectime_opt /= r;
|
||||
std::cout << std::endl;
|
||||
auto output_layout_ref = network_ref.get_program()->get_node("output").get_output_layout();
|
||||
auto output_layout_opt = network_tile.get_program()->get_node("output").get_output_layout();
|
||||
std::string frm_str = cldnn::format(format).to_string();
|
||||
std::string input_type = data_type_traits::name(Data_Type);
|
||||
|
||||
std::cout << "Exectued time " << " " << "permute_ref" << " " << " input(" << tensor.to_string()
|
||||
<< ") output(" << output_layout_ref.to_string() << ") "
|
||||
<< frm_str << " " << input_type << " " << exectime_ref << std::endl;
|
||||
std::cout << "Exectued time " << " " << kernel_name << " " << " input(" << tensor.to_string()
|
||||
<< ") output(" << output_layout_opt.to_string() << ") "
|
||||
<< frm_str << " " << input_type << " " << exectime_opt << std::endl;
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
// No need to run performance tests on CI
|
||||
TEST_P(TiledPerformancePermuteTest, DISABLED_f32) {
|
||||
auto p = GetParam();
|
||||
execute_perf_test<cldnn::data_types::f32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
|
||||
}
|
||||
|
||||
// Parameter table: input sizes in bfyx logical order plus the blocked (or
// planar) layout the input is reordered into before the permute runs.
INSTANTIATE_TEST_SUITE_P(, TiledPerformancePermuteTest,
    ::testing::ValuesIn(std::vector<TiledPermuteParam> {
        // bfyx / b_fs_yx_fsv16
        // normal cases
        {{1, 512, 16384, 1}, format::bfyx},
        {{1, 512, 16384, 1}, format::b_fs_yx_fsv16},
        {{1, 256, 128, 256}, format::bfyx},
        {{1, 256, 256, 128}, format::b_fs_yx_fsv16},
    }));
|
||||
|
||||
|
@ -273,12 +273,15 @@ void set_random_values(cldnn::memory::ptr mem, bool sign = false, unsigned signi
|
||||
template<class T, typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
|
||||
void set_random_values(cldnn::memory::ptr mem)
|
||||
{
|
||||
using T1 = typename std::conditional<std::is_same<int8_t, T>::value, int, T>::type;
|
||||
using T2 = typename std::conditional<std::is_same<uint8_t, T1>::value, unsigned int, T1>::type;
|
||||
|
||||
cldnn::mem_lock<T> ptr(mem, get_test_stream());
|
||||
|
||||
std::mt19937 gen;
|
||||
static std::uniform_int_distribution<T> uid(std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
|
||||
static std::uniform_int_distribution<T2> uid(std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
|
||||
for (auto it = ptr.begin(); it != ptr.end(); ++it) {
|
||||
*it = uid(gen);
|
||||
*it = static_cast<T>(uid(gen));
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user