[GPU] Fix accuracy problems brought by permute_f_y kernel (#20241)

* Add test cases from Levit model

* Enable PermuteKernel_f_y_axes kernel

* Fix permute of y and b for planar layout where x is not 1

* Enable fusing in permute_kernel_f_y_axes

* Add tests for fusing in permute_kernel_f_y_axes

---------

Co-authored-by: Wilson Seok <wilson.seok@intel.com>
This commit is contained in:
Yaroslav Torziuk
2023-12-19 13:50:32 +01:00
committed by GitHub
parent 6246fb0c23
commit 6719133af2
6 changed files with 91 additions and 51 deletions

View File

@@ -25,21 +25,22 @@ KERNEL (permute_f_y_axes)(
#endif
)
{
const int bf = get_global_id(2);
const int f_idx = bf % INPUT0_FEATURE_NUM;
const int b_idx = bf / INPUT0_FEATURE_NUM;
const int x_start = get_global_id(0) * BLOCK_SIZE;
const int y_idx = get_global_id(1);
const int f_idx = get_global_id(2);
const int b_idx = get_global_id(1);;
const int y_idx = get_global_id(0);
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int x_idx = x_start + j * VEC_SIZE;
IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
for (int x_idx = 0; x_idx < J_TIMES * VEC_SIZE; x_idx+=VEC_SIZE) {
IN_VEC_TYPE resv = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
#if HAS_FUSED_OPS
FUSED_OPS_VEC;
OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
OUT_VEC_TYPE result;
for (int i = 0; i < VEC_SIZE; i++)
{
INPUT0_TYPE res = resv[i];
FUSED_OPS;
result[i] = FUSED_OPS_RESULT;
}
#else
OUT_VEC_TYPE result = ACTIVATION(res, ACTIVATION_PARAMS);
OUT_VEC_TYPE result = ACTIVATION(resv, ACTIVATION_PARAMS);
#endif
const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f_idx, x_idx);
WRITE_VEC(result, 0, &output[output_idx]);
@@ -132,18 +133,16 @@ KERNEL (permute_f_y_axes)(
for (int j = 0; j < J_TIMES; ++j) {
const int j_vec = j * VEC_SIZE;
const int y_idx = y_begin + j_vec;
IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
IN_VEC_TYPE resv = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
for (int k = 0; k < VEC_SIZE; ++k) {
#if HAS_FUSED_OPS
FUSED_OPS_VEC;
OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
for (int k = 0; k < VEC_SIZE; ++k) {
transpose_buf[j_vec + k][bf_local] = result[k];
}
INPUT0_TYPE res = resv[k];
FUSED_OPS
transpose_buf[j_vec + k][bf_local] = FUSED_OPS_RESULT;
#else
for (int k = 0; k < VEC_SIZE; ++k) {
transpose_buf[j_vec + k][bf_local] = ACTIVATION(res[k], ACTIVATION_PARAMS);
}
transpose_buf[j_vec + k][bf_local] = ACTIVATION(resv[k], ACTIVATION_PARAMS);
#endif
}
}
__attribute__((opencl_unroll_hint(J_TIMES)))

View File

@@ -128,29 +128,46 @@ JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& param
}
}
const size_t tile_width = GetTileWidth(params);
const size_t vector_size = std::min(tile_width, static_cast<size_t>(4));
const size_t tile_size = GetTileSize(params);
const size_t j_times = tile_size / vector_size;
const size_t feature_block_size = GetFeatureBlockSize(params);
jit.AddConstant(MakeJitConstant("BLOCK_SIZE", tile_width));
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
jit.AddConstant(MakeJitConstant("J_TIMES", j_times));
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
jit.AddConstant(MakeJitConstant("FEATURE_BLOCK_SIZE", feature_block_size));
const auto layout = params.inputs.front().GetLayout();
if (!SimpleLayout(layout)) {
const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
std::size_t vector_size;
if (IsSimpleMemCopyOperation(params)) {
vector_size = GetDivisor(params.inputs[0].X().v);
auto j_times = params.inputs[0].X().v / vector_size;
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
jit.AddConstant(MakeJitConstant("J_TIMES", j_times));
} else {
const size_t tile_width = GetTileWidth(params);
vector_size = std::min(tile_width, static_cast<size_t>(4));
const size_t tile_size = GetTileSize(params);
const size_t j_times = tile_size / vector_size;
const size_t feature_block_size = GetFeatureBlockSize(params);
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
jit.AddConstant(MakeJitConstant("J_TIMES", j_times));
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
jit.AddConstant(MakeJitConstant("FEATURE_BLOCK_SIZE", feature_block_size));
if (!SimpleLayout(layout)) {
const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
}
}
if (!params.fused_ops.empty()) {
const std::vector<std::string> original_output_order = {"b_idx", "f_idx", "y_idx", "x_idx"};
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
const FusedOpsConfiguration conf_vec = {"_VEC", original_output_order, "res", params.inputs[0].GetDType(), vector_size};
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar, conf_vec}));
if (IsSimpleMemCopyOperation(params)) {
const std::vector<std::string> original_output_order = {"b_idx", "y_idx", "f_idx", "(x_idx+i)"};
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
} else if (SimpleLayout(layout)) {
const std::vector<std::string> original_output_order = {"b_idx", "(y_idx+k)", "f_idx", "x_idx"};
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
} else {
const std::vector<std::string> original_output_order = {"b_idx", "y_idx", "f_idx", "x_idx"};
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
const FusedOpsConfiguration conf_vec = {"_VEC", original_output_order, "res", params.inputs[0].GetDType(), vector_size};
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar, conf_vec}));
}
}
return jit;
}
@@ -158,14 +175,12 @@ static inline std::vector<size_t> GetGWS(const permute_params& params) {
const auto& in = params.inputs[0];
std::vector<size_t> gws;
auto block_size = IsSimpleMemCopyOperation(params) ? GetTileWidth(params) : GetTileSize(params);
if (params.inputs[0].X().v == 1) {
if (IsSimpleMemCopyOperation(params)) {
return {in.Y().v, in.Batch().v, in.Feature().v};
} else if (Is3DTranspose(params)) {
gws = {in.X().v / block_size, in.Y().v / GetFeatureBlockSize(params), (in.Batch().v * in.Feature().v)};
} else { // the case where x is 1
gws = {in.X().v, in.Y().v / block_size, (in.Batch().v * in.Feature().v)};
} else {
if (Is3DTranspose(params)) {
gws = {in.X().v / block_size, in.Y().v / GetFeatureBlockSize(params), (in.Batch().v * in.Feature().v)};
} else {
gws = {in.X().v / block_size, in.Y().v, (in.Batch().v * in.Feature().v)};
}
}
return gws;
}
@@ -177,9 +192,9 @@ CommonDispatchData PermuteKernel_f_y_axes::SetDefault(const permute_params& para
auto in_layout = params.inputs[0].GetLayout();
auto out_layout = params.outputs[0].GetLayout();
const std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {
{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
{Tensor::DataChannelName::BATCH},
{Tensor::DataChannelName::FEATURE}};
dispatchData.lws =
GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
} else if (Is3DTranspose(params)) {

View File

@@ -23,7 +23,11 @@ protected:
JitConstants GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const override;
CommonDispatchData SetDefault(const permute_params& params) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return {FusedOpType::ACTIVATION, FusedOpType::QUANTIZE, FusedOpType::ELTWISE};
return {
FusedOpType::ACTIVATION,
FusedOpType::QUANTIZE,
FusedOpType::ELTWISE
};
}
};
} // namespace kernel_selector

View File

@@ -16,7 +16,7 @@ permute_kernel_selector::permute_kernel_selector() {
Attach<PermuteKernel_tile_8x8_4x4>();
Attach<PermuteKernel_tile_8x8_4x4_fsv>();
Attach<PermuteKernel_bfzyx_to_bfyxz>();
// Attach<PermuteKernel_f_y_axes>();
Attach<PermuteKernel_f_y_axes>();
}
KernelsData permute_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {

View File

@@ -40,6 +40,22 @@ INSTANTIATE_TEST_SUITE_P(smoke_Transpose,
testing::Values(ov::test::utils::DEVICE_GPU)),
TransposeLayerTest::getTestCaseName);
const std::vector<std::vector<ov::Shape>> levitShapes = {
{{1, 4, 196, 32}},
{{1, 196, 4, 32}},
{{1, 196, 4, 16}},
{{1, 196, 8, 64}},
{{1, 24, 196, 1}},
};
INSTANTIATE_TEST_SUITE_P(smoke_Transpose_Levit,
TransposeLayerTest,
testing::Combine(testing::Values(std::vector<size_t>{0, 2, 1, 3}),
testing::Values(ov::element::f16),
testing::ValuesIn(ov::test::static_shapes_to_test_representation(levitShapes)),
testing::Values(ov::test::utils::DEVICE_GPU)),
TransposeLayerTest::getTestCaseName);
/**
* 5D permute tests
*/

View File

@@ -171,7 +171,10 @@ public:
#define CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0 { 1, 8, 8, 2, 2 }, { 1, 8, 2, 8, 2 }, { 0, 1, 3, 4, 2 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
// permute_f_y_axes
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_0 { 1, 8, 4, 2 }, { 1, 2, 4, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_0 { 1, 8, 4, 16 }, { 1, 16, 4, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_1 { 1, 8, 1, 16 }, { 1, 16, 1, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_2 { 1, 8, 4, 16 }, { 1, 16, 4, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_3 { 1, 8, 1, 16 }, { 1, 16, 1, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
class permute_activation_scale_eltwise: public PermuteFusingTest {};
TEST_P(permute_activation_scale_eltwise, basic) {
@@ -271,6 +274,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_activation_scale_eltwise, ::testin
// Fusing tests for permute_f_y_axes
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_0, 2, 5 },
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_1, 2, 5 },
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_2, 2, 5 },
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_3, 2, 5 },
}));
class permute_quant_u8: public PermuteFusingTest {};