[GPU] Fix accuracy problems brought by permute_f_y kernel (#20241)
* Add test cases from Levit model * Enable PermuteKernel_f_y_axes kernel * Fix permute of y and b for planar layout where x is not 1 * Enable fusing in permute_kernel_f_y_axes * Add tests for fusing in permute_kernel_f_y_axes --------- Co-authored-by: Wilson Seok <wilson.seok@intel.com>
This commit is contained in:
@@ -25,21 +25,22 @@ KERNEL (permute_f_y_axes)(
|
||||
#endif
|
||||
)
|
||||
{
|
||||
const int bf = get_global_id(2);
|
||||
const int f_idx = bf % INPUT0_FEATURE_NUM;
|
||||
const int b_idx = bf / INPUT0_FEATURE_NUM;
|
||||
const int x_start = get_global_id(0) * BLOCK_SIZE;
|
||||
const int y_idx = get_global_id(1);
|
||||
const int f_idx = get_global_id(2);
|
||||
const int b_idx = get_global_id(1);;
|
||||
const int y_idx = get_global_id(0);
|
||||
|
||||
__attribute__((opencl_unroll_hint(J_TIMES)))
|
||||
for (int j = 0; j < J_TIMES; ++j) {
|
||||
const int x_idx = x_start + j * VEC_SIZE;
|
||||
IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
|
||||
for (int x_idx = 0; x_idx < J_TIMES * VEC_SIZE; x_idx+=VEC_SIZE) {
|
||||
IN_VEC_TYPE resv = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
|
||||
#if HAS_FUSED_OPS
|
||||
FUSED_OPS_VEC;
|
||||
OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
|
||||
OUT_VEC_TYPE result;
|
||||
for (int i = 0; i < VEC_SIZE; i++)
|
||||
{
|
||||
INPUT0_TYPE res = resv[i];
|
||||
FUSED_OPS;
|
||||
result[i] = FUSED_OPS_RESULT;
|
||||
}
|
||||
#else
|
||||
OUT_VEC_TYPE result = ACTIVATION(res, ACTIVATION_PARAMS);
|
||||
OUT_VEC_TYPE result = ACTIVATION(resv, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f_idx, x_idx);
|
||||
WRITE_VEC(result, 0, &output[output_idx]);
|
||||
@@ -132,18 +133,16 @@ KERNEL (permute_f_y_axes)(
|
||||
for (int j = 0; j < J_TIMES; ++j) {
|
||||
const int j_vec = j * VEC_SIZE;
|
||||
const int y_idx = y_begin + j_vec;
|
||||
IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
|
||||
IN_VEC_TYPE resv = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
|
||||
for (int k = 0; k < VEC_SIZE; ++k) {
|
||||
#if HAS_FUSED_OPS
|
||||
FUSED_OPS_VEC;
|
||||
OUT_VEC_TYPE result = FUSED_OPS_RESULT_VEC;
|
||||
for (int k = 0; k < VEC_SIZE; ++k) {
|
||||
transpose_buf[j_vec + k][bf_local] = result[k];
|
||||
}
|
||||
INPUT0_TYPE res = resv[k];
|
||||
FUSED_OPS
|
||||
transpose_buf[j_vec + k][bf_local] = FUSED_OPS_RESULT;
|
||||
#else
|
||||
for (int k = 0; k < VEC_SIZE; ++k) {
|
||||
transpose_buf[j_vec + k][bf_local] = ACTIVATION(res[k], ACTIVATION_PARAMS);
|
||||
}
|
||||
transpose_buf[j_vec + k][bf_local] = ACTIVATION(resv[k], ACTIVATION_PARAMS);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint(J_TIMES)))
|
||||
|
||||
@@ -128,29 +128,46 @@ JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& param
|
||||
}
|
||||
}
|
||||
|
||||
const size_t tile_width = GetTileWidth(params);
|
||||
const size_t vector_size = std::min(tile_width, static_cast<size_t>(4));
|
||||
const size_t tile_size = GetTileSize(params);
|
||||
const size_t j_times = tile_size / vector_size;
|
||||
const size_t feature_block_size = GetFeatureBlockSize(params);
|
||||
jit.AddConstant(MakeJitConstant("BLOCK_SIZE", tile_width));
|
||||
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
|
||||
jit.AddConstant(MakeJitConstant("J_TIMES", j_times));
|
||||
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
|
||||
jit.AddConstant(MakeJitConstant("FEATURE_BLOCK_SIZE", feature_block_size));
|
||||
|
||||
const auto layout = params.inputs.front().GetLayout();
|
||||
if (!SimpleLayout(layout)) {
|
||||
const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
|
||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
|
||||
std::size_t vector_size;
|
||||
if (IsSimpleMemCopyOperation(params)) {
|
||||
vector_size = GetDivisor(params.inputs[0].X().v);
|
||||
auto j_times = params.inputs[0].X().v / vector_size;
|
||||
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
|
||||
jit.AddConstant(MakeJitConstant("J_TIMES", j_times));
|
||||
} else {
|
||||
const size_t tile_width = GetTileWidth(params);
|
||||
vector_size = std::min(tile_width, static_cast<size_t>(4));
|
||||
const size_t tile_size = GetTileSize(params);
|
||||
const size_t j_times = tile_size / vector_size;
|
||||
const size_t feature_block_size = GetFeatureBlockSize(params);
|
||||
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
|
||||
jit.AddConstant(MakeJitConstant("J_TIMES", j_times));
|
||||
jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
|
||||
jit.AddConstant(MakeJitConstant("FEATURE_BLOCK_SIZE", feature_block_size));
|
||||
if (!SimpleLayout(layout)) {
|
||||
const auto subgroup_size = Is3DTranspose(params) ? feature_block_size : tile_size;
|
||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subgroup_size));
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.fused_ops.empty()) {
|
||||
const std::vector<std::string> original_output_order = {"b_idx", "f_idx", "y_idx", "x_idx"};
|
||||
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
|
||||
const FusedOpsConfiguration conf_vec = {"_VEC", original_output_order, "res", params.inputs[0].GetDType(), vector_size};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar, conf_vec}));
|
||||
if (IsSimpleMemCopyOperation(params)) {
|
||||
const std::vector<std::string> original_output_order = {"b_idx", "y_idx", "f_idx", "(x_idx+i)"};
|
||||
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
|
||||
} else if (SimpleLayout(layout)) {
|
||||
const std::vector<std::string> original_output_order = {"b_idx", "(y_idx+k)", "f_idx", "x_idx"};
|
||||
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
|
||||
} else {
|
||||
const std::vector<std::string> original_output_order = {"b_idx", "y_idx", "f_idx", "x_idx"};
|
||||
const FusedOpsConfiguration conf_scalar = {"", original_output_order, "res", params.inputs[0].GetDType(), 1};
|
||||
const FusedOpsConfiguration conf_vec = {"_VEC", original_output_order, "res", params.inputs[0].GetDType(), vector_size};
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar, conf_vec}));
|
||||
}
|
||||
}
|
||||
|
||||
return jit;
|
||||
}
|
||||
|
||||
@@ -158,14 +175,12 @@ static inline std::vector<size_t> GetGWS(const permute_params& params) {
|
||||
const auto& in = params.inputs[0];
|
||||
std::vector<size_t> gws;
|
||||
auto block_size = IsSimpleMemCopyOperation(params) ? GetTileWidth(params) : GetTileSize(params);
|
||||
if (params.inputs[0].X().v == 1) {
|
||||
if (IsSimpleMemCopyOperation(params)) {
|
||||
return {in.Y().v, in.Batch().v, in.Feature().v};
|
||||
} else if (Is3DTranspose(params)) {
|
||||
gws = {in.X().v / block_size, in.Y().v / GetFeatureBlockSize(params), (in.Batch().v * in.Feature().v)};
|
||||
} else { // the case where x is 1
|
||||
gws = {in.X().v, in.Y().v / block_size, (in.Batch().v * in.Feature().v)};
|
||||
} else {
|
||||
if (Is3DTranspose(params)) {
|
||||
gws = {in.X().v / block_size, in.Y().v / GetFeatureBlockSize(params), (in.Batch().v * in.Feature().v)};
|
||||
} else {
|
||||
gws = {in.X().v / block_size, in.Y().v, (in.Batch().v * in.Feature().v)};
|
||||
}
|
||||
}
|
||||
return gws;
|
||||
}
|
||||
@@ -177,9 +192,9 @@ CommonDispatchData PermuteKernel_f_y_axes::SetDefault(const permute_params& para
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.outputs[0].GetLayout();
|
||||
const std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {
|
||||
{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
{Tensor::DataChannelName::BATCH},
|
||||
{Tensor::DataChannelName::FEATURE}};
|
||||
dispatchData.lws =
|
||||
GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
} else if (Is3DTranspose(params)) {
|
||||
|
||||
@@ -23,7 +23,11 @@ protected:
|
||||
JitConstants GetJitConstants(const permute_params& params, const CommonDispatchData& dispatchData) const override;
|
||||
CommonDispatchData SetDefault(const permute_params& params) const override;
|
||||
std::vector<FusedOpType> GetSupportedFusedOps() const override {
|
||||
return {FusedOpType::ACTIVATION, FusedOpType::QUANTIZE, FusedOpType::ELTWISE};
|
||||
return {
|
||||
FusedOpType::ACTIVATION,
|
||||
FusedOpType::QUANTIZE,
|
||||
FusedOpType::ELTWISE
|
||||
};
|
||||
}
|
||||
};
|
||||
} // namespace kernel_selector
|
||||
|
||||
@@ -16,7 +16,7 @@ permute_kernel_selector::permute_kernel_selector() {
|
||||
Attach<PermuteKernel_tile_8x8_4x4>();
|
||||
Attach<PermuteKernel_tile_8x8_4x4_fsv>();
|
||||
Attach<PermuteKernel_bfzyx_to_bfyxz>();
|
||||
// Attach<PermuteKernel_f_y_axes>();
|
||||
Attach<PermuteKernel_f_y_axes>();
|
||||
}
|
||||
|
||||
KernelsData permute_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
|
||||
|
||||
@@ -40,6 +40,22 @@ INSTANTIATE_TEST_SUITE_P(smoke_Transpose,
|
||||
testing::Values(ov::test::utils::DEVICE_GPU)),
|
||||
TransposeLayerTest::getTestCaseName);
|
||||
|
||||
const std::vector<std::vector<ov::Shape>> levitShapes = {
|
||||
{{1, 4, 196, 32}},
|
||||
{{1, 196, 4, 32}},
|
||||
{{1, 196, 4, 16}},
|
||||
{{1, 196, 8, 64}},
|
||||
{{1, 24, 196, 1}},
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_Transpose_Levit,
|
||||
TransposeLayerTest,
|
||||
testing::Combine(testing::Values(std::vector<size_t>{0, 2, 1, 3}),
|
||||
testing::Values(ov::element::f16),
|
||||
testing::ValuesIn(ov::test::static_shapes_to_test_representation(levitShapes)),
|
||||
testing::Values(ov::test::utils::DEVICE_GPU)),
|
||||
TransposeLayerTest::getTestCaseName);
|
||||
|
||||
/**
|
||||
* 5D permute tests
|
||||
*/
|
||||
|
||||
@@ -171,7 +171,10 @@ public:
|
||||
#define CASE_PERMUTE_TILE_BFZYX_TO_BFYXZ_0 { 1, 8, 8, 2, 2 }, { 1, 8, 2, 8, 2 }, { 0, 1, 3, 4, 2 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
|
||||
|
||||
// permute_f_y_axes
|
||||
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_0 { 1, 8, 4, 2 }, { 1, 2, 4, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
|
||||
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_0 { 1, 8, 4, 16 }, { 1, 16, 4, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
|
||||
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_1 { 1, 8, 1, 16 }, { 1, 16, 1, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
|
||||
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_2 { 1, 8, 4, 16 }, { 1, 16, 4, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
|
||||
#define CASE_PERMUTE_TILE_BFYX_TO_BYFX_3 { 1, 8, 1, 16 }, { 1, 16, 1, 8 }, { 0, 2, 1, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16
|
||||
|
||||
class permute_activation_scale_eltwise: public PermuteFusingTest {};
|
||||
TEST_P(permute_activation_scale_eltwise, basic) {
|
||||
@@ -271,6 +274,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_activation_scale_eltwise, ::testin
|
||||
|
||||
// Fusing tests for permute_f_y_axes
|
||||
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_0, 2, 5 },
|
||||
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_1, 2, 5 },
|
||||
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_2, 2, 5 },
|
||||
permute_params{ CASE_PERMUTE_TILE_BFYX_TO_BYFX_3, 2, 5 },
|
||||
}));
|
||||
|
||||
class permute_quant_u8: public PermuteFusingTest {};
|
||||
|
||||
Reference in New Issue
Block a user