[GPU] Fix some bugs of cldnn eltwise kernel at bs_fs_yx_bsv32_fsv16 format. (#9198)

* Fix bugs in the clDNN eltwise kernels for the bs_fs_yx_bsv32_fsv16 format.

+ Add a condition so that the eltwise_simple_vload8 kernel is not selected when the tensor's batch and feature sizes are not multiples of the block sizes of the bsv32_fsv16 or bsv32_fsv32 layout.
+ Optimize gws/lws of eltwise_ref kernel for bsv32_fsv16 format.

* Check feature alignment of b_fs_yx_fsv32 for the eltwise vload8 kernel

+ Minor fix for OV_GPU_Help option
This commit is contained in:
Jade Cho 2021-12-22 11:43:04 +09:00 committed by GitHub
parent 63121e28ca
commit 7bcca1b82d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 31 additions and 18 deletions

View File

@ -624,11 +624,11 @@ EltwiseKernelBase::DispatchData EltwiseKernelBase::SetDefault(const eltwise_para
}
} else if (params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
(params.output.Feature().v % 16 != 0 || dispatchData.gws[1] % 16 != 0)) {
auto bs_fsv16_local = GetLimitedOptimalLocalWorkGroupSizes({dispatchData.gws[1], dispatchData.gws[2], dispatchData.gws[0]},
params.engineInfo, {16, 32, params.engineInfo.maxWorkGroupSize});
dispatchData.lws[0] = bs_fsv16_local[2];
dispatchData.lws[1] = bs_fsv16_local[0];
dispatchData.lws[2] = bs_fsv16_local[1];
auto bs_fsv16_local = GetLimitedOptimalLocalWorkGroupSizes({dispatchData.gws[2], dispatchData.gws[0], dispatchData.gws[1]},
params.engineInfo, {32 * 16, 1024, 1024});
dispatchData.lws[0] = bs_fsv16_local[1];
dispatchData.lws[1] = bs_fsv16_local[2];
dispatchData.lws[2] = bs_fsv16_local[0];
} else {
dispatchData.lws[0] = local[0];
dispatchData.lws[1] = local[1];

View File

@ -32,18 +32,29 @@ bool EltwiseKernel_vload8::Validate(const Params& params, const optional_params&
const auto& ewParams = static_cast<const eltwise_params&>(params);
for (size_t i = 0; i < ewParams.inputs.size(); i++) {
if ((ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
(ewParams.inputs[i].GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
(ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv4 && ewParams.inputs[i].Feature().v % 8 != 0) ||
ewParams.inputs[i].GetLayout() == DataLayout::fs_b_yx_fsv32)
for (size_t i = 0; i < ewParams.inputs.size(); i++) {
const auto input_layout = ewParams.inputs[i].GetLayout();
const auto batch_size = ewParams.inputs[i].Batch().v;
const auto feature_size = ewParams.inputs[i].Feature().v;
if ((input_layout == DataLayout::b_fs_yx_fsv16 && feature_size % 16 != 0) ||
(input_layout == DataLayout::b_fs_yx_fsv32 && feature_size % 32 != 0) ||
(input_layout == DataLayout::b_fs_zyx_fsv16 && feature_size % 16 != 0) ||
(input_layout == DataLayout::b_fs_yx_fsv4 && feature_size % 8 != 0) ||
input_layout == DataLayout::fs_b_yx_fsv32 ||
(input_layout == DataLayout::bs_fs_yx_bsv32_fsv16 && (feature_size % 16 != 0 || batch_size % 32 != 0)) ||
(input_layout == DataLayout::bs_fs_yx_bsv32_fsv32 && (feature_size % 32 != 0 || batch_size % 32 != 0)))
return false;
}
if ((ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv32 && ewParams.output.Feature().v % 32 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv4 && ewParams.output.Feature().v % 8 != 0) ||
ewParams.output.GetLayout() == DataLayout::fs_b_yx_fsv32 ||
(ewParams.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
(ewParams.output.Feature().v % 16 != 0 || ewParams.output.Batch().v % 32 != 0)) ||
(ewParams.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 &&
(ewParams.output.Feature().v % 32 != 0 || ewParams.output.Batch().v % 32 != 0)))
return false;
}
if ((ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv4 && ewParams.output.Feature().v % 8 != 0) ||
ewParams.output.GetLayout() == DataLayout::fs_b_yx_fsv32)
return false;
const auto& output = ewParams.output;
const auto count = output.PhysicalSize();

View File

@ -425,7 +425,9 @@ bool CheckInputsOutputNoPitchSameDims(const base_params& params) {
{DataLayout::bs_f_bsv16__af8, {16, 8}},
{DataLayout::b_fs_yx_fsv4, {1, 4}},
{DataLayout::fs_b_yx_fsv32, {1, 32}},
{DataLayout::b_fs_yx_32fp, {1, 32}}
{DataLayout::b_fs_yx_32fp, {1, 32}},
{DataLayout::bs_fs_yx_bsv32_fsv16, {32, 16}},
{DataLayout::bs_fs_yx_bsv32_fsv32, {32, 32}}
};
if (params.inputs.size()) {

View File

@ -123,7 +123,7 @@ static void print_help_messages() {
GPU_DEBUG_COUT << "Supported environment variables for debugging" << std::endl;
for (auto& p : message_list) {
GPU_DEBUG_COUT << " - " << std::left << std::setw(name_width) << p.first + ": " << p.second << std::endl;
GPU_DEBUG_COUT << " - " << std::left << std::setw(name_width) << p.first + " " << p.second << std::endl;
}
}