From 30adf04d31f5a701c41c5a9d19a3b89b765671c3 Mon Sep 17 00:00:00 2001 From: Kelvin Choi Date: Fri, 3 Sep 2021 02:02:16 +0900 Subject: [PATCH] [GPU] Fuse reorder to convolution (#6396) --- .../convolution_kernel_b_fs_yx_fsv16.cpp | 35 ++- .../convolution_kernel_b_fs_yx_fsv16.h | 7 +- .../convolution/convolution_kernel_ref.h | 7 +- .../reorder/reorder_kernel_base.h | 11 + .../cl_kernels/convolution_gpu_bfyx_f16.cl | 136 ++++++++--- .../kernel_selector/core/common/jitter.cpp | 4 +- .../kernel_selector/core/kernel_base.cpp | 7 + .../core/kernel_selector_params.h | 9 +- .../remove_redundant_reorders.cpp | 97 +++++++- .../clDNN/src/include/program_node.h | 1 + .../clDNN/src/include/reorder_inst.h | 10 + .../tests/test_cases/convolution_gpu_test.cpp | 216 ++++++++++++++++++ .../tests/test_cases/fusings_gpu_test.cpp | 70 +++++- 13 files changed, 552 insertions(+), 58 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp index 14fb4322d2b..b89a45bd527 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp @@ -4,6 +4,7 @@ #include "convolution_kernel_b_fs_yx_fsv16.h" #include "kernel_selector_utils.h" +#include "reorder/reorder_kernel_base.h" #include #include @@ -95,6 +96,8 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16::GetSupportedKey() const { k.EnableInputLayout(DataLayout::b_fs_yx_fsv16); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableDilation(); @@ -176,12 +179,28 @@ bool ConvolutionKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_p if 
(input.Feature().pad.before % tuning_data.feature_block_size != 0 || output.Feature().pad.before % tuning_data.feature_block_size != 0) return false; + // Not supporting batch padding for different format (reorder-fused case) + if (input.GetLayout() == DataLayout::b_fs_yx_fsv16 && output.GetLayout() == DataLayout::bfyx) { + if (output.Batch().pad.before != 0 || output.Batch().pad.after != 0) + return false; + } + if (!params.bias.empty() && params.bias[0].GetDType() != input.GetDType()) return false; return true; } +bool post_reorder_fused(const convolution_params& params) { + if (!params.fused_ops.empty()) { + if (params.fused_ops.back().GetType() == KernelType::REORDER) { + return true; + } + } + + return false; +} + JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const { auto input = params.inputs[0]; @@ -190,8 +209,18 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_ ConvolutionTuningData tuning_data = GetTuningParams(params); + if (post_reorder_fused(params) && + input.GetLayout() == DataLayout::b_fs_yx_fsv16 && + output.GetLayout() == DataLayout::bfyx) { + jit.AddConstant(MakeJitConstant("OUTPUT_FORMAT_BFYX", 1)); + } + auto blockWidth = dispatchData.cldnnStyle.blockWidth; if (!params.fused_ops.empty()) { + DataLayout orig_output_layout = output.GetLayout(); + if (post_reorder_fused(params)) { + orig_output_layout = params.fused_ops.back().GetOpParams()->input_layout; + } auto input_dt = GetActivationType(params); FusedOpsConfiguration conf_vec = { "_VEC", {"b", "(feature_block * 16)", "y", "x"}, @@ -201,7 +230,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_ LoadType::LT_ALIGNED_READ, BoundaryCheck::ENABLED, IndexType::TENSOR_COORD, - Tensor::DataChannelName::X }; + Tensor::DataChannelName::X, + {}, false, "", orig_output_layout }; FusedOpsConfiguration conf_scalar = { "_SCALAR", {"b", "(feature_block * 
16)", "y", "(x + i)"}, "dst[i]", @@ -210,7 +240,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_ LoadType::LT_ALIGNED_READ, BoundaryCheck::ENABLED, IndexType::TENSOR_COORD, - Tensor::DataChannelName::X }; + Tensor::DataChannelName::X, + {}, false, "", orig_output_layout }; jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar})); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.h index e22aca5dc44..6c9ac8b146a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.h @@ -30,10 +30,15 @@ protected: return (p.groups > 1) ? WeightsLayout::g_os_is_yx_isv16_osv16 : WeightsLayout::os_is_yx_isv16_osv16; } std::vector GetSupportedFusedOps() const override { + // FusedOpType::REORDER should be registered explicitly here + // only when fused_primitive_desc for reorder is added by optimization passes (e.g., remove_redundant_reorder) for corresponding primitive. + // The typical usage for fused_primitive_desc for convolution is to get original output layout from jitter, + // so that it can decide whether to fuse eltwise along with reorder. 
return { FusedOpType::ELTWISE, FusedOpType::QUANTIZE, FusedOpType::SCALE, - FusedOpType::ACTIVATION }; + FusedOpType::ACTIVATION, + FusedOpType::REORDER }; } bool NeedPaddedInput() const override { return false; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_ref.h index a07032c6e10..d17e48eb41c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_ref.h @@ -27,10 +27,15 @@ protected: return (params.groups > 1) ? WeightsLayout::goizyx : WeightsLayout::oizyx; } std::vector GetSupportedFusedOps() const override { + // FusedOpType::REORDER should be registered explicitly here + // only when fused_primitive_desc for reorder is added by optimization passes (e.g., remove_redundant_reorder) for corresponding primitive. + // The typical usage for fused_primitive_desc for convolution is to get original output layout from jitter, + // so that it can decide whether to fuse eltwise along with reorder. 
return { FusedOpType::ELTWISE, FusedOpType::QUANTIZE, FusedOpType::SCALE, - FusedOpType::ACTIVATION }; + FusedOpType::ACTIVATION, + FusedOpType::REORDER }; } JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.h index e75e85c3880..eca73fec479 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.h @@ -43,6 +43,17 @@ struct reorder_optional_params : optional_params { reorder_optional_params() : optional_params(KernelType::REORDER) {} }; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// reorder_fuse_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct reorder_fuse_params : fuse_params { + DataLayout input_layout; + DataLayout output_layout; + + reorder_fuse_params(DataLayout input_layout, DataLayout output_layout) : + fuse_params(KernelType::REORDER), input_layout(input_layout), output_layout(output_layout) {} +}; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // reorder_weights_params //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl index 149c00edacb..212f4bc04dc 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl @@ -41,24 +41,30 @@ # error convolution_gpu_bfyx_f16.cl: unsupported filter type #endif -#if OUTPUT_TYPE_SIZE == 1 -# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val)) -# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITE_UC_2((__global uchar*)(ptr) + (offset), as_uchar2(val)) -# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr) + (offset), as_uchar4(val)) -# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val)) -#elif OUTPUT_TYPE_SIZE == 2 -# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val)) -# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write_us2((__global ushort*)(ptr) + (offset), as_ushort2(val)) -# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write_us4((__global ushort*)(ptr) + (offset), as_ushort4(val)) -# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val)) -#elif OUTPUT_TYPE_SIZE == 4 -# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val)) -# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write2((__global uint*)(ptr) + (offset), as_uint2(val)) -# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val)) -# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val)) +#if OUTPUT_FORMAT_BFYX +# define OUTPUTVTYPE(n) CAT(OUTPUT_TYPE, n) +# define TO_OUTPUTVTYPE CAT(convert_, 
OUTPUTVTYPE(OUTPUT_X_BLOCK_SIZE)) +# define VSTORE CAT(vstore, OUTPUT_X_BLOCK_SIZE) #else -# error convolution_gpu_bfyx_f16.cl: unsupported output type -#endif +# if OUTPUT_TYPE_SIZE == 1 +# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val)) +# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITE_UC_2((__global uchar*)(ptr) + (offset), as_uchar2(val)) +# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr) + (offset), as_uchar4(val)) +# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val)) +# elif OUTPUT_TYPE_SIZE == 2 +# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val)) +# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write_us2((__global ushort*)(ptr) + (offset), as_ushort2(val)) +# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write_us4((__global ushort*)(ptr) + (offset), as_ushort4(val)) +# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val)) +# elif OUTPUT_TYPE_SIZE == 4 +# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val)) +# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write2((__global uint*)(ptr) + (offset), as_uint2(val)) +# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val)) +# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val)) +# else +# error convolution_gpu_bfyx_f16.cl: unsupported output type +# endif +#endif // OUTPUT_FORMAT_BFYX #if INPUT0_TYPE_SIZE == 2 # define AS_INPUT_SRC CAT(as_, MAKE_VECTOR_TYPE(INPUT_TYPE, OUTPUT_X_BLOCK_SIZE)) @@ -129,18 +135,30 @@ 
KERNEL(convolution_bfyx_f16)( (INPUT0_PAD_BEFORE_SIZE_X + input_x) * input_x_pitch; // Output offset calculations: - const uint output_x_pitch = FEATURE_SLICE_SIZE; - const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X); - const uint output_total_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM; - const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y); - const uint output_b_pitch = output_fs_pitch * ((output_total_f_size + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE); +#if OUTPUT_FORMAT_BFYX + const uint output_y_pitch = (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X); + const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y); + const uint output_b_pitch = output_fs_pitch * (OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM); + + const uint output_offset = b * output_b_pitch + + feature_block * (output_fs_pitch * FEATURE_SLICE_SIZE) + + (sglid + OUTPUT_PAD_BEFORE_FEATURE_NUM) * output_fs_pitch + + (y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch + + (x + OUTPUT_PAD_BEFORE_SIZE_X); +#else + const uint output_x_pitch = FEATURE_SLICE_SIZE; + const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X); + const uint output_total_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM; + const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y); + const uint output_b_pitch = output_fs_pitch * ((output_total_f_size + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE); const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE; const uint output_offset = b * output_b_pitch + (feature_block + output_fs_pad_before) * output_fs_pitch + (y + 
OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch + (x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch; +#endif // Filter offset calculations: const uint filter_isv_pitch = FEATURE_SLICE_SIZE; @@ -383,15 +401,27 @@ KERNEL(convolution_bfyx_f16)( #if OUTPUT_LEFTOVERS if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) { for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) { + #if HAS_FUSED_OPS FUSED_OPS_SCALAR; +# if OUTPUT_FORMAT_BFYX + res[i] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT_SCALAR); +# else res[i] = FUSED_OPS_RESULT_SCALAR; +# endif #else res[i] = TO_OUTPUT_TYPE(dst[i]); #endif + +#if OUTPUT_FORMAT_BFYX + if ((feature_block * FEATURE_SLICE_SIZE + sglid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) { + output[output_offset + i] = res[i]; + } +#else if ((feature_block * FEATURE_SLICE_SIZE + sglid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) { output[output_offset + i * output_x_pitch + sglid] = res[i]; } +#endif } } else @@ -400,35 +430,61 @@ KERNEL(convolution_bfyx_f16)( if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0) { #if HAS_FUSED_OPS FUSED_OPS_VEC; +# if OUTPUT_FORMAT_BFYX + res = TO_OUTPUTVTYPE(FUSED_OPS_RESULT_VEC); +# else res = FUSED_OPS_RESULT_VEC; +# endif #else +# if OUTPUT_FORMAT_BFYX + res = TO_OUTPUTVTYPE(dst); +# else res = dst; +# endif #endif // TODO Generalize for other block sizes -#if OUTPUT_X_BLOCK_SIZE == 8 - OUTPUT_BLOCK_WRITE8(output, output_offset, res); -#elif OUTPUT_X_BLOCK_SIZE == 4 - OUTPUT_BLOCK_WRITE4(output, output_offset, res); -#elif OUTPUT_X_BLOCK_SIZE == 2 - OUTPUT_BLOCK_WRITE2(output, output_offset, res); -#elif OUTPUT_X_BLOCK_SIZE == 1 - OUTPUT_BLOCK_WRITE(output, output_offset, res); +#if OUTPUT_FORMAT_BFYX + #if OUTPUT_X_BLOCK_SIZE == 2 || OUTPUT_X_BLOCK_SIZE == 4 || OUTPUT_X_BLOCK_SIZE == 8 + VSTORE(res, 0, output + output_offset); + #elif OUTPUT_X_BLOCK_SIZE == 1 + output[output_offset] = res[0]; + #else + # error convolution_gpu_bfyx_f16.cl: unsupported output x block size 
+ #endif #else -# error convolution_gpu_bfyx_f16.cl: unsupported output x block size -#endif + #if OUTPUT_X_BLOCK_SIZE == 8 + OUTPUT_BLOCK_WRITE8(output, output_offset, res); + #elif OUTPUT_X_BLOCK_SIZE == 4 + OUTPUT_BLOCK_WRITE4(output, output_offset, res); + #elif OUTPUT_X_BLOCK_SIZE == 2 + OUTPUT_BLOCK_WRITE2(output, output_offset, res); + #elif OUTPUT_X_BLOCK_SIZE == 1 + OUTPUT_BLOCK_WRITE(output, output_offset, res); + #else + # error convolution_gpu_bfyx_f16.cl: unsupported output x block size + #endif +#endif // OUTPUT_FORMAT_BFYX } else { for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) { #if HAS_FUSED_OPS FUSED_OPS_SCALAR; +# if OUTPUT_FORMAT_BFYX + res[i] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT_SCALAR); +# else res[i] = FUSED_OPS_RESULT_SCALAR; +# endif #else res[i] = TO_OUTPUT_TYPE(dst[i]); #endif + +#if OUTPUT_FORMAT_BFYX + output[output_offset + i] = res[i]; +#else OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]); +#endif } } } - #if SLM_DIV_FACTOR > 1 } #endif @@ -462,7 +518,13 @@ KERNEL(convolution_bfyx_f16)( #undef FILTER_BLOCK_READ8 -#undef OUTPUT_BLOCK_WRITE -#undef OUTPUT_BLOCK_WRITE2 -#undef OUTPUT_BLOCK_WRITE4 -#undef OUTPUT_BLOCK_WRITE8 +#if OUTPUT_FORMAT_BFYX +# undef OUTPUTVTYPE +# undef TO_OUTPUTVTYPE +# undef VSTORE +#else +# undef OUTPUT_BLOCK_WRITE +# undef OUTPUT_BLOCK_WRITE2 +# undef OUTPUT_BLOCK_WRITE4 +# undef OUTPUT_BLOCK_WRITE8 +#endif // OUTPUT_FORMAT_BFYX diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp index e4fb9d8055b..813c375bf2a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp @@ -1741,8 +1741,10 @@ std::string FusedOpsCodeGenerator::GetJitLoad(const FusedOpsConfiguration& conf, // Eltwise fused op can't have full tensor argument when requested vec_size > 1, since it might 
require // splitting load into several parts and some kind of index recalculation which is not supported + DataLayout orig_output_layout = conf.IsPostReorderFused() ? conf.orig_output_layout : prim_output.GetLayout(); + if (desc.GetType() == KernelType::ELTWISE && !valid_broadcast_case && - input_tensor.GetLayout() != prim_output.GetLayout() && conf.vec_size > 1) { + input_tensor.GetLayout() != orig_output_layout && conf.vec_size > 1) { throw std::runtime_error("[clDNN] Mixed layouts of input tensors are not supported in fused eltwise:" "\nfused_input: " + toString_v2(input_tensor) + "\noutput: " + toString_v2(prim_output)); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp index 41d067d1f62..7b977147465 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp @@ -108,6 +108,9 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa if (conf.empty()) return jit; + if (params.fused_ops.size() == 1 && params.fused_ops[0].GetType() == KernelType::REORDER) + return jit; + try { for (auto& c : conf) { std::string fused_ops; @@ -119,6 +122,10 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa bool can_all_use_preload = true; for (size_t i = 0; i < params.fused_ops.size(); i++) { + // Reorder is not processed by jitter + if (params.fused_ops[i].GetType() == FusedOpType::REORDER) + continue; + auto fused_dep_codegen = FusedOpsCodeGenerator(params.fused_ops[i]); jit.Merge(fused_dep_codegen.MakeLoadJitConstants(c, params.output)); jit.Merge(fused_dep_codegen.MakeOpJitConstants(c, in_name, in_type, out_name)); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h index 
4638de3fbe9..b68054dad7c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h @@ -469,6 +469,8 @@ struct FusedOpsConfiguration { bool allow_for_partial_preload; // Load index for shuffle fused op std::string shuffle_var_name; + // Record original output layout before reorder is fused + DataLayout orig_output_layout; FusedOpsConfiguration(std::string suffix, std::vector bfzyx_idx_order, @@ -481,7 +483,8 @@ struct FusedOpsConfiguration { Tensor::DataChannelName vec_axis = Tensor::DataChannelName::COUNT, std::vector loop_axes = {}, bool allow_for_partial_preload = false, - std::string shuffle_var_name = "") + std::string shuffle_var_name = "", + DataLayout orig_output_layout = DataLayout::DataLayoutCount) : suffix(suffix) , bfzyx_idx_order(bfzyx_idx_order) , input_var_name(input_var_name) @@ -493,7 +496,8 @@ struct FusedOpsConfiguration { , index_type(index_type) , loop_axes(loop_axes) , allow_for_partial_preload(allow_for_partial_preload) - , shuffle_var_name(shuffle_var_name) { } + , shuffle_var_name(shuffle_var_name) + , orig_output_layout(orig_output_layout) { } FusedOpsConfiguration& SetVectorSize(size_t val) { vec_size = val; return *this; } FusedOpsConfiguration& SetLoadType(LoadType val) { load_type = val; return *this; } @@ -505,6 +509,7 @@ struct FusedOpsConfiguration { allow_for_partial_preload = partial_preload; return *this; } FusedOpsConfiguration& SetShuffleVarName(std::string val) { shuffle_var_name = val; return *this; } + bool IsPostReorderFused(void) const { return orig_output_layout != DataLayout::DataLayoutCount; } }; // Instance of fused_operation_desc is added to fused_ops vector if a node has been fused to current one using program::fuse_nodes diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp 
b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp index fc7ce14f260..e40be287243 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp @@ -334,26 +334,24 @@ void remove_redundant_reorders::run(program& p) { p.remove_if_dangling(node); } - // This pass removes reorder for Convolution BFYX -> FS_B_YX_FSV32 - itr = p.get_processing_order().begin(); - while (itr != p.get_processing_order().end()) { - auto& node = *itr++; - if (!node->is_type() || !node->is_in_data_flow() || node->get_users().size() != 1 || node->get_dependencies().size() != 1) - continue; + // Remove reorder for Convolution bfyx -> fs_b_yx_fsv32 + auto try_fuse_reorder_bfyx_to_fsv32 = [&](reorder_node* node) { + if (node->get_users().size() != 1) + return; auto& usr = node->get_users().front(); auto& dep = node->get_dependency(0); if (!(usr->is_type()) || (usr->get_output_layout().data_type != dep.get_output_layout().data_type) || - (usr->get_output_layout().format != format::fs_b_yx_fsv32) || - (dep.get_output_layout().format != format::bfyx)) - continue; + (dep.get_output_layout().format != format::bfyx) || + (usr->get_output_layout().format != format::fs_b_yx_fsv32)) + return; if (dep.is_type()) - continue; + return; if (usr->as().get_primitive()->groups != 1) - continue; + return; dep.merge_output_padding(node->get_output_layout().data_padding); p.replace_all_usages(*node, dep); @@ -361,6 +359,83 @@ void remove_redundant_reorders::run(program& p) { p.add_optimized_primitive_info(node->id()); p.remove_all_connections(*node); p.remove_if_dangling(*node); + }; + + // Remove reorder for Convolution b_fs_yx_fsv16 -> bfyx + auto try_fuse_reorder_fsv16_to_bfyx = [&](reorder_node* node) { + if (!node->get_fused_activations_funcs().empty() || + !node->get_fused_primitives().empty()) + return; + + auto& input = node->input(); + + if 
(!(input.is_type()) || + !(input.get_output_layout().format == format::b_fs_yx_fsv16) || + !(node->get_output_layout().format == format::bfyx)) + return; + + if (input.as().get_primitive()->groups != 1) + return; + + if (input.get_users().size() != 1) + return; + + auto& input_dep = input.get_dependency(0); + if (input_dep.get_output_layout().format != format::b_fs_yx_fsv16 || + input_dep.get_output_layout().data_type == data_types::u8 || + input_dep.get_output_layout().data_type == data_types::i8) + return; + + for (auto& user : node->get_users()) { + // if concat is reorder's user and concat's axis is 0(Batch) or 1(Feature), conv's output would have padding. + // This padding might lead not to select the optimized conv kernel("convolution_gpu_bfyx_f16") + if (user->is_type()) { + auto& concat_node = user->as(); + auto concat_axis = concat_node.get_primitive()->axis; + if (concat_axis == 0 || concat_axis == 1) + return; + } + } + + auto output_layout = node->get_output_layout(); + input.set_output_layout(output_layout, false); + if (input.type()->does_possible_implementation_exist(input)) { + input.set_output_padding(node->get_output_layout().data_padding); + + // Add fused_primitive_desc of reorder to convolution which propagate original output layout to jitter + fused_primitive_desc local_desc; + local_desc.node = p.get_node_ptr(node->id()); + local_desc.dep_start_idx = input.get_fused_primitives().size(); + local_desc.output_layout = output_layout; + local_desc.input_layout = input.get_dependency(0).get_output_layout(); // original convolution's output layout + local_desc.activation = activation_func::none; + input.add_fused_primitive(local_desc); + node->set_input_layout(local_desc.input_layout); + + // remove reorder node + node->can_be_optimized(true); + p.add_optimized_primitive_info(node->id()); + p.extract_and_remove(*node); + } + }; + + if (enable_reorder_fusing) { + itr = p.get_processing_order().begin(); + while (itr != p.get_processing_order().end()) 
{ + auto& node = *itr++; + if (!node->is_type()) + continue; + + if (!node->is_in_data_flow() || node->get_dependencies().size() != 1) + continue; + + auto& r_node = node->as(); + + // Remove reorder for Convolution bfyx -> fs_b_yx_fsv32 + try_fuse_reorder_bfyx_to_fsv32(&r_node); + // Remove reorder for Convolution b_fs_yx_fsv16 -> bfyx + try_fuse_reorder_fsv16_to_bfyx(&r_node); + } } // Additional reshape chains shrink. diff --git a/inference-engine/thirdparty/clDNN/src/include/program_node.h b/inference-engine/thirdparty/clDNN/src/include/program_node.h index 2b38d85d966..7974e90aeb5 100644 --- a/inference-engine/thirdparty/clDNN/src/include/program_node.h +++ b/inference-engine/thirdparty/clDNN/src/include/program_node.h @@ -41,6 +41,7 @@ struct fused_primitive_desc { std::vector fused_deps; activation_func activation; activation_additional_params activation_params; + layout input_layout = layout(data_types::f32, format::bfyx, tensor()); layout output_layout = layout(data_types::f32, format::bfyx, tensor()); }; diff --git a/inference-engine/thirdparty/clDNN/src/include/reorder_inst.h b/inference-engine/thirdparty/clDNN/src/include/reorder_inst.h index 85d301e5b13..3589b9453f6 100644 --- a/inference-engine/thirdparty/clDNN/src/include/reorder_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/reorder_inst.h @@ -7,6 +7,8 @@ #include "cldnn/primitives/reorder.hpp" #include "primitive_inst.h" +#include "kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.h" +#include "kernel_selector/common/tensor_type.h" #include #include @@ -33,11 +35,19 @@ public: void requires_reinterpret(bool val) { req_reinterpr = (optimized && val); } void set_input_offset(tensor const& io) { input_offset = io; } + void set_input_layout(layout const& lo) { input_layout = lo; } tensor get_input_offset() const { return input_offset; } + std::shared_ptr get_fuse_params() const override { + kernel_selector::DataLayout ks_input_layout = 
convert_data_tensor(input_layout).GetLayout(); + kernel_selector::DataLayout ks_output_layout = convert_data_tensor(get_output_layout()).GetLayout(); + return std::make_shared(ks_input_layout, ks_output_layout); + } + private: bool req_reinterpr = false; tensor input_offset = tensor{0}; // used by reorder to winograd domain + layout input_layout = layout(data_types::f32, format::bfyx, { 0, 0, 0, 0 }); }; using reorder_node = typed_program_node; diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp index 11e79771a94..24cff72146b 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp @@ -7609,6 +7609,222 @@ TEST_P(convolution_general_gpu, conv_fp16_cases) { } } +struct convolution_gpu_fsv16_to_bfyx : public convolution_general_gpu {}; + +INSTANTIATE_TEST_SUITE_P(conv_b_fs_yx_fsv16_to_bfyx, + convolution_gpu_fsv16_to_bfyx, + ::testing::Values( + // Input X size, Input Y size, Input Z size, Input features, Output features, + // Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch, + // Input data format, Implementation name, WithBias + TestParamType_general_convolution_gpu(6, 6, 0, 16, 16, 3, 3, 0, 1, 1, 4, format::b_fs_yx_fsv16, "convolution_gpu_fsv16_to_bfyx", false), + TestParamType_general_convolution_gpu(6, 6, 0, 32, 32, 3, 3, 0, 1, 1, 1, format::b_fs_yx_fsv16, "convolution_gpu_fsv16_to_bfyx", false), + TestParamType_general_convolution_gpu(6, 6, 0, 16, 16, 3, 3, 0, 1, 1, 16, format::b_fs_yx_fsv16, "convolution_gpu_fsv16_to_bfyx", false), + TestParamType_general_convolution_gpu(16, 6, 0, 20, 16, 3, 3, 0, 1, 1, 20, format::b_fs_yx_fsv16, "convolution_gpu_fsv16_to_bfyx", false) + ), + convolution_gpu_fsv16_to_bfyx::PrintToStringParamName); + +TEST_P(convolution_gpu_fsv16_to_bfyx, conv_b_fs_yx_fsv16_to_bfyx_padding) +{ + 
auto& engine = get_test_engine(); + + if (!engine.get_device_info().supports_fp16) + { + std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl; + EXPECT_EQ(1, 1); + return; + } + + const int input_b = testing::get<10>(GetParam()); + const int input_f = testing::get<3>(GetParam()); + const int input_y = testing::get<1>(GetParam()); + const int input_x = testing::get<0>(GetParam()); + + const int filter_x = testing::get<5>(GetParam()); + const int filter_y = testing::get<6>(GetParam()); + const int stride = testing::get<9>(GetParam()); + + const int input_offset_y = (filter_y - 1) / 2; + const int input_offset_x = (filter_x - 1) / 2; + + auto input_size = tensor(input_b, input_f, input_x, input_y); + auto input_data = generate_random_4d(input_b, input_f, input_y, input_x, -1, 1); + auto input_data_bfyx = flatten_4d(format::bfyx, input_data); + auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size }); + set_values(input_mem, input_data_bfyx); + + auto weights_size = tensor(input_b, input_f, filter_x, filter_y, 1); + auto weights_data = generate_random_4d(input_b, input_f, filter_x, filter_y, -1, 1); + auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data); + auto weights_mem = engine.allocate_memory({ data_types::f16, format::goiyx, weights_size }); + set_values(weights_mem, weights_data_bfyx); + + // Set topology + topology topology( + input_layout("input_origin", input_mem->get_layout()), + data("weights_fsv", weights_mem), + reorder("input_fsv16", "input_origin", { data_types::f16, format::b_fs_yx_fsv16, input_size })); // format 3 to 8 + + // Add convolution + auto input_stride = tensor(1, 1, stride, stride); + auto input_offset = tensor(0, 0, input_offset_x, input_offset_y); + auto input_dilation = tensor(1, 1, 1, 1); + auto input_padding_before = tensor(0, 0, input_offset_x, input_offset_y); + auto input_padding_after = tensor(0, 0, input_offset_x, input_offset_y); + + auto conv_fsv = 
convolution("conv_fsv", "input_fsv16", { "weights_fsv" }, input_stride, input_offset, input_dilation, input_padding_before, input_padding_after);
+    conv_fsv.output_padding = padding({ 0, 32, 2, 2 }, 0.f);
+    topology.add(conv_fsv);    // format 8 to 8 -> after fusing, format 8 to 3
+
+    // Add reorder to bfyx
+    auto reorder_bfyx = reorder("reorder_bfyx", "conv_fsv", { data_types::f16, format::bfyx, input_size });
+    reorder_bfyx.output_padding = padding({ 0, 16, 1, 1 }, 0.f);
+    topology.add(reorder_bfyx);    // format 8 to 3 -> after fusing, removed
+
+    // Exec ref network (non-fusing)
+    build_options options_ref;
+    options_ref.set_option(build_option::optimize_data(false));
+    options_ref.set_option(build_option::allow_static_input_reorder(true));
+
+    network network_ref(engine, topology, options_ref);
+    network_ref.set_input_data("input_origin", input_mem);
+    auto ref_out = network_ref.execute();
+
+    auto ref_out_mem = ref_out.begin()->second.get_memory();
+    cldnn::mem_lock<FLOAT16> ref_out_ptr(ref_out_mem, get_test_stream());
+
+    // Exec target network (fusing: conv+reorder)
+    build_options options_target;
+    implementation_desc conv_impl = { format::b_fs_yx_fsv16, "convolution_gpu_bfyx_f16" };
+    options_target.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
+    options_target.set_option(build_option::optimize_data(true));
+
+    network network_target(engine, topology, options_target);
+    network_target.set_input_data("input_origin", input_mem);
+    auto target_out = network_target.execute();
+
+    auto target_out_mem = target_out.begin()->second.get_memory();
+    cldnn::mem_lock<FLOAT16> target_out_ptr(target_out_mem, get_test_stream());
+
+    // Compare ref and target result
+    for (size_t i = 0; i < ref_out_ptr.size(); i++) {
+        auto ref_val = static_cast<float>(ref_out_ptr[i]);
+        auto target_val = static_cast<float>(target_out_ptr[i]);
+        auto diff = std::fabs(ref_val - target_val);
+        auto equal = (diff > 1e-5f) ? 
false : true;
+
+        EXPECT_TRUE(equal);
+        if (!equal)
+        {
+            std::cout << "i:" << i \
+                << "\t ref_out = " << ref_val \
+                << "\t target_out = " << target_val \
+                << std::endl;
+
+            break;
+        }
+    }
+}
+
+TEST_P(convolution_gpu_fsv16_to_bfyx, conv_b_fs_yx_fsv16_to_bfyx_different_type)
+{
+    auto& engine = get_test_engine();
+
+    if (!engine.get_device_info().supports_fp16)
+    {
+        std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
+        EXPECT_EQ(1, 1);
+        return;
+    }
+
+    const int input_b = testing::get<10>(GetParam());
+    const int input_f = testing::get<3>(GetParam());
+    const int input_y = testing::get<1>(GetParam());
+    const int input_x = testing::get<0>(GetParam());
+
+    const int filter_x = testing::get<5>(GetParam());
+    const int filter_y = testing::get<6>(GetParam());
+    const int stride = testing::get<9>(GetParam());
+
+    const int input_offset_y = (filter_y - 1) / 2;
+    const int input_offset_x = (filter_x - 1) / 2;
+
+    auto input_size = tensor(input_b, input_f, input_x, input_y);
+    auto input_data = generate_random_4d<FLOAT16>(input_b, input_f, input_y, input_x, -1, 1);
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size });
+    set_values(input_mem, input_data_bfyx);
+
+    auto weights_size = tensor(input_b, input_f, filter_x, filter_y, 1);
+    auto weights_data = generate_random_4d<FLOAT16>(input_b, input_f, filter_x, filter_y, -1, 1);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+    auto weights_mem = engine.allocate_memory({ data_types::f16, format::goiyx, weights_size });
+    set_values(weights_mem, weights_data_bfyx);
+
+    // Set topology
+    topology topology(
+        input_layout("input_origin", input_mem->get_layout()),
+        data("weights_fsv", weights_mem),
+        reorder("input_fsv16", "input_origin", { data_types::f16, format::b_fs_yx_fsv16, input_size }));    // format 3 to 8
+
+    // Add convolution
+    auto input_stride = tensor(1, 1, stride, stride);
+    auto input_offset = tensor(0, 0, input_offset_x, input_offset_y);
+    auto input_dilation = tensor(1, 1, 1, 1);
+    auto no_padding = tensor(0, 0, input_offset_x, input_offset_y);
+
+    auto conv_fsv = convolution("conv_fsv", "input_fsv16", { "weights_fsv" }, input_stride, input_offset, input_dilation, no_padding, no_padding);
+    topology.add(conv_fsv);    // format 8 to 8 -> after fusing, format 8 to 3
+
+    // Add reorder to bfyx
+    auto reorder_bfyx = reorder("reorder_bfyx", "conv_fsv", { data_types::f32, format::bfyx, input_size });
+    topology.add(reorder_bfyx);    // format 8 to 3 -> after fusing, removed
+
+    // Exec ref network (non-fusing)
+    build_options options_ref;
+    options_ref.set_option(build_option::optimize_data(false));
+    options_ref.set_option(build_option::allow_static_input_reorder(true));
+
+    network network_ref(engine, topology, options_ref);
+    network_ref.set_input_data("input_origin", input_mem);
+    auto ref_out = network_ref.execute();
+
+    auto ref_out_mem = ref_out.begin()->second.get_memory();
+    cldnn::mem_lock<float> ref_out_ptr(ref_out_mem, get_test_stream());
+
+    // Exec target network (fusing: conv+reorder)
+    build_options options_target;
+    implementation_desc conv_impl = { format::b_fs_yx_fsv16, "convolution_gpu_bfyx_f16" };
+    options_target.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
+    options_target.set_option(build_option::optimize_data(true));
+
+    network network_target(engine, topology, options_target);
+    network_target.set_input_data("input_origin", input_mem);
+    auto target_out = network_target.execute();
+
+    auto target_out_mem = target_out.begin()->second.get_memory();
+    cldnn::mem_lock<float> target_out_ptr(target_out_mem, get_test_stream());
+
+    // Compare ref and target result
+    for (size_t i = 0; i < ref_out_ptr.size(); i++) {
+        auto ref_val = static_cast<float>(ref_out_ptr[i]);
+        auto target_val = static_cast<float>(target_out_ptr[i]);
+        auto diff = std::abs(ref_val - target_val);
+        auto equal = (diff > 1e-5f) ? 
false : true; + + EXPECT_TRUE(equal); + if (!equal) + { + std::cout << "i:" << i \ + << "\t ref_out = " << ref_val \ + << "\t target_out = " << target_val \ + << std::endl; + + break; + } + } +} + template class convolution_test_base { public: diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp index d683d117286..e2996495262 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp @@ -616,6 +616,73 @@ public: } }; +class conv_fp32_reorder_fsv16_to_bfyx : public ConvFusingTest {}; +TEST_P(conv_fp32_reorder_fsv16_to_bfyx, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32), + convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation), + reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32) + ); + + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_fsv16_to_bfyx, ::testing::ValuesIn(std::vector{ + bc_test_params{ CASE_CONV_FP32_1, 2, 2}, + bc_test_params{ CASE_CONV_FP32_2, 2, 2}, + bc_test_params{ CASE_CONV_FP32_3, 2, 2}, + bc_test_params{ CASE_CONV_FP32_4, 2, 2 }, + bc_test_params{ CASE_CONV_FP32_5, 2, 2 }, + bc_test_params{ CASE_CONV_FP32_14, 2, 2 }, + + bc_test_params{ CASE_CONV_FP16_1, 2, 2}, + bc_test_params{ CASE_CONV_FP16_2, 2, 2}, + bc_test_params{ CASE_CONV_FP16_3, 2, 2}, + bc_test_params{ CASE_CONV_FP16_4, 2, 2 }, + bc_test_params{ CASE_CONV_FP16_5, 2, 2 }, + bc_test_params{ CASE_CONV_FP16_13, 2, 2} +})); + +class conv_fp32_reorder_fsv16_to_bfyx_conv : public ConvFusingTest {}; +TEST_P(conv_fp32_reorder_fsv16_to_bfyx_conv, basic) { + auto p = GetParam(); + + auto dw_tensor = 
cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); + auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; + auto dw_stride = tensor{ 0, 0, 1, 1 }; + + create_topologies(input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("weights_dw", get_mem(dw_weights_layout, -127, 127)), + reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32), + convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation), + reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32), + convolution("conv_output", "reorder_bfyx", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation), + activation("activation", "conv_output", activation_func::abs), + reorder("reorder_output", "activation", p.default_format, data_types::f32) + ); + + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_fsv16_to_bfyx_conv, ::testing::ValuesIn(std::vector{ + bc_test_params{ CASE_CONV_FP32_1, 3, 4 }, + bc_test_params{ CASE_CONV_FP32_2, 3, 4 }, + bc_test_params{ CASE_CONV_FP32_3, 3, 4 }, + bc_test_params{ CASE_CONV_FP32_4, 3, 4 }, + bc_test_params{ CASE_CONV_FP32_5, 3, 4 }, + bc_test_params{ CASE_CONV_FP32_14, 3, 4 }, + + bc_test_params{ CASE_CONV_FP16_1, 3, 4 }, + bc_test_params{ CASE_CONV_FP16_2, 3, 4 }, + bc_test_params{ CASE_CONV_FP16_3, 3, 4 }, + bc_test_params{ CASE_CONV_FP16_4, 3, 4 }, + bc_test_params{ CASE_CONV_FP16_5, 3, 4 }, + bc_test_params{ CASE_CONV_FP16_13, 3, 4 }, +})); + class conv_fp32_activation : public ConvFusingTest {}; TEST_P(conv_fp32_activation, basic) { auto p = GetParam(); @@ -8279,9 +8346,6 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_nd_update_scale_activation_eltwise scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_4, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_5, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_6, 2, 5 }, - 
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_7, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_9, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_8, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_1, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_2, 2, 5 },