[GPU] Fuse reorder to convolution (#6396)
This commit is contained in:
parent
2cf7065f6e
commit
30adf04d31
@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
#include "convolution_kernel_b_fs_yx_fsv16.h"
|
#include "convolution_kernel_b_fs_yx_fsv16.h"
|
||||||
#include "kernel_selector_utils.h"
|
#include "kernel_selector_utils.h"
|
||||||
|
#include "reorder/reorder_kernel_base.h"
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
@ -95,6 +96,8 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16::GetSupportedKey() const {
|
|||||||
|
|
||||||
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
|
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
|
||||||
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
|
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
|
||||||
|
k.EnableOutputLayout(DataLayout::bfyx);
|
||||||
|
|
||||||
k.EnableTensorOffset();
|
k.EnableTensorOffset();
|
||||||
k.EnableTensorPitches();
|
k.EnableTensorPitches();
|
||||||
k.EnableDilation();
|
k.EnableDilation();
|
||||||
@ -176,12 +179,28 @@ bool ConvolutionKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_p
|
|||||||
if (input.Feature().pad.before % tuning_data.feature_block_size != 0 || output.Feature().pad.before % tuning_data.feature_block_size != 0)
|
if (input.Feature().pad.before % tuning_data.feature_block_size != 0 || output.Feature().pad.before % tuning_data.feature_block_size != 0)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
// Batch padding on the output is not supported when input and output formats differ (reorder-fused case)
|
||||||
|
if (input.GetLayout() == DataLayout::b_fs_yx_fsv16 && output.GetLayout() == DataLayout::bfyx) {
|
||||||
|
if (output.Batch().pad.before != 0 || output.Batch().pad.after != 0)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
if (!params.bias.empty() && params.bias[0].GetDType() != input.GetDType())
|
if (!params.bias.empty() && params.bias[0].GetDType() != input.GetDType())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether a reorder is the trailing fused operation of the given convolution.
// Returns true only when params.fused_ops is non-empty and its last entry has type
// KernelType::REORDER; returns false otherwise (including when no ops are fused).
bool post_reorder_fused(const convolution_params& params) {
|
||||||
|
if (!params.fused_ops.empty()) {
|
||||||
|
if (params.fused_ops.back().GetType() == KernelType::REORDER) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_params& params,
|
JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_params& params,
|
||||||
const DispatchData& dispatchData) const {
|
const DispatchData& dispatchData) const {
|
||||||
auto input = params.inputs[0];
|
auto input = params.inputs[0];
|
||||||
@ -190,8 +209,18 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
|
|||||||
|
|
||||||
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
ConvolutionTuningData tuning_data = GetTuningParams(params);
|
||||||
|
|
||||||
|
if (post_reorder_fused(params) &&
|
||||||
|
input.GetLayout() == DataLayout::b_fs_yx_fsv16 &&
|
||||||
|
output.GetLayout() == DataLayout::bfyx) {
|
||||||
|
jit.AddConstant(MakeJitConstant("OUTPUT_FORMAT_BFYX", 1));
|
||||||
|
}
|
||||||
|
|
||||||
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
|
auto blockWidth = dispatchData.cldnnStyle.blockWidth;
|
||||||
if (!params.fused_ops.empty()) {
|
if (!params.fused_ops.empty()) {
|
||||||
|
DataLayout orig_output_layout = output.GetLayout();
|
||||||
|
if (post_reorder_fused(params)) {
|
||||||
|
orig_output_layout = params.fused_ops.back().GetOpParams<reorder_fuse_params>()->input_layout;
|
||||||
|
}
|
||||||
auto input_dt = GetActivationType(params);
|
auto input_dt = GetActivationType(params);
|
||||||
FusedOpsConfiguration conf_vec = { "_VEC",
|
FusedOpsConfiguration conf_vec = { "_VEC",
|
||||||
{"b", "(feature_block * 16)", "y", "x"},
|
{"b", "(feature_block * 16)", "y", "x"},
|
||||||
@ -201,7 +230,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
|
|||||||
LoadType::LT_ALIGNED_READ,
|
LoadType::LT_ALIGNED_READ,
|
||||||
BoundaryCheck::ENABLED,
|
BoundaryCheck::ENABLED,
|
||||||
IndexType::TENSOR_COORD,
|
IndexType::TENSOR_COORD,
|
||||||
Tensor::DataChannelName::X };
|
Tensor::DataChannelName::X,
|
||||||
|
{}, false, "", orig_output_layout };
|
||||||
FusedOpsConfiguration conf_scalar = { "_SCALAR",
|
FusedOpsConfiguration conf_scalar = { "_SCALAR",
|
||||||
{"b", "(feature_block * 16)", "y", "(x + i)"},
|
{"b", "(feature_block * 16)", "y", "(x + i)"},
|
||||||
"dst[i]",
|
"dst[i]",
|
||||||
@ -210,7 +240,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
|
|||||||
LoadType::LT_ALIGNED_READ,
|
LoadType::LT_ALIGNED_READ,
|
||||||
BoundaryCheck::ENABLED,
|
BoundaryCheck::ENABLED,
|
||||||
IndexType::TENSOR_COORD,
|
IndexType::TENSOR_COORD,
|
||||||
Tensor::DataChannelName::X };
|
Tensor::DataChannelName::X,
|
||||||
|
{}, false, "", orig_output_layout };
|
||||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
|
jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -30,10 +30,15 @@ protected:
|
|||||||
return (p.groups > 1) ? WeightsLayout::g_os_is_yx_isv16_osv16 : WeightsLayout::os_is_yx_isv16_osv16;
|
return (p.groups > 1) ? WeightsLayout::g_os_is_yx_isv16_osv16 : WeightsLayout::os_is_yx_isv16_osv16;
|
||||||
}
|
}
|
||||||
std::vector<FusedOpType> GetSupportedFusedOps() const override {
|
std::vector<FusedOpType> GetSupportedFusedOps() const override {
|
||||||
|
// FusedOpType::REORDER should be registered explicitly here
|
||||||
|
// only when fused_primitive_desc for reorder is added by optimization passes (e.g., remove_redundant_reorder) for corresponding primitive.
|
||||||
|
// The typical usage for fused_primitive_desc for convolution is to get original output layout from jitter,
|
||||||
|
// so that it can decide whether to fuse eltwise along with reorder.
|
||||||
return { FusedOpType::ELTWISE,
|
return { FusedOpType::ELTWISE,
|
||||||
FusedOpType::QUANTIZE,
|
FusedOpType::QUANTIZE,
|
||||||
FusedOpType::SCALE,
|
FusedOpType::SCALE,
|
||||||
FusedOpType::ACTIVATION };
|
FusedOpType::ACTIVATION,
|
||||||
|
FusedOpType::REORDER };
|
||||||
}
|
}
|
||||||
|
|
||||||
bool NeedPaddedInput() const override { return false; }
|
bool NeedPaddedInput() const override { return false; }
|
||||||
|
@ -27,10 +27,15 @@ protected:
|
|||||||
return (params.groups > 1) ? WeightsLayout::goizyx : WeightsLayout::oizyx;
|
return (params.groups > 1) ? WeightsLayout::goizyx : WeightsLayout::oizyx;
|
||||||
}
|
}
|
||||||
std::vector<FusedOpType> GetSupportedFusedOps() const override {
|
std::vector<FusedOpType> GetSupportedFusedOps() const override {
|
||||||
|
// FusedOpType::REORDER should be registered explicitly here
|
||||||
|
// only when fused_primitive_desc for reorder is added by optimization passes (e.g., remove_redundant_reorder) for corresponding primitive.
|
||||||
|
// The typical usage for fused_primitive_desc for convolution is to get original output layout from jitter,
|
||||||
|
// so that it can decide whether to fuse eltwise along with reorder.
|
||||||
return { FusedOpType::ELTWISE,
|
return { FusedOpType::ELTWISE,
|
||||||
FusedOpType::QUANTIZE,
|
FusedOpType::QUANTIZE,
|
||||||
FusedOpType::SCALE,
|
FusedOpType::SCALE,
|
||||||
FusedOpType::ACTIVATION };
|
FusedOpType::ACTIVATION,
|
||||||
|
FusedOpType::REORDER };
|
||||||
}
|
}
|
||||||
|
|
||||||
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
|
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
|
||||||
|
@ -43,6 +43,17 @@ struct reorder_optional_params : optional_params {
|
|||||||
reorder_optional_params() : optional_params(KernelType::REORDER) {}
|
reorder_optional_params() : optional_params(KernelType::REORDER) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// reorder_fuse_params
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Fuse parameters for a reorder: stores the two data layouts passed to the constructor
// and tags the base fuse_params with KernelType::REORDER.
struct reorder_fuse_params : fuse_params {
|
||||||
|
// Layout on the input side of the reorder.
DataLayout input_layout;
|
||||||
|
// Layout on the output side of the reorder.
DataLayout output_layout;
|
||||||
|
|
||||||
|
// Stores both layouts as given; no validation is performed here.
reorder_fuse_params(DataLayout input_layout, DataLayout output_layout) :
|
||||||
|
fuse_params(KernelType::REORDER), input_layout(input_layout), output_layout(output_layout) {}
|
||||||
|
};
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// reorder_weights_params
|
// reorder_weights_params
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -41,24 +41,30 @@
|
|||||||
# error convolution_gpu_bfyx_f16.cl: unsupported filter type
|
# error convolution_gpu_bfyx_f16.cl: unsupported filter type
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if OUTPUT_TYPE_SIZE == 1
|
#if OUTPUT_FORMAT_BFYX
|
||||||
|
# define OUTPUTVTYPE(n) CAT(OUTPUT_TYPE, n)
|
||||||
|
# define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE(OUTPUT_X_BLOCK_SIZE))
|
||||||
|
# define VSTORE CAT(vstore, OUTPUT_X_BLOCK_SIZE)
|
||||||
|
#else
|
||||||
|
# if OUTPUT_TYPE_SIZE == 1
|
||||||
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val))
|
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val))
|
||||||
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITE_UC_2((__global uchar*)(ptr) + (offset), as_uchar2(val))
|
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITE_UC_2((__global uchar*)(ptr) + (offset), as_uchar2(val))
|
||||||
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr) + (offset), as_uchar4(val))
|
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr) + (offset), as_uchar4(val))
|
||||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val))
|
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val))
|
||||||
#elif OUTPUT_TYPE_SIZE == 2
|
# elif OUTPUT_TYPE_SIZE == 2
|
||||||
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val))
|
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val))
|
||||||
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write_us2((__global ushort*)(ptr) + (offset), as_ushort2(val))
|
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write_us2((__global ushort*)(ptr) + (offset), as_ushort2(val))
|
||||||
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write_us4((__global ushort*)(ptr) + (offset), as_ushort4(val))
|
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write_us4((__global ushort*)(ptr) + (offset), as_ushort4(val))
|
||||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val))
|
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val))
|
||||||
#elif OUTPUT_TYPE_SIZE == 4
|
# elif OUTPUT_TYPE_SIZE == 4
|
||||||
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val))
|
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val))
|
||||||
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write2((__global uint*)(ptr) + (offset), as_uint2(val))
|
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write2((__global uint*)(ptr) + (offset), as_uint2(val))
|
||||||
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
|
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
|
||||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
|
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
|
||||||
#else
|
# else
|
||||||
# error convolution_gpu_bfyx_f16.cl: unsupported output type
|
# error convolution_gpu_bfyx_f16.cl: unsupported output type
|
||||||
#endif
|
# endif
|
||||||
|
#endif // OUTPUT_FORMAT_BFYX
|
||||||
|
|
||||||
#if INPUT0_TYPE_SIZE == 2
|
#if INPUT0_TYPE_SIZE == 2
|
||||||
# define AS_INPUT_SRC CAT(as_, MAKE_VECTOR_TYPE(INPUT_TYPE, OUTPUT_X_BLOCK_SIZE))
|
# define AS_INPUT_SRC CAT(as_, MAKE_VECTOR_TYPE(INPUT_TYPE, OUTPUT_X_BLOCK_SIZE))
|
||||||
@ -129,18 +135,30 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
(INPUT0_PAD_BEFORE_SIZE_X + input_x) * input_x_pitch;
|
(INPUT0_PAD_BEFORE_SIZE_X + input_x) * input_x_pitch;
|
||||||
|
|
||||||
// Output offset calculations:
|
// Output offset calculations:
|
||||||
|
|
||||||
|
#if OUTPUT_FORMAT_BFYX
|
||||||
|
const uint output_y_pitch = (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
|
||||||
|
const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
|
||||||
|
const uint output_b_pitch = output_fs_pitch * (OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM);
|
||||||
|
|
||||||
|
const uint output_offset = b * output_b_pitch +
|
||||||
|
feature_block * (output_fs_pitch * FEATURE_SLICE_SIZE) +
|
||||||
|
(sglid + OUTPUT_PAD_BEFORE_FEATURE_NUM) * output_fs_pitch +
|
||||||
|
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
||||||
|
(x + OUTPUT_PAD_BEFORE_SIZE_X);
|
||||||
|
#else
|
||||||
const uint output_x_pitch = FEATURE_SLICE_SIZE;
|
const uint output_x_pitch = FEATURE_SLICE_SIZE;
|
||||||
const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
|
const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
|
||||||
const uint output_total_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM;
|
const uint output_total_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM;
|
||||||
const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
|
const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
|
||||||
const uint output_b_pitch = output_fs_pitch * ((output_total_f_size + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
|
const uint output_b_pitch = output_fs_pitch * ((output_total_f_size + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
|
||||||
|
|
||||||
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
|
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
|
||||||
|
|
||||||
const uint output_offset = b * output_b_pitch +
|
const uint output_offset = b * output_b_pitch +
|
||||||
(feature_block + output_fs_pad_before) * output_fs_pitch +
|
(feature_block + output_fs_pad_before) * output_fs_pitch +
|
||||||
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
||||||
(x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
|
(x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
|
||||||
|
#endif
|
||||||
|
|
||||||
// Filter offset calculations:
|
// Filter offset calculations:
|
||||||
const uint filter_isv_pitch = FEATURE_SLICE_SIZE;
|
const uint filter_isv_pitch = FEATURE_SLICE_SIZE;
|
||||||
@ -383,15 +401,27 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
#if OUTPUT_LEFTOVERS
|
#if OUTPUT_LEFTOVERS
|
||||||
if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
|
if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
|
||||||
for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
|
for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
|
||||||
|
|
||||||
#if HAS_FUSED_OPS
|
#if HAS_FUSED_OPS
|
||||||
FUSED_OPS_SCALAR;
|
FUSED_OPS_SCALAR;
|
||||||
|
# if OUTPUT_FORMAT_BFYX
|
||||||
|
res[i] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT_SCALAR);
|
||||||
|
# else
|
||||||
res[i] = FUSED_OPS_RESULT_SCALAR;
|
res[i] = FUSED_OPS_RESULT_SCALAR;
|
||||||
|
# endif
|
||||||
#else
|
#else
|
||||||
res[i] = TO_OUTPUT_TYPE(dst[i]);
|
res[i] = TO_OUTPUT_TYPE(dst[i]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if OUTPUT_FORMAT_BFYX
|
||||||
|
if ((feature_block * FEATURE_SLICE_SIZE + sglid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
|
||||||
|
output[output_offset + i] = res[i];
|
||||||
|
}
|
||||||
|
#else
|
||||||
if ((feature_block * FEATURE_SLICE_SIZE + sglid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
|
if ((feature_block * FEATURE_SLICE_SIZE + sglid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
|
||||||
output[output_offset + i * output_x_pitch + sglid] = res[i];
|
output[output_offset + i * output_x_pitch + sglid] = res[i];
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -400,35 +430,61 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0) {
|
if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0) {
|
||||||
#if HAS_FUSED_OPS
|
#if HAS_FUSED_OPS
|
||||||
FUSED_OPS_VEC;
|
FUSED_OPS_VEC;
|
||||||
|
# if OUTPUT_FORMAT_BFYX
|
||||||
|
res = TO_OUTPUTVTYPE(FUSED_OPS_RESULT_VEC);
|
||||||
|
# else
|
||||||
res = FUSED_OPS_RESULT_VEC;
|
res = FUSED_OPS_RESULT_VEC;
|
||||||
|
# endif
|
||||||
#else
|
#else
|
||||||
|
# if OUTPUT_FORMAT_BFYX
|
||||||
|
res = TO_OUTPUTVTYPE(dst);
|
||||||
|
# else
|
||||||
res = dst;
|
res = dst;
|
||||||
|
# endif
|
||||||
#endif
|
#endif
|
||||||
// TODO Generalize for other block sizes
|
// TODO Generalize for other block sizes
|
||||||
#if OUTPUT_X_BLOCK_SIZE == 8
|
#if OUTPUT_FORMAT_BFYX
|
||||||
OUTPUT_BLOCK_WRITE8(output, output_offset, res);
|
#if OUTPUT_X_BLOCK_SIZE == 2 || OUTPUT_X_BLOCK_SIZE == 4 || OUTPUT_X_BLOCK_SIZE == 8
|
||||||
#elif OUTPUT_X_BLOCK_SIZE == 4
|
VSTORE(res, 0, output + output_offset);
|
||||||
OUTPUT_BLOCK_WRITE4(output, output_offset, res);
|
#elif OUTPUT_X_BLOCK_SIZE == 1
|
||||||
#elif OUTPUT_X_BLOCK_SIZE == 2
|
output[output_offset] = res[0];
|
||||||
OUTPUT_BLOCK_WRITE2(output, output_offset, res);
|
#else
|
||||||
#elif OUTPUT_X_BLOCK_SIZE == 1
|
# error convolution_gpu_bfyx_f16.cl: unsupported output x block size
|
||||||
OUTPUT_BLOCK_WRITE(output, output_offset, res);
|
#endif
|
||||||
#else
|
#else
|
||||||
# error convolution_gpu_bfyx_f16.cl: unsupported output x block size
|
#if OUTPUT_X_BLOCK_SIZE == 8
|
||||||
#endif
|
OUTPUT_BLOCK_WRITE8(output, output_offset, res);
|
||||||
|
#elif OUTPUT_X_BLOCK_SIZE == 4
|
||||||
|
OUTPUT_BLOCK_WRITE4(output, output_offset, res);
|
||||||
|
#elif OUTPUT_X_BLOCK_SIZE == 2
|
||||||
|
OUTPUT_BLOCK_WRITE2(output, output_offset, res);
|
||||||
|
#elif OUTPUT_X_BLOCK_SIZE == 1
|
||||||
|
OUTPUT_BLOCK_WRITE(output, output_offset, res);
|
||||||
|
#else
|
||||||
|
# error convolution_gpu_bfyx_f16.cl: unsupported output x block size
|
||||||
|
#endif
|
||||||
|
#endif // OUTPUT_FORMAT_BFYX
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
|
for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
|
||||||
#if HAS_FUSED_OPS
|
#if HAS_FUSED_OPS
|
||||||
FUSED_OPS_SCALAR;
|
FUSED_OPS_SCALAR;
|
||||||
|
# if OUTPUT_FORMAT_BFYX
|
||||||
|
res[i] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT_SCALAR);
|
||||||
|
# else
|
||||||
res[i] = FUSED_OPS_RESULT_SCALAR;
|
res[i] = FUSED_OPS_RESULT_SCALAR;
|
||||||
|
# endif
|
||||||
#else
|
#else
|
||||||
res[i] = TO_OUTPUT_TYPE(dst[i]);
|
res[i] = TO_OUTPUT_TYPE(dst[i]);
|
||||||
#endif
|
#endif
|
||||||
OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
#if OUTPUT_FORMAT_BFYX
|
||||||
|
output[output_offset + i] = res[i];
|
||||||
|
#else
|
||||||
|
OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
#if SLM_DIV_FACTOR > 1
|
#if SLM_DIV_FACTOR > 1
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -462,7 +518,13 @@ KERNEL(convolution_bfyx_f16)(
|
|||||||
|
|
||||||
#undef FILTER_BLOCK_READ8
|
#undef FILTER_BLOCK_READ8
|
||||||
|
|
||||||
#undef OUTPUT_BLOCK_WRITE
|
#if OUTPUT_FORMAT_BFYX
|
||||||
#undef OUTPUT_BLOCK_WRITE2
|
# undef OUTPUTVTYPE
|
||||||
#undef OUTPUT_BLOCK_WRITE4
|
# undef TO_OUTPUTVTYPE
|
||||||
#undef OUTPUT_BLOCK_WRITE8
|
# undef VSTORE
|
||||||
|
#else
|
||||||
|
# undef OUTPUT_BLOCK_WRITE
|
||||||
|
# undef OUTPUT_BLOCK_WRITE2
|
||||||
|
# undef OUTPUT_BLOCK_WRITE4
|
||||||
|
# undef OUTPUT_BLOCK_WRITE8
|
||||||
|
#endif // OUTPUT_FORMAT_BFYX
|
||||||
|
@ -1741,8 +1741,10 @@ std::string FusedOpsCodeGenerator::GetJitLoad(const FusedOpsConfiguration& conf,
|
|||||||
|
|
||||||
// Eltwise fused op can't have full tensor argument when requested vec_size > 1, since it might require
|
// Eltwise fused op can't have full tensor argument when requested vec_size > 1, since it might require
|
||||||
// splitting load into several parts and some kind of index recalculation which is not supported
|
// splitting load into several parts and some kind of index recalculation which is not supported
|
||||||
|
DataLayout orig_output_layout = conf.IsPostReorderFused() ? conf.orig_output_layout : prim_output.GetLayout();
|
||||||
|
|
||||||
if (desc.GetType() == KernelType::ELTWISE && !valid_broadcast_case &&
|
if (desc.GetType() == KernelType::ELTWISE && !valid_broadcast_case &&
|
||||||
input_tensor.GetLayout() != prim_output.GetLayout() && conf.vec_size > 1) {
|
input_tensor.GetLayout() != orig_output_layout && conf.vec_size > 1) {
|
||||||
throw std::runtime_error("[clDNN] Mixed layouts of input tensors are not supported in fused eltwise:"
|
throw std::runtime_error("[clDNN] Mixed layouts of input tensors are not supported in fused eltwise:"
|
||||||
"\nfused_input: " + toString_v2(input_tensor) +
|
"\nfused_input: " + toString_v2(input_tensor) +
|
||||||
"\noutput: " + toString_v2(prim_output));
|
"\noutput: " + toString_v2(prim_output));
|
||||||
|
@ -108,6 +108,9 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
|
|||||||
if (conf.empty())
|
if (conf.empty())
|
||||||
return jit;
|
return jit;
|
||||||
|
|
||||||
|
if (params.fused_ops.size() == 1 && params.fused_ops[0].GetType() == KernelType::REORDER)
|
||||||
|
return jit;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (auto& c : conf) {
|
for (auto& c : conf) {
|
||||||
std::string fused_ops;
|
std::string fused_ops;
|
||||||
@ -119,6 +122,10 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
|
|||||||
bool can_all_use_preload = true;
|
bool can_all_use_preload = true;
|
||||||
|
|
||||||
for (size_t i = 0; i < params.fused_ops.size(); i++) {
|
for (size_t i = 0; i < params.fused_ops.size(); i++) {
|
||||||
|
// Reorder is not processed by jitter
|
||||||
|
if (params.fused_ops[i].GetType() == FusedOpType::REORDER)
|
||||||
|
continue;
|
||||||
|
|
||||||
auto fused_dep_codegen = FusedOpsCodeGenerator(params.fused_ops[i]);
|
auto fused_dep_codegen = FusedOpsCodeGenerator(params.fused_ops[i]);
|
||||||
jit.Merge(fused_dep_codegen.MakeLoadJitConstants(c, params.output));
|
jit.Merge(fused_dep_codegen.MakeLoadJitConstants(c, params.output));
|
||||||
jit.Merge(fused_dep_codegen.MakeOpJitConstants(c, in_name, in_type, out_name));
|
jit.Merge(fused_dep_codegen.MakeOpJitConstants(c, in_name, in_type, out_name));
|
||||||
|
@ -469,6 +469,8 @@ struct FusedOpsConfiguration {
|
|||||||
bool allow_for_partial_preload;
|
bool allow_for_partial_preload;
|
||||||
// Load index for shuffle fused op
|
// Load index for shuffle fused op
|
||||||
std::string shuffle_var_name;
|
std::string shuffle_var_name;
|
||||||
|
// Record original output layout before reorder is fused
|
||||||
|
DataLayout orig_output_layout;
|
||||||
|
|
||||||
FusedOpsConfiguration(std::string suffix,
|
FusedOpsConfiguration(std::string suffix,
|
||||||
std::vector<std::string> bfzyx_idx_order,
|
std::vector<std::string> bfzyx_idx_order,
|
||||||
@ -481,7 +483,8 @@ struct FusedOpsConfiguration {
|
|||||||
Tensor::DataChannelName vec_axis = Tensor::DataChannelName::COUNT,
|
Tensor::DataChannelName vec_axis = Tensor::DataChannelName::COUNT,
|
||||||
std::vector<Tensor::DataChannelName> loop_axes = {},
|
std::vector<Tensor::DataChannelName> loop_axes = {},
|
||||||
bool allow_for_partial_preload = false,
|
bool allow_for_partial_preload = false,
|
||||||
std::string shuffle_var_name = "")
|
std::string shuffle_var_name = "",
|
||||||
|
DataLayout orig_output_layout = DataLayout::DataLayoutCount)
|
||||||
: suffix(suffix)
|
: suffix(suffix)
|
||||||
, bfzyx_idx_order(bfzyx_idx_order)
|
, bfzyx_idx_order(bfzyx_idx_order)
|
||||||
, input_var_name(input_var_name)
|
, input_var_name(input_var_name)
|
||||||
@ -493,7 +496,8 @@ struct FusedOpsConfiguration {
|
|||||||
, index_type(index_type)
|
, index_type(index_type)
|
||||||
, loop_axes(loop_axes)
|
, loop_axes(loop_axes)
|
||||||
, allow_for_partial_preload(allow_for_partial_preload)
|
, allow_for_partial_preload(allow_for_partial_preload)
|
||||||
, shuffle_var_name(shuffle_var_name) { }
|
, shuffle_var_name(shuffle_var_name)
|
||||||
|
, orig_output_layout(orig_output_layout) { }
|
||||||
|
|
||||||
FusedOpsConfiguration& SetVectorSize(size_t val) { vec_size = val; return *this; }
|
FusedOpsConfiguration& SetVectorSize(size_t val) { vec_size = val; return *this; }
|
||||||
FusedOpsConfiguration& SetLoadType(LoadType val) { load_type = val; return *this; }
|
FusedOpsConfiguration& SetLoadType(LoadType val) { load_type = val; return *this; }
|
||||||
@ -505,6 +509,7 @@ struct FusedOpsConfiguration {
|
|||||||
allow_for_partial_preload = partial_preload;
|
allow_for_partial_preload = partial_preload;
|
||||||
return *this; }
|
return *this; }
|
||||||
FusedOpsConfiguration& SetShuffleVarName(std::string val) { shuffle_var_name = val; return *this; }
|
FusedOpsConfiguration& SetShuffleVarName(std::string val) { shuffle_var_name = val; return *this; }
|
||||||
|
bool IsPostReorderFused(void) const { return orig_output_layout != DataLayout::DataLayoutCount; }
|
||||||
};
|
};
|
||||||
|
|
||||||
// Instance of fused_operation_desc is added to fused_ops vector if a node has been fused to current one using program::fuse_nodes
|
// Instance of fused_operation_desc is added to fused_ops vector if a node has been fused to current one using program::fuse_nodes
|
||||||
|
@ -334,26 +334,24 @@ void remove_redundant_reorders::run(program& p) {
|
|||||||
p.remove_if_dangling(node);
|
p.remove_if_dangling(node);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This pass removes reorder for Convolution BFYX -> FS_B_YX_FSV32
|
// Remove reorder for Convolution bfyx -> fs_b_yx_fsv32
|
||||||
itr = p.get_processing_order().begin();
|
auto try_fuse_reorder_bfyx_to_fsv32 = [&](reorder_node* node) {
|
||||||
while (itr != p.get_processing_order().end()) {
|
if (node->get_users().size() != 1)
|
||||||
auto& node = *itr++;
|
return;
|
||||||
if (!node->is_type<reorder>() || !node->is_in_data_flow() || node->get_users().size() != 1 || node->get_dependencies().size() != 1)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
auto& usr = node->get_users().front();
|
auto& usr = node->get_users().front();
|
||||||
auto& dep = node->get_dependency(0);
|
auto& dep = node->get_dependency(0);
|
||||||
if (!(usr->is_type<convolution>()) ||
|
if (!(usr->is_type<convolution>()) ||
|
||||||
(usr->get_output_layout().data_type != dep.get_output_layout().data_type) ||
|
(usr->get_output_layout().data_type != dep.get_output_layout().data_type) ||
|
||||||
(usr->get_output_layout().format != format::fs_b_yx_fsv32) ||
|
(dep.get_output_layout().format != format::bfyx) ||
|
||||||
(dep.get_output_layout().format != format::bfyx))
|
(usr->get_output_layout().format != format::fs_b_yx_fsv32))
|
||||||
continue;
|
return;
|
||||||
|
|
||||||
if (dep.is_type<input_layout>())
|
if (dep.is_type<input_layout>())
|
||||||
continue;
|
return;
|
||||||
|
|
||||||
if (usr->as<convolution>().get_primitive()->groups != 1)
|
if (usr->as<convolution>().get_primitive()->groups != 1)
|
||||||
continue;
|
return;
|
||||||
|
|
||||||
dep.merge_output_padding(node->get_output_layout().data_padding);
|
dep.merge_output_padding(node->get_output_layout().data_padding);
|
||||||
p.replace_all_usages(*node, dep);
|
p.replace_all_usages(*node, dep);
|
||||||
@ -361,6 +359,83 @@ void remove_redundant_reorders::run(program& p) {
|
|||||||
p.add_optimized_primitive_info(node->id());
|
p.add_optimized_primitive_info(node->id());
|
||||||
p.remove_all_connections(*node);
|
p.remove_all_connections(*node);
|
||||||
p.remove_if_dangling(*node);
|
p.remove_if_dangling(*node);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Remove reorder for Convolution b_fs_yx_fsv16 -> bfyx
|
||||||
|
auto try_fuse_reorder_fsv16_to_bfyx = [&](reorder_node* node) {
|
||||||
|
if (!node->get_fused_activations_funcs().empty() ||
|
||||||
|
!node->get_fused_primitives().empty())
|
||||||
|
return;
|
||||||
|
|
||||||
|
auto& input = node->input();
|
||||||
|
|
||||||
|
if (!(input.is_type<convolution>()) ||
|
||||||
|
!(input.get_output_layout().format == format::b_fs_yx_fsv16) ||
|
||||||
|
!(node->get_output_layout().format == format::bfyx))
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (input.as<convolution>().get_primitive()->groups != 1)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (input.get_users().size() != 1)
|
||||||
|
return;
|
||||||
|
|
||||||
|
auto& input_dep = input.get_dependency(0);
|
||||||
|
if (input_dep.get_output_layout().format != format::b_fs_yx_fsv16 ||
|
||||||
|
input_dep.get_output_layout().data_type == data_types::u8 ||
|
||||||
|
input_dep.get_output_layout().data_type == data_types::i8)
|
||||||
|
return;
|
||||||
|
|
||||||
|
for (auto& user : node->get_users()) {
|
||||||
|
// If a concat is the reorder's user and the concat axis is 0 (batch) or 1 (feature), the conv's output would have padding.
|
||||||
|
// This padding might prevent the optimized conv kernel ("convolution_gpu_bfyx_f16") from being selected
|
||||||
|
if (user->is_type<concatenation>()) {
|
||||||
|
auto& concat_node = user->as<concatenation>();
|
||||||
|
auto concat_axis = concat_node.get_primitive()->axis;
|
||||||
|
if (concat_axis == 0 || concat_axis == 1)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto output_layout = node->get_output_layout();
|
||||||
|
input.set_output_layout(output_layout, false);
|
||||||
|
if (input.type()->does_possible_implementation_exist(input)) {
|
||||||
|
input.set_output_padding(node->get_output_layout().data_padding);
|
||||||
|
|
||||||
|
// Add fused_primitive_desc of reorder to convolution which propagate original output layout to jitter
|
||||||
|
fused_primitive_desc local_desc;
|
||||||
|
local_desc.node = p.get_node_ptr(node->id());
|
||||||
|
local_desc.dep_start_idx = input.get_fused_primitives().size();
|
||||||
|
local_desc.output_layout = output_layout;
|
||||||
|
local_desc.input_layout = input.get_dependency(0).get_output_layout(); // original convolution's output layout
|
||||||
|
local_desc.activation = activation_func::none;
|
||||||
|
input.add_fused_primitive(local_desc);
|
||||||
|
node->set_input_layout(local_desc.input_layout);
|
||||||
|
|
||||||
|
// remove reorder node
|
||||||
|
node->can_be_optimized(true);
|
||||||
|
p.add_optimized_primitive_info(node->id());
|
||||||
|
p.extract_and_remove(*node);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Sweep over the processing order and try to fold eligible reorders into the convolution
// that produces their input.
if (enable_reorder_fusing) {
    itr = p.get_processing_order().begin();
    while (itr != p.get_processing_order().end()) {
        // Advance the iterator before touching the node: the helpers below may remove it.
        auto& candidate = *itr++;

        // Only reorders that are part of the data flow and have exactly one dependency qualify.
        const bool is_fusable_reorder = candidate->is_type<reorder>() &&
                                        candidate->is_in_data_flow() &&
                                        candidate->get_dependencies().size() == 1;
        if (!is_fusable_reorder)
            continue;

        auto& reorder_candidate = candidate->as<reorder>();

        // NOTE(review): the second helper runs even if the first one already removed the node;
        // confirm the node stays valid in that case.

        // Remove reorder for Convolution bfyx -> fs_b_yx_fsv32
        try_fuse_reorder_bfyx_to_fsv32(&reorder_candidate);
        // Remove reorder for Convolution b_fs_yx_fsv16 -> bfyx
        try_fuse_reorder_fsv16_to_bfyx(&reorder_candidate);
    }
}
|
||||||
|
|
||||||
// Additional reshape chains shrink.
|
// Additional reshape chains shrink.
|
||||||
|
@ -41,6 +41,7 @@ struct fused_primitive_desc {
|
|||||||
std::vector<primitive_id> fused_deps;
|
std::vector<primitive_id> fused_deps;
|
||||||
activation_func activation;
|
activation_func activation;
|
||||||
activation_additional_params activation_params;
|
activation_additional_params activation_params;
|
||||||
|
layout input_layout = layout(data_types::f32, format::bfyx, tensor());
|
||||||
layout output_layout = layout(data_types::f32, format::bfyx, tensor());
|
layout output_layout = layout(data_types::f32, format::bfyx, tensor());
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
|
|
||||||
#include "cldnn/primitives/reorder.hpp"
|
#include "cldnn/primitives/reorder.hpp"
|
||||||
#include "primitive_inst.h"
|
#include "primitive_inst.h"
|
||||||
|
#include "kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.h"
|
||||||
|
#include "kernel_selector/common/tensor_type.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
@ -33,11 +35,19 @@ public:
|
|||||||
void requires_reinterpret(bool val) { req_reinterpr = (optimized && val); }
|
void requires_reinterpret(bool val) { req_reinterpr = (optimized && val); }
|
||||||
|
|
||||||
void set_input_offset(tensor const& io) { input_offset = io; }
|
void set_input_offset(tensor const& io) { input_offset = io; }
|
||||||
|
// Records the layout of the data this reorder consumes; get_fuse_params() converts it to the
// kernel-selector input layout when the reorder is described as a fused operation.
void set_input_layout(layout const& lo) { input_layout = lo; }
|
||||||
tensor get_input_offset() const { return input_offset; }
|
tensor get_input_offset() const { return input_offset; }
|
||||||
|
|
||||||
|
std::shared_ptr<kernel_selector::fuse_params> get_fuse_params() const override {
|
||||||
|
kernel_selector::DataLayout ks_input_layout = convert_data_tensor(input_layout).GetLayout();
|
||||||
|
kernel_selector::DataLayout ks_output_layout = convert_data_tensor(get_output_layout()).GetLayout();
|
||||||
|
return std::make_shared<kernel_selector::reorder_fuse_params>(ks_input_layout, ks_output_layout);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool req_reinterpr = false;
|
bool req_reinterpr = false;
|
||||||
tensor input_offset = tensor{0}; // used by reorder to winograd domain
|
tensor input_offset = tensor{0}; // used by reorder to winograd domain
|
||||||
|
layout input_layout = layout(data_types::f32, format::bfyx, { 0, 0, 0, 0 });
|
||||||
};
|
};
|
||||||
|
|
||||||
using reorder_node = typed_program_node<reorder>;
|
using reorder_node = typed_program_node<reorder>;
|
||||||
|
@ -7609,6 +7609,222 @@ TEST_P(convolution_general_gpu, conv_fp16_cases) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct convolution_gpu_fsv16_to_bfyx : public convolution_general_gpu {};
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(conv_b_fs_yx_fsv16_to_bfyx,
|
||||||
|
convolution_gpu_fsv16_to_bfyx,
|
||||||
|
::testing::Values(
|
||||||
|
// Input X size, Input Y size, Input Z size, Input features, Output features,
|
||||||
|
// Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
|
||||||
|
// Input data format, Implementation name, WithBias
|
||||||
|
TestParamType_general_convolution_gpu(6, 6, 0, 16, 16, 3, 3, 0, 1, 1, 4, format::b_fs_yx_fsv16, "convolution_gpu_fsv16_to_bfyx", false),
|
||||||
|
TestParamType_general_convolution_gpu(6, 6, 0, 32, 32, 3, 3, 0, 1, 1, 1, format::b_fs_yx_fsv16, "convolution_gpu_fsv16_to_bfyx", false),
|
||||||
|
TestParamType_general_convolution_gpu(6, 6, 0, 16, 16, 3, 3, 0, 1, 1, 16, format::b_fs_yx_fsv16, "convolution_gpu_fsv16_to_bfyx", false),
|
||||||
|
TestParamType_general_convolution_gpu(16, 6, 0, 20, 16, 3, 3, 0, 1, 1, 20, format::b_fs_yx_fsv16, "convolution_gpu_fsv16_to_bfyx", false)
|
||||||
|
),
|
||||||
|
convolution_gpu_fsv16_to_bfyx::PrintToStringParamName);
|
||||||
|
|
||||||
|
// Builds input -> reorder(b_fs_yx_fsv16) -> convolution -> reorder(bfyx), with output padding set
// on both the convolution and the final reorder, and runs it twice:
//   - reference: optimize_data(false), so the topology is executed as written;
//   - target: optimize_data(true) with "convolution_gpu_bfyx_f16" forced for the convolution,
//     which allows the final reorder to be fused into the convolution.
// The two output buffers must have the same size and match element by element within 1e-5.
TEST_P(convolution_gpu_fsv16_to_bfyx, conv_b_fs_yx_fsv16_to_bfyx_padding)
{
    auto& engine = get_test_engine();

    if (!engine.get_device_info().supports_fp16)
    {
        std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
        EXPECT_EQ(1, 1);
        return;
    }

    const int input_b = testing::get<10>(GetParam());
    const int input_f = testing::get<3>(GetParam());
    const int input_y = testing::get<1>(GetParam());
    const int input_x = testing::get<0>(GetParam());

    const int filter_x = testing::get<5>(GetParam());
    const int filter_y = testing::get<6>(GetParam());
    const int stride = testing::get<9>(GetParam());

    const int input_offset_y = (filter_y - 1) / 2;
    const int input_offset_x = (filter_x - 1) / 2;

    auto input_size = tensor(input_b, input_f, input_x, input_y);
    auto input_data = generate_random_4d<FLOAT16>(input_b, input_f, input_y, input_x, -1, 1);
    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
    auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size });
    set_values(input_mem, input_data_bfyx);

    // NOTE(review): the leading weights dimension reuses the batch parameter rather than the
    // "Output features" test parameter (tuple index 4) - confirm this is intended.
    auto weights_size = tensor(input_b, input_f, filter_x, filter_y, 1);
    auto weights_data = generate_random_4d<FLOAT16>(input_b, input_f, filter_x, filter_y, -1, 1);
    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
    auto weights_mem = engine.allocate_memory({ data_types::f16, format::goiyx, weights_size });
    set_values(weights_mem, weights_data_bfyx);

    // Set topology
    topology topology(
        input_layout("input_origin", input_mem->get_layout()),
        data("weights_fsv", weights_mem),
        reorder("input_fsv16", "input_origin", { data_types::f16, format::b_fs_yx_fsv16, input_size }));    // bfyx -> b_fs_yx_fsv16

    // Add convolution
    auto input_stride = tensor(1, 1, stride, stride);
    auto input_offset = tensor(0, 0, input_offset_x, input_offset_y);
    auto input_dilation = tensor(1, 1, 1, 1);
    auto input_padding_before = tensor(0, 0, input_offset_x, input_offset_y);
    auto input_padding_after = tensor(0, 0, input_offset_x, input_offset_y);

    auto conv_fsv = convolution("conv_fsv", "input_fsv16", { "weights_fsv" }, input_stride, input_offset, input_dilation, input_padding_before, input_padding_after);
    conv_fsv.output_padding = padding({ 0, 32, 2, 2 }, 0.f);
    topology.add(conv_fsv);           // b_fs_yx_fsv16 -> b_fs_yx_fsv16; after fusing, b_fs_yx_fsv16 -> bfyx

    // Add reorder to bfyx
    auto reorder_bfyx = reorder("reorder_bfyx", "conv_fsv", { data_types::f16, format::bfyx, input_size });
    reorder_bfyx.output_padding = padding({ 0, 16, 1, 1 }, 0.f);
    topology.add(reorder_bfyx);       // b_fs_yx_fsv16 -> bfyx; after fusing, removed

    // Exec ref network (non-fusing)
    build_options options_ref;
    options_ref.set_option(build_option::optimize_data(false));
    options_ref.set_option(build_option::allow_static_input_reorder(true));

    network network_ref(engine, topology, options_ref);
    network_ref.set_input_data("input_origin", input_mem);
    auto ref_out = network_ref.execute();

    auto ref_out_mem = ref_out.begin()->second.get_memory();
    cldnn::mem_lock<FLOAT16> ref_out_ptr(ref_out_mem, get_test_stream());

    // Exec target network (fusing: conv+reorder)
    build_options options_target;
    implementation_desc conv_impl = { format::b_fs_yx_fsv16, "convolution_gpu_bfyx_f16" };
    options_target.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
    options_target.set_option(build_option::optimize_data(true));

    network network_target(engine, topology, options_target);
    network_target.set_input_data("input_origin", input_mem);
    auto target_out = network_target.execute();

    auto target_out_mem = target_out.begin()->second.get_memory();
    cldnn::mem_lock<FLOAT16> target_out_ptr(target_out_mem, get_test_stream());

    // Compare ref and target result.
    // The loop indexes both buffers with the reference size, so the sizes must agree first.
    ASSERT_EQ(ref_out_ptr.size(), target_out_ptr.size());
    for (size_t i = 0; i < ref_out_ptr.size(); i++) {
        const auto ref_val = static_cast<float>(ref_out_ptr[i]);
        const auto target_val = static_cast<float>(target_out_ptr[i]);
        const auto diff = std::fabs(ref_val - target_val);
        // Expressed as "within tolerance" so that a NaN difference is reported as a mismatch.
        const bool equal = diff <= 1e-5f;

        EXPECT_TRUE(equal) << "i:" << i
                           << "\t ref_out = " << ref_val
                           << "\t target_out = " << target_val;
        // Stop at the first mismatch to avoid flooding the log.
        if (!equal)
            break;
    }
}
|
||||||
|
|
||||||
|
// Builds input -> reorder(b_fs_yx_fsv16, f16) -> convolution -> reorder(bfyx, f32), i.e. the final
// reorder changes the data type as well as the format, and runs it twice:
//   - reference: optimize_data(false), so the topology is executed as written;
//   - target: optimize_data(true) with "convolution_gpu_bfyx_f16" forced for the convolution,
//     which allows the final reorder to be fused into the convolution.
// The two f32 output buffers must have the same size and match element by element within 1e-5.
TEST_P(convolution_gpu_fsv16_to_bfyx, conv_b_fs_yx_fsv16_to_bfyx_different_type)
{
    auto& engine = get_test_engine();

    if (!engine.get_device_info().supports_fp16)
    {
        std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
        EXPECT_EQ(1, 1);
        return;
    }

    const int input_b = testing::get<10>(GetParam());
    const int input_f = testing::get<3>(GetParam());
    const int input_y = testing::get<1>(GetParam());
    const int input_x = testing::get<0>(GetParam());

    const int filter_x = testing::get<5>(GetParam());
    const int filter_y = testing::get<6>(GetParam());
    const int stride = testing::get<9>(GetParam());

    const int input_offset_y = (filter_y - 1) / 2;
    const int input_offset_x = (filter_x - 1) / 2;

    auto input_size = tensor(input_b, input_f, input_x, input_y);
    auto input_data = generate_random_4d<FLOAT16>(input_b, input_f, input_y, input_x, -1, 1);
    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
    auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size });
    set_values(input_mem, input_data_bfyx);

    // NOTE(review): the leading weights dimension reuses the batch parameter rather than the
    // "Output features" test parameter (tuple index 4) - confirm this is intended.
    auto weights_size = tensor(input_b, input_f, filter_x, filter_y, 1);
    auto weights_data = generate_random_4d<FLOAT16>(input_b, input_f, filter_x, filter_y, -1, 1);
    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
    auto weights_mem = engine.allocate_memory({ data_types::f16, format::goiyx, weights_size });
    set_values(weights_mem, weights_data_bfyx);

    // Set topology
    topology topology(
        input_layout("input_origin", input_mem->get_layout()),
        data("weights_fsv", weights_mem),
        reorder("input_fsv16", "input_origin", { data_types::f16, format::b_fs_yx_fsv16, input_size }));    // bfyx -> b_fs_yx_fsv16

    // Add convolution
    auto input_stride = tensor(1, 1, stride, stride);
    auto input_offset = tensor(0, 0, input_offset_x, input_offset_y);
    auto input_dilation = tensor(1, 1, 1, 1);
    // Same spatial padding before and after the input (this value is not zero for 3x3 filters).
    auto conv_padding = tensor(0, 0, input_offset_x, input_offset_y);

    auto conv_fsv = convolution("conv_fsv", "input_fsv16", { "weights_fsv" }, input_stride, input_offset, input_dilation, conv_padding, conv_padding);
    topology.add(conv_fsv);           // b_fs_yx_fsv16 -> b_fs_yx_fsv16; after fusing, b_fs_yx_fsv16 -> bfyx

    // Add reorder to bfyx
    auto reorder_bfyx = reorder("reorder_bfyx", "conv_fsv", { data_types::f32, format::bfyx, input_size });
    topology.add(reorder_bfyx);       // b_fs_yx_fsv16 -> bfyx; after fusing, removed

    // Exec ref network (non-fusing)
    build_options options_ref;
    options_ref.set_option(build_option::optimize_data(false));
    options_ref.set_option(build_option::allow_static_input_reorder(true));

    network network_ref(engine, topology, options_ref);
    network_ref.set_input_data("input_origin", input_mem);
    auto ref_out = network_ref.execute();

    auto ref_out_mem = ref_out.begin()->second.get_memory();
    cldnn::mem_lock<float> ref_out_ptr(ref_out_mem, get_test_stream());

    // Exec target network (fusing: conv+reorder)
    build_options options_target;
    implementation_desc conv_impl = { format::b_fs_yx_fsv16, "convolution_gpu_bfyx_f16" };
    options_target.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
    options_target.set_option(build_option::optimize_data(true));

    network network_target(engine, topology, options_target);
    network_target.set_input_data("input_origin", input_mem);
    auto target_out = network_target.execute();

    auto target_out_mem = target_out.begin()->second.get_memory();
    cldnn::mem_lock<float> target_out_ptr(target_out_mem, get_test_stream());

    // Compare ref and target result.
    // The loop indexes both buffers with the reference size, so the sizes must agree first.
    ASSERT_EQ(ref_out_ptr.size(), target_out_ptr.size());
    for (size_t i = 0; i < ref_out_ptr.size(); i++) {
        const auto ref_val = static_cast<float>(ref_out_ptr[i]);
        const auto target_val = static_cast<float>(target_out_ptr[i]);
        const auto diff = std::fabs(ref_val - target_val);
        // Expressed as "within tolerance" so that a NaN difference is reported as a mismatch.
        const bool equal = diff <= 1e-5f;

        EXPECT_TRUE(equal) << "i:" << i
                           << "\t ref_out = " << ref_val
                           << "\t target_out = " << target_val;
        // Stop at the first mismatch to avoid flooding the log.
        if (!equal)
            break;
    }
}
|
||||||
|
|
||||||
template <typename InputT, typename WeightsT, typename OutputT>
|
template <typename InputT, typename WeightsT, typename OutputT>
|
||||||
class convolution_test_base {
|
class convolution_test_base {
|
||||||
public:
|
public:
|
||||||
|
@ -616,6 +616,73 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class conv_fp32_reorder_fsv16_to_bfyx : public ConvFusingTest {};
|
||||||
|
TEST_P(conv_fp32_reorder_fsv16_to_bfyx, basic) {
|
||||||
|
auto p = GetParam();
|
||||||
|
create_topologies(input_layout("input", get_input_layout(p)),
|
||||||
|
data("weights", get_mem(get_weights_layout(p))),
|
||||||
|
reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32),
|
||||||
|
convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
|
||||||
|
reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32)
|
||||||
|
);
|
||||||
|
|
||||||
|
execute(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(
    fusings_gpu,
    conv_fp32_reorder_fsv16_to_bfyx,
    ::testing::ValuesIn(std::vector<bc_test_params>{
        // FP32 convolution cases
        bc_test_params{ CASE_CONV_FP32_1, 2, 2 },
        bc_test_params{ CASE_CONV_FP32_2, 2, 2 },
        bc_test_params{ CASE_CONV_FP32_3, 2, 2 },
        bc_test_params{ CASE_CONV_FP32_4, 2, 2 },
        bc_test_params{ CASE_CONV_FP32_5, 2, 2 },
        bc_test_params{ CASE_CONV_FP32_14, 2, 2 },

        // FP16 convolution cases
        bc_test_params{ CASE_CONV_FP16_1, 2, 2 },
        bc_test_params{ CASE_CONV_FP16_2, 2, 2 },
        bc_test_params{ CASE_CONV_FP16_3, 2, 2 },
        bc_test_params{ CASE_CONV_FP16_4, 2, 2 },
        bc_test_params{ CASE_CONV_FP16_5, 2, 2 },
        bc_test_params{ CASE_CONV_FP16_13, 2, 2 }
    }));
|
||||||
|
|
||||||
|
class conv_fp32_reorder_fsv16_to_bfyx_conv : public ConvFusingTest {};
|
||||||
|
TEST_P(conv_fp32_reorder_fsv16_to_bfyx_conv, basic) {
|
||||||
|
auto p = GetParam();
|
||||||
|
|
||||||
|
auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3));
|
||||||
|
auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor };
|
||||||
|
auto dw_stride = tensor{ 0, 0, 1, 1 };
|
||||||
|
|
||||||
|
create_topologies(input_layout("input", get_input_layout(p)),
|
||||||
|
data("weights", get_mem(get_weights_layout(p), -127, 127)),
|
||||||
|
data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
|
||||||
|
reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32),
|
||||||
|
convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
|
||||||
|
reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32),
|
||||||
|
convolution("conv_output", "reorder_bfyx", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation),
|
||||||
|
activation("activation", "conv_output", activation_func::abs),
|
||||||
|
reorder("reorder_output", "activation", p.default_format, data_types::f32)
|
||||||
|
);
|
||||||
|
|
||||||
|
execute(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(
    fusings_gpu,
    conv_fp32_reorder_fsv16_to_bfyx_conv,
    ::testing::ValuesIn(std::vector<bc_test_params>{
        // FP32 convolution cases
        bc_test_params{ CASE_CONV_FP32_1, 3, 4 },
        bc_test_params{ CASE_CONV_FP32_2, 3, 4 },
        bc_test_params{ CASE_CONV_FP32_3, 3, 4 },
        bc_test_params{ CASE_CONV_FP32_4, 3, 4 },
        bc_test_params{ CASE_CONV_FP32_5, 3, 4 },
        bc_test_params{ CASE_CONV_FP32_14, 3, 4 },

        // FP16 convolution cases
        bc_test_params{ CASE_CONV_FP16_1, 3, 4 },
        bc_test_params{ CASE_CONV_FP16_2, 3, 4 },
        bc_test_params{ CASE_CONV_FP16_3, 3, 4 },
        bc_test_params{ CASE_CONV_FP16_4, 3, 4 },
        bc_test_params{ CASE_CONV_FP16_5, 3, 4 },
        bc_test_params{ CASE_CONV_FP16_13, 3, 4 },
    }));
|
||||||
|
|
||||||
class conv_fp32_activation : public ConvFusingTest {};
|
class conv_fp32_activation : public ConvFusingTest {};
|
||||||
TEST_P(conv_fp32_activation, basic) {
|
TEST_P(conv_fp32_activation, basic) {
|
||||||
auto p = GetParam();
|
auto p = GetParam();
|
||||||
@ -8279,9 +8346,6 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_nd_update_scale_activation_eltwise
|
|||||||
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_4, 2, 5 },
|
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_4, 2, 5 },
|
||||||
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_5, 2, 5 },
|
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_5, 2, 5 },
|
||||||
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_6, 2, 5 },
|
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_6, 2, 5 },
|
||||||
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_7, 2, 5 },
|
|
||||||
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_9, 2, 5 },
|
|
||||||
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_8, 2, 5 },
|
|
||||||
|
|
||||||
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_1, 2, 5 },
|
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_1, 2, 5 },
|
||||||
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_2, 2, 5 },
|
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_2, 2, 5 },
|
||||||
|
Loading…
Reference in New Issue
Block a user