[GPU] Added shape agnostic ref kernel for Reduce and Quantize (#15183)
* [GPU] Added shape agnostic ref kernel for quantize * [GPU] Added shape agnostic ref kernel for Reduce
This commit is contained in:
parent
3bbaa16d90
commit
8e073819c3
@ -43,6 +43,8 @@ protected:
|
||||
}
|
||||
}
|
||||
args.outputs = { instance.output_memory_ptr() };
|
||||
args.shape_info = instance.shape_info_memory_ptr();
|
||||
|
||||
return args;
|
||||
}
|
||||
|
||||
@ -81,122 +83,74 @@ public:
|
||||
for (size_t i = 1; i < arg.inputs_count(); i++) {
|
||||
quantize_params.inputs.push_back(convert_data_tensor(impl_param.input_layouts[i]));
|
||||
}
|
||||
const auto& output_layout = impl_param.get_output_layout();
|
||||
quantize_params.outputs = { convert_data_tensor(output_layout) };
|
||||
|
||||
auto& kernel_selector = kernel_selector::quantize_kernel_selector::Instance();
|
||||
auto best_kernel = kernel_selector.get_best_kernel(quantize_params, quantize_optional_params);
|
||||
|
||||
return make_unique<quantize_impl>(best_kernel);
|
||||
}
|
||||
|
||||
void update_dispatch_data(const kernel_impl_params& impl_param) override {
|
||||
auto quantize_params = get_default_params<kernel_selector::quantize_params>(impl_param);
|
||||
const auto& output_layout = impl_param.get_output_layout();
|
||||
quantize_params.packed_binary_output = output_layout.data_type == data_types::bin;
|
||||
(_kernel_data.update_dispatch_data_func)(quantize_params, _kernel_data);
|
||||
}
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
attach_quantize_impl::attach_quantize_impl() {
|
||||
implementation_map<quantize>::add(impl_types::ocl, quantize_impl::create, {
|
||||
std::make_tuple(data_types::f16, format::fs_b_yx_fsv32),
|
||||
std::make_tuple(data_types::f32, format::fs_b_yx_fsv32),
|
||||
std::make_tuple(data_types::i8, format::fs_b_yx_fsv32),
|
||||
std::make_tuple(data_types::u8, format::fs_b_yx_fsv32),
|
||||
std::set<implementation_map<quantize>::key_type> keys;
|
||||
|
||||
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
|
||||
auto types = {
|
||||
data_types::f16,
|
||||
data_types::f32,
|
||||
data_types::i8,
|
||||
data_types::u8
|
||||
};
|
||||
|
||||
std::make_tuple(data_types::f32, format::byxf),
|
||||
std::make_tuple(data_types::f16, format::byxf),
|
||||
std::make_tuple(data_types::u8, format::byxf),
|
||||
std::make_tuple(data_types::i8, format::byxf),
|
||||
auto formats = {
|
||||
format::bfyx,
|
||||
format::byxf,
|
||||
format::b_fs_yx_fsv4,
|
||||
format::b_fs_yx_fsv16,
|
||||
format::b_fs_yx_fsv32,
|
||||
format::fs_b_yx_fsv32,
|
||||
format::bs_fs_yx_bsv16_fsv16,
|
||||
format::bs_fs_yx_bsv16_fsv32,
|
||||
format::bs_fs_yx_bsv32_fsv16,
|
||||
format::bs_fs_yx_bsv32_fsv32,
|
||||
|
||||
std::make_tuple(data_types::f32, format::b_fs_yx_fsv4),
|
||||
std::make_tuple(data_types::f16, format::b_fs_yx_fsv4),
|
||||
std::make_tuple(data_types::u8, format::b_fs_yx_fsv4),
|
||||
std::make_tuple(data_types::i8, format::b_fs_yx_fsv4),
|
||||
format::bfzyx,
|
||||
format::b_fs_zyx_fsv16,
|
||||
format::b_fs_zyx_fsv32,
|
||||
format::bs_fs_zyx_bsv16_fsv16,
|
||||
format::bs_fs_zyx_bsv16_fsv32,
|
||||
format::bs_fs_zyx_bsv32_fsv16,
|
||||
format::bs_fs_zyx_bsv32_fsv32,
|
||||
|
||||
std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
|
||||
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
|
||||
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
|
||||
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
|
||||
format::bfwzyx
|
||||
};
|
||||
|
||||
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv32),
|
||||
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv32),
|
||||
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv32),
|
||||
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv32),
|
||||
auto dyn_formats = {
|
||||
format::bfyx,
|
||||
format::bfzyx,
|
||||
format::bfwzyx
|
||||
};
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
|
||||
for (const auto type : types) {
|
||||
for (const auto format : formats) {
|
||||
keys.emplace(type, format);
|
||||
}
|
||||
}
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv32),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv16_fsv32),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv32),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv32),
|
||||
keys.emplace(data_types::f16, format::yxfb);
|
||||
keys.emplace(data_types::f32, format::yxfb);
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
|
||||
implementation_map<quantize>::add(impl_types::ocl, shape_types::static_shape, quantize_impl::create, keys);
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv32),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv32),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv32),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv32),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bfyx),
|
||||
std::make_tuple(data_types::f16, format::bfyx),
|
||||
std::make_tuple(data_types::i32, format::bfyx),
|
||||
std::make_tuple(data_types::u8, format::bfyx),
|
||||
std::make_tuple(data_types::i8, format::bfyx),
|
||||
|
||||
std::make_tuple(data_types::f32, format::byxf),
|
||||
std::make_tuple(data_types::f16, format::byxf),
|
||||
std::make_tuple(data_types::i32, format::byxf),
|
||||
std::make_tuple(data_types::u8, format::byxf),
|
||||
std::make_tuple(data_types::i8, format::byxf),
|
||||
|
||||
std::make_tuple(data_types::f32, format::yxfb),
|
||||
std::make_tuple(data_types::f16, format::yxfb),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bfzyx),
|
||||
std::make_tuple(data_types::f16, format::bfzyx),
|
||||
std::make_tuple(data_types::i32, format::bfzyx),
|
||||
std::make_tuple(data_types::u8, format::bfzyx),
|
||||
std::make_tuple(data_types::i8, format::bfzyx),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bfwzyx),
|
||||
std::make_tuple(data_types::f16, format::bfwzyx),
|
||||
std::make_tuple(data_types::i32, format::bfwzyx),
|
||||
std::make_tuple(data_types::u8, format::bfwzyx),
|
||||
std::make_tuple(data_types::i8, format::bfwzyx),
|
||||
|
||||
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv32_fsv32),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv32_fsv32),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv32_fsv32),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv32_fsv32),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv16),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv16),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv32_fsv16),
|
||||
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv32_fsv16),
|
||||
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv32_fsv16),
|
||||
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv32_fsv16),
|
||||
});
|
||||
implementation_map<quantize>::add(impl_types::ocl, shape_types::dynamic_shape, quantize_impl::create, types, dyn_formats);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
@ -89,48 +89,50 @@ struct reduce_impl : typed_primitive_impl_ocl<reduce> {
|
||||
|
||||
return {params, optional_params};
|
||||
}
|
||||
|
||||
void update_dispatch_data(const kernel_impl_params& impl_param) override {
|
||||
auto kernel_params = get_kernel_params(impl_param);
|
||||
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
|
||||
}
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
attach_reduce_impl::attach_reduce_impl() {
|
||||
implementation_map<reduce>::add(impl_types::ocl, typed_primitive_impl_ocl<reduce>::create<reduce_impl>, {
|
||||
std::make_tuple(data_types::f32, format::bfyx),
|
||||
std::make_tuple(data_types::f16, format::bfyx),
|
||||
std::make_tuple(data_types::i32, format::bfyx),
|
||||
std::make_tuple(data_types::i8, format::bfyx),
|
||||
std::make_tuple(data_types::u8, format::bfyx),
|
||||
auto types = {
|
||||
data_types::f32,
|
||||
data_types::f16,
|
||||
data_types::i32,
|
||||
data_types::i8,
|
||||
data_types::u8
|
||||
};
|
||||
|
||||
std::make_tuple(data_types::f32, format::bfzyx),
|
||||
std::make_tuple(data_types::f16, format::bfzyx),
|
||||
std::make_tuple(data_types::i32, format::bfzyx),
|
||||
std::make_tuple(data_types::i8, format::bfzyx),
|
||||
std::make_tuple(data_types::u8, format::bfzyx),
|
||||
auto static_formats = {
|
||||
format::bfyx,
|
||||
format::bfzyx,
|
||||
format::bfwzyx,
|
||||
format::b_fs_yx_fsv16,
|
||||
format::b_fs_yx_fsv32,
|
||||
format::b_fs_zyx_fsv16
|
||||
};
|
||||
|
||||
std::make_tuple(data_types::f32, format::bfwzyx),
|
||||
std::make_tuple(data_types::f16, format::bfwzyx),
|
||||
std::make_tuple(data_types::i32, format::bfwzyx),
|
||||
std::make_tuple(data_types::i8, format::bfwzyx),
|
||||
std::make_tuple(data_types::u8, format::bfwzyx),
|
||||
implementation_map<reduce>::add(impl_types::ocl,
|
||||
shape_types::static_shape,
|
||||
typed_primitive_impl_ocl<reduce>::create<reduce_impl>,
|
||||
types,
|
||||
static_formats);
|
||||
|
||||
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::i32, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
|
||||
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
|
||||
auto dyn_formats = {
|
||||
format::bfyx,
|
||||
format::bfzyx,
|
||||
format::bfwzyx
|
||||
};
|
||||
|
||||
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::i32, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
|
||||
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
|
||||
|
||||
std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
|
||||
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
|
||||
std::make_tuple(data_types::i32, format::b_fs_yx_fsv32),
|
||||
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
|
||||
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
|
||||
});
|
||||
implementation_map<reduce>::add(impl_types::ocl,
|
||||
shape_types::dynamic_shape,
|
||||
typed_primitive_impl_ocl<reduce>::create<reduce_impl>,
|
||||
types,
|
||||
dyn_formats);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
@ -8,12 +8,14 @@
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
#endif
|
||||
__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2)))
|
||||
KERNEL(quantize_ref)(const __global INPUT0_TYPE* input,
|
||||
const __global INPUT1_TYPE* input_low,
|
||||
const __global INPUT2_TYPE* input_high,
|
||||
const __global INPUT3_TYPE* output_low,
|
||||
const __global INPUT4_TYPE* output_high,
|
||||
__global OUTPUT_TYPE* output)
|
||||
KERNEL(quantize_ref)(
|
||||
OPTIONAL_SHAPE_INFO_ARG
|
||||
const __global INPUT0_TYPE* input,
|
||||
const __global INPUT1_TYPE* input_low,
|
||||
const __global INPUT2_TYPE* input_high,
|
||||
const __global INPUT3_TYPE* output_low,
|
||||
const __global INPUT4_TYPE* output_high,
|
||||
__global OUTPUT_TYPE* output)
|
||||
{
|
||||
const int b = get_global_id(0);
|
||||
const int of = get_global_id(1);
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
inline uint FUNC(calc_linear_offset)(uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
inline uint FUNC(calc_linear_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
{
|
||||
uint index = b * OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z * OUTPUT_SIZE_W * OUTPUT_FEATURE_NUM +
|
||||
f * OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z * OUTPUT_SIZE_W +
|
||||
@ -17,6 +17,7 @@ inline uint FUNC(calc_linear_offset)(uint b, uint f, uint w, uint z, uint y, uin
|
||||
}
|
||||
|
||||
KERNEL(reduce_ref)(
|
||||
OPTIONAL_SHAPE_INFO_ARG
|
||||
const __global INPUT0_TYPE* data,
|
||||
__global OUTPUT_TYPE* output
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
@ -51,7 +52,7 @@ KERNEL(reduce_ref)(
|
||||
const uint out_idx = OUTPUT_GET_INDEX(b, f, w, z, y, x);
|
||||
#endif
|
||||
|
||||
const uint linear_idx = FUNC_CALL(calc_linear_offset)(b, f, w, z, y, x);
|
||||
const uint linear_idx = FUNC_CALL(calc_linear_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x);
|
||||
if (linear_idx >= COMPUTATIONAL_OPERATIONS_NUMBER)
|
||||
return;
|
||||
|
||||
|
@ -61,17 +61,25 @@ KernelsData QuantizeKernelBase::GetKernelsData(const Params& params, const optio
|
||||
return {};
|
||||
}
|
||||
|
||||
auto dispatchData = SetDefault(newParams, options);
|
||||
auto dispatchData = SetDefault(newParams);
|
||||
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
|
||||
auto cldnn_jit = GetJitConstants(newParams, dispatchData);
|
||||
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
|
||||
|
||||
kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
|
||||
const auto& prim_params = static_cast<const quantize_params&>(params);
|
||||
auto dispatchData = SetDefault(prim_params);
|
||||
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
|
||||
kd.kernels[0].params.workGroups.global = dispatchData.gws;
|
||||
kd.kernels[0].params.workGroups.local = dispatchData.lws;
|
||||
};
|
||||
|
||||
auto& kernel = kd.kernels[0];
|
||||
|
||||
kernel.params.workGroups.global = dispatchData.gws;
|
||||
kernel.params.workGroups.local = dispatchData.lws;
|
||||
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, EXE_MODE_DEFAULT);
|
||||
kernel.params.arguments = GetArgsDesc(static_cast<int>(newParams.inputs.size()), false, false);
|
||||
kernel.params.arguments = GetArgsDesc(static_cast<int>(newParams.inputs.size()), false, false, 0, 1, newParams.outputs[0].is_dynamic());
|
||||
|
||||
return {kd};
|
||||
}
|
||||
|
@ -19,6 +19,6 @@ public:
|
||||
|
||||
protected:
|
||||
virtual JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const;
|
||||
virtual CommonDispatchData SetDefault(const quantize_params& params, const optional_params&) const = 0;
|
||||
virtual CommonDispatchData SetDefault(const quantize_params& params) const = 0;
|
||||
};
|
||||
} // namespace kernel_selector
|
||||
|
@ -25,10 +25,11 @@ ParamsKey QuantizeKernelRef::GetSupportedKey() const {
|
||||
k.EnableBatching();
|
||||
k.EnableDifferentTypes();
|
||||
k.EnableQuantizePackedBinaryOutput();
|
||||
k.EnableDynamicShapesSupport();
|
||||
return k;
|
||||
}
|
||||
|
||||
CommonDispatchData QuantizeKernelRef::SetDefault(const quantize_params& params, const optional_params&) const {
|
||||
CommonDispatchData QuantizeKernelRef::SetDefault(const quantize_params& params) const {
|
||||
CommonDispatchData dispatchData;
|
||||
|
||||
auto output = params.outputs[0];
|
||||
|
@ -16,7 +16,7 @@ public:
|
||||
virtual ~QuantizeKernelRef() {}
|
||||
|
||||
JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
|
||||
CommonDispatchData SetDefault(const quantize_params& params, const optional_params&) const override;
|
||||
CommonDispatchData SetDefault(const quantize_params& params) const override;
|
||||
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
|
||||
bool Validate(const Params& p, const optional_params& o) const override;
|
||||
ParamsKey GetSupportedKey() const override;
|
||||
|
@ -31,7 +31,7 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const {
|
||||
return k;
|
||||
}
|
||||
|
||||
CommonDispatchData QuantizeKernelScaleShift::SetDefault(const quantize_params& params, const optional_params&) const {
|
||||
CommonDispatchData QuantizeKernelScaleShift::SetDefault(const quantize_params& params) const {
|
||||
CommonDispatchData dispatchData;
|
||||
|
||||
auto output = params.outputs[0];
|
||||
|
@ -16,7 +16,7 @@ public:
|
||||
virtual ~QuantizeKernelScaleShift() {}
|
||||
|
||||
JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
|
||||
CommonDispatchData SetDefault(const quantize_params& params, const optional_params&) const override;
|
||||
CommonDispatchData SetDefault(const quantize_params& params) const override;
|
||||
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
|
||||
bool Validate(const Params& p, const optional_params& o) const override;
|
||||
ParamsKey GetSupportedKey() const override;
|
||||
|
@ -108,7 +108,7 @@ DeviceFeaturesKey ReduceKernel_b_fs_yx_fsv16::get_required_device_features_key(c
|
||||
return k;
|
||||
}
|
||||
|
||||
CommonDispatchData ReduceKernel_b_fs_yx_fsv16::SetDefault(const reduce_params& params, const optional_params&) const {
|
||||
CommonDispatchData ReduceKernel_b_fs_yx_fsv16::SetDefault(const reduce_params& params) const {
|
||||
CommonDispatchData dispatchData;
|
||||
|
||||
auto in_dims = calc_in_dims(params);
|
||||
|
@ -12,7 +12,7 @@ class ReduceKernel_b_fs_yx_fsv16 : public ReduceKernelBase {
|
||||
public:
|
||||
ReduceKernel_b_fs_yx_fsv16() : ReduceKernelBase("reduce_gpu_b_fs_yx_fsv16") {}
|
||||
virtual ~ReduceKernel_b_fs_yx_fsv16() {}
|
||||
CommonDispatchData SetDefault(const reduce_params& params, const optional_params&) const override;
|
||||
CommonDispatchData SetDefault(const reduce_params& params) const override;
|
||||
JitConstants GetJitConstants(const reduce_params& params) const override;
|
||||
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
|
||||
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
|
||||
|
@ -28,7 +28,29 @@ bool ReduceKernelBase::Validate(const Params& p, const optional_params&) const {
|
||||
JitConstants ReduceKernelBase::GetJitConstants(const reduce_params& params) const {
|
||||
JitConstants jit = MakeBaseParamsJitConstants(params);
|
||||
|
||||
jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", params.outputs[0].LogicalSize()));
|
||||
const auto& output = params.outputs[0];
|
||||
if (output.is_dynamic()) {
|
||||
size_t output_offset = (1 + GetFusedPrimitiveInputsCount(params)) * 6;
|
||||
auto x = toCodeString(output.X(), output_offset + 5);
|
||||
auto y = toCodeString(output.Y(), output_offset + 4);
|
||||
auto z = toCodeString(output.Z(), output_offset + 3);
|
||||
auto w = toCodeString(output.W(), output_offset + 2);
|
||||
auto f = toCodeString(output.Feature(), output_offset + 1);
|
||||
auto b = toCodeString(output.Batch(), output_offset);
|
||||
|
||||
auto multiply = [](std::vector<std::string> dims) -> std::string {
|
||||
std::string res = "(";
|
||||
for (size_t i = 0; i < dims.size() - 1; ++i) {
|
||||
res += dims[i] + "*";
|
||||
}
|
||||
res += dims.back() + ")";
|
||||
return res;
|
||||
};
|
||||
jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", multiply({x, y, z, w, f, b})));
|
||||
} else {
|
||||
jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", params.outputs[0].LogicalSize()));
|
||||
}
|
||||
|
||||
jit.AddConstant(MakeJitConstant("REDUCE_" + toString(params.reduceMode) + "_MODE", 1));
|
||||
jit.AddConstant(MakeJitConstant("KEEP_DIMS", params.keepDims));
|
||||
|
||||
@ -226,7 +248,7 @@ KernelsData ReduceKernelBase::GetCommonKernelsData(const Params& p,
|
||||
}
|
||||
|
||||
const reduce_params& params = static_cast<const reduce_params&>(p);
|
||||
DispatchData dispatchData = SetDefault(params, options);
|
||||
DispatchData dispatchData = SetDefault(params);
|
||||
|
||||
KernelData kd = KernelData::Default<reduce_params>(params);
|
||||
|
||||
@ -234,6 +256,14 @@ KernelsData ReduceKernelBase::GetCommonKernelsData(const Params& p,
|
||||
auto entry_point = GetEntryPoint(kernelName, params.layerID, params, options);
|
||||
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
|
||||
|
||||
kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
|
||||
const auto& prim_params = static_cast<const reduce_params&>(params);
|
||||
auto dispatchData = SetDefault(prim_params);
|
||||
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
|
||||
kd.kernels[0].params.workGroups.global = dispatchData.gws;
|
||||
kd.kernels[0].params.workGroups.local = dispatchData.lws;
|
||||
};
|
||||
|
||||
auto& kernel = kd.kernels[0];
|
||||
FillCLKernelData(kernel,
|
||||
dispatchData,
|
||||
@ -245,7 +275,9 @@ KernelsData ReduceKernelBase::GetCommonKernelsData(const Params& p,
|
||||
false,
|
||||
false,
|
||||
1,
|
||||
GetFusedPrimitiveInputsCount(params));
|
||||
GetFusedPrimitiveInputsCount(params),
|
||||
1,
|
||||
params.inputs[0].is_dynamic());
|
||||
|
||||
return {kd};
|
||||
}
|
||||
|
@ -39,7 +39,7 @@ public:
|
||||
protected:
|
||||
bool Validate(const Params&, const optional_params&) const override;
|
||||
virtual JitConstants GetJitConstants(const reduce_params& params) const;
|
||||
virtual CommonDispatchData SetDefault(const reduce_params& params, const optional_params&) const = 0;
|
||||
virtual CommonDispatchData SetDefault(const reduce_params& params) const = 0;
|
||||
Datatype GetAccumulatorType(const reduce_params& p) const;
|
||||
Datatype GetFinalAccumulatorType(const reduce_params& p) const;
|
||||
Datatype GetActivationType(const reduce_params& params) const;
|
||||
|
@ -27,10 +27,11 @@ ParamsKey ReduceKernelRef::GetSupportedKey() const {
|
||||
k.EnableTensorPitches();
|
||||
k.EnableBatching();
|
||||
k.EnableDifferentTypes();
|
||||
k.EnableDynamicShapesSupport();
|
||||
return k;
|
||||
}
|
||||
|
||||
CommonDispatchData ReduceKernelRef::SetDefault(const reduce_params& params, const optional_params&) const {
|
||||
CommonDispatchData ReduceKernelRef::SetDefault(const reduce_params& params) const {
|
||||
CommonDispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.outputs[0].GetLayout();
|
||||
|
@ -12,7 +12,7 @@ class ReduceKernelRef : public ReduceKernelBase {
|
||||
public:
|
||||
ReduceKernelRef() : ReduceKernelBase("reduce_ref") {}
|
||||
virtual ~ReduceKernelRef() {}
|
||||
CommonDispatchData SetDefault(const reduce_params& params, const optional_params&) const override;
|
||||
CommonDispatchData SetDefault(const reduce_params& params) const override;
|
||||
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
|
||||
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
|
||||
ParamsKey GetSupportedKey() const override;
|
||||
|
@ -91,9 +91,7 @@ static void CreateReduceOp(Program& p, const std::shared_ptr<ngraph::Node>& op,
|
||||
auto reorder_prim = cldnn::reorder(reorderLayerName,
|
||||
cldnn::input_info(resultLayerName),
|
||||
out_format,
|
||||
out_dt,
|
||||
std::vector<float>(),
|
||||
cldnn::reorder_mean_mode::subtract);
|
||||
out_dt);
|
||||
p.add_primitive(*op, reorder_prim);
|
||||
}
|
||||
}
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/quantize.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
#include "quantize_inst.h"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
@ -582,6 +583,107 @@ TEST(quantize_gpu, quantize_levels_256_3d_unsigned) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(quantize_gpu, dynamic) {
|
||||
auto& engine = get_test_engine();
|
||||
|
||||
auto input = engine.allocate_memory({ { 1, 16, 2, 2 }, data_types::f32, format::bfyx });
|
||||
auto input_low = engine.allocate_memory({ { 1, 16, 1, 1 }, data_types::f32, format::bfyx });
|
||||
auto input_high = engine.allocate_memory({ { 1, 16, 1, 1 }, data_types::f32, format::bfyx });
|
||||
auto output_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
|
||||
auto output_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
|
||||
|
||||
layout in_dyn_layout { ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
|
||||
|
||||
set_values(input, { -1.0f, 2.0f, 3.0f, 4.0f,
|
||||
5.0f, 2.0f, 2.0f, 3.0f,
|
||||
4.0f, 6.0f, 3.0f, 3.0f,
|
||||
3.0f, 5.0f, 1.0f, 1.0f,
|
||||
|
||||
1.0f, 1.0f, 1.0f, 1.0f,
|
||||
4.0f, 6.0f, 3.0f, 3.0f,
|
||||
3.0f, 5.0f, 1.0f, 1.0f,
|
||||
1.0f, 1.0f, 1.0f, 1.0f,
|
||||
|
||||
-1.0f, 2.0f, 3.0f, 4.0f,
|
||||
5.0f, 2.0f, 2.0f, 3.0f,
|
||||
4.0f, 6.0f, 3.0f, 3.0f,
|
||||
3.0f, 5.0f, 1.0f, 1.0f,
|
||||
|
||||
1.0f, 1.0f, 1.0f, 1.0f,
|
||||
4.0f, 6.0f, 3.0f, 3.0f,
|
||||
3.0f, 5.0f, 1.0f, 1.0f,
|
||||
1.0f, 1.0f, 1.0f, 1.0f });
|
||||
|
||||
set_values(input_low, { 0.0f, 1.0f, 2.0f, 3.0f,
|
||||
4.0f, 5.0f, 6.0f, 7.0f,
|
||||
7.0f, 6.0f, 5.0f, 4.0f,
|
||||
3.0f, 2.0f, 1.0f, 0.0f });
|
||||
|
||||
set_values(input_high, { 0.0f, 1.0f, 2.0f, 3.0f,
|
||||
4.0f, 5.0f, 6.0f, 7.0f,
|
||||
7.0f, 6.0f, 5.0f, 4.0f,
|
||||
3.0f, 2.0f, 1.0f, 0.0f });
|
||||
|
||||
set_values(output_low, { -1.0f });
|
||||
set_values(output_high, { 1.0f });
|
||||
|
||||
// 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1
|
||||
// 1 1 1 1 0 1 0 0 0 0 1 1 0 1 1 1
|
||||
// 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1
|
||||
// 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1
|
||||
std::vector<float> ref_data = { -1, 1, 1, 1,
|
||||
1, 1, 1, 1,
|
||||
1, 1, 1, 1,
|
||||
-1, 1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, 1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
-1, 1, -1, -1,
|
||||
-1, 1, -1, -1,
|
||||
-1, -1, -1, -1,
|
||||
1, 1, 1, 1,
|
||||
1, 1, -1, -1,
|
||||
1, 1, 1, 1 };
|
||||
|
||||
topology topology;
|
||||
topology.add(
|
||||
input_layout("input", in_dyn_layout),
|
||||
data("input_low", input_low),
|
||||
data("input_high", input_high),
|
||||
data("output_low", output_low),
|
||||
data("output_high", output_high),
|
||||
quantize("quantize", input_info("input"), input_info("input_low"), input_info("input_high"), input_info("output_low"), input_info("output_high"), 2, data_types::f32)
|
||||
);
|
||||
|
||||
ExecutionConfig config;
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
network network(engine, topology, config);
|
||||
network.set_input_data("input", input);
|
||||
|
||||
auto inst = network.get_primitive("quantize");
|
||||
auto impl = inst->get_impl();
|
||||
ASSERT_TRUE(impl != nullptr);
|
||||
ASSERT_TRUE(impl->is_dynamic());
|
||||
|
||||
auto outputs = network.execute();
|
||||
|
||||
auto output = outputs.at("quantize").get_memory();
|
||||
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
|
||||
|
||||
// Check that layout and memory contains logical size of tensor
|
||||
ASSERT_EQ(output->count(), (size_t)64);
|
||||
ASSERT_EQ(output->get_layout().count(), (size_t)64);
|
||||
|
||||
ASSERT_EQ(output->size(), ref_data.size() * sizeof(uint32_t));
|
||||
|
||||
for (size_t i = 0; i < ref_data.size(); ++i) {
|
||||
ASSERT_EQ(output_ptr[i], ref_data[i]) << " index = " << i;
|
||||
}
|
||||
}
|
||||
|
||||
struct quantize_random_test_params {
|
||||
data_types input_type;
|
||||
data_types output_type;
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/reduce.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
#include "reduce_inst.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
@ -1641,6 +1642,43 @@ TEST(reduce_gpu, common_bfwzyx_log_sum_exp_keepdims) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(reduce_gpu, dynamic) {
|
||||
auto& engine = get_test_engine();
|
||||
auto input = engine.allocate_memory({data_types::f32, format::bfwzyx, {2, 3, 1, 1, 1, 1}});
|
||||
|
||||
layout in_dyn_layout { ov::PartialShape::dynamic(6), data_types::f32, format::bfwzyx };
|
||||
|
||||
set_values(input, {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f});
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input", in_dyn_layout));
|
||||
topology.add(reduce("reduce", input_info("input"), reduce_mode::prod, {1, 2}, 1));
|
||||
|
||||
ExecutionConfig config;
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
network network(engine, topology, config);
|
||||
network.set_input_data("input", input);
|
||||
|
||||
auto inst = network.get_primitive("reduce");
|
||||
auto impl = inst->get_impl();
|
||||
ASSERT_TRUE(impl != nullptr);
|
||||
ASSERT_TRUE(impl->is_dynamic());
|
||||
|
||||
auto outputs = network.execute();
|
||||
|
||||
ASSERT_EQ(outputs.size(), size_t(1));
|
||||
ASSERT_EQ(outputs.begin()->first, "reduce");
|
||||
|
||||
auto output = outputs.at("reduce").get_memory();
|
||||
|
||||
std::vector<float> ref_data = {0.0f, 60.0f};
|
||||
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
|
||||
|
||||
for (size_t i = 0; i < ref_data.size(); ++i) {
|
||||
ASSERT_TRUE(are_equal(ref_data[i], output_ptr[i]));
|
||||
}
|
||||
}
|
||||
|
||||
template <data_types InputT, data_types OutputT>
|
||||
class ReduceXYWithBigTensorTestBase : public ::testing::TestWithParam<TestParamType_general_reduce_gpu> {
|
||||
protected:
|
||||
|
Loading…
Reference in New Issue
Block a user