[GPU] Enable crop for shape agnostic kernel (#15866)
* Enable crop shape-agnostic kernel * Added unit test * Added new scalar argument for crop (eltwise) to be used as the runtime input offset in the shape-agnostic kernel * Fix eltwise to have the runtime offset only for crop * Fix unit test error * Applied review comments
This commit is contained in:
committed by
GitHub
parent
15990afea2
commit
fabf67ee5e
@@ -26,19 +26,60 @@ struct crop_impl : typed_primitive_impl_ocl<crop> {
|
||||
}
|
||||
|
||||
public:
|
||||
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
|
||||
auto params = get_default_params<kernel_selector::eltwise_params>(impl_param);
|
||||
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
|
||||
const auto& primitive = impl_param.typed_desc<crop>();
|
||||
auto params = get_default_params<kernel_selector::eltwise_params>(impl_param, is_shape_agnostic);
|
||||
auto optional_params = get_default_optional_params<kernel_selector::eltwise_optional_params>(impl_param.get_program());
|
||||
|
||||
params.operations.push_back({{kernel_selector::eltwise_params::InputType::Buffer(0)}, kernel_selector::eltwise_mode::ASSIGN});
|
||||
params.inputs[0] = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]);
|
||||
if (impl_param.get_program().get_node(primitive->id).is_dynamic()) {
|
||||
// WA to always match compiled dynamic kernel with dispatch data
|
||||
// W/O enforcing this option we may generate kernel for "broadcast" scenario due to unmatched tensor dimensions
|
||||
// but in runtime dispatch data will be generated for non-broadcast case as shapes are actually same.
|
||||
params.broadcast = true;
|
||||
} else {
|
||||
params.inputs[0] = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]);
|
||||
}
|
||||
return {params, optional_params};
|
||||
}
|
||||
// Refreshes the dispatch data of the already-built kernel for the shapes
// carried by `impl_param`, and stores the current input offset in the
// kernel's single scalar argument.
//
// Steps, in order:
//   1. rebuild kernel params via get_kernel_params(impl_param, true);
//   2. compute the first-element offset of the input tensor;
//   3. write that offset into scalars[0] of kernels[0];
//   4. invoke the stored update_dispatch_data_func;
//   5. call update_kernels_list_to_skip().
//
// Fails an OPENVINO_ASSERT if kernels[0] does not hold exactly one scalar.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
|
||||
// Second argument `true` is forwarded to get_kernel_params' optional bool parameter.
auto kernel_params = get_kernel_params(impl_param, true);
|
||||
// Offset of the first element of the input, derived from the input layout
// and impl_param.input_offsets[0].
auto runtime_offset = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]).GetFirstElementOffset();
|
||||
kernel_selector::ScalarDescriptor s;
|
||||
s.t = kernel_selector::ScalarDescriptor::Types::UINT32;
|
||||
// NOTE(review): the offset is stored in the u32 member; if
// GetFirstElementOffset() returns a wider type this narrows implicitly — confirm.
s.v.u32 = runtime_offset;
|
||||
// Exactly one scalar slot must already exist on kernels[0]; it is overwritten below.
OPENVINO_ASSERT(_kernel_data.kernels[0].params.scalars.size() == 1,
|
||||
"[GPU] Scalar field for runtime offset is not added for crop shape agnostic impl");
|
||||
_kernel_data.kernels[0].params.scalars[0] = s;
|
||||
// Recompute dispatch data from the freshly built params (first element of the returned pair).
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
|
||||
update_kernels_list_to_skip();
|
||||
}
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
attach_crop_impl::attach_crop_impl() {
|
||||
auto dyn_types = {
|
||||
data_types::f32,
|
||||
data_types::f16,
|
||||
data_types::i8,
|
||||
data_types::u8,
|
||||
data_types::i32,
|
||||
data_types::i64
|
||||
};
|
||||
|
||||
auto dyn_formats = {
|
||||
format::bfyx,
|
||||
format::bfzyx,
|
||||
format::bfwzyx
|
||||
};
|
||||
|
||||
implementation_map<crop>::add(impl_types::ocl,
|
||||
shape_types::dynamic_shape,
|
||||
typed_primitive_impl_ocl<crop>::create<crop_impl>,
|
||||
dyn_types,
|
||||
dyn_formats);
|
||||
|
||||
implementation_map<crop>::add(impl_types::ocl, typed_primitive_impl_ocl<crop>::create<crop_impl>, {
|
||||
std::make_tuple(data_types::f32, format::yxfb),
|
||||
std::make_tuple(data_types::f16, format::yxfb),
|
||||
|
||||
@@ -19,6 +19,9 @@ KERNEL(eltwise)(
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
, FUSED_OPS_DECLS
|
||||
#endif
|
||||
#if IS_DYNAMIC_CROP
|
||||
, int runtime_offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
|
||||
|
||||
@@ -217,7 +217,12 @@ void KernelBaseOpenCL::FillCLKernelData(clKernelData& kernel,
|
||||
kernel.code.kernelString = GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode);
|
||||
kernel.params.workGroups.global = dispatchData.gws;
|
||||
kernel.params.workGroups.local = dispatchData.lws;
|
||||
kernel.params.arguments = GetArgsDesc(number_of_inputs, weights, bias, number_of_inputs_for_fused_prims, number_of_outputs, is_dynamic);
|
||||
kernel.params.arguments = GetArgsDesc(number_of_inputs,
|
||||
weights,
|
||||
bias,
|
||||
number_of_inputs_for_fused_prims,
|
||||
number_of_outputs,
|
||||
is_dynamic);
|
||||
}
|
||||
|
||||
bool KernelBaseOpenCL::layout_is_one_of(const MultiDataTensor& tensors, const std::vector<DataLayout>& allowed_layouts) const {
|
||||
|
||||
@@ -311,9 +311,13 @@ JitConstants EltwiseKernelBase::MakeLoadJitConstants(const eltwise_params& param
|
||||
bool useVload8) const {
|
||||
JitConstants jit = {};
|
||||
std::string vload_decls;
|
||||
|
||||
for (size_t op_num = 0; op_num < params.operations.size(); op_num++) {
|
||||
const std::string op_num_str = toCodeString(op_num);
|
||||
const auto &ew = params.operations[op_num];
|
||||
bool is_dynamic_crop_kernel = params.is_shape_agnostic && params.operations[op_num].mode == EltwiseMode::ASSIGN;
|
||||
if (is_dynamic_crop_kernel)
|
||||
jit.AddConstant(MakeJitConstant("IS_DYNAMIC_CROP", 1));
|
||||
for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) {
|
||||
const auto &input = ew.inputs[input_idx];
|
||||
const std::string name = "INPUT_" + op_num_str + "_" + toCodeString(input_idx);
|
||||
@@ -330,7 +334,7 @@ JitConstants EltwiseKernelBase::MakeLoadJitConstants(const eltwise_params& param
|
||||
jit.AddConstant(MakeJitConstant(name,
|
||||
"input" + toCodeString(input.index) +
|
||||
"[GET_INDEX(INPUT, " + toCodeString(input.index) +
|
||||
"," + idx_order + ")]"));
|
||||
"," + idx_order + ") " + (is_dynamic_crop_kernel ? "+ runtime_offset]" : "]")));
|
||||
break;
|
||||
case EltwiseInputMode::OUTPUT_BUFFER:
|
||||
jit.AddConstant(MakeJitConstant(name, "output[GET_INDEX(OUTPUT,,OUTPUT_IDX_ORDER)]"));
|
||||
@@ -711,7 +715,13 @@ KernelsData EltwiseKernelBase::GetCommonKernelsData(const Params& params, const
|
||||
GetFusedPrimitiveInputsCount(params),
|
||||
1,
|
||||
is_dynamic);
|
||||
|
||||
if (params.is_shape_agnostic && newParams.operations[0].mode == EltwiseMode::ASSIGN) {
|
||||
kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0});
|
||||
kernel_selector::ScalarDescriptor s;
|
||||
s.t = kernel_selector::ScalarDescriptor::Types::UINT32;
|
||||
s.v.u32 = 0;
|
||||
kernel.params.scalars.push_back(s);
|
||||
}
|
||||
return {kd};
|
||||
}
|
||||
} // namespace kernel_selector
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
#include <intel_gpu/primitives/eltwise.hpp>
|
||||
#include <intel_gpu/primitives/reorder.hpp>
|
||||
|
||||
#include "crop_inst.h"
|
||||
|
||||
using namespace cldnn;
|
||||
using namespace ::tests;
|
||||
|
||||
@@ -1342,9 +1344,9 @@ TEST(crop_gpu, dynamic_in1x4x1x1_split) {
|
||||
topology.add(crop("crop1", { input_info("input"), input_info("data") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_1)), { tensor(feature(feature_offset_1), spatial(0,0),batch(0)) }, op_mode, 0, num_splits));
|
||||
topology.add(crop("crop2", { input_info("input"), input_info("data") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_2)), { tensor(feature(feature_offset_2), spatial(0,0),batch(0)) }, op_mode, 1, num_splits));
|
||||
|
||||
std::vector<int32_t> input_vec = { -1, 2, -3, 4 };
|
||||
std::vector<int32_t> out1 = { -1, 2 };
|
||||
std::vector<int32_t> out2 = { -3, 4 };
|
||||
std::vector<float> input_vec = { -1.0f, 2.0f, -3.0f, 4.0f };
|
||||
std::vector<float> out1 = { -1.0f, 2.0f };
|
||||
std::vector<float> out2 = { -3.0f, 4.0f };
|
||||
set_values(input_mem, input_vec);
|
||||
ExecutionConfig config;
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
@@ -1355,14 +1357,21 @@ TEST(crop_gpu, dynamic_in1x4x1x1_split) {
|
||||
network.set_input_data("input", input_mem);
|
||||
auto outputs = network.execute();
|
||||
|
||||
auto output = outputs.at("crop1").get_memory();
|
||||
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
|
||||
auto impl1 = network.get_primitive("crop1")->get_impl();
|
||||
ASSERT_TRUE(impl1 != nullptr);
|
||||
ASSERT_TRUE(impl1->is_dynamic());
|
||||
auto impl2 = network.get_primitive("crop2")->get_impl();
|
||||
ASSERT_TRUE(impl2 != nullptr);
|
||||
ASSERT_TRUE(impl2->is_dynamic());
|
||||
|
||||
auto output1 = outputs.at("crop1").get_memory();
|
||||
cldnn::mem_lock<float> output_ptr_1(output1, get_test_stream());
|
||||
|
||||
for (size_t i = 0; i < out1.size(); i++)
|
||||
ASSERT_EQ(output_ptr[i], out1[i]);
|
||||
ASSERT_EQ(output_ptr_1[i], out1[i]);
|
||||
|
||||
auto output_2 = outputs.at("crop2").get_memory();
|
||||
cldnn::mem_lock<int32_t> output_ptr_2(output_2, get_test_stream());
|
||||
cldnn::mem_lock<float> output_ptr_2(output_2, get_test_stream());
|
||||
|
||||
for (size_t i = 0; i < out2.size(); i++)
|
||||
ASSERT_EQ(output_ptr_2[i], out2[i]);
|
||||
@@ -1399,9 +1408,9 @@ TEST(crop_gpu, dynamic_in1x4x1x1_varaidic_split) {
|
||||
topology.add(crop("crop1", { input_info("input"), input_info("axis"), input_info("splits_length") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_1)), { tensor(feature(feature_offset_1), spatial(0,0),batch(0)) }, op_mode, 0));
|
||||
topology.add(crop("crop2", { input_info("input"), input_info("axis"), input_info("splits_length") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_2)), { tensor(feature(feature_offset_2), spatial(0,0),batch(0)) }, op_mode, 1));
|
||||
|
||||
std::vector<int32_t> input_vec = { -1, 2, -3, 4 };
|
||||
std::vector<int32_t> out1 = { -1, 2, -3 };
|
||||
std::vector<int32_t> out2 = { 4 };
|
||||
std::vector<float> input_vec = { -1.0f, 2.0f, -3.0f, 4.0f };
|
||||
std::vector<float> out1 = { -1.0f, 2.0f, -3.0f };
|
||||
std::vector<float> out2 = { 4.0f };
|
||||
std::vector<int64_t> splits_vec = {3, 1};
|
||||
|
||||
set_values(input_mem, input_vec);
|
||||
@@ -1417,14 +1426,21 @@ TEST(crop_gpu, dynamic_in1x4x1x1_varaidic_split) {
|
||||
network.set_input_data("input", input_mem);
|
||||
auto outputs = network.execute();
|
||||
|
||||
auto impl1 = network.get_primitive("crop1")->get_impl();
|
||||
ASSERT_TRUE(impl1 != nullptr);
|
||||
ASSERT_TRUE(impl1->is_dynamic());
|
||||
auto impl2 = network.get_primitive("crop2")->get_impl();
|
||||
ASSERT_TRUE(impl2 != nullptr);
|
||||
ASSERT_TRUE(impl2->is_dynamic());
|
||||
|
||||
auto output = outputs.at("crop1").get_memory();
|
||||
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
|
||||
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
|
||||
|
||||
for (size_t i = 0; i < out1.size(); i++)
|
||||
ASSERT_EQ(output_ptr[i], out1[i]);
|
||||
|
||||
auto output_2 = outputs.at("crop2").get_memory();
|
||||
cldnn::mem_lock<int32_t> output_ptr_2(output_2, get_test_stream());
|
||||
cldnn::mem_lock<float> output_ptr_2(output_2, get_test_stream());
|
||||
|
||||
for (size_t i = 0; i < out2.size(); i++)
|
||||
ASSERT_EQ(output_ptr_2[i], out2[i]);
|
||||
|
||||
Reference in New Issue
Block a user