[GPU] Enable crop for shape agnostic kernel (#15866)

* Enable crop shape agnostic kernel

* Added unit test

* Added new scalar argument for crop (eltwise) for being used as runtime input offset in shape agnostic kernel

* Fix eltwise to have runtime offset only for crop

* Fix unittest error

* Applied review comment
This commit is contained in:
Taylor Yeonbok Lee
2023-02-25 15:49:46 -08:00
committed by GitHub
parent 15990afea2
commit fabf67ee5e
5 changed files with 93 additions and 18 deletions

View File

@@ -26,19 +26,60 @@ struct crop_impl : typed_primitive_impl_ocl<crop> {
}
public:
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
auto params = get_default_params<kernel_selector::eltwise_params>(impl_param);
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<crop>();
auto params = get_default_params<kernel_selector::eltwise_params>(impl_param, is_shape_agnostic);
auto optional_params = get_default_optional_params<kernel_selector::eltwise_optional_params>(impl_param.get_program());
params.operations.push_back({{kernel_selector::eltwise_params::InputType::Buffer(0)}, kernel_selector::eltwise_mode::ASSIGN});
params.inputs[0] = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]);
if (impl_param.get_program().get_node(primitive->id).is_dynamic()) {
// WA to always match compiled dynamic kernel with dispatch data
// W/O enforcing this option we may generate kernel for "broadcast" scenario due to unmatched tensor dimensions
// but in runtime dispatch data will be generated for non-broadcast case as shapes are actually same.
params.broadcast = true;
} else {
params.inputs[0] = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]);
}
return {params, optional_params};
}
// Refreshes the shape-agnostic kernel's dispatch data for the current runtime shapes.
// For dynamic crop the input offset cannot be baked into the compiled kernel, so it is
// passed as a scalar kernel argument that is updated here on every shape change.
void update_dispatch_data(const kernel_impl_params& impl_param) override {
// Re-derive kernel params in shape-agnostic mode (is_shape_agnostic = true).
auto kernel_params = get_kernel_params(impl_param, true);
// Linear offset of the first input element, computed from the actual runtime layout
// and the crop's input offset.
auto runtime_offset = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]).GetFirstElementOffset();
kernel_selector::ScalarDescriptor s;
s.t = kernel_selector::ScalarDescriptor::Types::UINT32;
s.v.u32 = runtime_offset;
// Exactly one scalar argument is expected to have been appended for the runtime
// offset when the shape-agnostic crop kernel was built (see GetCommonKernelsData).
OPENVINO_ASSERT(_kernel_data.kernels[0].params.scalars.size() == 1,
"[GPU] Scalar field for runtime offset is not added for crop shape agnostic impl");
_kernel_data.kernels[0].params.scalars[0] = s;
// Recompute dispatch data (e.g. work-group sizes) for the new shapes.
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
update_kernels_list_to_skip();
}
};
namespace detail {
attach_crop_impl::attach_crop_impl() {
auto dyn_types = {
data_types::f32,
data_types::f16,
data_types::i8,
data_types::u8,
data_types::i32,
data_types::i64
};
auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
};
implementation_map<crop>::add(impl_types::ocl,
shape_types::dynamic_shape,
typed_primitive_impl_ocl<crop>::create<crop_impl>,
dyn_types,
dyn_formats);
implementation_map<crop>::add(impl_types::ocl, typed_primitive_impl_ocl<crop>::create<crop_impl>, {
std::make_tuple(data_types::f32, format::yxfb),
std::make_tuple(data_types::f16, format::yxfb),

View File

@@ -19,6 +19,9 @@ KERNEL(eltwise)(
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
#if IS_DYNAMIC_CROP
, int runtime_offset
#endif
)
{

View File

@@ -217,7 +217,12 @@ void KernelBaseOpenCL::FillCLKernelData(clKernelData& kernel,
kernel.code.kernelString = GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode);
kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.params.arguments = GetArgsDesc(number_of_inputs, weights, bias, number_of_inputs_for_fused_prims, number_of_outputs, is_dynamic);
kernel.params.arguments = GetArgsDesc(number_of_inputs,
weights,
bias,
number_of_inputs_for_fused_prims,
number_of_outputs,
is_dynamic);
}
bool KernelBaseOpenCL::layout_is_one_of(const MultiDataTensor& tensors, const std::vector<DataLayout>& allowed_layouts) const {

View File

@@ -311,9 +311,13 @@ JitConstants EltwiseKernelBase::MakeLoadJitConstants(const eltwise_params& param
bool useVload8) const {
JitConstants jit = {};
std::string vload_decls;
for (size_t op_num = 0; op_num < params.operations.size(); op_num++) {
const std::string op_num_str = toCodeString(op_num);
const auto &ew = params.operations[op_num];
bool is_dynamic_crop_kernel = params.is_shape_agnostic && params.operations[op_num].mode == EltwiseMode::ASSIGN;
if (is_dynamic_crop_kernel)
jit.AddConstant(MakeJitConstant("IS_DYNAMIC_CROP", 1));
for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) {
const auto &input = ew.inputs[input_idx];
const std::string name = "INPUT_" + op_num_str + "_" + toCodeString(input_idx);
@@ -330,7 +334,7 @@ JitConstants EltwiseKernelBase::MakeLoadJitConstants(const eltwise_params& param
jit.AddConstant(MakeJitConstant(name,
"input" + toCodeString(input.index) +
"[GET_INDEX(INPUT, " + toCodeString(input.index) +
"," + idx_order + ")]"));
"," + idx_order + ") " + (is_dynamic_crop_kernel ? "+ runtime_offset]" : "]")));
break;
case EltwiseInputMode::OUTPUT_BUFFER:
jit.AddConstant(MakeJitConstant(name, "output[GET_INDEX(OUTPUT,,OUTPUT_IDX_ORDER)]"));
@@ -711,7 +715,13 @@ KernelsData EltwiseKernelBase::GetCommonKernelsData(const Params& params, const
GetFusedPrimitiveInputsCount(params),
1,
is_dynamic);
if (params.is_shape_agnostic && newParams.operations[0].mode == EltwiseMode::ASSIGN) {
kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0});
kernel_selector::ScalarDescriptor s;
s.t = kernel_selector::ScalarDescriptor::Types::UINT32;
s.v.u32 = 0;
kernel.params.scalars.push_back(s);
}
return {kd};
}
} // namespace kernel_selector

View File

@@ -8,6 +8,8 @@
#include <intel_gpu/primitives/eltwise.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include "crop_inst.h"
using namespace cldnn;
using namespace ::tests;
@@ -1342,9 +1344,9 @@ TEST(crop_gpu, dynamic_in1x4x1x1_split) {
topology.add(crop("crop1", { input_info("input"), input_info("data") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_1)), { tensor(feature(feature_offset_1), spatial(0,0),batch(0)) }, op_mode, 0, num_splits));
topology.add(crop("crop2", { input_info("input"), input_info("data") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_2)), { tensor(feature(feature_offset_2), spatial(0,0),batch(0)) }, op_mode, 1, num_splits));
std::vector<int32_t> input_vec = { -1, 2, -3, 4 };
std::vector<int32_t> out1 = { -1, 2 };
std::vector<int32_t> out2 = { -3, 4 };
std::vector<float> input_vec = { -1.0f, 2.0f, -3.0f, 4.0f };
std::vector<float> out1 = { -1.0f, 2.0f };
std::vector<float> out2 = { -3.0f, 4.0f };
set_values(input_mem, input_vec);
ExecutionConfig config;
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
@@ -1355,14 +1357,21 @@ TEST(crop_gpu, dynamic_in1x4x1x1_split) {
network.set_input_data("input", input_mem);
auto outputs = network.execute();
auto output = outputs.at("crop1").get_memory();
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
auto impl1 = network.get_primitive("crop1")->get_impl();
ASSERT_TRUE(impl1 != nullptr);
ASSERT_TRUE(impl1->is_dynamic());
auto impl2 = network.get_primitive("crop2")->get_impl();
ASSERT_TRUE(impl2 != nullptr);
ASSERT_TRUE(impl2->is_dynamic());
auto output1 = outputs.at("crop1").get_memory();
cldnn::mem_lock<float> output_ptr_1(output1, get_test_stream());
for (size_t i = 0; i < out1.size(); i++)
ASSERT_EQ(output_ptr[i], out1[i]);
ASSERT_EQ(output_ptr_1[i], out1[i]);
auto output_2 = outputs.at("crop2").get_memory();
cldnn::mem_lock<int32_t> output_ptr_2(output_2, get_test_stream());
cldnn::mem_lock<float> output_ptr_2(output_2, get_test_stream());
for (size_t i = 0; i < out2.size(); i++)
ASSERT_EQ(output_ptr_2[i], out2[i]);
@@ -1399,9 +1408,9 @@ TEST(crop_gpu, dynamic_in1x4x1x1_varaidic_split) {
topology.add(crop("crop1", { input_info("input"), input_info("axis"), input_info("splits_length") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_1)), { tensor(feature(feature_offset_1), spatial(0,0),batch(0)) }, op_mode, 0));
topology.add(crop("crop2", { input_info("input"), input_info("axis"), input_info("splits_length") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_2)), { tensor(feature(feature_offset_2), spatial(0,0),batch(0)) }, op_mode, 1));
std::vector<int32_t> input_vec = { -1, 2, -3, 4 };
std::vector<int32_t> out1 = { -1, 2, -3 };
std::vector<int32_t> out2 = { 4 };
std::vector<float> input_vec = { -1.0f, 2.0f, -3.0f, 4.0f };
std::vector<float> out1 = { -1.0f, 2.0f, -3.0f };
std::vector<float> out2 = { 4.0f };
std::vector<int64_t> splits_vec = {3, 1};
set_values(input_mem, input_vec);
@@ -1417,14 +1426,21 @@ TEST(crop_gpu, dynamic_in1x4x1x1_varaidic_split) {
network.set_input_data("input", input_mem);
auto outputs = network.execute();
auto impl1 = network.get_primitive("crop1")->get_impl();
ASSERT_TRUE(impl1 != nullptr);
ASSERT_TRUE(impl1->is_dynamic());
auto impl2 = network.get_primitive("crop2")->get_impl();
ASSERT_TRUE(impl2 != nullptr);
ASSERT_TRUE(impl2->is_dynamic());
auto output = outputs.at("crop1").get_memory();
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
for (size_t i = 0; i < out1.size(); i++)
ASSERT_EQ(output_ptr[i], out1[i]);
auto output_2 = outputs.at("crop2").get_memory();
cldnn::mem_lock<int32_t> output_ptr_2(output_2, get_test_stream());
cldnn::mem_lock<float> output_ptr_2(output_2, get_test_stream());
for (size_t i = 0; i < out2.size(); i++)
ASSERT_EQ(output_ptr_2[i], out2[i]);