From 5ba60f845e56805632c8174bdfa82021582b4b22 Mon Sep 17 00:00:00 2001
From: Roman Lyamin
Date: Thu, 14 Sep 2023 09:59:53 +0400
Subject: [PATCH] [GPU] Added zero input support for Pad (#19720)

---
 .../intel_gpu/src/graph/impls/ocl/border.cpp  | 25 +++++
 .../intel_gpu/src/graph/primitive_inst.cpp    |  4 +-
 .../intel_gpu/src/graph/strided_slice.cpp     |  4 +-
 .../kernels/border/border_kernel_base.cpp     |  2 +-
 .../tests/unit/test_cases/border_gpu_test.cpp | 99 ++++++++++++++++++-
 5 files changed, 126 insertions(+), 8 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp
index cfda9a4dd97..c3d84f6269f 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/border.cpp
@@ -109,6 +109,31 @@ struct border_impl : typed_primitive_impl_ocl<border> {
         auto kernel_params = get_kernel_params(impl_param, true);
         (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
     }
+
+protected:
+    kernel_arguments_data get_arguments(const typed_primitive_inst<border>& instance) const override {
+        kernel_arguments_data args = parent::get_arguments(instance);
+
+        // In case of zero input shape and non-zero output (kernel execution is not skipped), we need to add a fake input buffer
+        // so as not to get an error during the argument setting stage
+        if (instance.get_input_layout().count() == 0) {
+            args.inputs[0] = instance.get_intermediates_memories().front();
+        }
+
+        return args;
+    }
+
+    std::vector<layout> get_internal_buffer_layouts_impl() const override {
+        const auto& prim_params = static_cast<const kernel_selector::border_params&>(*_kernel_data.params);
+        std::vector<layout> layouts;
+
+        if (prim_params.inputs[0].LogicalSize() == 0) {
+            layout any_layout = {data_types::u8, format::bfyx, {1, 1, 1, 1}};
+            layouts.push_back(any_layout);
+        }
+
+        return layouts;
+    }
 };
 
 namespace detail {
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 271bf2d3389..5819d1c22e3 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -773,11 +773,11 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
     update_shape();
 
     // Check successor reorder if layouts are same
-    // Need to set can_be_optimized for user reorder at predesescor because
+    // Need to set can_be_optimized for user reorder at predecessor because
    // if the user is can_be_optimized and output node then current nodes' output should be allocated to host.
     do_runtime_skip_reorder();
     if (_impl_params->output_layouts[0].count() == 0) {
-        GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping becuase output data is empty " << std::endl;
+        GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping because output data is empty " << std::endl;
         auto ev = get_network().get_stream().create_user_event(true);
         update_shape_done_by_other = false; // reset
         return ev;
diff --git a/src/plugins/intel_gpu/src/graph/strided_slice.cpp b/src/plugins/intel_gpu/src/graph/strided_slice.cpp
index 6d20706aa3b..9dd37819170 100644
--- a/src/plugins/intel_gpu/src/graph/strided_slice.cpp
+++ b/src/plugins/intel_gpu/src/graph/strided_slice.cpp
@@ -128,8 +128,8 @@ std::string strided_slice_inst::to_string(strided_slice_node const& node) {
     json_composite strided_slice_info;
     strided_slice_info.add("input id", input.id());
     std::vector<std::string> dependencies_info = {"begin_param id", "end_param id", "stride_param id"};
-    for (size_t i = 0; i < node.get_dependencies().size(); ++i) {
-        strided_slice_info.add(dependencies_info[i], node.get_dependency(i).id());
+    for (size_t i = 1; i < node.get_dependencies().size(); ++i) {
+        strided_slice_info.add(dependencies_info[i - 1], node.get_dependency(i).id());
     }
     strided_slice_info.add("begin", node.get_primitive()->begin);
     strided_slice_info.add("end", node.get_primitive()->end);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/border/border_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/border/border_kernel_base.cpp
index 83996379e78..4f51946e6c0 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/border/border_kernel_base.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/border/border_kernel_base.cpp
@@ -74,7 +74,7 @@ KernelsData BorderKernelBase::GetCommonKernelsData(const Params& params,
     auto dispatchData = SetDefault(prim_params);
     KernelData k_data = KernelData::Default<border_params>(params);
     k_data.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
-        const auto& prim_params = static_cast(params);
+        const auto& prim_params = static_cast(params);
         auto dispatchData = SetDefault(prim_params);
         OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
         kd.kernels[0].params.workGroups.global = dispatchData.gws;
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/border_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/border_gpu_test.cpp
index a8f30a0da42..aa43f2da007 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/border_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/border_gpu_test.cpp
@@ -1684,12 +1684,12 @@ public:
         blt_size_f = p.lt[1];
         blt_size_y = p.lt[2];
         blt_size_x = p.lt[3];
-        
+
         brb_size_b = p.rb[0];
         brb_size_f = p.rb[1];
         brb_size_y = p.rb[2];
         brb_size_x = p.rb[3];
-        
+
         out_size_b = in_size_b + blt_size_b + brb_size_b;
         out_size_f = in_size_f + blt_size_f + brb_size_f;
         out_size_y = in_size_y + blt_size_y + brb_size_y;
@@ -1825,5 +1825,98 @@ TEST_P(border_dynamic_test, border_dynamic_test) {}
 INSTANTIATE_TEST_SUITE_P(border_dynamic_test,
                          border_dynamic_test,
                          ::testing::ValuesIn(dynamic_params));
 
-}; // namespace
+TEST(border_gpu, basic_zero_input_dynamic) {
+    auto& engine = get_test_engine();
+
+    // WA to avoid crash due to attempt to allocate 0 bytes for USM memory
+    layout fake_input_layout = {{1}, data_types::bin, format::bfyx};
+    auto input = engine.allocate_memory(fake_input_layout);
+
+    layout zero_input_layout = {{0, 1}, data_types::f32, format::bfyx};
+    input = engine.reinterpret_buffer(*input, zero_input_layout);
+
+    layout input_layout_dynamic = {ov::PartialShape::dynamic(2), data_types::f32, format::bfyx};
+
+    ov::CoordinateDiff pads_begin = {4, 0};
+    ov::CoordinateDiff pads_end = {0, 0};
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(border("border", {input_info("input")}, 0, pads_begin, pads_end, ov::op::PadMode::CONSTANT, 1.0f));
+
+    std::vector<float> ref_output = {
+        1, 1, 1, 1
+    };
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    cldnn::network network(engine, topology, config);
+    network.set_input_data("input", input);
+
+    auto inst = network.get_primitive("border");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);
+    ASSERT_TRUE(impl->is_dynamic());
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "border");
+
+    auto output = outputs.at("border").get_memory();
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+    ASSERT_EQ(ref_output.size(), output_ptr.size());
+
+    for (size_t i = 0; i < output_ptr.size(); ++i) {
+        ASSERT_EQ(ref_output[i], output_ptr[i]);
+    }
+}
+
+TEST(border_gpu, basic_zero_input) {
+    auto& engine = get_test_engine();
+
+    // WA to avoid crash due to attempt to allocate 0 bytes for USM memory
+    layout fake_input_layout = {{1}, data_types::bin, format::bfyx};
+    auto input = engine.allocate_memory(fake_input_layout);
+
+    layout zero_input_layout = {{0, 1}, data_types::f32, format::bfyx};
+    input = engine.reinterpret_buffer(*input, zero_input_layout);
+
+    ov::CoordinateDiff pads_begin = {4, 0};
+    ov::PartialShape pads_begin_shape = { ov::Dimension(pads_begin.size()) };
+    auto pads_begin_input = engine.allocate_memory({pads_begin_shape, data_types::i32, format::bfyx});
+    set_values(pads_begin_input, pads_begin);
+
+    topology topology;
+    topology.add(input_layout("input", input->get_layout()));
+    topology.add(input_layout("pads_begin", pads_begin_input->get_layout()));
+    topology.add(border("border", {input_info("input"), input_info("pads_begin")},
+                        border::PAD_NON_CONST_INPUT::BEGIN,
+                        /*pads_begin*/{}, /*pads_end*/{0, 0},
+                        ov::op::PadMode::CONSTANT,
+                        2.0f));
+
+    std::vector<float> ref_output = {
+        2, 2, 2, 2
+    };
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    cldnn::network network(engine, topology, config);
+    network.set_input_data("input", input);
+    network.set_input_data("pads_begin", pads_begin_input);
+    auto outputs = network.execute();
+
+    auto output = outputs.at("border").get_memory();
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+    ASSERT_EQ(ref_output.size(), output_ptr.size());
+
+    for (size_t i = 0; i < output_ptr.size(); ++i) {
+        ASSERT_EQ(ref_output[i], output_ptr[i]);
+    }
+}
+}; // namespace
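
Editor's note (not part of the patch): the new unit tests exercise the zero-sized Pad input through the cldnn test harness. For illustration only, a minimal sketch of the same scenario through the public OpenVINO 2.0 C++ API is shown below; it assumes a "GPU" device is available, and the main() harness, constant values, and expected output shape are taken from the tests above, not from any file in this patch.

// Minimal sketch: CONSTANT-mode Pad whose data input has a zero-sized dimension
// ({0, 1}) but whose padded output ({4, 1}) is non-empty -- the case this patch targets.
#include <openvino/openvino.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/pad.hpp>
#include <openvino/op/parameter.hpp>

#include <memory>
#include <vector>

int main() {
    // Data input: shape {0, 1}, i.e. zero elements, but padding still produces output.
    auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{0, 1});
    auto pads_begin = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, std::vector<int64_t>{4, 0});
    auto pads_end = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, std::vector<int64_t>{0, 0});
    auto pad_value = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, std::vector<float>{1.0f});
    auto pad = std::make_shared<ov::op::v1::Pad>(data, pads_begin, pads_end, pad_value, ov::op::PadMode::CONSTANT);
    auto model = std::make_shared<ov::Model>(ov::OutputVector{pad}, ov::ParameterVector{data});

    ov::Core core;
    auto compiled = core.compile_model(model, "GPU");  // assumes a GPU device is present
    auto request = compiled.create_infer_request();

    // Empty input tensor: 0 * 1 elements.
    request.set_input_tensor(ov::Tensor(ov::element::f32, ov::Shape{0, 1}));
    request.infer();

    // Expected output: shape {4, 1}, every element equal to the pad value 1.0f.
    auto out = request.get_output_tensor();
    return out.get_shape() == ov::Shape{4, 1} ? 0 : 1;
}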