From 44cfbea9abb9d022b7be6ba2689510a87139d9b8 Mon Sep 17 00:00:00 2001 From: Andrew Kwangwoong Park Date: Wed, 5 Apr 2023 21:29:47 +0900 Subject: [PATCH] [GPU] Fix synchronization issue from wrong stream in multi-stream mode on dGPU (#16671) Signed-off-by: Andrew Park --- .../intel_gpu/graph/kernel_impl_params.hpp | 8 +++++- .../include/intel_gpu/graph/program.hpp | 1 + .../intel_gpu/src/graph/arg_max_min.cpp | 2 +- src/plugins/intel_gpu/src/graph/border.cpp | 8 +++--- src/plugins/intel_gpu/src/graph/broadcast.cpp | 2 +- src/plugins/intel_gpu/src/graph/crop.cpp | 6 ++-- .../intel_gpu/src/graph/deconvolution.cpp | 4 +-- .../graph/graph_optimizer/reorder_inputs.cpp | 3 ++ .../src/graph/impls/ocl/resample.cpp | 2 +- .../src/graph/include/primitive_inst.h | 1 + .../src/graph/include/program_node.h | 4 +-- .../src/graph/non_max_suppression.cpp | 2 +- src/plugins/intel_gpu/src/graph/non_zero.cpp | 4 +-- .../intel_gpu/src/graph/primitive_inst.cpp | 2 +- src/plugins/intel_gpu/src/graph/range.cpp | 6 ++-- src/plugins/intel_gpu/src/graph/resample.cpp | 4 +-- src/plugins/intel_gpu/src/graph/reshape.cpp | 2 +- .../intel_gpu/src/graph/strided_slice.cpp | 6 ++-- src/plugins/intel_gpu/src/graph/tile.cpp | 2 +- .../fusions/fully_connected_fusion_test.cpp | 28 ++++++++++++++++++- .../test_cases/multiple_streams_gpu_test.cpp | 14 +++++++++- 21 files changed, 80 insertions(+), 31 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp index a962eb402c0..17f30a3eecc 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp @@ -7,6 +7,7 @@ #include "intel_gpu/graph/serialization/binary_buffer.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/memory.hpp" +#include "intel_gpu/runtime/stream.hpp" #include "intel_gpu/runtime/utils.hpp" #include "intel_gpu/runtime/tensor.hpp" #include "intel_gpu/primitives/primitive.hpp" @@ -32,6 +33,7 @@ struct kernel_impl_params { bool has_runtime_layouts = false; const program *prog; + stream::ptr strm; std::shared_ptr desc; size_t unique_id; std::vector input_layouts; @@ -52,9 +54,10 @@ struct kernel_impl_params { std::map memory_deps = {}; size_t primary_input_idx = 0; - kernel_impl_params() : prog(nullptr), desc(nullptr), unique_id(0) {} + kernel_impl_params() : prog(nullptr), strm(nullptr), desc(nullptr), unique_id(0) {} kernel_impl_params(program& _prog, + stream::ptr _strm, std::shared_ptr _desc, size_t _uid, const std::vector& _in_layouts, @@ -62,6 +65,7 @@ struct kernel_impl_params { const std::vector& _fused_descs) : has_runtime_layouts(true) , prog(&_prog) + , strm(_strm) , desc(_desc) , unique_id(_uid) , input_layouts(_in_layouts) @@ -119,6 +123,8 @@ struct kernel_impl_params { OPENVINO_ASSERT(prog != nullptr, "[GPU] Program pointer in kernel_impl_params in not initialized"); return *prog; } + stream& get_stream() const { return *strm; } + stream::ptr get_stream_ptr() const { return strm; } size_t hash() const; bool operator==(const kernel_impl_params& rhs) const; diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp index 195469d4f43..c537b1335a7 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp @@ -151,6 +151,7 @@ public: nodes_ordering& get_processing_order(); uint32_t get_prog_id() { return prog_id; } stream& get_stream() { return *_stream; } + stream::ptr get_stream_ptr() const { return _stream; } const stream& get_stream() const { return *_stream; } const std::list& get_optimized_out() const { return optimized_out; } const std::list& get_optimized() const { return optimized; } diff --git a/src/plugins/intel_gpu/src/graph/arg_max_min.cpp b/src/plugins/intel_gpu/src/graph/arg_max_min.cpp index 21665eceafc..e83cba5b6c5 100644 --- a/src/plugins/intel_gpu/src/graph/arg_max_min.cpp +++ b/src/plugins/intel_gpu/src/graph/arg_max_min.cpp @@ -96,7 +96,7 @@ std::vector arg_max_min_inst::calc_output_layouts(arg_max_min_node const } else if (constant_mem.count(1)) { std::map const_data; auto target_shape_mem = constant_mem.at(1); - cldnn::mem_lock target_shape_lock(target_shape_mem, impl_param.prog->get_stream()); + cldnn::mem_lock target_shape_lock(target_shape_mem, impl_param.get_stream()); const_data.emplace(1, make_host_tensor(target_shape_mem->get_layout(), target_shape_lock.data())); ov::op::v1::shape_infer(&op, input_shapes, output_shapes, const_data); diff --git a/src/plugins/intel_gpu/src/graph/border.cpp b/src/plugins/intel_gpu/src/graph/border.cpp index bd80f5736fb..ce6d5edd4f1 100644 --- a/src/plugins/intel_gpu/src/graph/border.cpp +++ b/src/plugins/intel_gpu/src/graph/border.cpp @@ -59,18 +59,18 @@ std::vector border_inst::calc_output_layouts(border_node const& /*node*/ if ((is_begin_mem && memory_deps.count(1)) && (is_end_mem && memory_deps.count(2))) { auto pads_begin_mem = memory_deps.at(1); - cldnn::mem_lock pads_begin_lock(pads_begin_mem, impl_param.prog->get_stream()); + cldnn::mem_lock pads_begin_lock(pads_begin_mem, impl_param.get_stream()); const_data.emplace(1, make_host_tensor(pads_begin_mem->get_layout(), pads_begin_lock.data())); auto pads_end_mem = memory_deps.at(2); - cldnn::mem_lock pads_end_lock(pads_end_mem, impl_param.prog->get_stream()); + cldnn::mem_lock pads_end_lock(pads_end_mem, impl_param.get_stream()); const_data.emplace(2, make_host_tensor(pads_end_mem->get_layout(), pads_end_lock.data())); ov::op::v1::shape_infer(&op, input_shapes, output_shapes, const_data); } else if ((is_begin_mem || is_end_mem) && memory_deps.count(1)) { if (is_begin_mem) { auto pads_begin_mem = memory_deps.at(1); - cldnn::mem_lock pads_begin_lock(pads_begin_mem, impl_param.prog->get_stream()); + cldnn::mem_lock pads_begin_lock(pads_begin_mem, impl_param.get_stream()); const_data.emplace(1, make_host_tensor(pads_begin_mem->get_layout(), pads_begin_lock.data())); auto pads_end_data = desc->pads_end; @@ -84,7 +84,7 @@ std::vector border_inst::calc_output_layouts(border_node const& /*node*/ const_data.emplace(1, pads_begin_tensor); auto pads_end_mem = memory_deps.at(1); - cldnn::mem_lock pads_end_lock(pads_end_mem, impl_param.prog->get_stream()); + cldnn::mem_lock pads_end_lock(pads_end_mem, impl_param.get_stream()); const_data.emplace(2, make_host_tensor(pads_end_mem->get_layout(), pads_end_lock.data())); ov::op::v1::shape_infer(&op, input_shapes, output_shapes, const_data); diff --git a/src/plugins/intel_gpu/src/graph/broadcast.cpp b/src/plugins/intel_gpu/src/graph/broadcast.cpp index bdb2bb331ce..4cae5acdb3f 100644 --- a/src/plugins/intel_gpu/src/graph/broadcast.cpp +++ b/src/plugins/intel_gpu/src/graph/broadcast.cpp @@ -73,7 +73,7 @@ std::vector broadcast_inst::calc_output_layouts(broadcast_node const& /* auto& constant_mem = impl_param.memory_deps; if (constant_mem.count(1)) { auto target_shape_mem = constant_mem.at(1); - cldnn::mem_lock target_shape_lock(target_shape_mem, impl_param.prog->get_stream()); + cldnn::mem_lock target_shape_lock(target_shape_mem, impl_param.get_stream()); const_data.emplace(1, make_host_tensor(target_shape_mem->get_layout(), target_shape_lock.data())); ov::op::v3::shape_infer(&op, input_shapes, output_shapes, const_data); } else if (impl_param.input_layouts.size() == 1) { diff --git a/src/plugins/intel_gpu/src/graph/crop.cpp b/src/plugins/intel_gpu/src/graph/crop.cpp index 01b14c3337b..f55dadf335d 100644 --- a/src/plugins/intel_gpu/src/graph/crop.cpp +++ b/src/plugins/intel_gpu/src/graph/crop.cpp @@ -60,12 +60,12 @@ std::vector crop_inst::calc_output_layouts(const crop_node& /*node*/, co OPENVINO_ASSERT(impl_param.memory_deps.count(1) > 0, "[GPU] Can't find Crop(ngraph VariadicSplit op mode) axis values memory dependency"); auto axis_values_mem = impl_param.memory_deps.at(1); - cldnn::mem_lock axis_values_mem_lock(axis_values_mem, impl_param.prog->get_stream()); + cldnn::mem_lock axis_values_mem_lock(axis_values_mem, impl_param.get_stream()); const_data.emplace(1, make_host_tensor(axis_values_mem->get_layout(), axis_values_mem_lock.data())); OPENVINO_ASSERT(impl_param.memory_deps.count(2) > 0, "[GPU] Can't find Crop(ngraph VariadicSplit op mode) split length values memory dependency"); auto split_length_mem = impl_param.memory_deps.at(2); - cldnn::mem_lock split_length_mem_lock(split_length_mem, impl_param.prog->get_stream()); + cldnn::mem_lock split_length_mem_lock(split_length_mem, impl_param.get_stream()); const_data.emplace(2, make_host_tensor(split_length_mem->get_layout(), split_length_mem_lock.data())); ov::op::v1::VariadicSplit op; @@ -75,7 +75,7 @@ std::vector crop_inst::calc_output_layouts(const crop_node& /*node*/, co OPENVINO_ASSERT(impl_param.memory_deps.count(1) > 0, "[GPU] Can't find Crop(ngraph Split op mode) axis values memory dependency"); auto axis_values_mem = impl_param.memory_deps.at(1); - cldnn::mem_lock axis_values_mem_lock(axis_values_mem, impl_param.prog->get_stream()); + cldnn::mem_lock axis_values_mem_lock(axis_values_mem, impl_param.get_stream()); const_data.emplace(1, make_host_tensor(axis_values_mem->get_layout(), axis_values_mem_lock.data())); ov::op::v1::Split op; diff --git a/src/plugins/intel_gpu/src/graph/deconvolution.cpp b/src/plugins/intel_gpu/src/graph/deconvolution.cpp index 3dc12a9fbd2..bc6805f0f4a 100644 --- a/src/plugins/intel_gpu/src/graph/deconvolution.cpp +++ b/src/plugins/intel_gpu/src/graph/deconvolution.cpp @@ -185,7 +185,7 @@ std::vector deconvolution_inst::calc_output_layouts(deconvolution_node c output_shapes = ov::op::v1::shape_infer(&op, input_shapes, pads_begin, pads_end); } else if (memory_deps.count(2)) { auto mem = memory_deps.at(2); - auto dims = read_vector(mem, impl_param.prog->get_stream()); + auto dims = read_vector(mem, impl_param.get_stream()); auto dims_shape = ov::Shape{dims.size()}; input_shapes.push_back(dims_shape); output_shapes = ov::op::v1::shape_infer( @@ -211,7 +211,7 @@ std::vector deconvolution_inst::calc_output_layouts(deconvolution_node c output_shapes = ov::op::v1::shape_infer(&op, input_shapes, pads_begin, pads_end); } else if (memory_deps.count(2)) { auto mem = memory_deps.at(2); - auto dims = read_vector(mem, impl_param.prog->get_stream()); + auto dims = read_vector(mem, impl_param.get_stream()); auto dims_shape = ov::Shape{dims.size()}; input_shapes.push_back(dims_shape); output_shapes = ov::op::v1::shape_infer( diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 66c87f99816..4ddd96d6a4c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -1021,6 +1021,9 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) auto& data = node->get_dependency(fused_prim.dep_start_idx); auto data_layout = data.get_output_layout(); + if (fc_layout.is_dynamic() || data_layout.is_dynamic()) + continue; + if ((fc_layout.batch() == 1 || fc_layout.feature() == 1) || (data_layout.batch() == 1 && data_layout.feature() == 1) || (fc_layout.count() == data_layout.count())) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/resample.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/resample.cpp index bd481afb847..4098ea46066 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/resample.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/resample.cpp @@ -156,7 +156,7 @@ struct resample_impl : typed_primitive_impl_ocl { bool scales_calc_mod = primitive->shape_calc_mode == resample::InterpolateOp::ShapeCalcMode::SCALES; if (scales_calc_mod && impl_param.input_layouts.size() > 1 && scales.empty()) { auto mem = impl_param.memory_deps.at(2); - scales = read_vector(mem, impl_param.prog->get_stream()); + scales = read_vector(mem, impl_param.get_stream()); } for (size_t i = 0; i < scales.size(); ++i) { diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index ba347ae069d..481c86fc448 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -144,6 +144,7 @@ public: void check_memory_to_set(const memory& mem, const layout& layout) const; const std::list& get_users() const { return _node->get_users(); } + const kernel_impl_params* get_impl_params() const { return _impl_params.get(); } // return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead const primitive_impl* get_impl() const { return _impl.get(); } diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index 4d353c27070..87730de4eba 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -111,8 +111,8 @@ public: } virtual std::unique_ptr get_kernel_impl_params(const std::vector& in_layouts, const std::vector& out_layouts) const { - auto params = std::unique_ptr(new kernel_impl_params(get_program(), get_primitive(), get_unique_id(), in_layouts, out_layouts, - get_fused_primitives())); + auto params = std::unique_ptr(new kernel_impl_params(get_program(), get_program().get_stream_ptr(), get_primitive(), + get_unique_id(), in_layouts, out_layouts, get_fused_primitives())); params->memory_deps = get_const_memory_deps(); auto deps = get_dependencies(); diff --git a/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp b/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp index 029fddf3fcc..a250a200de6 100644 --- a/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp +++ b/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp @@ -42,7 +42,7 @@ std::vector non_max_suppression_inst::calc_output_layouts(non_max_suppre if (memory_deps.count(2)) { auto max_output_boxes_per_class_mem = memory_deps.at(2); cldnn::mem_lock max_output_boxes_per_class_lock(max_output_boxes_per_class_mem, - impl_param.prog->get_stream()); + impl_param.get_stream()); auto max_output_boxes_per_class_tensor = make_host_tensor(max_output_boxes_per_class_mem->get_layout(), max_output_boxes_per_class_lock.data()); const_data.emplace(2, max_output_boxes_per_class_tensor); diff --git a/src/plugins/intel_gpu/src/graph/non_zero.cpp b/src/plugins/intel_gpu/src/graph/non_zero.cpp index db6489fc3cd..c181835ab9e 100644 --- a/src/plugins/intel_gpu/src/graph/non_zero.cpp +++ b/src/plugins/intel_gpu/src/graph/non_zero.cpp @@ -57,7 +57,7 @@ layout gather_nonzero_inst::calc_output_layout(gather_nonzero_node const& node, assert(static_cast(node.get_primitive()->output_data_types[0]) == false && "Output data type forcing is not supported for gather_nonzero_node!"); if (impl_param.memory_deps.count(1)) { - auto out_size = read_vector(impl_param.memory_deps.at(1), impl_param.prog->get_stream()); + auto out_size = read_vector(impl_param.memory_deps.at(1), impl_param.get_stream()); ov::Shape output_shape(out_size.begin(), out_size.end()); ov::PartialShape output_pshape(output_shape); return layout{output_pshape, cldnn::data_types::i32, cldnn::format::bfyx}; @@ -72,7 +72,7 @@ std::vector gather_nonzero_inst::calc_output_layouts(gather_nonzero_node assert(static_cast(desc->output_data_types[0]) == false && "Output data type forcing is not supported for gather_nonzero_node!"); if (impl_param.memory_deps.count(1)) { - auto out_size = read_vector(impl_param.memory_deps.at(1), impl_param.prog->get_stream()); + auto out_size = read_vector(impl_param.memory_deps.at(1), impl_param.get_stream()); // output shape of nonzero is [input_rank, count_non_zero] auto rank = static_cast(impl_param.get_input_layout(0).get().rank().get_length()); auto count = static_cast(out_size[0]); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 85e82982725..ff77999718e 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -633,7 +633,7 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool _shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}); } } - + _impl_params->strm = _network.get_stream_ptr(); if (_outputs[0]) max_output_layout_size = _outputs[0]->get_layout().get_tensor().count(); } diff --git a/src/plugins/intel_gpu/src/graph/range.cpp b/src/plugins/intel_gpu/src/graph/range.cpp index 62db0604ee8..4b67a9ff962 100644 --- a/src/plugins/intel_gpu/src/graph/range.cpp +++ b/src/plugins/intel_gpu/src/graph/range.cpp @@ -39,15 +39,15 @@ std::vector range_inst::calc_output_layouts(range_node const& /*node*/, if (memory_deps.count(0) > 0 && memory_deps.count(1) > 0 && memory_deps.count(2) > 0) { auto start_mem = memory_deps.at(0); - cldnn::mem_lock start_mem_lock(start_mem, impl_param.prog->get_stream()); + cldnn::mem_lock start_mem_lock(start_mem, impl_param.get_stream()); const_data.emplace(0, make_host_tensor(start_mem->get_layout(), start_mem_lock.data())); auto stop_mem = memory_deps.at(1); - cldnn::mem_lock stop_mem_lock(stop_mem, impl_param.prog->get_stream()); + cldnn::mem_lock stop_mem_lock(stop_mem, impl_param.get_stream()); const_data.emplace(1, make_host_tensor(stop_mem->get_layout(), stop_mem_lock.data())); auto step_mem = memory_deps.at(2); - cldnn::mem_lock step_mem_lock(step_mem, impl_param.prog->get_stream()); + cldnn::mem_lock step_mem_lock(step_mem, impl_param.get_stream()); const_data.emplace(2, make_host_tensor(step_mem->get_layout(), step_mem_lock.data())); shape_infer(&op, input_shapes, output_shapes, const_data); diff --git a/src/plugins/intel_gpu/src/graph/resample.cpp b/src/plugins/intel_gpu/src/graph/resample.cpp index f3b02e438c9..2a95c856d9c 100644 --- a/src/plugins/intel_gpu/src/graph/resample.cpp +++ b/src/plugins/intel_gpu/src/graph/resample.cpp @@ -84,7 +84,7 @@ std::vector resample_inst::calc_output_layouts(resample_node const& /*no ov::op::v4::shape_infer(&op, pads_begin, pads_end, input_shapes, output_shapes, {const_data}); } else { auto sizes_mem = memory_deps.at(1); - cldnn::mem_lock lock(sizes_mem, impl_param.prog->get_stream()); + cldnn::mem_lock lock(sizes_mem, impl_param.get_stream()); auto sizes_tensor = make_host_tensor(sizes_mem->get_layout(), lock.data()); const_data.emplace(1, sizes_tensor); ov::op::v4::shape_infer(&op, pads_begin, pads_end, input_shapes, output_shapes, {const_data}); @@ -96,7 +96,7 @@ std::vector resample_inst::calc_output_layouts(resample_node const& /*no ov::op::v4::shape_infer(&op, pads_begin, pads_end, input_shapes, output_shapes, {const_data}); } else { auto scales_mem = memory_deps.at(2); - cldnn::mem_lock lock(scales_mem, impl_param.prog->get_stream()); + cldnn::mem_lock lock(scales_mem, impl_param.get_stream()); auto scales_tensor = make_host_tensor(scales_mem->get_layout(), lock.data()); const_data.emplace(2, scales_tensor); ov::op::v4::shape_infer(&op, pads_begin, pads_end, input_shapes, output_shapes, {const_data}); diff --git a/src/plugins/intel_gpu/src/graph/reshape.cpp b/src/plugins/intel_gpu/src/graph/reshape.cpp index 7cabb6ab2d9..b32ec9e0683 100644 --- a/src/plugins/intel_gpu/src/graph/reshape.cpp +++ b/src/plugins/intel_gpu/src/graph/reshape.cpp @@ -111,7 +111,7 @@ std::vector reshape_inst::calc_output_layouts(reshape_node const& /*node if (memory_deps.count(1) > 0) { auto pattern_mem = memory_deps.at(1); - cldnn::mem_lock pattern_lock(pattern_mem, impl_param.prog->get_stream()); + cldnn::mem_lock pattern_lock(pattern_mem, impl_param.get_stream()); auto pattern_ptr = pattern_lock.data(); auto pattern_tensor = make_host_tensor(pattern_mem->get_layout(), pattern_ptr); diff --git a/src/plugins/intel_gpu/src/graph/strided_slice.cpp b/src/plugins/intel_gpu/src/graph/strided_slice.cpp index 2942e9624de..5c9a0f50252 100644 --- a/src/plugins/intel_gpu/src/graph/strided_slice.cpp +++ b/src/plugins/intel_gpu/src/graph/strided_slice.cpp @@ -83,9 +83,9 @@ std::vector strided_slice_inst::calc_output_layouts(strided_slice_node c auto end_mem = constant_mem.at(2); auto strides_mem = constant_mem.at(3); - cldnn::mem_lock lock1(begin_mem, impl_param.prog->get_stream()); - cldnn::mem_lock lock2(end_mem, impl_param.prog->get_stream()); - cldnn::mem_lock lock3(strides_mem, impl_param.prog->get_stream()); + cldnn::mem_lock lock1(begin_mem, impl_param.get_stream()); + cldnn::mem_lock lock2(end_mem, impl_param.get_stream()); + cldnn::mem_lock lock3(strides_mem, impl_param.get_stream()); auto begin_tensor = make_host_tensor(begin_mem->get_layout(), lock1.data()); auto end_tensor = make_host_tensor(end_mem->get_layout(), lock2.data()); diff --git a/src/plugins/intel_gpu/src/graph/tile.cpp b/src/plugins/intel_gpu/src/graph/tile.cpp index 85aa8685a49..584ca0d197a 100644 --- a/src/plugins/intel_gpu/src/graph/tile.cpp +++ b/src/plugins/intel_gpu/src/graph/tile.cpp @@ -54,7 +54,7 @@ std::vector tile_inst::calc_output_layouts(tile_node const& /*node*/, co if (desc->input_size() == 2) { if (constant_mem.count(1)) { auto repeats_mem = constant_mem.at(1); - cldnn::mem_lock repeats_lock(repeats_mem, impl_param.prog->get_stream()); + cldnn::mem_lock repeats_lock(repeats_mem, impl_param.get_stream()); const auto& layout = repeats_mem->get_layout(); const auto repeats_tensor = ov::Tensor(data_type_to_element_type(layout.data_type), layout.get_shape(), repeats_lock.data()); diff --git a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp index 3abdc8fadd1..f29813b56e3 100644 --- a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp @@ -78,7 +78,7 @@ public: #ifdef ENABLE_ONEDNN_FOR_GPU class FullyConnectedFusingTestOneDNN : public BaseFusingTest { public: - void execute(fully_connected_test_params& p, bool is_caching_test = false) { + void execute(fully_connected_test_params& p, bool is_caching_test = false, bool is_dynamic = false) { // Onednn post operation has issue in a machine that does not support imad. if (!engine.get_device_info().supports_immad) return; @@ -103,6 +103,8 @@ public: ov::intel_gpu::ImplementationDesc fc_ocl_impl = { ocl_forcing_format, p.ocl_kernel_name /*fully_connected_gpu_bfyx_ref*/}; cfg_not_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim", fc_ocl_impl } })); } + cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic)); + cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic)); network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(), is_caching_test); network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(), is_caching_test); network_fused->set_input_data("input", input_prim); @@ -498,6 +500,30 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_add, ::testing::ValuesIn(s fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3, "fully_connected_gpu_bfyx_ref" }, })); +class fc_fp16_eltwise_add_dynamic : public FullyConnectedFusingTestOneDNN {}; +TEST_P(fc_fp16_eltwise_add_dynamic, basic) { + auto p = GetParam(); + auto test_input_layout = get_input_layout(p); + auto dynamic_input_layout = layout{ov::PartialShape::dynamic(test_input_layout.get_partial_shape().size()), test_input_layout.data_type, test_input_layout.format}; + create_topologies( + input_layout("input", dynamic_input_layout), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)), + fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)), + eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum), + reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32) + ); + + tolerance = 1e-2f; + execute(p, false, true); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_add_dynamic, ::testing::ValuesIn(std::vector{ + fully_connected_test_params{ CASE_FC_FP16_3, 2, 3, "fully_connected_gpu_bfyx_ref" }, + fully_connected_test_params{ CASE_FC_FP16_4, 2, 3, "fully_connected_gpu_bfyx_ref" }, +})); + class fc_fp16_eltwise_sub : public FullyConnectedFusingTestOneDNN {}; TEST_P(fc_fp16_eltwise_sub, basic) { auto p = GetParam(); diff --git a/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp index 06b0eb08119..b6374f38540 100644 --- a/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp @@ -13,6 +13,8 @@ #include #include +#include "primitive_inst.h" + using namespace cldnn; using namespace ::tests; @@ -40,14 +42,20 @@ TEST(multistream_gpu, basic) { topology.add(shape_of("shape_of", input_info("fc"), 3, data_types::i32)); auto prog_ptr = program::build_program(engine, topology, config); + auto &node = prog_ptr->get_node("shape_of"); + auto strm = node.get_kernel_impl_params()->get_stream_ptr(); + ASSERT_EQ(prog_ptr->get_stream_ptr(), strm); + std::vector networks; + std::vector streams; for (size_t i = 0; i < num_streams; i++) { networks.push_back(network::allocate_network(engine, prog_ptr)); + streams.push_back(networks[i]->get_stream_ptr()); } std::vector tasks; for (size_t i = 0; i < num_streams; i++) { - tasks.push_back([&networks, i, &engine] { + tasks.push_back([&networks, &streams, i, &engine] { auto cfg = get_test_default_config(engine); auto stream = engine.create_stream(cfg); auto net = networks[i]; @@ -61,6 +69,10 @@ TEST(multistream_gpu, basic) { auto outputs = net->execute(); + auto inst = net->get_primitive("shape_of"); + auto strm = inst->get_impl_params()->get_stream_ptr(); + ASSERT_EQ(streams[i], strm); + auto output = outputs.at("shape_of").get_memory(); cldnn::mem_lock output_ptr(output, *stream);