[GPU] Fix in-order queue synchronization issue related to OCL/OneDNN impls interaction with CPU impls (#17976)

Sergey Shlyapnikov 2023-06-14 05:15:04 +04:00 committed by GitHub
parent b023119b9a
commit e631f65a9b
11 changed files with 320 additions and 37 deletions
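For context before the hunks: with an in-order queue, GPU-to-GPU ordering is implicit, so the plugin used to hand out dummy (already-completed) user events for most primitives. A CPU implementation, however, runs on the host thread and must block on a real event before reading GPU output. Below is a minimal, self-contained sketch of that distinction, assuming the generic OpenCL C++ wrapper (CL/opencl.hpp), not the plugin's own API; sum_on_cpu() is a hypothetical stand-in for a host-side (CPU) primitive implementation.

#include <CL/opencl.hpp>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for a host-side (CPU) primitive implementation.
static float sum_on_cpu(const std::vector<float>& v) {
    float s = 0.f;
    for (float x : v) s += x;
    return s;
}

int main() {
    cl::Context ctx(CL_DEVICE_TYPE_GPU);
    cl::Device dev = ctx.getInfo<CL_CONTEXT_DEVICES>().front();
    cl::CommandQueue queue(ctx, dev); // in-order queue (the OpenCL default)

    cl::Program prog(ctx, "__kernel void fill(__global float* out) { out[get_global_id(0)] = 1.f; }", true);
    cl::Kernel fill(prog, "fill");

    const size_t n = 1024;
    cl::Buffer buf(ctx, CL_MEM_READ_WRITE, n * sizeof(float));
    fill.setArg(0, buf);

    cl::Event gpu_done;
    queue.enqueueNDRangeKernel(fill, cl::NullRange, cl::NDRange(n), cl::NullRange, nullptr, &gpu_done);

    // The bug class fixed here: handing a CPU consumer a pre-signalled dummy event
    // instead of gpu_done drops the host<->device ordering guarantee, so the host
    // may read stale data. Waiting on a real event is the synchronization point.
    gpu_done.wait();

    std::vector<float> host(n);
    queue.enqueueReadBuffer(buf, CL_TRUE, 0, n * sizeof(float), host.data());
    std::printf("sum=%f\n", sum_on_cpu(host)); // expected: 1024
}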

View File

@@ -110,6 +110,7 @@ struct loop_impl : typed_primitive_impl<loop> {
}
}
std::vector<event::ptr> all_events;
std::vector<event::ptr> loop_carried_dep(events.begin(), events.end());
int64_t current_iteration_idx = 0;
while (current_iteration_idx < trip_count && execution_condition) {
@@ -145,6 +146,15 @@ struct loop_impl : typed_primitive_impl<loop> {
loop_carried_dep.emplace_back(body_event);
}
// Collect output events so we can wait for all iterations to finish
for (auto& out : body_network->get_outputs()) {
auto output_id = out->id();
if (body_network->has_event(output_id)) {
auto output_event = body_network->get_primitive_event(output_id);
all_events.push_back(output_event);
}
}
//TODO: execution_condition is prepared as presented in the
// ngraph opset document for the loop operation.
// However, it is not being used yet, and only TensorIterator which
@@ -157,7 +167,9 @@ struct loop_impl : typed_primitive_impl<loop> {
++current_iteration_idx;
}
body_network->reset_execution();
// Reset network and wait for all collected events
body_network->reset_execution(false);
stream.wait_for_events(all_events);
// Concatenate sliced output to the outer network
for (size_t i = 0; i < concatenated_output_mem_mappings.size(); ++i) {
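The loop change above gathers every iteration's output event into all_events and blocks on the whole set before the sliced outputs are concatenated; previously the body network could be reset while late iterations were still in flight. A minimal sketch of the pattern, assuming the OpenCL C++ wrapper; the marker is a placeholder for the body network's real output command:

#include <CL/opencl.hpp>
#include <vector>

// Gather one completion event per iteration, then wait on all of them before
// results are read back (analogous to stream.wait_for_events(all_events)).
void run_iterations(cl::CommandQueue& queue, int trip_count) {
    std::vector<cl::Event> all_events;
    for (int i = 0; i < trip_count; ++i) {
        cl::Event out_ev;
        queue.enqueueMarkerWithWaitList(nullptr, &out_ev); // placeholder for the body's output command
        all_events.push_back(out_ev);
    }
    if (!all_events.empty())
        cl::Event::waitForEvents(all_events); // blocks the host until every iteration finished
}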

View File

@@ -40,7 +40,9 @@ public:
event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
auto& stream = instance.get_network().get_stream();
return stream.enqueue_marker(events);
return events.empty() ? stream.create_user_event(true)
: stream.enqueue_marker(events);
}
static std::unique_ptr<primitive_impl> create_data(const data_node& data, const kernel_impl_params&) {
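This hunk matters because of the ocl_stream change later in this commit: enqueue_marker with an empty deps list now enqueues a real marker that waits for everything previously submitted. A data node with no dependencies does not need that full-queue stall, so it returns an already-completed user event instead. A sketch of that cheap token, assuming the OpenCL C++ wrapper:

#include <CL/opencl.hpp>

// An already-signalled user event: waiters return immediately and no command is
// enqueued, unlike an empty-wait-list marker on an in-order queue.
cl::Event make_completed_event(const cl::Context& ctx) {
    cl::UserEvent ev(ctx);
    ev.setStatus(CL_COMPLETE);
    return ev;
}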

View File

@@ -132,7 +132,8 @@ protected:
if (group && !is_output)
return stream.group_events(events);
return stream.enqueue_marker(events, is_output);
return events.empty() ? stream.create_user_event(true)
: stream.enqueue_marker(events, is_output);
}
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
@@ -259,14 +260,9 @@ protected:
if (_kernel_data.kernels[kd_idx].skip_execution)
continue;
std::vector<event::ptr> new_events;
// if any of the prim's users is a detection output, set prim as an output event (event won't be nullptr)
bool is_output_event;
if (instance.node != nullptr) {
auto users = instance.node->get_users();
is_output_event = is_any_user_cpu(users) || instance.node->is_output();
} else {
is_output_event = instance.is_output_event();
}
// If any of the prim's users is a CPU implementation or the prim is a network output, set prim as an output event (event won't be nullptr)
bool needs_completion_event = instance.needs_completion_event();
auto& params = _kernel_data.kernels[kd_idx].params;
auto args = get_arguments(instance);
@@ -280,9 +276,10 @@ protected:
const auto& lws = params.workGroups.local;
GPU_DEBUG_TRACE_DETAIL << "Enqueue kernel " << kd_idx << ": gws=[" << gws[0] << ", " << gws[1] << ", " << gws[2] << "] "
<< "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]" << std::endl;
<< "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]"
<< (needs_completion_event ? " has_completion_event=true" : "") << std::endl;
auto ev = stream.enqueue_kernel(*_kernels[kd_idx], params, args, tmp_events, is_output_event);
auto ev = stream.enqueue_kernel(*_kernels[kd_idx], params, args, tmp_events, needs_completion_event);
new_events.push_back(ev);
all_events.push_back(ev);
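needs_completion_event() replaces the old is_output_event logic: on an in-order queue, GPU-to-GPU ordering needs no events, so a real cl_event is requested from enqueue_kernel only when a CPU user or the network output must be able to wait on it. A hedged sketch of the underlying idea, assuming the OpenCL C++ wrapper rather than the plugin's stream API:

#include <CL/opencl.hpp>

// Request a cl_event only when someone on the host side will wait on this kernel;
// otherwise skip it, since the in-order queue already orders GPU work.
cl::Event enqueue(cl::CommandQueue& queue, cl::Kernel& kernel, size_t gws,
                  bool needs_completion_event) {
    cl::Event ev;
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(gws), cl::NullRange,
                               nullptr, needs_completion_event ? &ev : nullptr);
    return ev; // a null handle when no completion event was requested
}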

View File

@@ -460,16 +460,6 @@ protected:
void init_kernels(const kernels_cache&, const kernel_impl_params&) override { }
event::ptr aggregate_events(const std::vector<event::ptr>& events, stream& stream, bool group = false, bool is_output = false) const {
if (events.size() == 1 && !is_output)
return events[0];
if (group && !is_output)
return stream.group_events(events);
return stream.enqueue_marker(events, is_output);
}
void set_arguments_impl(typed_primitive_inst<PType>& instance) override {
if (instance.can_be_optimized())
return;
@@ -499,6 +489,12 @@ protected:
}
throw; // rethrowing dnnl::error if not out_of_memory
}
// If the oneDNN primitive is the output primitive or its user is a CPU implementation, then enqueue a marker
// with an empty events wait list (which triggers a wait for all previously enqueued tasks) and
// return it as the oneDNN primitive's event, as it is the only option for proper synchronization
if (instance.needs_completion_event())
event = stream.enqueue_marker({});
}
if (_enable_profiling) {
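oneDNN primitives are submitted through oneDNN's own API and hand back no OpenCL event, so when a completion event is required the commit falls back to an empty-wait-list marker: per the OpenCL spec, such a marker on an in-order queue completes only after every previously enqueued command, including the oneDNN kernels. A minimal sketch, assuming the OpenCL C++ wrapper:

#include <CL/opencl.hpp>

// Produce an event that completes once all work already submitted to the queue
// (e.g. a oneDNN primitive that returned no event of its own) has finished.
cl::Event completion_event_for_prior_work(cl::CommandQueue& queue) {
    cl::Event ev;
    queue.enqueueMarkerWithWaitList(nullptr, &ev);
    return ev;
}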

View File

@@ -230,7 +230,7 @@ public:
bool is_dynamic() const { return _is_dynamic; }
bool can_share_buffer() const { return _can_share_buffer; }
bool is_constant() const { return _is_constant; }
bool is_output_event() const { return _is_output_event; }
bool needs_completion_event() const { return _needs_completion_event; }
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
void allocate_internal_buffers();
@@ -330,7 +330,7 @@ protected:
bool _can_be_optimized = false;
bool _can_share_buffer = true;
bool _is_constant = false;
bool _is_output_event = false;
bool _needs_completion_event = false;
size_t max_output_layout_size = 0;
std::vector<size_t> max_intermediates_memory_sizes;

View File

@@ -1396,8 +1396,11 @@ void network::execute_primitive(const std::shared_ptr<primitive_inst>& primitive
const std::vector<event::ptr>& events) {
event::ptr ev = primitive->execute(events);
// Collect events only for OOO queue and Profiling mode
if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling) {
// Collect events under any of the following conditions:
// 1) OOO queue execution
// 2) Profiling mode is enabled
// 3) Primitive has CPU user or primitive is output
if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling || primitive->needs_completion_event()) {
auto id = primitive->id();
_events.insert({id, ev});
}
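Events used to be recorded only for profiling or OOO execution; now they are also kept whenever a CPU user or the network output may query them later through get_primitive_event(). A small sketch of that bookkeeping, with hypothetical names standing in for the network's members:

#include <CL/opencl.hpp>
#include <map>
#include <string>

// Keep an event only if someone can legitimately ask for it afterwards.
std::map<std::string, cl::Event> events_by_prim;

void record_event(const std::string& id, cl::Event ev,
                  bool out_of_order, bool profiling, bool needs_completion_event) {
    if (out_of_order || profiling || needs_completion_event)
        events_by_prim.emplace(id, std::move(ev));
}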

View File

@@ -684,7 +684,9 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
dependencies = events;
} else {
auto queue_type = get_network().get_stream().get_queue_type();
if (queue_type == QueueTypes::out_of_order) {
// Prepare dependency events in case of an OOO queue, a CPU implementation,
// or an optimized-out impl that has CPU users (the needs_completion_event() && !is_output() condition)
if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
dependencies.reserve(dependencies.size() + _exec_deps.size());
for (auto& input : _exec_deps) {
auto id = input->id();
@@ -803,7 +805,8 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
, _fused_mem_offset((_fused_mem_count > 0 && node.has_fused_dep()) ? node.get_first_fused_dep_idx() : 0)
, _can_be_optimized(node.can_be_optimized())
, _can_share_buffer(node.can_share_buffer())
, _is_constant(node.is_constant()) {
, _is_constant(node.is_constant())
, _needs_completion_event(is_any_user_cpu(node.get_users()) || node.is_output()) {
if (allocate_memory) {
// In the case when the output is a mutable_data primitive and other users' dependencies are only used for
// synchronization, the output memory of such a primitive will be fused with mutable_data
@@ -1392,9 +1395,7 @@ void primitive_inst::save(cldnn::BinaryOutputBuffer& ob) const {
ob << can_be_optimized();
ob << can_share_buffer();
ob << is_constant();
auto users = get_node().get_users();
bool is_output_event = is_any_user_cpu(users) || get_node().is_output();
ob << is_output_event;
ob << needs_completion_event();
if (type() == cldnn::data::type_id()) {
return;
@@ -1485,7 +1486,7 @@ void primitive_inst::load(cldnn::BinaryInputBuffer& ib) {
ib >> _can_be_optimized;
ib >> _can_share_buffer;
ib >> _is_constant;
ib >> _is_output_event;
ib >> _needs_completion_event;
if (type() == cldnn::data::type_id()) {
return;
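The execute() change in this file follows from one fact: a CPU impl runs on the host thread, outside the queue's implicit ordering, so it must collect its producers' events and block on them explicitly. That is also why optimized-out primitives with CPU users now propagate dependencies. A sketch of the host-side wait, assuming the OpenCL C++ wrapper; run_cpu_primitive() is hypothetical:

#include <CL/opencl.hpp>
#include <vector>

// A host-side (CPU) impl cannot rely on in-order queue semantics; it has to wait
// on its dependencies' events itself before touching their output buffers.
void execute_cpu_impl(const std::vector<cl::Event>& dep_events) {
    if (!dep_events.empty())
        cl::Event::waitForEvents(dep_events);
    // run_cpu_primitive(); // hypothetical: safe to read inputs now
}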

View File

@@ -322,8 +322,13 @@ void ocl_stream::enqueue_barrier() {
}
event::ptr ocl_stream::enqueue_marker(std::vector<event::ptr> const& deps, bool is_output) {
if (deps.empty())
return std::make_shared<ocl_user_event>(_engine.get_cl_context(), true);
// Wait for all previously enqueued tasks if deps list is empty
if (deps.empty()) {
cl::Event ret_ev;
_command_queue.enqueueMarkerWithWaitList(nullptr, &ret_ev);
return std::make_shared<ocl_event>(ret_ev);
}
if (sync_method == sync_methods::events) {
cl::Event ret_ev;

View File

@@ -7,8 +7,13 @@
#include "intel_gpu/graph/network.hpp"
#include "intel_gpu/primitives/input_layout.hpp"
#include "intel_gpu/primitives/data.hpp"
#include "intel_gpu/primitives/activation.hpp"
#include "intel_gpu/primitives/broadcast.hpp"
#include "intel_gpu/primitives/concatenation.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/reshape.hpp"
#include "runtime/ocl/ocl_event.hpp"
#include <memory>
@@ -60,3 +65,147 @@ TEST(network_test, model_with_dynamic_input_is_dynamic) {
ASSERT_TRUE(net.is_dynamic());
}
TEST(network_test, has_proper_event_for_in_order_queue) {
auto& engine = get_test_engine();
layout in_layout{{1, 2, 2, 4}, data_types::f32, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto const_mem = engine.allocate_memory({{1, 2, 2, 4}, data_types::f32, format::bfyx});
topology topology;
topology.add(input_layout("input1", in_layout));
topology.add(data("input2", const_mem));
topology.add(activation("activation1", input_info("input1"), activation_func::clamp, {-10.f, 10.f}));
topology.add(concatenation("concat", { input_info("activation1"), input_info("input2") }, 1));
topology.add(reorder("reorder", input_info("concat"), in_layout));
topology.add(activation("activation2", input_info("concat"), activation_func::relu));
auto impl_desc = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"activation2", impl_desc}};
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net(engine, topology, config);
net.set_input_data("input1", input_mem);
net.execute();
ASSERT_FALSE(net.has_event("activation1"));
ASSERT_TRUE(net.has_event("concat"));
ASSERT_TRUE(net.has_event("reorder"));
ASSERT_TRUE(net.has_event("activation2"));
auto concat_ev = net.get_primitive_event("concat");
auto reorder_ev = net.get_primitive_event("reorder");
auto activation_ev = net.get_primitive_event("activation2");
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(concat_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reorder_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(activation_ev.get()));
// Check if we have real underlying OpenCL events
ASSERT_TRUE(downcast<ocl::ocl_base_event>(concat_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
}
TEST(network_test, has_proper_event_for_in_order_queue_optimized_out) {
auto& engine = get_test_engine();
layout in_layout{{1, 2, 2, 4}, data_types::f32, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto const_mem = engine.allocate_memory({{1, 2, 2, 4}, data_types::f32, format::bfyx});
topology topology;
topology.add(input_layout("input1", in_layout));
topology.add(data("input2", const_mem));
topology.add(concatenation("concat", { input_info("input1"), input_info("input2") }, 1));
topology.add(reshape("reshape", input_info("concat"), false, {1, 2, 4, 4}, {1, 2, 4, 4}));
topology.add(reorder("reorder", input_info("reshape"), in_layout));
topology.add(activation("activation", input_info("reshape"), activation_func::relu));
auto impl_desc = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"activation", impl_desc}};
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net(engine, topology, config);
net.set_input_data("input1", input_mem);
net.execute();
ASSERT_TRUE(net.has_event("concat"));
ASSERT_TRUE(net.has_event("reshape"));
ASSERT_TRUE(net.has_event("reorder"));
ASSERT_TRUE(net.has_event("activation"));
auto concat_ev = net.get_primitive_event("concat");
auto reshape_ev = net.get_primitive_event("reshape");
auto reorder_ev = net.get_primitive_event("reorder");
auto activation_ev = net.get_primitive_event("activation");
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(concat_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reshape_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reorder_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(activation_ev.get()));
// Check if we have real underlying OpenCL events
ASSERT_TRUE(downcast<ocl::ocl_base_event>(concat_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reshape_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
}
#ifdef ENABLE_ONEDNN_FOR_GPU
TEST(network_test, has_proper_event_for_in_order_queue_onednn) {
auto& engine = get_test_engine();
if (!engine.get_device_info().supports_immad)
return;
layout in_layout{{1, 16, 2, 4}, data_types::f32, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto weights = engine.allocate_memory({{16, 16, 1, 1}, data_types::f32, format::bfyx});
topology topology;
topology.add(input_layout("input", in_layout));
topology.add(data("weights", weights));
topology.add(convolution("conv", input_info("input"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
topology.add(activation("activation", input_info("conv"), activation_func::relu));
topology.add(reorder("reorder", input_info("conv"), in_layout));
auto impl_desc_cpu = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"conv", impl_desc_onednn}, {"activation", impl_desc_cpu}};
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net(engine, topology, config);
net.set_input_data("input", input_mem);
net.execute();
ASSERT_TRUE(net.has_event("conv"));
ASSERT_TRUE(net.has_event("reorder"));
ASSERT_TRUE(net.has_event("activation"));
auto conv_ev = net.get_primitive_event("conv");
auto reorder_ev = net.get_primitive_event("reorder");
auto activation_ev = net.get_primitive_event("activation");
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(conv_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reorder_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(activation_ev.get()));
// Check if we have real underlying OpenCL events
ASSERT_TRUE(downcast<ocl::ocl_base_event>(conv_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
}
#endif

View File

@@ -8,6 +8,8 @@
#include <intel_gpu/primitives/activation.hpp>
#include <intel_gpu/primitives/data.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/reshape.hpp>
#include <intel_gpu/primitives/concatenation.hpp>
#include "activation_inst.h"
#include <cmath>
@@ -1999,3 +2001,59 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(padding{}),
::testing::Values(impl_types::cpu),
::testing::Values(true)));
TEST(activation_gpu, has_proper_synchronization) {
auto& engine = get_test_engine();
auto in_layout = layout({1, 2, 2, 4}, data_types::f32, format::bfyx);
auto input_mem = engine.allocate_memory(in_layout);
auto const_mem = engine.allocate_memory({{1, 2, 2, 4}, data_types::f32, format::bfyx});
auto in_data = generate_random_4d<float>(1, 2, 2, 4, -1, 1);
auto const_data = generate_random_4d<float>(1, 2, 2, 4, -1, 1);
set_values(input_mem, flatten_4d(format::bfyx, in_data));
set_values(const_mem, flatten_4d(format::bfyx, const_data));
auto create_topology = [&]() {
topology topology;
topology.add(input_layout("input1", in_layout));
topology.add(data("input2", const_mem));
topology.add(concatenation("concat", { input_info("input1"), input_info("input2") }, 1));
topology.add(reshape("reshape", input_info("concat"), false, {1, 2, 4, 4}, {1, 2, 4, 4}));
topology.add(reorder("reorder", input_info("reshape"), in_layout));
topology.add(activation("activation", input_info("reshape"), activation_func::relu));
return topology;
};
auto topology_ref = create_topology();
auto topology_test = create_topology();
auto impl_desc = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"activation", impl_desc}};
auto config_ref = get_test_default_config(engine);
config_ref.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config_ref.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto config_test = config_ref;
config_test.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net_test(engine, topology_test, config_test);
net_test.set_input_data("input1", input_mem);
auto outputs_test = net_test.execute();
auto res_test = outputs_test.at("activation").get_memory();
network net_ref(engine, topology_ref, config_ref);
net_ref.set_input_data("input1", input_mem);
auto outputs_ref = net_ref.execute();
auto res_ref = outputs_ref.at("activation").get_memory();
ASSERT_EQ(res_test->get_layout().get_linear_size(), res_ref->get_layout().get_linear_size());
cldnn::mem_lock<float> test_mem(res_test, get_test_stream());
cldnn::mem_lock<float> ref_mem(res_ref, get_test_stream());
for (size_t i = 0; i < res_ref->get_layout().get_linear_size(); ++i) {
ASSERT_EQ(test_mem[i], ref_mem[i]);
}
}

View File

@@ -9293,12 +9293,13 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
network.set_input_data("input", input_mem);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.size(), size_t(2));
ASSERT_TRUE(outputs.find("conv_fsv") != outputs.end());
for (auto& p : network.get_primitives_info())
std::cerr << p.original_id << " " << p.kernel_id << std::endl;
auto out_ptr = get_output_values_to_float<FLOAT16>(network, outputs.begin()->second);
auto out_ptr = get_output_values_to_float<FLOAT16>(network, outputs.find("conv_fsv")->second);
auto out_lay = network.get_node_output_layout("conv_fsv");
ASSERT_EQ(out_lay.batch(), expected_result.size());
ASSERT_EQ(out_lay.feature(), expected_result[0].size());
@@ -9657,6 +9658,65 @@ TEST(convolution_gpu_onednn, quantized_onednn_convolution_u8s8f32_asymmetric_act
}
}
TEST(convolution_gpu_onednn, has_proper_synchronization) {
auto& engine = get_test_engine();
if (!engine.get_device_info().supports_immad)
return;
layout in_layout{{1, 16, 2, 4}, data_types::f32, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto weights_mem = engine.allocate_memory({{16, 16, 1, 1}, data_types::f32, format::bfyx});
auto in_data = generate_random_4d<float>(1, 16, 2, 4, -1, 1);
auto weights_data = generate_random_4d<float>(16, 16, 1, 1, -1, 1);
set_values(input_mem, flatten_4d(format::bfyx, in_data));
set_values(weights_mem, flatten_4d(format::bfyx, weights_data));
auto create_topology = [&]() {
topology topology;
topology.add(input_layout("input", in_layout));
topology.add(data("weights", weights_mem));
topology.add(convolution("conv", input_info("input"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
topology.add(activation("activation", input_info("conv"), activation_func::relu));
topology.add(reorder("reorder", input_info("conv"), in_layout));
return topology;
};
auto topology_ref = create_topology();
auto topology_test = create_topology();
auto impl_desc_cpu = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"conv", impl_desc_onednn}, {"activation", impl_desc_cpu}};
auto config_ref = get_test_default_config(engine);
config_ref.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config_ref.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto config_test = config_ref;
config_test.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net_test(engine, topology_test, config_test);
net_test.set_input_data("input", input_mem);
auto outputs_test = net_test.execute();
auto res_test = outputs_test.at("activation").get_memory();
network net_ref(engine, topology_ref, config_ref);
net_ref.set_input_data("input", input_mem);
auto outputs_ref = net_ref.execute();
auto res_ref = outputs_ref.at("activation").get_memory();
ASSERT_EQ(res_test->get_layout().get_linear_size(), res_ref->get_layout().get_linear_size());
cldnn::mem_lock<float> test_mem(res_test, get_test_stream());
cldnn::mem_lock<float> ref_mem(res_ref, get_test_stream());
for (size_t i = 0; i < res_ref->get_layout().get_linear_size(); ++i) {
ASSERT_EQ(test_mem[i], ref_mem[i]);
}
}
#endif // ENABLE_ONEDNN_FOR_GPU
template <typename T>