[GPU] Fix in-order queue synchronization issue related to OCL/OneDNN impls interaction with CPU impls (#17976)

Sergey Shlyapnikov 2023-06-14 05:15:04 +04:00 committed by GitHub
parent b023119b9a
commit e631f65a9b
11 changed files with 320 additions and 37 deletions
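For context before the hunks: with an in-order queue, GPU-to-GPU ordering is implicit, so the plugin used to hand out dummy (already-completed) user events for most primitives. A CPU implementation, however, runs on the host thread and must block on a real event before reading GPU output. Below is a minimal, self-contained sketch of that distinction, assuming the generic OpenCL C++ wrapper (CL/opencl.hpp), not the plugin's own API; sum_on_cpu() is a hypothetical stand-in for a host-side (CPU) primitive implementation.

#include <CL/opencl.hpp>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for a host-side (CPU) primitive implementation.
static float sum_on_cpu(const std::vector<float>& v) {
    float s = 0.f;
    for (float x : v) s += x;
    return s;
}

int main() {
    cl::Context ctx(CL_DEVICE_TYPE_GPU);
    cl::Device dev = ctx.getInfo<CL_CONTEXT_DEVICES>().front();
    cl::CommandQueue queue(ctx, dev); // in-order queue (the OpenCL default)

    cl::Program prog(ctx, "__kernel void fill(__global float* out) { out[get_global_id(0)] = 1.f; }", true);
    cl::Kernel fill(prog, "fill");

    const size_t n = 1024;
    cl::Buffer buf(ctx, CL_MEM_READ_WRITE, n * sizeof(float));
    fill.setArg(0, buf);

    cl::Event gpu_done;
    queue.enqueueNDRangeKernel(fill, cl::NullRange, cl::NDRange(n), cl::NullRange, nullptr, &gpu_done);

    // The bug class fixed here: handing a CPU consumer a pre-signalled dummy event
    // instead of gpu_done drops the host<->device ordering guarantee, so the host
    // may read stale data. Waiting on a real event is the synchronization point.
    gpu_done.wait();

    std::vector<float> host(n);
    queue.enqueueReadBuffer(buf, CL_TRUE, 0, n * sizeof(float), host.data());
    std::printf("sum=%f\n", sum_on_cpu(host)); // expected: 1024
}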

View File

@@ -110,6 +110,7 @@ struct loop_impl : typed_primitive_impl<loop> {
}
}
std::vector<event::ptr> all_events;
std::vector<event::ptr> loop_carried_dep(events.begin(), events.end());
int64_t current_iteration_idx = 0;
while (current_iteration_idx < trip_count && execution_condition) {
@@ -145,6 +146,15 @@ struct loop_impl : typed_primitive_impl<loop> {
loop_carried_dep.emplace_back(body_event);
}
// Collect output events so we can wait for all iterations to finish
for (auto& out : body_network->get_outputs()) {
auto output_id = out->id();
if (body_network->has_event(output_id)) {
auto output_event = body_network->get_primitive_event(output_id);
all_events.push_back(output_event);
}
}
//TODO: execution_condition is prepared as presented in the
// ngraph opset document for the loop operation.
// However, it is not being used yet, and only TensorIterator which
@@ -157,7 +167,9 @@ struct loop_impl : typed_primitive_impl<loop> {
++current_iteration_idx;
}
body_network->reset_execution();
// Reset network and wait for all collected events
body_network->reset_execution(false);
stream.wait_for_events(all_events);
// Concatenate sliced output to the outer network
for (size_t i = 0; i < concatenated_output_mem_mappings.size(); ++i) {
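The loop change above gathers every iteration's output event into all_events and blocks on the whole set before the sliced outputs are concatenated; previously the body network could be reset while late iterations were still in flight. A minimal sketch of the pattern, assuming the OpenCL C++ wrapper; the marker is a placeholder for the body network's real output command:

#include <CL/opencl.hpp>
#include <vector>

// Gather one completion event per iteration, then wait on all of them before
// results are read back (analogous to stream.wait_for_events(all_events)).
void run_iterations(cl::CommandQueue& queue, int trip_count) {
    std::vector<cl::Event> all_events;
    for (int i = 0; i < trip_count; ++i) {
        cl::Event out_ev;
        queue.enqueueMarkerWithWaitList(nullptr, &out_ev); // placeholder for the body's output command
        all_events.push_back(out_ev);
    }
    if (!all_events.empty())
        cl::Event::waitForEvents(all_events); // blocks the host until every iteration finished
}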

View File

@@ -40,7 +40,9 @@ public:
event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
auto& stream = instance.get_network().get_stream();
return stream.enqueue_marker(events);
return events.empty() ? stream.create_user_event(true)
: stream.enqueue_marker(events);
}
static std::unique_ptr<primitive_impl> create_data(const data_node& data, const kernel_impl_params&) {
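This hunk matters because of the ocl_stream change later in this commit: enqueue_marker with an empty deps list now enqueues a real marker that waits for everything previously submitted. A data node with no dependencies does not need that full-queue stall, so it returns an already-completed user event instead. A sketch of that cheap token, assuming the OpenCL C++ wrapper:

#include <CL/opencl.hpp>

// An already-signalled user event: waiters return immediately and no command is
// enqueued, unlike an empty-wait-list marker on an in-order queue.
cl::Event make_completed_event(const cl::Context& ctx) {
    cl::UserEvent ev(ctx);
    ev.setStatus(CL_COMPLETE);
    return ev;
}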

View File

@@ -132,7 +132,8 @@ protected:
if (group && !is_output)
return stream.group_events(events);
return stream.enqueue_marker(events, is_output);
return events.empty() ? stream.create_user_event(true)
: stream.enqueue_marker(events, is_output);
}
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
@@ -259,14 +260,9 @@ protected:
if (_kernel_data.kernels[kd_idx].skip_execution)
continue;
std::vector<event::ptr> new_events;
// if any of the prim's users is a detection output, set prim as an output event (event won't be nullptr)
bool is_output_event;
if (instance.node != nullptr) {
auto users = instance.node->get_users();
is_output_event = is_any_user_cpu(users) || instance.node->is_output();
} else {
is_output_event = instance.is_output_event();
}
// If any of the prim's users is a CPU implementation or the prim is a network output, set prim as an output event (event won't be nullptr)
bool needs_completion_event = instance.needs_completion_event();
auto& params = _kernel_data.kernels[kd_idx].params;
auto args = get_arguments(instance);
@@ -280,9 +276,10 @@ protected:
const auto& lws = params.workGroups.local;
GPU_DEBUG_TRACE_DETAIL << "Enqueue kernel " << kd_idx << ": gws=[" << gws[0] << ", " << gws[1] << ", " << gws[2] << "] "
<< "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]" << std::endl;
<< "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]"
<< (needs_completion_event ? " has_completion_event=true" : "") << std::endl;
auto ev = stream.enqueue_kernel(*_kernels[kd_idx], params, args, tmp_events, is_output_event);
auto ev = stream.enqueue_kernel(*_kernels[kd_idx], params, args, tmp_events, needs_completion_event);
new_events.push_back(ev);
all_events.push_back(ev);
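needs_completion_event() replaces the old is_output_event logic: on an in-order queue, GPU-to-GPU ordering needs no events, so a real cl_event is requested from enqueue_kernel only when a CPU user or the network output must be able to wait on it. A hedged sketch of the underlying idea, assuming the OpenCL C++ wrapper rather than the plugin's stream API:

#include <CL/opencl.hpp>

// Request a cl_event only when someone on the host side will wait on this kernel;
// otherwise skip it, since the in-order queue already orders GPU work.
cl::Event enqueue(cl::CommandQueue& queue, cl::Kernel& kernel, size_t gws,
                  bool needs_completion_event) {
    cl::Event ev;
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(gws), cl::NullRange,
                               nullptr, needs_completion_event ? &ev : nullptr);
    return ev; // a null handle when no completion event was requested
}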

View File

@@ -460,16 +460,6 @@ protected:
void init_kernels(const kernels_cache&, const kernel_impl_params&) override { }
event::ptr aggregate_events(const std::vector<event::ptr>& events, stream& stream, bool group = false, bool is_output = false) const {
if (events.size() == 1 && !is_output)
return events[0];
if (group && !is_output)
return stream.group_events(events);
return stream.enqueue_marker(events, is_output);
}
void set_arguments_impl(typed_primitive_inst<PType>& instance) override {
if (instance.can_be_optimized())
return;
@@ -499,6 +489,12 @@ protected:
}
throw; // rethrowing dnnl::error if not out_of_memory
}
// If the oneDNN primitive is the output primitive or its user is a CPU implementation, then enqueue a marker
// with an empty events wait list (which triggers a wait for all previously enqueued tasks) and
// return it as the oneDNN primitive's event, as it is the only option for proper synchronization
if (instance.needs_completion_event())
event = stream.enqueue_marker({});
}
if (_enable_profiling) {
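oneDNN primitives are submitted through oneDNN's own API and hand back no OpenCL event, so when a completion event is required the commit falls back to an empty-wait-list marker: per the OpenCL spec, such a marker on an in-order queue completes only after every previously enqueued command, including the oneDNN kernels. A minimal sketch, assuming the OpenCL C++ wrapper:

#include <CL/opencl.hpp>

// Produce an event that completes once all work already submitted to the queue
// (e.g. a oneDNN primitive that returned no event of its own) has finished.
cl::Event completion_event_for_prior_work(cl::CommandQueue& queue) {
    cl::Event ev;
    queue.enqueueMarkerWithWaitList(nullptr, &ev);
    return ev;
}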

View File

@@ -230,7 +230,7 @@ public:
bool is_dynamic() const { return _is_dynamic; }
bool can_share_buffer() const { return _can_share_buffer; }
bool is_constant() const { return _is_constant; }
bool is_output_event() const { return _is_output_event; }
bool needs_completion_event() const { return _needs_completion_event; }
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
void allocate_internal_buffers();
@@ -330,7 +330,7 @@ protected:
bool _can_be_optimized = false;
bool _can_share_buffer = true;
bool _is_constant = false;
bool _is_output_event = false;
bool _needs_completion_event = false;
size_t max_output_layout_size = 0;
std::vector<size_t> max_intermediates_memory_sizes;

View File

@@ -1396,8 +1396,11 @@ void network::execute_primitive(const std::shared_ptr<primitive_inst>& primitive
const std::vector<event::ptr>& events) {
event::ptr ev = primitive->execute(events);
// Collect events only for OOO queue and Profiling mode
if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling) {
// Collect events under any of the following conditions:
// 1) OOO queue execution
// 2) Profiling mode is enabled
// 3) Primitive has CPU user or primitive is output
if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling || primitive->needs_completion_event()) {
auto id = primitive->id();
_events.insert({id, ev});
}
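Events used to be recorded only for profiling or OOO execution; now they are also kept whenever a CPU user or the network output may query them later through get_primitive_event(). A small sketch of that bookkeeping, with hypothetical names standing in for the network's members:

#include <CL/opencl.hpp>
#include <map>
#include <string>

// Keep an event only if someone can legitimately ask for it afterwards.
std::map<std::string, cl::Event> events_by_prim;

void record_event(const std::string& id, cl::Event ev,
                  bool out_of_order, bool profiling, bool needs_completion_event) {
    if (out_of_order || profiling || needs_completion_event)
        events_by_prim.emplace(id, std::move(ev));
}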

View File

@@ -684,7 +684,9 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
dependencies = events;
} else {
auto queue_type = get_network().get_stream().get_queue_type();
if (queue_type == QueueTypes::out_of_order) {
// Prepare dependency events in case of an OOO queue, a CPU implementation,
// or an optimized-out impl that has CPU users (the needs_completion_event() && !is_output() condition)
if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
dependencies.reserve(dependencies.size() + _exec_deps.size());
for (auto& input : _exec_deps) {
auto id = input->id();
@@ -803,7 +805,8 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
, _fused_mem_offset((_fused_mem_count > 0 && node.has_fused_dep()) ? node.get_first_fused_dep_idx() : 0)
, _can_be_optimized(node.can_be_optimized())
, _can_share_buffer(node.can_share_buffer())
, _is_constant(node.is_constant()) {
, _is_constant(node.is_constant())
, _needs_completion_event(is_any_user_cpu(node.get_users()) || node.is_output()) {
if (allocate_memory) {
// In the case when the output is a mutable_data primitive and other users' dependencies are only used for
// synchronization, the output memory of such a primitive will be fused with mutable_data
@@ -1392,9 +1395,7 @@ void primitive_inst::save(cldnn::BinaryOutputBuffer& ob) const {
ob << can_be_optimized();
ob << can_share_buffer();
ob << is_constant();
auto users = get_node().get_users();
bool is_output_event = is_any_user_cpu(users) || get_node().is_output();
ob << is_output_event;
ob << needs_completion_event();
if (type() == cldnn::data::type_id()) {
return;
@@ -1485,7 +1486,7 @@ void primitive_inst::load(cldnn::BinaryInputBuffer& ib) {
ib >> _can_be_optimized;
ib >> _can_share_buffer;
ib >> _is_constant;
ib >> _is_output_event;
ib >> _needs_completion_event;
if (type() == cldnn::data::type_id()) {
return;
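The execute() change in this file follows from one fact: a CPU impl runs on the host thread, outside the queue's implicit ordering, so it must collect its producers' events and block on them explicitly. That is also why optimized-out primitives with CPU users now propagate dependencies. A sketch of the host-side wait, assuming the OpenCL C++ wrapper; run_cpu_primitive() is hypothetical:

#include <CL/opencl.hpp>
#include <vector>

// A host-side (CPU) impl cannot rely on in-order queue semantics; it has to wait
// on its dependencies' events itself before touching their output buffers.
void execute_cpu_impl(const std::vector<cl::Event>& dep_events) {
    if (!dep_events.empty())
        cl::Event::waitForEvents(dep_events);
    // run_cpu_primitive(); // hypothetical: safe to read inputs now
}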

View File

@@ -322,8 +322,13 @@ void ocl_stream::enqueue_barrier() {
}
event::ptr ocl_stream::enqueue_marker(std::vector<event::ptr> const& deps, bool is_output) {
if (deps.empty())
return std::make_shared<ocl_user_event>(_engine.get_cl_context(), true);
// Wait for all previously enqueued tasks if deps list is empty
if (deps.empty()) {
cl::Event ret_ev;
_command_queue.enqueueMarkerWithWaitList(nullptr, &ret_ev);
return std::make_shared<ocl_event>(ret_ev);
}
if (sync_method == sync_methods::events) {
cl::Event ret_ev;

View File

@@ -7,8 +7,13 @@
#include "intel_gpu/graph/network.hpp"
#include "intel_gpu/primitives/input_layout.hpp"
#include "intel_gpu/primitives/data.hpp"
#include "intel_gpu/primitives/activation.hpp"
#include "intel_gpu/primitives/broadcast.hpp"
#include "intel_gpu/primitives/concatenation.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/reshape.hpp"
#include "runtime/ocl/ocl_event.hpp"
#include <memory>
@@ -60,3 +65,147 @@ TEST(network_test, model_with_dynamic_input_is_dynamic) {
ASSERT_TRUE(net.is_dynamic());
}
TEST(network_test, has_proper_event_for_in_order_queue) {
auto& engine = get_test_engine();
layout in_layout{{1, 2, 2, 4}, data_types::f32, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto const_mem = engine.allocate_memory({{1, 2, 2, 4}, data_types::f32, format::bfyx});
topology topology;
topology.add(input_layout("input1", in_layout));
topology.add(data("input2", const_mem));
topology.add(activation("activation1", input_info("input1"), activation_func::clamp, {-10.f, 10.f}));
topology.add(concatenation("concat", { input_info("activation1"), input_info("input2") }, 1));
topology.add(reorder("reorder", input_info("concat"), in_layout));
topology.add(activation("activation2", input_info("concat"), activation_func::relu));
auto impl_desc = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"activation2", impl_desc}};
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net(engine, topology, config);
net.set_input_data("input1", input_mem);
net.execute();
ASSERT_FALSE(net.has_event("activation1"));
ASSERT_TRUE(net.has_event("concat"));
ASSERT_TRUE(net.has_event("reorder"));
ASSERT_TRUE(net.has_event("activation2"));
auto concat_ev = net.get_primitive_event("concat");
auto reorder_ev = net.get_primitive_event("reorder");
auto activation_ev = net.get_primitive_event("activation2");
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(concat_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reorder_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(activation_ev.get()));
// Check if we have real underlying OpenCL events
ASSERT_TRUE(downcast<ocl::ocl_base_event>(concat_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
}
TEST(network_test, has_proper_event_for_in_order_queue_optimized_out) {
auto& engine = get_test_engine();
layout in_layout{{1, 2, 2, 4}, data_types::f32, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto const_mem = engine.allocate_memory({{1, 2, 2, 4}, data_types::f32, format::bfyx});
topology topology;
topology.add(input_layout("input1", in_layout));
topology.add(data("input2", const_mem));
topology.add(concatenation("concat", { input_info("input1"), input_info("input2") }, 1));
topology.add(reshape("reshape", input_info("concat"), false, {1, 2, 4, 4}, {1, 2, 4, 4}));
topology.add(reorder("reorder", input_info("reshape"), in_layout));
topology.add(activation("activation", input_info("reshape"), activation_func::relu));
auto impl_desc = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"activation", impl_desc}};
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net(engine, topology, config);
net.set_input_data("input1", input_mem);
net.execute();
ASSERT_TRUE(net.has_event("concat"));
ASSERT_TRUE(net.has_event("reshape"));
ASSERT_TRUE(net.has_event("reorder"));
ASSERT_TRUE(net.has_event("activation"));
auto concat_ev = net.get_primitive_event("concat");
auto reshape_ev = net.get_primitive_event("reshape");
auto reorder_ev = net.get_primitive_event("reorder");
auto activation_ev = net.get_primitive_event("activation");
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(concat_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reshape_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reorder_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(activation_ev.get()));
// Check if we have real underlying OpenCL events
ASSERT_TRUE(downcast<ocl::ocl_base_event>(concat_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reshape_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
}
#ifdef ENABLE_ONEDNN_FOR_GPU
TEST(network_test, has_proper_event_for_in_order_queue_onednn) {
auto& engine = get_test_engine();
if (!engine.get_device_info().supports_immad)
return;
layout in_layout{{1, 16, 2, 4}, data_types::f32, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto weights = engine.allocate_memory({{16, 16, 1, 1}, data_types::f32, format::bfyx});
topology topology;
topology.add(input_layout("input", in_layout));
topology.add(data("weights", weights));
topology.add(convolution("conv", input_info("input"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
topology.add(activation("activation", input_info("conv"), activation_func::relu));
topology.add(reorder("reorder", input_info("conv"), in_layout));
auto impl_desc_cpu = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"conv", impl_desc_onednn}, {"activation", impl_desc_cpu}};
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net(engine, topology, config);
net.set_input_data("input", input_mem);
net.execute();
ASSERT_TRUE(net.has_event("conv"));
ASSERT_TRUE(net.has_event("reorder"));
ASSERT_TRUE(net.has_event("activation"));
auto conv_ev = net.get_primitive_event("conv");
auto reorder_ev = net.get_primitive_event("reorder");
auto activation_ev = net.get_primitive_event("activation");
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(conv_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reorder_ev.get()));
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(activation_ev.get()));
// Check if we have real underlying OpenCL events
ASSERT_TRUE(downcast<ocl::ocl_base_event>(conv_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
}
#endif

View File

@@ -8,6 +8,8 @@
#include <intel_gpu/primitives/activation.hpp>
#include <intel_gpu/primitives/data.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/reshape.hpp>
#include <intel_gpu/primitives/concatenation.hpp>
#include "activation_inst.h"
#include <cmath>
@@ -1999,3 +2001,59 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(padding{}),
::testing::Values(impl_types::cpu),
::testing::Values(true)));
TEST(activation_gpu, has_proper_synchronization) {
auto& engine = get_test_engine();
auto in_layout = layout({1, 2, 2, 4}, data_types::f32, format::bfyx);
auto input_mem = engine.allocate_memory(in_layout);
auto const_mem = engine.allocate_memory({{1, 2, 2, 4}, data_types::f32, format::bfyx});
auto in_data = generate_random_4d<float>(1, 2, 2, 4, -1, 1);
auto const_data = generate_random_4d<float>(1, 2, 2, 4, -1, 1);
set_values(input_mem, flatten_4d(format::bfyx, in_data));
set_values(const_mem, flatten_4d(format::bfyx, const_data));
auto create_topology = [&]() {
topology topology;
topology.add(input_layout("input1", in_layout));
topology.add(data("input2", const_mem));
topology.add(concatenation("concat", { input_info("input1"), input_info("input2") }, 1));
topology.add(reshape("reshape", input_info("concat"), false, {1, 2, 4, 4}, {1, 2, 4, 4}));
topology.add(reorder("reorder", input_info("reshape"), in_layout));
topology.add(activation("activation", input_info("reshape"), activation_func::relu));
return topology;
};
auto topology_ref = create_topology();
auto topology_test = create_topology();
auto impl_desc = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"activation", impl_desc}};
auto config_ref = get_test_default_config(engine);
config_ref.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config_ref.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto config_test = config_ref;
config_test.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net_test(engine, topology_test, config_test);
net_test.set_input_data("input1", input_mem);
auto outputs_test = net_test.execute();
auto res_test = outputs_test.at("activation").get_memory();
network net_ref(engine, topology_ref, config_ref);
net_ref.set_input_data("input1", input_mem);
auto outputs_ref = net_ref.execute();
auto res_ref = outputs_ref.at("activation").get_memory();
ASSERT_EQ(res_test->get_layout().get_linear_size(), res_ref->get_layout().get_linear_size());
cldnn::mem_lock<float> test_mem(res_test, get_test_stream());
cldnn::mem_lock<float> ref_mem(res_ref, get_test_stream());
for (size_t i = 0; i < res_ref->get_layout().get_linear_size(); ++i) {
ASSERT_EQ(test_mem[i], ref_mem[i]);
}
}

View File

@@ -9293,12 +9293,13 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
network.set_input_data("input", input_mem);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.size(), size_t(2));
ASSERT_TRUE(outputs.find("conv_fsv") != outputs.end());
for (auto& p : network.get_primitives_info())
std::cerr << p.original_id << " " << p.kernel_id << std::endl;
auto out_ptr = get_output_values_to_float<FLOAT16>(network, outputs.begin()->second);
auto out_ptr = get_output_values_to_float<FLOAT16>(network, outputs.find("conv_fsv")->second);
auto out_lay = network.get_node_output_layout("conv_fsv");
ASSERT_EQ(out_lay.batch(), expected_result.size());
ASSERT_EQ(out_lay.feature(), expected_result[0].size());
@@ -9657,6 +9658,65 @@ TEST(convolution_gpu_onednn, quantized_onednn_convolution_u8s8f32_asymmetric_act
}
}
TEST(convolution_gpu_onednn, has_proper_synchronization) {
auto& engine = get_test_engine();
if (!engine.get_device_info().supports_immad)
return;
layout in_layout{{1, 16, 2, 4}, data_types::f32, format::bfyx};
auto input_mem = engine.allocate_memory(in_layout);
auto weights_mem = engine.allocate_memory({{16, 16, 1, 1}, data_types::f32, format::bfyx});
auto in_data = generate_random_4d<float>(1, 16, 2, 4, -1, 1);
auto weights_data = generate_random_4d<float>(16, 16, 1, 1, -1, 1);
set_values(input_mem, flatten_4d(format::bfyx, in_data));
set_values(weights_mem, flatten_4d(format::bfyx, weights_data));
auto create_topology = [&]() {
topology topology;
topology.add(input_layout("input", in_layout));
topology.add(data("weights", weights_mem));
topology.add(convolution("conv", input_info("input"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
topology.add(activation("activation", input_info("conv"), activation_func::relu));
topology.add(reorder("reorder", input_info("conv"), in_layout));
return topology;
};
auto topology_ref = create_topology();
auto topology_test = create_topology();
auto impl_desc_cpu = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"conv", impl_desc_onednn}, {"activation", impl_desc_cpu}};
auto config_ref = get_test_default_config(engine);
config_ref.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
config_ref.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto config_test = config_ref;
config_test.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
network net_test(engine, topology_test, config_test);
net_test.set_input_data("input", input_mem);
auto outputs_test = net_test.execute();
auto res_test = outputs_test.at("activation").get_memory();
network net_ref(engine, topology_ref, config_ref);
net_ref.set_input_data("input", input_mem);
auto outputs_ref = net_ref.execute();
auto res_ref = outputs_ref.at("activation").get_memory();
ASSERT_EQ(res_test->get_layout().get_linear_size(), res_ref->get_layout().get_linear_size());
cldnn::mem_lock<float> test_mem(res_test, get_test_stream());
cldnn::mem_lock<float> ref_mem(res_ref, get_test_stream());
for (size_t i = 0; i < res_ref->get_layout().get_linear_size(); ++i) {
ASSERT_EQ(test_mem[i], ref_mem[i]);
}
}
#endif // ENABLE_ONEDNN_FOR_GPU
template <typename T>