[GPU] Fix in-order queue synchronization issue related to OCL/OneDNN impls interaction with CPU impls (#17976)
This commit is contained in:
parent
b023119b9a
commit
e631f65a9b
@ -110,6 +110,7 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<event::ptr> all_events;
|
||||
std::vector<event::ptr> loop_carried_dep(events.begin(), events.end());
|
||||
int64_t current_iteration_idx = 0;
|
||||
while (current_iteration_idx < trip_count && execution_condition) {
|
||||
@ -145,6 +146,15 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
loop_carried_dep.emplace_back(body_event);
|
||||
}
|
||||
|
||||
// Collect output events for waiting for all iterations finishing
|
||||
for (auto& out : body_network->get_outputs()) {
|
||||
auto output_id = out->id();
|
||||
if (body_network->has_event(output_id)) {
|
||||
auto output_event = body_network->get_primitive_event(output_id);
|
||||
all_events.push_back(output_event);
|
||||
}
|
||||
}
|
||||
|
||||
//TODO: execution_condition is prepared as they are presented in the
|
||||
// ngraph opset document for loop operation.
|
||||
// However they are not being used yet and only TensorIterator which
|
||||
@ -157,7 +167,9 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
++current_iteration_idx;
|
||||
}
|
||||
|
||||
body_network->reset_execution();
|
||||
// Reset network and wait for all collected events
|
||||
body_network->reset_execution(false);
|
||||
stream.wait_for_events(all_events);
|
||||
|
||||
// Concatenate sliced output to the outer network
|
||||
for (size_t i = 0; i < concatenated_output_mem_mappings.size(); ++i) {
|
||||
|
@ -40,7 +40,9 @@ public:
|
||||
|
||||
event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
|
||||
auto& stream = instance.get_network().get_stream();
|
||||
return stream.enqueue_marker(events);
|
||||
|
||||
return events.empty() ? stream.create_user_event(true)
|
||||
: stream.enqueue_marker(events);
|
||||
}
|
||||
|
||||
static std::unique_ptr<primitive_impl> create_data(const data_node& data, const kernel_impl_params&) {
|
||||
|
@ -132,7 +132,8 @@ protected:
|
||||
if (group && !is_output)
|
||||
return stream.group_events(events);
|
||||
|
||||
return stream.enqueue_marker(events, is_output);
|
||||
return events.empty() ? stream.create_user_event(true)
|
||||
: stream.enqueue_marker(events, is_output);
|
||||
}
|
||||
|
||||
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
|
||||
@ -259,14 +260,9 @@ protected:
|
||||
if (_kernel_data.kernels[kd_idx].skip_execution)
|
||||
continue;
|
||||
std::vector<event::ptr> new_events;
|
||||
// is any user of the prim's users is an detecion output, set prim as a output event (event won't be nullptr)
|
||||
bool is_output_event;
|
||||
if (instance.node != nullptr) {
|
||||
auto users = instance.node->get_users();
|
||||
is_output_event = is_any_user_cpu(users) || instance.node->is_output();
|
||||
} else {
|
||||
is_output_event = instance.is_output_event();
|
||||
}
|
||||
|
||||
// If any of the prim's users is a CPU implementation, or the prim is a network output, the prim needs an output event (event won't be nullptr)
|
||||
bool needs_completion_event = instance.needs_completion_event();
|
||||
|
||||
auto& params = _kernel_data.kernels[kd_idx].params;
|
||||
auto args = get_arguments(instance);
|
||||
@ -280,9 +276,10 @@ protected:
|
||||
const auto& lws = params.workGroups.local;
|
||||
|
||||
GPU_DEBUG_TRACE_DETAIL << "Enqueue kernel " << kd_idx << ": gws=[" << gws[0] << ", " << gws[1] << ", " << gws[2] << "] "
|
||||
<< "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]" << std::endl;
|
||||
<< "lws=[" << lws[0] << ", " << lws[1] << ", " << lws[2] << "]"
|
||||
<< (needs_completion_event ? " has_completion_event=true" : "") << std::endl;
|
||||
|
||||
auto ev = stream.enqueue_kernel(*_kernels[kd_idx], params, args, tmp_events, is_output_event);
|
||||
auto ev = stream.enqueue_kernel(*_kernels[kd_idx], params, args, tmp_events, needs_completion_event);
|
||||
new_events.push_back(ev);
|
||||
all_events.push_back(ev);
|
||||
|
||||
|
@ -460,16 +460,6 @@ protected:
|
||||
|
||||
void init_kernels(const kernels_cache&, const kernel_impl_params&) override { }
|
||||
|
||||
event::ptr aggregate_events(const std::vector<event::ptr>& events, stream& stream, bool group = false, bool is_output = false) const {
|
||||
if (events.size() == 1 && !is_output)
|
||||
return events[0];
|
||||
|
||||
if (group && !is_output)
|
||||
return stream.group_events(events);
|
||||
|
||||
return stream.enqueue_marker(events, is_output);
|
||||
}
|
||||
|
||||
void set_arguments_impl(typed_primitive_inst<PType>& instance) override {
|
||||
if (instance.can_be_optimized())
|
||||
return;
|
||||
@ -499,6 +489,12 @@ protected:
|
||||
}
|
||||
throw; // rethrowing dnnl::error if not out_of_memory
|
||||
}
|
||||
|
||||
// If oneDNN primitive is the output primitive or its user is a CPU implementation, then enqueue marker
|
||||
// with empty events wait list (which will trigger wait for all previously enqueued tasks) and
|
||||
// return it as oneDNN primitive's event as it is a single option for proper synchronization
|
||||
if (instance.needs_completion_event())
|
||||
event = stream.enqueue_marker({});
|
||||
}
|
||||
|
||||
if (_enable_profiling) {
|
||||
|
@ -230,7 +230,7 @@ public:
|
||||
bool is_dynamic() const { return _is_dynamic; }
|
||||
bool can_share_buffer() const { return _can_share_buffer; }
|
||||
bool is_constant() const { return _is_constant; }
|
||||
bool is_output_event() const { return _is_output_event; }
|
||||
bool needs_completion_event() const { return _needs_completion_event; }
|
||||
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
|
||||
|
||||
void allocate_internal_buffers();
|
||||
@ -330,7 +330,7 @@ protected:
|
||||
bool _can_be_optimized = false;
|
||||
bool _can_share_buffer = true;
|
||||
bool _is_constant = false;
|
||||
bool _is_output_event = false;
|
||||
bool _needs_completion_event = false;
|
||||
|
||||
size_t max_output_layout_size = 0;
|
||||
std::vector<size_t> max_intermediates_memory_sizes;
|
||||
|
@ -1396,8 +1396,11 @@ void network::execute_primitive(const std::shared_ptr<primitive_inst>& primitive
|
||||
const std::vector<event::ptr>& events) {
|
||||
event::ptr ev = primitive->execute(events);
|
||||
|
||||
// Collect events only for OOO queue and Profiling mode
|
||||
if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling) {
|
||||
// Collect events under any of the following conditions:
|
||||
// 1) OOO queue execution
|
||||
// 2) Profiling mode is enabled
|
||||
// 3) Primitive has CPU user or primitive is output
|
||||
if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling || primitive->needs_completion_event()) {
|
||||
auto id = primitive->id();
|
||||
_events.insert({id, ev});
|
||||
}
|
||||
|
@ -684,7 +684,9 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
|
||||
dependencies = events;
|
||||
} else {
|
||||
auto queue_type = get_network().get_stream().get_queue_type();
|
||||
if (queue_type == QueueTypes::out_of_order) {
|
||||
// Prepare dependencies events in case of OOO queue, CPU implementation,
|
||||
// or optimized_out impl which has CPU users (needs_completion_event() && !is_output() condition)
|
||||
if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
|
||||
dependencies.reserve(dependencies.size() + _exec_deps.size());
|
||||
for (auto& input : _exec_deps) {
|
||||
auto id = input->id();
|
||||
@ -803,7 +805,8 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
|
||||
, _fused_mem_offset((_fused_mem_count > 0 && node.has_fused_dep()) ? node.get_first_fused_dep_idx() : 0)
|
||||
, _can_be_optimized(node.can_be_optimized())
|
||||
, _can_share_buffer(node.can_share_buffer())
|
||||
, _is_constant(node.is_constant()) {
|
||||
, _is_constant(node.is_constant())
|
||||
, _needs_completion_event(is_any_user_cpu(node.get_users()) || node.is_output()) {
|
||||
if (allocate_memory) {
|
||||
// In case when output is mutable_data primitive, and other users dependencies are only used for
|
||||
// synchronization, the output memory of such primitive will be fused with mutable_data
|
||||
@ -1392,9 +1395,7 @@ void primitive_inst::save(cldnn::BinaryOutputBuffer& ob) const {
|
||||
ob << can_be_optimized();
|
||||
ob << can_share_buffer();
|
||||
ob << is_constant();
|
||||
auto users = get_node().get_users();
|
||||
bool is_output_event = is_any_user_cpu(users) || get_node().is_output();
|
||||
ob << is_output_event;
|
||||
ob << needs_completion_event();
|
||||
|
||||
if (type() == cldnn::data::type_id()) {
|
||||
return;
|
||||
@ -1485,7 +1486,7 @@ void primitive_inst::load(cldnn::BinaryInputBuffer& ib) {
|
||||
ib >> _can_be_optimized;
|
||||
ib >> _can_share_buffer;
|
||||
ib >> _is_constant;
|
||||
ib >> _is_output_event;
|
||||
ib >> _needs_completion_event;
|
||||
|
||||
if (type() == cldnn::data::type_id()) {
|
||||
return;
|
||||
|
@ -322,8 +322,13 @@ void ocl_stream::enqueue_barrier() {
|
||||
}
|
||||
|
||||
event::ptr ocl_stream::enqueue_marker(std::vector<event::ptr> const& deps, bool is_output) {
|
||||
if (deps.empty())
|
||||
return std::make_shared<ocl_user_event>(_engine.get_cl_context(), true);
|
||||
// Wait for all previously enqueued tasks if deps list is empty
|
||||
if (deps.empty()) {
|
||||
cl::Event ret_ev;
|
||||
_command_queue.enqueueMarkerWithWaitList(nullptr, &ret_ev);
|
||||
|
||||
return std::make_shared<ocl_event>(ret_ev);
|
||||
}
|
||||
|
||||
if (sync_method == sync_methods::events) {
|
||||
cl::Event ret_ev;
|
||||
|
@ -7,8 +7,13 @@
|
||||
#include "intel_gpu/graph/network.hpp"
|
||||
#include "intel_gpu/primitives/input_layout.hpp"
|
||||
#include "intel_gpu/primitives/data.hpp"
|
||||
#include "intel_gpu/primitives/activation.hpp"
|
||||
#include "intel_gpu/primitives/broadcast.hpp"
|
||||
#include "intel_gpu/primitives/concatenation.hpp"
|
||||
#include "intel_gpu/primitives/reorder.hpp"
|
||||
#include "intel_gpu/primitives/reshape.hpp"
|
||||
|
||||
#include "runtime/ocl/ocl_event.hpp"
|
||||
|
||||
#include <memory>
|
||||
|
||||
@ -60,3 +65,147 @@ TEST(network_test, model_with_dynamic_input_is_dynamic) {
|
||||
|
||||
ASSERT_TRUE(net.is_dynamic());
|
||||
}
|
||||
|
||||
TEST(network_test, has_proper_event_for_in_order_queue) {
|
||||
auto& engine = get_test_engine();
|
||||
layout in_layout{{1, 2, 2, 4}, data_types::f32, format::bfyx};
|
||||
auto input_mem = engine.allocate_memory(in_layout);
|
||||
auto const_mem = engine.allocate_memory({{1, 2, 2, 4}, data_types::f32, format::bfyx});
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input1", in_layout));
|
||||
topology.add(data("input2", const_mem));
|
||||
topology.add(activation("activation1", input_info("input1"), activation_func::clamp, {-10.f, 10.f}));
|
||||
topology.add(concatenation("concat", { input_info("activation1"), input_info("input2") }, 1));
|
||||
topology.add(reorder("reorder", input_info("concat"), in_layout));
|
||||
topology.add(activation("activation2", input_info("concat"), activation_func::relu));
|
||||
|
||||
auto impl_desc = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
|
||||
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"activation2", impl_desc}};
|
||||
|
||||
auto config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
|
||||
|
||||
network net(engine, topology, config);
|
||||
|
||||
net.set_input_data("input1", input_mem);
|
||||
net.execute();
|
||||
|
||||
ASSERT_FALSE(net.has_event("activation1"));
|
||||
ASSERT_TRUE(net.has_event("concat"));
|
||||
ASSERT_TRUE(net.has_event("reorder"));
|
||||
ASSERT_TRUE(net.has_event("activation2"));
|
||||
|
||||
auto concat_ev = net.get_primitive_event("concat");
|
||||
auto reorder_ev = net.get_primitive_event("reorder");
|
||||
auto activation_ev = net.get_primitive_event("activation2");
|
||||
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(concat_ev.get()));
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reorder_ev.get()));
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(activation_ev.get()));
|
||||
|
||||
// Check if we have real underlying OpenCL events
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(concat_ev.get())->get().get() != nullptr);
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
|
||||
}
|
||||
|
||||
TEST(network_test, has_proper_event_for_in_order_queue_optimized_out) {
|
||||
auto& engine = get_test_engine();
|
||||
layout in_layout{{1, 2, 2, 4}, data_types::f32, format::bfyx};
|
||||
auto input_mem = engine.allocate_memory(in_layout);
|
||||
auto const_mem = engine.allocate_memory({{1, 2, 2, 4}, data_types::f32, format::bfyx});
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input1", in_layout));
|
||||
topology.add(data("input2", const_mem));
|
||||
topology.add(concatenation("concat", { input_info("input1"), input_info("input2") }, 1));
|
||||
topology.add(reshape("reshape", input_info("concat"), false, {1, 2, 4, 4}, {1, 2, 4, 4}));
|
||||
topology.add(reorder("reorder", input_info("reshape"), in_layout));
|
||||
topology.add(activation("activation", input_info("reshape"), activation_func::relu));
|
||||
|
||||
auto impl_desc = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
|
||||
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"activation", impl_desc}};
|
||||
|
||||
auto config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
|
||||
|
||||
network net(engine, topology, config);
|
||||
|
||||
net.set_input_data("input1", input_mem);
|
||||
net.execute();
|
||||
|
||||
ASSERT_TRUE(net.has_event("concat"));
|
||||
ASSERT_TRUE(net.has_event("reshape"));
|
||||
ASSERT_TRUE(net.has_event("reorder"));
|
||||
ASSERT_TRUE(net.has_event("activation"));
|
||||
|
||||
auto concat_ev = net.get_primitive_event("concat");
|
||||
auto reshape_ev = net.get_primitive_event("reshape");
|
||||
auto reorder_ev = net.get_primitive_event("reorder");
|
||||
auto activation_ev = net.get_primitive_event("activation");
|
||||
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(concat_ev.get()));
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reshape_ev.get()));
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reorder_ev.get()));
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(activation_ev.get()));
|
||||
|
||||
// Check if we have real underlying OpenCL events
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(concat_ev.get())->get().get() != nullptr);
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reshape_ev.get())->get().get() != nullptr);
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
|
||||
}
|
||||
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
TEST(network_test, has_proper_event_for_in_order_queue_onednn) {
|
||||
auto& engine = get_test_engine();
|
||||
if (!engine.get_device_info().supports_immad)
|
||||
return;
|
||||
|
||||
layout in_layout{{1, 16, 2, 4}, data_types::f32, format::bfyx};
|
||||
auto input_mem = engine.allocate_memory(in_layout);
|
||||
auto weights = engine.allocate_memory({{16, 16, 1, 1}, data_types::f32, format::bfyx});
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input", in_layout));
|
||||
topology.add(data("weights", weights));
|
||||
topology.add(convolution("conv", input_info("input"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
|
||||
topology.add(activation("activation", input_info("conv"), activation_func::relu));
|
||||
topology.add(reorder("reorder", input_info("conv"), in_layout));
|
||||
|
||||
auto impl_desc_cpu = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
|
||||
auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn};
|
||||
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"conv", impl_desc_onednn}, {"activation", impl_desc_cpu}};
|
||||
|
||||
auto config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
|
||||
|
||||
network net(engine, topology, config);
|
||||
net.set_input_data("input", input_mem);
|
||||
net.execute();
|
||||
|
||||
ASSERT_TRUE(net.has_event("conv"));
|
||||
ASSERT_TRUE(net.has_event("reorder"));
|
||||
ASSERT_TRUE(net.has_event("activation"));
|
||||
|
||||
auto conv_ev = net.get_primitive_event("conv");
|
||||
auto reorder_ev = net.get_primitive_event("reorder");
|
||||
auto activation_ev = net.get_primitive_event("activation");
|
||||
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(conv_ev.get()));
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(reorder_ev.get()));
|
||||
ASSERT_NO_THROW(downcast<ocl::ocl_base_event>(activation_ev.get()));
|
||||
|
||||
// Check if we have real underlying OpenCL events
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(conv_ev.get())->get().get() != nullptr);
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(reorder_ev.get())->get().get() != nullptr);
|
||||
ASSERT_TRUE(downcast<ocl::ocl_base_event>(activation_ev.get())->get().get() != nullptr);
|
||||
}
|
||||
#endif
|
||||
|
@ -8,6 +8,8 @@
|
||||
#include <intel_gpu/primitives/activation.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
#include <intel_gpu/primitives/reorder.hpp>
|
||||
#include <intel_gpu/primitives/reshape.hpp>
|
||||
#include <intel_gpu/primitives/concatenation.hpp>
|
||||
#include "activation_inst.h"
|
||||
|
||||
#include <cmath>
|
||||
@ -1999,3 +2001,59 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(padding{}),
|
||||
::testing::Values(impl_types::cpu),
|
||||
::testing::Values(true)));
|
||||
|
||||
TEST(activation_gpu, has_proper_synchronization) {
|
||||
auto& engine = get_test_engine();
|
||||
auto in_layout = layout({1, 2, 2, 4}, data_types::f32, format::bfyx);
|
||||
auto input_mem = engine.allocate_memory(in_layout);
|
||||
auto const_mem = engine.allocate_memory({{1, 2, 2, 4}, data_types::f32, format::bfyx});
|
||||
|
||||
auto in_data = generate_random_4d<float>(1, 2, 2, 4, -1, 1);
|
||||
auto const_data = generate_random_4d<float>(1, 2, 2, 4, -1, 1);
|
||||
|
||||
set_values(input_mem, flatten_4d(format::bfyx, in_data));
|
||||
set_values(const_mem, flatten_4d(format::bfyx, const_data));
|
||||
|
||||
auto create_topology =[&]() {
|
||||
topology topology;
|
||||
topology.add(input_layout("input1", in_layout));
|
||||
topology.add(data("input2", const_mem));
|
||||
topology.add(concatenation("concat", { input_info("input1"), input_info("input2") }, 1));
|
||||
topology.add(reshape("reshape", input_info("concat"), false, {1, 2, 4, 4}, {1, 2, 4, 4}));
|
||||
topology.add(reorder("reorder", input_info("reshape"), in_layout));
|
||||
topology.add(activation("activation", input_info("reshape"), activation_func::relu));
|
||||
return topology;
|
||||
};
|
||||
|
||||
auto topology_ref = create_topology();
|
||||
auto topology_test = create_topology();
|
||||
|
||||
auto impl_desc = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
|
||||
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"activation", impl_desc}};
|
||||
|
||||
auto config_ref = get_test_default_config(engine);
|
||||
config_ref.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
|
||||
config_ref.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
|
||||
auto config_test = config_ref;
|
||||
config_test.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
|
||||
|
||||
network net_test(engine, topology_test, config_test);
|
||||
net_test.set_input_data("input1", input_mem);
|
||||
auto outputs_test = net_test.execute();
|
||||
auto res_test = outputs_test.at("activation").get_memory();
|
||||
|
||||
network net_ref(engine, topology_ref, config_ref);
|
||||
net_ref.set_input_data("input1", input_mem);
|
||||
auto outputs_ref = net_ref.execute();
|
||||
auto res_ref = outputs_ref.at("activation").get_memory();
|
||||
|
||||
ASSERT_EQ(res_test->get_layout().get_linear_size(), res_ref->get_layout().get_linear_size());
|
||||
|
||||
cldnn::mem_lock<float> test_mem(res_test, get_test_stream());
|
||||
cldnn::mem_lock<float> ref_mem(res_ref, get_test_stream());
|
||||
|
||||
for (size_t i = 0; i < res_ref->get_layout().get_linear_size(); ++i) {
|
||||
ASSERT_EQ(test_mem[i], ref_mem[i]);
|
||||
}
|
||||
}
|
||||
|
@ -9293,12 +9293,13 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
|
||||
|
||||
network.set_input_data("input", input_mem);
|
||||
auto outputs = network.execute();
|
||||
ASSERT_EQ(outputs.size(), size_t(1));
|
||||
ASSERT_EQ(outputs.size(), size_t(2));
|
||||
ASSERT_TRUE(outputs.find("conv_fsv") != outputs.end());
|
||||
|
||||
for (auto& p : network.get_primitives_info())
|
||||
std::cerr << p.original_id << " " << p.kernel_id << std::endl;
|
||||
|
||||
auto out_ptr = get_output_values_to_float<FLOAT16>(network, outputs.begin()->second);
|
||||
auto out_ptr = get_output_values_to_float<FLOAT16>(network, outputs.find("conv_fsv")->second);
|
||||
auto out_lay = network.get_node_output_layout("conv_fsv");
|
||||
ASSERT_EQ(out_lay.batch(), expected_result.size());
|
||||
ASSERT_EQ(out_lay.feature(), expected_result[0].size());
|
||||
@ -9657,6 +9658,65 @@ TEST(convolution_gpu_onednn, quantized_onednn_convolution_u8s8f32_asymmetric_act
|
||||
}
|
||||
}
|
||||
|
||||
TEST(convolution_gpu_onednn, has_proper_synchronization) {
|
||||
auto& engine = get_test_engine();
|
||||
if (!engine.get_device_info().supports_immad)
|
||||
return;
|
||||
|
||||
layout in_layout{{1, 16, 2, 4}, data_types::f32, format::bfyx};
|
||||
auto input_mem = engine.allocate_memory(in_layout);
|
||||
auto weights_mem = engine.allocate_memory({{16, 16, 1, 1}, data_types::f32, format::bfyx});
|
||||
|
||||
auto in_data = generate_random_4d<float>(1, 16, 2, 4, -1, 1);
|
||||
auto weights_data = generate_random_4d<float>(16, 16, 1, 1, -1, 1);
|
||||
|
||||
set_values(input_mem, flatten_4d(format::bfyx, in_data));
|
||||
set_values(weights_mem, flatten_4d(format::bfyx, weights_data));
|
||||
|
||||
auto create_topology =[&]() {
|
||||
topology topology;
|
||||
topology.add(input_layout("input", in_layout));
|
||||
topology.add(data("weights", weights_mem));
|
||||
topology.add(convolution("conv", input_info("input"), "weights", "", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
|
||||
topology.add(activation("activation", input_info("conv"), activation_func::relu));
|
||||
topology.add(reorder("reorder", input_info("conv"), in_layout));
|
||||
return topology;
|
||||
};
|
||||
|
||||
auto topology_ref = create_topology();
|
||||
auto topology_test = create_topology();
|
||||
|
||||
auto impl_desc_cpu = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::cpu};
|
||||
auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::bfyx, "", impl_types::onednn};
|
||||
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"conv", impl_desc_onednn}, {"activation", impl_desc_cpu}};
|
||||
|
||||
auto config_ref = get_test_default_config(engine);
|
||||
config_ref.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
|
||||
config_ref.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
|
||||
auto config_test = config_ref;
|
||||
config_test.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
|
||||
|
||||
network net_test(engine, topology_test, config_test);
|
||||
net_test.set_input_data("input", input_mem);
|
||||
auto outputs_test = net_test.execute();
|
||||
auto res_test = outputs_test.at("activation").get_memory();
|
||||
|
||||
network net_ref(engine, topology_ref, config_ref);
|
||||
net_ref.set_input_data("input", input_mem);
|
||||
auto outputs_ref = net_ref.execute();
|
||||
auto res_ref = outputs_ref.at("activation").get_memory();
|
||||
|
||||
ASSERT_EQ(res_test->get_layout().get_linear_size(), res_ref->get_layout().get_linear_size());
|
||||
|
||||
cldnn::mem_lock<float> test_mem(res_test, get_test_stream());
|
||||
cldnn::mem_lock<float> ref_mem(res_ref, get_test_stream());
|
||||
|
||||
for (size_t i = 0; i < res_ref->get_layout().get_linear_size(); ++i) {
|
||||
ASSERT_EQ(test_mem[i], ref_mem[i]);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // ENABLE_ONEDNN_FOR_GPU
|
||||
|
||||
template <typename T>
|
||||
|
Loading…
Reference in New Issue
Block a user