[GPU] Remove clFinish call from USM memory lock function (#17830)

This commit is contained in:
Sergey Shlyapnikov 2023-06-02 16:17:05 +04:00 committed by GitHub
parent 43d67b0a32
commit 5afbd4cf92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 30 additions and 23 deletions

View File

@@ -14,6 +14,8 @@
#include <vector>
#include <set>
#include "openvino/core/except.hpp"
namespace cldnn {
struct primitive;
@@ -137,7 +139,8 @@ inline derived_type* downcast(base_type* base) {
if (auto casted = dynamic_cast<derived_type*>(base))
return casted;
throw std::runtime_error("Unable to cast pointer from base to derived type");
OPENVINO_THROW("Unable to cast pointer from base (", typeid(base_type).name(), ") ",
"type to derived (", typeid(derived_type).name(), ") type");
}
template <typename derived_type, typename base_type, typename std::enable_if<std::is_base_of<base_type, derived_type>::value, int>::type = 0>

View File

@@ -55,7 +55,7 @@ struct loop_impl : typed_primitive_impl<loop> {
instance.preprocess_input_memory();
instance.preprocess_backedge_memory();
// set input data for current_iteration primitive if current_it`eration is used
// set input data for current_iteration primitive if current_iteration is used
if (!primitive->current_iteration_id.empty()) {
auto current_iteration_prim = body_network->get_primitive(primitive->current_iteration_id);
auto input_layout_prim = std::dynamic_pointer_cast<input_layout_inst>(current_iteration_prim);
@@ -91,6 +91,14 @@ struct loop_impl : typed_primitive_impl<loop> {
const auto& concatenated_input_mem_mappings = instance.concatenated_input_mem_mappings;
const auto& concatenated_output_mem_mappings = instance.concatenated_output_mem_mappings;
// If there are concatenated_output_mem_mappings or backedge_memory_mappings we need to wait for
// previous tasks before accessing memory in get_sliced_mem() and setup_iteration() functions
if (!concatenated_input_mem_mappings.empty() || !instance.backedge_memory_mappings.empty()) {
for (auto e : events) {
e->wait();
}
}
// Set sliced input data
for (size_t i = 0; i < concatenated_input_mem_mappings.size(); ++i) {
const auto& concatenated_input = concatenated_input_mem_mappings.at(i);

View File

@@ -366,7 +366,6 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type) {
std::lock_guard<std::mutex> locker(_mutex);
if (0 == _lock_count) {
auto& cl_stream = downcast<const ocl_stream>(stream);
cl_stream.finish(); // Synchronization needed for OOOQ.
if (get_allocation_type() == allocation_type::usm_device) {
if (type != mem_lock_type::read) {
throw std::runtime_error("Unable to lock allocation_type::usm_device with write lock_type.");

View File

@@ -84,8 +84,8 @@ public:
ASSERT_EQ(outputs_ref.size(), outputs_fused.size());
ASSERT_EQ(outputs_ref.size(), size_t(1));
auto val_ref=get_output_values_to_float(not_fused, outputs_ref.begin()->first);
auto val_opt=get_output_values_to_float(fused, outputs_fused.begin()->first);
auto val_ref = get_output_values_to_float(not_fused, outputs_ref.begin()->second);
auto val_opt = get_output_values_to_float(fused, outputs_fused.begin()->second);
ASSERT_EQ(val_ref.size(), val_opt.size());
for (size_t i = 0; i < val_ref.size(); i++) {
ASSERT_NEAR(val_ref[i], val_opt[i], tolerance)

View File

@@ -9292,12 +9292,13 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
network network(engine, topology, config);
network.set_input_data("input", input_mem);
network.execute();
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
for (auto& p : network.get_primitives_info())
std::cerr << p.original_id << " " << p.kernel_id << std::endl;
auto out_ptr = get_output_values_to_float<FLOAT16>(network, "conv_fsv");
auto out_ptr = get_output_values_to_float<FLOAT16>(network, outputs.begin()->second);
auto out_lay = network.get_node_output_layout("conv_fsv");
ASSERT_EQ(out_lay.batch(), expected_result.size());
ASSERT_EQ(out_lay.feature(), expected_result[0].size());

View File

@@ -1406,8 +1406,8 @@ public:
ASSERT_EQ(outputs_ocl.size(), outputs_onednn.size());
ASSERT_EQ(outputs_ocl.size(), size_t(1));
auto val_ocl = get_output_values_to_float(network_ocl, outputs_ocl.begin()->first);
auto val_onednn = get_output_values_to_float(network_onednn, outputs_onednn.begin()->first);
auto val_ocl = get_output_values_to_float(network_ocl, outputs_ocl.begin()->second);
auto val_onednn = get_output_values_to_float(network_onednn, outputs_onednn.begin()->second);
ASSERT_EQ(val_ocl.size(), val_onednn.size());

View File

@@ -579,13 +579,9 @@ T div_up(const T a, const U b) {
}
template <class T>
std::vector<float> get_output_values_to_float(cldnn::network& net, const cldnn::primitive_id& output_id, size_t max_cnt = std::numeric_limits<size_t>::max()) {
std::vector<float> get_output_values_to_float(cldnn::network& net, const cldnn::network_output& output, size_t max_cnt = std::numeric_limits<size_t>::max()) {
std::vector<float> ret;
auto ptr = net.get_output_memory(output_id);
auto out_ids = net.get_output_ids();
if (find(out_ids.begin(), out_ids.end(), output_id) == out_ids.end())
IE_THROW() << "Non output node's memory may have been reused. "
"Make target node to output by using ov::intel_gpu::custom_outputs in ExecutionConfig.";
auto ptr = output.get_memory();
cldnn::mem_lock<T, cldnn::mem_lock_type::read> mem(ptr, net.get_stream());
if (ptr->get_layout().data_type != cldnn::type_to_data_type<T>::value)
IE_THROW() << "target type " << cldnn::data_type_traits::name(cldnn::type_to_data_type<T>::value)
@@ -595,20 +591,20 @@ std::vector<float> get_output_values_to_float(cldnn::network& net, const cldnn::
return ret;
}
inline std::vector<float> get_output_values_to_float(cldnn::network& net, const cldnn::primitive_id& output_id, size_t max_cnt = std::numeric_limits<size_t>::max()) {
switch(net.get_output_layout(output_id).data_type){
inline std::vector<float> get_output_values_to_float(cldnn::network& net, const cldnn::network_output& output, size_t max_cnt = std::numeric_limits<size_t>::max()) {
switch(output.get_layout().data_type){
case cldnn::data_types::f16:
return get_output_values_to_float<FLOAT16>(net, output_id, max_cnt);
return get_output_values_to_float<FLOAT16>(net, output, max_cnt);
case cldnn::data_types::f32:
return get_output_values_to_float<float>(net, output_id, max_cnt);
return get_output_values_to_float<float>(net, output, max_cnt);
case cldnn::data_types::i8:
return get_output_values_to_float<int8_t>(net, output_id, max_cnt);
return get_output_values_to_float<int8_t>(net, output, max_cnt);
case cldnn::data_types::u8:
return get_output_values_to_float<uint8_t>(net, output_id, max_cnt);
return get_output_values_to_float<uint8_t>(net, output, max_cnt);
case cldnn::data_types::i32:
return get_output_values_to_float<int32_t>(net, output_id, max_cnt);
return get_output_values_to_float<int32_t>(net, output, max_cnt);
case cldnn::data_types::i64:
return get_output_values_to_float<int64_t>(net, output_id, max_cnt);
return get_output_values_to_float<int64_t>(net, output, max_cnt);
default:
IE_THROW() << "Unknown output data_type";
}