[GPU] Fix synchronization issue from wrong stream in multi-stream mode on dGPU (#16671)

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park 2023-04-05 21:29:47 +09:00 committed by GitHub
parent f5e199c494
commit 44cfbea9ab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 80 additions and 31 deletions

View File

@@ -7,6 +7,7 @@
#include "intel_gpu/graph/serialization/binary_buffer.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/stream.hpp"
#include "intel_gpu/runtime/utils.hpp"
#include "intel_gpu/runtime/tensor.hpp"
#include "intel_gpu/primitives/primitive.hpp"
@@ -32,6 +33,7 @@ struct kernel_impl_params {
bool has_runtime_layouts = false;
const program *prog;
stream::ptr strm;
std::shared_ptr<const primitive> desc;
size_t unique_id;
std::vector<layout> input_layouts;
@@ -52,9 +54,10 @@ struct kernel_impl_params {
std::map<size_t, memory::ptr> memory_deps = {};
size_t primary_input_idx = 0;
kernel_impl_params() : prog(nullptr), desc(nullptr), unique_id(0) {}
kernel_impl_params() : prog(nullptr), strm(nullptr), desc(nullptr), unique_id(0) {}
kernel_impl_params(program& _prog,
stream::ptr _strm,
std::shared_ptr<const primitive> _desc,
size_t _uid,
const std::vector<layout>& _in_layouts,
@@ -62,6 +65,7 @@ struct kernel_impl_params {
const std::vector<cldnn::fused_primitive_desc>& _fused_descs)
: has_runtime_layouts(true)
, prog(&_prog)
, strm(_strm)
, desc(_desc)
, unique_id(_uid)
, input_layouts(_in_layouts)
@@ -119,6 +123,8 @@ struct kernel_impl_params {
OPENVINO_ASSERT(prog != nullptr, "[GPU] Program pointer in kernel_impl_params in not initialized");
return *prog;
}
stream& get_stream() const { return *strm; }
stream::ptr get_stream_ptr() const { return strm; }
size_t hash() const;
bool operator==(const kernel_impl_params& rhs) const;

View File

@@ -151,6 +151,7 @@ public:
nodes_ordering& get_processing_order();
uint32_t get_prog_id() { return prog_id; }
stream& get_stream() { return *_stream; }
stream::ptr get_stream_ptr() const { return _stream; }
const stream& get_stream() const { return *_stream; }
const std::list<primitive_id>& get_optimized_out() const { return optimized_out; }
const std::list<optimized_info>& get_optimized() const { return optimized; }

View File

@@ -96,7 +96,7 @@ std::vector<layout> arg_max_min_inst::calc_output_layouts(arg_max_min_node const
} else if (constant_mem.count(1)) {
std::map<size_t, ngraph::HostTensorPtr> const_data;
auto target_shape_mem = constant_mem.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> target_shape_lock(target_shape_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> target_shape_lock(target_shape_mem, impl_param.get_stream());
const_data.emplace(1, make_host_tensor(target_shape_mem->get_layout(), target_shape_lock.data()));
ov::op::v1::shape_infer(&op, input_shapes, output_shapes, const_data);

View File

@@ -59,18 +59,18 @@ std::vector<layout> border_inst::calc_output_layouts(border_node const& /*node*/
if ((is_begin_mem && memory_deps.count(1)) && (is_end_mem && memory_deps.count(2))) {
auto pads_begin_mem = memory_deps.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> pads_begin_lock(pads_begin_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> pads_begin_lock(pads_begin_mem, impl_param.get_stream());
const_data.emplace(1, make_host_tensor(pads_begin_mem->get_layout(), pads_begin_lock.data()));
auto pads_end_mem = memory_deps.at(2);
cldnn::mem_lock<uint8_t, mem_lock_type::read> pads_end_lock(pads_end_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> pads_end_lock(pads_end_mem, impl_param.get_stream());
const_data.emplace(2, make_host_tensor(pads_end_mem->get_layout(), pads_end_lock.data()));
ov::op::v1::shape_infer(&op, input_shapes, output_shapes, const_data);
} else if ((is_begin_mem || is_end_mem) && memory_deps.count(1)) {
if (is_begin_mem) {
auto pads_begin_mem = memory_deps.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> pads_begin_lock(pads_begin_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> pads_begin_lock(pads_begin_mem, impl_param.get_stream());
const_data.emplace(1, make_host_tensor(pads_begin_mem->get_layout(), pads_begin_lock.data()));
auto pads_end_data = desc->pads_end;
@@ -84,7 +84,7 @@ std::vector<layout> border_inst::calc_output_layouts(border_node const& /*node*/
const_data.emplace(1, pads_begin_tensor);
auto pads_end_mem = memory_deps.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> pads_end_lock(pads_end_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> pads_end_lock(pads_end_mem, impl_param.get_stream());
const_data.emplace(2, make_host_tensor(pads_end_mem->get_layout(), pads_end_lock.data()));
ov::op::v1::shape_infer(&op, input_shapes, output_shapes, const_data);

View File

@@ -73,7 +73,7 @@ std::vector<layout> broadcast_inst::calc_output_layouts(broadcast_node const& /*
auto& constant_mem = impl_param.memory_deps;
if (constant_mem.count(1)) {
auto target_shape_mem = constant_mem.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> target_shape_lock(target_shape_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> target_shape_lock(target_shape_mem, impl_param.get_stream());
const_data.emplace(1, make_host_tensor(target_shape_mem->get_layout(), target_shape_lock.data()));
ov::op::v3::shape_infer(&op, input_shapes, output_shapes, const_data);
} else if (impl_param.input_layouts.size() == 1) {

View File

@@ -60,12 +60,12 @@ std::vector<layout> crop_inst::calc_output_layouts(const crop_node& /*node*/, co
OPENVINO_ASSERT(impl_param.memory_deps.count(1) > 0, "[GPU] Can't find Crop(ngraph VariadicSplit op mode) axis values memory dependency");
auto axis_values_mem = impl_param.memory_deps.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> axis_values_mem_lock(axis_values_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> axis_values_mem_lock(axis_values_mem, impl_param.get_stream());
const_data.emplace(1, make_host_tensor(axis_values_mem->get_layout(), axis_values_mem_lock.data()));
OPENVINO_ASSERT(impl_param.memory_deps.count(2) > 0, "[GPU] Can't find Crop(ngraph VariadicSplit op mode) split length values memory dependency");
auto split_length_mem = impl_param.memory_deps.at(2);
cldnn::mem_lock<uint8_t, mem_lock_type::read> split_length_mem_lock(split_length_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> split_length_mem_lock(split_length_mem, impl_param.get_stream());
const_data.emplace(2, make_host_tensor(split_length_mem->get_layout(), split_length_mem_lock.data()));
ov::op::v1::VariadicSplit op;
@@ -75,7 +75,7 @@ std::vector<layout> crop_inst::calc_output_layouts(const crop_node& /*node*/, co
OPENVINO_ASSERT(impl_param.memory_deps.count(1) > 0, "[GPU] Can't find Crop(ngraph Split op mode) axis values memory dependency");
auto axis_values_mem = impl_param.memory_deps.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> axis_values_mem_lock(axis_values_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> axis_values_mem_lock(axis_values_mem, impl_param.get_stream());
const_data.emplace(1, make_host_tensor(axis_values_mem->get_layout(), axis_values_mem_lock.data()));
ov::op::v1::Split op;

View File

@@ -185,7 +185,7 @@ std::vector<layout> deconvolution_inst::calc_output_layouts(deconvolution_node c
output_shapes = ov::op::v1::shape_infer(&op, input_shapes, pads_begin, pads_end);
} else if (memory_deps.count(2)) {
auto mem = memory_deps.at(2);
auto dims = read_vector<int64_t>(mem, impl_param.prog->get_stream());
auto dims = read_vector<int64_t>(mem, impl_param.get_stream());
auto dims_shape = ov::Shape{dims.size()};
input_shapes.push_back(dims_shape);
output_shapes = ov::op::v1::shape_infer(
@@ -211,7 +211,7 @@ std::vector<layout> deconvolution_inst::calc_output_layouts(deconvolution_node c
output_shapes = ov::op::v1::shape_infer(&op, input_shapes, pads_begin, pads_end);
} else if (memory_deps.count(2)) {
auto mem = memory_deps.at(2);
auto dims = read_vector<int64_t>(mem, impl_param.prog->get_stream());
auto dims = read_vector<int64_t>(mem, impl_param.get_stream());
auto dims_shape = ov::Shape{dims.size()};
input_shapes.push_back(dims_shape);
output_shapes = ov::op::v1::shape_infer(

View File

@@ -1021,6 +1021,9 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
auto& data = node->get_dependency(fused_prim.dep_start_idx);
auto data_layout = data.get_output_layout();
if (fc_layout.is_dynamic() || data_layout.is_dynamic())
continue;
if ((fc_layout.batch() == 1 || fc_layout.feature() == 1) ||
(data_layout.batch() == 1 && data_layout.feature() == 1) ||
(fc_layout.count() == data_layout.count())) {

View File

@@ -156,7 +156,7 @@ struct resample_impl : typed_primitive_impl_ocl<resample> {
bool scales_calc_mod = primitive->shape_calc_mode == resample::InterpolateOp::ShapeCalcMode::SCALES;
if (scales_calc_mod && impl_param.input_layouts.size() > 1 && scales.empty()) {
auto mem = impl_param.memory_deps.at(2);
scales = read_vector<float>(mem, impl_param.prog->get_stream());
scales = read_vector<float>(mem, impl_param.get_stream());
}
for (size_t i = 0; i < scales.size(); ++i) {

View File

@@ -144,6 +144,7 @@ public:
void check_memory_to_set(const memory& mem, const layout& layout) const;
const std::list<const cldnn::program_node *>& get_users() const { return _node->get_users(); }
const kernel_impl_params* get_impl_params() const { return _impl_params.get(); }
// return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead
const primitive_impl* get_impl() const { return _impl.get(); }

View File

@@ -111,8 +111,8 @@ public:
}
virtual std::unique_ptr<kernel_impl_params> get_kernel_impl_params(const std::vector<layout>& in_layouts, const std::vector<layout>& out_layouts) const {
auto params = std::unique_ptr<kernel_impl_params>(new kernel_impl_params(get_program(), get_primitive(), get_unique_id(), in_layouts, out_layouts,
get_fused_primitives()));
auto params = std::unique_ptr<kernel_impl_params>(new kernel_impl_params(get_program(), get_program().get_stream_ptr(), get_primitive(),
get_unique_id(), in_layouts, out_layouts, get_fused_primitives()));
params->memory_deps = get_const_memory_deps();
auto deps = get_dependencies();

View File

@@ -42,7 +42,7 @@ std::vector<layout> non_max_suppression_inst::calc_output_layouts(non_max_suppre
if (memory_deps.count(2)) {
auto max_output_boxes_per_class_mem = memory_deps.at(2);
cldnn::mem_lock<uint8_t, mem_lock_type::read> max_output_boxes_per_class_lock(max_output_boxes_per_class_mem,
impl_param.prog->get_stream());
impl_param.get_stream());
auto max_output_boxes_per_class_tensor = make_host_tensor(max_output_boxes_per_class_mem->get_layout(),
max_output_boxes_per_class_lock.data());
const_data.emplace(2, max_output_boxes_per_class_tensor);

View File

@@ -57,7 +57,7 @@ layout gather_nonzero_inst::calc_output_layout(gather_nonzero_node const& node,
assert(static_cast<bool>(node.get_primitive()->output_data_types[0]) == false &&
"Output data type forcing is not supported for gather_nonzero_node!");
if (impl_param.memory_deps.count(1)) {
auto out_size = read_vector<int64_t>(impl_param.memory_deps.at(1), impl_param.prog->get_stream());
auto out_size = read_vector<int64_t>(impl_param.memory_deps.at(1), impl_param.get_stream());
ov::Shape output_shape(out_size.begin(), out_size.end());
ov::PartialShape output_pshape(output_shape);
return layout{output_pshape, cldnn::data_types::i32, cldnn::format::bfyx};
@@ -72,7 +72,7 @@ std::vector<layout> gather_nonzero_inst::calc_output_layouts(gather_nonzero_node
assert(static_cast<bool>(desc->output_data_types[0]) == false &&
"Output data type forcing is not supported for gather_nonzero_node!");
if (impl_param.memory_deps.count(1)) {
auto out_size = read_vector<int64_t>(impl_param.memory_deps.at(1), impl_param.prog->get_stream());
auto out_size = read_vector<int64_t>(impl_param.memory_deps.at(1), impl_param.get_stream());
// output shape of nonzero is [input_rank, count_non_zero]
auto rank = static_cast<size_t>(impl_param.get_input_layout(0).get<ShapeType>().rank().get_length());
auto count = static_cast<size_t>(out_size[0]);

View File

@@ -633,7 +633,7 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
_shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx});
}
}
_impl_params->strm = _network.get_stream_ptr();
if (_outputs[0])
max_output_layout_size = _outputs[0]->get_layout().get_tensor().count();
}

View File

@@ -39,15 +39,15 @@ std::vector<layout> range_inst::calc_output_layouts(range_node const& /*node*/,
if (memory_deps.count(0) > 0 && memory_deps.count(1) > 0 && memory_deps.count(2) > 0) {
auto start_mem = memory_deps.at(0);
cldnn::mem_lock<uint8_t, mem_lock_type::read> start_mem_lock(start_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> start_mem_lock(start_mem, impl_param.get_stream());
const_data.emplace(0, make_host_tensor(start_mem->get_layout(), start_mem_lock.data()));
auto stop_mem = memory_deps.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> stop_mem_lock(stop_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> stop_mem_lock(stop_mem, impl_param.get_stream());
const_data.emplace(1, make_host_tensor(stop_mem->get_layout(), stop_mem_lock.data()));
auto step_mem = memory_deps.at(2);
cldnn::mem_lock<uint8_t, mem_lock_type::read> step_mem_lock(step_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> step_mem_lock(step_mem, impl_param.get_stream());
const_data.emplace(2, make_host_tensor(step_mem->get_layout(), step_mem_lock.data()));
shape_infer(&op, input_shapes, output_shapes, const_data);

View File

@@ -84,7 +84,7 @@ std::vector<layout> resample_inst::calc_output_layouts(resample_node const& /*no
ov::op::v4::shape_infer(&op, pads_begin, pads_end, input_shapes, output_shapes, {const_data});
} else {
auto sizes_mem = memory_deps.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock(sizes_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock(sizes_mem, impl_param.get_stream());
auto sizes_tensor = make_host_tensor(sizes_mem->get_layout(), lock.data());
const_data.emplace(1, sizes_tensor);
ov::op::v4::shape_infer(&op, pads_begin, pads_end, input_shapes, output_shapes, {const_data});
@@ -96,7 +96,7 @@ std::vector<layout> resample_inst::calc_output_layouts(resample_node const& /*no
ov::op::v4::shape_infer(&op, pads_begin, pads_end, input_shapes, output_shapes, {const_data});
} else {
auto scales_mem = memory_deps.at(2);
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock(scales_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock(scales_mem, impl_param.get_stream());
auto scales_tensor = make_host_tensor(scales_mem->get_layout(), lock.data());
const_data.emplace(2, scales_tensor);
ov::op::v4::shape_infer(&op, pads_begin, pads_end, input_shapes, output_shapes, {const_data});

View File

@@ -111,7 +111,7 @@ std::vector<layout> reshape_inst::calc_output_layouts(reshape_node const& /*node
if (memory_deps.count(1) > 0) {
auto pattern_mem = memory_deps.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> pattern_lock(pattern_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> pattern_lock(pattern_mem, impl_param.get_stream());
auto pattern_ptr = pattern_lock.data();
auto pattern_tensor = make_host_tensor(pattern_mem->get_layout(), pattern_ptr);

View File

@@ -83,9 +83,9 @@ std::vector<layout> strided_slice_inst::calc_output_layouts(strided_slice_node c
auto end_mem = constant_mem.at(2);
auto strides_mem = constant_mem.at(3);
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock1(begin_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock2(end_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock3(strides_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock1(begin_mem, impl_param.get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock2(end_mem, impl_param.get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> lock3(strides_mem, impl_param.get_stream());
auto begin_tensor = make_host_tensor(begin_mem->get_layout(), lock1.data());
auto end_tensor = make_host_tensor(end_mem->get_layout(), lock2.data());

View File

@@ -54,7 +54,7 @@ std::vector<layout> tile_inst::calc_output_layouts(tile_node const& /*node*/, co
if (desc->input_size() == 2) {
if (constant_mem.count(1)) {
auto repeats_mem = constant_mem.at(1);
cldnn::mem_lock<uint8_t, mem_lock_type::read> repeats_lock(repeats_mem, impl_param.prog->get_stream());
cldnn::mem_lock<uint8_t, mem_lock_type::read> repeats_lock(repeats_mem, impl_param.get_stream());
const auto& layout = repeats_mem->get_layout();
const auto repeats_tensor =
ov::Tensor(data_type_to_element_type(layout.data_type), layout.get_shape(), repeats_lock.data());

View File

@@ -78,7 +78,7 @@ public:
#ifdef ENABLE_ONEDNN_FOR_GPU
class FullyConnectedFusingTestOneDNN : public BaseFusingTest<fully_connected_test_params> {
public:
void execute(fully_connected_test_params& p, bool is_caching_test = false) {
void execute(fully_connected_test_params& p, bool is_caching_test = false, bool is_dynamic = false) {
// Onednn post operation has issue in a machine that does not support imad.
if (!engine.get_device_info().supports_immad)
return;
@@ -103,6 +103,8 @@ public:
ov::intel_gpu::ImplementationDesc fc_ocl_impl = { ocl_forcing_format, p.ocl_kernel_name /*fully_connected_gpu_bfyx_ref*/};
cfg_not_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim", fc_ocl_impl } }));
}
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(), is_caching_test);
network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(), is_caching_test);
network_fused->set_input_data("input", input_prim);
@@ -498,6 +500,30 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_add, ::testing::ValuesIn(s
fully_connected_test_params{ CASE_FC_FP16_3D_2, 2, 3, "fully_connected_gpu_bfyx_ref" },
}));
class fc_fp16_eltwise_add_dynamic : public FullyConnectedFusingTestOneDNN {};
TEST_P(fc_fp16_eltwise_add_dynamic, basic) {
auto p = GetParam();
auto test_input_layout = get_input_layout(p);
auto dynamic_input_layout = layout{ov::PartialShape::dynamic(test_input_layout.get_partial_shape().size()), test_input_layout.data_type, test_input_layout.format};
create_topologies(
input_layout("input", dynamic_input_layout),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_per_channel_layout(p), 1, 9)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", padding(), get_output_dim_size(p)),
eltwise("eltwise", { input_info("fc_prim"), input_info("eltwise_data") }, eltwise_mode::sum),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
);
tolerance = 1e-2f;
execute(p, false, true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp16_eltwise_add_dynamic, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
fully_connected_test_params{ CASE_FC_FP16_3, 2, 3, "fully_connected_gpu_bfyx_ref" },
fully_connected_test_params{ CASE_FC_FP16_4, 2, 3, "fully_connected_gpu_bfyx_ref" },
}));
class fc_fp16_eltwise_sub : public FullyConnectedFusingTestOneDNN {};
TEST_P(fc_fp16_eltwise_sub, basic) {
auto p = GetParam();

View File

@@ -13,6 +13,8 @@
#include <vector>
#include <iostream>
#include "primitive_inst.h"
using namespace cldnn;
using namespace ::tests;
@@ -40,14 +42,20 @@ TEST(multistream_gpu, basic) {
topology.add(shape_of("shape_of", input_info("fc"), 3, data_types::i32));
auto prog_ptr = program::build_program(engine, topology, config);
auto &node = prog_ptr->get_node("shape_of");
auto strm = node.get_kernel_impl_params()->get_stream_ptr();
ASSERT_EQ(prog_ptr->get_stream_ptr(), strm);
std::vector<network::ptr> networks;
std::vector<stream::ptr> streams;
for (size_t i = 0; i < num_streams; i++) {
networks.push_back(network::allocate_network(engine, prog_ptr));
streams.push_back(networks[i]->get_stream_ptr());
}
std::vector<InferenceEngine::Task> tasks;
for (size_t i = 0; i < num_streams; i++) {
tasks.push_back([&networks, i, &engine] {
tasks.push_back([&networks, &streams, i, &engine] {
auto cfg = get_test_default_config(engine);
auto stream = engine.create_stream(cfg);
auto net = networks[i];
@@ -61,6 +69,10 @@ TEST(multistream_gpu, basic) {
auto outputs = net->execute();
auto inst = net->get_primitive("shape_of");
auto strm = inst->get_impl_params()->get_stream_ptr();
ASSERT_EQ(streams[i], strm);
auto output = outputs.at("shape_of").get_memory();
cldnn::mem_lock<int32_t> output_ptr(output, *stream);