[GPU] Fix for conv/deconv weights calculated in runtime (#8952)
This commit is contained in:
parent
2f07b98251
commit
b492b59136
@ -196,15 +196,16 @@ KernelsData ConvolutionKernelBase::GetCommonKernelsData(const Params& params,
|
||||
return {};
|
||||
}
|
||||
|
||||
auto preferredWeightsLayout = GetPreferredWeightsLayout(newParams);
|
||||
bool succeed = UpdateWeightsParams(newParams,
|
||||
options,
|
||||
GetPreferredWeightsLayout(newParams),
|
||||
preferredWeightsLayout,
|
||||
kd.weightsReorderParams,
|
||||
GetSupportedKey(),
|
||||
newParams.groups,
|
||||
newParams.transposed);
|
||||
|
||||
bool bSupportedWeightsLayout = newParams.weights.GetLayout() == GetPreferredWeightsLayout(newParams);
|
||||
bool bSupportedWeightsLayout = newParams.weights.GetLayout() == preferredWeightsLayout;
|
||||
const bool bWeightsOK = bSupportedWeightsLayout || options.allowStaticInputReordering;
|
||||
|
||||
if (!succeed || !bWeightsOK) {
|
||||
|
@ -125,7 +125,7 @@ binary_convolution_inst::typed_primitive_inst(network& network, binary_convoluti
|
||||
"Only one-dimensional batch size are supported");
|
||||
CLDNN_ERROR_LESS_THAN(node.id(),
|
||||
"Weights feature maps number",
|
||||
(input_inst.size.feature[0] + pad.feature[0]) / split,
|
||||
input_inst.size.feature[0],
|
||||
"input feature maps number",
|
||||
filter_inst.size.feature[0],
|
||||
"Weights/ifm mismatch");
|
||||
|
@ -97,7 +97,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
|
||||
input_layout.format == format::image_2d_weights_winograd_6x3_s1_xfbyb)
|
||||
CLDNN_ERROR_MESSAGE(
|
||||
node.id(),
|
||||
"Input for convolution should not be in windograd weights format - it is reserved for weights only");
|
||||
"Input for convolution should not be in winograd weights format - it is reserved for weights only");
|
||||
|
||||
if (input_layout.format == format::winograd_2x3_s1_data) {
|
||||
CLDNN_ERROR_NOT_EQUAL(node.id(),
|
||||
@ -369,10 +369,19 @@ convolution_inst::typed_primitive_inst(network& network, convolution_node const&
|
||||
"Only one-dimensional batch size are supported");
|
||||
CLDNN_ERROR_LESS_THAN(node.id(),
|
||||
"Weights feature maps number",
|
||||
(input_inst.size.feature[0] + pad.feature[0]) / split,
|
||||
input_inst.size.feature[0],
|
||||
"input feature maps number",
|
||||
weights_ifm,
|
||||
"Weights/ifm mismatch");
|
||||
|
||||
if (!argument.grouped_weights_shape && !format::is_grouped(filter_inst.format)) {
|
||||
CLDNN_ERROR_NOT_EQUAL(node.id(),
|
||||
"Weights feature maps number",
|
||||
input_inst.size.feature[0],
|
||||
"input feature maps number",
|
||||
weights_ifm,
|
||||
"Weights/ifm mismatch");
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace cldnn
|
||||
|
@ -82,11 +82,11 @@ layout deconvolution_inst::calc_output_layout(deconvolution_node const& node) {
|
||||
int32_t off_factor = -2;
|
||||
size_t spatial_dims = cldnn::format::traits(input_layout.format).spatial_num;
|
||||
CLDNN_ERROR_GREATER_THAN(node.id(),
|
||||
"number of spatial dimensions",
|
||||
spatial_dims,
|
||||
"expected number of dimensions",
|
||||
3,
|
||||
"As for now, deconvolutions with more than 3 dimensions are not supported");
|
||||
"number of spatial dimensions",
|
||||
spatial_dims,
|
||||
"expected number of dimensions",
|
||||
3,
|
||||
"As for now, deconvolutions with more than 3 dimensions are not supported");
|
||||
|
||||
int32_t x = off_factor * pad.spatial[0] + (input_layout.size.spatial[0] - 1) * strd.spatial[0] + filter_size.spatial[0];
|
||||
int32_t y = 1;
|
||||
@ -208,6 +208,7 @@ deconvolution_inst::typed_primitive_inst(network& network, deconvolution_node co
|
||||
1,
|
||||
"Spatial[0] of bias should be 1. Bias isn't 1D vector.");
|
||||
}
|
||||
|
||||
CLDNN_ERROR_NOT_EQUAL(node.id(),
|
||||
"deconvolution padding filling value",
|
||||
node.get_output_layout().data_padding.filling_value(),
|
||||
@ -240,10 +241,19 @@ deconvolution_inst::typed_primitive_inst(network& network, deconvolution_node co
|
||||
"Only one-dimensional features are supported");
|
||||
CLDNN_ERROR_LESS_THAN(node.id(),
|
||||
"Weights feature maps number",
|
||||
(input_inst.size.feature[0] + pad.feature[0]) / split,
|
||||
input_inst.size.feature[0],
|
||||
"input feature maps number",
|
||||
weights_ifm,
|
||||
"Weights/ifm mimsmatch");
|
||||
"Weights/ifm mismatch");
|
||||
|
||||
if (!argument.grouped_weights_shape && !format::is_grouped(filter_inst.format)) {
|
||||
CLDNN_ERROR_NOT_EQUAL(node.id(),
|
||||
"Weights feature maps number",
|
||||
input_inst.size.feature[0],
|
||||
"input feature maps number",
|
||||
weights_ifm,
|
||||
"Weights/ifm mismatch");
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace cldnn
|
||||
|
@ -536,7 +536,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
|
||||
}
|
||||
};
|
||||
|
||||
const auto reorder_input_deconvolution = [&p, &lo, &rf](typed_program_node<deconvolution>& deconv_node) {
|
||||
const auto reorder_input_and_weights_deconvolution = [&p, &lo, &rf](typed_program_node<deconvolution>& deconv_node) {
|
||||
auto& input = deconv_node.input();
|
||||
auto input_layout = input.get_output_layout();
|
||||
auto new_format = lo.get_preferred_format(deconv_node);
|
||||
@ -547,14 +547,41 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
|
||||
p.add_intermediate(reorder.first, deconv_node, 0, !reorder.second);
|
||||
}
|
||||
}
|
||||
|
||||
auto& weights = deconv_node.weights();
|
||||
auto weights_layout = weights.get_output_layout();
|
||||
if (!format::is_simple_data_format(weights_layout.format) && !weights.is_type<data>() && !weights.is_constant()) {
|
||||
auto dims = weights_layout.format.dimension();
|
||||
auto preferred_format = dims <= 4 ? format::bfyx : dims == 5 ? format::bfzyx : format::bfwzyx;
|
||||
auto reorder = rf.get_reorder(weights.id(), weights_layout,
|
||||
layout{ weights_layout.data_type, preferred_format, weights_layout.size });
|
||||
if (reorder.first) {
|
||||
p.add_intermediate(reorder.first, deconv_node, 1, !reorder.second);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const auto reorder_weights_convolution = [&p, &lo, &rf](typed_program_node<convolution>& conv_node) {
|
||||
auto& weights = conv_node.weights();
|
||||
auto weights_layout = weights.get_output_layout();
|
||||
if (!format::is_simple_data_format(weights_layout.format) && !weights.is_type<data>() && !weights.is_constant()) {
|
||||
auto dims = weights_layout.format.dimension();
|
||||
auto preferred_format = dims <= 4 ? format::bfyx : dims == 5 ? format::bfzyx : format::bfwzyx;
|
||||
auto reorder = rf.get_reorder(weights.id(), weights_layout,
|
||||
layout{ weights_layout.data_type, preferred_format, weights_layout.size });
|
||||
if (reorder.first) {
|
||||
p.add_intermediate(reorder.first, conv_node, 1, !reorder.second);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (auto& prim : p.get_processing_order()) {
|
||||
program_helpers::do_for_types<detection_output, binary_convolution, deconvolution>(
|
||||
program_helpers::do_for_types<detection_output, binary_convolution, deconvolution, convolution>(
|
||||
*prim,
|
||||
reorder_input_detection_output,
|
||||
reorder_input_binary_convolution,
|
||||
reorder_input_deconvolution);
|
||||
reorder_input_and_weights_deconvolution,
|
||||
reorder_weights_convolution);
|
||||
}
|
||||
|
||||
for (auto n : p.get_processing_order()) {
|
||||
|
@ -1320,17 +1320,27 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
|
||||
impl_candidate = impl_types::ocl;
|
||||
}
|
||||
|
||||
size_t eltw_dep = 0;
|
||||
for (auto& fo : node.get_fused_primitives()) {
|
||||
if (fo.node->is_type<eltwise>()) {
|
||||
auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
|
||||
auto out_layout = node.get_output_layout();
|
||||
auto in_dt = in_layout.data_type;
|
||||
auto out_dt = out_layout.data_type;
|
||||
if ((out_layout.count() == in_layout.count()) &&
|
||||
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
|
||||
fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
|
||||
impl_candidate = impl_types::ocl;
|
||||
break;
|
||||
if (fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
|
||||
if ((out_layout.count() == in_layout.count()) &&
|
||||
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
|
||||
impl_candidate = impl_types::ocl;
|
||||
break;
|
||||
}
|
||||
if (in_layout.size == out_layout.size && in_layout.format == out_layout.format && in_layout.data_padding == out_layout.data_padding &&
|
||||
data_type_traits::size_of(in_dt) == data_type_traits::size_of(out_dt)) {
|
||||
if (eltw_dep > 0) {
|
||||
impl_candidate = impl_types::ocl;
|
||||
break;
|
||||
}
|
||||
eltw_dep = fo.dep_start_idx;
|
||||
}
|
||||
}
|
||||
} else if (fo.node->is_type<activation>()) {
|
||||
// Some activations aren't implemented in oneDNN
|
||||
|
@ -514,15 +514,17 @@ void network::allocate_primitives() {
|
||||
can_reuse_eltwise_mem = true;
|
||||
}
|
||||
|
||||
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
|
||||
auto& eltw_inst = _primitives.at(eltw_in.id());
|
||||
auto& prim_inst = _primitives.at(node->id());
|
||||
auto eltw_mem_type = eltw_inst->output_memory().get_allocation_type();
|
||||
auto prim_mem_type = prim_inst->output_memory().get_allocation_type();
|
||||
if (!can_reuse_eltwise_mem) {
|
||||
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
|
||||
auto& eltw_inst = _primitives.at(eltw_in.id());
|
||||
auto& prim_inst = _primitives.at(node->id());
|
||||
auto eltw_mem_type = eltw_inst->output_memory().get_allocation_type();
|
||||
auto prim_mem_type = prim_inst->output_memory().get_allocation_type();
|
||||
|
||||
// Keep lockable memory type for `prim_inst` output if needed
|
||||
if (eltw_mem_type != prim_mem_type && eltw_mem_type != allocation_type::cl_mem && eltw_mem_type != allocation_type::usm_host)
|
||||
can_reuse_eltwise_mem = false;
|
||||
// Keep lockable memory type for `prim_inst` output if needed
|
||||
if (eltw_mem_type != prim_mem_type && eltw_mem_type != allocation_type::cl_mem && eltw_mem_type != allocation_type::usm_host)
|
||||
can_reuse_eltwise_mem = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (fused_op.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(eltw_in_layout) && !can_reuse_eltwise_mem) {
|
||||
|
@ -741,10 +741,10 @@ program_node& program::get_or_create(std::shared_ptr<primitive> prim) {
|
||||
}
|
||||
|
||||
void program::add_intermediate(program_node& node,
|
||||
program_node& next,
|
||||
size_t prev_idx,
|
||||
bool connect_int_node_with_old_dep,
|
||||
bool move_usrs_of_prev_to_node) {
|
||||
program_node& next,
|
||||
size_t prev_idx,
|
||||
bool connect_int_node_with_old_dep,
|
||||
bool move_usrs_of_prev_to_node) {
|
||||
if (connect_int_node_with_old_dep && !node.dependencies.empty())
|
||||
throw std::invalid_argument(
|
||||
"Node which is about to be added in between two other nodes should not have any existing dependencies");
|
||||
@ -1112,8 +1112,8 @@ void program::remove_nodes(std::vector<program_node*>& to_remove) {
|
||||
// TODO: break this function into a number of smaller ones + add per-primitive fields (possibly use
|
||||
// primitive_inst::to_string?)
|
||||
void program::dump_program(const char* stage,
|
||||
bool with_full_info,
|
||||
std::function<bool(program_node const&)> const& filter) const {
|
||||
bool with_full_info,
|
||||
std::function<bool(program_node const&)> const& filter) const {
|
||||
std::string path = get_dir_path(options);
|
||||
if (path.empty() || !with_full_info) {
|
||||
return;
|
||||
@ -1230,7 +1230,7 @@ void program::save_pass_info(std::string pass_name) {
|
||||
}
|
||||
|
||||
void program::add_optimized_primitive_info(primitive_id optimized_primitive_id,
|
||||
std::vector<primitive_id> replaced_with_ids) {
|
||||
std::vector<primitive_id> replaced_with_ids) {
|
||||
for (auto& e : optimized) {
|
||||
auto it = std::find_if(e.second.begin(), e.second.end(), [&optimized_primitive_id](const primitive_id& id) {
|
||||
return optimized_primitive_id == id;
|
||||
|
@ -428,7 +428,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
|
||||
// Ignore optimized operations for "previous" operation in our operation pair
|
||||
while (type_is_any_optimized(prev_type) && cur_post_op_idx < post_ops_size - 1) {
|
||||
prev_post_op_idx++;
|
||||
cur_post_op_idx++;
|
||||
if (prev_post_op_idx == cur_post_op_idx)
|
||||
cur_post_op_idx++;
|
||||
prev_type = cur_post_ops[prev_post_op_idx].op_type;
|
||||
cur_type = cur_post_ops[cur_post_op_idx].op_type;
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -681,7 +681,7 @@ TEST_P(conv_fp32_reorder_fsv16_to_bfyx_conv, basic) {
|
||||
reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32),
|
||||
convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
|
||||
reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32),
|
||||
convolution("conv_output", "reorder_bfyx", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation),
|
||||
convolution("conv_output", "reorder_bfyx", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
|
||||
activation("activation", "conv_output", activation_func::abs),
|
||||
reorder("reorder_output", "activation", p.default_format, data_types::f32)
|
||||
);
|
||||
@ -10059,7 +10059,7 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, have_subtract_per_feature)
|
||||
data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
|
||||
convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
|
||||
reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32, values_to_subtract),
|
||||
convolution("conv_output", "reorder_fsv32", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation),
|
||||
convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
|
||||
activation("activation", "conv_output", activation_func::abs)
|
||||
);
|
||||
|
||||
@ -10088,7 +10088,7 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation, have_fused_activat
|
||||
convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
|
||||
reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32),
|
||||
activation("activation_quantize", "reorder_fsv32", activation_func::relu),
|
||||
convolution("conv_output", "activation_quantize", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation),
|
||||
convolution("conv_output", "activation_quantize", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
|
||||
activation("activation", "conv_output", activation_func::abs)
|
||||
);
|
||||
|
||||
@ -10116,7 +10116,7 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding, have_data_padding) {
|
||||
data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
|
||||
convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
|
||||
reorder("reorder_fsv32", "conv_prim", layout(data_types::f32, format::fs_b_yx_fsv32, dw_tensor, padding{ {0, 0, 1, 1}, 0 })),
|
||||
convolution("conv_output", "reorder_fsv32", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation),
|
||||
convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
|
||||
activation("activation", "conv_output", activation_func::abs),
|
||||
activation("activation2", "conv_prim", activation_func::abs),
|
||||
eltwise("add_bias", { "activation", "activation2" }, eltwise_mode::sum)
|
||||
|
@ -43,7 +43,7 @@ TEST(memory_tests, DISABLED_network_creation_loop)
|
||||
{
|
||||
engine eng;
|
||||
|
||||
memory in = memory::allocate(eng, layout{ data_types::f32, format::bfyx,{ 1, 1, 1000, 1000 } });
|
||||
memory in = memory::allocate(eng, layout{ data_types::f32, format::bfyx, { 1, 1, 1000, 1000 } });
|
||||
|
||||
topology tpl{
|
||||
input_layout("in", in->get_layout()),
|
||||
@ -66,7 +66,7 @@ TEST(memory_pool, basic_non_padded_relu_pipe) {
|
||||
auto x_size = 1;
|
||||
auto y_size = 1;
|
||||
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input", input->get_layout()));
|
||||
@ -86,7 +86,7 @@ TEST(memory_pool, basic_non_padded_relu_pipe) {
|
||||
network.set_input_data("input", input);
|
||||
auto outputs = network.execute();
|
||||
|
||||
EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 64);
|
||||
EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)64);
|
||||
}
|
||||
|
||||
TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
|
||||
@ -99,13 +99,13 @@ TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
|
||||
auto x_size = 4;
|
||||
auto y_size = 4;
|
||||
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input", input->get_layout()));
|
||||
topology.add(activation("relu", "input", activation_func::relu));
|
||||
topology.add(activation("relu1", "relu", activation_func::relu));
|
||||
topology.add(pooling("pool1", "relu1",pooling_mode::max, { 1,1,3,3 }, { 1,1,2,2 }));
|
||||
topology.add(pooling("pool1", "relu1", pooling_mode::max, { 1, 1, 3, 3 }, { 1, 1, 2, 2 }));
|
||||
topology.add(activation("relu2", "pool1", activation_func::relu));
|
||||
topology.add(activation("relu3", "relu2", activation_func::relu));
|
||||
topology.add(activation("relu4", "relu3", activation_func::relu));
|
||||
@ -133,7 +133,7 @@ TEST(memory_pool, multi_outputs_network) {
|
||||
auto x_size = 4;
|
||||
auto y_size = 4;
|
||||
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input", input->get_layout()));
|
||||
@ -153,7 +153,7 @@ TEST(memory_pool, multi_outputs_network) {
|
||||
network.set_input_data("input", input);
|
||||
auto outputs = network.execute();
|
||||
|
||||
EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)1536);
|
||||
EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 1536);
|
||||
}
|
||||
|
||||
TEST(memory_pool, oooq) {
|
||||
@ -171,14 +171,14 @@ TEST(memory_pool, oooq) {
|
||||
auto x_size = 4;
|
||||
auto y_size = 4;
|
||||
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input", input->get_layout()));
|
||||
topology.add(activation("relu1", "input", activation_func::relu));
|
||||
topology.add(activation("relu2", "input", activation_func::relu));
|
||||
topology.add(activation("relu3", "input", activation_func::relu));
|
||||
topology.add(concatenation("concat1", { "relu1", "relu2"},concatenation::along_f));
|
||||
topology.add(concatenation("concat1", { "relu1", "relu2" },concatenation::along_f));
|
||||
topology.add(activation("relu4", "concat1", activation_func::relu));
|
||||
topology.add(activation("relu5", "relu3", activation_func::relu));
|
||||
topology.add(concatenation("concat2", { "relu4", "relu5" }, concatenation::along_f));
|
||||
@ -209,7 +209,7 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
|
||||
auto inp_x_size = 4;
|
||||
auto inp_y_size = 4;
|
||||
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
|
||||
auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
|
||||
|
||||
set_values(input,
|
||||
{ 1.0f, 2.5f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 6.1f, 4.7f, 1.0f, 1.0f, 8.2f, 1.0f, 2.0f, 1.0f,
|
||||
@ -227,7 +227,7 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
|
||||
topology.add(activation("relu4", "concat1", activation_func::relu));
|
||||
topology.add(activation("relu5", "relu3", activation_func::relu));
|
||||
topology.add(concatenation("concat2", { "relu4", "relu5" }, concatenation::along_f));
|
||||
topology.add(activation("relu6", "concat2", activation_func::linear, {1.0f, 0.5f}));
|
||||
topology.add(activation("relu6", "concat2", activation_func::linear, { 1.0f, 0.5f }));
|
||||
|
||||
build_options bo;
|
||||
bo.set_option(build_option::optimize_data(true));
|
||||
@ -286,8 +286,8 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice_weights) {
|
||||
auto inp_x_size = 4;
|
||||
auto inp_y_size = 4;
|
||||
|
||||
auto input= engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
|
||||
auto weights = engine->allocate_memory({ data_types::f32,format::bfyx,{ 1, 1, 3, 2 } });
|
||||
auto input= engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
|
||||
auto weights = engine->allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 3, 2 } });
|
||||
|
||||
std::vector<float> dummy_input_data_1 = {
|
||||
/*f0 xy*/ 0.8f, 0.65f, 0.1f, 1.0f, 1.0f, 0.5f, 0.11f, 0.33f, 0.66f, 0.11f, 0.22f, 0.33f, 0.99f, 0.8f, 0.7f, 0.5f,
|
||||
@ -373,10 +373,10 @@ TEST(memory_pool, shared_mem_pool_diff_batches) {
|
||||
layout lay_batch_8 = { dt, fmt, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_8)) }};
|
||||
auto input_1 = engine->allocate_memory(lay_batch_1);
|
||||
auto input_8 = engine->allocate_memory(lay_batch_8);
|
||||
auto weights = engine->allocate_memory({ dt, fmt, { 1, 1, 3, 2 } });
|
||||
auto weights = engine->allocate_memory({ dt, fmt, { 1, 3, 3, 2 } });
|
||||
|
||||
std::vector<float> dummy_input_data_1 = generate_random_1d<float>(batch_1*feature_num*inp_x_size*inp_y_size, 0, 1);
|
||||
std::vector<float> dummy_input_data_8 = generate_random_1d<float>(batch_8*feature_num*inp_x_size*inp_y_size, 0, 1);
|
||||
std::vector<float> dummy_input_data_1 = generate_random_1d<float>(batch_1 * feature_num * inp_x_size * inp_y_size, 0, 1);
|
||||
std::vector<float> dummy_input_data_8 = generate_random_1d<float>(batch_8 * feature_num * inp_x_size * inp_y_size, 0, 1);
|
||||
|
||||
set_values(input_1, dummy_input_data_1);
|
||||
set_values(input_8, dummy_input_data_8);
|
||||
@ -396,14 +396,14 @@ TEST(memory_pool, shared_mem_pool_diff_batches) {
|
||||
auto outputs = network_first.execute();
|
||||
|
||||
auto dev_info = engine->get_device_info();
|
||||
EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)3928);
|
||||
EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 4744);
|
||||
|
||||
topo.change_input_layout("input", input_1->get_layout());//change input layout to batch=1
|
||||
|
||||
network network_second(*engine, topo, bo);
|
||||
network_second.set_input_data("input", input_1);
|
||||
auto outputs_second = network_second.execute();
|
||||
EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)4328);
|
||||
EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 5912);
|
||||
}
|
||||
|
||||
TEST(memory_pool, shared_dep_two_output) {
|
||||
@ -459,20 +459,20 @@ TEST(memory_pool, non_opt_intermidate_opt_after) {
|
||||
|
||||
auto input_memory1 = engine.allocate_memory(input_layout1);
|
||||
auto input_memory2 = engine.allocate_memory(input_layout2);
|
||||
auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1,1,1,1 }));
|
||||
auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
|
||||
auto data_memory = cldnn::data("scale_mem", scale_memory);
|
||||
|
||||
set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f });
|
||||
set_values(input_memory2, { 5.0f, 6.0f, 7.0f, 8.0f });
|
||||
set_values(scale_memory, { 1.0f});
|
||||
set_values(scale_memory, { 1.0f });
|
||||
|
||||
auto reshape_tensor = cldnn::tensor(8, 1, 1, 1);
|
||||
auto input = cldnn::input_layout("input1", input_layout1);
|
||||
auto input2 = cldnn::input_layout("input2", input_layout2);
|
||||
auto concat = cldnn::concatenation("concat", { "input1", "input2" }, cldnn::concatenation::along_b);
|
||||
auto reshape = cldnn::reshape("reshape", "concat", reshape_tensor);
|
||||
auto crop1 = cldnn::crop("crop1", "reshape", { 1,1,1,1 }, { 0, 0, 0, 0 });
|
||||
auto crop2 = cldnn::crop("crop2", "reshape", { 1,1,1,1 }, { 1, 0, 0, 0 });
|
||||
auto crop1 = cldnn::crop("crop1", "reshape", { 1, 1, 1, 1 }, { 0, 0, 0, 0 });
|
||||
auto crop2 = cldnn::crop("crop2", "reshape", { 1, 1, 1, 1 }, { 1, 0, 0, 0 });
|
||||
auto eltwise1 = cldnn::scale("elt1", "crop1", "scale_mem");
|
||||
auto eltwise2 = cldnn::scale("elt2", "crop2", "scale_mem");
|
||||
|
||||
@ -508,7 +508,7 @@ TEST(memory_pool, add_mem_dep_test) {
|
||||
auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 2, 2, 2 });
|
||||
|
||||
auto input_memory1 = engine.allocate_memory(input_layout1);
|
||||
auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1,1,1,1 }));
|
||||
auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
|
||||
auto data_memory = cldnn::data("scale_mem", scale_memory);
|
||||
|
||||
set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f,
|
||||
@ -518,8 +518,8 @@ TEST(memory_pool, add_mem_dep_test) {
|
||||
auto input = cldnn::input_layout("input1", input_layout1);
|
||||
auto actv1 = cldnn::activation("input_activ1", "input1", activation_func::abs);
|
||||
auto actv2 = cldnn::activation("input_activ2", "input1", activation_func::abs);
|
||||
auto crop1 = cldnn::crop("crop1", "input_activ1", { 1,1,2,2 }, { 0, 0, 0, 0 });
|
||||
auto crop2 = cldnn::crop("crop2", "input_activ2", { 1,1,2,2 }, { 0, 1, 0, 0 });
|
||||
auto crop1 = cldnn::crop("crop1", "input_activ1", { 1, 1, 2, 2 }, { 0, 0, 0, 0 });
|
||||
auto crop2 = cldnn::crop("crop2", "input_activ2", { 1, 1, 2, 2 }, { 0, 1, 0, 0 });
|
||||
auto eltwise1 = cldnn::scale("elt1", "crop1", "scale_mem");
|
||||
auto eltwise2 = cldnn::scale("elt2", "crop2", "scale_mem");
|
||||
auto actv3 = cldnn::activation("out3", "elt1", activation_func::abs);
|
||||
|
@ -137,11 +137,12 @@ static void CreateConvolutionBackpropDataOp(Program& p, const std::shared_ptr<ng
|
||||
|
||||
auto weightsName = inputs[1];
|
||||
auto weights_node = op->get_input_node_shared_ptr(1);
|
||||
// WA: For the cases like Const(weights)->Sub(zp)->Deconv.
|
||||
bool hasConstantWeights = IsNodeOnConstPath(weights_node);
|
||||
// WA: For the cases like Const(weights)->Sub(zp)->Deconv. And also for the cases with real runtime weights.
|
||||
// Dimensions order of weights blob is IOYX, but
|
||||
// the selected format is OIYX by default. So we need to swap (and transpose) I and O dimensions to match the format
|
||||
// For Constant node on input transpose is not needed, because the data is transposed on const node creation
|
||||
if (IsNodeOnConstPath(weights_node) && std::dynamic_pointer_cast<ngraph::op::v0::Constant>(weights_node) == nullptr) {
|
||||
if ((hasConstantWeights && std::dynamic_pointer_cast<ngraph::op::v0::Constant>(weights_node) == nullptr) || !hasConstantWeights) {
|
||||
std::string permuteName = layerName + "_cldnn_weights_permute";
|
||||
auto weights_rank = op->get_input_shape(1).size();
|
||||
std::vector<uint16_t> permute_order(weights_rank);
|
||||
@ -195,11 +196,12 @@ static void CreateGroupConvolutionBackpropDataOp(Program& p, const std::shared_p
|
||||
|
||||
auto weightsName = inputs[1];
|
||||
auto weights_node = op->get_input_node_shared_ptr(1);
|
||||
// WA: For the cases like Const(weights)->Sub(zp)->Deconv.
|
||||
bool hasConstWeights = IsNodeOnConstPath(weights_node);
|
||||
// WA: For the cases like Const(weights)->Sub(zp)->Deconv. And also for the cases with real runtime weights.
|
||||
// Dimensions order of weights blob is IOYX, but
|
||||
// the selected format is OIYX by default. So we need to swap I and O dimensions to match the format.
|
||||
// For Constant node on input transpose is not needed, because the data is transposed on const node creation
|
||||
if (IsNodeOnConstPath(weights_node) && std::dynamic_pointer_cast<ngraph::op::v0::Constant>(weights_node) == nullptr) {
|
||||
if ((hasConstWeights && std::dynamic_pointer_cast<ngraph::op::v0::Constant>(weights_node) == nullptr) || !hasConstWeights) {
|
||||
std::string permuteName = layerName + "_cldnn_weights_permute";
|
||||
auto weights_rank = op->get_input_shape(1).size();
|
||||
std::vector<uint16_t> permute_order(weights_rank);
|
||||
|
@ -346,7 +346,7 @@ bool IsNodeOnConstPath(const std::shared_ptr<ngraph::Node>& node) {
|
||||
std::function<bool(const std::shared_ptr<ngraph::Node>&)> is_const_node = [&nodes_processed, &is_const_node](const std::shared_ptr<ngraph::Node>& node) {
|
||||
if (nodes_processed.count(node)) return true;
|
||||
nodes_processed.insert(node);
|
||||
// If input is constant, then drop if from the processing list
|
||||
// If input is constant, then drop it from the processing list
|
||||
if (std::dynamic_pointer_cast<ngraph::op::v0::Constant>(node) != nullptr)
|
||||
return true;
|
||||
// If the node doesn't have any parents and it's not a constant, then we are dealing with a dynamic path
|
||||
|
Loading…
Reference in New Issue
Block a user