[GPU] Fix reshape split for dynamic models + accuracy fix for SAM (#16911)

This commit is contained in:
Vladimir Paramuzov
2023-04-25 13:21:31 +04:00
committed by GitHub
parent 9247906879
commit f736c71feb
7 changed files with 182 additions and 89 deletions

View File

@@ -149,6 +149,11 @@ void handle_reshape::run(program& p) {
auto new_reshape = std::make_shared<reshape>("reorder:_reshape_split_" + user->id() + "_" + node->id(),
input_node.id(),
output_shape);
new_reshape->special_zero = prim->special_zero;
new_reshape->output_partial_shape = prim->output_partial_shape;
new_reshape->output_pattern = prim->output_pattern;
new_reshape->mode = prim->mode;
new_reshape->input = prim->input;
auto& new_reshape_node = p.get_or_create(new_reshape);
user->replace_dependency(0, input_node);
p.add_intermediate(new_reshape_node, *user, 0);

View File

@@ -23,16 +23,6 @@ namespace cldnn {
// Renders a boolean as the literal text ("true"/"false") used in debug dumps.
inline std::string bool_to_str(bool cond) {
    if (cond)
        return "true";
    return "false";
}
// Extracts the text between the first '<' and the first '>' of a type name,
// e.g. "struct cldnn::typed_program_node<cldnn::data>" -> "cldnn::data".
// Returns an empty string when the name has no well-formed "<...>" part.
inline std::string get_extr_type(const std::string& str) {
    const auto begin = str.find('<');
    const auto end = str.find('>');
    // Also reject a '>' that appears before '<': in that case the original
    // (end - begin) - 1 underflowed size_t and substr returned a bogus tail.
    if (begin == std::string::npos || end == std::string::npos || end < begin)
        return {};
    return str.substr(begin + 1, (end - begin) - 1);
}
// Converts a data_types enum value to its string name; thin wrapper over
// data_type_traits::name.
inline std::string dt_to_str(data_types dt) {
return data_type_traits::name(dt);
}

View File

@@ -846,7 +846,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
allocation_type type, bool reusable, bool reset = true) {
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
// Use layout with max tensor for dynamic shape with upper bound
auto static_layout = cldnn::layout(layout.data_type, layout.format, layout.get_tensor(), layout.data_padding);
auto static_layout = cldnn::layout(layout.get_partial_shape().get_max_shape(), layout.data_type, layout.format, layout.data_padding);
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable, reset);
return pool.get_memory(static_layout, type, reset);

View File

@@ -6,6 +6,7 @@
#include "to_string_utils.h"
#include "data_inst.h"
#include "condition_inst.h"
#include "data_inst.h"
#include "json_object.h"
#include <algorithm>
@@ -170,40 +171,18 @@ void dump_graph_init(std::ofstream& graph,
const program& program,
std::function<bool(program_node const&)> const& filter) {
const std::string invalid_layout_msg = "(invalid layout)";
const auto extr_oformat = [&invalid_layout_msg](const program_node* ptr) {
if (!ptr->is_valid_output_layout())
return invalid_layout_msg;
auto output_layout = ptr->get_output_layout();
std::string out = output_layout.format.to_string();
return out;
};
const auto extr_odt = [&invalid_layout_msg](const program_node* ptr) {
if (!ptr->is_valid_output_layout())
return invalid_layout_msg;
auto output_layout = ptr->get_output_layout();
std::string out = dt_to_str(output_layout.data_type);
return out;
};
const auto dump_mem_info = [&invalid_layout_msg](const program_node* ptr) {
std::string out = "size_info: ";
std::string out = "layout_info: ";
if (!ptr->is_valid_output_layout()) {
return out + invalid_layout_msg;
}
auto out_layout = ptr->get_output_layout();
auto tensor_str = out_layout.to_string();
auto padding = out_layout.data_padding;
out += tensor_str;
if (!padding) {
out += " (nonpadded)";
if (!out_layout.data_padding) {
out += " " + out_layout.to_short_string();
} else {
out += "\nl: " + padding.lower_size().to_string() + "\nu: " + padding.upper_size().to_string();
out += " " + out_layout.to_string();
}
return out;
@@ -218,23 +197,20 @@ void dump_graph_init(std::ofstream& graph,
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpotentially-evaluated-expression"
#endif
auto& node_type = typeid(*node);
std::string node_type_name = get_extr_type(node_type.name());
graph << " " << get_node_id(node) << "[label=\"" << node->id() << ":\n"
<< node_type_name << "\n out format: " + extr_oformat(node)
<< "\n out data_type: " + extr_odt(node)
std::string node_type_name = node->get_primitive()->type_string();
graph << " " << get_node_id(node) << "[label=\"" << node->id() << ":"
<< "\\ntype: " << node_type_name
<< "\\nprocessing number: " << program.get_processing_order().get_processing_number(node)
<< "\\n color:" << (node->is_reusing_memory() ? std::to_string(node->get_reused_memory_color()) : "none")
<< (node->can_be_optimized() ? "\\n optimized out" : "");
if (node_type_name != "struct cldnn::data" && node_type_name != "struct cldnn::input_layout" &&
!node->can_be_optimized()) {
if (!node->is_type<data>()) {
graph << "\\n Selected kernel: "
<< (node->get_selected_impl() == nullptr ? "none"
: node->get_selected_impl()->get_kernel_name()) + " / "
<< node->get_preferred_impl_type()
<< "\n" + dump_mem_info(node);
<< node->get_preferred_impl_type();
}
graph << "\n" + dump_mem_info(node);
graph << "\"";
#ifdef __clang__
#pragma clang diagnostic pop

View File

@@ -30,7 +30,15 @@ JitConstants ReduceKernelBase::GetJitConstants(const reduce_params& params) cons
const auto& output = params.outputs[0];
if (output.is_dynamic()) {
size_t output_tensor_offset = 1 + GetFusedPrimitiveInputsCount(params);
size_t output_tensor_offset = params.inputs[0].is_dynamic() ? 1 : 0;
for (size_t i = 0; i < params.fused_ops.size(); i++) {
auto& fused_op_inputs = params.fused_ops[i].tensors;
for (auto& t : fused_op_inputs) {
if (t.is_dynamic())
output_tensor_offset++;
}
}
DimensionAccessHelper dims(output, output_tensor_offset);
jit.AddConstant(MakeJitConstant("COMPUTATIONAL_OPERATIONS_NUMBER", toVectorMulString({dims.x,
dims.y,

View File

@@ -18,8 +18,8 @@ using namespace ::tests;
namespace {
struct reduce_test_params {
cldnn::tensor in_shape;
cldnn::tensor out_shape;
ov::PartialShape in_shape;
ov::PartialShape out_shape;
cldnn::data_types data_type;
cldnn::format input_format;
data_types default_type;
@@ -34,9 +34,12 @@ struct reduce_test_params {
class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
public:
void execute(reduce_test_params& p) {
void execute(reduce_test_params& p, bool is_dynamic = false) {
auto input_prim = get_mem(get_input_layout(p));
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
network network_fused(this->engine, this->topology_fused, cfg_fused);
@@ -52,70 +55,60 @@ public:
if (axis >= static_cast<int64_t>(rank))
throw std::runtime_error("Unsupported reduce test case");
switch (axis) {
case 0: // batch
p.out_shape.batch[0] = 1;
break;
case 1: // feature
p.out_shape.feature[0] = 1;
break;
case 2:
p.out_shape.spatial[rank - 3] = 1;
break;
case 3:
p.out_shape.spatial[rank - 4] = 1;
break;
case 4:
p.out_shape.spatial[rank - 5] = 1;
break;
case 5:
p.out_shape.spatial[rank - 6] = 1;
break;
}
p.out_shape[axis] = 1;
}
}
layout get_dynamic_input_layout(reduce_test_params& p) {
return layout{ ov::PartialShape::dynamic(p.in_shape.size()), p.data_type, p.input_format };
}
layout get_input_layout(reduce_test_params& p) {
return layout{ p.data_type, p.input_format, p.in_shape };
return layout{ p.in_shape, p.data_type, p.input_format };
}
layout get_output_layout(reduce_test_params& p) {
return layout{ p.out_shape, p.data_type, p.input_format };
}
layout get_per_channel_layout(reduce_test_params& p) {
return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } };
return layout{ {1, p.in_shape[1], 1, 1}, p.default_type, p.default_format };
}
};
} // namespace
/* ----------------------------------------------------------------------------------------------------- */
/* ---------------------------------------- Reduce cases ----------------------------------------------- */
/* ----------------------------------------------------------------------------------------------------- */
#define CASE_REDUCE_F32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_1 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_3 { 16, 16, 16, 8, 8, 8 }, { 16, 16, 16, 8, 8, 8 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_1 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::f32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_3 { 16, 16, 8, 8, 8, 16 }, { 16, 16, 8, 8, 8, 16 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F32_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f16, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::f16, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_F16_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::i32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i32, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_4 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::i32, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I32_4 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::i8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_I8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 },data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_0 { 3, 7, 7, 5 }, { 3, 7, 7, 5 },data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::u8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_2 { 2, 4, 4, 4, 8 }, { 2, 4, 4, 4, 8 }, data_types::u8, format::bfzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_3 { 3, 5, 7, 7, 5, 3 }, { 3, 5, 7, 7, 5, 3 }, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx
#define CASE_REDUCE_U8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
class reduce_eltwise_activation_quantize : public ReduceFusingTest {};
@@ -272,6 +265,24 @@ TEST_P(reduce_scale_activation, per_channel) {
execute(p);
}
// Dynamic-shape variant of the reduce+scale+activation fusing test:
// the input keeps its rank but all dims are dynamic (get_dynamic_input_layout),
// and new shape inference is enabled via execute(p, true).
TEST_P(reduce_scale_activation, dynamic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_dynamic_input_layout(p)),
data("scale_data", get_mem(get_per_channel_layout(p), -0.125f)),
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
eltwise("scale", { input_info("reduce"), input_info("scale_data") }, eltwise_mode::prod),
activation("activation", input_info("scale"), activation_func::cos),
reorder("output_reorder", input_info("activation"), p.default_format, data_types::f32)
);
// Activation won't be fused because onednn doesn't support cos activation
if (engine.get_device_info().supports_immad)
p.expected_fused_primitives++;
tolerance = 1e-02f;
execute(p, true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_scale_activation, ::testing::ValuesIn(std::vector<reduce_test_params>{
reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::max, { 3, 2, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
reduce_test_params{ CASE_REDUCE_F32_1, 2, 4, reduce_mode::sum, { 3, 2, 0 }, true, "reduce_ref" },

View File

@@ -11,6 +11,8 @@
#include "data_inst.h"
#include "eltwise_inst.h"
#include "reshape_inst.h"
#include "reorder_inst.h"
#include "broadcast_inst.h"
#include "pass_manager.h"
#include "to_string_utils.h"
@@ -90,3 +92,104 @@ TEST(handle_reshape, skip_reorder_node_to_split_when_onndnn_not_support) {
ASSERT_TRUE(prog->get_node("matmul").get_dependency(0).get_output_layout().data_type == data_types::f16);
}
// Checks that when handle_reshape splits a reshape that has multiple users,
// the newly created reshape node carries the original primitive's parameters,
// so the output shape is correct on every branch after the split.
TEST(handle_reshape, correct_parameters_propagation) {
auto& engine = get_test_engine();
// Scalar constant added before the reshape and a {1, 12} constant added after it.
auto data0_layout = engine.allocate_memory({ ov::PartialShape{}, data_types::f16, format::bfyx });
auto data1_layout = engine.allocate_memory({ ov::PartialShape{1, 12}, data_types::f16, format::bfyx });
auto in_layout = layout{ ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx };
topology topology;
topology.add(input_layout("input", in_layout));
topology.add(data("data0", data0_layout));
topology.add(data("data1", data1_layout));
topology.add(eltwise("e1", input_info("input"), input_info("data0"), eltwise_mode::sum));
// The reshape has two users ("e2" and "reorder") — the multi-user case the pass handles.
topology.add(reshape("reshape", input_info("e1"), false, {2, 12}, {2, 12}));
topology.add(eltwise("e2", input_info("reshape"), input_info("data1"), eltwise_mode::sum));
topology.add(reorder("reorder", input_info("reshape"), format::bfyx, data_types::f32));
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));
auto prog = program::build_program(engine, topology, config, false, true);
layout_optimizer lo(true);
program_wrapper::apply_opt_pass<handle_reshape>(*prog);
ASSERT_NE(prog, nullptr);
ASSERT_TRUE(has_node_with_type<reshape>(*prog));
ASSERT_TRUE(prog->get_node("reshape").can_be_optimized());
auto out_shape0 = prog->get_node("e2").get_output_layout().get_partial_shape();
auto out_shape1 = prog->get_node("reorder").get_output_layout().get_partial_shape();
ov::PartialShape expected_out_shape{2, 12};
// handle_reshape may do reshape split, so ensure that output shape on all branches is correct
ASSERT_EQ(out_shape0, expected_out_shape);
ASSERT_EQ(out_shape1, expected_out_shape);
}
// Verifies a reshape whose input rank (5D, bfzyx) differs from its output rank
// (4D, bfyx) when a reorder ends up on the reshape's input: the reshape must
// still be optimized out and the network must produce correct results.
TEST(handle_reshape, reshape_input_reorder) {
auto& engine = get_test_engine();
auto shape_memory = engine.allocate_memory({ ov::PartialShape{5}, data_types::i32, format::bfyx });
// Dynamic 5D input; at runtime it is broadcast to the concrete {1, 2, 16, 64, 64}.
auto in0_layout = layout{ ov::PartialShape{1, -1, 16, 64, 64}, data_types::f16, format::bfzyx };
auto in0_memory = engine.allocate_memory(layout{ ov::PartialShape{1, 2, 16, 64, 64}, data_types::f16, format::bfzyx });
auto in1_layout = layout{ ov::PartialShape{-1, 16, 64, 64}, data_types::f16, format::bfyx };
auto in1_memory = engine.allocate_memory({ ov::PartialShape{2, 16, 64, 64}, data_types::f16, format::bfyx });
auto in0 = generate_random_1d<FLOAT16>(in0_memory->count(), -10, 10);
auto in1 = generate_random_1d<FLOAT16>(in1_memory->count(), -10, 10);
set_values<FLOAT16>(in0_memory, in0);
// Broadcast target shape equals in0's concrete shape, so broadcast is a no-op on the data.
set_values<int32_t>(shape_memory, {1, 2, 16, 64, 64});
set_values<FLOAT16>(in1_memory, in1);
topology topology;
topology.add(input_layout("input0", in0_layout));
topology.add(input_layout("target_shape", shape_memory->get_layout()));
topology.add(broadcast("broadcast", input_info("input0"), input_info("target_shape"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
// special_zero=true reshape from 5D down to 4D {-1, 16, 64, 64}.
topology.add(reshape("reshape", input_info("broadcast"), true, {-1, 16, 64, 64}, {-1, 16, 64, 64}));
topology.add(input_layout("input1", in1_layout));
topology.add(eltwise("eltw", input_info("reshape"), input_info("input1"), eltwise_mode::sum));
topology.add(reorder("reorder", input_info("eltw"), format::bfyx, data_types::f32));
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));
auto prog = program::build_program(engine, topology, config);
ASSERT_NE(prog, nullptr);
ASSERT_TRUE(has_node_with_type<reshape>(*prog));
ASSERT_TRUE(prog->get_node("reshape").can_be_optimized());
auto reshape_layout_in = prog->get_node("reshape").get_input_layouts()[0];
auto reshape_layout_out = prog->get_node("reshape").get_output_layout();
// At this moment transformations insert reorder before reshape which
// converts tensor to default format with rank = reshape_out_rank
// Likely in the future we'll update that reorder so it will use reshape_input_rank
// After that expected in format will be bfzyx
ASSERT_EQ(reshape_layout_in.format, format::bfyx);
ASSERT_EQ(reshape_layout_out.format, format::bfyx);
ov::PartialShape expected_out_shape{-1, 16, 64, 64};
ASSERT_EQ(reshape_layout_out.get_partial_shape(), expected_out_shape);
network net(prog);
net.set_input_data("input0", in0_memory);
net.set_input_data("input1", in1_memory);
net.set_input_data("target_shape", shape_memory);
auto output = net.execute();
auto out_mem = output.at("reorder").get_memory();
mem_lock<float> lock(out_mem, get_test_stream());
// Reference result: plain elementwise sum of the two random inputs.
for (size_t i = 0; i < out_mem->count(); i++) {
float expected = static_cast<float>(in0[i]) + static_cast<float>(in1[i]);
float actual = lock[i];
ASSERT_EQ(expected, actual) << " i = " << i;
}
}