[GPU] Fix OneDNN primitive attr serialization logic (#16654)

* fix onednn primitive attr serialization logic

* added an onednn fc fusing serialization test

* added gemm fusing serialization tests
This commit is contained in:
Eddy Kim
2023-04-04 10:24:40 +09:00
committed by GitHub
parent 4f7f7c31ee
commit 90615cf26a
7 changed files with 100 additions and 28 deletions

View File

@@ -120,6 +120,8 @@ struct fused_primitive_desc_onednn {
size_t mem_dep; // memory dependency for working with fused node
dnnl::memory::format_tag tag;
bool flatten;
dnnl::memory::dims dims;
dnnl::memory::data_type dt;
};
#endif // ENABLE_ONEDNN_FOR_GPU
} // namespace cldnn

View File

@@ -249,11 +249,16 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
dnnl::algorithm aalgorithm = dnnl::algorithm::undef;
ib >> make_data(&aalgorithm, sizeof(dnnl::algorithm));
dnnl::memory::desc md = onednn::layout_to_memory_desc(
impl_params->get_input_layout(fused_desc.at(idx).mem_dep),
fused_desc.at(idx).tag, fused_desc.at(idx).flatten);
if (fused_desc.at(idx).dims.size() > 0) {
_post_ops.append_binary(aalgorithm,
dnnl::memory::desc(fused_desc.at(idx).dims, fused_desc.at(idx).dt, fused_desc.at(idx).tag));
} else {
dnnl::memory::desc md = onednn::layout_to_memory_desc(
impl_params->get_input_layout(fused_desc.at(idx).mem_dep),
fused_desc.at(idx).tag, fused_desc.at(idx).flatten);
_post_ops.append_binary(aalgorithm, md);
_post_ops.append_binary(aalgorithm, md);
}
} else if (_kind == dnnl::primitive::kind::prelu) {
int mask;
ib >> mask;

View File

@@ -118,7 +118,13 @@ void kernel_impl_params::save(BinaryOutputBuffer& ob) const {
size_t num_fused_prims = fused_desc_onednn.size();
ob << num_fused_prims;
for (auto fused_prim : fused_desc_onednn) {
ob << make_data(&fused_prim, sizeof(fused_primitive_desc_onednn));
ob << make_data(&fused_prim.op_type, sizeof(onednn_post_op_type));
ob << fused_prim.mem_offset;
ob << fused_prim.mem_dep;
ob << make_data(&fused_prim.tag, sizeof(dnnl::memory::format_tag));
ob << fused_prim.flatten;
ob << fused_prim.dims;
ob << make_data(&fused_prim.dt, sizeof(dnnl::memory::data_type));
}
#endif // ENABLE_ONEDNN_FOR_GPU
ob << primary_input_idx;
@@ -187,7 +193,13 @@ void kernel_impl_params::load(BinaryInputBuffer& ib) {
ib >> num_fused_prims;
fused_desc_onednn.resize(num_fused_prims);
for (size_t idx = 0; idx < num_fused_prims; ++idx) {
ib >> make_data(&fused_desc_onednn[idx], sizeof(fused_primitive_desc_onednn));
ib >> make_data(&fused_desc_onednn[idx].op_type, sizeof(onednn_post_op_type));
ib >> fused_desc_onednn[idx].mem_offset;
ib >> fused_desc_onednn[idx].mem_dep;
ib >> make_data(&fused_desc_onednn[idx].tag, sizeof(dnnl::memory::format_tag));
ib >> fused_desc_onednn[idx].flatten;
ib >> fused_desc_onednn[idx].dims;
ib >> make_data(&fused_desc_onednn[idx].dt, sizeof(dnnl::memory::data_type));
}
#endif // ENABLE_ONEDNN_FOR_GPU
ib >> primary_input_idx;

View File

@@ -1108,7 +1108,7 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i),
get_stream(),
"program" + std::to_string(get_program()->get_id()) +
"program" + std::to_string((get_program() != nullptr) ? get_program()->get_id() : 1) +
"_network" + std::to_string(get_id()) +
"_" + layer_name + "_src" + std::to_string(i),
debug_config->dump_layers_raw);
@@ -1125,7 +1125,7 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) {
log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(i),
get_stream(),
"program" + std::to_string(get_program()->get_id()) +
"program" + std::to_string((get_program() != nullptr) ? get_program()->get_id() : 1) +
"_network" + std::to_string(get_id()) +
"_" + layer_name + "_dst" + std::to_string(i),
debug_config->dump_layers_raw);

View File

@@ -918,8 +918,10 @@ void program_node::init_onednn_primitive_attributes() {
// Add information about post-operation into the list, update indices
auto update_onednn_post_op_list = [&](onednn_post_op_type type, size_t m_dep,
dnnl::memory::format_tag tag = dnnl::memory::format_tag::undef,
bool flatten = false) {
fused_primitive_desc_onednn cur_op_desc = { type, memory_offset, m_dep, tag, flatten };
bool flatten = false,
dnnl::memory::dims dims = {},
dnnl::memory::data_type dt = dnnl::memory::data_type::undef) {
fused_primitive_desc_onednn cur_op_desc = { type, memory_offset, m_dep, tag, flatten, dims, dt };
fused_ops.push_back(cur_op_desc);
auto has_memory_buffers = type == onednn_post_op_type::binary_add ||
@@ -984,14 +986,14 @@ void program_node::init_onednn_primitive_attributes() {
cldnn::onednn::combine_bf_with_first_spatial_dim(in);
}
post_ops.append_binary(alg, onednn::layout_to_memory_desc(in, dnnl::memory::format_tag::ab));
update_onednn_post_op_list(op_type, dep_idx);
update_onednn_post_op_list(op_type, dep_idx, dnnl::memory::format_tag::ab);
} else if (is_type<gemm>()) {
size_t rank = cldnn::format::dimension(in.format);
dnnl::memory::dims dims = onednn::convert_gemm_tensor(in.get_tensor(), rank, in.batch() == 1);
dnnl::memory::data_type dt = onednn::convert_data_type(in.data_type);
dnnl::memory::format_tag fmt = onednn::convert_gemm_data_format(dims);
post_ops.append_binary(alg, dnnl::memory::desc(dims, dt, fmt));
update_onednn_post_op_list(op_type, dep_idx);
update_onednn_post_op_list(op_type, dep_idx, fmt, false, dims, dt);
} else {
post_ops.append_binary(alg, onednn::layout_to_memory_desc(in));
update_onednn_post_op_list(op_type, dep_idx);

View File

@@ -78,7 +78,7 @@ public:
#ifdef ENABLE_ONEDNN_FOR_GPU
class FullyConnectedFusingTestOneDNN : public BaseFusingTest<fully_connected_test_params> {
public:
void execute(fully_connected_test_params& p) {
void execute(fully_connected_test_params& p, bool is_caching_test = false) {
// Onednn post operation has issue in a machine that does not support imad.
if (!engine.get_device_info().supports_immad)
return;
@@ -103,12 +103,12 @@ public:
ov::intel_gpu::ImplementationDesc fc_ocl_impl = { ocl_forcing_format, p.ocl_kernel_name /*fully_connected_gpu_bfyx_ref*/};
cfg_not_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "fc_prim", fc_ocl_impl } }));
}
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
network network_fused(this->engine, this->topology_fused, cfg_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(), is_caching_test);
network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(), is_caching_test);
network_fused->set_input_data("input", input_prim);
network_not_fused->set_input_data("input", input_prim);
compare(network_not_fused, network_fused, p);
compare(*network_not_fused, *network_fused, p);
}
layout get_input_layout(fully_connected_test_params& p) {
@@ -440,6 +440,25 @@ TEST_P(fc_int8_inputs_fused_fp32_sum, basic) {
execute(p);
}
// Builds an fc_prim -> eltwise(sum) -> crop -> reorder topology and runs it through
// execute() with the second argument set to true.
// NOTE(review): the `true` is presumably the fixture's `is_caching_test` flag, i.e. the
// networks are expected to go through the serialization/caching path - confirm against the
// fixture's execute() signature.
TEST_P(fc_int8_inputs_fused_fp32_sum, basic_cached) {
auto p = GetParam();
// One-dimensional layout whose only dimension is p.weights_shape[0]; used for the eltwise addend.
auto shift_layout = layout{ ov::PartialShape{p.weights_shape[0]}, p.default_type, p.default_format };
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
// NOTE(review): second get_mem argument is presumably the fill value/range - confirm.
data("shift_data", get_mem(shift_layout, 1)),
fully_connected("fc_prim", input_info("input"), "weights", "bias", cldnn::data_types::f32, padding(), get_output_dim_size(p), get_input_weights_rank(p)),
// The eltwise sum consumes fc_prim's output and the constant shift_data, both declared above.
eltwise("shift", { input_info("fc_prim"), input_info("shift_data") }, eltwise_mode::sum, cldnn::data_types::f32),
crop("crop", input_info("shift"), get_output_layout(p).get_tensor(), { 0, 0, 0, 0 }),
reorder("reorder_bfyx", input_info("crop"), p.default_format, data_types::f32)
);
// Fixed tolerance of 1.0 is set before running the comparison.
tolerance = 1.f;
execute(p, true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_inputs_fused_fp32_sum, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
// OneDNN has issue with small shapes - ticket 7064
// fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 4 },

View File

@@ -40,7 +40,7 @@ struct gemm_test_params {
class GemmFusingTest : public ::BaseFusingTest<gemm_test_params> {
public:
void execute(gemm_test_params& p) {
void execute(gemm_test_params& p, bool is_caching_test = false) {
auto input0_prim = get_mem(get_input_layout(p, 0));
auto input1_prim = get_mem(get_input_layout(p, 1));
@@ -51,19 +51,19 @@ public:
cfg_not_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm_prim", gemm_ref_impl} }));
}
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
network network_fused(this->engine, this->topology_fused, cfg_fused);
network_fused.set_input_data("input0", input0_prim);
network_not_fused.set_input_data("input0", input0_prim);
network_fused.set_input_data("input1", input1_prim);
network_not_fused.set_input_data("input1", input1_prim);
network::ptr network_not_fused = get_network(this->engine, this->topology_non_fused, cfg_not_fused, get_test_stream_ptr(), is_caching_test);
network::ptr network_fused = get_network(this->engine, this->topology_fused, cfg_fused, get_test_stream_ptr(), is_caching_test);
network_fused->set_input_data("input0", input0_prim);
network_not_fused->set_input_data("input0", input0_prim);
network_fused->set_input_data("input1", input1_prim);
network_not_fused->set_input_data("input1", input1_prim);
if (p.in_shapes.size() > 2) {
auto input2_prim = get_mem(get_input_layout(p, 2));
network_fused.set_input_data("input2", input2_prim);
network_not_fused.set_input_data("input2", input2_prim);
network_fused->set_input_data("input2", input2_prim);
network_not_fused->set_input_data("input2", input2_prim);
}
compare(network_not_fused, network_fused, p);
compare(*network_not_fused, *network_fused, p);
}
layout get_input_layout(gemm_test_params& p, int in_no) {
@@ -317,6 +317,38 @@ TEST_P(gemm_2in_add, eltwise_postop) {
execute(p);
}
// Builds a gemm_prim -> eltwise -> reorder topology with a constant second eltwise input
// and runs it through execute() with the second argument set to true.
// NOTE(review): the `true` is presumably the fixture's `is_caching_test` flag, i.e. the
// networks are expected to go through the serialization/caching path - confirm against the
// fixture's execute() signature.
TEST_P(gemm_2in_add, eltwise_postop_cached) {
auto p = GetParam();
// Only when the device reports immad support: force the onednn implementation type for
// "gemm_prim" in the fused configuration.
if (engine.get_device_info().supports_immad) {
ov::intel_gpu::ImplementationDesc gemmv_impl = { cldnn::format::type::any, "", impl_types::onednn };
cfg_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "gemm_prim", gemmv_impl } }));
}
// The addend layout starts as a copy of the output layout; then either its batch or its
// feature extent is set to 1, selected by p.broadcast_kind.
auto add_data_layout = get_output_layout(p);
auto add_data_size = add_data_layout.get_tensor();
if (p.broadcast_kind == dim_vec_kind::batch)
add_data_size.batch[0] = 1;
else
add_data_size.feature[0] = 1;
add_data_layout.set_tensor(add_data_size);
auto in_layout0 = get_input_layout(p, 0);
auto in_layout1 = get_input_layout(p, 1);
create_topologies(
input_layout("input0", in_layout0),
input_layout("input1", in_layout1),
// NOTE(review): second get_mem argument is presumably the fill value/range - confirm.
data("add_data", get_mem(add_data_layout, 1.0f/p.kernel.count())),
// The ranks of both input layouts are passed as the trailing gemm arguments.
gemm("gemm_prim", { input_info("input0"), input_info("input1") }, data_types::f32, false, false, 1.f, 0.f, in_layout0.get_rank(), in_layout1.get_rank()),
// The eltwise mode comes from the test parameters (p.eltwise_m).
eltwise("add_prim", { input_info("gemm_prim"), input_info("add_data") }, p.eltwise_m, p.default_type),
reorder("reorder_bfyx", input_info("add_prim"), p.default_format, data_types::f32)
);
// Tolerance is derived from the parameter set's default data type.
tolerance = default_tolerance(p.default_type);
execute(p, true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_add, ::testing::ValuesIn(std::vector<gemm_test_params>{
gemm_test_params{ CASE_GEMM_2IN_FP16_5, 3, 4, "", dim_vec_kind::batch, eltwise_mode::sum },
gemm_test_params{ CASE_GEMM_2IN_FP16_5, 3, 4, "", dim_vec_kind::batch, eltwise_mode::prod },