[GPU] Fix failed onednn tests (#16410)

* Fix failing unit tests on dGPU

+ Modified fully_connected_random_test_i8_3d so that its test dimensions are not ambiguous
+ oneDNN does NOT support the i64 type for reorder; added an exception for it
+ Fixed a bug in prepare_primitive_fusing's exception handling for unsupported activation functions (see the sketch after this list)
+ Added exception logic for dynamic shapes so that is_node_for_onednn selects the ocl impl type
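
Below is a minimal, self-contained sketch of the fallback pattern these fixes share. The enums and the convert_activation_func/choose_impl stand-ins are hypothetical, not the actual cldnn/oneDNN API: the idea is that a node is routed to the OCL (clDNN) implementation whenever oneDNN cannot handle its data type or its activation function.

#include <iostream>
#include <stdexcept>

// Hypothetical stand-ins for the real cldnn/oneDNN types -- illustration only.
enum class data_types { f16, f32, i8, i64 };
enum class impl_types { onednn, ocl };
enum class activation_func { relu, swish, softsign };

// Stand-in for onednn::convert_activation_func, which throws when oneDNN
// has no matching algorithm for the activation (softsign here, for example).
void convert_activation_func(activation_func f) {
    if (f == activation_func::softsign)
        throw std::runtime_error("unsupported activation");
}

// The shared guard pattern: fall back to the OCL impl on any oneDNN limitation.
impl_types choose_impl(data_types in_dt, data_types out_dt, activation_func f) {
    // oneDNN does not support i64 for reorder, so reject it up front.
    if (in_dt == data_types::i64 || out_dt == data_types::i64)
        return impl_types::ocl;
    // Probe activation support the way the fusing pass does: attempt the
    // conversion and treat an exception as "not supported".
    try {
        convert_activation_func(f);
    } catch (...) {
        return impl_types::ocl;
    }
    return impl_types::onednn;
}

int main() {
    std::cout << (choose_impl(data_types::f32, data_types::f32, activation_func::relu) == impl_types::onednn)   // 1: oneDNN path
              << (choose_impl(data_types::i64, data_types::f32, activation_func::relu) == impl_types::ocl)      // 1: i64 forces OCL
              << (choose_impl(data_types::f32, data_types::f32, activation_func::softsign) == impl_types::ocl)  // 1: activation forces OCL
              << '\n';  // prints "111"
}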

Signed-off-by: Min, Byungil <byungil.min@intel.com>
Min, Byungil 2023-03-29 15:50:09 +09:00 committed by GitHub
parent 966c47e7cd
commit ea6e3481cd
7 changed files with 130 additions and 23 deletions


@@ -704,14 +704,17 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
if (_lo.get_optimization_attributes().use_onednn_impls) {
if (input.is_type<reshape>() || input.is_type<concatenation>())
return;
#ifdef ENABLE_ONEDNN_FOR_GPU
// Activation should not fused if it isn't supported in onednn
try {
onednn::convert_activation_func(activation_node.get_primitive()->activation_function);
} catch (...) {
return;
// Activation should not be fused if oneDNN does NOT support it
if (_lo.is_primitive_implemented_for_onednn(input)) {
#ifdef ENABLE_ONEDNN_FOR_GPU
try {
onednn::convert_activation_func(activation_node.get_primitive()->activation_function);
} catch (...) {
return;
}
#endif
}
#endif
}
bool should_fuse = input.is_type<binary_convolution>();
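
Note that the ENABLE_ONEDNN_FOR_GPU block is now nested inside the new is_primitive_implemented_for_onednn() check, so the unsupported-activation probe (convert_activation_func() throwing) only blocks fusion when the input primitive actually has a oneDNN implementation.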


@@ -189,6 +189,7 @@ public:
impl_types get_forced_impl_type_by_config(program_node& node);
static bool are_data_types_suitable_for_onednn(program_node& node);
bool are_layouts_suitable_for_onednn(program_node& node);
bool is_primitive_implemented_for_onednn(program_node& node);
bool is_format_supported(program_node& node, format::type fmt);
// Returns whether reorder between "prev" with format fmt_prev and "next" with format fmt_next


@@ -825,7 +825,8 @@ static bool is_node_for_onednn(deconvolution_node const& node) {
static bool is_node_for_onednn(fully_connected_node const& node) {
auto fc_prim = node.get_primitive();
auto ps = node.get_output_layout().get_partial_shape();
auto output_layout = node.get_output_layout();
auto ps = output_layout.get_partial_shape();
size_t non_spatial_count = 2 + (fc_prim->input_size == 3 ? 1 : 0);
size_t rank = ps.size();
@@ -1178,6 +1179,9 @@ bool layout_optimizer::are_data_types_suitable_for_onednn(program_node& node) {
if (in_dt == data_types::f32 && (!node.is_type<fully_connected>() && !node.is_type<convolution>()))
return false;
if (in_dt == data_types::i64 || out_dt == data_types::i64)
return false;
if (node.is_type<pooling>()) {
if (!data_type_traits::is_floating_point(in_dt) && in_dt != out_dt)
return false;
@@ -1259,6 +1263,16 @@ bool layout_optimizer::are_layouts_suitable_for_onednn(program_node& node) {
return true;
}
bool layout_optimizer::is_primitive_implemented_for_onednn(program_node& node) {
if (node.is_type<fully_connected>() || node.is_type<gemm>() || node.is_type<pooling>() ||
node.is_type<convolution>() || node.is_type<deconvolution>() ||
node.is_type<reduce>() || node.is_type<reorder>() || node.is_type<concatenation>()) {
return true;
}
return false;
}
impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node) {
#ifdef GPU_DEBUG_CONFIG
GPU_DEBUG_GET_INSTANCE(debug_config);
@@ -1419,6 +1433,10 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
preferred_impl = impl_types::ocl;
}
if (!are_data_types_suitable_for_onednn(node)) {
preferred_impl = impl_types::ocl;
}
// For mixed precision case, onednn is slower than cldnn
if (input_fmt == format::b_fs_yx_fsv16 && data_type_traits::is_i8_u8(input_dt))
preferred_impl = impl_types::ocl;
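
With the added check, get_preferred_impl_type() now falls back to impl_types::ocl whenever are_data_types_suitable_for_onednn() rejects the node, which together with the new i64 rule above keeps i64 reorders off oneDNN.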


@@ -172,7 +172,7 @@ TEST_P(activation_eltwise_activation_quantize_u8, basic) {
data("out_low", get_mem(get_single_element_layout(p), -127)),
data("out_high", get_mem(get_single_element_layout(p), 127)),
eltwise("eltwise", { input_info("act"), input_info("eltwise_data") }, eltwise_mode::prod, p.default_type),
activation("act2", input_info("eltwise"), activation_func::softsign),
activation("act2", input_info("eltwise"), activation_func::swish),
quantize("quant", input_info("act2"), input_info("in_low"), input_info("in_high"),
input_info("out_low"), input_info("out_high"), 256, data_types::u8),
reorder("reorder_bfyx", input_info("quant"), p.default_format, data_types::f32)
@@ -193,7 +193,7 @@ TEST_P(activation_eltwise_activation_quantize_u8, per_channel) {
data("out_low", get_mem(get_single_element_layout(p), -127)),
data("out_high", get_mem(get_single_element_layout(p), 127)),
eltwise("eltwise", { input_info("act"), input_info("eltwise_data") }, eltwise_mode::prod, p.default_type),
activation("act2", input_info("eltwise"), activation_func::softsign),
activation("act2", input_info("eltwise"), activation_func::pow),
quantize("quant", input_info("act2"), input_info("in_low"), input_info("in_high"),
input_info("out_low"), input_info("out_high"), 256, data_types::u8),
reorder("reorder_bfyx", input_info("quant"), p.default_format, data_types::f32)
@@ -223,6 +223,42 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_eltwise_activation_quantize_u8,
activation_test_params{ CASE_ACTIVATION_3D_F32_2, 3, 5, "activation_ref" },
}));
class activation_eltwise_activation_quantize_u8_onendnn : public ActivationFusingTest {};
TEST_P(activation_eltwise_activation_quantize_u8_onendnn, same_behavior) {
// Case : activation function is NOT supported on oneDNN and an input primitive selects clDNN execution
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
activation("act", input_info("input"), activation_func::relu),
data("eltwise_data", get_mem(get_single_element_layout(p), 1.0f / 255)),
data("in_low", get_mem(get_single_element_layout(p), 0)),
data("in_high", get_mem(get_single_element_layout(p), 1, max_random)),
data("out_low", get_mem(get_single_element_layout(p), -127)),
data("out_high", get_mem(get_single_element_layout(p), 127)),
eltwise("eltwise", { input_info("act"), input_info("eltwise_data") }, eltwise_mode::prod, p.default_type),
activation("act2", input_info("eltwise"), activation_func::softsign),
quantize("quant", input_info("act2"), input_info("in_low"), input_info("in_high"),
input_info("out_low"), input_info("out_high"), 256, data_types::u8),
reorder("reorder_bfyx", input_info("quant"), p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_eltwise_activation_quantize_u8_onendnn, ::testing::ValuesIn(std::vector<activation_test_params>{
// InputDataType = FP32
activation_test_params{ CASE_ACTIVATION_F32_0, 3, 5, "activation_opt" },
activation_test_params{ CASE_ACTIVATION_F32_1, 3, 5, "activation_opt" },
activation_test_params{ CASE_ACTIVATION_3D_F32_0, 3, 5, "activation_opt" },
activation_test_params{ CASE_ACTIVATION_3D_F32_1, 3, 5, "activation_opt" },
activation_test_params{ CASE_ACTIVATION_F32_0, 3, 5, "activation_ref" },
activation_test_params{ CASE_ACTIVATION_F32_1, 3, 5, "activation_ref" },
activation_test_params{ CASE_ACTIVATION_3D_F32_0, 3, 5, "activation_ref" },
activation_test_params{ CASE_ACTIVATION_3D_F32_1, 3, 5, "activation_ref" },
}));
INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, activation_eltwise_activation_quantize_u8, ::testing::ValuesIn(std::vector<activation_test_params>{
activation_test_params{ CASE_ACTIVATION_3D_F32_5, 3, 5, "activation_ref" }, // FIXME - accuracy bug
}));
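
In the updated fusion tests, softsign (which oneDNN cannot fuse) is replaced with swish and pow so the oneDNN path is still exercised, while the new *_onendnn suite deliberately keeps softsign to verify that an unsupported activation falls back to clDNN execution with the same results.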


@@ -306,9 +306,6 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_eltwise, ::testing::ValuesIn(std::
fully_connected_test_params{ CASE_FC_U8S8_1, 2, 3 },
fully_connected_test_params{ CASE_FC_U8S8_2, 2, 3 },
fully_connected_test_params{ CASE_FC_U8S8_3, 2, 3 },
fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 3 },
fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 3 },
fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 3 },
}));
class fc_int8_quantize_u8 : public FullyConnectedFusingTest {};


@@ -118,7 +118,7 @@ TEST_P(lrn_fp32_quantize_u8_eltwise_activation, basic) {
quantize("quantize", input_info("lrn_norm"), input_info("in_lo"), input_info("in_hi"),
input_info("out_lo"), input_info("out_hi"), 256, data_types::u8),
eltwise("eltwise", { input_info("quantize"), input_info("eltwise_data") }, eltwise_mode::prod),
activation("activation", input_info("eltwise"), activation_func::floor),
activation("activation", input_info("eltwise"), activation_func::relu),
reorder("reorder", input_info("activation"), p.default_format, data_types::f32)
);
@@ -176,6 +176,47 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp32_quantize_u8_eltwise_activation, :
lrn_test_params{ CASE_LRN_FP32_TO_FP16_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" },
}));
class lrn_fp32_quantize_u8_eltwise_activation_onednn : public LrnFusingTest {};
TEST_P(lrn_fp32_quantize_u8_eltwise_activation_onednn, same_behavior) {
// Case : activation function is NOT supported on oneDNN and an input primitive selects clDNN execution
auto p = GetParam();
uint32_t size = 5;
float k = 1.0f;
float alpha = (float)9.9e-05;
float beta = 0.75;
create_topologies(
input_layout("input", get_input_layout(p)),
data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)),
data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 255)),
data("eltwise_data", get_mem(get_single_element_layout(p), 1.0f / 255)),
lrn("lrn_norm", input_info("input"), size, k, alpha, beta, p.lrn_type),
quantize("quantize", input_info("lrn_norm"), input_info("in_lo"), input_info("in_hi"),
input_info("out_lo"), input_info("out_hi"), 256, data_types::u8),
eltwise("eltwise", { input_info("quantize"), input_info("eltwise_data") }, eltwise_mode::prod),
activation("activation", input_info("eltwise"), activation_func::floor),
reorder("reorder", input_info("activation"), p.default_format, data_types::f32)
);
tolerance = default_tolerance(data_types::u8);
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp32_quantize_u8_eltwise_activation_onednn, ::testing::ValuesIn(std::vector<lrn_test_params>{
// InputDataType = FP32 OutputDataType = FP32
lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" },
lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" },
lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel" },
// InputDataType = FP32 OutputDataType = FP16
lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" },
lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" },
lrn_test_params{ CASE_LRN_FP32_TO_FP16_3, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_yxfb_b8_opt" },
lrn_test_params{ CASE_LRN_FP32_TO_FP16_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" },
}));
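
The LRN tests follow the same pattern: floor, which oneDNN does not support as a fused activation, is replaced with relu in the existing test, and the new *_onednn suite keeps floor to pin down the clDNN fallback behavior.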
class lrn_fp32_quantize_i8_eltwise_activation : public LrnFusingTest {};
TEST_P(lrn_fp32_quantize_i8_eltwise_activation, basic) {
auto p = GetParam();


@@ -1140,6 +1140,7 @@ using fully_connected_random_test_f32_3d = fully_connected_random_test_3d<float,
using fully_connected_random_test_f16_3d = fully_connected_random_test_3d<FLOAT16, FLOAT16, FLOAT16, FLOAT16>;
using fully_connected_random_test_i8_3d = fully_connected_random_test_3d<int8_t, int8_t, int8_t, float>;
TEST_P(fully_connected_random_test_f32_3d, basic) {
run_test();
}
@@ -1149,9 +1150,9 @@
fully_connected_random_test_f32_3d,
::testing::Combine(
::testing::Values(1, 3),
::testing::Values(shared_dims{1, 1, 1},
::testing::Values(shared_dims{1, 1, 2},
shared_dims{1, 1, 3},
shared_dims{3, 1, 1},
shared_dims{3, 1, 2},
shared_dims{3, 1, 3}),
::testing::Values(1, 3, 16),
::testing::Values(format::bfyx),
@@ -1201,9 +1202,9 @@
fully_connected_random_test_f16_3d,
::testing::Combine(
::testing::Values(1, 3),
::testing::Values(shared_dims{1, 1, 1},
::testing::Values(shared_dims{1, 1, 2},
shared_dims{1, 1, 16},
shared_dims{3, 1, 1},
shared_dims{3, 1, 2},
shared_dims{3, 1, 16}),
::testing::Values(1, 3, 16),
::testing::Values(format::bfyx),
@@ -1221,9 +1222,9 @@
fully_connected_random_test_i8_3d,
::testing::Combine(
::testing::Values(1, 3),
::testing::Values(shared_dims{1, 1, 1},
::testing::Values(shared_dims{1, 1, 2},
shared_dims{1, 1, 16},
shared_dims{3, 1, 1},
shared_dims{3, 1, 2},
shared_dims{3, 1, 16}),
::testing::Values(1, 3, 16),
::testing::Values(format::bfyx),
@@ -2169,9 +2170,19 @@ struct dynamic_fully_connected_gpu : ::testing::TestWithParam<fully_connected_dy
input_data_vec,
weights_data_vec,
bias_data_vec);
for (int b = 0; b < batch_size; b++) {
for (int ofm = 0; ofm < output_f; ofm++) {
ASSERT_EQ(ref_result[b * output_f + ofm], output_ptr[b * output_f + ofm]);
if (engine.get_device_info().supports_immad) {
for (int b = 0; b < batch_size; b++) {
for (int ofm = 0; ofm < output_f; ofm++) {
EXPECT_NEAR(ref_result[b * output_f + ofm], output_ptr[b * output_f + ofm],
default_tolerance(input_dt));
}
}
} else {
for (int b = 0; b < batch_size; b++) {
for (int ofm = 0; ofm < output_f; ofm++) {
ASSERT_EQ(ref_result[b * output_f + ofm], output_ptr[b * output_f + ofm]);
}
}
}
}
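
On devices with immad support, where the fully connected primitive may run through oneDNN, the test now compares against the reference with EXPECT_NEAR and a data-type-based tolerance instead of exact equality, since the oneDNN kernel is not guaranteed to be bit-identical to the reference computation.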