From 00b7f58152ce6458bc35a28984e54132bf641037 Mon Sep 17 00:00:00 2001
From: Kelvin Choi
Date: Thu, 27 Jan 2022 13:08:35 +0900
Subject: [PATCH] [GPU] enable validate after adding reorder (#9594)

---
 .../graph_optimizer/add_required_reorders.cpp |  1 +
 .../activation/activation_kernel_ref.cpp      |  9 ++
 .../activation/activation_kernel_ref.h        |  5 +
 .../tests/fusions/convolution_fusion_test.cpp | 94 ++++++++++++-------
 4 files changed, 75 insertions(+), 34 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp
index df7a9fbea3a..b88ffc92fe6 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp
@@ -30,6 +30,7 @@ void add_required_reorders::add_reorder(program& p, program_node* node, program_node* usr) {
 
     auto new_reorder = std::make_shared<reorder>(node->id() + "_reorder_" + usr->id(), node->id(), reorder_layout);
     auto& new_reorder_node = p.get_or_create(new_reorder);
+    new_reorder_node.set_output_layout(reorder_layout, false);
 
     // ToDo: add a method to program class which adds an intermediate node given a node and its user
     auto it = std::find(usr->get_dependencies().begin(), usr->get_dependencies().end(), node);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp
index 34da9a3a4cf..eb49ae0db5c 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp
@@ -56,4 +56,13 @@ KernelsData ActivationKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
 KernelsPriority ActivationKernelRef::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
     return DONT_USE_IF_HAVE_SOMETHING_ELSE;
 }
+
+bool ActivationKernelRef::Validate(const Params& p, const optional_params& o) const {
+    if (!Parent::Validate(p, o)) return false;
+    const auto& params = static_cast<const activation_params&>(p);
+    if (params.inputs[0].GetDims().size() != params.output.GetDims().size())
+        return false;
+
+    return true;
+}
 }  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h
index d7696784f6d..1f6c3075c19 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h
@@ -11,6 +11,9 @@ namespace kernel_selector {
 
 class ActivationKernelRef : public ActivationKernelBase {
 public:
+    using Parent = ActivationKernelBase;
+    using Parent::Parent;
+
     ActivationKernelRef() : ActivationKernelBase("activation_ref") {}
     virtual ~ActivationKernelRef() {}
 
@@ -23,5 +26,7 @@ public:
             FusedOpType::SCALE,
             FusedOpType::ACTIVATION };
     }
+
+    bool Validate(const Params& p, const optional_params& o) const override;
 };
 }  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp
index 0e29fc74354..e0114e8f27c 100644
--- a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp
+++ b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp
@@ -113,6 +113,29 @@ public:
     }
 };
 
+class ConvReorderFusingTest : public BaseFusingTest<convolution_test_params> {
+public:
+    void execute(convolution_test_params& p) {
+        auto input_prim = get_mem(get_input_layout(p));
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, bo_fused);
+        network_fused.set_input_data("input", input_prim);
+        network_not_fused.set_input_data("input", input_prim);
+
+        compare(network_not_fused, network_fused, p, true);
+    }
+
+    layout get_input_layout(convolution_test_params& p) {
+        auto pad = p.pad;
+        std::vector<int> pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] };
+        return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } };
+    }
+
+    layout get_per_channel_layout(convolution_test_params& p) {
+        return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } };
+    }
+};
+
 class ConvEltwTest : public ::BaseFusingTest<conv_eltw_test_params> {
 public:
@@ -2543,10 +2566,14 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp16_scale, ::testing::ValuesIn(std::vector<bc_force_kernel_params>{
     bc_force_kernel_params{ CASE_CONV_FP16_13, 2, 3, "convolution_gpu_fs_byx_fsv32" },
 }));
 
-// reorder(bfyx to fs_b_yx_fsv32) + conv
+
+/* ----------------------------------------------------------------------------------------------------- */
+/* ---------------------- reorder(bfyx to fs_b_yx_fsv32) + convolution kernel cases -------------------- */
+/* ----------------------------------------------------------------------------------------------------- */
 
 #define FSV32_CASE_CONV_FP32_1 { 1, 32, 4, 5 }, { 1, 32, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx
 
-class conv_fp32_reorder_bfyx_to_fsv32_conv_basic : public ConvFusingTest {};
+// 'reorder_fsv32' is removed by "remove_redundant_reorders" in the current implementation
+class conv_fp32_reorder_bfyx_to_fsv32_conv_basic : public ConvReorderFusingTest {};
 TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_basic, basic) {
     auto p = GetParam();
@@ -2554,21 +2581,22 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_basic, basic) {
         input_layout("input", get_input_layout(p)),
         data("weights", get_mem(get_weights_layout(p), -127, 127)),
         reorder("reorder_fsv32", "input", format::fs_b_yx_fsv32, data_types::f32),
-        convolution("conv_output", "reorder_fsv32", { "weights" }, 1, tensor{ 0, 0, 1, 1 }, p.pad, p.dilation),
-        activation("activation", "conv_output", activation_func::abs)
+        convolution("conv_prim", "reorder_fsv32", { "weights" }, 1, tensor{ 0, 0, 1, 1 }, p.pad, p.dilation),
+        activation("activation", "conv_prim", activation_func::abs),
+        reorder("reorder_out", "activation", format::bfyx, data_types::f32)
     );
 
     implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" };
-    bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } }));
+    bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } }));
 
     execute(p);
 }
 
 INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_basic, ::testing::ValuesIn(std::vector<convolution_test_params>{
-    convolution_test_params{ FSV32_CASE_CONV_FP32_1, 3, 3 }
+    convolution_test_params{ FSV32_CASE_CONV_FP32_1, 3, 4 }
 }));
 
-
-class conv_fp32_reorder_bfyx_to_fsv32_conv_mean : public ConvFusingTest {};
+// 'reorder_fsv32' is not fused in the current implementation because it uses 'mean'
+class conv_fp32_reorder_bfyx_to_fsv32_conv_mean : public ConvReorderFusingTest {};
 TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_mean, have_mean) {
     auto p = GetParam();
     memory::ptr mul = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 1, 3, 1, 2 } });
@@ -2579,21 +2607,21 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_mean, have_mean) {
         data("mul", mul),
         data("weights", get_mem(get_weights_layout(p), -127, 127)),
         reorder("reorder_fsv32", "input", format::fs_b_yx_fsv32, data_types::f32, "mul", reorder_mean_mode::mul),
-        convolution("conv_output", "reorder_fsv32", { "weights" }, 1, tensor{ 0, 0, 1, 1 }, p.pad, p.dilation),
-        activation("activation", "conv_output", activation_func::abs)
+        convolution("conv_prim", "reorder_fsv32", { "weights" }, 1, tensor{ 0, 0, 1, 1 }, p.pad, p.dilation),
+        activation("activation", "conv_prim", activation_func::abs)
     );
 
     implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" };
-    bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } }));
+    bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } }));
 
     execute(p);
 }
 
 INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_mean, ::testing::ValuesIn(std::vector<convolution_test_params>{
-    convolution_test_params{ FSV32_CASE_CONV_FP32_1, 3, 3 }
+    convolution_test_params{ FSV32_CASE_CONV_FP32_1, 4, 4 }
 }));
 
-
-class conv_fp32_reorder_bfyx_to_fsv32_conv_subtract : public ConvFusingTest {};
+// 'reorder_fsv32' is not fused in the current implementation because it uses 'subtract'
+class conv_fp32_reorder_bfyx_to_fsv32_conv_subtract : public ConvReorderFusingTest {};
 TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, have_subtract_per_feature) {
     auto p = GetParam();
     const std::vector<float>& values_to_subtract = {
@@ -2603,7 +2631,7 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, have_subtract_per_feature) {
         0.1f, 0.2f, 0.1f, 0.1f,
         0.1f, 0.2f, 0.1f, 0.1f
     };
-    auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3));
+    auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(2, 2));
     auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor };
 
     auto dw_stride = tensor{ 0, 0, 1, 1 };
@@ -2613,12 +2641,11 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, have_subtract_per_feature) {
         data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
         convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
         reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32, values_to_subtract),
-        convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
-        activation("activation", "conv_output", activation_func::abs)
+        convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation)
     );
 
     implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" };
-    bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } }));
+    bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } }));
 
     execute(p);
 }
@@ -2626,12 +2653,12 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, ::testing::ValuesIn(std::vector<convolution_test_params>{
     convolution_test_params{ FSV32_CASE_CONV_FP32_1, 4, 4 }
 }));
 
-
-class conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation : public ConvFusingTest {};
+// 'reorder_fsv32' is not fused in the current implementation because it has a fused activation
+class conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation : public ConvReorderFusingTest {};
 TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation, have_fused_activation) {
     auto p = GetParam();
 
-    auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3));
+    auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(2, 2));
     auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor };
 
     auto dw_stride = tensor{ 0, 0, 1, 1 };
@@ -2642,25 +2669,26 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation, have_fused_activation) {
         convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
         reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32),
         activation("activation_quantize", "reorder_fsv32", activation_func::relu),
-        convolution("conv_output", "activation_quantize", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
-        activation("activation", "conv_output", activation_func::abs)
+        convolution("conv_prim2", "activation_quantize", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
+        activation("activation", "conv_prim2", activation_func::abs)
     );
 
     implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" };
-    bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } }));
+    bo_fused.set_option(build_option::force_implementations({ { "conv_prim2", conv_impl } }));
+    bo_fused.set_option(build_option::force_implementations({ { "activation", conv_impl } }));
 
     execute(p);
 }
 
 INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation, ::testing::ValuesIn(std::vector<convolution_test_params>{
-    convolution_test_params{ FSV32_CASE_CONV_FP32_1, 4, 5 }
+    convolution_test_params{ FSV32_CASE_CONV_FP32_1, 5, 6 }
 }));
 
-
-class conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding : public ConvFusingTest {};
+// 'reorder_fsv32' is fused even though it has 'padding'
+class conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding : public ConvReorderFusingTest {};
 TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding, have_data_padding) {
     auto p = GetParam();
 
-    auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3));
+    auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(2, 2));
     auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor };
 
     auto dw_stride = tensor{ 0, 0, 1, 1 };
@@ -2670,19 +2698,17 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding, have_data_padding) {
         data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
         convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
         reorder("reorder_fsv32", "conv_prim", layout(data_types::f32, format::fs_b_yx_fsv32, dw_tensor, padding{ { 0, 0, 1, 1 }, 0 })),
-        convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
-        activation("activation", "conv_output", activation_func::abs),
-        activation("activation2", "conv_prim", activation_func::abs),
-        eltwise("add_bias", { "activation", "activation2" }, eltwise_mode::sum)
+        convolution("conv_prim2", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
+        reorder("reorder_out", "conv_prim2", format::fs_b_yx_fsv32, data_types::f32)
    );
 
     implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" };
-    bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } }));
+    bo_fused.set_option(build_option::force_implementations({ { "conv_prim2", conv_impl } }));
 
     execute(p);
 }
 
 INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding, ::testing::ValuesIn(std::vector<convolution_test_params>{
-    convolution_test_params{ FSV32_CASE_CONV_FP32_1, 5, 6 }
+    convolution_test_params{ FSV32_CASE_CONV_FP32_1, 5, 5 }
 }));
 
 #ifdef ENABLE_ONEDNN_FOR_GPU