[GPU] Fix input feature map indexing with pad and batch indices for ROIAlign (#19511)

* [GPU] Fix input feature map indexing with pad and batch indices for ROIAlign * Fix failed TCs for ov_gpu_func_tests Signed-off-by: Andrew Park <andrew.park@intel.com> * Fix to do batch interpretation for inconsistency between ROIALign input and const 1D tensor Signed-off-by: Andrew Park <andrew.park@intel.com> --------- Signed-off-by: Andrew Park <andrew.park@intel.com>
2023-09-02 04:43:23 +09:00
parent 441adcc122
commit 92c6316e8e
3 changed files with 83 additions and 11 deletions
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/roi_align_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/roi_align_ref.cl
@@ -9,11 +9,11 @@
 #define POOLED_WIDTH  OUTPUT_SIZE_X
 #define POOLED_HEIGHT OUTPUT_SIZE_Y

-KERNEL(roi_align_ref)
-(const __global INPUT0_TYPE* src_data,
- __global OUTPUT_TYPE* dst_data,
- const __global INPUT1_TYPE* src_rois,
- const __global INPUT2_TYPE* src_batches) {
+KERNEL(roi_align_ref)(const __global INPUT0_TYPE* src_data,
+                            __global OUTPUT_TYPE* dst_data,
+                      const __global INPUT1_TYPE* src_rois,
+                      const __global INPUT2_TYPE* src_batches)
+{
    const size_t i = get_global_id(0);

    const uint x = i % POOLED_WIDTH;
@@ -23,6 +23,9 @@ KERNEL(roi_align_ref)

    const __global INPUT1_TYPE* roi_ptr = &src_rois[INPUT1_GET_INDEX(r, 0, 0, 0)];

+    // Get the batch index of feature map
+    const uint b = (uint)src_batches[INPUT2_GET_INDEX(r, 0, 0, 0)];
+
    // Get ROI`s corners
    const INPUT1_TYPE x1 =
        (roi_ptr[0] + (INPUT1_TYPE)OFFSET_SRC) * (INPUT1_TYPE)SPATIAL_SCALE + (INPUT1_TYPE)OFFSET_DST;
@@ -46,8 +49,6 @@ KERNEL(roi_align_ref)
    const INPUT1_TYPE sample_distance_x = bin_width / (INPUT1_TYPE)sampling_ratio_x;
    const INPUT1_TYPE sample_distance_y = bin_height / (INPUT1_TYPE)sampling_ratio_y;

-    const __global INPUT0_TYPE* data = src_data + INPUT0_GET_INDEX(r, c, 0, 0);
-
    OUTPUT_TYPE pooled_value = 0;
    for (unsigned int y_sample_ind = 0; y_sample_ind < sampling_ratio_y; y_sample_ind++) {
        INPUT1_TYPE sample_y =
@@ -91,10 +92,10 @@ KERNEL(roi_align_ref)
                weight_right = INPUT1_VAL_ONE - weight_left;
            }

-            const INPUT0_TYPE top_left = data[INPUT0_GET_INDEX(0, 0, sample_y_low, sample_x_low)];
-            const INPUT0_TYPE top_right = data[INPUT0_GET_INDEX(0, 0, sample_y_low, sample_x_high)];
-            const INPUT0_TYPE bottom_left = data[INPUT0_GET_INDEX(0, 0, sample_y_high, sample_x_low)];
-            const INPUT0_TYPE bottom_right = data[INPUT0_GET_INDEX(0, 0, sample_y_high, sample_x_high)];
+            const INPUT0_TYPE top_left = src_data[INPUT0_GET_INDEX(b, c, sample_y_low, sample_x_low)];
+            const INPUT0_TYPE top_right = src_data[INPUT0_GET_INDEX(b, c, sample_y_low, sample_x_high)];
+            const INPUT0_TYPE bottom_left = src_data[INPUT0_GET_INDEX(b, c, sample_y_high, sample_x_low)];
+            const INPUT0_TYPE bottom_right = src_data[INPUT0_GET_INDEX(b, c, sample_y_high, sample_x_high)];

            const INPUT0_TYPE interpolated_value =
                weight_bottom * weight_right * top_left + weight_bottom * weight_left * top_right +
--- a/src/plugins/intel_gpu/src/plugin/ops/constant.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/constant.cpp
@@ -16,6 +16,7 @@
 #include "openvino/op/gather.hpp"
 #include "openvino/op/split.hpp"
 #include "openvino/op/prelu.hpp"
+#include "openvino/op/roi_align.hpp"
 #include "openvino/op/variadic_split.hpp"
 #include "openvino/op/util/op_types.hpp"

@@ -180,6 +181,8 @@ static void CreateConstantOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0
            if (constDims.size() == 4 && input_shape.size() == 3) { // In case of weight dim 4 and input dim 3,
                constDims.push_back(1);                             // The weight cldnn tensor adds 1d to the end as the input cldnn tensor does
            }
+        } else if (ov::is_type<ov::op::v3::ROIAlign>(outOp) || ov::is_type<ov::op::v9::ROIAlign>(outOp)) {
+            consts[op].needsBatchInterpretation = constDims.size() == 1;
        }
    }

--- a/src/plugins/intel_gpu/tests/unit/test_cases/roi_align_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/roi_align_gpu_test.cpp
@@ -199,3 +199,71 @@ TYPED_TEST(roi_align_test, max_half_pixel_cached) {
        {TD(4.375f), TD(4.9375f), TD(5.6875f), TD(5.625f), TD(4.625f), TD(7.125f), TD(3.3125f), TD(4.3125f)};
    this->execute(expected_output, roi_align::PoolingMode::max, roi_align::AlignedMode::half_pixel, true);
 }
+
+TEST(roi_align_gpu_fp32, bfyx_inpad_1x1) {
+    auto& engine = get_test_engine();
+
+    const int pooled_h{2};
+    const int pooled_w{2};
+    const int sampling_ratio{2};
+    const float spatial_scale{1};
+
+    const std::vector<float> input_data = {
+        0.f,  1.f,  8.f,  5.f,  5.f, 2.f,  0.f, 7.f,  7.f,  10.f, 4.f,
+        5.f,  9.f,  0.f,  0.f,  5.f, 7.f,  0.f, 4.f,  0.f,  4.f,  7.f,
+        6.f,  10.f, 9.f,  5.f,  1.f, 7.f,  4.f, 7.f,  10.f, 8.f,  2.f,
+        0.f,  8.f,  3.f,  6.f,  8.f, 10.f, 4.f, 2.f,  10.f, 7.f,  8.f,
+        7.f,  0.f,  6.f,  9.f,  2.f, 4.f,  8.f, 5.f,  2.f,  3.f,  3.f,
+        1.f,  5.f,  9.f,  10.f, 0.f, 9.f,  5.f, 5.f,  3.f,  10.f, 5.f,
+        2.f,  0.f,  10.f, 0.f,  5.f, 4.f,  3.f, 10.f, 5.f,  5.f,  10.f,
+        0.f,  8.f,  8.f,  9.f,  1.f, 0.f,  7.f, 9.f,  6.f,  8.f,  7.f,
+        10.f, 9.f,  2.f,  3.f,  3.f, 5.f,  6.f, 9.f,  4.f,  9.f,  2.f,
+        4.f,  5.f,  5.f,  3.f,  1.f, 1.f,  6.f, 8.f,  0.f,  5.f,  5.f,
+        10.f, 8.f,  6.f,  9.f,  6.f, 9.f,  1.f, 2.f,  7.f,  1.f,  1.f,
+        3.f,  0.f,  4.f,  0.f,  7.f, 10.f, 2.f
+    };
+    const std::vector<float> coords_data = {2.f, 2.f, 4.f, 4.f, 2.f, 2.f, 4.f, 4.f};
+    const std::vector<int32_t> roi_data = {0, 1};
+
+    auto input = engine.allocate_memory({ov::PartialShape{2, 1, 8, 8}, data_types::f32, format::bfyx});
+    auto coords = engine.allocate_memory({ov::PartialShape{2, 4, 1, 1}, data_types::f32, format::bfyx});
+    auto roi_ind = engine.allocate_memory({ov::PartialShape{2, 1, 1, 1}, data_types::i32, format::bfyx});
+    set_values(input, input_data);
+    set_values(coords, coords_data);
+    set_values(roi_ind, roi_data);
+
+    topology topology;
+    topology.add(input_layout("input", input->get_layout()));
+    topology.add(input_layout("coords", coords->get_layout()));
+    topology.add(input_layout("roi_ind", roi_ind->get_layout()));
+    topology.add(reorder("reorder_input", input_info("input"), input->get_layout().with_padding(padding{ {0,0,1,1},0 })));
+    topology.add(roi_align("roi_align",
+                           { input_info("reorder_input"), input_info("coords"), input_info("roi_ind") },
+                           pooled_h,
+                           pooled_w,
+                           sampling_ratio,
+                           spatial_scale,
+                           roi_align::PoolingMode::avg,
+                           roi_align::AlignedMode::asymmetric));
+    topology.add(reorder("out", input_info("roi_align"), format::bfyx, data_types::f32));
+
+    network network(engine, topology, get_test_default_config(engine));
+
+    network.set_input_data("input", input);
+    network.set_input_data("coords", coords);
+    network.set_input_data("roi_ind", roi_ind);
+    auto outputs = network.execute();
+
+    auto output = outputs.at("out").get_memory();
+
+    std::vector<float> expected_output = {
+        3.f, 3.75f, 4.75f, 5.f, 3.f, 5.5f, 2.75f, 3.75f
+    };
+
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+    ASSERT_EQ(output_ptr.size(), expected_output.size());
+    for (uint32_t i = 0; i < expected_output.size(); ++i) {
+        ASSERT_EQ(output_ptr[i], expected_output[i]);
+    }
+}