[GPU] Fix input feature map indexing with pad and batch indices for ROIAlign (#19511)

* [GPU] Fix input feature map indexing with pad and batch indices for ROIAlign

* Fix failed TCs for ov_gpu_func_tests

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Fix to do batch interpretation for inconsistency between ROIALign input and const 1D tensor

Signed-off-by: Andrew Park <andrew.park@intel.com>

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park
2023-09-02 04:43:23 +09:00
committed by GitHub
parent 441adcc122
commit 92c6316e8e
3 changed files with 83 additions and 11 deletions

View File

@@ -9,11 +9,11 @@
#define POOLED_WIDTH OUTPUT_SIZE_X
#define POOLED_HEIGHT OUTPUT_SIZE_Y
KERNEL(roi_align_ref)
(const __global INPUT0_TYPE* src_data,
__global OUTPUT_TYPE* dst_data,
const __global INPUT1_TYPE* src_rois,
const __global INPUT2_TYPE* src_batches) {
KERNEL(roi_align_ref)(const __global INPUT0_TYPE* src_data,
__global OUTPUT_TYPE* dst_data,
const __global INPUT1_TYPE* src_rois,
const __global INPUT2_TYPE* src_batches)
{
const size_t i = get_global_id(0);
const uint x = i % POOLED_WIDTH;
@@ -23,6 +23,9 @@ KERNEL(roi_align_ref)
const __global INPUT1_TYPE* roi_ptr = &src_rois[INPUT1_GET_INDEX(r, 0, 0, 0)];
// Get the batch index of feature map
const uint b = (uint)src_batches[INPUT2_GET_INDEX(r, 0, 0, 0)];
// Get ROI`s corners
const INPUT1_TYPE x1 =
(roi_ptr[0] + (INPUT1_TYPE)OFFSET_SRC) * (INPUT1_TYPE)SPATIAL_SCALE + (INPUT1_TYPE)OFFSET_DST;
@@ -46,8 +49,6 @@ KERNEL(roi_align_ref)
const INPUT1_TYPE sample_distance_x = bin_width / (INPUT1_TYPE)sampling_ratio_x;
const INPUT1_TYPE sample_distance_y = bin_height / (INPUT1_TYPE)sampling_ratio_y;
const __global INPUT0_TYPE* data = src_data + INPUT0_GET_INDEX(r, c, 0, 0);
OUTPUT_TYPE pooled_value = 0;
for (unsigned int y_sample_ind = 0; y_sample_ind < sampling_ratio_y; y_sample_ind++) {
INPUT1_TYPE sample_y =
@@ -91,10 +92,10 @@ KERNEL(roi_align_ref)
weight_right = INPUT1_VAL_ONE - weight_left;
}
const INPUT0_TYPE top_left = data[INPUT0_GET_INDEX(0, 0, sample_y_low, sample_x_low)];
const INPUT0_TYPE top_right = data[INPUT0_GET_INDEX(0, 0, sample_y_low, sample_x_high)];
const INPUT0_TYPE bottom_left = data[INPUT0_GET_INDEX(0, 0, sample_y_high, sample_x_low)];
const INPUT0_TYPE bottom_right = data[INPUT0_GET_INDEX(0, 0, sample_y_high, sample_x_high)];
const INPUT0_TYPE top_left = src_data[INPUT0_GET_INDEX(b, c, sample_y_low, sample_x_low)];
const INPUT0_TYPE top_right = src_data[INPUT0_GET_INDEX(b, c, sample_y_low, sample_x_high)];
const INPUT0_TYPE bottom_left = src_data[INPUT0_GET_INDEX(b, c, sample_y_high, sample_x_low)];
const INPUT0_TYPE bottom_right = src_data[INPUT0_GET_INDEX(b, c, sample_y_high, sample_x_high)];
const INPUT0_TYPE interpolated_value =
weight_bottom * weight_right * top_left + weight_bottom * weight_left * top_right +

View File

@@ -16,6 +16,7 @@
#include "openvino/op/gather.hpp"
#include "openvino/op/split.hpp"
#include "openvino/op/prelu.hpp"
#include "openvino/op/roi_align.hpp"
#include "openvino/op/variadic_split.hpp"
#include "openvino/op/util/op_types.hpp"
@@ -180,6 +181,8 @@ static void CreateConstantOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0
if (constDims.size() == 4 && input_shape.size() == 3) { // In case of weight dim 4 and input dim 3,
constDims.push_back(1); // The weight cldnn tensor adds 1d to the end as the input cldnn tensor does
}
} else if (ov::is_type<ov::op::v3::ROIAlign>(outOp) || ov::is_type<ov::op::v9::ROIAlign>(outOp)) {
consts[op].needsBatchInterpretation = constDims.size() == 1;
}
}

View File

@@ -199,3 +199,71 @@ TYPED_TEST(roi_align_test, max_half_pixel_cached) {
{TD(4.375f), TD(4.9375f), TD(5.6875f), TD(5.625f), TD(4.625f), TD(7.125f), TD(3.3125f), TD(4.3125f)};
this->execute(expected_output, roi_align::PoolingMode::max, roi_align::AlignedMode::half_pixel, true);
}
TEST(roi_align_gpu_fp32, bfyx_inpad_1x1) {
auto& engine = get_test_engine();
const int pooled_h{2};
const int pooled_w{2};
const int sampling_ratio{2};
const float spatial_scale{1};
const std::vector<float> input_data = {
0.f, 1.f, 8.f, 5.f, 5.f, 2.f, 0.f, 7.f, 7.f, 10.f, 4.f,
5.f, 9.f, 0.f, 0.f, 5.f, 7.f, 0.f, 4.f, 0.f, 4.f, 7.f,
6.f, 10.f, 9.f, 5.f, 1.f, 7.f, 4.f, 7.f, 10.f, 8.f, 2.f,
0.f, 8.f, 3.f, 6.f, 8.f, 10.f, 4.f, 2.f, 10.f, 7.f, 8.f,
7.f, 0.f, 6.f, 9.f, 2.f, 4.f, 8.f, 5.f, 2.f, 3.f, 3.f,
1.f, 5.f, 9.f, 10.f, 0.f, 9.f, 5.f, 5.f, 3.f, 10.f, 5.f,
2.f, 0.f, 10.f, 0.f, 5.f, 4.f, 3.f, 10.f, 5.f, 5.f, 10.f,
0.f, 8.f, 8.f, 9.f, 1.f, 0.f, 7.f, 9.f, 6.f, 8.f, 7.f,
10.f, 9.f, 2.f, 3.f, 3.f, 5.f, 6.f, 9.f, 4.f, 9.f, 2.f,
4.f, 5.f, 5.f, 3.f, 1.f, 1.f, 6.f, 8.f, 0.f, 5.f, 5.f,
10.f, 8.f, 6.f, 9.f, 6.f, 9.f, 1.f, 2.f, 7.f, 1.f, 1.f,
3.f, 0.f, 4.f, 0.f, 7.f, 10.f, 2.f
};
const std::vector<float> coords_data = {2.f, 2.f, 4.f, 4.f, 2.f, 2.f, 4.f, 4.f};
const std::vector<int32_t> roi_data = {0, 1};
auto input = engine.allocate_memory({ov::PartialShape{2, 1, 8, 8}, data_types::f32, format::bfyx});
auto coords = engine.allocate_memory({ov::PartialShape{2, 4, 1, 1}, data_types::f32, format::bfyx});
auto roi_ind = engine.allocate_memory({ov::PartialShape{2, 1, 1, 1}, data_types::i32, format::bfyx});
set_values(input, input_data);
set_values(coords, coords_data);
set_values(roi_ind, roi_data);
topology topology;
topology.add(input_layout("input", input->get_layout()));
topology.add(input_layout("coords", coords->get_layout()));
topology.add(input_layout("roi_ind", roi_ind->get_layout()));
topology.add(reorder("reorder_input", input_info("input"), input->get_layout().with_padding(padding{ {0,0,1,1},0 })));
topology.add(roi_align("roi_align",
{ input_info("reorder_input"), input_info("coords"), input_info("roi_ind") },
pooled_h,
pooled_w,
sampling_ratio,
spatial_scale,
roi_align::PoolingMode::avg,
roi_align::AlignedMode::asymmetric));
topology.add(reorder("out", input_info("roi_align"), format::bfyx, data_types::f32));
network network(engine, topology, get_test_default_config(engine));
network.set_input_data("input", input);
network.set_input_data("coords", coords);
network.set_input_data("roi_ind", roi_ind);
auto outputs = network.execute();
auto output = outputs.at("out").get_memory();
std::vector<float> expected_output = {
3.f, 3.75f, 4.75f, 5.f, 3.f, 5.5f, 2.75f, 3.75f
};
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
ASSERT_EQ(output_ptr.size(), expected_output.size());
for (uint32_t i = 0; i < expected_output.size(); ++i) {
ASSERT_EQ(output_ptr[i], expected_output[i]);
}
}