From ed65d199bb3365c9bb296ce1ec1c1acda81b096a Mon Sep 17 00:00:00 2001
From: Konstantin Beluchenko <kostiantyn.bieliuchenko@altran.com>
Date: Thu, 21 Jul 2022 12:52:32 +0300
Subject: [PATCH] [GPU] ROIAlign v9 support (#11899)

* ROIAlign v9 support

* Code changes after review1

* Code changes after review2

* fix of single layer test for Windows

* Since PR #12043 we don't need strong include order of primitive_base.hpp and
impls/implementation map.hpp anymore

* Code changes after review3

* Code changes after review4
---
 .../intel_gpu/plugin/primitives_list.hpp      |   1 +
 .../intel_gpu/primitives/roi_align.hpp        |  31 +--
 .../src/graph/impls/implementation_map.hpp    |  16 ++
 .../src/graph/impls/ocl/roi_align.cpp         |  52 +++--
 src/plugins/intel_gpu/src/graph/program.cpp   |   7 +-
 src/plugins/intel_gpu/src/graph/roi_align.cpp |  27 ++-
 .../roi_align/roi_align_kernel_ref.cpp        |  35 +++-
 .../roi_align/roi_align_kernel_ref.h          |   7 +-
 .../core/cl_kernels/roi_align_ref.cl          | 116 ++++++------
 .../intel_gpu/src/plugin/ops/roi_align.cpp    |  52 ++++-
 .../tests/test_cases/roi_align_gpu_test.cpp   | 178 ++++++++++++++++++
 .../single_layer_tests/roi_align.cpp          | 122 ++++++++----
 .../include/single_layer_tests/roi_align.hpp  |   4 +
 .../single_layer/roi_align.hpp                |  28 +++
 .../src/single_layer/roi_align.cpp            | 132 +++++++++++--
 15 files changed, 639 insertions(+), 169 deletions(-)
 create mode 100644 src/plugins/intel_gpu/tests/test_cases/roi_align_gpu_test.cpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
index caea06ef284..5383be3c67c 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
@@ -235,6 +235,7 @@ REGISTER_FACTORY(v8, AdaptiveMaxPool);
 
 // ------------------------------ Supported v9 ops ------------------------------ //
 REGISTER_FACTORY(v9, SoftSign)
+REGISTER_FACTORY(v9, ROIAlign);
 
 // --------------------------- Supported internal ops --------------------------- //
 REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal);
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/roi_align.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/roi_align.hpp
index dfeb33224cb..a52011b5881 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/roi_align.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/roi_align.hpp
@@ -19,10 +19,10 @@ struct roi_align : public primitive_base<roi_align> {
     CLDNN_DECLARE_PRIMITIVE(roi_align)
 
     /// @brief Pooling mode for the @ref roi_align
-    enum PoolingMode {
-        Max,
-        Avg
-    };
+    enum PoolingMode { max, avg };
+
+    /// @brief Aligned mode for the @ref roi_align
+    enum AlignedMode { asymmetric, half_pixel_for_nn, half_pixel };
 
     /// @brief Constructs roi_align primitive.
     /// @param id This primitive id.
@@ -32,24 +32,25 @@ struct roi_align : public primitive_base<roi_align> {
     /// @param sampling_ratio Number of bins over height and width to use to calculate each output feature map element.
     /// @param spatial_scale multiplicative spatial scale factor to translate ROI coordinates
     /// from their input spatial scale to the scale used when pooling.
-    /// @param mode Method to perform pooling to produce output feature map elements.
-    /// @param shrink_axis_mask Array of bits, that provide shrinks the dimensionality by 1, taking on the value at index begin[i].
+    /// @param pooling_mode Method to perform pooling to produce output feature map elements.
+    /// @param aligned_mode Method to coordinates alignment.
     roi_align(const primitive_id& id,
               const std::vector<primitive_id>& inputs,
               int pooled_h,
               int pooled_w,
               int sampling_ratio,
               float spatial_scale,
-              PoolingMode mode,
+              PoolingMode pooling_mode,
+              AlignedMode aligned_mode,
               const primitive_id& ext_prim_id = "",
               const padding& output_padding = padding())
         : primitive_base(id, inputs, ext_prim_id, output_padding),
-          pooled_h {pooled_h},
-          pooled_w {pooled_w},
-          sampling_ratio {sampling_ratio},
-          spatial_scale {spatial_scale},
-          mode {mode}
-    {}
+          pooled_h{pooled_h},
+          pooled_w{pooled_w},
+          sampling_ratio{sampling_ratio},
+          spatial_scale{spatial_scale},
+          pooling_mode{pooling_mode},
+          aligned_mode{aligned_mode} {}
 
     /// @brief Height of the ROI output feature map.
     int pooled_h;
@@ -61,7 +62,9 @@ struct roi_align : public primitive_base<roi_align> {
     /// from their input spatial scale to the scale used when pooling.
     float spatial_scale;
     /// @brief Method to perform pooling to produce output feature map elements.
-    PoolingMode mode;
+    PoolingMode pooling_mode;
+    /// @brief Method to coordinate alignment.
+    AlignedMode aligned_mode;
 };
 /// @}
 /// @}
diff --git a/src/plugins/intel_gpu/src/graph/impls/implementation_map.hpp b/src/plugins/intel_gpu/src/graph/impls/implementation_map.hpp
index 1aeab6be1ab..ed23803e323 100644
--- a/src/plugins/intel_gpu/src/graph/impls/implementation_map.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/implementation_map.hpp
@@ -204,11 +204,27 @@ public:
         return false;
     }
 
+    static void add(impl_types impl_type, factory_type factory,
+                    const std::vector<data_types>& types, const std::vector<format::type>& formats) {
+        add(impl_type, factory, combine(types, formats));
+    }
+
     static void add(impl_types impl_type, factory_type factory, std::set<key_type> keys) {
         if (impl_type == impl_types::any) {
             throw std::runtime_error("[CLDNN] Can't register impl with type any");
         }
         map_type::instance().insert({impl_type, {keys, factory}});
     }
+
+private:
+    static std::set<key_type> combine(const std::vector<data_types>& types, const std::vector<format::type>& formats) {
+        std::set<key_type> keys;
+        for (const auto& type : types) {
+            for (const auto& format : formats) {
+                keys.emplace(type, format);
+            }
+        }
+        return keys;
+    }
 };
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/roi_align.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/roi_align.cpp
index 8c82fc62428..19b574383ce 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/roi_align.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/roi_align.cpp
@@ -1,13 +1,14 @@
 // Copyright (C) 2018-2022 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-#include "roi_align_inst.h"
+
 #include "primitive_base.hpp"
 #include "impls/implementation_map.hpp"
 #include "intel_gpu/runtime/error_handler.hpp"
 #include "kernel_selector_helper.h"
-#include "roi_align/roi_align_kernel_selector.h"
 #include "roi_align/roi_align_kernel_ref.h"
+#include "roi_align/roi_align_kernel_selector.h"
+#include "roi_align_inst.h"
 
 namespace cldnn {
 namespace ocl {
@@ -15,11 +16,23 @@ namespace ocl {
 namespace {
 kernel_selector::pool_type from(roi_align::PoolingMode mode) {
     switch (mode) {
-        case roi_align::PoolingMode::Max:
-            return kernel_selector::pool_type::MAX;
-        default:
-        case roi_align::PoolingMode::Avg:
-            return kernel_selector::pool_type::AVG;
+    case roi_align::PoolingMode::max:
+        return kernel_selector::pool_type::MAX;
+    default:
+    case roi_align::PoolingMode::avg:
+        return kernel_selector::pool_type::AVG;
+    }
+}
+
+kernel_selector::roi_aligned_mode from(roi_align::AlignedMode mode) {
+    switch (mode) {
+    case roi_align::AlignedMode::half_pixel_for_nn:
+        return kernel_selector::roi_aligned_mode::HALF_PIXEL_FOR_NN;
+    case roi_align::AlignedMode::half_pixel:
+        return kernel_selector::roi_aligned_mode::HALF_PIXEL;
+    default:
+    case roi_align::AlignedMode::asymmetric:
+        return kernel_selector::roi_aligned_mode::ASYMMETRIC;
     }
 }
 }  // namespace
@@ -35,8 +48,8 @@ struct roi_align_impl : typed_primitive_impl_ocl<roi_align> {
 protected:
     kernel_arguments_data get_arguments(typed_primitive_inst<roi_align>& instance, int32_t) const override {
         kernel_arguments_data args;
-        args.inputs = { instance.input_memory_ptr(), instance.rois_memory(), instance.batches_memory() };
-        args.outputs = { instance.output_memory_ptr() };
+        args.inputs = {instance.input_memory_ptr(), instance.rois_memory(), instance.batches_memory()};
+        args.outputs = {instance.output_memory_ptr()};
 
         return args;
     }
@@ -67,10 +80,10 @@ public:
         auto roi_align_optional_params =
             get_default_optional_params<kernel_selector::roi_align_optional_params>(arg.get_program());
 
-        const auto roi_bfyx = convert_data_tensor(rois_layout);
-        roi_align_params.inputs.push_back(roi_bfyx.FlattenFeatureAndSpatials());
+        roi_align_params.inputs.push_back(convert_data_tensor(rois_layout));
         roi_align_params.inputs.push_back(convert_data_tensor(batches_layout));
-        roi_align_params.mode = from(primitive->mode);
+        roi_align_params.pooling_mode = from(primitive->pooling_mode);
+        roi_align_params.aligned_mode = from(primitive->aligned_mode);
         roi_align_params.sampling_ratio = primitive->sampling_ratio;
         roi_align_params.spatial_scale = primitive->spatial_scale;
 
@@ -91,11 +104,16 @@ public:
 namespace detail {
 
 attach_roi_align_impl::attach_roi_align_impl() {
-    implementation_map<roi_align>::add(impl_types::ocl, roi_align_impl::create,
-                                       {
-                                           std::make_tuple(data_types::f16, format::bfyx),
-                                           std::make_tuple(data_types::f32, format::bfyx),
-                                       });
+    auto types = {data_types::f16, data_types::f32, data_types::i8, data_types::u8, data_types::i32};
+
+    auto formats = {format::bfyx,
+                    format::b_fs_yx_fsv16,
+                    format::b_fs_yx_fsv32,
+                    format::bs_fs_yx_bsv16_fsv16,
+                    format::bs_fs_yx_bsv32_fsv16,
+                    format::bs_fs_yx_bsv32_fsv32};
+
+    implementation_map<roi_align>::add(impl_types::ocl, roi_align_impl::create, types, formats);
 }
 
 }  // namespace detail
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index 4b302428fc7..a3818506304 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -1400,7 +1400,8 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::mvn::type_id() &&
             prim.type() != cldnn::gather::type_id() &&
             prim.type() != cldnn::scatter_nd_update::type_id() &&
-            prim.type() != cldnn::non_max_suppression::type_id()) {
+            prim.type() != cldnn::non_max_suppression::type_id() &&
+            prim.type() != cldnn::roi_align::type_id()) {
             can_use_fsv16 = false;
         }
 
@@ -1428,8 +1429,10 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::generic_layer::type_id() &&
             prim.type() != cldnn::scatter_nd_update::type_id() &&
             prim.type() != cldnn::quantize::type_id() &&
-            prim.type() != cldnn::non_max_suppression::type_id())
+            prim.type() != cldnn::non_max_suppression::type_id() &&
+            prim.type() != cldnn::roi_align::type_id()) {
             can_use_bs_fs_yx_bsv16_fsv16 = false;
+        }
     }
 
     size_t total_conv_layers = lo.get_total_conv_count();
diff --git a/src/plugins/intel_gpu/src/graph/roi_align.cpp b/src/plugins/intel_gpu/src/graph/roi_align.cpp
index d89c5de53cb..61375c10f94 100644
--- a/src/plugins/intel_gpu/src/graph/roi_align.cpp
+++ b/src/plugins/intel_gpu/src/graph/roi_align.cpp
@@ -6,6 +6,7 @@
 #include "primitive_type_base.h"
 #include <sstream>
 #include <json_object.h>
+#include "openvino/core/enum_names.hpp"
 
 namespace cldnn {
 
@@ -23,7 +24,9 @@ layout roi_align_inst::calc_output_layout(roi_align_node const& node) {
     auto rois_layout = node.input(1).get_output_layout();
     auto num_rois = rois_layout.batch();
     auto num_channels = input_layout.feature();
-    return layout(input_layout.data_type, format::bfyx, {num_rois, num_channels, primitive->pooled_h, primitive->pooled_w});
+    return layout(input_layout.data_type,
+                  input_layout.format,
+                  {num_rois, num_channels, primitive->pooled_h, primitive->pooled_w});
 }
 
 std::string roi_align_inst::to_string(roi_align_node const& node) {
@@ -36,11 +39,29 @@ std::string roi_align_inst::to_string(roi_align_node const& node) {
     roi_align_info.add("pooled_w", node.get_primitive()->pooled_w);
     roi_align_info.add("sampling_ratio", node.get_primitive()->sampling_ratio);
     roi_align_info.add("spatial_scale", node.get_primitive()->spatial_scale);
-    roi_align_info.add("mode", node.get_primitive()->mode == roi_align::PoolingMode::Max ? "Max" : "Avg");
+    roi_align_info.add("pooling_mode", ov::as_string(node.get_primitive()->pooling_mode));
+    roi_align_info.add("aligned_mode", ov::as_string(node.get_primitive()->aligned_mode));
     node_info->add("roi_align info", roi_align_info);
     std::stringstream primitive_description;
     node_info->dump(primitive_description);
     return primitive_description.str();
 }
 
-} // namespace cldnn
+}  // namespace cldnn
+
+namespace ov {
+template <> EnumNames<roi_align::PoolingMode>& EnumNames<roi_align::PoolingMode>::get() {
+  static auto enum_names =
+      EnumNames<roi_align::PoolingMode>("PoolingMode", {{"max", roi_align::PoolingMode::max},
+                                                        {"avg", roi_align::PoolingMode::avg}});
+  return enum_names;
+}
+
+template <> EnumNames<roi_align::AlignedMode>& EnumNames<roi_align::AlignedMode>::get() {
+  static auto enum_names =
+      EnumNames<roi_align::AlignedMode>("AlignedMode", {{"asymmetric", roi_align::AlignedMode::asymmetric},
+                                                        {"half_pixel_for_nn", roi_align::AlignedMode::half_pixel_for_nn},
+                                                        {"half_pixel", roi_align::AlignedMode::half_pixel}});
+  return enum_names;
+}
+} // namespace ov
diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/roi_align/roi_align_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/roi_align/roi_align_kernel_ref.cpp
index 61c503bac1e..aaab0910be7 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/roi_align/roi_align_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/roi_align/roi_align_kernel_ref.cpp
@@ -10,11 +10,13 @@ ParamsKey ROIAlignKernelRef::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::F16);
     k.EnableInputDataType(Datatype::F32);
+    k.EnableInputDataType(Datatype::UINT8);
+    k.EnableInputDataType(Datatype::INT8);
     k.EnableInputDataType(Datatype::INT32);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableAllInputLayout();
+    k.EnableAllOutputLayout();
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableBatching();
@@ -70,21 +72,36 @@ bool ROIAlignKernelRef::Validate(const Params& p, const optional_params& o) cons
     if (params.inputs.size() != 3)
         return false;
 
-    if (params.outputs[0].Dimentions() > 4 || params.inputs[0].Dimentions() > 4 || params.inputs[1].Dimentions() > 2)
-        return false;
-
     return true;
 }
 
-JitConstants ROIAlignKernelRef::GetJitConstants(const roi_align_params &params) const {
+JitConstants ROIAlignKernelRef::GetJitConstants(const roi_align_params& params) const {
     JitConstants jit = MakeBaseParamsJitConstants(params);
+
     jit.AddConstant(MakeJitConstant("SPATIAL_SCALE", params.spatial_scale));
     jit.AddConstant(MakeJitConstant("SAMPLING_RATIO", params.sampling_ratio));
-    if (params.mode == PoolType::MAX)
+
+    if (params.pooling_mode == PoolType::MAX) {
         jit.AddConstant(MakeJitConstant("MAX_POOL", true));
-    else if (params.mode == PoolType::AVG)
+    } else if (params.pooling_mode == PoolType::AVG) {
         jit.AddConstant(MakeJitConstant("AVG_POOL", true));
+    }
+
+    if (params.aligned_mode == roi_aligned_mode::ASYMMETRIC) {
+        jit.AddConstant(MakeJitConstant("OFFSET_SRC", 0.f));
+        jit.AddConstant(MakeJitConstant("OFFSET_DST", 0.f));
+        jit.AddConstant(MakeJitConstant("MIN_SIZE", 1.0f));
+    } else if (params.aligned_mode == roi_aligned_mode::HALF_PIXEL_FOR_NN) {
+        jit.AddConstant(MakeJitConstant("OFFSET_SRC", 0.f));
+        jit.AddConstant(MakeJitConstant("OFFSET_DST", -0.5f));
+        jit.AddConstant(MakeJitConstant("MIN_SIZE", 0.f));
+    } else if (params.aligned_mode == roi_aligned_mode::HALF_PIXEL) {
+        jit.AddConstant(MakeJitConstant("OFFSET_SRC", 0.5f));
+        jit.AddConstant(MakeJitConstant("OFFSET_DST", -0.5f));
+        jit.AddConstant(MakeJitConstant("MIN_SIZE", 0.f));
+    }
+
     return jit;
 }
 
-} // namespace kernel_selector
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/roi_align/roi_align_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/roi_align/roi_align_kernel_ref.h
index e719740afb5..310dbfe13f7 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/roi_align/roi_align_kernel_ref.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/roi_align/roi_align_kernel_ref.h
@@ -7,16 +7,19 @@
 
 namespace kernel_selector {
 
+enum class roi_aligned_mode : uint32_t { ASYMMETRIC, HALF_PIXEL_FOR_NN, HALF_PIXEL };
+
 struct roi_align_params : public base_params {
     roi_align_params() : base_params{KernelType::ROI_ALIGN} {}
 
     int sampling_ratio = 0;
     float spatial_scale = 1.f;
-    PoolType mode = PoolType::MAX;
+    PoolType pooling_mode = PoolType::MAX;
+    roi_aligned_mode aligned_mode = roi_aligned_mode::ASYMMETRIC;
 
     ParamsKey GetParamsKey() const override {
         auto k = base_params::GetParamsKey();
-        k.EnablePoolType(mode);
+        k.EnablePoolType(pooling_mode);
         return k;
     }
 };
diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/roi_align_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/roi_align_ref.cl
index 1db3dc55cd8..234f9766f2d 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/roi_align_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/roi_align_ref.cl
@@ -5,99 +5,103 @@
 #include "include/batch_headers/common.cl"
 #include "include/batch_headers/data_types.cl"
 
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-#define NUM_ROIS OUTPUT_BATCH_NUM
-#define NUM_CHANNELS INPUT0_FEATURE_NUM
-#define POOLED_WIDTH OUTPUT_SIZE_X
+#define MAX(a, b)     ((a) > (b) ? (a) : (b))
+#define NUM_ROIS      OUTPUT_BATCH_NUM
+#define NUM_CHANNELS  INPUT0_FEATURE_NUM
+#define POOLED_WIDTH  OUTPUT_SIZE_X
 #define POOLED_HEIGHT OUTPUT_SIZE_Y
 
 KERNEL(roi_align_ref)
-(
-    const __global INPUT0_TYPE * src_data,
-    __global OUTPUT_TYPE * dst_data,
-    const __global INPUT1_TYPE * src_rois,
-    const __global INPUT2_TYPE * src_batches
-)
-{
+(const __global INPUT0_TYPE* src_data,
+ __global OUTPUT_TYPE* dst_data,
+ const __global INPUT1_TYPE* src_rois,
+ const __global INPUT2_TYPE* src_batches) {
     const size_t i = get_global_id(0);
 
     const uint x = i % POOLED_WIDTH;
     const uint y = i / POOLED_WIDTH % POOLED_HEIGHT;
     const uint c = i / POOLED_WIDTH / POOLED_HEIGHT % NUM_CHANNELS;
     const uint r = i / POOLED_WIDTH / POOLED_HEIGHT / NUM_CHANNELS % NUM_ROIS;
-    
-    const __global INPUT1_TYPE* roi_ptr = &src_rois[INPUT1_BATCH_PITCH * r];
-    
+
+    const __global INPUT1_TYPE* roi_ptr = &src_rois[INPUT1_GET_INDEX(r, 0, 0, 0)];
+
     // Get ROI`s corners
-    const INPUT1_TYPE x1 = *roi_ptr * (INPUT1_TYPE) SPATIAL_SCALE;
-    const INPUT1_TYPE y1 = roi_ptr[1] * (INPUT1_TYPE) SPATIAL_SCALE;
-    const INPUT1_TYPE x2 = roi_ptr[2] * (INPUT1_TYPE) SPATIAL_SCALE;
-    const INPUT1_TYPE y2 = roi_ptr[3] * (INPUT1_TYPE) SPATIAL_SCALE;
-    
-    const INPUT1_TYPE roi_width = MAX(x2 - x1, (INPUT1_TYPE) 1.0);
-    const INPUT1_TYPE roi_height = MAX(y2 - y1, (INPUT1_TYPE) 1.0);
+    const INPUT1_TYPE x1 =
+        (roi_ptr[0] + (INPUT1_TYPE)OFFSET_SRC) * (INPUT1_TYPE)SPATIAL_SCALE + (INPUT1_TYPE)OFFSET_DST;
+    const INPUT1_TYPE y1 =
+        (roi_ptr[1] + (INPUT1_TYPE)OFFSET_SRC) * (INPUT1_TYPE)SPATIAL_SCALE + (INPUT1_TYPE)OFFSET_DST;
+    const INPUT1_TYPE x2 =
+        (roi_ptr[2] + (INPUT1_TYPE)OFFSET_SRC) * (INPUT1_TYPE)SPATIAL_SCALE + (INPUT1_TYPE)OFFSET_DST;
+    const INPUT1_TYPE y2 =
+        (roi_ptr[3] + (INPUT1_TYPE)OFFSET_SRC) * (INPUT1_TYPE)SPATIAL_SCALE + (INPUT1_TYPE)OFFSET_DST;
+
+
+    const INPUT1_TYPE roi_width = MAX(x2 - x1, (INPUT1_TYPE)MIN_SIZE);
+    const INPUT1_TYPE roi_height = MAX(y2 - y1, (INPUT1_TYPE)MIN_SIZE);
 
     const INPUT1_TYPE bin_width = roi_width / POOLED_WIDTH;
     const INPUT1_TYPE bin_height = roi_height / POOLED_HEIGHT;
 
-    const int sampling_ratio_x = SAMPLING_RATIO == 0 ? (int) ceil(bin_width) : SAMPLING_RATIO;
-    const int sampling_ratio_y = SAMPLING_RATIO == 0 ? (int) ceil(bin_height) : SAMPLING_RATIO;
+    const int sampling_ratio_x = SAMPLING_RATIO == 0 ? (int)ceil(bin_width) : SAMPLING_RATIO;
+    const int sampling_ratio_y = SAMPLING_RATIO == 0 ? (int)ceil(bin_height) : SAMPLING_RATIO;
+
+    const INPUT1_TYPE sample_distance_x = bin_width / (INPUT1_TYPE)sampling_ratio_x;
+    const INPUT1_TYPE sample_distance_y = bin_height / (INPUT1_TYPE)sampling_ratio_y;
+
+    const __global INPUT0_TYPE* data = src_data + INPUT0_GET_INDEX(r, c, 0, 0);
 
-    const INPUT1_TYPE sample_distance_x = bin_width / (INPUT1_TYPE) sampling_ratio_x;
-    const INPUT1_TYPE sample_distance_y = bin_height / (INPUT1_TYPE) sampling_ratio_y;
-    
-    const __global INPUT0_TYPE* data = src_data + INPUT0_OFFSET + r*INPUT0_BATCH_PITCH + INPUT0_FEATURE_PITCH*c;
     OUTPUT_TYPE pooled_value = 0;
     for (unsigned int y_sample_ind = 0; y_sample_ind < sampling_ratio_y; y_sample_ind++) {
-        INPUT1_TYPE sample_y = y1 + (INPUT1_TYPE) y * bin_height +
-            sample_distance_y * ((INPUT1_TYPE) y_sample_ind + (INPUT1_TYPE) 0.5f);
+        INPUT1_TYPE sample_y =
+            y1 + (INPUT1_TYPE)y * bin_height + sample_distance_y * ((INPUT1_TYPE)y_sample_ind + (INPUT1_TYPE)0.5f);
         for (unsigned int x_sample_ind = 0; x_sample_ind < sampling_ratio_x; x_sample_ind++) {
-            INPUT1_TYPE sample_x = x1 + (INPUT1_TYPE) x * bin_width +
-                sample_distance_x * ((INPUT1_TYPE) x_sample_ind + (INPUT1_TYPE) 0.5f);
+            INPUT1_TYPE sample_x =
+                x1 + (INPUT1_TYPE)x * bin_width + sample_distance_x * ((INPUT1_TYPE)x_sample_ind + (INPUT1_TYPE)0.5f);
             unsigned int sample_y_low = 0;
             unsigned int sample_x_low = 0;
             unsigned int sample_y_high = 0;
             unsigned int sample_x_high = 0;
-            INPUT1_TYPE weight_left = (INPUT1_TYPE) 0.f;
-            INPUT1_TYPE weight_right = (INPUT1_TYPE) 0.f;
-            INPUT1_TYPE weight_top = (INPUT1_TYPE) 0.f;
-            INPUT1_TYPE weight_bottom = (INPUT1_TYPE) 0.f;
+            INPUT1_TYPE weight_left = INPUT1_VAL_ZERO;
+            INPUT1_TYPE weight_right = INPUT1_VAL_ZERO;
+            INPUT1_TYPE weight_top = INPUT1_VAL_ZERO;
+            INPUT1_TYPE weight_bottom = INPUT1_VAL_ZERO;
             if (sample_x >= -1.0 || sample_x <= INPUT0_SIZE_X || sample_y >= -1.0 || sample_y <= INPUT0_SIZE_Y) {
-                sample_x = MAX(sample_x, (INPUT1_TYPE) 0.f);
-                sample_y = MAX(sample_y, (INPUT1_TYPE) 0.f);
+                sample_x = MAX(sample_x, INPUT1_VAL_ZERO);
+                sample_y = MAX(sample_y, INPUT1_VAL_ZERO);
 
-                sample_y_low = (unsigned int) sample_y;
-                sample_x_low = (unsigned int) sample_x;
+                sample_y_low = (unsigned int)sample_y;
+                sample_x_low = (unsigned int)sample_x;
 
                 if (sample_y_low >= INPUT0_SIZE_Y - 1) {
                     sample_y_high = sample_y_low = INPUT0_SIZE_Y - 1;
-                    sample_y = (INPUT1_TYPE) sample_y_low;
+                    sample_y = (INPUT1_TYPE)sample_y_low;
                 } else {
                     sample_y_high = sample_y_low + 1;
                 }
 
                 if (sample_x_low >= INPUT0_SIZE_X - 1) {
                     sample_x_high = sample_x_low = INPUT0_SIZE_X - 1;
-                    sample_x = (INPUT1_TYPE) sample_x_low;
+                    sample_x = (INPUT1_TYPE)sample_x_low;
                 } else {
                     sample_x_high = sample_x_low + 1;
                 }
 
                 // weight calculation for bilinear interpolation
-                weight_top = sample_y - (INPUT1_TYPE) sample_y_low;
-                weight_left = sample_x - (INPUT1_TYPE) sample_x_low;
-                weight_bottom = (INPUT1_TYPE) 1.f - weight_top;
-                weight_right = (INPUT1_TYPE) 1.f - weight_left;
+                weight_top = sample_y - (INPUT1_TYPE)sample_y_low;
+                weight_left = sample_x - (INPUT1_TYPE)sample_x_low;
+                weight_bottom = INPUT1_VAL_ONE - weight_top;
+                weight_right = INPUT1_VAL_ONE - weight_left;
             }
-            const INPUT0_TYPE top_left = data[sample_y_low * INPUT0_Y_PITCH + sample_x_low * INPUT0_X_PITCH];
-            const INPUT0_TYPE top_right = data[sample_y_low * INPUT0_Y_PITCH + sample_x_high * INPUT0_X_PITCH];
-            const INPUT0_TYPE bottom_left = data[sample_y_high * INPUT0_Y_PITCH + sample_x_low * INPUT0_X_PITCH];
-            const INPUT0_TYPE bottom_right = data[sample_y_high * INPUT0_Y_PITCH + sample_x_high * INPUT0_X_PITCH];
-            
-            const INPUT0_TYPE interpolated_value = weight_bottom * weight_right * top_left +
-                weight_bottom * weight_left * top_right +
-                weight_top * weight_right * bottom_left +
-                weight_top * weight_left * bottom_right;
+
+            const INPUT0_TYPE top_left = data[INPUT0_GET_INDEX(0, 0, sample_y_low, sample_x_low)];
+            const INPUT0_TYPE top_right = data[INPUT0_GET_INDEX(0, 0, sample_y_low, sample_x_high)];
+            const INPUT0_TYPE bottom_left = data[INPUT0_GET_INDEX(0, 0, sample_y_high, sample_x_low)];
+            const INPUT0_TYPE bottom_right = data[INPUT0_GET_INDEX(0, 0, sample_y_high, sample_x_high)];
+
+            const INPUT0_TYPE interpolated_value =
+                weight_bottom * weight_right * top_left + weight_bottom * weight_left * top_right +
+                weight_top * weight_right * bottom_left + weight_top * weight_left * bottom_right;
+
 #if MAX_POOL
             pooled_value = MAX(pooled_value, interpolated_value);
 #elif AVG_POOL
@@ -108,7 +112,7 @@ KERNEL(roi_align_ref)
 #if AVG_POOL
     pooled_value /= sampling_ratio_x * sampling_ratio_x;
 #endif
-    const uint output_offset = OUTPUT_OFFSET + x*OUTPUT_X_PITCH + y*OUTPUT_Y_PITCH + c*OUTPUT_FEATURE_PITCH + r*OUTPUT_BATCH_PITCH;
-    dst_data[output_offset] = ACTIVATION((OUTPUT_TYPE)pooled_value, ACTIVATION_PARAMS);
+
+    dst_data[OUTPUT_GET_INDEX(r, c, y, x)] = ACTIVATION((OUTPUT_TYPE)pooled_value, ACTIVATION_PARAMS);
 }
 
diff --git a/src/plugins/intel_gpu/src/plugin/ops/roi_align.cpp b/src/plugins/intel_gpu/src/plugin/ops/roi_align.cpp
index 416249fc949..567976e52c1 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/roi_align.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/roi_align.cpp
@@ -1,29 +1,44 @@
 // Copyright (C) 2018-2022 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-#include "intel_gpu/plugin/program.hpp"
-#include "intel_gpu/plugin/common_utils.hpp"
 #include "ngraph/op/roi_align.hpp"
-#include "intel_gpu/primitives/roi_align.hpp"
+
 #include <memory>
 
+#include "intel_gpu/plugin/common_utils.hpp"
+#include "intel_gpu/plugin/program.hpp"
+#include "intel_gpu/primitives/roi_align.hpp"
+
 namespace ov {
 namespace intel_gpu {
 
 namespace {
 
-cldnn::roi_align::PoolingMode from(ngraph::op::v3::ROIAlign::PoolingMode mode) {
+template <typename T>
+cldnn::roi_align::PoolingMode from(T mode) {
     switch (mode) {
-    case ngraph::op::v3::ROIAlign::PoolingMode::MAX:
-        return cldnn::roi_align::PoolingMode::Max;
-    case ngraph::op::v3::ROIAlign::PoolingMode::AVG:
+    case T::MAX:
+        return cldnn::roi_align::PoolingMode::max;
+    case T::AVG:
     default:
-        return cldnn::roi_align::PoolingMode::Avg;
+        return cldnn::roi_align::PoolingMode::avg;
+    }
+}
+
+cldnn::roi_align::AlignedMode from(ngraph::op::v9::ROIAlign::AlignedMode mode) {
+    switch (mode) {
+    case ngraph::op::v9::ROIAlign::AlignedMode::HALF_PIXEL_FOR_NN:
+        return cldnn::roi_align::AlignedMode::half_pixel_for_nn;
+    case ngraph::op::v9::ROIAlign::AlignedMode::HALF_PIXEL:
+        return cldnn::roi_align::AlignedMode::half_pixel;
+    case ngraph::op::v9::ROIAlign::AlignedMode::ASYMMETRIC:
+    default:
+        return cldnn::roi_align::AlignedMode::asymmetric;
     }
 }
 
 void CreateROIAlignOp(Program& p, const std::shared_ptr<ngraph::op::v3::ROIAlign>& op) {
-    p.ValidateInputs(op, { 3 });
+    p.ValidateInputs(op, {3});
     auto roi_align_prim = cldnn::roi_align(layer_type_name_ID(op),
                                            p.GetInputPrimitiveIDs(op),
                                            op->get_pooled_h(),
@@ -31,14 +46,31 @@ void CreateROIAlignOp(Program& p, const std::shared_ptr<ngraph::op::v3::ROIAlign
                                            op->get_sampling_ratio(),
                                            op->get_spatial_scale(),
                                            from(op->get_mode()),
+                                           cldnn::roi_align::AlignedMode::asymmetric,
                                            op->get_friendly_name());
     p.AddPrimitive(roi_align_prim);
     p.AddPrimitiveToProfiler(op);
 }
 
-} // anonymous namespace
+void CreateROIAlignOp(Program& p, const std::shared_ptr<ngraph::op::v9::ROIAlign>& op) {
+    p.ValidateInputs(op, {3});
+    auto roi_align_prim = cldnn::roi_align(layer_type_name_ID(op),
+                                           p.GetInputPrimitiveIDs(op),
+                                           op->get_pooled_h(),
+                                           op->get_pooled_w(),
+                                           op->get_sampling_ratio(),
+                                           op->get_spatial_scale(),
+                                           from(op->get_mode()),
+                                           from(op->get_aligned_mode()),
+                                           op->get_friendly_name());
+    p.AddPrimitive(roi_align_prim);
+    p.AddPrimitiveToProfiler(op);
+}
+
+}  // anonymous namespace
 
 REGISTER_FACTORY_IMPL(v3, ROIAlign);
+REGISTER_FACTORY_IMPL(v9, ROIAlign);
 
 }  // namespace intel_gpu
 }  // namespace ov
diff --git a/src/plugins/intel_gpu/tests/test_cases/roi_align_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/roi_align_gpu_test.cpp
new file mode 100644
index 00000000000..ebebe27c124
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/test_cases/roi_align_gpu_test.cpp
@@ -0,0 +1,178 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <intel_gpu/primitives/data.hpp>
+#include <intel_gpu/primitives/input_layout.hpp>
+#include <intel_gpu/primitives/roi_align.hpp>
+
+#include "test_utils.h"
+
+using namespace cldnn;
+using namespace ::tests;
+
+template <typename TD, typename TI, cldnn::format::type fmt>
+struct TypesWithFormat {
+    using DataType = TD;
+    using IndexType = TI;
+    static const cldnn::format::type format = fmt;
+};
+
+template <typename Types>
+struct roi_align_test : public testing::Test {
+    using TD = typename Types::DataType;
+    using TI = typename Types::IndexType;
+    const data_types device_data_type = type_to_data_type<TD>::value;
+    const data_types device_ind_type = type_to_data_type<TI>::value;
+    const cldnn::format::type blocked_format = Types::format;
+    const cldnn::format::type plain_format = format::bfyx;
+
+    const int pooled_h{2};
+    const int pooled_w{2};
+    const int sampling_ratio{2};
+    const float spatial_scale{1};
+
+    const std::vector<TD> input_data = {
+        TD(0.f),  TD(1.f),  TD(8.f),  TD(5.f),  TD(5.f), TD(2.f),  TD(0.f), TD(7.f),  TD(7.f),  TD(10.f), TD(4.f),
+        TD(5.f),  TD(9.f),  TD(0.f),  TD(0.f),  TD(5.f), TD(7.f),  TD(0.f), TD(4.f),  TD(0.f),  TD(4.f),  TD(7.f),
+        TD(6.f),  TD(10.f), TD(9.f),  TD(5.f),  TD(1.f), TD(7.f),  TD(4.f), TD(7.f),  TD(10.f), TD(8.f),  TD(2.f),
+        TD(0.f),  TD(8.f),  TD(3.f),  TD(6.f),  TD(8.f), TD(10.f), TD(4.f), TD(2.f),  TD(10.f), TD(7.f),  TD(8.f),
+        TD(7.f),  TD(0.f),  TD(6.f),  TD(9.f),  TD(2.f), TD(4.f),  TD(8.f), TD(5.f),  TD(2.f),  TD(3.f),  TD(3.f),
+        TD(1.f),  TD(5.f),  TD(9.f),  TD(10.f), TD(0.f), TD(9.f),  TD(5.f), TD(5.f),  TD(3.f),  TD(10.f), TD(5.f),
+        TD(2.f),  TD(0.f),  TD(10.f), TD(0.f),  TD(5.f), TD(4.f),  TD(3.f), TD(10.f), TD(5.f),  TD(5.f),  TD(10.f),
+        TD(0.f),  TD(8.f),  TD(8.f),  TD(9.f),  TD(1.f), TD(0.f),  TD(7.f), TD(9.f),  TD(6.f),  TD(8.f),  TD(7.f),
+        TD(10.f), TD(9.f),  TD(2.f),  TD(3.f),  TD(3.f), TD(5.f),  TD(6.f), TD(9.f),  TD(4.f),  TD(9.f),  TD(2.f),
+        TD(4.f),  TD(5.f),  TD(5.f),  TD(3.f),  TD(1.f), TD(1.f),  TD(6.f), TD(8.f),  TD(0.f),  TD(5.f),  TD(5.f),
+        TD(10.f), TD(8.f),  TD(6.f),  TD(9.f),  TD(6.f), TD(9.f),  TD(1.f), TD(2.f),  TD(7.f),  TD(1.f),  TD(1.f),
+        TD(3.f),  TD(0.f),  TD(4.f),  TD(0.f),  TD(7.f), TD(10.f), TD(2.f)};
+    const std::vector<TD> coords_data = {TD(2.f), TD(2.f), TD(4.f), TD(4.f), TD(2.f), TD(2.f), TD(4.f), TD(4.f)};
+    const std::vector<TI> roi_data = {0, 1};
+
+    const layout input_lt = layout(device_data_type, plain_format, {2, 1, 8, 8});
+    const layout coords_lt = layout(device_data_type, plain_format, {2, 4, 1, 1});
+    const layout roi_lt = layout(device_ind_type, plain_format, {2, 1, 1, 1});
+
+    memory::ptr get_memory(engine& engine, const layout& lt, const std::vector<TD>& data) const {
+        auto mem = engine.allocate_memory(lt);
+        tests::set_values(mem, data);
+        return mem;
+    }
+
+    memory::ptr get_roi_memory(engine& engine) const {
+        auto mem = engine.allocate_memory(roi_lt);
+        tests::set_values(mem, roi_data);
+        return mem;
+    }
+
+    void execute(const std::vector<TD>& expected_output,
+                 roi_align::PoolingMode pooling_mode,
+                 roi_align::AlignedMode aligned_mode) const {
+        auto& engine = get_test_engine();
+
+        auto input = get_memory(engine, input_lt, input_data);
+        auto coords = get_memory(engine, coords_lt, coords_data);
+        auto roi_ind = get_roi_memory(engine);
+
+        topology topology;
+        topology.add(input_layout("input", input->get_layout()));
+        topology.add(input_layout("coords", coords->get_layout()));
+        topology.add(input_layout("roi_ind", roi_ind->get_layout()));
+        topology.add(reorder("reorder_input", "input", blocked_format, device_data_type));
+        topology.add(reorder("reorder_coords", "coords", blocked_format, device_data_type));
+        topology.add(reorder("reorder_ind", "roi_ind", blocked_format, device_ind_type));
+        topology.add(roi_align("roi_align",
+                               {"reorder_input", "reorder_coords", "reorder_ind"},
+                               pooled_h,
+                               pooled_w,
+                               sampling_ratio,
+                               spatial_scale,
+                               pooling_mode,
+                               aligned_mode));
+        topology.add(reorder("out", "roi_align", plain_format, device_data_type));
+
+        network network(engine, topology);
+        network.set_input_data("input", input);
+        network.set_input_data("coords", coords);
+        network.set_input_data("roi_ind", roi_ind);
+
+        auto outputs = network.execute();
+
+        auto output = outputs.at("out").get_memory();
+        cldnn::mem_lock<TD> output_ptr(output, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), expected_output.size());
+        for (uint32_t i = 0; i < expected_output.size(); ++i) {
+            EXPECT_NEAR(output_ptr[i], expected_output[i], 0.01);
+        }
+    }
+};
+
+// it's a bit overloaded with the cartesian product of types and formats, but that's the lesser evil
+// since we have specific type for expected values that are tied to specific input modes
+// so that Combine approach could avoid manual combinations but it would be much more complicated
+using roi_align_test_types = testing::Types<TypesWithFormat<half_t, uint8_t, format::bfyx>,
+                                            TypesWithFormat<half_t, uint8_t, format::b_fs_yx_fsv16>,
+                                            TypesWithFormat<half_t, uint8_t, format::b_fs_yx_fsv32>,
+                                            TypesWithFormat<half_t, uint8_t, format::bs_fs_yx_bsv16_fsv16>,
+                                            TypesWithFormat<half_t, uint8_t, format::bs_fs_yx_bsv32_fsv16>,
+                                            TypesWithFormat<half_t, uint8_t, format::bs_fs_yx_bsv32_fsv32>,
+
+                                            TypesWithFormat<half_t, int8_t, format::bfyx>,
+                                            TypesWithFormat<half_t, int8_t, format::b_fs_yx_fsv16>,
+                                            TypesWithFormat<half_t, int8_t, format::b_fs_yx_fsv32>,
+                                            TypesWithFormat<half_t, int8_t, format::bs_fs_yx_bsv16_fsv16>,
+                                            TypesWithFormat<half_t, int8_t, format::bs_fs_yx_bsv32_fsv16>,
+                                            TypesWithFormat<half_t, int8_t, format::bs_fs_yx_bsv32_fsv32>,
+
+                                            TypesWithFormat<half_t, int32_t, format::bfyx>,
+                                            TypesWithFormat<half_t, int32_t, format::b_fs_yx_fsv16>,
+                                            TypesWithFormat<half_t, int32_t, format::b_fs_yx_fsv32>,
+                                            TypesWithFormat<half_t, int32_t, format::bs_fs_yx_bsv16_fsv16>,
+                                            TypesWithFormat<half_t, int32_t, format::bs_fs_yx_bsv32_fsv16>,
+                                            TypesWithFormat<half_t, int32_t, format::bs_fs_yx_bsv32_fsv32>,
+
+                                            TypesWithFormat<float, uint8_t, format::bfyx>,
+                                            TypesWithFormat<float, uint8_t, format::b_fs_yx_fsv16>,
+                                            TypesWithFormat<float, uint8_t, format::b_fs_yx_fsv32>,
+                                            TypesWithFormat<float, uint8_t, format::bs_fs_yx_bsv16_fsv16>,
+                                            TypesWithFormat<float, uint8_t, format::bs_fs_yx_bsv32_fsv16>,
+                                            TypesWithFormat<float, uint8_t, format::bs_fs_yx_bsv32_fsv32>,
+
+                                            TypesWithFormat<float, int8_t, format::bfyx>,
+                                            TypesWithFormat<float, int8_t, format::b_fs_yx_fsv16>,
+                                            TypesWithFormat<float, int8_t, format::b_fs_yx_fsv32>,
+                                            TypesWithFormat<float, int8_t, format::bs_fs_yx_bsv16_fsv16>,
+                                            TypesWithFormat<float, int8_t, format::bs_fs_yx_bsv32_fsv16>,
+                                            TypesWithFormat<float, int8_t, format::bs_fs_yx_bsv32_fsv32>,
+
+                                            TypesWithFormat<float, int32_t, format::bfyx>,
+                                            TypesWithFormat<float, int32_t, format::b_fs_yx_fsv16>,
+                                            TypesWithFormat<float, int32_t, format::b_fs_yx_fsv32>,
+                                            TypesWithFormat<float, int32_t, format::bs_fs_yx_bsv16_fsv16>,
+                                            TypesWithFormat<float, int32_t, format::bs_fs_yx_bsv32_fsv16>,
+                                            TypesWithFormat<float, int32_t, format::bs_fs_yx_bsv32_fsv32>>;
+
+TYPED_TEST_SUITE(roi_align_test, roi_align_test_types);
+
+TYPED_TEST(roi_align_test, avg_asymmetric) {
+    using TD = typename TypeParam::DataType;
+    const std::vector<TD>
+        expected_output{TD(3.f), TD(3.75f), TD(4.75f), TD(5.f), TD(3.f), TD(5.5f), TD(2.75f), TD(3.75f)};
+    this->execute(expected_output, roi_align::PoolingMode::avg, roi_align::AlignedMode::asymmetric);
+}
+
+TYPED_TEST(roi_align_test, avg_half_pixel_for_nn) {
+    using TD = typename TypeParam::DataType;
+    const std::vector<TD> expected_output =
+        {TD(3.14f), TD(2.16f), TD(2.86f), TD(5.03f), TD(1.83f), TD(5.84f), TD(2.77f), TD(3.44f)};
+    this->execute(expected_output, roi_align::PoolingMode::avg, roi_align::AlignedMode::half_pixel_for_nn);
+}
+
+TYPED_TEST(roi_align_test, max_half_pixel) {
+    using TD = typename TypeParam::DataType;
+    const std::vector<TD> expected_output =
+        {TD(4.375f), TD(4.9375f), TD(5.6875f), TD(5.625f), TD(4.625f), TD(7.125f), TD(3.3125f), TD(4.3125f)};
+    this->execute(expected_output, roi_align::PoolingMode::max, roi_align::AlignedMode::half_pixel);
+}
diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/roi_align.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/roi_align.cpp
index ca39592bde6..217614d5c92 100644
--- a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/roi_align.cpp
+++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/roi_align.cpp
@@ -1,16 +1,16 @@
 // Copyright (C) 2018-2022 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
+#include "single_layer_tests/roi_align.hpp"
+
 #include <vector>
 
-#include "single_layer_tests/roi_align.hpp"
 #include "common_test_utils/test_constants.hpp"
 
 using namespace LayerTestsDefinitions;
 
-
 const std::vector<InferenceEngine::Precision> netPRCs = {
-    InferenceEngine::Precision::FP32,
+    InferenceEngine::Precision::FP32
     // There is no possibility to test ROIAlign in fp16 precision,
     // because on edge cases where in fp32 version ROI value is
     // a little bit smaller than the nearest integer value,
@@ -19,44 +19,88 @@ const std::vector<InferenceEngine::Precision> netPRCs = {
     // in fp32 and fp16 precisions.
     // In real AI applications this problem is solved by precision-aware training.
 
-    // InferenceEngine::Precision::FP16,
+    // InferenceEngine::Precision::FP16
 };
 
-const auto ROIAlignCases_average =
-    ::testing::Combine(
-                       ::testing::ValuesIn(
-                                           std::vector<std::vector<size_t>> {
-                                               { 3, 8, 16, 16 },
-                                               { 2, 1, 16, 16 },
-                                               { 2, 1, 8, 16 }}),
-                       ::testing::Values(std::vector<size_t>{ 2, 4 }),
-                       ::testing::Values(2),
-                       ::testing::Values(2),
-                       ::testing::ValuesIn(std::vector<float> { 1, 0.625 }),
-                       ::testing::Values(2),
-                       ::testing::Values("avg"),
-                       ::testing::ValuesIn(netPRCs),
-                       ::testing::Values(CommonTestUtils::DEVICE_GPU)
-);
+const auto ROIAlignCases_average = ::testing::Combine(
+    ::testing::ValuesIn(std::vector<std::vector<size_t>>{{3, 8, 16, 16}, {2, 1, 16, 16}, {2, 1, 8, 16}}),
+    ::testing::Values(std::vector<size_t>{2, 4}),
+    ::testing::Values(2),
+    ::testing::Values(2),
+    ::testing::ValuesIn(std::vector<float>{1, 0.625}),
+    ::testing::Values(2),
+    ::testing::Values("avg"),
+    ::testing::ValuesIn(netPRCs),
+    ::testing::Values(CommonTestUtils::DEVICE_GPU));
 
-INSTANTIATE_TEST_SUITE_P(smoke_TestsROIAlign_average, ROIAlignLayerTest, ROIAlignCases_average, ROIAlignLayerTest::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_TestsROIAlign_average,
+                         ROIAlignLayerTest,
+                         ROIAlignCases_average,
+                         ROIAlignLayerTest::getTestCaseName);
 
-const auto ROIAlignCases_max =
-    ::testing::Combine(
-                       ::testing::ValuesIn(
-                                           std::vector<std::vector<size_t>> {
-                                               { 2, 8, 20, 20 },
-                                               { 2, 1, 20, 20 },
-                                               { 2, 1, 10, 20 }
-                                           }),
-                       ::testing::Values(std::vector<size_t>{ 2, 4 }),
-                       ::testing::Values(2),
-                       ::testing::Values(2),
-                       ::testing::ValuesIn(std::vector<float> { 1, 0.625 }),
-                       ::testing::Values(2),
-                       ::testing::Values("max"),
-                       ::testing::ValuesIn(netPRCs),
-                       ::testing::Values(CommonTestUtils::DEVICE_GPU)
-);
+const auto ROIAlignCases_max = ::testing::Combine(
+    ::testing::ValuesIn(std::vector<std::vector<size_t>>{{2, 8, 20, 20}, {2, 1, 20, 20}, {2, 1, 10, 20}}),
+    ::testing::Values(std::vector<size_t>{2, 4}),
+    ::testing::Values(2),
+    ::testing::Values(2),
+    ::testing::ValuesIn(std::vector<float>{1, 0.625}),
+    ::testing::Values(2),
+    ::testing::Values("max"),
+    ::testing::ValuesIn(netPRCs),
+    ::testing::Values(CommonTestUtils::DEVICE_GPU));
 
-INSTANTIATE_TEST_SUITE_P(smoke_TestsROIAlign_max, ROIAlignLayerTest, ROIAlignCases_max, ROIAlignLayerTest::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_TestsROIAlign_max,
+                         ROIAlignLayerTest,
+                         ROIAlignCases_max,
+                         ROIAlignLayerTest::getTestCaseName);
+
+const auto ROIAlignCases_average_asymmetric = ::testing::Combine(
+    ::testing::ValuesIn(std::vector<std::vector<size_t>>{{2, 1, 8, 8}, {2, 8, 20, 20}, {2, 1, 20, 20}, {2, 1, 10, 20}}),
+    ::testing::Values(std::vector<size_t>{2, 4}),
+    ::testing::Values(2),
+    ::testing::Values(2),
+    ::testing::ValuesIn(std::vector<float>{1, 0.625}),
+    ::testing::Values(2),
+    ::testing::Values("avg"),
+    ::testing::Values("asymmetric"),
+    ::testing::ValuesIn(netPRCs),
+    ::testing::Values(CommonTestUtils::DEVICE_GPU));
+
+INSTANTIATE_TEST_SUITE_P(smoke_TestsROIAlign_avg_asym,
+                         ROIAlignV9LayerTest,
+                         ROIAlignCases_average_asymmetric,
+                         ROIAlignV9LayerTest::getTestCaseName);
+
+const auto ROIAlignCases_average_half_pixel_for_nn = ::testing::Combine(
+    ::testing::ValuesIn(std::vector<std::vector<size_t>>{{2, 1, 8, 8}, {2, 8, 20, 20}, {2, 1, 20, 20}, {2, 1, 10, 20}}),
+    ::testing::Values(std::vector<size_t>{2, 4}),
+    ::testing::Values(2),
+    ::testing::Values(2),
+    ::testing::ValuesIn(std::vector<float>{1, 0.625}),
+    ::testing::Values(2),
+    ::testing::Values("avg"),
+    ::testing::Values("half_pixel_for_nn"),
+    ::testing::ValuesIn(netPRCs),
+    ::testing::Values(CommonTestUtils::DEVICE_GPU));
+
+INSTANTIATE_TEST_SUITE_P(smoke_TestsROIAlign_avg_hpfn,
+                         ROIAlignV9LayerTest,
+                         ROIAlignCases_average_half_pixel_for_nn,
+                         ROIAlignV9LayerTest::getTestCaseName);
+
+const auto ROIAlignCases_maximum_half_pixel = ::testing::Combine(
+    ::testing::ValuesIn(std::vector<std::vector<size_t>>{{2, 1, 8, 8}, {2, 8, 20, 20}, {2, 1, 20, 20}, {2, 1, 10, 20}}),
+    ::testing::Values(std::vector<size_t>{2, 4}),
+    ::testing::Values(2),
+    ::testing::Values(2),
+    ::testing::ValuesIn(std::vector<float>{1, 0.625}),
+    ::testing::Values(2),
+    ::testing::Values("max"),
+    ::testing::Values("half_pixel"),
+    ::testing::ValuesIn(netPRCs),
+    ::testing::Values(CommonTestUtils::DEVICE_GPU));
+
+INSTANTIATE_TEST_SUITE_P(smoke_TestsROIAlign_max_hp,
+                         ROIAlignV9LayerTest,
+                         ROIAlignCases_maximum_half_pixel,
+                         ROIAlignV9LayerTest::getTestCaseName);
diff --git a/src/tests/functional/plugin/shared/include/single_layer_tests/roi_align.hpp b/src/tests/functional/plugin/shared/include/single_layer_tests/roi_align.hpp
index bf9cfeb92d7..a8eecb0ca98 100644
--- a/src/tests/functional/plugin/shared/include/single_layer_tests/roi_align.hpp
+++ b/src/tests/functional/plugin/shared/include/single_layer_tests/roi_align.hpp
@@ -12,4 +12,8 @@ TEST_P(ROIAlignLayerTest, CompareWithRefs) {
     Run();
 }
 
+TEST_P(ROIAlignV9LayerTest, CompareWithRefs) {
+    Run();
+}
+
 }  // namespace LayerTestsDefinitions
diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/roi_align.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/roi_align.hpp
index 5d61726c001..6c1f76d8f63 100644
--- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/roi_align.hpp
+++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/roi_align.hpp
@@ -37,4 +37,32 @@ private:
     std::string poolingMode;
 };
 
+using roialignV9Params = std::tuple<std::vector<size_t>,             // feature map shape
+                                    std::vector<size_t>,             // proposal coords shape
+                                    int,                             // bin's row count
+                                    int,                             // bin's column count
+                                    float,                           // spatial scale
+                                    int,                             // pooling ratio
+                                    std::string,                     // pooling mode
+                                    std::string,                     // roi aligned mode
+                                    InferenceEngine::Precision,      // net precision
+                                    LayerTestsUtils::TargetDevice>;  // device name
+
+class ROIAlignV9LayerTest : public testing::WithParamInterface<roialignV9Params>,
+                            virtual public LayerTestsUtils::LayerTestsCommon {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<roialignV9Params>& obj);
+
+protected:
+    void SetUp() override;
+
+private:
+    int pooledH;
+    int pooledW;
+    float spatialScale;
+    int poolingRatio;
+    std::string poolingMode;
+    std::string roiAlignedMode;
+};
+
 }  // namespace LayerTestsDefinitions
diff --git a/src/tests/functional/shared_test_classes/src/single_layer/roi_align.cpp b/src/tests/functional/shared_test_classes/src/single_layer/roi_align.cpp
index df0b3dd6340..0acb6e4c29c 100644
--- a/src/tests/functional/shared_test_classes/src/single_layer/roi_align.cpp
+++ b/src/tests/functional/shared_test_classes/src/single_layer/roi_align.cpp
@@ -2,10 +2,13 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include "shared_test_classes/single_layer/roi_align.hpp"
+
 #include <ngraph/opsets/opset3.hpp>
+#include <ngraph/opsets/opset9.hpp>
 
 #include "ngraph_functions/builders.hpp"
-#include "shared_test_classes/single_layer/roi_align.hpp"
+#include "openvino/core/enum_names.hpp"
 
 using namespace InferenceEngine;
 using namespace FuncTestUtils::PrecisionUtils;
@@ -77,36 +80,131 @@ void ROIAlignLayerTest::SetUp() {
     std::vector<size_t> inputShape;
     std::vector<size_t> coordsShape;
     InferenceEngine::Precision netPrecision;
-    std::tie(inputShape, coordsShape, pooledH, pooledW,
-             spatialScale, poolingRatio, poolingMode, netPrecision, targetDevice) = this->GetParam();
+    std::tie(inputShape,
+             coordsShape,
+             pooledH,
+             pooledW,
+             spatialScale,
+             poolingRatio,
+             poolingMode,
+             netPrecision,
+             targetDevice) = this->GetParam();
 
     auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
     auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
-    auto paramOuts = ngraph::helpers::convert2OutputVector(
-            ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
+    auto paramOuts =
+        ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
     std::vector<float> proposalVector;
     std::vector<int> roiIdxVector;
     proposalVector.resize(coordsShape[0] * 4);
     roiIdxVector.resize(coordsShape[0]);
 
-    fillCoordTensor(proposalVector, inputShape[2], inputShape[3],
-            spatialScale, poolingRatio, pooledH, pooledW);
+    fillCoordTensor(proposalVector, inputShape[2], inputShape[3], spatialScale, poolingRatio, pooledH, pooledW);
     fillIdxTensor(roiIdxVector, inputShape[0]);
-    ngraph::Shape idxShape = { coordsShape[0] };
+    ngraph::Shape idxShape = {coordsShape[0]};
 
     auto coords = std::make_shared<ngraph::opset1::Constant>(ngPrc, coordsShape, proposalVector.data());
     auto roisIdx = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i32, idxShape, roiIdxVector.data());
 
-    std::shared_ptr<ngraph::Node> roiAlign =
-            std::make_shared<ngraph::opset3::ROIAlign>(paramOuts[0],
-                                                    coords,
-                                                    roisIdx,
-                                                    pooledH,
-                                                    pooledW,
-                                                    poolingRatio,
-                                                    spatialScale,
-                                                    poolingMode);
+    std::shared_ptr<ngraph::Node> roiAlign = std::make_shared<ngraph::opset3::ROIAlign>(paramOuts[0],
+                                                                                        coords,
+                                                                                        roisIdx,
+                                                                                        pooledH,
+                                                                                        pooledW,
+                                                                                        poolingRatio,
+                                                                                        spatialScale,
+                                                                                        poolingMode);
     ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(roiAlign)};
     function = std::make_shared<ngraph::Function>(results, params, "roi_align");
 }
+
+std::string ROIAlignV9LayerTest::getTestCaseName(const testing::TestParamInfo<roialignV9Params>& obj) {
+    std::vector<size_t> inputShape;
+    std::vector<size_t> coordsShape;
+
+    int pooledH;
+    int pooledW;
+    float spatialScale;
+    int poolingRatio;
+    std::string poolingMode;
+    std::string roiAlignedMode;
+    InferenceEngine::Precision netPrecision;
+    std::string targetDevice;
+    std::tie(inputShape,
+             coordsShape,
+             pooledH,
+             pooledW,
+             spatialScale,
+             poolingRatio,
+             poolingMode,
+             roiAlignedMode,
+             netPrecision,
+             targetDevice) = obj.param;
+
+    std::ostringstream result;
+
+    result << "in_shape=" << CommonTestUtils::vec2str(inputShape) << "_";
+    result << "coord_shape=" << CommonTestUtils::vec2str(coordsShape) << "_";
+    result << "pooled_h=" << pooledH << "_";
+    result << "pooled_w=" << pooledW << "_";
+    result << "spatial_scale=" << spatialScale << "_";
+    result << "pooling_ratio=" << poolingRatio << "_";
+    result << "mode=" << poolingMode << "_";
+    result << "mode=" << roiAlignedMode << "_";
+    result << "prec=" << netPrecision.name() << "_";
+    result << "dev=" << targetDevice;
+    return result.str();
+}
+
+void ROIAlignV9LayerTest::SetUp() {
+    std::vector<size_t> inputShape;
+    std::vector<size_t> coordsShape;
+    InferenceEngine::Precision netPrecision;
+    std::tie(inputShape,
+             coordsShape,
+             pooledH,
+             pooledW,
+             spatialScale,
+             poolingRatio,
+             poolingMode,
+             roiAlignedMode,
+             netPrecision,
+             targetDevice) = this->GetParam();
+
+    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+    auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+    auto paramOuts =
+        ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
+    std::vector<float> proposalVector;
+    std::vector<int> roiIdxVector;
+    proposalVector.resize(coordsShape[0] * 4);
+    roiIdxVector.resize(coordsShape[0]);
+
+    ROIAlignLayerTest::fillCoordTensor(proposalVector,
+                                       inputShape[2],
+                                       inputShape[3],
+                                       spatialScale,
+                                       poolingRatio,
+                                       pooledH,
+                                       pooledW);
+    ROIAlignLayerTest::fillIdxTensor(roiIdxVector, inputShape[0]);
+    ngraph::Shape idxShape = {coordsShape[0]};
+
+    auto coords = std::make_shared<ngraph::opset1::Constant>(ngPrc, coordsShape, proposalVector.data());
+    auto roisIdx = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i32, idxShape, roiIdxVector.data());
+
+    std::shared_ptr<ngraph::Node> roiAlign = std::make_shared<ngraph::opset9::ROIAlign>(
+            paramOuts[0],
+            coords,
+            roisIdx,
+            pooledH,
+            pooledW,
+            poolingRatio,
+            spatialScale,
+            ov::EnumNames<ngraph::opset9::ROIAlign::PoolingMode>::as_enum(poolingMode),
+            ov::EnumNames<ngraph::opset9::ROIAlign::AlignedMode>::as_enum(roiAlignedMode));
+
+    ngraph::ResultVector results{std::make_shared<ngraph::opset9::Result>(roiAlign)};
+    function = std::make_shared<ngraph::Function>(results, params, "roi_align");
+}
 }  // namespace LayerTestsDefinitions