[Core] Conv reference optimization. (#17303)

2023-05-17 15:54:33 +04:00
parent db58355fad
commit a6b043b1ca
8 changed files with 331 additions and 120 deletions
--- a/src/core/include/openvino/op/proposal.hpp
+++ b/src/core/include/openvino/op/proposal.hpp
@@ -78,7 +78,7 @@ namespace v4 {
 /// \ingroup ov_ops_cpp_api
 class OPENVINO_API Proposal : public op::v0::Proposal {
 public:
-    OPENVINO_OP("Proposal", "opset4", op::Op);
+    OPENVINO_OP("Proposal", "opset4", op::v0::Proposal);
    Proposal() = default;
    /// \brief Constructs a Proposal operation
    ///
--- a/src/core/reference/include/ngraph/runtime/reference/convolution.hpp
+++ b/src/core/reference/include/ngraph/runtime/reference/convolution.hpp
@@ -4,31 +4,21 @@

 #pragma once

-#include <cassert>
-#include <cfenv>
-#include <cmath>
-#include <functional>
-#include <numeric>
+#include <future>

-#include "ngraph/axis_vector.hpp"
-#include "ngraph/coordinate_transform.hpp"
-#include "ngraph/runtime/reference/concat.hpp"
-#include "ngraph/runtime/reference/helpers.hpp"
-#include "ngraph/runtime/reference/reverse.hpp"
-#include "ngraph/runtime/reference/split.hpp"
 #include "ngraph/util.hpp"

 namespace ngraph {
 namespace runtime {
 namespace reference {
 namespace {
+
 constexpr size_t in_batch_axis = 0;
 constexpr size_t in_channel_axis = 1;
 constexpr size_t filter_out_ch_axis = 0;
 constexpr size_t filter_in_ch_axis = 1;
 constexpr size_t out_batch_axis = 0;
 constexpr size_t out_channel_axis = 1;
-constexpr size_t spatial_axis = 2;

 struct ConvolutionParams {
    std::vector<int64_t> strides;
@@ -54,65 +44,164 @@ constexpr inline bool in_range(Int val, std::pair<Int, Int> range) noexcept {
    return val >= range.first && val < range.second;
 }

+template <typename T>  // accumulates input * filter over all channels for every window position
+void convolve_2D_channels(const ConvolutionParams& p,  // dilation, pads, strides, output_padding: [0] = y, [1] = x
+                          const T* batch,              // input values; channel planes are input_size_yx apart
+                          const Shape& batch_shape,    // [1], [2] are read as the y, x sizes of one plane
+                          const T* filter,             // weights, consumed strictly in storage order
+                          const Shape& filter_shape,   // [0] = channel count, [1], [2] = y, x filter sizes
+                          T* out) {                    // one value is stored per window position, x fastest
+    const int dilation_y = static_cast<int>(p.dilation[0]);
+    const int dilation_x = static_cast<int>(p.dilation[1]);
+
+    const int pad_begin_y = static_cast<int>(p.pads_begin[0]);
+    const int pad_begin_x = static_cast<int>(p.pads_begin[1]);
+
+    const int stride_y = static_cast<int>(p.strides[0]);
+    const int stride_x = static_cast<int>(p.strides[1]);
+
+    const int input_size_y = static_cast<int>(batch_shape[1]);
+    const int input_size_x = static_cast<int>(batch_shape[2]);
+
+    const int input_size_yx = input_size_y * input_size_x;  // element count of one input channel plane
+
+    const size_t f_channels = filter_shape[0];
+    const int filter_size_y = static_cast<int>(filter_shape[1]);
+    const int filter_size_x = static_cast<int>(filter_shape[2]);
+
+    const int dilated_filter_size_y = static_cast<int>(filter_size_y + (filter_size_y - 1) * (dilation_y - 1));
+    const int dilated_filter_size_x = static_cast<int>(filter_size_x + (filter_size_x - 1) * (dilation_x - 1));
+
+    const int i_y_lim = static_cast<int>(p.pads_end[0] + input_size_y - dilated_filter_size_y + p.output_padding[0]);
+    const int i_x_lim = static_cast<int>(p.pads_end[1] + input_size_x - dilated_filter_size_x + p.output_padding[1]);
+
+    const int f_y_increment = dilation_y * input_size_x;  // flat-index step between two sampled input rows
+    const int f_x_increment = dilation_x;                 // flat-index step between two sampled input columns
+
+    const int f_y_block = filter_size_y * f_y_increment;  // flat-index span covered by all filter rows
+    const int f_x_block = filter_size_x * f_x_increment;  // flat-index span covered by all filter columns
+
+    for (int i_y = -pad_begin_y; i_y <= i_y_lim; i_y += stride_y) {  // window origin row; negative inside begin padding
+        const int i_y_m = i_y * input_size_x;     // flat offset of the window's first row
+        const int f_y_up_lim = f_y_block + i_y_m;  // exclusive end of the row offsets visited below
+
+        for (int i_x = -pad_begin_x; i_x <= i_x_lim; i_x += stride_x) {  // window origin column
+            const int f_x_up_lim = f_x_block + i_x;
+            auto input_channel = batch;    // rewound to the first channel for every window position
+            auto filter_channel = filter;  // rewound to the first weight for every window position
+            T sum = 0;
+            size_t filter_channels_count = f_channels;
+
+            while (filter_channels_count--) {
+                for (int f_y_i = i_y_m; f_y_i < f_y_up_lim; f_y_i += f_y_increment) {  // f_y_i: flat offset of a row start
+                    if (f_y_i < 0 || f_y_i >= input_size_yx) {  // the sampled row lies in the padding
+                        filter_channel += filter_size_x;  // skip that filter row so weights stay aligned
+                        continue;
+                    }
+                    const int x_up_bound = input_size_x + f_y_i;  // one past the last element of this input row
+                    for (int f_x_i = f_y_i + i_x; f_x_i < f_x_up_lim + f_y_i;
+                         f_x_i += f_x_increment, filter_channel++) {  // the weight pointer advances even on `continue`
+                        if (f_x_i < f_y_i || f_x_i >= x_up_bound) {  // the sampled column lies in the padding
+                            continue;
+                        }
+
+                        sum += input_channel[f_x_i] * filter_channel[0];
+                    }
+                }
+                input_channel += input_size_yx;  // next input plane; filter_channel already sits on the next channel
+            }
+            *out = sum;
+            ++out;  // `out` is a by-value copy, so the caller's pointer is not advanced
+        }
+    }
+}
+
 template <typename T>
 void convolve_3D_channels(const ConvolutionParams& p,
                          const T* batch,
                          const Shape& batch_shape,
                          const T* filter,
                          const Shape& filter_shape,
-                          T*& out) {
+                          T* out) {
+    const int dilation_z = static_cast<int>(p.dilation[0]);
+    const int dilation_y = static_cast<int>(p.dilation[1]);
+    const int dilation_x = static_cast<int>(p.dilation[2]);
+
+    const int pad_begin_z = static_cast<int>(p.pads_begin[0]);
+    const int pad_begin_y = static_cast<int>(p.pads_begin[1]);
+    const int pad_begin_x = static_cast<int>(p.pads_begin[2]);
+
+    const int stride_z = static_cast<int>(p.strides[0]);
+    const int stride_y = static_cast<int>(p.strides[1]);
+    const int stride_x = static_cast<int>(p.strides[2]);
+
    const int input_size_z = static_cast<int>(batch_shape[1]);
    const int input_size_y = static_cast<int>(batch_shape[2]);
    const int input_size_x = static_cast<int>(batch_shape[3]);
+
+    const int input_size_yx = input_size_y * input_size_x;
+    const int input_size_zyx = input_size_z * input_size_yx;
+
+    const size_t f_channels = filter_shape[0];
    const int filter_size_z = static_cast<int>(filter_shape[1]);
    const int filter_size_y = static_cast<int>(filter_shape[2]);
    const int filter_size_x = static_cast<int>(filter_shape[3]);
-    const int dilated_filter_size_z = static_cast<int>(filter_size_z + (filter_size_z - 1) * (p.dilation[0] - 1));
-    const int dilated_filter_size_y = static_cast<int>(filter_size_y + (filter_size_y - 1) * (p.dilation[1] - 1));
-    const int dilated_filter_size_x = static_cast<int>(filter_size_x + (filter_size_x - 1) * (p.dilation[2] - 1));
+    const int filter_size_yx = filter_size_y * filter_size_x;

-    const Shape input_channel_shape(++batch_shape.begin(), batch_shape.end());
-    const size_t input_channel_size = shape_size(input_channel_shape);
-    const Shape filter_channel_shape(++filter_shape.begin(), filter_shape.end());
-    const size_t filter_channel_size = shape_size(filter_channel_shape);
+    const int dilated_filter_size_z = static_cast<int>(filter_size_z + (filter_size_z - 1) * (dilation_z - 1));
+    const int dilated_filter_size_y = static_cast<int>(filter_size_y + (filter_size_y - 1) * (dilation_y - 1));
+    const int dilated_filter_size_x = static_cast<int>(filter_size_x + (filter_size_x - 1) * (dilation_x - 1));

-    for (int i_z = static_cast<int>(-p.pads_begin[0]);
-         i_z <= static_cast<int>(p.pads_end[0] + input_size_z - dilated_filter_size_z + p.output_padding[0]);
-         i_z += static_cast<int>(p.strides[0])) {
-        for (int i_y = static_cast<int>(-p.pads_begin[1]);
-             i_y <= static_cast<int>(p.pads_end[1] + input_size_y - dilated_filter_size_y + p.output_padding[1]);
-             i_y += static_cast<int>(p.strides[1])) {
-            for (int i_x = static_cast<int>(-p.pads_begin[2]);
-                 i_x <= static_cast<int>(p.pads_end[2] + input_size_x - dilated_filter_size_x + p.output_padding[2]);
-                 i_x += static_cast<int>(p.strides[2])) {
+    const int i_z_lim = static_cast<int>(p.pads_end[0] + input_size_z - dilated_filter_size_z + p.output_padding[0]);
+    const int i_y_lim = static_cast<int>(p.pads_end[1] + input_size_y - dilated_filter_size_y + p.output_padding[1]);
+    const int i_x_lim = static_cast<int>(p.pads_end[2] + input_size_x - dilated_filter_size_x + p.output_padding[2]);
+
+    const int f_z_increment = dilation_z * input_size_yx;
+    const int f_y_increment = dilation_y * input_size_x;
+    const int f_x_increment = dilation_x;
+
+    const int f_z_block = filter_size_z * f_z_increment;
+    const int f_y_block = filter_size_y * f_y_increment;
+    const int f_x_block = filter_size_x * f_x_increment;
+
+    for (int i_z = -pad_begin_z; i_z <= i_z_lim; i_z += stride_z) {
+        const int s_z_shift = i_z * input_size_yx;
+        const int f_z_up_bound = f_z_block + s_z_shift;
+
+        for (int i_y = -pad_begin_y; i_y <= i_y_lim; i_y += stride_y) {
+            const int i_y_m = i_y * input_size_x;
+
+            for (int i_x = -pad_begin_x; i_x <= i_x_lim; i_x += stride_x) {
                auto input_channel = batch;
                auto filter_channel = filter;
                T sum = 0;
-                size_t filter_channels_count = filter_shape[0];
+                size_t filter_channels_count = f_channels;
+
                while (filter_channels_count--) {
-                    for (int f_z = 0; f_z < filter_size_z; ++f_z) {
-                        for (int f_y = 0; f_y < filter_size_y; ++f_y) {
-                            for (int f_x = 0; f_x < filter_size_x; ++f_x) {
-                                int rel_i_z = i_z + (f_z * static_cast<int>(p.dilation[0]));
-                                int rel_i_y = i_y + (f_y * static_cast<int>(p.dilation[1]));
-                                int rel_i_x = i_x + (f_x * static_cast<int>(p.dilation[2]));
-
-                                bool padding =
-                                    !(in_range(rel_i_x, {0, input_size_x}) && in_range(rel_i_y, {0, input_size_y}) &&
-                                      in_range(rel_i_z, {0, input_size_z}));
-                                if (padding)
+                    for (int f_z_i = s_z_shift; f_z_i < f_z_up_bound; f_z_i += f_z_increment) {
+                        if (f_z_i < 0 || f_z_i >= input_size_zyx) {
+                            filter_channel += filter_size_yx;
+                            continue;
+                        }
+                        const int y_up_bound = f_z_i + input_size_yx;
+                        const int y_shift = f_z_i + i_y_m;
+                        for (int f_y_i = y_shift; f_y_i < f_y_block + y_shift; f_y_i += f_y_increment) {
+                            if (f_y_i < f_z_i || f_y_i >= y_up_bound) {
+                                filter_channel += filter_size_x;
+                                continue;
+                            }
+                            const int x_up_bound = input_size_x + f_y_i;
+                            for (int f_x_i = f_y_i + i_x; f_x_i < f_x_block + f_y_i + i_x;
+                                 f_x_i += f_x_increment, filter_channel++) {
+                                if (f_x_i < f_y_i || f_x_i >= x_up_bound) {
                                    continue;
+                                }

-                                int f_buf_idx = (f_z * filter_size_y * filter_size_x) + (f_y * filter_size_x) + f_x;
-                                int i_buf_idx =
-                                    (rel_i_z * input_size_y * input_size_x) + (rel_i_y * input_size_x) + rel_i_x;
-                                sum += static_cast<T>(input_channel[i_buf_idx]) *
-                                       static_cast<T>(filter_channel[f_buf_idx]);
+                                sum += input_channel[f_x_i] * filter_channel[0];
                            }
                        }
                    }
-                    input_channel += input_channel_size;
-                    filter_channel += filter_channel_size;
+                    input_channel += input_size_zyx;
                }
                *out = sum;
                ++out;
@@ -121,6 +210,20 @@ void convolve_3D_channels(const ConvolutionParams& p,
    }
 }

+inline void extend_to_2D(ConvolutionParams& p, Shape& in_shape, Shape& filter_shape) {  // in place: ensure 2 spatial axes
+    const int spatial_rank = static_cast<int>(in_shape.size() - 2);  // axes left after the first two of in_shape
+    if (spatial_rank < 2) {
+        int missing_dims = 2 - spatial_rank;  // number of axes to insert in front of the existing spatial ones
+        p.dilation.insert(std::prev(p.dilation.end(), spatial_rank), missing_dims, 1);  // new axes get dilation 1
+        p.strides.insert(std::prev(p.strides.end(), spatial_rank), missing_dims, 1);  // new axes get stride 1
+        p.pads_begin.insert(std::prev(p.pads_begin.end(), spatial_rank), missing_dims, 0);  // new axes get no padding
+        p.pads_end.insert(std::prev(p.pads_end.end(), spatial_rank), missing_dims, 0);
+        p.output_padding.insert(std::prev(p.output_padding.end(), spatial_rank), missing_dims, 0);
+        in_shape.insert(std::next(in_shape.end(), -spatial_rank), missing_dims, 1);  // same position as std::prev above
+        filter_shape.insert(std::prev(filter_shape.end(), spatial_rank), missing_dims, 1);  // new axes have size 1
+    }
+}
+
 inline void extend_to_3D(ConvolutionParams& p, Shape& in_shape, Shape& filter_shape) {
    int spatial_rank = static_cast<int>(in_shape.size() - 2);
    if (spatial_rank < 3) {
@@ -209,9 +312,7 @@ void convolution(const T* in,
                 const Strides& strides,
                 const Strides& dilations,
                 const CoordinateDiff& pads_begin,
-                 const CoordinateDiff& pads_end)
-
-{
+                 const CoordinateDiff& pads_end) {
    validate_convolution_parameters(in_shape, f_shape, out_shape, strides, dilations, pads_begin, pads_end);

    // here we are converting all param types to int's to avoid arithmetic issues
@@ -222,26 +323,83 @@ void convolution(const T* in,
    // convolution implementation to convolve also in 1D & 2D case
    Shape input_shape{in_shape};
    Shape filters_shape{f_shape};
-    if (in_shape.size() < 5) {
-        extend_to_3D(params, input_shape, filters_shape);
+    if (in_shape.size() < 4) {
+        extend_to_2D(params, input_shape, filters_shape);
    }

    const size_t batches_count = input_shape[in_batch_axis];
    const Shape batch_shape(++input_shape.begin(), input_shape.end());
    const size_t batch_size = shape_size(batch_shape);
+    const size_t out_spatial_size =
+        std::accumulate(out_shape.begin() + 2, out_shape.end(), size_t(1), std::multiplies<size_t>());

    const size_t filters_count = filters_shape[filter_out_ch_axis];
    const Shape filter_shape(++filters_shape.begin(), filters_shape.end());
    const size_t filter_size = shape_size(filter_shape);

-    auto batch = in;
-    for (size_t batch_idx = 0; batch_idx < batches_count; ++batch_idx) {
-        auto filter = f;
-        for (size_t f_idx = 0; f_idx < filters_count; ++f_idx) {
-            convolve_3D_channels(params, batch, batch_shape, filter, filter_shape, out);
-            filter += filter_size;
+    const size_t work_amount = batches_count * filters_count;
+
+    void (*conv_channels)(const ConvolutionParams&, const T*, const Shape&, const T*, const Shape&, T*);
+    if (in_shape.size() == 5) {
+        conv_channels = &convolve_3D_channels;
+    } else {
+        conv_channels = &convolve_2D_channels;
+    }
+
+    auto ncores = std::thread::hardware_concurrency() / 2;
+    if (ncores == 0) {
+        ncores = 1;
+    }
+    std::vector<std::future<void>> futures(ncores);
+
+    auto split_work = [](const size_t& n, const size_t& nthr, const size_t ithr, size_t& n_start, size_t& n_end) {
+        if (nthr <= 1 || n == 0) {
+            n_start = 0;
+            n_end = n;
+        } else {
+            auto n1 = (n + nthr - 1) / nthr;
+            auto n2 = n1 - 1;
+            auto T1 = n - n2 * nthr;
+            n_end = ithr < T1 ? n1 : n2;
+            n_start = ithr <= T1 ? ithr * n1 : T1 * n1 + (ithr - T1) * n2;
        }
-        batch += batch_size;
+
+        n_end += n_start;
+    };
+
+    auto ker_callback = [&](int nthr, int ithr) {
+        size_t start = 0, end = 0;
+        split_work(work_amount, nthr, ithr, start, end);
+        if (start >= end) {
+            return;
+        }
+        size_t batch_idx = start / filters_count;
+        size_t c_idx = start % filters_count;
+
+        auto in_data = in + batch_size * batch_idx;
+        auto filter = f + filter_size * c_idx;
+        auto out_data = out + out_spatial_size * filters_count * batch_idx + out_spatial_size * c_idx;
+
+        for (; batch_idx < batches_count; ++batch_idx) {
+            for (; c_idx < filters_count && start < end; c_idx++, start++) {
+                conv_channels(params, in_data, batch_shape, filter, filter_shape, out_data);
+                filter += filter_size;
+                out_data += out_spatial_size;
+            }
+            if (start >= end) {
+                break;
+            }
+            filter = f;
+            c_idx = 0;
+            in_data += batch_size;
+        }
+    };
+
+    for (size_t ithr = 0; ithr < ncores; ithr++) {
+        futures[ithr] = std::async(ker_callback, ncores, ithr);
+    }
+    for (size_t ithr = 0; ithr < ncores; ithr++) {
+        futures[ithr].get();
    }
 }
 }  // namespace reference
--- a/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp
+++ b/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp
@@ -11,6 +11,7 @@

 #include "ngraph/axis_vector.hpp"
 #include "ngraph/runtime/reference/convolution.hpp"
+#include "ngraph/runtime/reference/reverse.hpp"
 #include "ngraph/util.hpp"

 namespace ngraph {
@@ -171,8 +172,8 @@ void convolution_backprop_impl(const T* in,
    // convolution implementation to convolve also in 1D & 2D case
    Shape input_shape{in_shape};
    Shape filters_shape{f_shape};
-    if (in_shape.size() < 5) {
-        extend_to_3D(params, input_shape, filters_shape);
+    if (in_shape.size() < 4) {
+        extend_to_2D(params, input_shape, filters_shape);
    }

    for (size_t i = 0; i < input_shape.size() - 2; ++i) {
@@ -186,24 +187,26 @@ void convolution_backprop_impl(const T* in,
    }

-    // convert output shape to 3D, contains only dimensions
+    // convert output shape to at least 2D, contains only spatial dimensions
-    Shape out_shape_3d{out_shape.begin() + 2, out_shape.end()};
+    Shape out_shape_2d{out_shape.begin() + 2, out_shape.end()};
+    const size_t out_spatial_size =
+        std::accumulate(out_shape.begin() + 2, out_shape.end(), size_t(1), std::multiplies<size_t>());

    int out_shape_rank = static_cast<int>(out_shape.size()) - 2;
-    if (out_shape_rank < 3) {
-        int missing_dims = 3 - out_shape_rank;
-        out_shape_3d.insert(std::prev(out_shape_3d.end(), out_shape_rank), missing_dims, 1);
+    if (out_shape_rank < 2) {
+        int missing_dims = 2 - out_shape_rank;
+        out_shape_2d.insert(std::prev(out_shape_2d.end(), out_shape_rank), missing_dims, 1);
    }

    // modify params.pads_end when output_shape was provided in ctor in order to
    // calculate expected number of output elements
-    for (size_t i = 0; i < out_shape_3d.size(); i++) {
-        if (out_shape_3d[i] > 1) {
+    for (size_t i = 0; i < out_shape_2d.size(); i++) {
+        if (out_shape_2d[i] > 1) {
            // expected_dim = (in - 1)* strides + filter - 2*padding + out_padding
            // strides is already applied (through 0's extension in input)
            // padding = pads_begin + pads_end, formula below is using
            // params.pad_begin/params.pads_end:
            const size_t expected_dim =
-                out_shape_3d[i] - ((input_shape[i + 2] - 1) - filters_shape[i + 2] + params.pads_begin[i] +
+                out_shape_2d[i] - ((input_shape[i + 2] - 1) - filters_shape[i + 2] + params.pads_begin[i] +
                                   params.pads_end[i] + 2 + params.output_padding[i]);
            params.pads_end[i] += expected_dim;
        }
@@ -217,15 +220,69 @@ void convolution_backprop_impl(const T* in,
    Shape batch_shape(++input_shape.begin(), input_shape.end());
    const size_t batch_size = shape_size(batch_shape);

-    auto batch = in;
+    const size_t work_amount = batches_count * filters_count;

-    for (size_t batch_idx = 0; batch_idx < batches_count; ++batch_idx) {
-        auto filter = f;
-        for (size_t f_idx = 0; f_idx < filters_count; ++f_idx) {
-            convolve_3D_channels(params, batch, batch_shape, filter, filter_shape, out);
-            filter += filter_size;
+    void (*conv_channels)(const ConvolutionParams&, const T*, const Shape&, const T*, const Shape&, T*);
+    if (in_shape.size() == 5) {
+        conv_channels = &convolve_3D_channels;
+    } else {
+        conv_channels = &convolve_2D_channels;
+    }
+
+    auto ncores = std::thread::hardware_concurrency() / 2;
+    if (ncores == 0) {
+        ncores = 1;
+    }
+    std::vector<std::future<void>> futures(ncores);
+
+    auto split_work = [](const size_t& n, const size_t& nthr, const size_t ithr, size_t& n_start, size_t& n_end) {
+        if (nthr <= 1 || n == 0) {
+            n_start = 0;
+            n_end = n;
+        } else {
+            auto n1 = (n + nthr - 1) / nthr;
+            auto n2 = n1 - 1;
+            auto T1 = n - n2 * nthr;
+            n_end = ithr < T1 ? n1 : n2;
+            n_start = ithr <= T1 ? ithr * n1 : T1 * n1 + (ithr - T1) * n2;
        }
-        batch += batch_size;
+
+        n_end += n_start;
+    };
+
+    auto ker_callback = [&](int nthr, int ithr) {
+        size_t start = 0, end = 0;
+        split_work(work_amount, nthr, ithr, start, end);
+        if (start >= end) {
+            return;
+        }
+        size_t batch_idx = start / filters_count;
+        size_t c_idx = start % filters_count;
+
+        auto in_data = in + batch_size * batch_idx;
+        auto filter = f + filter_size * c_idx;
+        auto out_data = out + out_spatial_size * filters_count * batch_idx + out_spatial_size * c_idx;
+
+        for (; batch_idx < batches_count; ++batch_idx) {
+            for (; c_idx < filters_count && start < end; c_idx++, start++) {
+                conv_channels(params, in_data, batch_shape, filter, filter_shape, out_data);
+                filter += filter_size;
+                out_data += out_spatial_size;
+            }
+            if (start >= end) {
+                break;
+            }
+            filter = f;
+            c_idx = 0;
+            in_data += batch_size;
+        }
+    };
+
+    for (size_t ithr = 0; ithr < ncores; ithr++) {
+        futures[ithr] = std::async(ker_callback, ncores, ithr);
+    }
+    for (size_t ithr = 0; ithr < ncores; ithr++) {
+        futures[ithr].get();
    }
 }

@@ -305,7 +362,7 @@ void convolution_backprop_in(const T* delta_in,
    if (stride_dim >= 2) {
        extend_with_zeros(stride, in_shape, delta_in, conv_input_shape, extended_input);
        std::fill(conv_stride.begin(), conv_stride.end(), 1);
-        conv_input_data = &extended_input[0];
+        conv_input_data = extended_input.data();
    }

    const size_t dilation_dim =
@@ -317,7 +374,7 @@ void convolution_backprop_in(const T* delta_in,
                             conv_filter_shape,
                             extended_filter);
        std::fill(conv_filter_dilation.begin(), conv_filter_dilation.end(), 1);
-        conv_filter_data = &extended_filter[0];
+        conv_filter_data = extended_filter.data();
    }

    convolution_backprop_impl(conv_input_data,
--- a/src/core/reference/include/ngraph/runtime/reference/deformable_convolution.hpp
+++ b/src/core/reference/include/ngraph/runtime/reference/deformable_convolution.hpp
@@ -114,47 +114,56 @@ void convolve_2D_channels(const ConvolutionParams& p,
                          const T* mask,
                          const Shape& mask_shape,
                          T* out,
-                          size_t group_idx,
-                          int64_t groups,
-                          int64_t deformable_groups,
-                          bool bilinear_interpolation_pad) {
+                          const size_t group_idx,
+                          const int64_t groups,
+                          const int64_t deformable_groups,
+                          const bool bilinear_interpolation_pad) {
    const int input_size_y = static_cast<int>(batch_shape[1]);
    const int input_size_x = static_cast<int>(batch_shape[2]);
    const int filter_size_y = static_cast<int>(filter_shape[1]);
    const int filter_size_x = static_cast<int>(filter_shape[2]);
-    const int dilated_filter_size_y = filter_size_y + (filter_size_y - 1) * (static_cast<int>(p.dilation[0]) - 1);
-    const int dilated_filter_size_x = filter_size_x + (filter_size_x - 1) * (static_cast<int>(p.dilation[1]) - 1);
+    const int dilation_y = static_cast<int>(p.dilation[0]);
+    const int dilation_x = static_cast<int>(p.dilation[1]);
+    const int dilated_filter_size_y = filter_size_y + (filter_size_y - 1) * (dilation_y - 1);
+    const int dilated_filter_size_x = filter_size_x + (filter_size_x - 1) * (dilation_x - 1);
+
+    const int i_y_lim = static_cast<int>(p.pads_end[0] + input_size_y - dilated_filter_size_y);
+    const int i_x_lim = static_cast<int>(p.pads_end[1] + input_size_x - dilated_filter_size_x);

    const int input_channel_size = static_cast<int>(shape_size(shape_reduce(batch_shape)));
-    const int filter_channel_size = static_cast<int>(shape_size(shape_reduce(filter_shape)));
    const int offsets_size = static_cast<int>(shape_size(offset_shape));
    const int offsets_spatial_size = static_cast<int>(shape_size(shape_reduce(offset_shape)));
    const int filter_channels_count = static_cast<int>(filter_shape[0]);
    const int mask_size = static_cast<int>(shape_size(mask_shape));
    const int mask_spatial_size = static_cast<int>(shape_size(shape_reduce(mask_shape)));

+    const int group_idx_m = filter_channels_count * static_cast<int>(group_idx);
+    const int group_idx_d = filter_channels_count * groups / deformable_groups;
+
+    const int f_shift_inc = 2 * offsets_spatial_size;
+
    int out_idx = 0;
-    for (int i_y = static_cast<int>(-p.pads_begin[0]);
-         i_y <= static_cast<int>(p.pads_end[0] + input_size_y - dilated_filter_size_y);
-         i_y += static_cast<int>(p.strides[0])) {
-        for (int i_x = static_cast<int>(-p.pads_begin[1]);
-             i_x <= static_cast<int>(p.pads_end[1] + input_size_x - dilated_filter_size_x);
-             i_x += static_cast<int>(p.strides[1])) {
+    for (int i_y = static_cast<int>(-p.pads_begin[0]); i_y <= i_y_lim; i_y += static_cast<int>(p.strides[0])) {
+        for (int i_x = static_cast<int>(-p.pads_begin[1]); i_x <= i_x_lim; i_x += static_cast<int>(p.strides[1])) {
            auto input_channel = batch;
            auto filter_channel = filter;
            T sum = 0;
            for (int fc = 0; fc < filter_channels_count; fc++) {
-                auto deformable_group_idx =
-                    (filter_channels_count * group_idx + fc) / (filter_channels_count * groups / deformable_groups);
-                for (int f_y = 0; f_y < filter_size_y; ++f_y) {
-                    for (int f_x = 0; f_x < filter_size_x; ++f_x) {
-                        int f_buf_idx = (f_y * filter_size_x) + f_x;
-                        T y_offset = offsets[deformable_group_idx * offsets_size +
-                                             f_buf_idx * 2 * offsets_spatial_size + out_idx];
-                        T x_offset = offsets[deformable_group_idx * offsets_size +
-                                             (f_buf_idx * 2 + 1) * offsets_spatial_size + out_idx];
-                        T rel_i_y = static_cast<T>(i_y + (f_y * p.dilation[0]) + y_offset);
-                        T rel_i_x = static_cast<T>(i_x + (f_x * p.dilation[1]) + x_offset);
+                const int deformable_group_idx = (group_idx_m + fc) / group_idx_d;
+                int f_y_shift = deformable_group_idx * offsets_size + out_idx;
+                int f_x_shift = f_y_shift + offsets_spatial_size;
+                int f_mask_shift = deformable_group_idx * mask_size + out_idx;
+                int i_y_dil = i_y;
+                for (int f_y = 0; f_y < filter_size_y; ++f_y, i_y_dil += dilation_y) {
+                    int i_x_dil = i_x;
+                    for (int f_x = 0; f_x < filter_size_x; ++f_x,
+                             f_y_shift += f_shift_inc,
+                             f_x_shift += f_shift_inc,
+                             f_mask_shift += mask_spatial_size,
+                             i_x_dil += dilation_x,
+                             filter_channel++) {
+                        T rel_i_y = static_cast<T>(i_y_dil + offsets[f_y_shift]);
+                        T rel_i_x = static_cast<T>(i_x_dil + offsets[f_x_shift]);

                        bool padding;
                        if (bilinear_interpolation_pad) {
@@ -168,19 +177,17 @@ void convolve_2D_channels(const ConvolutionParams& p,
                        if (padding)
                            continue;

-                        T mask_scalar =
-                            mask[deformable_group_idx * mask_size + f_buf_idx * mask_spatial_size + out_idx];
+                        T mask_scalar = mask[f_mask_shift];
                        sum += static_cast<T>(bilinear_interpolation(input_channel,
                                                                     static_cast<float>(rel_i_x),
                                                                     static_cast<float>(rel_i_y),
                                                                     input_size_x,
                                                                     input_size_y,
                                                                     bilinear_interpolation_pad)) *
-                               filter_channel[f_buf_idx] * mask_scalar;
+                               filter_channel[0] * mask_scalar;
                    }
                }
                input_channel += input_channel_size;
-                filter_channel += filter_channel_size;
            }
            out[out_idx++] = sum;
        }
@@ -206,9 +213,7 @@ void deformable_convolution(const T* in,
                            const CoordinateDiff& pads_end,
                            const int64_t groups,
                            const int64_t deformable_groups,
-                            const bool bilinear_interpolation_pad)
-
-{
+                            const bool bilinear_interpolation_pad) {
    using namespace def_conv_impl;

    validate_deformable_convolution_params(in_shape,
--- a/src/core/reference/include/ngraph/runtime/reference/group_convolution.hpp
+++ b/src/core/reference/include/ngraph/runtime/reference/group_convolution.hpp
@@ -5,7 +5,7 @@
 #pragma once

 #include "ngraph/runtime/reference/convolution.hpp"
-#include "ngraph/util.hpp"
+#include "ngraph/runtime/reference/helpers.hpp"

 namespace {
 constexpr size_t filter_group_axis = 0;
--- a/src/core/reference/include/ngraph/runtime/reference/group_convolution_backprop_data.hpp
+++ b/src/core/reference/include/ngraph/runtime/reference/group_convolution_backprop_data.hpp
@@ -4,7 +4,6 @@

 #pragma once

-#include "ngraph/runtime/reference/convolution_backprop_data.hpp"
 #include "ngraph/runtime/reference/group_convolution.hpp"
 #include "ngraph/util.hpp"

--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/augru_cell.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/augru_cell.cpp
@@ -99,12 +99,6 @@ protected:
            ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), WRB, hiddenSize/*, activations, {}, {}, clip, linearBeforeReset*/);

        function = makeNgraphFunction(netPrecision, params, augruCellOp, "AUGRUCell");
-        auto filePrefix = CommonTestUtils::generateTestFilePrefix();
-        std::string xmlFileName = filePrefix + "_AUGRUCell.xml";
-        std::string binFileName = filePrefix + "_AUGRUCell.bin";
-        ov::pass::Serialize serializer(xmlFileName, binFileName);
-        serializer.run_on_model(function);
-        std::cout << "Saved subgraph IR." << std::endl;
    }
 };

--- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/activation.hpp
+++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/activation.hpp
@@ -16,8 +16,6 @@
 #include "ie_core.hpp"
 #include "ie_precision.hpp"

-#include "ngraph/opsets/opset1.hpp"
-
 #include "functional_test_utils/blob_utils.hpp"
 #include "shared_test_classes/base/layer_test_utils.hpp"
 #include "common_test_utils/common_utils.hpp"