MatMul reference implementation refactoring (#4671)
* MatMul backend tests
* Single layer tests
* Style apply
* Update IE_CPU manifest
* New dot implementation
* Use new dot in MatMul reference
* Fix output batch offset
* Style apply
* Relax tests tolerance
* Remove legacy dot reference file
* Remove usage of broadcast builder
* Add one more broadcast test
* Remove NGRAPH_SUPPRESS_DEPRECATED
* Style apply
* Few more MatMul single layer tests
* Update IE tests manifest
* Move variable declarations to inner loops
* Add const to variables
* Apply review suggestions
* Reuse vector for transposed and broadcasted data
This commit is contained in:
parent
e64d84b47b
commit
b8f36ec354
@@ -18,6 +18,19 @@ const std::vector<ShapeRelatedParams> shapeRelatedParams = {
        { { {1, 4, 5, 6}, false }, { {1, 4, 6, 4}, false } },
        { { {4, 5, 6}, false }, { {6, 3}, false } },
        { { {9, 9, 9}, false }, { {9, 9}, false } },
        { { {1, 2, 3}, false }, { {1, 1, 3, 2}, false } },
        { { {1, 3, 2, 4}, false }, { {2, 1, 4, 2}, false } },
        { { {2, 1, 2, 4}, false }, { {1, 3, 4, 2}, false } },
        { { {3, 2, 4}, false }, { {2, 1, 4, 2}, false } },
        { { {2, 1, 4, 2}, false }, { {3, 2, 4}, false } },
        { { {2, 1, 2, 3}, true }, { {3, 2, 4}, false } },
        { { {2, 1, 3, 2}, false }, { {3, 4, 2}, true } },
        { { {2, 1, 2, 3}, true }, { {3, 4, 2}, true } },
        { { {3}, false }, { {2, 2, 3, 1}, false } },
        { { {2, 2, 1, 3}, false }, { {3}, false } },
        { { {1, 5}, false }, { {5, 1}, false } },
        { { {5, 1}, true }, { {5, 1}, false } },
        { { {1, 5}, false }, { {1, 5}, true } },
        { { {1, 5}, false }, { {5}, false } },
        { { {5}, false }, { {5, 1}, false } },
        { { {5}, false }, { {5}, false } },
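Each ShapeRelatedParams entry above pairs an input shape with its transpose flag, i.e. { {shape}, transpose }. As a worked example, { { {2, 1, 2, 3}, true }, { {3, 2, 4}, false } } first transposes the last two axes of the left input to {2, 1, 3, 2}; the matrix dims {3, 2} x {2, 4} give {3, 4}, and the batch dims {2, 1} and {3} broadcast NumPy-style to {2, 3}, so the expected output shape is {2, 3, 3, 4}.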
@@ -1,170 +0,0 @@
//*****************************************************************************
// Copyright 2017-2021 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#pragma once

#include <cmath>
#include <utility>

#include <cfenv>
#include <functional>
#include "ngraph/coordinate_transform.hpp"
#include "ngraph/runtime/reference/helpers.hpp"
#include "ngraph/shape_util.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace reference
        {
            template <typename INPUT0,
                      typename INPUT1,
                      typename OUTPUT,
                      typename ACCUMULATION = typename widen<OUTPUT>::type>
            void dot(const INPUT0* arg0,
                     const INPUT1* arg1,
                     OUTPUT* out,
                     const Shape& arg0_shape,
                     const Shape& arg1_shape,
                     const Shape& out_shape,
                     size_t reduction_axes_count,
                     const float* input0_scale = nullptr,
                     const INPUT0* input0_zero_point = nullptr,
                     const float* input1_scale = nullptr,
                     const INPUT1* input1_zero_point = nullptr,
                     const float* output_scale = nullptr,
                     const OUTPUT* output_zero_point = nullptr)
            {
                bool is_quantized = false;
                if (input0_scale && input0_zero_point && input1_scale && input1_zero_point &&
                    output_scale && output_zero_point)
                {
                    is_quantized = true;
                }

                auto old_mode = std::fegetround();
                std::fesetround(FE_TONEAREST);
                // Get the sizes of the dot axes. It's easiest to pull them from arg1 because
                // they're right up front.
                Shape dot_axis_sizes(reduction_axes_count);
                std::copy(arg1_shape.begin(),
                          arg1_shape.begin() + reduction_axes_count,
                          dot_axis_sizes.begin());

                CoordinateTransform arg0_transform(arg0_shape);
                CoordinateTransform arg1_transform(arg1_shape);
                CoordinateTransform output_transform(out_shape);

                // Create coordinate transforms for arg0 and arg1 that throw away the dotted axes.
                size_t arg0_projected_rank = arg0_shape.size() - reduction_axes_count;
                size_t arg1_projected_rank = arg1_shape.size() - reduction_axes_count;

                Shape arg0_projected_shape(arg0_projected_rank);
                std::copy(arg0_shape.begin(),
                          arg0_shape.begin() + arg0_projected_rank,
                          arg0_projected_shape.begin());

                Shape arg1_projected_shape(arg1_projected_rank);
                std::copy(arg1_shape.begin() + reduction_axes_count,
                          arg1_shape.end(),
                          arg1_projected_shape.begin());

                CoordinateTransform arg0_projected_transform(arg0_projected_shape);
                CoordinateTransform arg1_projected_transform(arg1_projected_shape);

                // Create a coordinate transform that allows us to iterate over all possible values
                // for the dotted axes.
                CoordinateTransform dot_axes_transform(dot_axis_sizes);

                for (const Coordinate& arg0_projected_coord : arg0_projected_transform)
                {
                    for (const Coordinate& arg1_projected_coord : arg1_projected_transform)
                    {
                        // The output coordinate is just the concatenation of the projected
                        // coordinates.
                        Coordinate out_coord(arg0_projected_coord.size() +
                                             arg1_projected_coord.size());

                        auto out_coord_it = std::copy(arg0_projected_coord.begin(),
                                                      arg0_projected_coord.end(),
                                                      out_coord.begin());
                        std::copy(
                            arg1_projected_coord.begin(), arg1_projected_coord.end(), out_coord_it);

                        // Zero out to start the sum.
                        ACCUMULATION sum = 0;

                        size_t out_index = output_transform.index(out_coord);

                        // Walk along the dotted axes.
                        Coordinate arg0_coord(arg0_shape.size());
                        Coordinate arg1_coord(arg1_shape.size());
                        auto arg0_it = std::copy(arg0_projected_coord.begin(),
                                                 arg0_projected_coord.end(),
                                                 arg0_coord.begin());
                        for (const Coordinate& dot_axis_positions : dot_axes_transform)
                        {
                            // In order to find the points to multiply together, we need to inject
                            // our current positions along the dotted axes back into the projected
                            // arg0 and arg1 coordinates.
                            std::copy(
                                dot_axis_positions.begin(), dot_axis_positions.end(), arg0_it);

                            auto arg1_it = std::copy(dot_axis_positions.begin(),
                                                     dot_axis_positions.end(),
                                                     arg1_coord.begin());
                            std::copy(
                                arg1_projected_coord.begin(), arg1_projected_coord.end(), arg1_it);

                            // Multiply and add to the sum.
                            if (is_quantized)
                            {
                                sum = sum + ((static_cast<ACCUMULATION>(
                                                  arg0[arg0_transform.index(arg0_coord)]) -
                                              static_cast<ACCUMULATION>(*input0_zero_point)) *
                                             (static_cast<ACCUMULATION>(
                                                  arg1[arg1_transform.index(arg1_coord)]) -
                                              static_cast<ACCUMULATION>(*input1_zero_point)));
                            }
                            else
                            {
                                sum = sum + (static_cast<ACCUMULATION>(
                                                 arg0[arg0_transform.index(arg0_coord)]) *
                                             static_cast<ACCUMULATION>(
                                                 arg1[arg1_transform.index(arg1_coord)]));
                            }
                        }

                        if (is_quantized)
                        {
                            float scale = *input0_scale * *input1_scale / *output_scale;
                            // Write the sum back.
                            out[out_index] =
                                static_cast<OUTPUT>(std::round(static_cast<float>(sum) * scale)) +
                                *output_zero_point;
                        }
                        else
                        {
                            out[out_index] = sum;
                        }
                    }
                }
                std::fesetround(old_mode);
            }
        }
    }
}
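For orientation, the removed kernel computed a generalized contraction over reduction_axes_count axes, iterating coordinates with CoordinateTransform and optionally applying quantization scales and zero points; the MatMul reference only ever invoked it with a reduction count of 1. A minimal standalone sketch of that common 2D case (hypothetical names, not code from the deleted file):

#include <cstddef>
#include <iostream>
#include <vector>

// A sketch of the 2D case the removed kernel handled, with an explicit
// accumulator type standing in for "typename widen<OUTPUT>::type".
template <typename T, typename Acc = T>
void dot_2d(const T* a, const T* b, T* out, std::size_t I, std::size_t K, std::size_t J)
{
    for (std::size_t i = 0; i < I; ++i)
    {
        for (std::size_t j = 0; j < J; ++j)
        {
            Acc sum = 0; // "Zero out to start the sum", as in the removed code
            for (std::size_t k = 0; k < K; ++k)
            {
                sum += static_cast<Acc>(a[i * K + k]) * static_cast<Acc>(b[k * J + j]);
            }
            out[i * J + j] = static_cast<T>(sum);
        }
    }
}

int main()
{
    const std::vector<float> a{1, 2, 3, 4, 5, 6}; // shape {2, 3}
    const std::vector<float> b{1, 0, 0, 1, 1, 1}; // shape {3, 2}
    std::vector<float> out(4);                    // shape {2, 2}
    dot_2d<float, double>(a.data(), b.data(), out.data(), 2, 3, 2);
    for (float v : out)
    {
        std::cout << v << ' '; // prints: 4 5 10 11
    }
    std::cout << '\n';
}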
@@ -225,7 +225,7 @@ namespace ngraph
                 clip_activation(r_t, activation_f);

                 // calculate h_t
-                vector<T> h_t(gate_shape_size);
+                std::vector<T> h_t(gate_shape_size);
                 if (linear_before_reset)
                 {
                     // ht = g(Xt*(Wh^T) + (rt (.) (Ht-1*(Rh^T) + Rbh)) + Wbh)
@@ -287,8 +287,8 @@ namespace ngraph
                 }
                 clip_activation(h_t, activation_g);
                 // Ht = (1 - zt) (.) ht + zt (.) Ht-1
-                vector<T> mul1(gate_shape_size);
-                vector<T> mul2(gate_shape_size);
+                std::vector<T> mul1(gate_shape_size);
+                std::vector<T> mul2(gate_shape_size);
                 T one[] = {1};
                 reference::subtract(
                     one, z_t.data(), mul1.data(), {1}, gate_shape, op::AutoBroadcastSpec::NUMPY);
@@ -178,9 +178,9 @@ namespace ngraph
                 // ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Wbo + Rbo)
                 clip_activation(X_W_fico[3], activation_f);

-                vector<T> mul1(gate_shape_size);
-                vector<T> mul2(gate_shape_size);
-                vector<T> Ct(gate_shape_size);
+                std::vector<T> mul1(gate_shape_size);
+                std::vector<T> mul2(gate_shape_size);
+                std::vector<T> Ct(gate_shape_size);
                 // ft (.) Ct-1
                 reference::multiply(X_W_fico[0].data(),
                                     C,
@@ -21,23 +21,63 @@
 #include <utility>
+#include <vector>

 #include "ngraph/axis_vector.hpp"
-#include "ngraph/builder/autobroadcast.hpp"
 #include "ngraph/runtime/opt_kernel/reshape.hpp"
 #include "ngraph/runtime/reference/broadcast.hpp"
-#include "ngraph/runtime/reference/dot.hpp"
 #include "ngraph/shape_util.hpp"

-NGRAPH_SUPPRESS_DEPRECATED_START
-
-using namespace std;
-
 namespace ngraph
 {
     namespace runtime
     {
         namespace reference
         {
+            namespace details
+            {
+                template <typename T>
+                void dot(const T* arg0,
+                         const T* arg1,
+                         T* out,
+                         const Shape& arg0_shape,
+                         const Shape& arg1_shape,
+                         const Shape& out_shape)
+                {
+                    std::fill(out, out + shape_size(out_shape), T{0});
+                    const size_t arg0_rank = arg0_shape.size();
+                    const size_t arg1_rank = arg1_shape.size();
+
+                    // 2D input shapes are interpreted as {I, K} x {K, J}.
+                    // If the first input is a 1D tensor of shape {K}, it is interpreted as {1, K}.
+                    // If the second input is a 1D tensor of shape {K}, it is interpreted as {K, 1}.
+                    const size_t I_dim = arg0_rank == 1 ? 1 : arg0_shape[arg0_rank - 2];
+                    const size_t J_dim = arg1_rank == 1 ? 1 : arg1_shape[arg1_rank - 1];
+                    const size_t K_dim =
+                        arg1_rank == 1 ? arg1_shape[arg1_rank - 1] : arg1_shape[arg1_rank - 2];
+
+                    for (size_t i = 0; i < I_dim; ++i)
+                    {
+                        for (size_t k = 0; k < K_dim; ++k)
+                        {
+                            const size_t a_idx = i * K_dim + k;
+                            for (size_t j = 0; j < J_dim; ++j)
+                            {
+                                const size_t b_idx = k * J_dim + j;
+                                const size_t out_idx = i * J_dim + j;
+                                out[out_idx] += arg0[a_idx] * arg1[b_idx];
+                            }
+                        }
+                    }
+                }
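The new kernel zero-fills the output once and then accumulates in place, with the loops ordered i-k-j so that the innermost loop walks both arg1 and out contiguously. A standalone check of the 1D handling described in the comments (hypothetical driver, outside the ngraph namespaces):

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const std::vector<float> a{0, 1, 2};          // shape {3}, read as {1, 3}
    const std::vector<float> b{0, 1, 2, 3, 4, 5}; // shape {3, 2}
    std::vector<float> out(1 * 2, 0.0f);          // shape {1, 2}, zero-filled first

    // Same loop nest and indexing as details::dot above.
    const std::size_t I_dim = 1, K_dim = 3, J_dim = 2;
    for (std::size_t i = 0; i < I_dim; ++i)
    {
        for (std::size_t k = 0; k < K_dim; ++k)
        {
            const std::size_t a_idx = i * K_dim + k;
            for (std::size_t j = 0; j < J_dim; ++j)
            {
                out[i * J_dim + j] += a[a_idx] * b[k * J_dim + j];
            }
        }
    }

    assert(out[0] == 10.0f && out[1] == 13.0f); // {0,1,2} . [[0,1],[2,3],[4,5]]
}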
+
+                std::vector<size_t> get_transpose_order(const Shape& input_shape)
+                {
+                    size_t rank = input_shape.size();
+                    NGRAPH_CHECK(rank > 1, "Invalid input for transpose");
+                    std::vector<size_t> axes_order(rank);
+                    std::iota(axes_order.begin(), axes_order.end(), 0);
+                    std::swap(axes_order[rank - 1], axes_order[rank - 2]);
+                    return axes_order;
+                }
+            }
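Note that get_transpose_order always swaps just the two innermost axes, which is exactly the transpose that MatMul's transpose_a/transpose_b attributes call for: for an input shape of {2, 1, 3, 2} it returns the axis order {0, 1, 3, 2}, which is then handed to opt_kernel::reshape to materialize the transposed copy.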

             /// \brief Reference kernel for matmul computation.
             ///
             /// \tparam T Type of input and output tensors.
@@ -70,89 +110,59 @@ namespace ngraph
                 // and perform broadcast if applicable
                 // 4) Perform dot on the args or updated args and return result

-                size_t arg0_rank = arg0_shape.size();
-                size_t arg1_rank = arg1_shape.size();
-                size_t out_rank = out_shape.size();
-
-                // vector vars to hold potential intermediate transpose,
-                // broadcast result
-                vector<T> arg0_transpose_vec;
-                vector<T> arg1_transpose_vec;
-                vector<T> arg0_broadcast_vec;
-                vector<T> arg1_broadcast_vec;
-
                 // pointers to updated inputs
-                const T* arg0_update = arg0;
-                const T* arg1_update = arg1;
+                const T* arg0_data = arg0;
+                const T* arg1_data = arg1;

+                // vectors to hold potential intermediate transpose,
+                // broadcast result
+                std::vector<T> arg0_new_data;
+                std::vector<T> arg1_new_data;
+
                 // vars for updated inputs shapes
-                Shape wip_arg0_shape = arg0_shape;
-                Shape wip_arg1_shape = arg1_shape;
+                Shape arg0_shape_tmp = arg0_shape;
+                Shape arg1_shape_tmp = arg1_shape;

-                auto get_transpose_order = [](const Shape& input_shape) {
-                    size_t rank = input_shape.size();
-                    NGRAPH_CHECK(rank > 1, "Invalid input for transpose");
-                    vector<size_t> axes_order(rank);
-                    iota(axes_order.begin(), axes_order.end(), 0);
-                    swap(axes_order[rank - 1], axes_order[rank - 2]);
-                    return AxisVector{begin(axes_order), end(axes_order)};
-                };
-
-                auto get_broadcast_axes = [](const Shape& marker_shape, const Shape& target_shape) {
-                    NGRAPH_CHECK(marker_shape.size() == target_shape.size(),
-                                 "Incompatible input shapes");
-                    AxisSet broadcast_axes;
-                    for (size_t i = 0; i < marker_shape.size(); i++)
-                    {
-                        if (marker_shape[i] == 1 && target_shape[i] != 1)
-                        {
-                            broadcast_axes.insert(i);
-                        }
-                    }
-                    return broadcast_axes;
-                };
+                size_t arg0_rank = arg0_shape.size();
+                size_t arg1_rank = arg1_shape.size();
+                const size_t out_rank = out_shape.size();

                 // Perform transpose if requested
                 if (transpose_arg0 && arg0_rank > 1)
                 {
-                    arg0_transpose_vec.reserve(shape_size(arg0_shape));
-                    auto axis_vector = get_transpose_order(arg0_shape);
-                    swap(wip_arg0_shape[arg0_rank - 1], wip_arg0_shape[arg0_rank - 2]);
-                    opt_kernel::reshape(reinterpret_cast<const char*>(arg0),
-                                        reinterpret_cast<char*>(arg0_transpose_vec.data()),
+                    std::vector<T> tmp(shape_size(arg0_shape));
+                    auto axis_vector = details::get_transpose_order(arg0_shape);
+                    std::swap(arg0_shape_tmp[arg0_rank - 1], arg0_shape_tmp[arg0_rank - 2]);
+                    opt_kernel::reshape(reinterpret_cast<const char*>(arg0_data),
+                                        reinterpret_cast<char*>(tmp.data()),
                                         arg0_shape,
                                         axis_vector,
-                                        wip_arg0_shape,
+                                        arg0_shape_tmp,
                                         sizeof(T));

-                    arg0_update = arg0_transpose_vec.data();
+                    arg0_new_data.swap(tmp);
+                    arg0_data = arg0_new_data.data();
                 }

                 if (transpose_arg1 && arg1_rank > 1)
                 {
-                    arg1_transpose_vec.reserve(shape_size(arg1_shape));
-                    auto axis_vector = get_transpose_order(arg1_shape);
-                    swap(wip_arg1_shape[arg1_rank - 1], wip_arg1_shape[arg1_rank - 2]);
-                    opt_kernel::reshape(reinterpret_cast<const char*>(arg1),
-                                        reinterpret_cast<char*>(arg1_transpose_vec.data()),
+                    std::vector<T> tmp(shape_size(arg1_shape));
+                    auto axis_vector = details::get_transpose_order(arg1_shape);
+                    std::swap(arg1_shape_tmp[arg1_rank - 1], arg1_shape_tmp[arg1_rank - 2]);
+                    opt_kernel::reshape(reinterpret_cast<const char*>(arg1_data),
+                                        reinterpret_cast<char*>(tmp.data()),
                                         arg1_shape,
                                         axis_vector,
-                                        wip_arg1_shape,
+                                        arg1_shape_tmp,
                                         sizeof(T));

-                    arg1_update = arg1_transpose_vec.data();
+                    arg1_new_data.swap(tmp);
+                    arg1_data = arg1_new_data.data();
                 }

                 // Inputs are 2D and below, perform dot directly
                 if (arg0_rank <= 2 && arg1_rank <= 2)
                 {
-                    dot(arg0_update,
-                        arg1_update,
-                        out,
-                        wip_arg0_shape,
-                        wip_arg1_shape,
-                        out_shape,
-                        1);
+                    details::dot(
+                        arg0_data, arg1_data, out, arg0_shape_tmp, arg1_shape_tmp, out_shape);
                     return;
                 }

@@ -163,80 +173,73 @@ namespace ngraph

                 if (arg0_rank > 2 && arg1_rank > 2)
                 {
-                    const auto& broadcast_shapes = builder::get_numpy_broadcast_shapes(
-                        {Shape{begin(wip_arg0_shape), next(end(wip_arg0_shape), -2)},
-                         Shape{begin(wip_arg1_shape), next(end(wip_arg1_shape), -2)}});
-
-                    Shape arg0_br_target_shape = broadcast_shapes.first;
-                    Shape arg1_br_target_shape = broadcast_shapes.first;
-                    Shape arg0_br_marker_shape = broadcast_shapes.second.at(0);
-                    Shape arg1_br_marker_shape = broadcast_shapes.second.at(1);
+                    // Align input batches to the output shape
+                    Shape arg0_br_target_shape(out_shape.begin(), out_shape.end() - 2);
+                    Shape arg1_br_target_shape(out_shape.begin(), out_shape.end() - 2);

                     arg0_br_target_shape.insert(
-                        end(arg0_br_target_shape),
-                        next(begin(wip_arg0_shape), wip_arg0_shape.size() - 2),
-                        end(wip_arg0_shape));
+                        end(arg0_br_target_shape), end(arg0_shape_tmp) - 2, end(arg0_shape_tmp));
                     arg1_br_target_shape.insert(
-                        end(arg1_br_target_shape),
-                        next(begin(wip_arg1_shape), wip_arg1_shape.size() - 2),
-                        end(wip_arg1_shape));
-
-                    arg0_br_marker_shape.insert(
-                        end(arg0_br_marker_shape),
-                        next(begin(wip_arg0_shape), wip_arg0_shape.size() - 2),
-                        end(wip_arg0_shape));
-                    arg1_br_marker_shape.insert(
-                        end(arg1_br_marker_shape),
-                        next(begin(wip_arg1_shape), wip_arg1_shape.size() - 2),
-                        end(wip_arg1_shape));
+                        end(arg1_br_target_shape), end(arg1_shape_tmp) - 2, end(arg1_shape_tmp));

-                    if (arg0_br_target_shape != wip_arg0_shape)
+                    std::vector<size_t> broadcast_axes(out_shape.size() - 2);
+                    std::iota(broadcast_axes.begin(), broadcast_axes.end(), 0);
+                    if (!broadcast_axes.empty())
                     {
-                        auto broadcast_axes =
-                            get_broadcast_axes(arg0_br_marker_shape, arg0_br_target_shape);
-                        if (!broadcast_axes.empty())
+                        // Usual rules of the broadcasting are applied for batch dimensions.
+                        // If ranks of input arguments are different,
+                        // the smaller tensor is unsqueezed from the left side of the shape
+                        // by the necessary number of axes to make both shapes the same rank.
+                        // Broadcast all batches (the last two dimensions represent the matrix),
+                        // expanding a dim of value 1 to the bigger dim if dimensions are not equal.
+                        if (arg0_br_target_shape != arg0_shape_tmp)
                         {
-                            arg0_broadcast_vec.reserve(shape_size(arg0_br_target_shape));
-                            broadcast(reinterpret_cast<const char*>(arg0_update),
-                                      reinterpret_cast<char*>(arg0_broadcast_vec.data()),
-                                      wip_arg0_shape,
+                            std::vector<T> tmp(shape_size(arg0_br_target_shape));
+                            broadcast(reinterpret_cast<const char*>(arg0_data),
+                                      reinterpret_cast<char*>(tmp.data()),
+                                      arg0_shape_tmp,
                                       arg0_br_target_shape,
                                       broadcast_axes,
                                       sizeof(T));

-                            arg0_update = arg0_broadcast_vec.data();
-                            wip_arg0_shape = arg0_br_target_shape;
-                            arg0_rank = wip_arg0_shape.size();
+                            arg0_shape_tmp = arg0_br_target_shape;
+                            arg0_rank = arg0_shape_tmp.size();
+                            arg0_new_data.swap(tmp);
+                            arg0_data = arg0_new_data.data();
                         }
                     }

-                    if (arg1_br_target_shape != wip_arg1_shape)
-                    {
-                        auto broadcast_axes =
-                            get_broadcast_axes(arg1_br_marker_shape, arg1_br_target_shape);
-                        if (!broadcast_axes.empty())
+                        if (arg1_br_target_shape != arg1_shape_tmp)
                         {
-                            arg1_broadcast_vec.reserve(shape_size(arg1_br_target_shape));
-                            broadcast(reinterpret_cast<const char*>(arg1_update),
-                                      reinterpret_cast<char*>(arg1_broadcast_vec.data()),
-                                      wip_arg1_shape,
+                            std::vector<T> tmp(shape_size(arg1_br_target_shape));
+                            broadcast(reinterpret_cast<const char*>(arg1_data),
+                                      reinterpret_cast<char*>(tmp.data()),
+                                      arg1_shape_tmp,
                                       arg1_br_target_shape,
                                       broadcast_axes,
                                       sizeof(T));

-                            arg1_update = arg1_broadcast_vec.data();
-                            wip_arg1_shape = arg1_br_target_shape;
-                            arg1_rank = wip_arg1_shape.size();
+                            arg1_shape_tmp = arg1_br_target_shape;
+                            arg1_rank = arg1_shape_tmp.size();
+                            arg1_new_data.swap(tmp);
+                            arg1_data = arg1_new_data.data();
                         }
                     }
                 }
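A quick sketch of the batch broadcasting described in the comment above: right-align the two batch shapes, pad the shorter with 1s on the left, then take the larger extent wherever one side is 1. The function name below is illustrative, not part of the ngraph API:

#include <algorithm>
#include <cassert>
#include <vector>

std::vector<size_t> broadcast_batches(std::vector<size_t> a, std::vector<size_t> b)
{
    const size_t rank = std::max(a.size(), b.size());
    a.insert(a.begin(), rank - a.size(), 1); // unsqueeze from the left side
    b.insert(b.begin(), rank - b.size(), 1);
    std::vector<size_t> out(rank);
    for (size_t i = 0; i < rank; ++i)
    {
        assert(a[i] == b[i] || a[i] == 1 || b[i] == 1);
        out[i] = std::max(a[i], b[i]);
    }
    return out;
}

int main()
{
    // The batch parts of {2, 1, 2, 4} x {1, 3, 4, 2} (a pair from the test list
    // above) are {2, 1} and {1, 3}, which broadcast to {2, 3}; the matrices are
    // 2x4 and 4x2, so the full output shape is {2, 3, 2, 2}.
    const auto batches = broadcast_batches({2, 1}, {1, 3});
    assert((batches == std::vector<size_t>{2, 3}));
}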

                 // Perform batched dot

-                size_t output_batch_size = 1;
+                const Shape dot_arg0_shape = (arg0_rank > 2) ? Shape{arg0_shape_tmp[arg0_rank - 2],
+                                                                     arg0_shape_tmp[arg0_rank - 1]}
+                                                             : arg0_shape_tmp;
+                const Shape dot_arg1_shape = (arg1_rank > 2) ? Shape{arg1_shape_tmp[arg1_rank - 2],
+                                                                     arg1_shape_tmp[arg1_rank - 1]}
+                                                             : arg1_shape_tmp;
+                const Shape dot_output_shape =
+                    (out_rank > 2 && arg0_rank > 1 && arg1_rank > 1)
+                        ? Shape{out_shape[out_rank - 2], out_shape[out_rank - 1]}
+                        : Shape{out_shape[out_rank - 1]};

                 // Calculate number of batches
-                if (out_rank < 3)
+                size_t output_batch_size = 1;
+                if (out_rank <= 2)
                 {
                     // Output is {batch_size, dot_result}, i.e.,
                     // arg 0 shape {2}, arg1 shape {3, 2, 1}, output shape {3, 1}
@@ -244,38 +247,24 @@ namespace ngraph
                 }
                 else
                 {
-                    for (size_t i = 0; i < (out_rank - 2); i++)
+                    for (size_t i = 0; i < (out_rank - dot_output_shape.size()); i++)
                     {
                         output_batch_size *= out_shape[i];
                     }
                 }

-                Shape dot_arg0_shape = (arg0_rank > 2) ? Shape{wip_arg0_shape[arg0_rank - 2],
-                                                               wip_arg0_shape[arg0_rank - 1]}
-                                                       : wip_arg0_shape;
-                Shape dot_arg1_shape = (arg1_rank > 2) ? Shape{wip_arg1_shape[arg1_rank - 2],
-                                                               wip_arg1_shape[arg1_rank - 1]}
-                                                       : wip_arg1_shape;
-                Shape dot_output_shape =
-                    (out_rank > 2) ? Shape{out_shape[out_rank - 2], out_shape[out_rank - 1]}
-                                   : Shape{out_shape[out_rank - 1]};
-
                 const size_t arg0_offset = (arg0_rank > 2) ? shape_size(dot_arg0_shape) : 0;
                 const size_t arg1_offset = (arg1_rank > 2) ? shape_size(dot_arg1_shape) : 0;
                 const size_t output_offset = shape_size(dot_output_shape);
                 for (size_t i = 0; i < output_batch_size; i++)
                 {
-                    dot(arg0_update + i * arg0_offset,
-                        arg1_update + i * arg1_offset,
-                        out + i * output_offset,
-                        dot_arg0_shape,
-                        dot_arg1_shape,
-                        dot_output_shape,
-                        1);
+                    details::dot(arg0_data + i * arg0_offset,
+                                 arg1_data + i * arg1_offset,
+                                 out + i * output_offset,
+                                 dot_arg0_shape,
+                                 dot_arg1_shape,
+                                 dot_output_shape);
                 }
             }
         }
     }
 }

-NGRAPH_SUPPRESS_DEPRECATED_END
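When an input is left 2D or below, its per-batch offset is 0, so the same matrix block is reused for every batch while out still advances by shape_size(dot_output_shape). For the {2, 2, 1, 3} x {3} tests added below, dot_arg0_shape = {1, 3} and dot_arg1_shape = {3}, so arg0_offset = 3, arg1_offset = 0, output_offset = 1, and the loop runs output_batch_size = 2 * 2 = 4 times; this interplay between the offsets and dot_output_shape.size() appears to be what the "Fix output batch offset" item in the commit message addresses.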

@@ -1008,3 +1008,121 @@ NGRAPH_TEST(${BACKEND_NAME}, matmul_3_x_1_1_3_false_true_const)
     test_case.add_expected_output<float>(shape_out, expected_result);
     test_case.run();
 }
+
+NGRAPH_TEST(${BACKEND_NAME}, matmul_2_2_1_3_x_3_false_false_param)
+{
+    Shape shape_a{2, 2, 1, 3};
+    Shape shape_b{3};
+    Shape shape_out{2, 2, 1};
+
+    bool transpose_a = false;
+    bool transpose_b = false;
+
+    std::vector<float> inputs_a(shape_size(shape_a));
+    std::iota(inputs_a.begin(), inputs_a.end(), 0);
+
+    std::vector<float> inputs_b(shape_size(shape_b));
+    std::iota(inputs_b.begin(), inputs_b.end(), 0);
+
+    std::vector<float> expected_result{5, 14, 23, 32};
+
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    auto B = make_shared<op::Parameter>(element::f32, shape_b);
+    auto matmul = make_shared<op::MatMul>(A, B, transpose_a, transpose_b);
+    auto f = make_shared<Function>(matmul, ParameterVector{A, B});
+
+    auto test_case = test::TestCase<TestEngine>(f);
+    test_case.add_input<float>(inputs_a);
+    test_case.add_input<float>(inputs_b);
+
+    test_case.add_expected_output<float>(shape_out, expected_result);
+    test_case.run();
+}
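With iota-filled inputs the expected values can be checked by hand: each of the four batches of A holds three consecutive values, and B is {0, 1, 2}, so the dot products are 0*0 + 1*1 + 2*2 = 5, 3*0 + 4*1 + 5*2 = 14, 6*0 + 7*1 + 8*2 = 23 and 9*0 + 10*1 + 11*2 = 32; the same expected_result{5, 14, 23, 32} works for all four tests in this block, including the {3} x {2, 2, 3, 1} pair.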
+
+NGRAPH_TEST(${BACKEND_NAME}, matmul_2_2_1_3_x_3_false_false_const)
+{
+    Shape shape_a{2, 2, 1, 3};
+    Shape shape_b{3};
+    Shape shape_out{2, 2, 1};
+
+    bool transpose_a = false;
+    bool transpose_b = false;
+
+    std::vector<float> inputs_a(shape_size(shape_a));
+    std::iota(inputs_a.begin(), inputs_a.end(), 0);
+
+    std::vector<float> inputs_b(shape_size(shape_b));
+    std::iota(inputs_b.begin(), inputs_b.end(), 0);
+
+    std::vector<float> expected_result{5, 14, 23, 32};
+
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    auto B = make_shared<op::Constant>(element::f32, shape_b, inputs_b);
+    auto matmul = make_shared<op::MatMul>(A, B, transpose_a, transpose_b);
+    auto f = make_shared<Function>(matmul, ParameterVector{A});
+
+    auto test_case = test::TestCase<TestEngine>(f);
+    test_case.add_input<float>(inputs_a);
+
+    test_case.add_expected_output<float>(shape_out, expected_result);
+    test_case.run();
+}
+
+NGRAPH_TEST(${BACKEND_NAME}, matmul_3_x_2_2_3_1_false_false_param)
+{
+    Shape shape_a{3};
+    Shape shape_b{2, 2, 3, 1};
+    Shape shape_out{2, 2, 1};
+
+    bool transpose_a = false;
+    bool transpose_b = false;
+
+    std::vector<float> inputs_a(shape_size(shape_a));
+    std::iota(inputs_a.begin(), inputs_a.end(), 0);
+
+    std::vector<float> inputs_b(shape_size(shape_b));
+    std::iota(inputs_b.begin(), inputs_b.end(), 0);
+
+    std::vector<float> expected_result{5, 14, 23, 32};
+
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    auto B = make_shared<op::Parameter>(element::f32, shape_b);
+    auto matmul = make_shared<op::MatMul>(A, B, transpose_a, transpose_b);
+    auto f = make_shared<Function>(matmul, ParameterVector{A, B});
+
+    auto test_case = test::TestCase<TestEngine>(f);
+    test_case.add_input<float>(inputs_a);
+    test_case.add_input<float>(inputs_b);
+
+    test_case.add_expected_output<float>(shape_out, expected_result);
+    test_case.run();
+}
+
+NGRAPH_TEST(${BACKEND_NAME}, matmul_3_x_2_2_3_1_false_false_const)
+{
+    Shape shape_a{3};
+    Shape shape_b{2, 2, 3, 1};
+    Shape shape_out{2, 2, 1};
+
+    bool transpose_a = false;
+    bool transpose_b = false;
+
+    std::vector<float> inputs_a(shape_size(shape_a));
+    std::iota(inputs_a.begin(), inputs_a.end(), 0);
+
+    std::vector<float> inputs_b(shape_size(shape_b));
+    std::iota(inputs_b.begin(), inputs_b.end(), 0);
+
+    std::vector<float> expected_result{5, 14, 23, 32};
+
+    auto A = make_shared<op::Parameter>(element::f32, shape_a);
+    auto B = make_shared<op::Constant>(element::f32, shape_b, inputs_b);
+    auto matmul = make_shared<op::MatMul>(A, B, transpose_a, transpose_b);
+    auto f = make_shared<Function>(matmul, ParameterVector{A});
+
+    auto test_case = test::TestCase<TestEngine>(f);
+    test_case.add_input<float>(inputs_a);
+
+    test_case.add_expected_output<float>(shape_out, expected_result);
+    test_case.run();
+}
@@ -1839,7 +1839,7 @@ NGRAPH_TEST_F(${BACKEND_NAME}, RNNSequenceOp, onnx_model_rnn_fwd_activations_con
                       0.f,
                       0.f,
                   });
-    test_case.run(DEFAULT_FLOAT_TOLERANCE_BITS + 3);
+    test_case.run(DEFAULT_FLOAT_TOLERANCE_BITS + 5);
 }

 NGRAPH_TEST_F(${BACKEND_NAME}, RNNSequenceOp, onnx_model_rnn_fwd_activations)
@@ -1887,7 +1887,7 @@ NGRAPH_TEST_F(${BACKEND_NAME}, RNNSequenceOp, onnx_model_rnn_fwd_activations)
                       0.f,
                       0.f,
                   });
-    test_case.run(DEFAULT_FLOAT_TOLERANCE_BITS + 3);
+    test_case.run(DEFAULT_FLOAT_TOLERANCE_BITS + 5);
 }

 NGRAPH_TEST_F(${BACKEND_NAME}, RNNSequenceOp, onnx_model_rnn_fwd_mixed_seq_len_const)
@@ -1983,7 +1983,7 @@ NGRAPH_TEST_F(${BACKEND_NAME}, RNNSequenceOp, onnx_model_rnn_fwd_mixed_seq_len)
                       -0.18203181f,
                       0.9996245f,
                   });
-    test_case.run(DEFAULT_FLOAT_TOLERANCE_BITS + 3);
+    test_case.run(DEFAULT_FLOAT_TOLERANCE_BITS + 4);
 }

 NGRAPH_TEST_F(${BACKEND_NAME}, RNNSequenceOp, onnx_model_rnn_reverse_mixed_seq_len_const)
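The tolerance bumps above line up with the dot refactoring: the removed reference::dot accumulated in a widened type (typename widen<OUTPUT>::type, under FE_TONEAREST rounding), while the new details::dot accumulates directly in T, so float results can legitimately differ in the last few ULPs; presumably this is the "Relax tests tolerance" item from the commit message.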