CompressQuantizeWeights optimizations (#20025)

* Optimize CompressQuantizeWeights transformation

- remove CoordinateTransform usage from FakeQuantize reference implementation
- move ZeroPointOptimizer functionality inside CompressQuantizeWeights
- compute scale and zero point in the same loop

Ticket: CVS-119273

* review comments

* clang format

* fix comments
This commit is contained in:
Mateusz Tabaka 2023-10-10 11:10:05 +02:00 committed by GitHub
parent ed45a92e30
commit 4a3ce48f7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 1307 additions and 390 deletions

View File

@ -109,7 +109,6 @@ void regmodule_offline_transformations(py::module m) {
[](std::shared_ptr<ov::Model> model) {
ov::pass::Manager manager;
manager.register_pass<ov::pass::CompressQuantizeWeights>();
manager.register_pass<ov::pass::ZeroPointOptimizer>();
manager.run_passes(model);
},
py::arg("model"));

View File

@ -10,7 +10,6 @@ namespace ov {
namespace pass {
class CompressQuantizeWeights;
class ZeroPointOptimizer;
} // namespace pass
} // namespace ov
@ -57,36 +56,10 @@ class ZeroPointOptimizer;
Transformation prepares quantized constant data for Low Precision pipeline.
Such constant data packing reduces IR size (.bin file size) in offline transformations.
With that we can skip the same calculations at runtime and make loading of such sub-graphs into the plugin faster.
Additionally zero point can be fused to weights if it doesn't affect accuracy.
*/
class ov::pass::CompressQuantizeWeights : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("CompressQuantizeWeights", "0");
// Registers the matcher that packs FakeQuantize constant weights into a
// low-precision constant followed by a dequantization subgraph; per the class
// description above, the zero point is fused into the weights when that does
// not affect accuracy.
CompressQuantizeWeights();
};
/*
if zero_point == 0 we can eliminate Subtract from following dequantization subgraph:
+-----------------+
| Constant |
| (low precision) |
+-----------------+
|
v
+------------------+
| Convert |
| (to high prec) |
+------------------+
|
v
+----------+ +------------+
|zero point|--->| Subtract |
+----------+ +-----+------+
|
v
*/
class ov::pass::ZeroPointOptimizer : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("ZeroPointOptimizer");
ZeroPointOptimizer();
};

View File

@ -31,6 +31,7 @@ struct CompressQuantizeWeightsParams {
std::vector<float> expected_weights;
float scale_val;
float zero_point_val;
bool fuse_zero_point;
};
class CompressQuantizeWeightsTests
@ -66,9 +67,14 @@ class CompressQuantizeWeightsTests
auto data = opset8::Constant::create(param.expected_type, param.shape, param.expected_weights);
auto convert = std::make_shared<opset8::Convert>(data, element::f32);
auto scale = opset8::Constant::create(element::f32, Shape{}, {param.scale_val});
auto zero_point = opset8::Constant::create(element::f32, Shape{}, {param.zero_point_val});
auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
auto mul = std::make_shared<opset8::Multiply>(sub, scale);
std::shared_ptr<opset8::Multiply> mul;
if (!param.fuse_zero_point) {
auto zero_point = opset8::Constant::create(element::f32, Shape{}, {param.zero_point_val});
auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
mul = std::make_shared<opset8::Multiply>(sub, scale);
} else {
mul = std::make_shared<opset8::Multiply>(convert, scale);
}
model_ref = std::make_shared<Model>(mul, ParameterVector{});
}
comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
@ -89,7 +95,8 @@ static std::vector<CompressQuantizeWeightsParams> params = {
element::i4,
{-1.0f, -1.0f, 0.0f, 0.0f, 0.0f, 1.0f},
3.0f,
-0.666667f},
-0.666667f,
false},
{Shape{2, 3, 1, 1},
{-1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 11.0f},
0.0f,
@ -100,7 +107,8 @@ static std::vector<CompressQuantizeWeightsParams> params = {
element::i4,
{-8.0f, -5.0f, -4.0f, -2.0f, 0.0f, 7.0f},
0.333333f,
-5.0f},
-5.0f,
false},
{Shape{2, 4, 1, 1},
{-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 11.0f},
1.0f,
@ -109,9 +117,10 @@ static std::vector<CompressQuantizeWeightsParams> params = {
6.0f,
17,
element::i8,
{-8.0f, -8.0f, -8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 8.0f},
{-4.0f, -4.0f, -4.0f, -2.0f, 0.0f, 2.0f, 4.0f, 12.0f},
0.5f,
-4.0f},
-4.0f,
true},
{Shape{2, 4, 1, 1},
{-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 11.0f},
1.0f,
@ -122,7 +131,8 @@ static std::vector<CompressQuantizeWeightsParams> params = {
element::i8,
{-128.0f, -128.0f, -128.0f, -96.0f, -64.0f, -32.0f, 0.0f, 127.0f},
0.0313725f,
-64.25f},
-64.25f,
false},
};
static element::TypeVector data_precisions = {element::f32, element::f16};
@ -198,7 +208,7 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithDequantizationSubgraphFP
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
}
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizer) {
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminated) {
{
auto data = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.144816, 0.0858578, 0.110928});
auto input_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
@ -209,7 +219,6 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizer) {
model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{});
manager.register_pass<ov::pass::CompressQuantizeWeights>();
manager.register_pass<ov::pass::ZeroPointOptimizer>();
}
{
@ -223,7 +232,7 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizer) {
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
}
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizerFP16) {
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedFP16) {
{
auto data = opset8::Constant::create(element::f16, Shape{3, 1, 1, 1}, {0.2, 1.2, 1.2});
auto input_low =
@ -239,7 +248,6 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizerFP16)
model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{});
manager.register_pass<ov::pass::CompressQuantizeWeights>();
manager.register_pass<ov::pass::ZeroPointOptimizer>();
}
{
@ -253,7 +261,7 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizerFP16)
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
}
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsWithZeroPointOptimizer) {
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeights) {
{
auto data = opset8::Constant::create(element::f32, Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11});
auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
@ -264,7 +272,6 @@ TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsWithZeroPointOptimiz
model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{});
manager.register_pass<ov::pass::CompressQuantizeWeights>();
manager.register_pass<ov::pass::ZeroPointOptimizer>();
}
{
auto data = opset8::Constant::create(element::i8, Shape{2, 4, 1, 1}, {-128, -128, -128, -96, -64, -32, 0, 127});
@ -289,7 +296,6 @@ TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsNonConstantInput) {
model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{data});
manager.register_pass<ov::pass::CompressQuantizeWeights>();
manager.register_pass<ov::pass::ZeroPointOptimizer>();
comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);

View File

@ -21,31 +21,86 @@ namespace ov {
namespace reference {
namespace fake_quantize_details {
// Quantizes a single scalar to the FakeQuantize grid.
// Values at or below min(in_low, in_high) clamp to out_low and values above
// max(in_low, in_high) clamp to out_high; everything in between is snapped to
// one of (levels_minus_one + 1) evenly spaced points of the input interval and
// then mapped linearly onto [out_low, out_high].
// Note: `levels_minus_one` is passed as T (levels - 1) so the division needs
// no per-call conversion.
template <typename T>
static inline T quantize(const T arg,
                         const T in_low,
                         const T in_high,
                         const T out_low,
                         const T out_high,
                         const T levels_minus_one) {
    if (arg <= std::min(in_low, in_high)) {
        return out_low;
    } else if (arg > std::max(in_low, in_high)) {
        return out_high;
    }
    // std::nearbyint honors the current rounding mode (round-to-nearest by
    // default), matching the reference FakeQuantize behavior.
    return static_cast<T>(std::nearbyint((arg - in_low) / (in_high - in_low) * levels_minus_one) / levels_minus_one *
                              (out_high - out_low) +
                          out_low);
}
static std::vector<size_t> compute_strides(const ov::Shape& out_shape, const ov::Shape& shape);
static std::tuple<size_t, size_t> get_inner_stride(size_t num_output_elements,
const ov::Shape& output_shape,
const ov::Shape& shape,
size_t current_output_inner_stride);
template <typename T, typename F>
static void fake_quantize_non_unit_inner_stride(const T* arg,
const T* in_low,
const T* in_high,
const T* out_low,
const T* out_high,
T* out,
const Shape& arg_shape,
T levels_minus_one,
size_t input_inner_stride,
const F& get_outer_strides);
template <typename T, typename F>
static void fake_quantize_unit_inner_stride(const T* arg,
const T* in_low,
const T* in_high,
const T* out_low,
const T* out_high,
T* out,
const Shape& arg_shape,
T levels_minus_one,
size_t input_inner_stride,
const F& get_outer_strides);
template <typename T, typename F>
static void fake_quantize_unit_output_intervals_inner_stride(const T* arg,
const T* in_low,
const T* in_high,
const T* out_low,
const T* out_high,
T* out,
const Shape& arg_shape,
T levels_minus_one,
size_t input_inner_stride,
const F& get_outer_strides);
template <typename T, typename F>
static void fake_quantize_unit_input_intervals_inner_stride(const T* arg,
const T* in_low,
const T* in_high,
const T* out_low,
const T* out_high,
T* out,
const Shape& arg_shape,
T levels_minus_one,
size_t input_inner_stride,
const F& get_outer_strides);
} // namespace fake_quantize_details
template <typename T>
void fake_quantize(const T* const arg,
const T* const in_low,
const T* const in_high,
const T* const out_low,
const T* const out_high,
T* const out,
void fake_quantize(const T* arg,
const T* in_low,
const T* in_high,
const T* out_low,
const T* out_high,
T* out,
const Shape& arg_shape,
const Shape& in_low_shape,
const Shape& in_high_shape,
@ -55,133 +110,452 @@ void fake_quantize(const T* const arg,
const op::AutoBroadcastSpec& broadcast) {
using namespace fake_quantize_details;
T levels_minus_one = static_cast<T>(levels - 1);
const size_t arg_size = shape_size(arg_shape);
if (shape_size(in_low_shape) == 1 && shape_size(in_high_shape) == 1 && shape_size(out_low_shape) == 1 &&
shape_size(out_high_shape) == 1) {
const size_t arg_size = shape_size(arg_shape);
const auto q = [=](const T& a) {
return quantize(a, *in_low, *in_high, *out_low, *out_high, levels);
};
for (size_t i = 0; i < arg_size; ++i) {
out[i] = q(arg[i]);
out[i] = quantize(arg[i], *in_low, *in_high, *out_low, *out_high, levels_minus_one);
}
return;
}
// clang-format off
/*
* ---------------------------------------------------
* Overview:
* Numpy broadcasted input tensors can be partitioned into two: outer and inner part (which also defines inner
* stride as a product of inner part), so N-dimensional tensors can be processed using two loops.
*
* For example with two inputs [2, 2, 3, 4] and [1, 1, 3, 4] we can have:
* input 1 with shape [2, 2, 3, 4] can be divided into outer part [2, 2] and inner part [3, 4]
* with inner stride = 12 (3 * 4).
* input 2 with shape [1, 1, 3, 4] can be divided into outer part [1, 1]
* and inner part [3, 4] with inner stride = 12 (3 * 4)
*
* Having that, those inputs can be processed by the following:
*
* output_shape = {2, 2, 3, 4};
* output_inner_stride = 12;
* for (i = 0; i < shape_size(shape); i += output_inner_stride) {
* first_input_stride = i;
* second_input_stride = 0;
* for (j = 0; j < 12; j++) {
* *out++ = f(first_input[first_input_stride + j], second_input[second_input_stride + j]);
* }
* }
*
* ---------------------------------------------------
* How the partitioning is done:
* Partitioning process starts with the last dimension of input tensor shape and it stops when either one of below
* occurs:
* - if the last dimension is equal to 1, partitioning stops at the dimension that is greater than 1 (this
* dimension is not included in the inner part),
* - if the last dimension is greater than 1, partitioning stops at the dimension that is equal to 1 (this
* dimension is not included in the inner part).
*
* Examples:
* tensor_shape=[2, 3, 4, 5], inner_part = [2, 3, 4, 5], inner_stride = 120
* tensor_shape=[1, 1, 4, 5], inner_part = [4, 5], inner_stride = 20
* tensor_shape=[2, 3, 1, 1], inner_part = [1, 1], inner_stride = 1
*
*
* ---------------------------------------------------
* How the output inner stride is calculated:
* Inner part (and inner stride) for every input tensor is determined. Then the size of output inner part is the
* size of inner part with the fewest number of dimensions.
*
* Example with 5 inputs:
* input 1 shape [2, 3, 4, 5], inner_part = [2, 3, 4, 5], inner_stride = 120
* input 2 shape [1, 3, 4, 5], inner_part = [3, 4, 5], inner_stride = 60
* input 3 shape [2, 3, 1, 1], inner_part = [1, 1], inner_stride = 1
* input 4 shape [2, 1, 1, 1], inner_part = [1, 1, 1], inner_stride = 1
* input 5 shape [1, 1, 1, 1], inner_part = [1, 1, 1, 1], inner_stride = 1
*
* output shape [2, 3, 4, 5], inner_part = [4, 5], inner_stride = 20
*
* Inner part with fewest number of elements is [1, 1] for input 3. So the inner part for output shape is [4, 5]
* and output inner stride is 20.
*/
// clang-format on
std::vector<size_t> output_strides = compute_strides(arg_shape, arg_shape);
std::vector<size_t> in_low_strides = compute_strides(arg_shape, in_low_shape);
std::vector<size_t> in_high_strides = compute_strides(arg_shape, in_high_shape);
std::vector<size_t> out_low_strides = compute_strides(arg_shape, out_low_shape);
std::vector<size_t> out_high_strides = compute_strides(arg_shape, out_high_shape);
size_t input_inner_stride = arg_size;
size_t in_low_inner_stride = 0;
size_t in_high_inner_stride = 0;
size_t out_low_inner_stride = 0;
size_t out_high_inner_stride = 0;
std::tie(in_low_inner_stride, input_inner_stride) =
get_inner_stride(arg_size, arg_shape, in_low_shape, input_inner_stride);
std::tie(in_high_inner_stride, input_inner_stride) =
get_inner_stride(arg_size, arg_shape, in_high_shape, input_inner_stride);
std::tie(out_low_inner_stride, input_inner_stride) =
get_inner_stride(arg_size, arg_shape, out_low_shape, input_inner_stride);
std::tie(out_high_inner_stride, input_inner_stride) =
get_inner_stride(arg_size, arg_shape, out_high_shape, input_inner_stride);
auto get_outer_strides =
[&output_strides, &in_low_strides, &in_high_strides, &out_low_strides, &out_high_strides](size_t flat_index) {
size_t in_low_stride = 0;
size_t in_high_stride = 0;
size_t out_low_stride = 0;
size_t out_high_stride = 0;
for (size_t i = 0; i < output_strides.size(); i++) {
size_t div = flat_index / output_strides[i];
flat_index = flat_index % output_strides[i];
in_low_stride += div * in_low_strides[i];
in_high_stride += div * in_high_strides[i];
out_low_stride += div * out_low_strides[i];
out_high_stride += div * out_high_strides[i];
}
return std::tuple<size_t, size_t, size_t, size_t>{in_low_stride,
in_high_stride,
out_low_stride,
out_high_stride};
};
if (in_low_inner_stride > 1 && in_high_inner_stride > 1 && out_low_inner_stride > 1 && out_high_inner_stride > 1) {
fake_quantize_non_unit_inner_stride(arg,
in_low,
in_high,
out_low,
out_high,
out,
arg_shape,
levels_minus_one,
input_inner_stride,
get_outer_strides);
} else if (in_low_inner_stride == 1 && in_high_inner_stride == 1 && out_low_inner_stride == 1 &&
out_high_inner_stride == 1) {
fake_quantize_unit_inner_stride(arg,
in_low,
in_high,
out_low,
out_high,
out,
arg_shape,
levels_minus_one,
input_inner_stride,
get_outer_strides);
} else if (in_low_inner_stride > 1 && in_high_inner_stride > 1 && out_low_inner_stride == 1 &&
out_high_inner_stride == 1) {
fake_quantize_unit_output_intervals_inner_stride(arg,
in_low,
in_high,
out_low,
out_high,
out,
arg_shape,
levels_minus_one,
input_inner_stride,
get_outer_strides);
} else if (in_low_inner_stride == 1 && in_high_inner_stride == 1 && out_low_inner_stride > 1 &&
out_high_inner_stride > 1) {
fake_quantize_unit_input_intervals_inner_stride(arg,
in_low,
in_high,
out_low,
out_high,
out,
arg_shape,
levels_minus_one,
input_inner_stride,
get_outer_strides);
} else {
OPENVINO_ASSERT(in_low_shape.size() <= arg_shape.size() && in_high_shape.size() <= arg_shape.size() &&
out_low_shape.size() <= arg_shape.size() && out_high_shape.size() <= arg_shape.size(),
"Tensors with input\\output ranges should have rank less or "
"equal to data tensor rank equal to ",
arg_shape.size());
size_t in_low_stride = 0;
size_t in_high_stride = 0;
size_t out_low_stride = 0;
size_t out_high_stride = 0;
Shape arg0_padded_shape = arg_shape;
Shape arg1_padded_shape = in_low_shape;
Shape arg2_padded_shape = in_high_shape;
Shape arg3_padded_shape = out_low_shape;
Shape arg4_padded_shape = out_high_shape;
size_t max_shape_size = arg_shape.size();
while (arg0_padded_shape.size() < max_shape_size) {
arg0_padded_shape.insert(arg0_padded_shape.begin(), 1);
}
while (arg1_padded_shape.size() < max_shape_size) {
arg1_padded_shape.insert(arg1_padded_shape.begin(), 1);
}
while (arg2_padded_shape.size() < max_shape_size) {
arg2_padded_shape.insert(arg2_padded_shape.begin(), 1);
}
while (arg3_padded_shape.size() < max_shape_size) {
arg3_padded_shape.insert(arg3_padded_shape.begin(), 1);
}
while (arg4_padded_shape.size() < max_shape_size) {
arg4_padded_shape.insert(arg4_padded_shape.begin(), 1);
}
Shape arg0_squeezed_shape, arg1_squeezed_shape, arg2_squeezed_shape, arg3_squeezed_shape, arg4_squeezed_shape;
AxisSet arg0_squeezed_axes, arg1_squeezed_axes, arg2_squeezed_axes, arg3_squeezed_axes, arg4_squeezed_axes;
Shape output_shape;
for (size_t i = 0; i < max_shape_size; i++) {
if (arg1_padded_shape[i] == 1) {
arg1_squeezed_axes.insert(i);
} else {
arg1_squeezed_shape.push_back(arg1_padded_shape[i]);
}
if (arg2_padded_shape[i] == 1) {
arg2_squeezed_axes.insert(i);
} else {
arg2_squeezed_shape.push_back(arg2_padded_shape[i]);
}
if (arg0_padded_shape[i] == 1) {
arg0_squeezed_axes.insert(i);
} else {
arg0_squeezed_shape.push_back(arg0_padded_shape[i]);
}
if (arg3_padded_shape[i] == 1) {
arg3_squeezed_axes.insert(i);
} else {
arg3_squeezed_shape.push_back(arg3_padded_shape[i]);
}
if (arg4_padded_shape[i] == 1) {
arg4_squeezed_axes.insert(i);
} else {
arg4_squeezed_shape.push_back(arg4_padded_shape[i]);
}
output_shape.push_back(std::max({arg0_padded_shape[i],
arg2_padded_shape[i],
arg1_padded_shape[i],
arg3_padded_shape[i],
arg4_padded_shape[i]}));
}
CoordinateTransformBasic arg0_transform(arg0_squeezed_shape);
CoordinateTransformBasic arg1_transform(arg1_squeezed_shape);
CoordinateTransformBasic arg2_transform(arg2_squeezed_shape);
CoordinateTransformBasic arg3_transform(arg3_squeezed_shape);
CoordinateTransformBasic arg4_transform(arg4_squeezed_shape);
CoordinateTransformBasic output_transform(output_shape);
const auto arg0_strides = row_major_strides(arg0_squeezed_shape);
const auto arg1_strides = row_major_strides(arg1_squeezed_shape);
const auto arg2_strides = row_major_strides(arg2_squeezed_shape);
const auto arg3_strides = row_major_strides(arg3_squeezed_shape);
const auto arg4_strides = row_major_strides(arg4_squeezed_shape);
const auto output_strides = row_major_strides(output_shape);
for (const Coordinate& output_coord : output_transform) {
const auto arg0_coord = util::reduce(output_coord, arg0_squeezed_axes);
const auto arg1_coord = util::reduce(output_coord, arg1_squeezed_axes);
const auto arg2_coord = util::reduce(output_coord, arg2_squeezed_axes);
const auto arg3_coord = util::reduce(output_coord, arg3_squeezed_axes);
const auto arg4_coord = util::reduce(output_coord, arg4_squeezed_axes);
const size_t arg0_idx =
std::inner_product(arg0_coord.begin(), arg0_coord.end(), arg0_strides.begin(), uint64_t(0));
const size_t arg1_idx =
std::inner_product(arg1_coord.begin(), arg1_coord.end(), arg1_strides.begin(), uint64_t(0));
const size_t arg2_idx =
std::inner_product(arg2_coord.begin(), arg2_coord.end(), arg2_strides.begin(), uint64_t(0));
const size_t arg3_idx =
std::inner_product(arg3_coord.begin(), arg3_coord.end(), arg3_strides.begin(), uint64_t(0));
const size_t arg4_idx =
std::inner_product(arg4_coord.begin(), arg4_coord.end(), arg4_strides.begin(), uint64_t(0));
const size_t output_idx =
std::inner_product(output_coord.begin(), output_coord.end(), output_strides.begin(), uint64_t(0));
out[output_idx] = quantize(arg[arg0_idx],
in_low[arg1_idx],
in_high[arg2_idx],
out_low[arg3_idx],
out_high[arg4_idx],
levels);
for (size_t i = 0; i < arg_size; i++) {
std::tie(in_low_stride, in_high_stride, out_low_stride, out_high_stride) = get_outer_strides(i);
*out++ = quantize(*arg++,
*(in_low + in_low_stride),
*(in_high + in_high_stride),
*(out_low + out_low_stride),
*(out_high + out_low_stride),
levels_minus_one);
}
}
}
namespace fake_quantize_details {
// Computes per-dimension strides of `shape` when it is numpy-broadcast against
// `out_shape`. A trailing dimension that matches the corresponding output
// dimension gets its real row-major stride; a broadcast (or absent) dimension
// gets stride 0 so indexing never advances along it.
std::vector<size_t> compute_strides(const ov::Shape& out_shape, const ov::Shape& shape) {
    const size_t out_rank = out_shape.size();
    const size_t rank = shape.size();
    std::vector<size_t> strides(out_rank, 0);
    size_t running = 1;
    // Walk both shapes right-aligned, from the innermost dimension outwards.
    for (size_t dist = 1; dist <= out_rank; ++dist) {
        const size_t out_idx = out_rank - dist;
        if (dist <= rank) {
            const size_t dim = shape[rank - dist];
            if (dim == out_shape[out_idx]) {
                strides[out_idx] = running;
                running *= dim;
            }
        }
    }
    return strides;
}
// Splits `shape` (broadcast against `output_shape`) into outer and inner parts
// as described in the overview comment of fake_quantize(). Returns a pair:
//   first  - the inner stride of `shape` (number of contiguous elements that
//            can be walked linearly),
//   second - `current_output_inner_stride` possibly narrowed so the shared
//            inner loop remains valid for every input processed so far.
std::tuple<size_t, size_t> get_inner_stride(size_t num_output_elements,
                                            const ov::Shape& output_shape,
                                            const ov::Shape& shape,
                                            size_t current_output_inner_stride) {
    // A scalar behaves like an all-ones shape: inner stride is 1.
    if (shape.size() == 0)
        return std::tuple<size_t, size_t>{1, std::min(current_output_inner_stride, num_output_elements)};
    const size_t last = shape.back();
    // Scan from the back for the first dimension that "switches" relative to
    // the last one (1 -> >1, or >1 -> 1); dimensions after it form the inner part.
    auto it = std::find_if(shape.rbegin(), shape.rend(), [last](size_t dim) {
        return (last == 1 && dim > 1) || (last > 1 && dim == 1);
    });
    if (it == shape.rend()) {
        // No switch found: the whole shape is the inner part.
        const size_t num_elements = shape_size(shape);
        return std::tuple<size_t, size_t>{
            num_elements,
            // An all-ones shape (last == 1) broadcasts over the output, so it
            // does not constrain the output inner stride.
            last == 1 ? current_output_inner_stride : std::min(current_output_inner_stride, num_elements)};
    }
    // std::distance(it, rbegin) is negative here; adding shape.size() turns the
    // reverse-iterator position into the forward index just past the switch
    // dimension, i.e. the first dimension of the inner part.
    const size_t idx = std::distance(it, shape.rbegin()) + static_cast<int64_t>(shape.size());
    const size_t inner_stride =
        std::accumulate(shape.begin() + idx, shape.end(), static_cast<size_t>(1), std::multiplies<size_t>());
    // The matching inner stride of the output shape (shapes are right-aligned).
    const size_t output_inner_stride = std::accumulate(output_shape.begin() + output_shape.size() - shape.size() + idx,
                                                       output_shape.end(),
                                                       static_cast<size_t>(1),
                                                       std::multiplies<size_t>());
    return std::tuple<size_t, size_t>{inner_stride, std::min(current_output_inner_stride, output_inner_stride)};
}
// Element-wise zip of three equal-length ranges: out[i] = f(a[i], b[i], c[i])
// for every i in [0, last1 - first1).
template <typename T, typename F>
static void transform(const T* first1, const T* const last1, const T* first2, const T* first3, T* out, const F& f) {
    const std::ptrdiff_t count = last1 - first1;
    for (std::ptrdiff_t i = 0; i < count; ++i) {
        out[i] = f(first1[i], first2[i], first3[i]);
    }
}
// Element-wise zip of five equal-length ranges:
// out[i] = f(a[i], b[i], c[i], d[i], e[i]) for every i in [0, last1 - first1).
template <typename T, typename F>
static void transform(const T* first1,
                      const T* const last1,
                      const T* first2,
                      const T* first3,
                      const T* first4,
                      const T* first5,
                      T* out,
                      const F& f) {
    const std::ptrdiff_t count = last1 - first1;
    for (std::ptrdiff_t i = 0; i < count; ++i) {
        out[i] = f(first1[i], first2[i], first3[i], first4[i], first5[i]);
    }
}
// Shared outer loop of all fake-quantize specializations: walks the flattened
// data in chunks of `input_inner_stride`, resolves the per-chunk offsets into
// the four interval tensors via `get_outer_strides`, and delegates the chunk
// itself to `quantize_loop`.
template <typename T, typename F1, typename F2>
static void fake_quantize_loop(const Shape& arg_shape,
                               const T* arg,
                               const T* in_low,
                               const T* in_high,
                               const T* out_low,
                               const T* out_high,
                               T* out,
                               size_t input_inner_stride,
                               const F1& get_outer_strides,
                               const F2& quantize_loop) {
    size_t in_low_stride = 0;
    size_t in_high_stride = 0;
    size_t out_low_stride = 0;
    size_t out_high_stride = 0;
    // Hoisted out of the loop: the element count is invariant, so there is no
    // need to recompute shape_size on every iteration.
    const size_t arg_size = shape_size(arg_shape);
    for (size_t i = 0; i < arg_size; i += input_inner_stride) {
        std::tie(in_low_stride, in_high_stride, out_low_stride, out_high_stride) = get_outer_strides(i);
        quantize_loop(arg,
                      arg + input_inner_stride,
                      in_low + in_low_stride,
                      in_high + in_high_stride,
                      out_low + out_low_stride,
                      out_high + out_high_stride,
                      out);
        arg += input_inner_stride;
        out += input_inner_stride;
    }
}
// Specialization for the case where all four interval tensors have a non-unit
// inner stride: every element of a chunk reads its own in/out interval values.
template <typename T, typename F>
void fake_quantize_non_unit_inner_stride(const T* arg,
                                         const T* in_low,
                                         const T* in_high,
                                         const T* out_low,
                                         const T* out_high,
                                         T* out,
                                         const Shape& arg_shape,
                                         T levels_minus_one,
                                         size_t input_inner_stride,
                                         const F& get_outer_strides) {
    auto quantize_with_vector_intervals = [levels_minus_one](const T* input,
                                                             const T* const input_end,
                                                             const T* in_low,
                                                             const T* in_high,
                                                             const T* out_low,
                                                             const T* out_high,
                                                             T* out) {
        transform(input,
                  input_end,
                  in_low,
                  in_high,
                  out_low,
                  out_high,
                  out,
                  [levels_minus_one](T value, T lo_in, T hi_in, T lo_out, T hi_out) {
                      return quantize(value, lo_in, hi_in, lo_out, hi_out, levels_minus_one);
                  });
    };
    fake_quantize_loop(arg_shape,
                       arg,
                       in_low,
                       in_high,
                       out_low,
                       out_high,
                       out,
                       input_inner_stride,
                       get_outer_strides,
                       quantize_with_vector_intervals);
}
// Specialization for the case where all four interval tensors have a unit
// inner stride: each chunk uses a single scalar per interval tensor, loaded
// once before the inner loop.
template <typename T, typename F>
void fake_quantize_unit_inner_stride(const T* arg,
                                     const T* in_low,
                                     const T* in_high,
                                     const T* out_low,
                                     const T* out_high,
                                     T* out,
                                     const Shape& arg_shape,
                                     T levels_minus_one,
                                     size_t input_inner_stride,
                                     const F& get_outer_strides) {
    fake_quantize_loop(arg_shape,
                       arg,
                       in_low,
                       in_high,
                       out_low,
                       out_high,
                       out,
                       input_inner_stride,
                       get_outer_strides,
                       [levels_minus_one](const T* input,
                                          const T* const input_end,
                                          const T* in_low,
                                          const T* in_high,
                                          const T* out_low,
                                          const T* out_high,
                                          T* out) {
                           // All intervals are scalars for this chunk.
                           const T lo_in = *in_low;
                           const T hi_in = *in_high;
                           const T lo_out = *out_low;
                           const T hi_out = *out_high;
                           std::transform(input, input_end, out, [=](T value) {
                               return quantize(value, lo_in, hi_in, lo_out, hi_out, levels_minus_one);
                           });
                       });
}
// Specialization for scalar output intervals per chunk (unit inner stride)
// combined with per-element input intervals.
template <typename T, typename F>
void fake_quantize_unit_output_intervals_inner_stride(const T* arg,
                                                      const T* in_low,
                                                      const T* in_high,
                                                      const T* out_low,
                                                      const T* out_high,
                                                      T* out,
                                                      const Shape& arg_shape,
                                                      T levels_minus_one,
                                                      size_t input_inner_stride,
                                                      const F& get_outer_strides) {
    fake_quantize_loop(arg_shape,
                       arg,
                       in_low,
                       in_high,
                       out_low,
                       out_high,
                       out,
                       input_inner_stride,
                       get_outer_strides,
                       [levels_minus_one](const T* input,
                                          const T* const input_end,
                                          const T* in_low,
                                          const T* in_high,
                                          const T* out_low,
                                          const T* out_high,
                                          T* out) {
                           // Output interval is a scalar for this chunk; input
                           // intervals vary per element.
                           const T lo_out = *out_low;
                           const T hi_out = *out_high;
                           transform(input, input_end, in_low, in_high, out, [=](T value, T lo_in, T hi_in) {
                               return quantize(value, lo_in, hi_in, lo_out, hi_out, levels_minus_one);
                           });
                       });
}
// Specialization for scalar input intervals per chunk (unit inner stride)
// combined with per-element output intervals.
template <typename T, typename F>
void fake_quantize_unit_input_intervals_inner_stride(const T* arg,
                                                     const T* in_low,
                                                     const T* in_high,
                                                     const T* out_low,
                                                     const T* out_high,
                                                     T* out,
                                                     const Shape& arg_shape,
                                                     T levels_minus_one,
                                                     size_t input_inner_stride,
                                                     const F& get_outer_strides) {
    fake_quantize_loop(arg_shape,
                       arg,
                       in_low,
                       in_high,
                       out_low,
                       out_high,
                       out,
                       input_inner_stride,
                       get_outer_strides,
                       [levels_minus_one](const T* input,
                                          const T* const input_end,
                                          const T* in_low,
                                          const T* in_high,
                                          const T* out_low,
                                          const T* out_high,
                                          T* out) {
                           // Input interval is a scalar for this chunk; output
                           // intervals vary per element.
                           const T lo_in = *in_low;
                           const T hi_in = *in_high;
                           transform(input, input_end, out_low, out_high, out, [=](T value, T lo_out, T hi_out) {
                               return quantize(value, lo_in, hi_in, lo_out, hi_out, levels_minus_one);
                           });
                       });
}
} // namespace fake_quantize_details
} // namespace reference
} // namespace ov

View File

@ -253,6 +253,22 @@ std::vector<FakeQuantizeParams> generateParamsForFakeQuantize() {
}),
16,
op::AutoBroadcastSpec(op::AutoBroadcastType::NUMPY)),
FakeQuantizeParams(
ov::Shape{1, 2, 4, 4},
ov::Shape{1, 2, 4, 4},
IN_ET,
IN_ET,
iota_vector<T>(shape_size(Shape{1, 2, 4, 4})),
std::vector<T>{
0, 0, 0, 0, 0, 0, 0, 0, 0, 8.75, 8.75, 8.75, 8.75, 8.75, 8.75, 17.5,
23.75, 23.75, 27.5, 27.5, 27.5, 27.5, 27.5, 31.25, 31.25, 31.25, 31.25, 31.25, 35, 35, 35, 35,
},
op::v0::Constant::create(IN_ET, Shape{1, 2, 1, 1}, {5.f, 10.f}),
op::v0::Constant::create(IN_ET, Shape{1, 1}, {30.f}),
op::v0::Constant::create(IN_ET, Shape{2, 1, 1}, {0.f, 20.f}),
op::v0::Constant::create(IN_ET, Shape{1}, {35.f}),
5),
};
return params;
}