From 6ff0cad127d91d02df3cfacbeb3133c49d57f214 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Mon, 24 Apr 2023 11:13:04 +0200
Subject: [PATCH] Fix mixed precision inference for quantized IRs (#16785)

* disable mixed precision inference for quantized IRs

* typo fix

* improved solution, disable mixed precision in quantized IRs selectively only for float nodes

* minor typos correction

* added unit-tests

* renamed rt_info

* updated list of nodes for which FQ is propagated; updated unit-tests

* fix failing build
---
 ...k_subgraphs_to_keep_in_mixed_precision.cpp | 118 ++++++--
 ...bgraph_to_keep_in_mixed_precision_test.cpp | 281 +++++++++---------
 .../tests/utils/convert_precision.cpp         | 116 ++++++++
 3 files changed, 361 insertions(+), 154 deletions(-)

diff --git a/src/common/transformations/src/transformations/common_optimizations/mark_subgraphs_to_keep_in_mixed_precision.cpp b/src/common/transformations/src/transformations/common_optimizations/mark_subgraphs_to_keep_in_mixed_precision.cpp
index 7238ca870c7..e276efa9aaf 100644
--- a/src/common/transformations/src/transformations/common_optimizations/mark_subgraphs_to_keep_in_mixed_precision.cpp
+++ b/src/common/transformations/src/transformations/common_optimizations/mark_subgraphs_to_keep_in_mixed_precision.cpp
@@ -8,6 +8,7 @@
 #include "openvino/op/util/broadcast_base.hpp"
 #include "openvino/op/util/gather_base.hpp"
 #include "openvino/opsets/opset10.hpp"
+#include "openvino/opsets/opset11.hpp"
 #include "openvino/opsets/opset2.hpp"
 #include "openvino/pass/manager.hpp"
 #include "openvino/pass/pattern/op/or.hpp"
@@ -23,6 +24,30 @@
 using namespace ov::opset10;
 
 namespace ov {
 namespace pass {
 
+void mark_reduceop_path(const std::shared_ptr<Node>& node) {
+    node->get_rt_info().emplace("reduceop_path", true);
+}
+bool is_reduceop_path(const std::shared_ptr<Node>& node) {
+    return node->get_rt_info().count("reduceop_path");
+}
+
+void erase_reduceop_path(const std::shared_ptr<Node>& node) {
+    auto& rt_info = node->get_rt_info();
+    rt_info.erase("reduceop_path");
+}
+
+void mark_fq_path(const std::shared_ptr<Node>& node) {
+    node->get_rt_info().emplace("fq_path", true);
+}
+bool is_fq_path(const std::shared_ptr<Node>& node) {
+    return node->get_rt_info().count("fq_path");
+}
+
+void erase_fq_path(const std::shared_ptr<Node>& node) {
+    auto& rt_info = node->get_rt_info();
+    rt_info.erase("fq_path");
+}
+
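The six helpers above implement one small idiom: a temporary boolean marker stored in a node's runtime-info map. A minimal standalone sketch of the same idiom (illustrative only, not part of the patch; "my_marker" is a made-up key):

    #include "openvino/core/node.hpp"

    void mark(const std::shared_ptr<ov::Node>& node) {
        node->get_rt_info().emplace("my_marker", true);  // no-op if already marked
    }
    bool is_marked(const std::shared_ptr<ov::Node>& node) {
        return node->get_rt_info().count("my_marker") > 0;
    }
    void unmark(const std::shared_ptr<ov::Node>& node) {
        // temporary markers must be erased before the pass returns,
        // so they never leak into a serialized IR (see run_on_model below)
        node->get_rt_info().erase("my_marker");
    }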
 // Marking continues to propagate through these ops.
 std::shared_ptr<Node> propagate_through_ops = pattern::wrap_type<...>();
 
-        auto convert_node = dynamic_pointer_cast<Convert>(node);
+        auto convert_node = as_type_ptr<Convert>(node);
         if (convert_node) {
             // if during propagating up there is a Convert it must go to Const,
             // otherwise interrupt propagation
-            auto const_node = dynamic_pointer_cast<Constant>(node->input_value(0).get_node_shared_ptr());
+            auto const_node = as_type_ptr<Constant>(node->input_value(0).get_node_shared_ptr());
             if (!const_node)
                 return false;
         }
@@ -106,7 +131,7 @@ public:
             return false;
 
         // on convert down propagation should be interrupted
-        auto convert_node = dynamic_pointer_cast<Convert>(node);
+        auto convert_node = as_type_ptr<Convert>(node);
         if (convert_node)
             return false;
 
@@ -114,6 +139,11 @@ public:
         for (const auto& in_node : node->input_values()) {
             if (!in_node.get_element_type().is_real())
                 continue;
+            if (is_fq_path(in_node.get_node_shared_ptr())) {
+                enable_fp16_compression(node);
+                return true;
+            }
+
             if (fp16_compression_is_disabled(in_node.get_node_shared_ptr())) {
                 disable_fp16_compression(node);
                 is_changed = true;
@@ -127,18 +157,6 @@ public:
     }
 };
 
-void mark_reduceop_path(const std::shared_ptr<Node>& node) {
-    node->get_rt_info().emplace("reduceop_path", true);
-}
-bool is_reduceop_path(const std::shared_ptr<Node>& node) {
-    return node->get_rt_info().count("reduceop_path");
-}
-
-void erase_reduceop_path(const std::shared_ptr<Node>& node) {
-    auto& rt_info = node->get_rt_info();
-    rt_info.erase("reduceop_path");
-}
-
 class InitMarkReduceOpPath : public pass::MatcherPass {
 public:
     OPENVINO_RTTI("InitMarkReduceOpPath", "0");
@@ -267,11 +285,11 @@ public:
             if (!m.get_match_root())
                 return false;
 
-            const auto mul = std::dynamic_pointer_cast<Multiply>(m.get_match_root());
+            const auto mul = as_type_ptr<Multiply>(m.get_match_root());
             // if pattern input_1*Pow(Maximum(input_2, eps), z) or input_1*Pow(Add(input_2, eps), z) is matched
             // need to check that power is negative
             if (mul) {
-                const auto pow_const = std::dynamic_pointer_cast<Constant>(pattern_to_output.at(pow_exp));
+                const auto pow_const = as_type_ptr<Constant>(pattern_to_output.at(pow_exp));
                 if (pow_const) {
                     // continue only if exponent is negative (z < 0)
                     if (pow_const->get_element_type() == element::f16) {
@@ -286,7 +304,7 @@ public:
                 }
             }
 
-            const auto eps_const = std::dynamic_pointer_cast<Constant>(pattern_to_output.at(eps_const_pattern));
+            const auto eps_const = as_type_ptr<Constant>(pattern_to_output.at(eps_const_pattern));
             if (!eps_const)
                 return false;
             if (eps_const->get_element_type() == element::f32) {
@@ -307,6 +325,68 @@ public:
     }
 };
 
+class PropagateDownDisableSensitivityForQuantized : public pass::MatcherPass {
+public:
+    OPENVINO_RTTI("DisableMarkingForQuantizedNodes", "0");
+    PropagateDownDisableSensitivityForQuantized() {
+        MATCHER_SCOPE(PropagateDownDisableSensitivityForQuantized);
+
+        // the FQ marking is propagated down through these nodes
+        std::shared_ptr<Node> quantization_propagating_nodes = pattern::wrap_type<...>();
+
+        matcher_pass_callback callback = [=](pattern::Matcher& m) {
+            const auto& node = m.get_match_root();
+            if (!node)
+                return false;
+
+            auto is_quantize = as_type_ptr<FakeQuantize>(node);
+            if (is_quantize) {
+                mark_fq_path(node);
+                return true;
+            }
+
+            bool is_changed = false;
+
+            for (const auto& in_node_output : node->input_values()) {
+                auto input_node = in_node_output.get_node_shared_ptr();
+                auto is_quantize = as_type_ptr<FakeQuantize>(input_node);
+                if (is_quantize || is_fq_path(input_node)) {
+                    mark_fq_path(node);
+                    enable_fp16_compression(node);
+                    is_changed = true;
+                }
+            }
+
+            return is_changed;
+        };
+        auto m = make_shared<pattern::Matcher>(quantization_propagating_nodes, matcher_name);
+        register_matcher(m, callback);
+    }
+};
+
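To see the intended end-to-end effect of the matcher above, here is a hedged usage sketch (not part of the patch; the header paths and the fp16_compression_is_disabled query are assumptions based on the helpers this file already uses). After the fix, a reduction sitting behind a FakeQuantize stays eligible for compression instead of being pinned to f32:

    #include "openvino/core/except.hpp"
    #include "openvino/core/model.hpp"
    #include "openvino/opsets/opset10.hpp"
    #include "openvino/pass/manager.hpp"
    #include "transformations/common_optimizations/mark_subgraphs_to_keep_in_mixed_precision.hpp"
    #include "transformations/rt_info/disable_fp16_compression.hpp"

    using namespace ov::opset10;

    int main() {
        // Exp -> FakeQuantize -> ReduceSum: the "sensitive" pair is FQ-guarded
        auto input = std::make_shared<Parameter>(ov::element::f32, ov::Shape{1, 3, 224, 224});
        auto exp = std::make_shared<Exp>(input);
        auto lo = Constant::create(ov::element::f32, ov::Shape{}, {0.f});
        auto hi = Constant::create(ov::element::f32, ov::Shape{}, {5.f});
        auto fq = std::make_shared<FakeQuantize>(exp, lo, hi, lo, hi, 256);
        auto axes = Constant::create(ov::element::i64, ov::Shape{1}, {-1});
        auto sum = std::make_shared<ReduceSum>(fq, axes);
        auto model = std::make_shared<ov::Model>(ov::NodeVector{sum}, ov::ParameterVector{input});

        ov::pass::Manager manager;
        manager.register_pass<ov::pass::MarkSugraphsToKeepInMixedPrecision>();
        manager.run_passes(model);

        // Without the fix, the Exp->ReduceSum pair would be pinned to f32 here.
        OPENVINO_ASSERT(!ov::fp16_compression_is_disabled(sum));
    }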
 bool MarkSugraphsToKeepInMixedPrecision::run_on_model(const shared_ptr<ov::Model>& m) {
     RUN_ON_MODEL_SCOPE(MarkSugraphsToKeepInMixedPrecision);
@@ -314,6 +394,7 @@ bool MarkSugraphsToKeepInMixedPrecision::run_on_model(const shared_ptr<ov::Model>& m) {
 
     for (auto& node : m->get_ops()) {
         erase_reduceop_path(node);
+        erase_fq_path(node);
     }
 
     return false;  // no need to revalidate
diff --git a/src/common/transformations/tests/common_optimizations/mark_subgraph_to_keep_in_mixed_precision_test.cpp b/src/common/transformations/tests/common_optimizations/mark_subgraph_to_keep_in_mixed_precision_test.cpp
index 643045dbd04..7dea7e948d3 100644
--- a/src/common/transformations/tests/common_optimizations/mark_subgraph_to_keep_in_mixed_precision_test.cpp
+++ b/src/common/transformations/tests/common_optimizations/mark_subgraph_to_keep_in_mixed_precision_test.cpp
@@ -126,36 +126,21 @@ TEST(TransformationTests, MarkSugraphsToKeepInMixedPrecision_reducesum_without_exp) {
     shared_ptr<Model> model, model_ref;
     pass::Manager manager;
-    {
-        auto input_1 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
-        auto input_2 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
-        auto reduction_axes = Constant::create(element::i64, Shape{1}, {-1});
-        auto reduce_sum_1 = make_shared<ReduceSum>(input_1, reduction_axes);
+    auto input_1 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
+    auto input_2 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
+    auto reduction_axes = Constant::create(element::i64, Shape{1}, {-1});
+    auto reduce_sum_1 = make_shared<ReduceSum>(input_1, reduction_axes);
 
-        auto factor_const = Constant::create(element::f16, Shape{1}, {-1});
-        auto factor_const_decompressed = make_shared<Convert>(factor_const, element::f32);
-        auto mul_1 = make_shared<Multiply>(reduce_sum_1, factor_const_decompressed);
-        auto matmul_1 = make_shared<MatMul>(mul_1, input_2);
+    auto factor_const = Constant::create(element::f16, Shape{1}, {-1});
+    auto factor_const_decompressed = make_shared<Convert>(factor_const, element::f32);
+    auto mul_1 = make_shared<Multiply>(reduce_sum_1, factor_const_decompressed);
+    auto matmul_1 = make_shared<MatMul>(mul_1, input_2);
 
-        model = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+    model = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+    model_ref = model->clone();
 
-        manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
-        manager.run_passes(model);
-    }
-
-    {
-        auto input_1 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
-        auto input_2 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
-        auto reduction_axes = Constant::create(element::i64, Shape{1}, {-1});
-        auto reduce_sum_1 = make_shared<ReduceSum>(input_1, reduction_axes);
-
-        auto factor_const = Constant::create(element::f16, Shape{1}, {-1});
-        auto factor_const_decompressed = make_shared<Convert>(factor_const, element::f32);
-        auto mul_1 = make_shared<Multiply>(reduce_sum_1, factor_const_decompressed);
-        auto matmul_1 = make_shared<MatMul>(mul_1, input_2);
-
-        model_ref = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
-    }
+    manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
+    manager.run_passes(model);
 
     const FunctionsComparator func_comparator =
         FunctionsComparator::with_default().enable(FunctionsComparator::RUNTIME_KEYS);
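The refactor pattern used in this and the following test hunks deserves a note: the reference graph is now a clone taken before the pass runs, rather than a second hand-built copy. A condensed sketch of the idiom (names as in the tests above):

    model = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
    model_ref = model->clone();  // snapshot taken BEFORE the pass mutates `model`

    manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
    manager.run_passes(model);

    // compare in both directions so a marking present in only one graph is caught
    const auto fc = FunctionsComparator::with_default().enable(FunctionsComparator::RUNTIME_KEYS);
    ASSERT_TRUE(fc(model_ref, model).valid);
    ASSERT_TRUE(fc(model, model_ref).valid);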
@@ -519,32 +504,20 @@ TEST(TransformationTests, PowWithPositiveExponent) {
     pass::Manager manager;
     // graph should be left unchanged
     const float eps_value = 1.0e-12f;
-    {
-        auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
-        auto add = std::make_shared<Add>(input_2, eps_const);
-        auto pow_exp_const = Constant::create(element::f32, Shape{1}, {1.77});
-        auto pow = std::make_shared<Power>(add, pow_exp_const);
-        auto mul = std::make_shared<Multiply>(input_1, pow);
+    auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
+    auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
+    auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
+    auto add = std::make_shared<Add>(input_2, eps_const);
+    auto pow_exp_const = Constant::create(element::f32, Shape{1}, {1.77});
+    auto pow = std::make_shared<Power>(add, pow_exp_const);
+    auto mul = std::make_shared<Multiply>(input_1, pow);
 
-        model = std::make_shared<Model>(NodeVector{mul}, ParameterVector{input_1, input_2});
+    model = std::make_shared<Model>(NodeVector{mul}, ParameterVector{input_1, input_2});
+    model_ref = model->clone();
 
-        manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
-        manager.run_passes(model);
-    }
+    manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
+    manager.run_passes(model);
 
-    {
-        auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
-        auto add = std::make_shared<Add>(input_2, eps_const);
-        auto pow_exp_const = Constant::create(element::f32, Shape{1}, {1.77});
-        auto pow = std::make_shared<Power>(add, pow_exp_const);
-        auto mul = std::make_shared<Multiply>(input_1, pow);
-
-        model_ref = std::make_shared<Model>(NodeVector{mul}, ParameterVector{input_1, input_2});
-    }
     const FunctionsComparator func_comparator =
         FunctionsComparator::with_default().enable(FunctionsComparator::RUNTIME_KEYS);
     // need to compare twice to ensure that no extra nodes are marked
@@ -559,28 +532,18 @@ TEST(TransformationTests, DivisionByZeroMinimalPatternUnchanged) {
     pass::Manager manager;
     // if eps_value is greater than normalized_fp16_min then leave graph unchanged
    const float eps_value = 0.0001f;
-    {
-        auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
-        auto add = std::make_shared<Add>(input_2, eps_const);
-        auto divide = std::make_shared<Divide>(input_1, add);
+    auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
+    auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
+    auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
+    auto add = std::make_shared<Add>(input_2, eps_const);
+    auto divide = std::make_shared<Divide>(input_1, add);
 
-        model = std::make_shared<Model>(NodeVector{divide}, ParameterVector{input_1, input_2});
+    model = std::make_shared<Model>(NodeVector{divide}, ParameterVector{input_1, input_2});
+    model_ref = model->clone();
 
-        manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
-        manager.run_passes(model);
-    }
+    manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
+    manager.run_passes(model);
 
-    {
-        auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
-        auto add = std::make_shared<Add>(input_2, eps_const);
-        auto divide = std::make_shared<Divide>(input_1, add);
-
-        model_ref = std::make_shared<Model>(NodeVector{divide}, ParameterVector{input_1, input_2});
-    }
     const FunctionsComparator func_comparator =
         FunctionsComparator::with_default().enable(FunctionsComparator::RUNTIME_KEYS);
     // need to compare twice to ensure that no extra nodes are marked
@@ -798,36 +761,21 @@ TEST(TransformationTests, MarkReduceOpExpToKeepInMixedPrecision_reducesum_without_exp) {
     // ReduceSum without Exp is not a precision sensitive case
     shared_ptr<Model> model, model_ref;
     pass::Manager manager;
-    {
-        auto input_1 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
-        auto input_2 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
-        auto reduction_axes = Constant::create(element::i64, Shape{1}, {-1});
-        auto reduce_sum_1 = make_shared<ReduceSum>(input_1, reduction_axes);
+    auto input_1 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
+    auto input_2 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
+    auto reduction_axes = Constant::create(element::i64, Shape{1}, {-1});
+    auto reduce_sum_1 = make_shared<ReduceSum>(input_1, reduction_axes);
 
-        auto factor_const = Constant::create(element::f16, Shape{1}, {-1});
-        auto factor_const_decompressed = make_shared<Convert>(factor_const, element::f32);
-        auto mul_1 = make_shared<Multiply>(reduce_sum_1, factor_const_decompressed);
-        auto matmul_1 = make_shared<MatMul>(mul_1, input_2);
+    auto factor_const = Constant::create(element::f16, Shape{1}, {-1});
+    auto factor_const_decompressed = make_shared<Convert>(factor_const, element::f32);
+    auto mul_1 = make_shared<Multiply>(reduce_sum_1, factor_const_decompressed);
+    auto matmul_1 = make_shared<MatMul>(mul_1, input_2);
 
-        model = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+    model = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+    model_ref = model->clone();
 
-        manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
-        manager.run_passes(model);
-    }
-
-    {
-        auto input_1 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
-        auto input_2 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
-        auto reduction_axes = Constant::create(element::i64, Shape{1}, {-1});
-        auto reduce_sum_1 = make_shared<ReduceSum>(input_1, reduction_axes);
-
-        auto factor_const = Constant::create(element::f16, Shape{1}, {-1});
-        auto factor_const_decompressed = make_shared<Convert>(factor_const, element::f32);
-        auto mul_1 = make_shared<Multiply>(reduce_sum_1, factor_const_decompressed);
-        auto matmul_1 = make_shared<MatMul>(mul_1, input_2);
-
-        model_ref = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
-    }
+    manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
+    manager.run_passes(model);
 
     const FunctionsComparator func_comparator =
         FunctionsComparator::with_default().enable(FunctionsComparator::RUNTIME_KEYS);
@@ -986,31 +934,21 @@ TEST(TransformationTests, MarkDivWithEpsToKeepInMixedPrecision_PowWithPositiveExp) {
     const float eps_value = 1.e-12f;
     shared_ptr<Model> model, model_ref;
     pass::Manager manager;
-    {
-        auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
-        auto add = std::make_shared<Add>(input_2, eps_const);
-        auto pow_exp_const = Constant::create(element::f32, Shape{1}, {1.77});
-        auto pow = std::make_shared<Power>(add, pow_exp_const);
-        auto mul = std::make_shared<Multiply>(input_1, pow);
-        model = std::make_shared<Model>(NodeVector{mul}, ParameterVector{input_1, input_2});
-        manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
-        manager.run_passes(model);
-    }
+    auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
+    auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
+    auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
+    auto add = std::make_shared<Add>(input_2, eps_const);
+    auto pow_exp_const = Constant::create(element::f32, Shape{1}, {1.77});
+    auto pow = std::make_shared<Power>(add, pow_exp_const);
+    auto mul = std::make_shared<Multiply>(input_1, pow);
 
-    {
-        auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
-        auto add = std::make_shared<Add>(input_2, eps_const);
-        auto pow_exp_const = Constant::create(element::f32, Shape{1}, {1.77});
-        auto pow = std::make_shared<Power>(add, pow_exp_const);
-        auto mul = std::make_shared<Multiply>(input_1, pow);
+    model = std::make_shared<Model>(NodeVector{mul}, ParameterVector{input_1, input_2});
+    model_ref = model->clone();
+
+    manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
+    manager.run_passes(model);
 
-        model_ref = std::make_shared<Model>(NodeVector{mul}, ParameterVector{input_1, input_2});
-    }
     const auto fc = FunctionsComparator::with_default()
                         .enable(FunctionsComparator::PRECISIONS)
                         .enable(FunctionsComparator::RUNTIME_KEYS)
                         .enable(FunctionsComparator::CONST_VALUES);
@@ -1027,27 +965,19 @@ TEST(TransformationTests, MarkDivWithEpsToKeepInMixedPrecision_MinimalPatternUnchanged) {
     const float eps_value = 0.0001f;
     shared_ptr<Model> model, model_ref;
     pass::Manager manager;
-    {
-        auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
-        auto add = std::make_shared<Add>(input_2, eps_const);
-        auto divide = std::make_shared<Divide>(input_1, add);
-        model = std::make_shared<Model>(NodeVector{divide}, ParameterVector{input_1, input_2});
-        manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
-        manager.run_passes(model);
-    }
+    auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
+    auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
+    auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
+    auto add = std::make_shared<Add>(input_2, eps_const);
+    auto divide = std::make_shared<Divide>(input_1, add);
 
-    {
-        auto input_1 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto input_2 = std::make_shared<Parameter>(element::f32, PartialShape::dynamic(3));
-        auto eps_const = Constant::create(element::f32, Shape{1}, {eps_value});
-        auto add = std::make_shared<Add>(input_2, eps_const);
-        auto divide = std::make_shared<Divide>(input_1, add);
+    model = std::make_shared<Model>(NodeVector{divide}, ParameterVector{input_1, input_2});
+    model_ref = model->clone();
+
+    manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
+    manager.run_passes(model);
 
-        model_ref = std::make_shared<Model>(NodeVector{divide}, ParameterVector{input_1, input_2});
-    }
     const auto fc = FunctionsComparator::with_default()
                         .enable(FunctionsComparator::PRECISIONS)
                         .enable(FunctionsComparator::RUNTIME_KEYS)
                         .enable(FunctionsComparator::CONST_VALUES);
     // need to compare twice to ensure that no extra nodes are marked
@@ -1162,3 +1092,82 @@ TEST(TransformationTests, MarkDivWithEpsToKeepInMixedPrecision_InL2NormWithSqrtA
     result = fc(model, model_ref);
     ASSERT_TRUE(result.valid) << result.message;
 }
+
+TEST(TransformationTests, MarkDivWithEpsToKeepInMixedPrecision_disable_for_quantized_nodes_1) {
+    shared_ptr<Model> model, model_ref;
+    pass::Manager manager;
+    // although the sensitive Exp->ReduceSum pattern is present, because of the FQs it will
+    // be inferred in int8, so there is no need to mark it: model and model_ref should match
+    auto input_1 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
+    auto input_2 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
+    auto exp_1 = make_shared<Exp>(input_1);
+
+    auto in_low = op::v0::Constant::create(element::f32, Shape{}, {0.f});
+    auto in_high = op::v0::Constant::create(element::f32, Shape{}, {5.f});
+    auto out_low = op::v0::Constant::create(element::f32, Shape{}, {2.f});
+    auto out_high = op::v0::Constant::create(element::f32, Shape{}, {4.f});
+    auto fq_1 = make_shared<FakeQuantize>(exp_1, in_low, in_high, out_low, out_high, 256);
+
+    auto reduction_axes = opset10::Constant::create(element::i64, Shape{1}, {-1});
+    auto reduce_sum_1 = make_shared<ReduceSum>(fq_1, reduction_axes);
+
+    auto fq_2 = make_shared<FakeQuantize>(reduce_sum_1, in_low, in_high, out_low, out_high, 256);
+    auto matmul_1 = make_shared<MatMul>(fq_2, input_2);
+
+    model = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+    model_ref = model->clone();
+
+    manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
+    manager.run_passes(model);
+
+    const auto fc = FunctionsComparator::with_default()
+                        .enable(FunctionsComparator::PRECISIONS)
+                        .enable(FunctionsComparator::RUNTIME_KEYS)
+                        .enable(FunctionsComparator::CONST_VALUES);
+    // need to compare twice to ensure that no extra nodes are marked
+    FunctionsComparator::Result result = fc(model_ref, model);
+    ASSERT_TRUE(result.valid) << result.message;
+    result = fc(model, model_ref);
+    ASSERT_TRUE(result.valid) << result.message;
+}
+
+TEST(TransformationTests, MarkDivWithEpsToKeepInMixedPrecision_disable_for_quantized_nodes_2) {
+    shared_ptr<Model> model, model_ref;
+    pass::Manager manager;
+    // although the sensitive Exp->ReduceSum pattern is present, because of the FQs it will
+    // be inferred in int8, so there is no need to mark it: model and model_ref should match
+    auto input_1 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
+    auto input_2 = make_shared<Parameter>(element::f32, Shape{1, 3, 224, 224});
+    auto exp_1 = make_shared<Exp>(input_1);
+
+    auto in_low = op::v0::Constant::create(element::f32, Shape{}, {0.f});
+    auto in_high = op::v0::Constant::create(element::f32, Shape{}, {5.f});
+    auto out_low = op::v0::Constant::create(element::f32, Shape{}, {2.f});
+    auto out_high = op::v0::Constant::create(element::f32, Shape{}, {4.f});
+    auto fq_1 = make_shared<FakeQuantize>(exp_1, in_low, in_high, out_low, out_high, 256);
+
+    auto unsqueeze_axes = opset10::Constant::create(element::i64, Shape{1}, {1});
+    auto unsqueeze_1 = make_shared<Unsqueeze>(fq_1, unsqueeze_axes);
+
+    auto reduction_axes = opset10::Constant::create(element::i64, Shape{1}, {-1});
+    auto reduce_sum_1 = make_shared<ReduceSum>(unsqueeze_1, reduction_axes);
+
+    auto fq_2 = make_shared<FakeQuantize>(reduce_sum_1, in_low, in_high, out_low, out_high, 256);
+    auto matmul_1 = make_shared<MatMul>(fq_2, input_2);
+
+    model = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+    model_ref = model->clone();
+
+    manager.register_pass<pass::MarkSugraphsToKeepInMixedPrecision>();
+    manager.run_passes(model);
+
+    const auto fc = FunctionsComparator::with_default()
+                        .enable(FunctionsComparator::PRECISIONS)
+                        .enable(FunctionsComparator::RUNTIME_KEYS)
+                        .enable(FunctionsComparator::CONST_VALUES);
+    // need to compare twice to ensure that no extra nodes are marked
+    FunctionsComparator::Result result = fc(model_ref, model);
+    ASSERT_TRUE(result.valid) << result.message;
+    result = fc(model, model_ref);
+    ASSERT_TRUE(result.valid) << result.message;
+}
diff --git a/src/common/transformations/tests/utils/convert_precision.cpp b/src/common/transformations/tests/utils/convert_precision.cpp
index 73d33361121..5226c41ae17 100644
--- a/src/common/transformations/tests/utils/convert_precision.cpp
+++ b/src/common/transformations/tests/utils/convert_precision.cpp
@@ -1716,3 +1716,119 @@ TEST(TransformationTests, ConvertPrecision_exp_through_unsqueeze) {
     FunctionsComparator::Result result = func_comparator(model_ref, model);
     ASSERT_TRUE(result.valid) << result.message;
 }
+
+TEST(TransformationTests, ConvertPrecision_disable_for_quantized_nodes_1) {
+    shared_ptr<Model> model, model_ref;
+    pass::Manager manager;
+    {
+        auto input_1 = make_shared<opset10::Parameter>(element::f32, Shape{1, 3, 224, 224});
+        auto input_2 = make_shared<opset10::Parameter>(element::f32, Shape{1, 3, 224, 224});
+        auto exp_1 = make_shared<opset10::Exp>(input_1);
+
+        auto in_low = op::v0::Constant::create(element::f32, Shape{}, {0.f});
+        auto in_high = op::v0::Constant::create(element::f32, Shape{}, {5.f});
+        auto out_low = op::v0::Constant::create(element::f32, Shape{}, {2.f});
+        auto out_high = op::v0::Constant::create(element::f32, Shape{}, {4.f});
+        auto fq_1 = make_shared<opset10::FakeQuantize>(exp_1, in_low, in_high, out_low, out_high, 256);
+
+        auto reduction_axes = opset10::Constant::create(element::i64, Shape{1}, {-1});
+        auto reduce_sum_1 = make_shared<opset10::ReduceSum>(fq_1, reduction_axes);
+
+        auto fq_2 = make_shared<opset10::FakeQuantize>(reduce_sum_1, in_low, in_high, out_low, out_high, 256);
+        auto matmul_1 = make_shared<opset10::MatMul>(fq_2, input_2);
+
+        model = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+
+        type_to_fuse_map empty_type_to_fuse_map = {};
+        bool keep_precision_sensitive_in_fp32 = true;
+        manager.register_pass<pass::ConvertPrecision>(precisions_map{{element::f32, element::f16}},
+                                                      empty_type_to_fuse_map,
+                                                      keep_precision_sensitive_in_fp32);
+        manager.run_passes(model);
+    }
+
+    {
+        auto input_1 = make_shared<opset10::Parameter>(element::f16, Shape{1, 3, 224, 224});
+        auto input_2 = make_shared<opset10::Parameter>(element::f16, Shape{1, 3, 224, 224});
+        auto exp_1 = make_shared<opset10::Exp>(input_1);
+
+        auto in_low = op::v0::Constant::create(element::f16, Shape{}, {0.f});
+        auto in_high = op::v0::Constant::create(element::f16, Shape{}, {5.f});
+        auto out_low = op::v0::Constant::create(element::f16, Shape{}, {2.f});
+        auto out_high = op::v0::Constant::create(element::f16, Shape{}, {4.f});
+        auto fq_1 = make_shared<opset10::FakeQuantize>(exp_1, in_low, in_high, out_low, out_high, 256);
+
+        auto reduction_axes = opset10::Constant::create(element::i64, Shape{1}, {-1});
+        auto reduce_sum_1 = make_shared<opset10::ReduceSum>(fq_1, reduction_axes);
+
+        auto fq_2 = make_shared<opset10::FakeQuantize>(reduce_sum_1, in_low, in_high, out_low, out_high, 256);
+        auto matmul_1 = make_shared<opset10::MatMul>(fq_2, input_2);
+
+        model_ref = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+    }
+
+    const FunctionsComparator func_comparator = FunctionsComparator::with_default();
+    FunctionsComparator::Result result = func_comparator(model_ref, model);
+    ASSERT_TRUE(result.valid) << result.message;
+}
+
+TEST(TransformationTests, ConvertPrecision_disable_for_quantized_nodes_2) {
+    shared_ptr<Model> model, model_ref;
+    pass::Manager manager;
+    {
+        auto input_1 = make_shared<opset10::Parameter>(element::f32, Shape{1, 3, 224, 224});
+        auto input_2 = make_shared<opset10::Parameter>(element::f32, Shape{1, 3, 224, 224});
+        auto exp_1 = make_shared<opset10::Exp>(input_1);
+
+        auto in_low = op::v0::Constant::create(element::f32, Shape{}, {0.f});
+        auto in_high = op::v0::Constant::create(element::f32, Shape{}, {5.f});
+        auto out_low = op::v0::Constant::create(element::f32, Shape{}, {2.f});
+        auto out_high = op::v0::Constant::create(element::f32, Shape{}, {4.f});
+        auto fq_1 = make_shared<opset10::FakeQuantize>(exp_1, in_low, in_high, out_low, out_high, 256);
+
+        auto unsqueeze_axes = opset10::Constant::create(element::i64, Shape{1}, {1});
+        auto unsqueeze_1 = make_shared<opset10::Unsqueeze>(fq_1, unsqueeze_axes);
+
+        auto reduction_axes = opset10::Constant::create(element::i64, Shape{1}, {-1});
+        auto reduce_sum_1 = make_shared<opset10::ReduceSum>(unsqueeze_1, reduction_axes);
+
+        auto fq_2 = make_shared<opset10::FakeQuantize>(reduce_sum_1, in_low, in_high, out_low, out_high, 256);
+        auto matmul_1 = make_shared<opset10::MatMul>(fq_2, input_2);
+
+        model = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+
+        type_to_fuse_map empty_type_to_fuse_map = {};
+        bool keep_precision_sensitive_in_fp32 = true;
+        manager.register_pass<pass::ConvertPrecision>(precisions_map{{element::f32, element::f16}},
+                                                      empty_type_to_fuse_map,
+                                                      keep_precision_sensitive_in_fp32);
+        manager.run_passes(model);
+    }
+
+    {
+        auto input_1 = make_shared<opset10::Parameter>(element::f16, Shape{1, 3, 224, 224});
+        auto input_2 = make_shared<opset10::Parameter>(element::f16, Shape{1, 3, 224, 224});
+        auto exp_1 = make_shared<opset10::Exp>(input_1);
+
+        auto in_low = op::v0::Constant::create(element::f16, Shape{}, {0.f});
+        auto in_high = op::v0::Constant::create(element::f16, Shape{}, {5.f});
+        auto out_low = op::v0::Constant::create(element::f16, Shape{}, {2.f});
+        auto out_high = op::v0::Constant::create(element::f16, Shape{}, {4.f});
+        auto fq_1 = make_shared<opset10::FakeQuantize>(exp_1, in_low, in_high, out_low, out_high, 256);
+
+        auto unsqueeze_axes = opset10::Constant::create(element::i64, Shape{1}, {1});
+        auto unsqueeze_1 = make_shared<opset10::Unsqueeze>(fq_1, unsqueeze_axes);
+
+        auto reduction_axes = opset10::Constant::create(element::i64, Shape{1}, {-1});
+        auto reduce_sum_1 = make_shared<opset10::ReduceSum>(unsqueeze_1, reduction_axes);
+
+        auto fq_2 = make_shared<opset10::FakeQuantize>(reduce_sum_1, in_low, in_high, out_low, out_high, 256);
+        auto matmul_1 = make_shared<opset10::MatMul>(fq_2, input_2);
+
+        model_ref = make_shared<Model>(NodeVector{matmul_1}, ParameterVector{input_1, input_2});
+    }
+
+    const FunctionsComparator func_comparator = FunctionsComparator::with_default();
+    FunctionsComparator::Result result = func_comparator(model_ref, model);
+    ASSERT_TRUE(result.valid) << result.message;
+}
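Taken together, the new ConvertPrecision tests pin down the user-visible contract. A hedged usage sketch (it simply mirrors the test setup above; not an additional API): with keep_precision_sensitive_in_fp32 enabled, an f32-to-f16 conversion still converts FQ-guarded subgraphs, because PropagateDownDisableSensitivityForQuantized re-enables compression behind each FakeQuantize:

    ov::pass::Manager manager;
    type_to_fuse_map empty_type_to_fuse_map = {};
    const bool keep_precision_sensitive_in_fp32 = true;  // pins only genuinely sensitive f32 subgraphs
    manager.register_pass<ov::pass::ConvertPrecision>(precisions_map{{ov::element::f32, ov::element::f16}},
                                                      empty_type_to_fuse_map,
                                                      keep_precision_sensitive_in_fp32);
    manager.run_passes(model);  // `model` built as in the tests above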