[CPU] FakeQuantize decomposition (#3741)

This commit is contained in:
Maxim Andronov 2021-02-02 09:37:02 +03:00 committed by GitHub
parent cca0d568e0
commit 537179b235
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 762 additions and 7 deletions

View File

@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@ -57,6 +57,8 @@
#include <transformations/convert_precision.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/rt_info/fused_names_attribute.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>
#include <transformations/utils/utils.hpp>
#include <ngraph/opsets/opset2.hpp>
#include <ngraph/opsets/opset3.hpp>
@ -71,6 +73,8 @@
# include <low_precision/group_convolution.hpp>
# include <low_precision/multiply_to_group_convolution.hpp>
#include "nodes/mkldnn_quantize_node.h"
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
#if defined(_WIN32) || defined(WIN32)
#include <intrin.h>
@ -227,13 +231,22 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
transformer.transform(nGraphFunc);
}
bool has_fake_quantize = ::ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc);
ngraph::pass::Manager legacyManager;
legacyManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
legacyManager.register_pass<ngraph::pass::ConvertOpSet1ToLegacy>();
legacyManager.register_pass<ngraph::pass::ConvertPrecision>(ngraph::element::i64, ngraph::element::i32);
// not legacy actually, but it should be the last transformation in the transformation pipeline
legacyManager.register_pass<ngraph::pass::UnrollTensorIterator>();
auto legacyPassConfig = legacyManager.get_pass_config();
legacyPassConfig->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr &node) -> bool {
return !MKLDNNQuantizeNode::isNeedToDecompose(node);
});
legacyPassConfig->set_callback<ngraph::pass::AddMultiplyFusion>([](const_node_ptr &node) -> bool {
if (auto mul_op = std::dynamic_pointer_cast<const ngraph::opset1::Multiply>(node)) {
auto add_op = std::dynamic_pointer_cast<const ngraph::opset1::Add>(mul_op->get_input_node_shared_ptr(0));
@ -248,15 +261,16 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
return false;
});
legacyManager.get_pass_config()->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
legacyPassConfig->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
// UnrollTI transformation is disabled by default, is turned on by LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0;
});
legacyManager.run_passes(nGraphFunc);
OV_ITT_TASK_CHAIN(taskChain, MKLDNNPlugin::itt::domains::MKLDNN_LT, "Transformation", "convertFunctionToICNNNetwork");
clonedNetwork = CNNNetwork(InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, clonedNetwork));
clonedNetwork = CNNNetwork(InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, clonedNetwork, has_fake_quantize));
OV_ITT_TASK_NEXT(taskChain, "ConvertIOPrecision");

View File

@ -18,6 +18,8 @@
#include <cpu/x64/jit_generator.hpp>
#include "ie_parallel.hpp"
#include <ngraph/opsets/opset1.hpp>
// Quantization ranges validation is switched off by default in order to avoid regressions on user side
// #define VALIDATE_QUANTIZATION_RANGES
@ -1029,7 +1031,7 @@ void MKLDNNQuantizeNode::init() {
float ih = inputHighData[isInputHighBroadcasted ? 0 : i];
#if defined(VALIDATE_QUANTIZATION_RANGES)
if ((il == ih && levels != 2) || std::isnan(il) || std::isnan(ih) || std::isinf(il) || std::isinf(ih)) {
if ((il == ih && levels != 2) || il > ih || std::isnan(il) || std::isnan(ih) || std::isinf(il) || std::isinf(ih)) {
THROW_IE_EXCEPTION << "Quantize layer with name '" << getName() << "' has invalid input quantize ranges: "
<< "inputLow = " << il << ", inputHigh = " << ih;
}
@ -1578,6 +1580,33 @@ void MKLDNNQuantizeNode::appendPostOps(mkldnn::post_ops& ops) {
isPostOpDataInitialized = true;
}
bool MKLDNNQuantizeNode::isNeedToDecompose(const std::shared_ptr<const ngraph::Node>& node) {
if (const auto fq = std::dynamic_pointer_cast<const ngraph::opset1::FakeQuantize>(node)) {
for (size_t i = 0; i < fq->get_input_size(); i++) {
if (fq->get_input_shape(i).size() > 5)
return true;
}
for (size_t i = 1; i < fq->get_input_size(); i++) {
size_t count_not_unit_axis = 0;
auto shape = fq->get_input_shape(i);
if (ngraph::shape_size(shape) != 1) {
size_t not_unit_axis = 0;
for (size_t i = 0; i < shape.size(); i++) {
if (shape[i] > 1) {
not_unit_axis = i;
count_not_unit_axis++;
}
}
if (count_not_unit_axis > 1 || not_unit_axis > 1)
return true;
}
}
}
return false;
}
// Reports whether this node instance was resolved to the Quantize node type.
bool MKLDNNQuantizeNode::created() const {
    return getType() == Quantize;
}

View File

@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@ -113,6 +113,8 @@ public:
void appendPostOps(mkldnn::post_ops& ops) override;
static bool isNeedToDecompose(const std::shared_ptr<const ngraph::Node>& node);
private:
void init() override;
std::vector<mkldnn::memory::format_tag> getDataFormats() const;

View File

@ -0,0 +1,47 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <transformations_visibility.hpp>
#include <ngraph/pass/graph_rewrite.hpp>
namespace ngraph {
namespace pass {
class TRANSFORMATIONS_API FakeQuantizeDecomposition;
} // namespace pass
} // namespace ngraph
/**
* @ingroup ie_transformation_common_api
* @brief FakeQuantizeDecomposition transformation decomposes FakeQuantize layer.
*
* Expression from specification:
* if x <= min(input_low, input_high):
* output = output_low
* elif x > max(input_low, input_high):
* output = output_high
* else:
* output = round((x - input_low) / (input_high - input_low) * (levels-1)) / (levels-1) * (output_high - output_low) + output_low
*
* expand brackets into round:
* round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
* div on (levels-1) and mult on (output_high - output_low) => mult on (output_high - output_low) / (levels-1)
*
* =>
* round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1) + output_low
*
* This transformation doesn't support following cases:
* 1. At least one 'range' input is not Constant
 * 2. At least one 'input_low' input value is greater than or equal to the corresponding 'input_high' input value
*
*/
class ngraph::pass::FakeQuantizeDecomposition: public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    // Registers the FakeQuantize matcher and the decomposition callback.
    FakeQuantizeDecomposition();
};

View File

@ -0,0 +1,124 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "itt.hpp"
#include "transformations/op_conversions/fq_decomposition.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset5.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/builder/autobroadcast.hpp>
#include <numeric>
NGRAPH_RTTI_DEFINITION(ngraph::pass::FakeQuantizeDecomposition, "FakeQuantizeDecomposition", 0);
// Validates the 'input_low' / 'input_high' range inputs of a FakeQuantize node.
//
// Constant-folds element-wise `input_low >= input_high`; the ranges are valid
// only when no element of input_low is greater than or equal to the
// corresponding element of input_high.
//
// @return true when the ranges are provably valid; false when folding fails,
//         does not produce a Constant, or any comparison yields true.
bool isValidRangesInputs(const std::shared_ptr<ngraph::opset1::FakeQuantize> &fq) {
    auto il = fq->input_value(1);
    auto ih = fq->input_value(2);
    auto greater_equal = std::make_shared<ngraph::opset1::GreaterEqual>(il, ih);

    ngraph::OutputVector result(1);
    if (!greater_equal->constant_fold(result, greater_equal->input_values()))
        return false;

    auto res_node = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(result[0].get_node_shared_ptr());
    // Guard against a non-Constant folding result: the original code
    // dereferenced res_node unconditionally, which would crash here.
    if (res_node == nullptr)
        return false;

    const std::vector<bool> comp_result = res_node->cast_vector<bool>();
    return !std::any_of(comp_result.begin(), comp_result.end(), [](const bool value) { return value; });
}
// Matcher pass: replaces FakeQuantize (with Constant range inputs) by the
// equivalent sequence of elementwise operations described in the header:
//   round((clamp(x) - input_low) * isc) * osc + output_low
// where isc = (levels-1)/(input_high-input_low) and
//       osc = (output_high-output_low)/(levels-1).
ngraph::pass::FakeQuantizeDecomposition::FakeQuantizeDecomposition() {
    MATCHER_SCOPE(FakeQuantizeDecomposition);
    // All four 'range' inputs must be Constants; non-constant ranges are an
    // unsupported case (see the header documentation).
    auto data = ngraph::pattern::any_input();
    auto il = ngraph::pattern::wrap_type<opset1::Constant>();
    auto ih = ngraph::pattern::wrap_type<opset1::Constant>();
    auto ol = ngraph::pattern::wrap_type<opset1::Constant>();
    auto oh = ngraph::pattern::wrap_type<opset1::Constant>();
    auto fake_quantize = ngraph::pattern::wrap_type<ngraph::opset1::FakeQuantize>({data, il, ih, ol, oh});

    ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
        auto &pattern_to_output = m.get_pattern_value_map();
        const auto fake_quantize_node = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(pattern_to_output.at(fake_quantize).get_node_shared_ptr());

        // Bail out when the plugin callback vetoes decomposition or when any
        // input_low element is >= the matching input_high element.
        if (fake_quantize_node == nullptr || transformation_callback(fake_quantize_node) || !isValidRangesInputs(fake_quantize_node)) {
            return false;
        }

        Output<Node> data{fake_quantize_node->input_value(0)};
        const Output<Node> input_low{fake_quantize_node->input_value(1)};
        const Output<Node> input_high{fake_quantize_node->input_value(2)};
        const Output<Node> output_low{fake_quantize_node->input_value(3)};
        const Output<Node> output_high{fake_quantize_node->input_value(4)};
        auto input_type = data.get_element_type();

        // Every node created below is collected so runtime info can be copied
        // from the original FakeQuantize onto the whole decomposition.
        ngraph::NodeVector decomp_ops;
        // Align the data element type with the ranges' type before arithmetic.
        if (input_type != input_low.get_element_type()) {
            input_type = input_low.get_element_type();
            data = std::make_shared<ngraph::opset1::Convert>(data, input_type);
            decomp_ops.push_back(data.get_node_shared_ptr());
        }

        // if we set input_low or input_high in formula we got output = output_low and output = output_high respectively
        // so we just clamp x
        const auto max = std::make_shared<ngraph::opset1::Maximum>(data, input_low);
        const auto min = std::make_shared<ngraph::opset1::Minimum>(max, input_high);
        decomp_ops.push_back(max);
        decomp_ops.push_back(min);

        // (levels-1)
        const auto levels_minus_one = std::make_shared<ngraph::opset1::Constant>(input_type, Shape{}, fake_quantize_node->get_levels() - 1);
        decomp_ops.push_back(levels_minus_one);
        // (input_high - input_low)
        const auto subInHighLow = std::make_shared<ngraph::opset1::Subtract>(input_high, input_low);
        // (levels-1) / (input_high - input_low)
        const auto isc = std::make_shared<ngraph::opset1::Divide>(levels_minus_one, subInHighLow);
        // input_low * (levels-1) / (input_high - input_low)
        const auto ish = std::make_shared<ngraph::opset1::Multiply>(input_low, isc);
        decomp_ops.push_back(subInHighLow);
        decomp_ops.push_back(isc);
        decomp_ops.push_back(ish);

        // x * (levels-1) / (input_high - input_low)
        const auto after_isc_apply = std::make_shared<ngraph::opset1::Multiply>(min, isc);
        // x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)
        const auto after_ish_apply = std::make_shared<ngraph::opset1::Subtract>(after_isc_apply, ish);
        decomp_ops.push_back(after_isc_apply);
        decomp_ops.push_back(after_ish_apply);

        // round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
        const auto round = std::make_shared<ngraph::opset5::Round>(after_ish_apply, ngraph::opset5::Round::RoundMode::HALF_TO_EVEN);
        decomp_ops.push_back(round);

        // (output_high - output_low)
        const auto sub_out_high_low = std::make_shared<ngraph::opset1::Subtract>(output_high, output_low);
        // (output_high - output_low) / (levels-1)
        const auto osc = std::make_shared<ngraph::opset1::Divide>(sub_out_high_low, levels_minus_one);
        decomp_ops.push_back(sub_out_high_low);
        decomp_ops.push_back(osc);

        // round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1)
        const auto after_osc_apply = std::make_shared<ngraph::opset1::Multiply>(round, osc);
        // round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) * (output_high - output_low) / (levels-1) +
        // output_low
        std::shared_ptr<Node> result = std::make_shared<ngraph::opset1::Add>(after_osc_apply, output_low);
        decomp_ops.push_back(after_osc_apply);
        decomp_ops.push_back(result);

        // Convert back to the original output type when the arithmetic was
        // performed in the ranges' precision.
        if (result->get_output_element_type(0) != fake_quantize_node->get_output_element_type(0)) {
            result = std::make_shared<ngraph::opset1::Convert>(result, fake_quantize_node->get_output_element_type(0));
            decomp_ops.push_back(result);
        }

        result->set_friendly_name(m.get_match_root()->get_friendly_name());
        ngraph::copy_runtime_info(fake_quantize_node, decomp_ops);
        ngraph::replace_node(m.get_match_root(), result);
        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(fake_quantize, matcher_name);
    register_matcher(m, callback);
}

View File

@ -0,0 +1,249 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <ngraph/function.hpp>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset5.hpp>
#include <ngraph/pass/manager.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/utils/utils.hpp>
#include "common_test_utils/ngraph_test_utils.hpp"
#include "common_test_utils/common_utils.hpp"
// Core FakeQuantize configuration: precisions, shapes of all five inputs, and
// the quantization levels count.
using FakeQuantizeDecompositionBasicParams = std::tuple<ngraph::element::Type_t,  // 'data' input precision
                                                        ngraph::Shape,            // data shape
                                                        ngraph::element::Type_t,  // 'range' inputs precision
                                                        ngraph::Shape,            // il shape
                                                        ngraph::Shape,            // ih shape
                                                        ngraph::Shape,            // ol shape
                                                        ngraph::Shape,            // oh shape
                                                        size_t                    // levels
                                                        >;

// Full test parameter set: basic params + range values + expected outcome.
using FakeQuantizeDecompositionParamsSet = std::tuple<FakeQuantizeDecompositionBasicParams,
                                                      std::pair<float, float>,  // il and ih values
                                                      bool                      // whether the FQ should be decomposed
                                                      >;
// Transformation unit test: runs FakeQuantizeDecomposition on a single-FQ
// function and compares the result against a hand-built reference function
// (either the decomposed subgraph or the untouched FQ, depending on the
// 'should be decomposed' flag).
class FakeQuantizeDecompositionTest : public CommonTestUtils::TestsCommon, public ::testing::WithParamInterface<FakeQuantizeDecompositionParamsSet> {
public:
    // Builds a readable test name from every tuple parameter.
    static std::string getTestCaseName(::testing::TestParamInfo<FakeQuantizeDecompositionParamsSet> obj) {
        FakeQuantizeDecompositionBasicParams basic_params;
        std::pair<float, float> input_ranges_values;
        bool should_be_decompos;
        std::tie(basic_params, input_ranges_values, should_be_decompos) = obj.param;

        ngraph::Shape data_shape, il_shape, ih_shape, ol_shape, oh_shape;
        ngraph::element::Type_t data_prec, ranges_prec;
        size_t levels;
        std::tie(data_prec, data_shape, ranges_prec, il_shape, ih_shape, ol_shape, oh_shape, levels) = basic_params;

        std::ostringstream result;
        result << "DATA=" << CommonTestUtils::vec2str(data_shape) << "_";
        result << "DATA_PRC=" << ngraph::element::Type(data_prec) << "_";
        result << "IL=" << CommonTestUtils::vec2str(il_shape) << "_" << input_ranges_values.first << "_";
        result << "IH=" << CommonTestUtils::vec2str(ih_shape) << "_" << input_ranges_values.second << "_";
        result << "OL=" << CommonTestUtils::vec2str(ol_shape) << "_";
        result << "OH=" << CommonTestUtils::vec2str(oh_shape) << "_";
        result << "RANGES_PRC=" << ngraph::element::Type(ranges_prec) << "_";
        result << "LEVELS=" << levels;
        return result.str();
    }

protected:
    // Whole verification happens here: build FQ function, run the pass,
    // then compare against the manually constructed expected function.
    // NOTE(review): SetUp() presumably overrides a virtual from the test
    // base class — consider marking it 'override'.
    void SetUp() {
        FakeQuantizeDecompositionBasicParams basic_params;
        std::pair<float, float> input_ranges_values;
        bool should_be_decompos;
        std::tie(basic_params, input_ranges_values, should_be_decompos) = this->GetParam();

        ngraph::Shape data_shape, il_shape, ih_shape, ol_shape, oh_shape;
        ngraph::element::Type_t data_prec, ranges_prec;
        size_t levels;
        std::tie(data_prec, data_shape, ranges_prec, il_shape, ih_shape, ol_shape, oh_shape, levels) = basic_params;

        // A Convert is expected in the decomposition only when the data
        // precision differs from the ranges' precision.
        bool need_convert = data_prec != ranges_prec;

        std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
        {
            // Function under test: Parameter -> FakeQuantize.
            const auto data = std::make_shared<ngraph::opset1::Parameter>(data_prec, ngraph::PartialShape(data_shape));
            const auto il = std::make_shared<ngraph::opset1::Constant>(ranges_prec, il_shape, input_ranges_values.first);
            const auto ih = std::make_shared<ngraph::opset1::Constant>(ranges_prec, ih_shape, input_ranges_values.second);
            const auto ol = std::make_shared<ngraph::opset1::Constant>(ranges_prec, ol_shape);
            const auto oh = std::make_shared<ngraph::opset1::Constant>(ranges_prec, oh_shape);

            const auto fq = std::make_shared<ngraph::opset1::FakeQuantize>(data, il, ih, ol, oh, levels);
            f = std::make_shared<ngraph::Function>(ngraph::NodeVector{fq}, ngraph::ParameterVector{data});

            ngraph::pass::Manager manager;
            manager.register_pass<ngraph::pass::InitNodeInfo>();
            manager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
            manager.run_passes(f);

            ASSERT_NO_THROW(check_rt_info(f));
        }

        {
            // Reference function: the expected decomposed subgraph, mirroring
            // the node sequence produced by the transformation.
            auto input_data = std::make_shared<ngraph::opset1::Parameter>(data_prec, ngraph::PartialShape(data_shape));
            ngraph::ParameterVector params;
            params.push_back(input_data);
            std::shared_ptr<ngraph::Node> data = input_data;
            const auto il = std::make_shared<ngraph::opset1::Constant>(ranges_prec, il_shape, input_ranges_values.first);
            const auto ih = std::make_shared<ngraph::opset1::Constant>(ranges_prec, ih_shape, input_ranges_values.second);
            const auto ol = std::make_shared<ngraph::opset1::Constant>(ranges_prec, ol_shape);
            const auto oh = std::make_shared<ngraph::opset1::Constant>(ranges_prec, oh_shape);

            if (should_be_decompos) {
                if (need_convert) {
                    data = std::make_shared<ngraph::opset1::Convert>(data, ranges_prec);
                }

                const auto max = std::make_shared<ngraph::opset1::Maximum>(data, il);
                const auto min = std::make_shared<ngraph::opset1::Minimum>(max, ih);

                const auto levels_minus_one = std::make_shared<ngraph::opset1::Constant>(ranges_prec, ngraph::Shape{}, levels - 1);
                const auto sub_in_high_low = std::make_shared<ngraph::opset1::Subtract>(ih, il);
                const auto isc = std::make_shared<ngraph::opset1::Divide>(levels_minus_one, sub_in_high_low);
                const auto ish = std::make_shared<ngraph::opset1::Multiply>(il, isc);

                const auto after_isc_apply = std::make_shared<ngraph::opset1::Multiply>(min, isc);
                const auto after_ish_apply = std::make_shared<ngraph::opset1::Subtract>(after_isc_apply, ish);

                const auto round = std::make_shared<ngraph::opset5::Round>(after_ish_apply, ngraph::opset5::Round::RoundMode::HALF_TO_EVEN);

                const auto sub_out_high_low = std::make_shared<ngraph::opset1::Subtract>(oh, ol);
                const auto osc = std::make_shared<ngraph::opset1::Divide>(sub_out_high_low, levels_minus_one);

                const auto after_osc_apply = std::make_shared<ngraph::opset1::Multiply>(round, osc);
                const auto after_out_low_add = std::make_shared<ngraph::opset1::Add>(after_osc_apply, ol);
                std::shared_ptr<ngraph::Node> result = after_out_low_add;

                if (need_convert) {
                    result = std::make_shared<ngraph::opset1::Convert>(result, data_prec);
                }

                f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{result}, params);
            } else {
                // No decomposition expected: the reference keeps the FQ as-is.
                const auto fq = std::make_shared<ngraph::opset1::FakeQuantize>(data, il, ih, ol, oh, levels);
                f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{fq}, params);
            }
        }

        const auto res = compare_functions(f, f_ref);
        ASSERT_TRUE(res.first) << res.second;
    }
};
TEST_P(FakeQuantizeDecompositionTest, CompareFunctions) {}
// ---- Shared parameter pools --------------------------------------------------
const std::vector<ngraph::element::Type_t> precisions = {ngraph::element::Type_t::f16, ngraph::element::Type_t::f32};

const std::vector<size_t> levels = {16, 255, 256};

// Valid ranges (input_low < input_high) => decomposition is expected.
const std::vector<std::pair<float, float>> input_ranges_supported = {
    {-10.0f, 10.f}
};

// Per-channel ranges on a 4D input.
const auto simple_fq_basic = ::testing::Combine(::testing::ValuesIn(precisions),
                                                ::testing::Values(ngraph::Shape{2, 3, 4, 5}),
                                                ::testing::ValuesIn(precisions),
                                                ::testing::Values(ngraph::Shape{1, 3, 1, 1}),
                                                ::testing::Values(ngraph::Shape{1, 3, 1, 1}),
                                                ::testing::Values(ngraph::Shape{1, 3, 1, 1}),
                                                ::testing::Values(ngraph::Shape{1, 3, 1, 1}),
                                                ::testing::ValuesIn(levels));

// Ranges that broadcast along different axes.
const auto broadcast_fq_basic = ::testing::Combine(::testing::ValuesIn(precisions),
                                                   ::testing::Values(ngraph::Shape{2, 3, 4, 5}),
                                                   ::testing::ValuesIn(precisions),
                                                   ::testing::Values(ngraph::Shape{1, 3, 4, 1}),
                                                   ::testing::Values(ngraph::Shape{1, 1, 4, 5}),
                                                   ::testing::Values(ngraph::Shape{1, 1, 1, 1}),
                                                   ::testing::Values(ngraph::Shape{1, 1, 1, 1}),
                                                   ::testing::ValuesIn(levels));

// Elementwise (full-shape) ranges.
const auto elementwise_fq_basic = ::testing::Combine(::testing::ValuesIn(precisions),
                                                     ::testing::Values(ngraph::Shape{2, 3, 4, 5}),
                                                     ::testing::ValuesIn(precisions),
                                                     ::testing::Values(ngraph::Shape{2, 3, 4, 5}),
                                                     ::testing::Values(ngraph::Shape{2, 3, 4, 1}),
                                                     ::testing::Values(ngraph::Shape{2, 3, 4, 5}),
                                                     ::testing::Values(ngraph::Shape{2, 3, 4, 5}),
                                                     ::testing::ValuesIn(levels));

// 6D input with broadcasting ranges.
const auto broadcast_6D_fq_basic = ::testing::Combine(::testing::ValuesIn(precisions),
                                                      ::testing::Values(ngraph::Shape{2, 3, 4, 5, 6, 7}),
                                                      ::testing::ValuesIn(precisions),
                                                      ::testing::Values(ngraph::Shape{2, 3, 4, 1, 1, 1}),
                                                      ::testing::Values(ngraph::Shape{1, 3, 4, 5, 1, 1}),
                                                      ::testing::Values(ngraph::Shape{1, 1, 1, 5, 6, 7}),
                                                      ::testing::Values(ngraph::Shape{1, 1, 1, 5, 6, 7}),
                                                      ::testing::ValuesIn(levels));

// ---- Cases where decomposition IS expected -----------------------------------
INSTANTIATE_TEST_CASE_P(SimpleFakeQuantize_Decomposition, FakeQuantizeDecompositionTest,
                        ::testing::Combine(
                                simple_fq_basic,
                                ::testing::ValuesIn(input_ranges_supported),
                                ::testing::Values(true)),
                        FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(BroadcastFakeQuantize_Decomposition, FakeQuantizeDecompositionTest,
                        ::testing::Combine(
                                broadcast_fq_basic,
                                ::testing::ValuesIn(input_ranges_supported),
                                ::testing::Values(true)),
                        FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(ElementwiseFakeQuantize_Decomposition, FakeQuantizeDecompositionTest,
                        ::testing::Combine(
                                elementwise_fq_basic,
                                ::testing::ValuesIn(input_ranges_supported),
                                ::testing::Values(true)),
                        FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(FakeQuantize6D_Decomposition, FakeQuantizeDecompositionTest,
                        ::testing::Combine(
                                broadcast_6D_fq_basic,
                                ::testing::ValuesIn(input_ranges_supported),
                                ::testing::Values(true)),
                        FakeQuantizeDecompositionTest::getTestCaseName);

// ---- Cases where decomposition is NOT expected (invalid ranges) --------------
// input_low >= input_high in every pair below.
const std::vector<std::pair<float, float>> input_ranges_unsupported = {
    {10.0f, -10.f},
    {5.0f, 5.0f},
    {-5.0f, -5.0f}
};

INSTANTIATE_TEST_CASE_P(SimpleFakeQuantize_NoDecomposition, FakeQuantizeDecompositionTest,
                        ::testing::Combine(
                                simple_fq_basic,
                                ::testing::ValuesIn(input_ranges_unsupported),
                                ::testing::Values(false)),
                        FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(BroadcastFakeQuantize_NoDecomposition, FakeQuantizeDecompositionTest,
                        ::testing::Combine(
                                broadcast_fq_basic,
                                ::testing::ValuesIn(input_ranges_unsupported),
                                ::testing::Values(false)),
                        FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(ElementwiseFakeQuantize_NoDecomposition, FakeQuantizeDecompositionTest,
                        ::testing::Combine(
                                elementwise_fq_basic,
                                ::testing::ValuesIn(input_ranges_unsupported),
                                ::testing::Values(false)),
                        FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(FakeQuantize6D_NoDecomposition, FakeQuantizeDecompositionTest,
                        ::testing::Combine(
                                broadcast_6D_fq_basic,
                                ::testing::ValuesIn(input_ranges_unsupported),
                                ::testing::Values(false)),
                        FakeQuantizeDecompositionTest::getTestCaseName);

View File

@ -0,0 +1,288 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "test_utils/cpu_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
using namespace InferenceEngine;
using namespace ngraph;
using namespace CPUTestUtils;
namespace CPULayerTestsDefinitions {
// FQ-specific parameters: data-generation bounds, output ranges, the shapes of
// all four 'range' inputs, and the quantization levels count.
using fqSpecificParams = std::tuple<int64_t,                  // 'data' input low bounds
                                    int64_t,                  // 'data' input high bounds
                                    std::vector<float>,       // output low
                                    std::vector<float>,       // output high
                                    std::vector<SizeVector>,  // 'range' inputs shapes
                                    size_t>;                  // levels

// Full CPU layer-test parameter set.
using fqLayerTestParamsSet = std::tuple<fqSpecificParams,
                                        SizeVector,                                         // 'data' input shape
                                        Precision,                                          // input precision
                                        std::pair<std::vector<float>, std::vector<float>>,  // il and ih values
                                        bool,                                               // should be decomposed
                                        CPUSpecificParams>;
// CPU single-layer test for FakeQuantize: runs the layer on the CPU plugin,
// compares against reference results, and checks which primitive actually
// executed (native Quantize node vs. a decomposed subgraph).
class FakeQuantizeLayerCPUTest : public testing::WithParamInterface<fqLayerTestParamsSet>,
                                 virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
public:
    // Builds a readable test name from every tuple parameter.
    static std::string getTestCaseName(testing::TestParamInfo<fqLayerTestParamsSet> obj) {
        fqSpecificParams fqParams;
        SizeVector inDataShape;
        Precision inPrec;
        std::pair<std::vector<float>, std::vector<float>> inputRangesValues;
        bool shouldBeDecomposed;
        CPUSpecificParams cpuParams;
        std::tie(fqParams, inDataShape, inPrec, inputRangesValues, shouldBeDecomposed, cpuParams) = obj.param;

        int64_t inDataLowBounds, inDataHighBounds;
        std::vector<float> inputLow, inputHigh, outputLow, outputHigh;
        std::vector<SizeVector> inRangesShapes;
        size_t levels;
        inputLow = inputRangesValues.first;
        inputHigh = inputRangesValues.second;
        std::tie(inDataLowBounds, inDataHighBounds, outputLow, outputHigh, inRangesShapes, levels) = fqParams;

        std::ostringstream result;
        result << "IS=" << CommonTestUtils::vec2str(inDataShape) << "_";
        result << "inPrec=" << inPrec.name() << "_";

        std::string rs = "";
        for (size_t i = 0; i < inRangesShapes.size(); i++) {
            rs += CommonTestUtils::vec2str(inRangesShapes[i]) + "_";
        }
        result << "RS=" << rs;
        result << "LOW_BOUNDS=" << inDataLowBounds << "_";
        result << "HIGH_BOUNDS=" << inDataHighBounds << "_";
        result << "IL=" << CommonTestUtils::vec2str(inputLow) << "_";
        result << "IH=" << CommonTestUtils::vec2str(inputHigh) << "_";
        result << "OL=" << CommonTestUtils::vec2str(outputLow) << "_";
        result << "OH=" << CommonTestUtils::vec2str(outputHigh) << "_";
        result << "LEVELS=" << levels;
        result << CPUTestsBase::getTestCaseName(cpuParams);
        return result.str();
    }

    // Custom inference: fills the single data input with random values inside
    // the configured [low, high) bounds rather than the default blob filler.
    void Infer() override {
        inferRequest = executableNetwork.CreateInferRequest();
        inputs.clear();

        const InputsDataMap &inDataMap = cnnNetwork.getInputsInfo();
        auto input = inDataMap.begin();
        Blob::Ptr blob = FuncTestUtils::createAndFillBlob(input->second->getTensorDesc(), inDataHighBounds - inDataLowBounds, inDataLowBounds);
        inferRequest.SetBlob(input->second->name(), blob);
        inputs.push_back(blob);

        inferRequest.Infer();
    }

protected:
    // Expected executed node type ("Quantize" or empty when decomposed);
    // checked in the test body via CheckPluginRelatedResults().
    std::string layerName;

    void SetUp() override {
        targetDevice = CommonTestUtils::DEVICE_CPU;
        fqSpecificParams fqParams;
        SizeVector inDataShape;
        Precision inPrec;
        std::pair<std::vector<float>, std::vector<float>> inputRangesValues;
        bool shouldBeDecomposed;
        CPUSpecificParams cpuParams;
        std::tie(fqParams, inDataShape, inPrec, inputRangesValues, shouldBeDecomposed, cpuParams) = this->GetParam();

        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;

        std::vector<SizeVector> inRangesShapes;
        size_t levels;
        std::vector<std::vector<float>> rangesBounds(RANGES_INPUT_NUMBER);
        rangesBounds[0] = inputRangesValues.first;
        rangesBounds[1] = inputRangesValues.second;
        std::tie(inDataLowBounds, inDataHighBounds, rangesBounds[2], rangesBounds[3], inRangesShapes, levels) = fqParams;

        auto ngInPrec = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(inPrec);
        ParameterVector params = builder::makeParams(ngInPrec, {inDataShape});
        auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes<opset5::Parameter>(params));

        // Empty bounds vector => makeConstant generates random range values.
        auto il = builder::makeConstant(ngInPrec, inRangesShapes[0], rangesBounds[0], rangesBounds[0].empty());
        auto ih = builder::makeConstant(ngInPrec, inRangesShapes[1], rangesBounds[1], rangesBounds[1].empty());
        auto ol = builder::makeConstant(ngInPrec, inRangesShapes[2], rangesBounds[2], rangesBounds[2].empty());
        auto oh = builder::makeConstant(ngInPrec, inRangesShapes[3], rangesBounds[3], rangesBounds[3].empty());
        auto fq = std::make_shared<opset5::FakeQuantize>(paramOuts[0], il, ih, ol, oh, levels);

        // When decomposed, no "Quantize" node is expected in the exec graph.
        layerName = shouldBeDecomposed ? "" : "Quantize";

        if (selectedType.empty()) {
            selectedType = getPrimitiveType() + "_" + inPrec.name();
        }

        fq->get_rt_info() = getCPUInfo();

        function = std::make_shared<Function>(fq, params, "FakeQuantizeCPU");
    }

private:
    const size_t RANGES_INPUT_NUMBER = 4;

    // Bounds for random generation of the 'data' input (see Infer()).
    int64_t inDataLowBounds, inDataHighBounds;
};
TEST_P(FakeQuantizeLayerCPUTest, CompareWithRefs) {
    Run();
    // Verifies that the expected primitive (or none, if decomposed) executed.
    CheckPluginRelatedResults(executableNetwork, layerName);
}

// ---- Parameters shared by all instantiations below ---------------------------
const std::vector<size_t> levels = {16, 255, 256};

// Random-data generation bounds for the 'data' input.
int64_t dataLowBounds{-10}, dataHighBounds{10};

const std::vector<std::pair<std::vector<float>, std::vector<float>>> input_ranges = {
    {{0.0f}, {5.f}},
    {{-10.0f}, {-5.f}}
};

const std::vector<float> outputLow{5.0f}, outputHigh{25.0f};
// Cases executed by the native CPU Quantize primitive (no decomposition):
// jit and reference implementations, 4D and 5D layouts.
namespace fqImpl {

std::vector<CPUSpecificParams> memForm4D_jit = {
    CPUSpecificParams({nchw}, {nchw}, {}, {}),
    CPUSpecificParams({nhwc}, {nhwc}, {}, {}),
    CPUSpecificParams({nChw16c}, {nChw16c}, {}, {})
};

// Per-channel and per-tensor range shapes supported by the jit kernels.
const std::vector<std::vector<SizeVector>> rangesShapes4D_jit = {
    {{1, 5, 1, 1}, {1, 5, 1, 1}, {1, 5, 1, 1}, {1, 5, 1, 1}},
    {{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}
};

const auto specificParams4D_jit = ::testing::Combine(::testing::Values(dataLowBounds),
                                                     ::testing::Values(dataHighBounds),
                                                     ::testing::Values(outputLow),
                                                     ::testing::Values(outputHigh),
                                                     ::testing::ValuesIn(rangesShapes4D_jit),
                                                     ::testing::ValuesIn(levels));
const auto testParams4D_jit = ::testing::Combine(specificParams4D_jit,
                                                 ::testing::Values(SizeVector{4, 5, 6, 7}),
                                                 ::testing::Values(Precision::FP32),
                                                 ::testing::ValuesIn(input_ranges),
                                                 ::testing::Values(false),
                                                 ::testing::ValuesIn(filterCPUSpecificParams(memForm4D_jit)));

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizeLayerCPUTest_4D_jit, FakeQuantizeLayerCPUTest, testParams4D_jit, FakeQuantizeLayerCPUTest::getTestCaseName);

std::vector<CPUSpecificParams> memForm4D_ref = {
    CPUSpecificParams({nchw}, {nchw}, {"ref_FP32"}, {"ref_FP32"})
};

// Per-batch range shapes fall back to the reference implementation.
const std::vector<std::vector<SizeVector>> rangesShapes4D_ref = {
    {{4, 1, 1, 1}, {4, 1, 1, 1}, {4, 1, 1, 1}, {4, 1, 1, 1}}
};

const auto specificParams4D_ref = ::testing::Combine(::testing::Values(dataLowBounds),
                                                     ::testing::Values(dataHighBounds),
                                                     ::testing::Values(outputLow),
                                                     ::testing::Values(outputHigh),
                                                     ::testing::ValuesIn(rangesShapes4D_ref),
                                                     ::testing::ValuesIn(levels));
const auto testParams4D_ref = ::testing::Combine(specificParams4D_ref,
                                                 ::testing::Values(SizeVector{4, 5, 6, 7}),
                                                 ::testing::Values(Precision::FP32),
                                                 ::testing::ValuesIn(input_ranges),
                                                 ::testing::Values(false),
                                                 ::testing::ValuesIn(memForm4D_ref));

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizeLayerCPUTest_4D_ref, FakeQuantizeLayerCPUTest, testParams4D_ref, FakeQuantizeLayerCPUTest::getTestCaseName);

std::vector<CPUSpecificParams> memForm5D_jit = {
    CPUSpecificParams({ncdhw}, {ncdhw}, {}, {}),
    CPUSpecificParams({ndhwc}, {ndhwc}, {}, {}),
    CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {})
};

const std::vector<std::vector<SizeVector>> rangesShapes5D_jit = {
    {{1, 4, 1, 1, 1}, {1, 4, 1, 1, 1}, {1, 4, 1, 1, 1}, {1, 4, 1, 1, 1}},
    {{1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}
};

const auto specificParams5D_jit = ::testing::Combine(::testing::Values(dataLowBounds),
                                                     ::testing::Values(dataHighBounds),
                                                     ::testing::Values(outputLow),
                                                     ::testing::Values(outputHigh),
                                                     ::testing::ValuesIn(rangesShapes5D_jit),
                                                     ::testing::ValuesIn(levels));
const auto testParams5D_jit = ::testing::Combine(specificParams5D_jit,
                                                 ::testing::Values(SizeVector{3, 4, 5, 6, 7}),
                                                 ::testing::Values(Precision::FP32),
                                                 ::testing::ValuesIn(input_ranges),
                                                 ::testing::Values(false),
                                                 ::testing::ValuesIn(filterCPUSpecificParams(memForm5D_jit)));

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizeLayerCPUTest_5D_jit, FakeQuantizeLayerCPUTest, testParams5D_jit, FakeQuantizeLayerCPUTest::getTestCaseName);

std::vector<CPUSpecificParams> memForm5D_ref = {
    CPUSpecificParams({ncdhw}, {ncdhw}, {"ref_FP32"}, {"ref_FP32"})
};

const std::vector<std::vector<SizeVector>> rangesShapes5D_ref = {
    {{3, 1, 1, 1, 1}, {3, 1, 1, 1, 1}, {3, 1, 1, 1, 1}, {3, 1, 1, 1, 1}}
};

const auto specificParams5D_ref = ::testing::Combine(::testing::Values(dataLowBounds),
                                                     ::testing::Values(dataHighBounds),
                                                     ::testing::Values(outputLow),
                                                     ::testing::Values(outputHigh),
                                                     ::testing::ValuesIn(rangesShapes5D_ref),
                                                     ::testing::ValuesIn(levels));
const auto testParams5D_ref = ::testing::Combine(specificParams5D_ref,
                                                 ::testing::Values(SizeVector{3, 4, 5, 6, 7}),
                                                 ::testing::Values(Precision::FP32),
                                                 ::testing::ValuesIn(input_ranges),
                                                 ::testing::Values(false),
                                                 ::testing::ValuesIn(memForm5D_ref));

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizeLayerCPUTest_5D_ref, FakeQuantizeLayerCPUTest, testParams5D_ref, FakeQuantizeLayerCPUTest::getTestCaseName);

} // namespace fqImpl
// Shapes used by the decomposition cases below (4D, 5D, and 6D data).
const std::vector<SizeVector> dataShapes = {
    {4, 5, 6, 7},
    {3, 4, 5, 6, 7},
    {2, 3, 4, 5, 6, 7},
};

// Range shapes that the native primitive cannot handle (broadcast on
// non-channel axes / multiple axes), forcing decomposition.
const std::vector<std::vector<SizeVector>> rangesShapes = {
    {{4, 5, 6, 7}, {4, 5, 6, 7}, {4, 5, 6, 7}, {4, 5, 6, 7}},
    {{1, 5, 1, 1}, {1, 1, 6, 7}, {1, 1, 6, 7}, {1, 1, 6, 7}},
    {{1, 1, 6, 7}, {1, 1, 6, 7}, {1, 1, 6, 7}, {1, 1, 6, 7}},
    {{1, 1, 6, 7}, {1, 1, 6, 7}, {1, 1, 1, 1}, {1, 1, 1, 1}},
    {{1, 1, 6, 1}, {1, 5, 6, 7}, {1, 1, 6, 1}, {1, 1, 6, 1}}
};
// Cases where the FakeQuantize is expected to be decomposed (shouldBeDecomposed
// = true, so no "Quantize" node may appear in the executed graph).
namespace fqDecompos {

const auto specificParams = ::testing::Combine(::testing::Values(dataLowBounds),
                                               ::testing::Values(dataHighBounds),
                                               ::testing::Values(outputLow),
                                               ::testing::Values(outputHigh),
                                               ::testing::ValuesIn(rangesShapes),
                                               ::testing::ValuesIn(levels));
const auto testParams = ::testing::Combine(specificParams,
                                           ::testing::ValuesIn(dataShapes),
                                           ::testing::Values(Precision::FP32),
                                           ::testing::ValuesIn(input_ranges),
                                           ::testing::Values(true),
                                           ::testing::Values(CPUSpecificParams{}));

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizeLayerCPUTest_Decompos, FakeQuantizeLayerCPUTest, testParams, FakeQuantizeLayerCPUTest::getTestCaseName);

} // namespace fqDecompos
} // namespace CPULayerTestsDefinitions

View File

@ -77,6 +77,8 @@ std::string CPUTestsBase::impls2str(const std::vector<std::string> &priority) {
}
void CPUTestsBase::CheckPluginRelatedResults(InferenceEngine::ExecutableNetwork &execNet, std::string nodeType) const {
if (nodeType.empty()) return;
ASSERT_TRUE(!selectedType.empty()) << "Node type is not defined.";
bool isNodeFound = false;
InferenceEngine::CNNNetwork execGraphInfo = execNet.GetExecGraphInfo();

View File

@ -223,11 +223,11 @@ namespace ngraph
out_high,
i,
out_high_offsets);
if (arg[i] <= in_low_val)
if (arg[i] <= std::min(in_low_val, in_high_val))
{
out[i] = out_low_val;
}
else if (arg[i] > in_high_val)
else if (arg[i] > std::max(in_low_val, in_high_val))
{
out[i] = out_high_val;
}