Add CompressQuantizeWeights transformation (#7538)

* Add CompressQuantizeWeights transformation

It's based on model-optimizer/extensions/back/compress_quantized_weights.py

* handle dequantization subgraph after quantization

* fix scale shift calculation

* remove TRANSFORMATIONS_API from CompressQuantizeWeights

* ZeroPointOptimizer

* add CompressQuantizeWeights pass to ApplyMOCTransformations

* add comment

* fix code style

* cleanup

* ambiguous copy_runtime_info

* ambiguous call

* fix case when zero_point is close to zero

* fix cf test case

* move files

* update tests

* add tests for evaluate_subgraph

* Address review comments

* make params static
This commit is contained in:
Mateusz Tabaka
2022-01-13 23:38:39 +01:00
committed by GitHub
parent 6c69535d6c
commit 508af22c66
12 changed files with 581 additions and 33 deletions

View File

@@ -38,3 +38,4 @@ from openvino.pyopenvino.offline_transformations_pybind import generate_mapping_
from openvino.pyopenvino.offline_transformations_pybind import apply_make_stateful_transformation
from openvino.pyopenvino.offline_transformations_pybind import serialize
from openvino.pyopenvino.offline_transformations_pybind import compress_model_transformation
from openvino.pyopenvino.offline_transformations_pybind import compress_quantize_weights_transformation

View File

@@ -6,6 +6,7 @@
#include <pybind11/stl.h>
#include <compress_quantize_weights.hpp>
#include <generate_mapping_file.hpp>
#include <openvino/pass/make_stateful.hpp>
#include <openvino/pass/serialize.hpp>
@@ -120,6 +121,16 @@ void regmodule_offline_transformations(py::module m) {
},
py::arg("function"));
m_offline_transformations.def(
"compress_quantize_weights_transformation",
[](std::shared_ptr<ov::Model> function) {
ov::pass::Manager manager;
manager.register_pass<ngraph::pass::CompressQuantizeWeights>();
manager.register_pass<ngraph::pass::ZeroPointOptimizer>();
manager.run_passes(function);
},
py::arg("function"));
// todo: remove as serialize as part of passManager api will be merged
m_offline_transformations.def(
"serialize",

View File

@@ -0,0 +1,95 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <transformations_visibility.hpp>
#include <ngraph/pass/graph_rewrite.hpp>
namespace ngraph {
namespace pass {
class CompressQuantizeWeights;
class ZeroPointOptimizer;
} // namespace pass
} // namespace ngraph
/*
CompressQuantizeWeights transformation goal is to pre-quantize data to minimize runtime calculations with constant data.
To achieve this goal we perform FakeQuantize decomposition to separate quantization from dequantization in it.
Initial graph (FakeQuantize where all inputs are Constants):
| | | | |
| | | | |
v v v v v
+------------+
|FakeQuantize|
+------------+
|
v
is replaced to:
+-----------------+
| Constant |
| (low precision) |
+-----------------+
|
v
+------------------+
| Convert |
| (to high prec) |
+------------------+
|
v
+----------+ +------------+
|zero point|--->| Subtract |
+----------+ +-----+------+
|
v
+---------+ +------------+
| scale |--->| Multiply |
+---------+ +-----+------+
|
v
Transformation prepares quantized constant data for Low Precision pipeline.
Such constant data packing reduces IR size (.bin file size) in offline transformations.
With that we can skip same calculations in the runtime and make loading of such sub-graphs to the plugin faster.
*/
// Matcher pass that replaces a constant-input FakeQuantize with a pre-quantized
// low-precision constant followed by a Convert->Subtract->Multiply dequantization
// subgraph (see the diagram in the file-level comment above).
class ngraph::pass::CompressQuantizeWeights: public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    CompressQuantizeWeights();
};
/*
if zero_point == 0 we can eliminate Subtract from following dequantization subgraph:
+-----------------+
| Constant |
| (low precision) |
+-----------------+
|
v
+------------------+
| Convert |
| (to high prec) |
+------------------+
|
v
+----------+ +------------+
|zero point|--->| Subtract |
+----------+ +-----+------+
|
v
*/
// Matcher pass that eliminates the Subtract from a dequantization subgraph when
// the zero point is zero, or folds an integer-valued zero point directly into
// the quantized weights (see the diagram in the comment above).
class ngraph::pass::ZeroPointOptimizer: public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    ZeroPointOptimizer();
};

View File

@@ -0,0 +1,235 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/opsets/opset8.hpp>
#include <ngraph/validation_util.hpp>
#include <ngraph/rt_info.hpp>
#include <openvino/pass/constant_folding.hpp>
#include <compress_quantize_weights.hpp>
NGRAPH_RTTI_DEFINITION(ngraph::pass::CompressQuantizeWeights, "CompressQuantizeWeights", 0);
// Returns true if the given low-precision Convert is followed by a
// dequantization chain: Convert (to high precision) -> [optional Subtract] -> Multiply.
static bool has_dequantization_subgraph(const std::shared_ptr<ngraph::Node>& first_convert) {
    std::shared_ptr<ngraph::Node> second_convert;
    for (const auto& user : first_convert->get_users()) {
        if (ov::is_type<ngraph::opset8::Convert>(user)) {
            second_convert = user;
            break;
        }
    }
    if (!second_convert)
        return false;
    // The zero-point Subtract is optional in the dequantization pattern;
    // if present, continue the search from its consumers.
    auto candidates = second_convert->get_users();
    std::shared_ptr<ngraph::Node> subtract;
    for (const auto& user : candidates) {
        if (ov::is_type<ngraph::opset8::Subtract>(user)) {
            subtract = user;
            break;
        }
    }
    if (subtract)
        candidates = subtract->get_users();
    for (const auto& user : candidates) {
        if (ov::is_type<ngraph::opset8::Multiply>(user))
            return true;
    }
    return false;
}
// Matches a FakeQuantize whose data and all four range inputs are Constants and
// replaces it with quantized low-precision weights plus a dequantization subgraph.
ngraph::pass::CompressQuantizeWeights::CompressQuantizeWeights() {
    auto weights_pattern = pattern::wrap_type<opset8::Constant>();
    auto input_low_pattern = pattern::wrap_type<opset8::Constant>();
    auto input_high_pattern = pattern::wrap_type<opset8::Constant>();
    auto output_low_pattern = pattern::wrap_type<opset8::Constant>();
    auto output_high_pattern = pattern::wrap_type<opset8::Constant>();
    auto fq_pattern = pattern::wrap_type<opset8::FakeQuantize>({weights_pattern, input_low_pattern, input_high_pattern,
                                                                output_low_pattern, output_high_pattern});

    ngraph::matcher_pass_callback callback = [=](pattern::Matcher& m) {
        auto fq = std::dynamic_pointer_cast<opset8::FakeQuantize>(m.get_match_root());
        if (!fq)
            return false;
        auto levels = fq->get_levels();
        // Only 3..256 levels fit the supported low-precision types below.
        if (levels <= 2 || levels > 256)
            return false;
        auto quantized_type = element::undefined;
        // Currently we support two weights quantize types: i4 and i8
        if (levels <= 16) {
            quantized_type = element::i4;
        } else if (levels <= 256) {
            quantized_type = element::i8;
        }

        const auto& pattern_value_map = m.get_pattern_value_map();
        const auto& input_type = fq->get_element_type();

        // skip dequantize part if there is already dequantization subgraph after FakeQuantize
        auto fq_users = fq->get_users();
        if (fq_users.size() == 1 && has_dequantization_subgraph(fq_users[0])) {
            auto& first_convert = fq_users[0];
            // Constant fold FakeQuantize + Convert into a low-precision constant.
            if (auto new_weights = ov::get_constant_from_source(first_convert)) {
                replace_node(first_convert, new_weights);
                // preserve dequantization subgraph for LP transformations
                auto weights_users = new_weights->get_users();
                if (weights_users.size() == 1 && ov::is_type<ngraph::opset8::Convert>(weights_users[0])) {
                    ov::pass::disable_constant_folding(weights_users[0]);
                }
                return true;
            } else {
                return false;
            }
        } else {
            /*
               Quantize part

               Prepare new FakeQuantize that performs weights quantization.
               In this case input_low/high stays the same, but we need new output_low/high:
                 output_low = -levels / 2
                 output_high = levels - 1 + output_low
               The FakeQuantize result is converted to low precision type and then constant folded
            */
            auto new_output_low = op::Constant::create(input_type, Shape{}, {-static_cast<float>(levels / 2)});
            auto new_output_high = std::make_shared<opset8::Add>(new_output_low, op::Constant::create(input_type, Shape{}, {levels - 1}));
            const auto& weights = pattern_value_map.at(weights_pattern);
            const auto& input_low = pattern_value_map.at(input_low_pattern);
            const auto& input_high = pattern_value_map.at(input_high_pattern);
            auto quantize = fq->clone_with_new_inputs({weights, input_low, input_high,
                                                       new_output_low, new_output_high});
            // Convert quantized weights to low precision type
            std::shared_ptr<Node> new_weights = std::make_shared<opset8::Convert>(quantize, quantized_type);
            // Constant fold quantized weights
            if (auto constant = ov::get_constant_from_source(new_weights)) {
                new_weights = constant;
            } else {
                return false;
            }
            new_weights->set_friendly_name(weights.get_node()->get_friendly_name());

            /*
               Dequantize part is performed by Convert(from low to high precision)->Subtract->Multiply subgraph.

                                 +-------------------------+
                                 |         Convert         |
                                 | (from low to high prec) |
                                 +-------------------------+
                                              |
                                              v
                  +----------+         +------------+
                  |zero point|-------->|  Subtract  |
                  +----------+         +-----+------+
                                             |
                                             v
                   +---------+         +------------+
                   |  scale  |-------->|  Multiply  |
                   +---------+         +-----+------+
                                             |
                                             v

               where:
                 scale = (output_high - output_low) / (new_output_high - new_output_low)
                 zero_point = new_output_low - output_low / scale
            */
            const auto& output_low = pattern_value_map.at(output_low_pattern);
            const auto& output_high = pattern_value_map.at(output_high_pattern);
            auto output_range = std::make_shared<opset8::Subtract>(output_high, output_low);
            auto input_range = std::make_shared<opset8::Subtract>(new_output_high, new_output_low);
            std::shared_ptr<Node> scale = std::make_shared<opset8::Divide>(output_range, input_range);
            auto descaled_output_low = std::make_shared<opset8::Divide>(output_low, scale);
            std::shared_ptr<Node> shift = std::make_shared<opset8::Subtract>(new_output_low, descaled_output_low);
            if (auto constant = ov::get_constant_from_source(scale))
                scale = constant;
            auto zero = op::Constant::create(input_type, Shape{}, {0});
            auto scale_eq_zero = std::make_shared<opset8::Equal>(scale, zero);
            // shift equals to new_output_low - output_low / scale
            // for positions where scale == 0, we put zero as shift
            std::shared_ptr<Node> zero_point = std::make_shared<opset8::Select>(scale_eq_zero, zero, shift);
            if (auto constant = ov::get_constant_from_source(zero_point))
                zero_point = constant;
            auto convert_to_high_prec = std::make_shared<opset8::Convert>(new_weights, input_type);
            auto sub = register_new_node<opset8::Subtract>(convert_to_high_prec, zero_point);
            auto mul = register_new_node<opset8::Multiply>(sub, scale);
            mul->set_friendly_name(fq->get_friendly_name());
            copy_runtime_info(fq, {convert_to_high_prec, sub, mul});
            // Keep the Convert un-folded so the LP pipeline still sees the
            // dequantization subgraph.
            ov::pass::disable_constant_folding(convert_to_high_prec);
            replace_node(fq, mul);
        }
        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(fq_pattern, "CompressQuantizeWeights");
    this->register_matcher(m, callback);
}
NGRAPH_RTTI_DEFINITION(ngraph::pass::ZeroPointOptimizer, "ZeroPointOptimizer", 0);
// Removes the zero-point Subtract from Constant(i8)->Convert->Subtract when the
// zero point is zero, or — when the zero point rounds to an integer without
// changing the result — folds it into the quantized weights constant.
ngraph::pass::ZeroPointOptimizer::ZeroPointOptimizer() {
    auto weights_pattern = pattern::wrap_type<opset8::Constant>();
    auto zero_point_pattern = pattern::wrap_type<opset8::Constant>();
    auto convert_pattern = pattern::wrap_type<opset8::Convert>({weights_pattern});
    auto sub_pattern = pattern::wrap_type<opset8::Subtract>({convert_pattern, zero_point_pattern});

    ngraph::matcher_pass_callback callback = [=](pattern::Matcher& m) {
        const auto& pattern_value_map = m.get_pattern_value_map();
        auto convert = pattern_value_map.at(convert_pattern).get_node_shared_ptr();
        auto sub = pattern_value_map.at(sub_pattern).get_node_shared_ptr();
        auto weights = std::dynamic_pointer_cast<opset8::Constant>(pattern_value_map.at(weights_pattern).get_node_shared_ptr());
        if (!weights || weights->get_element_type() != element::i8)
            return false;
        auto zero_point = std::dynamic_pointer_cast<opset8::Constant>(pattern_value_map.at(zero_point_pattern).get_node_shared_ptr());
        if (!zero_point)
            return false;

        auto zp_value = zero_point->cast_vector<float>();
        // If the zero point is already (numerically) zero, the Subtract is a
        // no-op and can simply be removed. Return here: the original code fell
        // through and processed (and replaced) the already-replaced Subtract a
        // second time.
        if (std::all_of(zp_value.begin(), zp_value.end(), [] (float f) -> bool { return std::fabs(f) <= std::numeric_limits<float>::epsilon(); })) {
            copy_runtime_info(sub, convert);
            replace_node(sub, convert);
            return true;
        }

        // Round the zero point to the weights' integer type; adj_zero_point is
        // the rounding error that would remain after folding.
        auto int8_zero_point = std::make_shared<opset8::Convert>(
            std::make_shared<opset8::Round>(zero_point, opset8::Round::RoundMode::HALF_TO_EVEN),
            weights->get_element_type());
        auto adj_zero_point = std::make_shared<opset8::Subtract>(zero_point, std::make_shared<opset8::Convert>(int8_zero_point, convert->get_element_type()));

        auto adj_zero_point_const = ov::get_constant_from_source(adj_zero_point);
        if (!adj_zero_point_const)
            return false;
        auto adj_zero_point_val = adj_zero_point_const->cast_vector<float>();
        bool is_adj_zero_point_close_to_zero = std::all_of(adj_zero_point_val.begin(), adj_zero_point_val.end(),
                                                           [] (float f) -> bool {
                                                               return std::fabs(f) < 1e-4;
                                                           });
        if (!is_adj_zero_point_close_to_zero)
            return false;

        // Verify that folding the zero point into the weights reproduces the
        // original dequantized values exactly.
        auto transformed = std::make_shared<opset8::Subtract>(
            std::make_shared<opset8::Convert>(std::make_shared<opset8::Subtract>(weights, int8_zero_point), convert->get_element_type()),
            adj_zero_point);
        auto diff = std::make_shared<opset8::Subtract>(sub, transformed);
        auto diff_const = ov::get_constant_from_source(diff);
        if (!diff_const)
            return false;
        auto diff_val = diff_const->cast_vector<float>();
        bool is_transformed_and_original_equal = std::all_of(diff_val.begin(), diff_val.end(),
                                                             [] (float f) -> bool {
                                                                 return std::fabs(f) < std::numeric_limits<float>::epsilon();
                                                             });
        if (!is_transformed_and_original_equal)
            return false;

        std::shared_ptr<Node> new_weights = std::make_shared<opset8::Subtract>(weights, int8_zero_point);
        if (auto constant = ov::get_constant_from_source(new_weights))
            new_weights = constant;
        else
            return false;
        new_weights->set_friendly_name(weights->get_friendly_name());
        replace_node(weights, new_weights);

        copy_runtime_info(sub, convert);
        replace_node(sub, convert);
        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(sub_pattern, "ZeroPointOptimizer");
    this->register_matcher(m, callback);
}

View File

@@ -32,8 +32,8 @@ inline uint8_t get_u1(const uint8_t* buf, size_t idx) {
inline void set_u4(uint8_t* buf, size_t idx, uint8_t val) {
const size_t byte_idx = idx / 2;
const uint8_t bit_shift = 4 * (++idx % 2);
buf[byte_idx] &= ~(0xF << bit_shift); // half byte zeroed
buf[byte_idx] |= (val << bit_shift); // set 1's
buf[byte_idx] &= ~(0xF << bit_shift); // half byte zeroed
buf[byte_idx] |= ((val & 0xF) << bit_shift); // set 1's
}
inline uint8_t get_u4(const uint8_t* buf, size_t idx) {
@@ -45,8 +45,8 @@ inline uint8_t get_u4(const uint8_t* buf, size_t idx) {
inline void set_i4(uint8_t* buf, size_t idx, int8_t val) {
const size_t byte_idx = idx / 2;
const uint8_t bit_shift = 4 * (++idx % 2);
buf[byte_idx] &= ~(0xF << bit_shift); // half byte zeroed
buf[byte_idx] |= (val << bit_shift); // set 1's
buf[byte_idx] &= ~(0xF << bit_shift); // half byte zeroed
buf[byte_idx] |= ((val & 0xF) << bit_shift); // set 1's
}
inline int8_t get_i4(const uint8_t* buf, size_t idx) {

View File

@@ -131,12 +131,6 @@ bool evaluate_bound(const Node* node, const HostTensorVector& output_values, boo
NGRAPH_CHECK(node, validate_host_tensor_vector(output_values, 1));
const auto& input = node->input_value(0);
if (const auto& value = is_upper ? input.get_tensor().get_upper_value() : input.get_tensor().get_lower_value()) {
// constants for dynamic values translation
auto input_maximum_value = get_constant_max_of_type(input.get_element_type());
auto output_maximum_value = get_constant_max_of_type(output_values[0]->get_element_type());
if (input_maximum_value == nullptr || output_maximum_value == nullptr)
return false;
OPENVINO_SUPPRESS_DEPRECATED_START
bool status = node->evaluate(output_values, {value});
OPENVINO_SUPPRESS_DEPRECATED_END
@@ -144,6 +138,19 @@ bool evaluate_bound(const Node* node, const HostTensorVector& output_values, boo
if (!status)
return status;
const auto& input_element_type = input.get_element_type();
const auto& output_element_type = output_values[0]->get_element_type();
if ((input_element_type.is_integral() && input_element_type.bitwidth() <= 16) ||
(output_element_type.is_integral() && output_element_type.bitwidth() <= 16)) {
return status;
}
// constants for dynamic values translation
auto input_maximum_value = get_constant_max_of_type(input_element_type);
auto output_maximum_value = get_constant_max_of_type(output_values[0]->get_element_type());
if (input_maximum_value == nullptr || output_maximum_value == nullptr)
return false;
// dynamic values translation
auto input_dynamic_mask = std::make_shared<HostTensor>(element::boolean, input.get_shape());
status =

View File

@@ -35,8 +35,10 @@ bool evaluate_subtract(const HostTensorPtr& arg0,
bool rc = true;
out->set_broadcast(broadcast_spec, arg0, arg1);
switch (arg0->get_element_type()) {
NGRAPH_TYPE_CASE(evaluate_subtract, i8, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, i32, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, i64, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, u8, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, u32, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, u64, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, f16, arg0, arg1, out, broadcast_spec);

View File

@@ -1348,9 +1348,6 @@ shared_ptr<op::Constant> ngraph::get_constant_max_of_type(element::Type_t t) {
NGRAPH_TYPE_TO_MAX_CONST(element::u16);
NGRAPH_TYPE_TO_MAX_CONST(element::u32);
NGRAPH_TYPE_TO_MAX_CONST(element::u64);
case element::undefined:
case element::dynamic:
default:
return nullptr;
}
@@ -1377,9 +1374,6 @@ shared_ptr<op::Constant> ngraph::get_constant_min_of_type(element::Type_t t) {
NGRAPH_TYPE_TO_MIN_CONST(element::u16);
NGRAPH_TYPE_TO_MIN_CONST(element::u32);
NGRAPH_TYPE_TO_MIN_CONST(element::u64);
case element::undefined:
case element::dynamic:
default:
return nullptr;
}

View File

@@ -266,6 +266,8 @@ TEST(constant_folding, constant_unary_binary) {
vector<int> values_g{1, 4};
vector<char> values_h{0, 0, 1, 1};
vector<char> values_i{0, 1};
vector<int8_t> values_j{-3, 5};
vector<uint8_t> values_k{3, 5};
auto a = make_shared<op::Constant>(element::i32, Shape{2, 2}, values_a);
auto b = make_shared<op::Constant>(element::i32, Shape{2, 2}, values_b);
auto c = make_shared<op::Constant>(element::i32, Shape{2, 2}, values_c);
@@ -275,6 +277,8 @@ TEST(constant_folding, constant_unary_binary) {
auto g = make_shared<op::Constant>(element::i32, Shape{2}, values_g);
auto h = make_shared<op::Constant>(element::boolean, Shape{2, 2}, values_h);
auto i = make_shared<op::Constant>(element::boolean, Shape{2}, values_i);
auto j = make_shared<op::Constant>(element::i8, Shape{2}, values_j);
auto k = make_shared<op::Constant>(element::u8, Shape{2}, values_k);
auto doubles = make_shared<op::Constant>(element::f64, Shape{2}, std::vector<double>{4.0, 9.0});
auto add = make_shared<op::v1::Add>(a, b);
@@ -303,6 +307,8 @@ TEST(constant_folding, constant_unary_binary) {
auto logical_or_autob_numpy = make_shared<op::v1::LogicalOr>(h, i, op::AutoBroadcastType::NUMPY);
auto logical_xor_autob_numpy = make_shared<op::Xor>(h, i, op::AutoBroadcastType::NUMPY);
auto doubles_sqrt = make_shared<op::Sqrt>(doubles);
auto sub_int8 = make_shared<op::v1::Subtract>(j, j);
auto sub_uint8 = make_shared<op::v1::Subtract>(k, k);
auto neg_sqrt = make_shared<op::Sqrt>(c);
@@ -331,7 +337,9 @@ TEST(constant_folding, constant_unary_binary) {
less_eq_autob_numpy,
logical_or_autob_numpy,
logical_xor_autob_numpy,
doubles_sqrt},
doubles_sqrt,
sub_int8,
sub_uint8},
ParameterVector{});
auto func_error = make_shared<Function>(NodeVector{neg_sqrt}, ParameterVector{});
@@ -365,6 +373,8 @@ TEST(constant_folding, constant_unary_binary) {
vector<char> logical_or_autob_numpy_expected{0, 1, 1, 1};
vector<char> logical_xor_autob_numpy_expected{0, 1, 1, 0};
vector<double> doubles_sqrt_expected{2.0, 3.0};
vector<int8_t> sub_int8_expected{0, 0};
vector<uint8_t> sub_uint8_expected{0, 0};
ASSERT_EQ(get_result_constant<int>(func, 0), add_expected);
ASSERT_EQ(get_result_constant<int>(func, 1), sub_expected);
@@ -392,13 +402,15 @@ TEST(constant_folding, constant_unary_binary) {
ASSERT_EQ(get_result_constant<char>(func, 23), logical_or_autob_numpy_expected);
ASSERT_EQ(get_result_constant<char>(func, 24), logical_xor_autob_numpy_expected);
ASSERT_EQ(get_result_constant<double>(func, 25), doubles_sqrt_expected);
ASSERT_EQ(get_result_constant<int8_t>(func, 26), sub_int8_expected);
ASSERT_EQ(get_result_constant<uint8_t>(func, 27), sub_uint8_expected);
ASSERT_NO_THROW(pass_manager.run_passes(func_error));
}
template <typename T, typename U>
template <element::Type_t from, element::Type_t to, typename T, typename U>
static void test_const_convert(const vector<T>& values_in, const vector<U>& values_expected) {
auto constant = op::Constant::create(element::from<T>(), Shape{values_in.size()}, values_in);
auto convert = make_shared<op::Convert>(constant, element::from<U>());
auto constant = op::Constant::create(from, Shape{values_in.size()}, values_in);
auto convert = make_shared<op::Convert>(constant, to);
convert->set_friendly_name("test");
auto f = make_shared<Function>(convert, ParameterVector{});
@@ -412,8 +424,8 @@ static void test_const_convert(const vector<T>& values_in, const vector<U>& valu
auto new_const = ov::as_type_ptr<op::Constant>(f->get_results().at(0)->input_value(0).get_node_shared_ptr());
ASSERT_TRUE(new_const);
ASSERT_EQ(new_const->get_friendly_name(), "test");
ASSERT_EQ(new_const->get_output_element_type(0), element::from<U>());
auto values_out = new_const->template get_vector<U>();
ASSERT_EQ(new_const->get_output_element_type(0), to);
auto values_out = new_const->template cast_vector<U>();
ASSERT_EQ(values_expected, values_out);
}
@@ -422,47 +434,57 @@ TEST(constant_folding, const_convert) {
{
vector<float> in{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7};
vector<uint64_t> expected{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7};
test_const_convert(in, expected);
test_const_convert<element::f32, element::u64>(in, expected);
}
{
vector<bool> in{false, true, true, false, false, false, true};
vector<float> expected{0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f};
test_const_convert(in, expected);
test_const_convert<element::boolean, element::f32>(in, expected);
}
{
vector<float> in{1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f};
vector<bool> expected{true, false, true, false, true, false, true};
test_const_convert(in, expected);
test_const_convert<element::f32, element::boolean>(in, expected);
}
{
vector<int64_t> in{1, 2, 3, 4, 5};
vector<double> expected{1.0, 2.0, 3.0, 4.0, 5.0};
test_const_convert(in, expected);
test_const_convert<element::i64, element::f64>(in, expected);
}
{
vector<double> in{1.2, 2.1, 3.3, 4.45, 5.02};
vector<int64_t> expected{1, 2, 3, 4, 5};
test_const_convert(in, expected);
test_const_convert<element::f64, element::i64>(in, expected);
}
{
vector<int8_t> in{7, 0, 1, 2, 3, 4, 5, -1, -2, -8};
vector<float> expected{7, 0, 1, 2, 3, 4, 5, -1, -2, -8};
test_const_convert<element::i4, element::f32>(in, expected);
}
{
vector<float> in{9, 0, 1, 2, 3, 4, 5, -1, -2, -10};
vector<int8_t> expected{-7, 0, 1, 2, 3, 4, 5, -1, -2, 6};
test_const_convert<element::f32, element::i4>(in, expected);
}
{
vector<int8_t> in{-128, -2, 0, 1, 3, 127};
vector<float> expected{-128, -2, 0, 1, 3, 127};
test_const_convert(in, expected);
test_const_convert<element::i8, element::f32>(in, expected);
}
{
vector<uint8_t> in{0, 1, 3, 127, 255};
vector<float> expected{0, 1, 3, 127, 255};
test_const_convert(in, expected);
test_const_convert<element::u8, element::f32>(in, expected);
}
{
vector<float> in{-300, -128, -1, 0, 33, 127, 128};
vector<int8_t> expected{-44, -128, -1, 0, 33, 127, -128};
test_const_convert(in, expected);
test_const_convert<element::f32, element::i8>(in, expected);
}
{
vector<float> in{0, 33, 127, 255, 256};
vector<uint8_t> expected{0, 33, 127, 255, 0};
test_const_convert(in, expected);
test_const_convert<element::f32, element::u8>(in, expected);
}
}

View File

@@ -88,12 +88,12 @@ std::tuple<std::shared_ptr<ngraph::Node>, std::shared_ptr<ngraph::Node>> get_inp
input_low =
std::make_shared<default_opset::Multiply>(y_scale,
std::make_shared<default_opset::Subtract>(output_low, zero_point));
if (auto constant = get_constant_from_source(input_low))
if (auto constant = ov::get_constant_from_source(input_low))
input_low = constant;
input_high =
std::make_shared<default_opset::Multiply>(y_scale,
std::make_shared<default_opset::Subtract>(output_high, zero_point));
if (auto constant = get_constant_from_source(input_high))
if (auto constant = ov::get_constant_from_source(input_high))
input_high = constant;
return std::make_tuple(input_low, input_high);

View File

@@ -0,0 +1,179 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <memory>
#include <ngraph/function.hpp>
#include <ngraph/opsets/opset8.hpp>
#include <compress_quantize_weights.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/utils/utils.hpp>
#include <ngraph/pass/manager.hpp>
#include "common_test_utils/ngraph_test_utils.hpp"
using namespace testing;
using namespace ngraph;
// Parameters for a single CompressQuantizeWeights test case: the FakeQuantize
// configuration and the expected compressed result.
struct CompressQuantizeWeightsParams {
    Shape shape;                          // weights constant shape
    std::vector<float> weights;           // original float weight values
    float in_low;                         // FakeQuantize input_low
    float in_high;                        // FakeQuantize input_high
    float out_low;                        // FakeQuantize output_low
    float out_high;                       // FakeQuantize output_high
    size_t levels;                        // FakeQuantize levels
    element::Type_t expected_type;        // expected low-precision weights type (i4 or i8)
    std::vector<float> expected_weights;  // expected quantized weight values
    float scale_val;                      // expected dequantization scale
    float zero_point_val;                 // expected dequantization zero point
};
// Parameterized fixture: builds a FakeQuantize over constant weights as the
// tested function, registers CompressQuantizeWeights in the manager, and sets
// the expected Constant->Convert->Subtract->Multiply graph as the reference.
class CompressQuantizeWeightsTests
    : public testing::WithParamInterface<CompressQuantizeWeightsParams>,
      public TransformationTestsF {
    void SetUp() override {
        TransformationTestsF::SetUp();
        auto param = GetParam();
        // Tested graph: FakeQuantize with all-constant inputs.
        {
            auto data = opset8::Constant::create(element::f32, param.shape, param.weights);
            auto input_low = opset8::Constant::create(element::f32, Shape{}, {param.in_low});
            auto input_high = opset8::Constant::create(element::f32, Shape{}, {param.in_high});
            auto output_low = opset8::Constant::create(element::f32, Shape{}, {param.out_low});
            auto output_high = opset8::Constant::create(element::f32, Shape{}, {param.out_high});
            auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, param.levels);
            function = std::make_shared<Function>(fq, ParameterVector{});
        }
        manager.register_pass<pass::CompressQuantizeWeights>();
        // Reference graph: quantized constant + dequantization subgraph.
        {
            auto data = opset8::Constant::create(param.expected_type, param.shape, param.expected_weights);
            auto convert = std::make_shared<opset8::Convert>(data, element::f32);
            auto scale = opset8::Constant::create(element::f32, Shape{}, {param.scale_val});
            auto zero_point = opset8::Constant::create(element::f32, Shape{}, {param.zero_point_val});
            auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
            auto mul = std::make_shared<opset8::Multiply>(sub, scale);
            function_ref = std::make_shared<Function>(mul, ParameterVector{});
        }
        // Compare constant values too, not only the graph structure.
        comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
        enable_accuracy_check();
    }
};
// Body intentionally empty: the comparison runs in the fixture's teardown
// (TransformationTestsF machinery).
TEST_P(CompressQuantizeWeightsTests, FusionTest) {
}

// {shape, weights, in_low, in_high, out_low, out_high, levels,
//  expected_type, expected_weights, scale, zero_point}
static std::vector<CompressQuantizeWeightsParams> params = {
    {Shape{2, 3, 1, 1}, {-1, 2, 3, 4, 5, 11}, 0, 10, -1, 5, 3, element::i4, {-1, -1, 0, 0, 0, 1}, 3, -0.666667},
    {Shape{2, 3, 1, 1}, {-1, 2, 3, 4, 5, 11}, 0, 10, -1, 4, 16, element::i4, {-8, -5, -4, -2, 0, 7}, 0.333333, -5},
    {Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11}, 1, 9, -2, 6, 17, element::i8, {-8, -8, -8, -6, -4, -2, 0, 8}, 0.5, -4},
    {Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11}, 1, 9, -2, 6, 256, element::i8, {-128, -128, -128, -96, -64, -32, 0, 127}, 0.0313725, -64.25},
};

INSTANTIATE_TEST_SUITE_P(TransformationTests, CompressQuantizeWeightsTests, ::testing::ValuesIn(params));
// FakeQuantize already followed by a dequantization subgraph: only the
// FQ + Convert-to-i8 part should be constant folded; the existing
// Subtract/Multiply dequantization must be preserved unchanged.
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithDequantizationSubgraph) {
    {
        auto data = opset8::Constant::create(element::f32, Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11});
        auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
        auto input_high = opset8::Constant::create(element::f32, Shape{}, {9});
        auto output_low = opset8::Constant::create(element::f32, Shape{}, {-128});
        auto output_high = opset8::Constant::create(element::f32, Shape{}, {127});
        auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
        auto convert = std::make_shared<opset8::Convert>(fq, element::i8);
        auto second_convert = std::make_shared<opset8::Convert>(convert, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{}, {10.0 / 255});
        auto zero_point = opset8::Constant::create(element::f32, Shape{}, {2 - 255.0 / 10});
        auto sub = std::make_shared<opset8::Subtract>(second_convert, zero_point);
        auto mul = std::make_shared<opset8::Multiply>(sub, scale);
        function = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});

        manager.register_pass<pass::CompressQuantizeWeights>();
    }
    // Reference: quantized i8 constant with the original dequantization kept.
    {
        auto data = opset8::Constant::create(element::i8, Shape{2, 4, 1, 1}, {-128, -128, -128, -96, -64, -32, 0, 127});
        auto convert = std::make_shared<opset8::Convert>(data, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{}, {10.0 / 255});
        auto zero_point = opset8::Constant::create(element::f32, Shape{}, {2 - 255.0 / 10});
        auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
        auto mul = std::make_shared<opset8::Multiply>(sub, scale);
        function_ref = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});
    }
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
// Symmetric ranges produce a (near-)zero zero point, so ZeroPointOptimizer
// is expected to remove the Subtract entirely from the result.
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizer) {
    {
        auto data = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.144816, 0.0858578, 0.110928});
        auto input_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
        auto input_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.380155, 0.33788});
        auto output_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
        auto output_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.380155, 0.33788});
        auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
        function = std::make_shared<Function>(NodeVector{fq}, ParameterVector{});

        manager.register_pass<pass::CompressQuantizeWeights>();
        manager.register_pass<pass::ZeroPointOptimizer>();
    }
    // Reference: Convert->Multiply only, no zero-point Subtract.
    {
        auto data = opset8::Constant::create(element::i8, Shape{3, 1, 1, 1}, {-46, 29, 42});
        auto convert = std::make_shared<opset8::Convert>(data, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.00314577, 0.00299335, 0.00266047});
        auto mul = std::make_shared<opset8::Multiply>(convert, scale);
        function_ref = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});
    }
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
// Asymmetric ranges yield a non-zero, non-foldable zero point:
// ZeroPointOptimizer must keep the Subtract in the dequantization subgraph.
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsWithZeroPointOptimizer) {
    {
        auto data = opset8::Constant::create(element::f32, Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11});
        auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
        auto input_high = opset8::Constant::create(element::f32, Shape{}, {9});
        auto output_low = opset8::Constant::create(element::f32, Shape{}, {-2});
        auto output_high = opset8::Constant::create(element::f32, Shape{}, {6});
        auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
        function = std::make_shared<Function>(NodeVector{fq}, ParameterVector{});

        manager.register_pass<pass::CompressQuantizeWeights>();
        manager.register_pass<pass::ZeroPointOptimizer>();
    }
    // Reference: full Convert->Subtract->Multiply dequantization remains.
    {
        auto data = opset8::Constant::create(element::i8, Shape{2, 4, 1, 1}, {-128, -128, -128, -96, -64, -32, 0, 127});
        auto convert = std::make_shared<opset8::Convert>(data, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{}, {0.0313725});
        auto zero_point = opset8::Constant::create(element::f32, Shape{}, {-64.25});
        auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
        auto mul = std::make_shared<opset8::Multiply>(sub, scale);
        function_ref = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});
    }
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
// FakeQuantize on a non-constant (Parameter) input must not be transformed.
// NOTE(review): no function_ref is set — presumably TransformationTestsF then
// compares against an untouched clone of `function`; confirm with the fixture.
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsNonConstantInput) {
    auto data = std::make_shared<opset8::Parameter>(element::f32, Shape{2, 4, 1, 1});
    auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
    auto input_high = opset8::Constant::create(element::f32, Shape{}, {9});
    auto output_low = opset8::Constant::create(element::f32, Shape{}, {-2});
    auto output_high = opset8::Constant::create(element::f32, Shape{}, {6});
    auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
    function = std::make_shared<Function>(NodeVector{fq}, ParameterVector{data});

    manager.register_pass<pass::CompressQuantizeWeights>();
    manager.register_pass<pass::ZeroPointOptimizer>();

    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}

View File

@@ -24,6 +24,8 @@ def moc_emit_ir(ngraph_function: Model, argv: argparse.Namespace):
apply_user_transformations(ngraph_function, parse_transform(argv.transform))
apply_moc_transformations(ngraph_function)
from openvino.offline_transformations_pybind import compress_quantize_weights_transformation
compress_quantize_weights_transformation(ngraph_function)
if argv.framework == "onnx":
# set OldApi map in IR to be executed via OV API 1.x and for parity with legacy MO