Add CompressQuantizeWeights transformation (#7538)

* Add CompressQuantizeWeights transformation

It's based on model-optimizer/extensions/back/compress_quantized_weights.py

* handle dequantization subgraph after quantization

* fix scale shift calculation

* remove TRANSFORMATIONS_API from CompressQuantizeWeights

* ZeroPointOptimizer

* add CompressQuantizeWeights pass to ApplyMOCTransformations

* add comment

* fix code style

* cleanup

* ambiguous copy_runtime_info

* ambiguous call

* fix case when zero_point is close to zero

* fix cf test case

* move files

* update tests

* add tests for evaluate_subgraph

* Address review comments

* make params static
This commit is contained in:
Mateusz Tabaka
2022-01-13 23:38:39 +01:00
committed by GitHub
parent 6c69535d6c
commit 508af22c66
12 changed files with 581 additions and 33 deletions

View File

@@ -38,3 +38,4 @@ from openvino.pyopenvino.offline_transformations_pybind import generate_mapping_
from openvino.pyopenvino.offline_transformations_pybind import apply_make_stateful_transformation
from openvino.pyopenvino.offline_transformations_pybind import serialize
from openvino.pyopenvino.offline_transformations_pybind import compress_model_transformation
from openvino.pyopenvino.offline_transformations_pybind import compress_quantize_weights_transformation

View File

@@ -6,6 +6,7 @@
#include <pybind11/stl.h>
#include <compress_quantize_weights.hpp>
#include <generate_mapping_file.hpp>
#include <openvino/pass/make_stateful.hpp>
#include <openvino/pass/serialize.hpp>
@@ -120,6 +121,16 @@ void regmodule_offline_transformations(py::module m) {
},
py::arg("function"));
m_offline_transformations.def(
"compress_quantize_weights_transformation",
[](std::shared_ptr<ov::Model> function) {
ov::pass::Manager manager;
manager.register_pass<ngraph::pass::CompressQuantizeWeights>();
manager.register_pass<ngraph::pass::ZeroPointOptimizer>();
manager.run_passes(function);
},
py::arg("function"));
// todo: remove as serialize as part of passManager api will be merged
m_offline_transformations.def(
"serialize",

View File

@@ -0,0 +1,95 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <transformations_visibility.hpp>
#include <ngraph/pass/graph_rewrite.hpp>
namespace ngraph {
namespace pass {
class CompressQuantizeWeights;
class ZeroPointOptimizer;
} // namespace pass
} // namespace ngraph
/*
CompressQuantizeWeights transformation goal is to pre-quantize data to minimize runtime calculations with constant data.
To achieve this goal we perform FakeQuantize decomposition to separate quantization from dequantization in it.
Initial graph (FakeQuantize where all inputs are Constants):
| | | | |
| | | | |
v v v v v
+------------+
|FakeQuantize|
+------------+
|
v
is replaced to:
+-----------------+
| Constant |
| (low precision) |
+-----------------+
|
v
+------------------+
| Convert |
| (to high prec) |
+------------------+
|
v
+----------+ +------------+
|zero point|--->| Subtract |
+----------+ +-----+------+
|
v
+---------+ +------------+
| scale |--->| Multiply |
+---------+ +-----+------+
|
v
Transformation prepares quantized constant data for Low Precision pipeline.
Such constant data packing reduces IR size (.bin file size) in offline transformations.
With that we can skip same calculations in the runtime and make loading of such sub-graphs to the plugin faster.
*/
// Matcher pass that replaces a constant-input FakeQuantize with a pre-quantized
// low-precision constant followed by a Convert->Subtract->Multiply dequantization
// subgraph (see the diagram in the file-level comment above).
class ngraph::pass::CompressQuantizeWeights: public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    CompressQuantizeWeights();
};
/*
if zero_point == 0 we can eliminate Subtract from following dequantization subgraph:
+-----------------+
| Constant |
| (low precision) |
+-----------------+
|
v
+------------------+
| Convert |
| (to high prec) |
+------------------+
|
v
+----------+ +------------+
|zero point|--->| Subtract |
+----------+ +-----+------+
|
v
*/
// Matcher pass that eliminates the Subtract from a dequantization subgraph when
// the zero point is zero, or folds an integer-valued zero point directly into
// the quantized weights (see the diagram in the comment above).
class ngraph::pass::ZeroPointOptimizer: public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    ZeroPointOptimizer();
};

View File

@@ -0,0 +1,235 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/opsets/opset8.hpp>
#include <ngraph/validation_util.hpp>
#include <ngraph/rt_info.hpp>
#include <openvino/pass/constant_folding.hpp>
#include <compress_quantize_weights.hpp>
NGRAPH_RTTI_DEFINITION(ngraph::pass::CompressQuantizeWeights, "CompressQuantizeWeights", 0);
// Returns true if the given low-precision Convert is followed by a
// dequantization chain: Convert (to high precision) -> [optional Subtract] -> Multiply.
static bool has_dequantization_subgraph(const std::shared_ptr<ngraph::Node>& first_convert) {
    std::shared_ptr<ngraph::Node> second_convert;
    for (const auto& user : first_convert->get_users()) {
        if (ov::is_type<ngraph::opset8::Convert>(user)) {
            second_convert = user;
            break;
        }
    }
    if (!second_convert)
        return false;
    // The zero-point Subtract is optional in the dequantization pattern;
    // if present, continue the search from its consumers.
    auto candidates = second_convert->get_users();
    std::shared_ptr<ngraph::Node> subtract;
    for (const auto& user : candidates) {
        if (ov::is_type<ngraph::opset8::Subtract>(user)) {
            subtract = user;
            break;
        }
    }
    if (subtract)
        candidates = subtract->get_users();
    for (const auto& user : candidates) {
        if (ov::is_type<ngraph::opset8::Multiply>(user))
            return true;
    }
    return false;
}
// Matches a FakeQuantize whose data and all four range inputs are Constants and
// replaces it with quantized low-precision weights plus a dequantization subgraph.
ngraph::pass::CompressQuantizeWeights::CompressQuantizeWeights() {
    auto weights_pattern = pattern::wrap_type<opset8::Constant>();
    auto input_low_pattern = pattern::wrap_type<opset8::Constant>();
    auto input_high_pattern = pattern::wrap_type<opset8::Constant>();
    auto output_low_pattern = pattern::wrap_type<opset8::Constant>();
    auto output_high_pattern = pattern::wrap_type<opset8::Constant>();
    auto fq_pattern = pattern::wrap_type<opset8::FakeQuantize>({weights_pattern, input_low_pattern, input_high_pattern,
                                                                output_low_pattern, output_high_pattern});

    ngraph::matcher_pass_callback callback = [=](pattern::Matcher& m) {
        auto fq = std::dynamic_pointer_cast<opset8::FakeQuantize>(m.get_match_root());
        if (!fq)
            return false;
        auto levels = fq->get_levels();
        // Only 3..256 levels fit the supported low-precision types below.
        if (levels <= 2 || levels > 256)
            return false;
        auto quantized_type = element::undefined;
        // Currently we support two weights quantize types: i4 and i8
        if (levels <= 16) {
            quantized_type = element::i4;
        } else if (levels <= 256) {
            quantized_type = element::i8;
        }

        const auto& pattern_value_map = m.get_pattern_value_map();
        const auto& input_type = fq->get_element_type();

        // skip dequantize part if there is already dequantization subgraph after FakeQuantize
        auto fq_users = fq->get_users();
        if (fq_users.size() == 1 && has_dequantization_subgraph(fq_users[0])) {
            auto& first_convert = fq_users[0];
            // Constant fold FakeQuantize + Convert into a low-precision constant.
            if (auto new_weights = ov::get_constant_from_source(first_convert)) {
                replace_node(first_convert, new_weights);
                // preserve dequantization subgraph for LP transformations
                auto weights_users = new_weights->get_users();
                if (weights_users.size() == 1 && ov::is_type<ngraph::opset8::Convert>(weights_users[0])) {
                    ov::pass::disable_constant_folding(weights_users[0]);
                }
                return true;
            } else {
                return false;
            }
        } else {
            /*
               Quantize part

               Prepare new FakeQuantize that performs weights quantization.
               In this case input_low/high stays the same, but we need new output_low/high:
                 output_low = -levels / 2
                 output_high = levels - 1 + output_low
               The FakeQuantize result is converted to low precision type and then constant folded
            */
            auto new_output_low = op::Constant::create(input_type, Shape{}, {-static_cast<float>(levels / 2)});
            auto new_output_high = std::make_shared<opset8::Add>(new_output_low, op::Constant::create(input_type, Shape{}, {levels - 1}));
            const auto& weights = pattern_value_map.at(weights_pattern);
            const auto& input_low = pattern_value_map.at(input_low_pattern);
            const auto& input_high = pattern_value_map.at(input_high_pattern);
            auto quantize = fq->clone_with_new_inputs({weights, input_low, input_high,
                                                       new_output_low, new_output_high});
            // Convert quantized weights to low precision type
            std::shared_ptr<Node> new_weights = std::make_shared<opset8::Convert>(quantize, quantized_type);
            // Constant fold quantized weights
            if (auto constant = ov::get_constant_from_source(new_weights)) {
                new_weights = constant;
            } else {
                return false;
            }
            new_weights->set_friendly_name(weights.get_node()->get_friendly_name());

            /*
               Dequantize part is performed by Convert(from low to high precision)->Subtract->Multiply subgraph.

                                 +-------------------------+
                                 |         Convert         |
                                 | (from low to high prec) |
                                 +-------------------------+
                                              |
                                              v
                  +----------+         +------------+
                  |zero point|-------->|  Subtract  |
                  +----------+         +-----+------+
                                             |
                                             v
                   +---------+         +------------+
                   |  scale  |-------->|  Multiply  |
                   +---------+         +-----+------+
                                             |
                                             v

               where:
                 scale = (output_high - output_low) / (new_output_high - new_output_low)
                 zero_point = new_output_low - output_low / scale
            */
            const auto& output_low = pattern_value_map.at(output_low_pattern);
            const auto& output_high = pattern_value_map.at(output_high_pattern);
            auto output_range = std::make_shared<opset8::Subtract>(output_high, output_low);
            auto input_range = std::make_shared<opset8::Subtract>(new_output_high, new_output_low);
            std::shared_ptr<Node> scale = std::make_shared<opset8::Divide>(output_range, input_range);
            auto descaled_output_low = std::make_shared<opset8::Divide>(output_low, scale);
            std::shared_ptr<Node> shift = std::make_shared<opset8::Subtract>(new_output_low, descaled_output_low);
            if (auto constant = ov::get_constant_from_source(scale))
                scale = constant;
            auto zero = op::Constant::create(input_type, Shape{}, {0});
            auto scale_eq_zero = std::make_shared<opset8::Equal>(scale, zero);
            // shift equals to new_output_low - output_low / scale
            // for positions where scale == 0, we put zero as shift
            std::shared_ptr<Node> zero_point = std::make_shared<opset8::Select>(scale_eq_zero, zero, shift);
            if (auto constant = ov::get_constant_from_source(zero_point))
                zero_point = constant;
            auto convert_to_high_prec = std::make_shared<opset8::Convert>(new_weights, input_type);
            auto sub = register_new_node<opset8::Subtract>(convert_to_high_prec, zero_point);
            auto mul = register_new_node<opset8::Multiply>(sub, scale);
            mul->set_friendly_name(fq->get_friendly_name());
            copy_runtime_info(fq, {convert_to_high_prec, sub, mul});
            // Keep the Convert un-folded so the LP pipeline still sees the
            // dequantization subgraph.
            ov::pass::disable_constant_folding(convert_to_high_prec);
            replace_node(fq, mul);
        }
        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(fq_pattern, "CompressQuantizeWeights");
    this->register_matcher(m, callback);
}
NGRAPH_RTTI_DEFINITION(ngraph::pass::ZeroPointOptimizer, "ZeroPointOptimizer", 0);
// Removes the zero-point Subtract from Constant(i8)->Convert->Subtract when the
// zero point is zero, or — when the zero point rounds to an integer without
// changing the result — folds it into the quantized weights constant.
ngraph::pass::ZeroPointOptimizer::ZeroPointOptimizer() {
    auto weights_pattern = pattern::wrap_type<opset8::Constant>();
    auto zero_point_pattern = pattern::wrap_type<opset8::Constant>();
    auto convert_pattern = pattern::wrap_type<opset8::Convert>({weights_pattern});
    auto sub_pattern = pattern::wrap_type<opset8::Subtract>({convert_pattern, zero_point_pattern});

    ngraph::matcher_pass_callback callback = [=](pattern::Matcher& m) {
        const auto& pattern_value_map = m.get_pattern_value_map();
        auto convert = pattern_value_map.at(convert_pattern).get_node_shared_ptr();
        auto sub = pattern_value_map.at(sub_pattern).get_node_shared_ptr();
        auto weights = std::dynamic_pointer_cast<opset8::Constant>(pattern_value_map.at(weights_pattern).get_node_shared_ptr());
        if (!weights || weights->get_element_type() != element::i8)
            return false;
        auto zero_point = std::dynamic_pointer_cast<opset8::Constant>(pattern_value_map.at(zero_point_pattern).get_node_shared_ptr());
        if (!zero_point)
            return false;

        auto zp_value = zero_point->cast_vector<float>();
        // If the zero point is already (numerically) zero, the Subtract is a
        // no-op and can simply be removed. Return here: the original code fell
        // through and processed (and replaced) the already-replaced Subtract a
        // second time.
        if (std::all_of(zp_value.begin(), zp_value.end(), [] (float f) -> bool { return std::fabs(f) <= std::numeric_limits<float>::epsilon(); })) {
            copy_runtime_info(sub, convert);
            replace_node(sub, convert);
            return true;
        }

        // Round the zero point to the weights' integer type; adj_zero_point is
        // the rounding error that would remain after folding.
        auto int8_zero_point = std::make_shared<opset8::Convert>(
            std::make_shared<opset8::Round>(zero_point, opset8::Round::RoundMode::HALF_TO_EVEN),
            weights->get_element_type());
        auto adj_zero_point = std::make_shared<opset8::Subtract>(zero_point, std::make_shared<opset8::Convert>(int8_zero_point, convert->get_element_type()));

        auto adj_zero_point_const = ov::get_constant_from_source(adj_zero_point);
        if (!adj_zero_point_const)
            return false;
        auto adj_zero_point_val = adj_zero_point_const->cast_vector<float>();
        bool is_adj_zero_point_close_to_zero = std::all_of(adj_zero_point_val.begin(), adj_zero_point_val.end(),
                                                           [] (float f) -> bool {
                                                               return std::fabs(f) < 1e-4;
                                                           });
        if (!is_adj_zero_point_close_to_zero)
            return false;

        // Verify that folding the zero point into the weights reproduces the
        // original dequantized values exactly.
        auto transformed = std::make_shared<opset8::Subtract>(
            std::make_shared<opset8::Convert>(std::make_shared<opset8::Subtract>(weights, int8_zero_point), convert->get_element_type()),
            adj_zero_point);
        auto diff = std::make_shared<opset8::Subtract>(sub, transformed);
        auto diff_const = ov::get_constant_from_source(diff);
        if (!diff_const)
            return false;
        auto diff_val = diff_const->cast_vector<float>();
        bool is_transformed_and_original_equal = std::all_of(diff_val.begin(), diff_val.end(),
                                                             [] (float f) -> bool {
                                                                 return std::fabs(f) < std::numeric_limits<float>::epsilon();
                                                             });
        if (!is_transformed_and_original_equal)
            return false;

        std::shared_ptr<Node> new_weights = std::make_shared<opset8::Subtract>(weights, int8_zero_point);
        if (auto constant = ov::get_constant_from_source(new_weights))
            new_weights = constant;
        else
            return false;
        new_weights->set_friendly_name(weights->get_friendly_name());
        replace_node(weights, new_weights);

        copy_runtime_info(sub, convert);
        replace_node(sub, convert);
        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(sub_pattern, "ZeroPointOptimizer");
    this->register_matcher(m, callback);
}

View File

@@ -32,8 +32,8 @@ inline uint8_t get_u1(const uint8_t* buf, size_t idx) {
inline void set_u4(uint8_t* buf, size_t idx, uint8_t val) {
const size_t byte_idx = idx / 2;
const uint8_t bit_shift = 4 * (++idx % 2);
buf[byte_idx] &= ~(0xF << bit_shift); // half byte zeroed
buf[byte_idx] |= (val << bit_shift); // set 1's
buf[byte_idx] &= ~(0xF << bit_shift); // half byte zeroed
buf[byte_idx] |= ((val & 0xF) << bit_shift); // set 1's
}
inline uint8_t get_u4(const uint8_t* buf, size_t idx) {
@@ -45,8 +45,8 @@ inline uint8_t get_u4(const uint8_t* buf, size_t idx) {
inline void set_i4(uint8_t* buf, size_t idx, int8_t val) {
const size_t byte_idx = idx / 2;
const uint8_t bit_shift = 4 * (++idx % 2);
buf[byte_idx] &= ~(0xF << bit_shift); // half byte zeroed
buf[byte_idx] |= (val << bit_shift); // set 1's
buf[byte_idx] &= ~(0xF << bit_shift); // half byte zeroed
buf[byte_idx] |= ((val & 0xF) << bit_shift); // set 1's
}
inline int8_t get_i4(const uint8_t* buf, size_t idx) {

View File

@@ -131,12 +131,6 @@ bool evaluate_bound(const Node* node, const HostTensorVector& output_values, boo
NGRAPH_CHECK(node, validate_host_tensor_vector(output_values, 1));
const auto& input = node->input_value(0);
if (const auto& value = is_upper ? input.get_tensor().get_upper_value() : input.get_tensor().get_lower_value()) {
// constants for dynamic values translation
auto input_maximum_value = get_constant_max_of_type(input.get_element_type());
auto output_maximum_value = get_constant_max_of_type(output_values[0]->get_element_type());
if (input_maximum_value == nullptr || output_maximum_value == nullptr)
return false;
OPENVINO_SUPPRESS_DEPRECATED_START
bool status = node->evaluate(output_values, {value});
OPENVINO_SUPPRESS_DEPRECATED_END
@@ -144,6 +138,19 @@ bool evaluate_bound(const Node* node, const HostTensorVector& output_values, boo
if (!status)
return status;
const auto& input_element_type = input.get_element_type();
const auto& output_element_type = output_values[0]->get_element_type();
if ((input_element_type.is_integral() && input_element_type.bitwidth() <= 16) ||
(output_element_type.is_integral() && output_element_type.bitwidth() <= 16)) {
return status;
}
// constants for dynamic values translation
auto input_maximum_value = get_constant_max_of_type(input_element_type);
auto output_maximum_value = get_constant_max_of_type(output_values[0]->get_element_type());
if (input_maximum_value == nullptr || output_maximum_value == nullptr)
return false;
// dynamic values translation
auto input_dynamic_mask = std::make_shared<HostTensor>(element::boolean, input.get_shape());
status =

View File

@@ -35,8 +35,10 @@ bool evaluate_subtract(const HostTensorPtr& arg0,
bool rc = true;
out->set_broadcast(broadcast_spec, arg0, arg1);
switch (arg0->get_element_type()) {
NGRAPH_TYPE_CASE(evaluate_subtract, i8, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, i32, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, i64, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, u8, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, u32, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, u64, arg0, arg1, out, broadcast_spec);
NGRAPH_TYPE_CASE(evaluate_subtract, f16, arg0, arg1, out, broadcast_spec);

View File

@@ -1348,9 +1348,6 @@ shared_ptr<op::Constant> ngraph::get_constant_max_of_type(element::Type_t t) {
NGRAPH_TYPE_TO_MAX_CONST(element::u16);
NGRAPH_TYPE_TO_MAX_CONST(element::u32);
NGRAPH_TYPE_TO_MAX_CONST(element::u64);
case element::undefined:
case element::dynamic:
default:
return nullptr;
}
@@ -1377,9 +1374,6 @@ shared_ptr<op::Constant> ngraph::get_constant_min_of_type(element::Type_t t) {
NGRAPH_TYPE_TO_MIN_CONST(element::u16);
NGRAPH_TYPE_TO_MIN_CONST(element::u32);
NGRAPH_TYPE_TO_MIN_CONST(element::u64);
case element::undefined:
case element::dynamic:
default:
return nullptr;
}

View File

@@ -266,6 +266,8 @@ TEST(constant_folding, constant_unary_binary) {
vector<int> values_g{1, 4};
vector<char> values_h{0, 0, 1, 1};
vector<char> values_i{0, 1};
vector<int8_t> values_j{-3, 5};
vector<uint8_t> values_k{3, 5};
auto a = make_shared<op::Constant>(element::i32, Shape{2, 2}, values_a);
auto b = make_shared<op::Constant>(element::i32, Shape{2, 2}, values_b);
auto c = make_shared<op::Constant>(element::i32, Shape{2, 2}, values_c);
@@ -275,6 +277,8 @@ TEST(constant_folding, constant_unary_binary) {
auto g = make_shared<op::Constant>(element::i32, Shape{2}, values_g);
auto h = make_shared<op::Constant>(element::boolean, Shape{2, 2}, values_h);
auto i = make_shared<op::Constant>(element::boolean, Shape{2}, values_i);
auto j = make_shared<op::Constant>(element::i8, Shape{2}, values_j);
auto k = make_shared<op::Constant>(element::u8, Shape{2}, values_k);
auto doubles = make_shared<op::Constant>(element::f64, Shape{2}, std::vector<double>{4.0, 9.0});
auto add = make_shared<op::v1::Add>(a, b);
@@ -303,6 +307,8 @@ TEST(constant_folding, constant_unary_binary) {
auto logical_or_autob_numpy = make_shared<op::v1::LogicalOr>(h, i, op::AutoBroadcastType::NUMPY);
auto logical_xor_autob_numpy = make_shared<op::Xor>(h, i, op::AutoBroadcastType::NUMPY);
auto doubles_sqrt = make_shared<op::Sqrt>(doubles);
auto sub_int8 = make_shared<op::v1::Subtract>(j, j);
auto sub_uint8 = make_shared<op::v1::Subtract>(k, k);
auto neg_sqrt = make_shared<op::Sqrt>(c);
@@ -331,7 +337,9 @@ TEST(constant_folding, constant_unary_binary) {
less_eq_autob_numpy,
logical_or_autob_numpy,
logical_xor_autob_numpy,
doubles_sqrt},
doubles_sqrt,
sub_int8,
sub_uint8},
ParameterVector{});
auto func_error = make_shared<Function>(NodeVector{neg_sqrt}, ParameterVector{});
@@ -365,6 +373,8 @@ TEST(constant_folding, constant_unary_binary) {
vector<char> logical_or_autob_numpy_expected{0, 1, 1, 1};
vector<char> logical_xor_autob_numpy_expected{0, 1, 1, 0};
vector<double> doubles_sqrt_expected{2.0, 3.0};
vector<int8_t> sub_int8_expected{0, 0};
vector<uint8_t> sub_uint8_expected{0, 0};
ASSERT_EQ(get_result_constant<int>(func, 0), add_expected);
ASSERT_EQ(get_result_constant<int>(func, 1), sub_expected);
@@ -392,13 +402,15 @@ TEST(constant_folding, constant_unary_binary) {
ASSERT_EQ(get_result_constant<char>(func, 23), logical_or_autob_numpy_expected);
ASSERT_EQ(get_result_constant<char>(func, 24), logical_xor_autob_numpy_expected);
ASSERT_EQ(get_result_constant<double>(func, 25), doubles_sqrt_expected);
ASSERT_EQ(get_result_constant<int8_t>(func, 26), sub_int8_expected);
ASSERT_EQ(get_result_constant<uint8_t>(func, 27), sub_uint8_expected);
ASSERT_NO_THROW(pass_manager.run_passes(func_error));
}
template <typename T, typename U>
template <element::Type_t from, element::Type_t to, typename T, typename U>
static void test_const_convert(const vector<T>& values_in, const vector<U>& values_expected) {
auto constant = op::Constant::create(element::from<T>(), Shape{values_in.size()}, values_in);
auto convert = make_shared<op::Convert>(constant, element::from<U>());
auto constant = op::Constant::create(from, Shape{values_in.size()}, values_in);
auto convert = make_shared<op::Convert>(constant, to);
convert->set_friendly_name("test");
auto f = make_shared<Function>(convert, ParameterVector{});
@@ -412,8 +424,8 @@ static void test_const_convert(const vector<T>& values_in, const vector<U>& valu
auto new_const = ov::as_type_ptr<op::Constant>(f->get_results().at(0)->input_value(0).get_node_shared_ptr());
ASSERT_TRUE(new_const);
ASSERT_EQ(new_const->get_friendly_name(), "test");
ASSERT_EQ(new_const->get_output_element_type(0), element::from<U>());
auto values_out = new_const->template get_vector<U>();
ASSERT_EQ(new_const->get_output_element_type(0), to);
auto values_out = new_const->template cast_vector<U>();
ASSERT_EQ(values_expected, values_out);
}
@@ -422,47 +434,57 @@ TEST(constant_folding, const_convert) {
{
vector<float> in{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7};
vector<uint64_t> expected{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7};
test_const_convert(in, expected);
test_const_convert<element::f32, element::u64>(in, expected);
}
{
vector<bool> in{false, true, true, false, false, false, true};
vector<float> expected{0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f};
test_const_convert(in, expected);
test_const_convert<element::boolean, element::f32>(in, expected);
}
{
vector<float> in{1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f};
vector<bool> expected{true, false, true, false, true, false, true};
test_const_convert(in, expected);
test_const_convert<element::f32, element::boolean>(in, expected);
}
{
vector<int64_t> in{1, 2, 3, 4, 5};
vector<double> expected{1.0, 2.0, 3.0, 4.0, 5.0};
test_const_convert(in, expected);
test_const_convert<element::i64, element::f64>(in, expected);
}
{
vector<double> in{1.2, 2.1, 3.3, 4.45, 5.02};
vector<int64_t> expected{1, 2, 3, 4, 5};
test_const_convert(in, expected);
test_const_convert<element::f64, element::i64>(in, expected);
}
{
vector<int8_t> in{7, 0, 1, 2, 3, 4, 5, -1, -2, -8};
vector<float> expected{7, 0, 1, 2, 3, 4, 5, -1, -2, -8};
test_const_convert<element::i4, element::f32>(in, expected);
}
{
vector<float> in{9, 0, 1, 2, 3, 4, 5, -1, -2, -10};
vector<int8_t> expected{-7, 0, 1, 2, 3, 4, 5, -1, -2, 6};
test_const_convert<element::f32, element::i4>(in, expected);
}
{
vector<int8_t> in{-128, -2, 0, 1, 3, 127};
vector<float> expected{-128, -2, 0, 1, 3, 127};
test_const_convert(in, expected);
test_const_convert<element::i8, element::f32>(in, expected);
}
{
vector<uint8_t> in{0, 1, 3, 127, 255};
vector<float> expected{0, 1, 3, 127, 255};
test_const_convert(in, expected);
test_const_convert<element::u8, element::f32>(in, expected);
}
{
vector<float> in{-300, -128, -1, 0, 33, 127, 128};
vector<int8_t> expected{-44, -128, -1, 0, 33, 127, -128};
test_const_convert(in, expected);
test_const_convert<element::f32, element::i8>(in, expected);
}
{
vector<float> in{0, 33, 127, 255, 256};
vector<uint8_t> expected{0, 33, 127, 255, 0};
test_const_convert(in, expected);
test_const_convert<element::f32, element::u8>(in, expected);
}
}

View File

@@ -88,12 +88,12 @@ std::tuple<std::shared_ptr<ngraph::Node>, std::shared_ptr<ngraph::Node>> get_inp
input_low =
std::make_shared<default_opset::Multiply>(y_scale,
std::make_shared<default_opset::Subtract>(output_low, zero_point));
if (auto constant = get_constant_from_source(input_low))
if (auto constant = ov::get_constant_from_source(input_low))
input_low = constant;
input_high =
std::make_shared<default_opset::Multiply>(y_scale,
std::make_shared<default_opset::Subtract>(output_high, zero_point));
if (auto constant = get_constant_from_source(input_high))
if (auto constant = ov::get_constant_from_source(input_high))
input_high = constant;
return std::make_tuple(input_low, input_high);

View File

@@ -0,0 +1,179 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <memory>
#include <ngraph/function.hpp>
#include <ngraph/opsets/opset8.hpp>
#include <compress_quantize_weights.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/utils/utils.hpp>
#include <ngraph/pass/manager.hpp>
#include "common_test_utils/ngraph_test_utils.hpp"
using namespace testing;
using namespace ngraph;
// Parameters for a single CompressQuantizeWeights test case: the FakeQuantize
// configuration and the expected compressed result.
struct CompressQuantizeWeightsParams {
    Shape shape;                          // weights constant shape
    std::vector<float> weights;           // original float weight values
    float in_low;                         // FakeQuantize input_low
    float in_high;                        // FakeQuantize input_high
    float out_low;                        // FakeQuantize output_low
    float out_high;                       // FakeQuantize output_high
    size_t levels;                        // FakeQuantize levels
    element::Type_t expected_type;        // expected low-precision weights type (i4 or i8)
    std::vector<float> expected_weights;  // expected quantized weight values
    float scale_val;                      // expected dequantization scale
    float zero_point_val;                 // expected dequantization zero point
};
// Parameterized fixture: builds a FakeQuantize over constant weights as the
// tested function, registers CompressQuantizeWeights in the manager, and sets
// the expected Constant->Convert->Subtract->Multiply graph as the reference.
class CompressQuantizeWeightsTests
    : public testing::WithParamInterface<CompressQuantizeWeightsParams>,
      public TransformationTestsF {
    void SetUp() override {
        TransformationTestsF::SetUp();
        auto param = GetParam();
        // Tested graph: FakeQuantize with all-constant inputs.
        {
            auto data = opset8::Constant::create(element::f32, param.shape, param.weights);
            auto input_low = opset8::Constant::create(element::f32, Shape{}, {param.in_low});
            auto input_high = opset8::Constant::create(element::f32, Shape{}, {param.in_high});
            auto output_low = opset8::Constant::create(element::f32, Shape{}, {param.out_low});
            auto output_high = opset8::Constant::create(element::f32, Shape{}, {param.out_high});
            auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, param.levels);
            function = std::make_shared<Function>(fq, ParameterVector{});
        }
        manager.register_pass<pass::CompressQuantizeWeights>();
        // Reference graph: quantized constant + dequantization subgraph.
        {
            auto data = opset8::Constant::create(param.expected_type, param.shape, param.expected_weights);
            auto convert = std::make_shared<opset8::Convert>(data, element::f32);
            auto scale = opset8::Constant::create(element::f32, Shape{}, {param.scale_val});
            auto zero_point = opset8::Constant::create(element::f32, Shape{}, {param.zero_point_val});
            auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
            auto mul = std::make_shared<opset8::Multiply>(sub, scale);
            function_ref = std::make_shared<Function>(mul, ParameterVector{});
        }
        // Compare constant values too, not only the graph structure.
        comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
        enable_accuracy_check();
    }
};
// Body intentionally empty: the comparison runs in the fixture's teardown
// (TransformationTestsF machinery).
TEST_P(CompressQuantizeWeightsTests, FusionTest) {
}

// {shape, weights, in_low, in_high, out_low, out_high, levels,
//  expected_type, expected_weights, scale, zero_point}
static std::vector<CompressQuantizeWeightsParams> params = {
    {Shape{2, 3, 1, 1}, {-1, 2, 3, 4, 5, 11}, 0, 10, -1, 5, 3, element::i4, {-1, -1, 0, 0, 0, 1}, 3, -0.666667},
    {Shape{2, 3, 1, 1}, {-1, 2, 3, 4, 5, 11}, 0, 10, -1, 4, 16, element::i4, {-8, -5, -4, -2, 0, 7}, 0.333333, -5},
    {Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11}, 1, 9, -2, 6, 17, element::i8, {-8, -8, -8, -6, -4, -2, 0, 8}, 0.5, -4},
    {Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11}, 1, 9, -2, 6, 256, element::i8, {-128, -128, -128, -96, -64, -32, 0, 127}, 0.0313725, -64.25},
};

INSTANTIATE_TEST_SUITE_P(TransformationTests, CompressQuantizeWeightsTests, ::testing::ValuesIn(params));
// FakeQuantize already followed by a dequantization subgraph: only the
// FQ + Convert-to-i8 part should be constant folded; the existing
// Subtract/Multiply dequantization must be preserved unchanged.
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithDequantizationSubgraph) {
    {
        auto data = opset8::Constant::create(element::f32, Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11});
        auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
        auto input_high = opset8::Constant::create(element::f32, Shape{}, {9});
        auto output_low = opset8::Constant::create(element::f32, Shape{}, {-128});
        auto output_high = opset8::Constant::create(element::f32, Shape{}, {127});
        auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
        auto convert = std::make_shared<opset8::Convert>(fq, element::i8);
        auto second_convert = std::make_shared<opset8::Convert>(convert, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{}, {10.0 / 255});
        auto zero_point = opset8::Constant::create(element::f32, Shape{}, {2 - 255.0 / 10});
        auto sub = std::make_shared<opset8::Subtract>(second_convert, zero_point);
        auto mul = std::make_shared<opset8::Multiply>(sub, scale);
        function = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});

        manager.register_pass<pass::CompressQuantizeWeights>();
    }
    // Reference: quantized i8 constant with the original dequantization kept.
    {
        auto data = opset8::Constant::create(element::i8, Shape{2, 4, 1, 1}, {-128, -128, -128, -96, -64, -32, 0, 127});
        auto convert = std::make_shared<opset8::Convert>(data, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{}, {10.0 / 255});
        auto zero_point = opset8::Constant::create(element::f32, Shape{}, {2 - 255.0 / 10});
        auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
        auto mul = std::make_shared<opset8::Multiply>(sub, scale);
        function_ref = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});
    }
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
// Symmetric ranges produce a (near-)zero zero point, so ZeroPointOptimizer
// is expected to remove the Subtract entirely from the result.
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizer) {
    {
        auto data = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.144816, 0.0858578, 0.110928});
        auto input_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
        auto input_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.380155, 0.33788});
        auto output_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
        auto output_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.380155, 0.33788});
        auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
        function = std::make_shared<Function>(NodeVector{fq}, ParameterVector{});

        manager.register_pass<pass::CompressQuantizeWeights>();
        manager.register_pass<pass::ZeroPointOptimizer>();
    }
    // Reference: Convert->Multiply only, no zero-point Subtract.
    {
        auto data = opset8::Constant::create(element::i8, Shape{3, 1, 1, 1}, {-46, 29, 42});
        auto convert = std::make_shared<opset8::Convert>(data, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.00314577, 0.00299335, 0.00266047});
        auto mul = std::make_shared<opset8::Multiply>(convert, scale);
        function_ref = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});
    }
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
// Asymmetric ranges yield a non-zero, non-foldable zero point:
// ZeroPointOptimizer must keep the Subtract in the dequantization subgraph.
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsWithZeroPointOptimizer) {
    {
        auto data = opset8::Constant::create(element::f32, Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11});
        auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
        auto input_high = opset8::Constant::create(element::f32, Shape{}, {9});
        auto output_low = opset8::Constant::create(element::f32, Shape{}, {-2});
        auto output_high = opset8::Constant::create(element::f32, Shape{}, {6});
        auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
        function = std::make_shared<Function>(NodeVector{fq}, ParameterVector{});

        manager.register_pass<pass::CompressQuantizeWeights>();
        manager.register_pass<pass::ZeroPointOptimizer>();
    }
    // Reference: full Convert->Subtract->Multiply dequantization remains.
    {
        auto data = opset8::Constant::create(element::i8, Shape{2, 4, 1, 1}, {-128, -128, -128, -96, -64, -32, 0, 127});
        auto convert = std::make_shared<opset8::Convert>(data, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{}, {0.0313725});
        auto zero_point = opset8::Constant::create(element::f32, Shape{}, {-64.25});
        auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
        auto mul = std::make_shared<opset8::Multiply>(sub, scale);
        function_ref = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});
    }
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
// FakeQuantize on a non-constant (Parameter) input must not be transformed.
// NOTE(review): no function_ref is set — presumably TransformationTestsF then
// compares against an untouched clone of `function`; confirm with the fixture.
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsNonConstantInput) {
    auto data = std::make_shared<opset8::Parameter>(element::f32, Shape{2, 4, 1, 1});
    auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
    auto input_high = opset8::Constant::create(element::f32, Shape{}, {9});
    auto output_low = opset8::Constant::create(element::f32, Shape{}, {-2});
    auto output_high = opset8::Constant::create(element::f32, Shape{}, {6});
    auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
    function = std::make_shared<Function>(NodeVector{fq}, ParameterVector{data});

    manager.register_pass<pass::CompressQuantizeWeights>();
    manager.register_pass<pass::ZeroPointOptimizer>();

    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}

View File

@@ -24,6 +24,8 @@ def moc_emit_ir(ngraph_function: Model, argv: argparse.Namespace):
apply_user_transformations(ngraph_function, parse_transform(argv.transform))
apply_moc_transformations(ngraph_function)
from openvino.offline_transformations_pybind import compress_quantize_weights_transformation
compress_quantize_weights_transformation(ngraph_function)
if argv.framework == "onnx":
# set OldApi map in IR to be executed via OV API 1.x and for parity with legacy MO