Add CompressQuantizeWeights transformation (#7538)
* Add CompressQuantizeWeights transformation. It's based on model-optimizer/extensions/back/compress_quantized_weights.py
* handle dequantization subgraph after quantization
* fix scale shift calculation
* remove TRANSFORMATIONS_API from CompressQuantizeWeights
* ZeroPointOptimizer
* add CompressQuantizeWeights pass to ApplyMOCTransformations
* add comment
* fix code style
* cleanup
* ambiguous copy_runtime_info
* ambiguous call
* fix case when zero_point is close to zero
* fix cf test case
* move files
* update tests
* add tests for evaluate_subgraph
* Address review comments
* make params static
@@ -38,3 +38,4 @@ from openvino.pyopenvino.offline_transformations_pybind import generate_mapping_
 from openvino.pyopenvino.offline_transformations_pybind import apply_make_stateful_transformation
 from openvino.pyopenvino.offline_transformations_pybind import serialize
 from openvino.pyopenvino.offline_transformations_pybind import compress_model_transformation
+from openvino.pyopenvino.offline_transformations_pybind import compress_quantize_weights_transformation
@@ -6,6 +6,7 @@

 #include <pybind11/stl.h>

+#include <compress_quantize_weights.hpp>
 #include <generate_mapping_file.hpp>
 #include <openvino/pass/make_stateful.hpp>
 #include <openvino/pass/serialize.hpp>
@@ -120,6 +121,16 @@ void regmodule_offline_transformations(py::module m) {
         },
         py::arg("function"));

+    m_offline_transformations.def(
+        "compress_quantize_weights_transformation",
+        [](std::shared_ptr<ov::Model> function) {
+            ov::pass::Manager manager;
+            manager.register_pass<ngraph::pass::CompressQuantizeWeights>();
+            manager.register_pass<ngraph::pass::ZeroPointOptimizer>();
+            manager.run_passes(function);
+        },
+        py::arg("function"));
+
     // todo: remove as serialize as part of passManager api will be merged
     m_offline_transformations.def(
         "serialize",
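The binding above simply wires the two passes into a pass manager, so the same compression can be run from native C++ as well. A minimal sketch, assuming a std::shared_ptr<ov::Model> obtained elsewhere (e.g. from a frontend); compress_weights is an illustrative name, not part of this PR:

#include <memory>

#include <compress_quantize_weights.hpp>
#include <openvino/pass/manager.hpp>

// Registers both passes on a pass manager and runs them on the given model,
// mirroring the Python binding above. `model` is assumed to be already loaded.
void compress_weights(const std::shared_ptr<ov::Model>& model) {
    ov::pass::Manager manager;
    manager.register_pass<ngraph::pass::CompressQuantizeWeights>();
    manager.register_pass<ngraph::pass::ZeroPointOptimizer>();
    manager.run_passes(model);
}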
@@ -0,0 +1,95 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <transformations_visibility.hpp>

#include <ngraph/pass/graph_rewrite.hpp>

namespace ngraph {
namespace pass {

class CompressQuantizeWeights;
class ZeroPointOptimizer;

}  // namespace pass
}  // namespace ngraph

/*
    The goal of the CompressQuantizeWeights transformation is to pre-quantize constant data
    and so minimize runtime calculations on it.
    To achieve this, FakeQuantize is decomposed so that its quantization part is separated from its dequantization part.

    The initial graph (a FakeQuantize where all inputs are Constants):

            |  |  |  |  |
            |  |  |  |  |
            v  v  v  v  v
           +------------+
           |FakeQuantize|
           +------------+
                 |
                 v

    is replaced with:

        +-----------------+
        |    Constant     |
        | (low precision) |
        +-----------------+
                 |
                 v
        +------------------+
        |     Convert      |
        |  (to high prec)  |
        +------------------+
                 |
                 v
    +----------+     +------------+
    |zero point|---->|  Subtract  |
    +----------+     +-----+------+
                           |
                           v
    +---------+      +------------+
    |  scale  |----->|  Multiply  |
    +---------+      +-----+------+
                           |
                           v

    The transformation prepares the quantized constant data for the Low Precision pipeline.
    Packing the constant data this way reduces the IR size (the .bin file size) during offline transformations.
    It also allows the runtime to skip the same calculations and makes loading such sub-graphs into a plugin faster.
*/
class ngraph::pass::CompressQuantizeWeights : public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    CompressQuantizeWeights();
};

/*
    If zero_point == 0, the Subtract node can be eliminated from the following dequantization subgraph:

        +-----------------+
        |    Constant     |
        | (low precision) |
        +-----------------+
                 |
                 v
        +------------------+
        |     Convert      |
        |  (to high prec)  |
        +------------------+
                 |
                 v
    +----------+     +------------+
    |zero point|---->|  Subtract  |
    +----------+     +-----+------+
                           |
                           v
*/
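// Worked illustration (values borrowed from the ZeroPointOptimizer unit test
// added later in this PR): i8 weights {-46, 29, 42} produced from a nearly
// symmetric FakeQuantize get a zero point that folds to ~0, so the Subtract
// is removed and the dequantization collapses to Convert -> Multiply.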
class ngraph::pass::ZeroPointOptimizer : public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    ZeroPointOptimizer();
};

@@ -0,0 +1,235 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/opsets/opset8.hpp>
#include <ngraph/validation_util.hpp>
#include <ngraph/rt_info.hpp>
#include <openvino/pass/constant_folding.hpp>
#include <compress_quantize_weights.hpp>

NGRAPH_RTTI_DEFINITION(ngraph::pass::CompressQuantizeWeights, "CompressQuantizeWeights", 0);

static bool has_dequantization_subgraph(const std::shared_ptr<ngraph::Node>& first_convert) {
    auto first_convert_users = first_convert->get_users();
    const auto second_convert = std::find_if(first_convert_users.begin(), first_convert_users.end(),
        [] (const std::shared_ptr<ngraph::Node>& n) -> bool {
            return ov::is_type<ngraph::opset8::Convert>(n);
        });
    if (second_convert == first_convert_users.end())
        return false;
    auto convert_or_subtract_users = (*second_convert)->get_users();
    const auto subtract = std::find_if(convert_or_subtract_users.begin(), convert_or_subtract_users.end(),
        [] (const std::shared_ptr<ngraph::Node>& n) -> bool {
            return ov::is_type<ngraph::opset8::Subtract>(n);
        });
    if (subtract != convert_or_subtract_users.end()) {
        convert_or_subtract_users = (*subtract)->get_users();
    }
    const auto multiply = std::find_if(convert_or_subtract_users.begin(), convert_or_subtract_users.end(),
        [] (const std::shared_ptr<ngraph::Node>& n) -> bool {
            return ov::is_type<ngraph::opset8::Multiply>(n);
        });
    return multiply != convert_or_subtract_users.end();
}
ngraph::pass::CompressQuantizeWeights::CompressQuantizeWeights() {
    auto weights_pattern = pattern::wrap_type<opset8::Constant>();
    auto input_low_pattern = pattern::wrap_type<opset8::Constant>();
    auto input_high_pattern = pattern::wrap_type<opset8::Constant>();
    auto output_low_pattern = pattern::wrap_type<opset8::Constant>();
    auto output_high_pattern = pattern::wrap_type<opset8::Constant>();
    auto fq_pattern = pattern::wrap_type<opset8::FakeQuantize>({weights_pattern, input_low_pattern, input_high_pattern,
                                                                output_low_pattern, output_high_pattern});

    ngraph::matcher_pass_callback callback = [=](pattern::Matcher& m) {
        auto fq = std::dynamic_pointer_cast<opset8::FakeQuantize>(m.get_match_root());
        if (!fq)
            return false;
        auto levels = fq->get_levels();
        if (levels <= 2 || levels > 256)
            return false;
        auto quantized_type = element::undefined;
        // Currently we support two quantized weight types: i4 and i8
        if (levels <= 16) {
            quantized_type = element::i4;
        } else if (levels <= 256) {
            quantized_type = element::i8;
        }

        const auto& pattern_value_map = m.get_pattern_value_map();
        const auto& input_type = fq->get_element_type();

        // Skip the dequantize part if there is already a dequantization subgraph after FakeQuantize
        auto fq_users = fq->get_users();
        if (fq_users.size() == 1 && has_dequantization_subgraph(fq_users[0])) {
            auto& first_convert = fq_users[0];
            if (auto new_weights = ov::get_constant_from_source(first_convert)) {
                replace_node(first_convert, new_weights);
                // Preserve the dequantization subgraph for LP transformations
                auto weights_users = new_weights->get_users();
                if (weights_users.size() == 1 && ov::is_type<ngraph::opset8::Convert>(weights_users[0])) {
                    ov::pass::disable_constant_folding(weights_users[0]);
                }
                return true;
            } else {
                return false;
            }
        } else {
            /*
               Quantize part

               Prepare a new FakeQuantize that performs weights quantization.
               In this case input_low/high stay the same, but we need a new output_low/high:
                  output_low = -levels / 2
                  output_high = levels - 1 + output_low
               The FakeQuantize result is converted to the low precision type and then constant-folded.
            */
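            // Worked example (the resulting ranges appear in the unit tests below):
            // levels = 256 gives output_low = -(256 / 2) = -128 and
            // output_high = 256 - 1 + (-128) = 127, i.e. the full i8 range;
            // levels = 16 gives [-8, 7], the i4 range.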
            std::shared_ptr<Node> new_input_low;
            auto new_output_low = op::Constant::create(input_type, Shape{}, {-static_cast<float>(levels / 2)});
            auto new_output_high = std::make_shared<opset8::Add>(new_output_low, op::Constant::create(input_type, Shape{}, {levels - 1}));
            const auto& weights = pattern_value_map.at(weights_pattern);
            const auto& input_low = pattern_value_map.at(input_low_pattern);
            const auto& input_high = pattern_value_map.at(input_high_pattern);
            auto quantize = fq->clone_with_new_inputs({weights, input_low, input_high,
                                                       new_output_low, new_output_high});
            // Convert quantized weights to the low precision type
            std::shared_ptr<Node> new_weights = std::make_shared<opset8::Convert>(quantize, quantized_type);
            // Constant fold quantized weights
            if (auto constant = ov::get_constant_from_source(new_weights)) {
                new_weights = constant;
            } else {
                return false;
            }
            new_weights->set_friendly_name(weights.get_node()->get_friendly_name());

            /*
               The dequantize part is a Convert (from low to high precision) -> Subtract -> Multiply subgraph:

                  +-------------------------+
                  |         Convert         |
                  | (from low to high prec) |
                  +-------------------------+
                               |
                               v
                  +----------+     +------------+
                  |zero point|---->|  Subtract  |
                  +----------+     +-----+------+
                                         |
                                         v
                  +---------+      +------------+
                  |  scale  |----->|  Multiply  |
                  +---------+      +-----+------+
                                         |
                                         v

               where:
                  scale = (output_high - output_low) / (new_output_high - new_output_low)
                  zero_point = new_output_low - output_low / scale
            */
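            // Worked example (matches the last parametrized test case below):
            // with the output range [-2, 6] re-quantized to [-128, 127] at levels = 256:
            //    scale      = (6 - (-2)) / (127 - (-128)) = 8 / 255  ~= 0.0313725
            //    zero_point = -128 - (-2) / (8 / 255)     = -128 + 63.75 = -64.25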
            const auto& output_low = pattern_value_map.at(output_low_pattern);
            const auto& output_high = pattern_value_map.at(output_high_pattern);
            auto output_range = std::make_shared<opset8::Subtract>(output_high, output_low);
            auto input_range = std::make_shared<opset8::Subtract>(new_output_high, new_output_low);
            std::shared_ptr<Node> scale = std::make_shared<opset8::Divide>(output_range, input_range);
            auto descaled_output_low = std::make_shared<opset8::Divide>(output_low, scale);
            std::shared_ptr<Node> shift = std::make_shared<opset8::Subtract>(new_output_low, descaled_output_low);
            if (auto constant = ov::get_constant_from_source(scale))
                scale = constant;
            auto zero = op::Constant::create(input_type, Shape{}, {0});
            auto scale_eq_zero = std::make_shared<opset8::Equal>(scale, zero);
            // shift equals input_low - output_low / scale;
            // for positions where scale == 0, we put zero as the shift
            std::shared_ptr<Node> zero_point = std::make_shared<opset8::Select>(scale_eq_zero, zero, shift);
            if (auto constant = ov::get_constant_from_source(zero_point))
                zero_point = constant;
            if (auto constant = ov::get_constant_from_source(scale))
                scale = constant;
            auto convert_to_high_prec = std::make_shared<opset8::Convert>(new_weights, input_type);
            auto sub = register_new_node<opset8::Subtract>(convert_to_high_prec, zero_point);
            auto mul = register_new_node<opset8::Multiply>(sub, scale);
            mul->set_friendly_name(fq->get_friendly_name());
            copy_runtime_info(fq, {convert_to_high_prec, sub, mul});
            ov::pass::disable_constant_folding(convert_to_high_prec);
            replace_node(fq, mul);
        }
        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(fq_pattern, "CompressQuantizeWeights");
    this->register_matcher(m, callback);
}

NGRAPH_RTTI_DEFINITION(ngraph::pass::ZeroPointOptimizer, "ZeroPointOptimizer", 0);

ngraph::pass::ZeroPointOptimizer::ZeroPointOptimizer() {
    auto weights_pattern = pattern::wrap_type<opset8::Constant>();
    auto zero_point_pattern = pattern::wrap_type<opset8::Constant>();
    auto convert_pattern = pattern::wrap_type<opset8::Convert>({weights_pattern});
    auto sub_pattern = pattern::wrap_type<opset8::Subtract>({convert_pattern, zero_point_pattern});

    ngraph::matcher_pass_callback callback = [=](pattern::Matcher& m) {
        const auto& pattern_value_map = m.get_pattern_value_map();
        auto convert = pattern_value_map.at(convert_pattern).get_node_shared_ptr();
        auto sub = pattern_value_map.at(sub_pattern).get_node_shared_ptr();
        auto weights = std::dynamic_pointer_cast<opset8::Constant>(pattern_value_map.at(weights_pattern).get_node_shared_ptr());
        if (!weights || weights->get_element_type() != element::i8)
            return false;
        auto zero_point = std::dynamic_pointer_cast<opset8::Constant>(pattern_value_map.at(zero_point_pattern).get_node_shared_ptr());
        if (!zero_point)
            return false;

        auto zp_value = zero_point->cast_vector<float>();
        if (std::all_of(zp_value.begin(), zp_value.end(), [] (float f) -> bool { return std::fabs(f) <= std::numeric_limits<float>::epsilon(); })) {
            copy_runtime_info(sub, convert);
            replace_node(sub, convert);
        }

        auto int8_zero_point = std::make_shared<opset8::Convert>(
            std::make_shared<opset8::Round>(zero_point, opset8::Round::RoundMode::HALF_TO_EVEN),
            weights->get_element_type());
        auto adj_zero_point = std::make_shared<opset8::Subtract>(zero_point, std::make_shared<opset8::Convert>(int8_zero_point, convert->get_element_type()));

        auto adj_zero_point_const = ov::get_constant_from_source(adj_zero_point);
        if (!adj_zero_point_const)
            return false;
        auto adj_zero_point_val = adj_zero_point_const->cast_vector<float>();
        bool is_adj_zero_point_close_to_zero = std::all_of(adj_zero_point_val.begin(), adj_zero_point_val.end(),
            [] (float f) -> bool {
                return std::fabs(f) < 1e-4;
            });
        if (!is_adj_zero_point_close_to_zero)
            return false;

        auto transformed = std::make_shared<opset8::Subtract>(
            std::make_shared<opset8::Convert>(std::make_shared<opset8::Subtract>(weights, int8_zero_point), convert->get_element_type()),
            adj_zero_point);
        auto diff = std::make_shared<opset8::Subtract>(sub, transformed);
        auto diff_const = ov::get_constant_from_source(diff);
        if (!diff_const)
            return false;
        auto diff_val = diff_const->cast_vector<float>();
        bool is_transformed_and_original_equal = std::all_of(diff_val.begin(), diff_val.end(),
            [] (float f) -> bool {
                return std::fabs(f) < std::numeric_limits<float>::epsilon();
            });
        if (!is_transformed_and_original_equal)
            return false;

        std::shared_ptr<Node> new_weights = std::make_shared<opset8::Subtract>(weights, int8_zero_point);
        if (auto constant = ov::get_constant_from_source(new_weights))
            new_weights = constant;
        else
            return false;
        new_weights->set_friendly_name(weights->get_friendly_name());
        replace_node(weights, new_weights);

        copy_runtime_info(sub, convert);
        replace_node(sub, convert);
        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(sub_pattern, "ZeroPointOptimizer");
    this->register_matcher(m, callback);
}
@@ -32,8 +32,8 @@ inline uint8_t get_u1(const uint8_t* buf, size_t idx) {
 inline void set_u4(uint8_t* buf, size_t idx, uint8_t val) {
     const size_t byte_idx = idx / 2;
     const uint8_t bit_shift = 4 * (++idx % 2);
-    buf[byte_idx] &= ~(0xF << bit_shift);         // half byte zeroed
-    buf[byte_idx] |= (val << bit_shift);          // set 1's
+    buf[byte_idx] &= ~(0xF << bit_shift);         // half byte zeroed
+    buf[byte_idx] |= ((val & 0xF) << bit_shift);  // set 1's
 }

 inline uint8_t get_u4(const uint8_t* buf, size_t idx) {

@@ -45,8 +45,8 @@ inline uint8_t get_u4(const uint8_t* buf, size_t idx) {
 inline void set_i4(uint8_t* buf, size_t idx, int8_t val) {
     const size_t byte_idx = idx / 2;
     const uint8_t bit_shift = 4 * (++idx % 2);
-    buf[byte_idx] &= ~(0xF << bit_shift);         // half byte zeroed
-    buf[byte_idx] |= (val << bit_shift);          // set 1's
+    buf[byte_idx] &= ~(0xF << bit_shift);         // half byte zeroed
+    buf[byte_idx] |= ((val & 0xF) << bit_shift);  // set 1's
 }

 inline int8_t get_i4(const uint8_t* buf, size_t idx) {
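The masking fix matters because val is promoted to int before the shift: a negative int8_t written to the low nibble would otherwise smear its sign bits over the neighbouring half-byte. A small self-contained sketch of the fixed packing logic (pack_i4 is an illustrative name, not the repository helper):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Packs one signed 4-bit value into a byte buffer, two values per byte,
// mirroring the fixed set_i4 logic above.
static void pack_i4(uint8_t* buf, size_t idx, int8_t val) {
    const size_t byte_idx = idx / 2;
    const uint8_t bit_shift = 4 * (++idx % 2);
    buf[byte_idx] &= ~(0xF << bit_shift);         // clear the target nibble
    buf[byte_idx] |= ((val & 0xF) << bit_shift);  // store only the low 4 bits
}

int main() {
    uint8_t buf[1] = {0};
    pack_i4(buf, 0, 3);   // high nibble
    pack_i4(buf, 1, -1);  // -1 is 0xFF as an int; without the & 0xF mask this
                          // OR would also overwrite the high nibble
    std::printf("0x%02X\n", buf[0]);  // prints 0x3F: both nibbles intact
    return 0;
}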
@@ -131,12 +131,6 @@ bool evaluate_bound(const Node* node, const HostTensorVector& output_values, boo
     NGRAPH_CHECK(node, validate_host_tensor_vector(output_values, 1));
     const auto& input = node->input_value(0);
     if (const auto& value = is_upper ? input.get_tensor().get_upper_value() : input.get_tensor().get_lower_value()) {
-        // constants for dynamic values translation
-        auto input_maximum_value = get_constant_max_of_type(input.get_element_type());
-        auto output_maximum_value = get_constant_max_of_type(output_values[0]->get_element_type());
-        if (input_maximum_value == nullptr || output_maximum_value == nullptr)
-            return false;
-
         OPENVINO_SUPPRESS_DEPRECATED_START
         bool status = node->evaluate(output_values, {value});
         OPENVINO_SUPPRESS_DEPRECATED_END

@@ -144,6 +138,19 @@ bool evaluate_bound(const Node* node, const HostTensorVector& output_values, boo
         if (!status)
             return status;

+        const auto& input_element_type = input.get_element_type();
+        const auto& output_element_type = output_values[0]->get_element_type();
+        if ((input_element_type.is_integral() && input_element_type.bitwidth() <= 16) ||
+            (output_element_type.is_integral() && output_element_type.bitwidth() <= 16)) {
+            return status;
+        }
+
+        // constants for dynamic values translation
+        auto input_maximum_value = get_constant_max_of_type(input_element_type);
+        auto output_maximum_value = get_constant_max_of_type(output_values[0]->get_element_type());
+        if (input_maximum_value == nullptr || output_maximum_value == nullptr)
+            return false;
+
         // dynamic values translation
         auto input_dynamic_mask = std::make_shared<HostTensor>(element::boolean, input.get_shape());
         status =

@@ -35,8 +35,10 @@ bool evaluate_subtract(const HostTensorPtr& arg0,
     bool rc = true;
     out->set_broadcast(broadcast_spec, arg0, arg1);
     switch (arg0->get_element_type()) {
+        NGRAPH_TYPE_CASE(evaluate_subtract, i8, arg0, arg1, out, broadcast_spec);
         NGRAPH_TYPE_CASE(evaluate_subtract, i32, arg0, arg1, out, broadcast_spec);
         NGRAPH_TYPE_CASE(evaluate_subtract, i64, arg0, arg1, out, broadcast_spec);
+        NGRAPH_TYPE_CASE(evaluate_subtract, u8, arg0, arg1, out, broadcast_spec);
         NGRAPH_TYPE_CASE(evaluate_subtract, u32, arg0, arg1, out, broadcast_spec);
         NGRAPH_TYPE_CASE(evaluate_subtract, u64, arg0, arg1, out, broadcast_spec);
         NGRAPH_TYPE_CASE(evaluate_subtract, f16, arg0, arg1, out, broadcast_spec);

@@ -1348,9 +1348,6 @@ shared_ptr<op::Constant> ngraph::get_constant_max_of_type(element::Type_t t) {
     NGRAPH_TYPE_TO_MAX_CONST(element::u16);
     NGRAPH_TYPE_TO_MAX_CONST(element::u32);
     NGRAPH_TYPE_TO_MAX_CONST(element::u64);

     case element::undefined:
     case element::dynamic:
     default:
         return nullptr;
     }

@@ -1377,9 +1374,6 @@ shared_ptr<op::Constant> ngraph::get_constant_min_of_type(element::Type_t t) {
     NGRAPH_TYPE_TO_MIN_CONST(element::u16);
     NGRAPH_TYPE_TO_MIN_CONST(element::u32);
     NGRAPH_TYPE_TO_MIN_CONST(element::u64);

     case element::undefined:
     case element::dynamic:
     default:
         return nullptr;
     }
@@ -266,6 +266,8 @@ TEST(constant_folding, constant_unary_binary) {
     vector<int> values_g{1, 4};
     vector<char> values_h{0, 0, 1, 1};
     vector<char> values_i{0, 1};
+    vector<int8_t> values_j{-3, 5};
+    vector<uint8_t> values_k{3, 5};
     auto a = make_shared<op::Constant>(element::i32, Shape{2, 2}, values_a);
     auto b = make_shared<op::Constant>(element::i32, Shape{2, 2}, values_b);
     auto c = make_shared<op::Constant>(element::i32, Shape{2, 2}, values_c);

@@ -275,6 +277,8 @@ TEST(constant_folding, constant_unary_binary) {
     auto g = make_shared<op::Constant>(element::i32, Shape{2}, values_g);
     auto h = make_shared<op::Constant>(element::boolean, Shape{2, 2}, values_h);
     auto i = make_shared<op::Constant>(element::boolean, Shape{2}, values_i);
+    auto j = make_shared<op::Constant>(element::i8, Shape{2}, values_j);
+    auto k = make_shared<op::Constant>(element::u8, Shape{2}, values_k);
     auto doubles = make_shared<op::Constant>(element::f64, Shape{2}, std::vector<double>{4.0, 9.0});

     auto add = make_shared<op::v1::Add>(a, b);

@@ -303,6 +307,8 @@ TEST(constant_folding, constant_unary_binary) {
     auto logical_or_autob_numpy = make_shared<op::v1::LogicalOr>(h, i, op::AutoBroadcastType::NUMPY);
     auto logical_xor_autob_numpy = make_shared<op::Xor>(h, i, op::AutoBroadcastType::NUMPY);
     auto doubles_sqrt = make_shared<op::Sqrt>(doubles);
+    auto sub_int8 = make_shared<op::v1::Subtract>(j, j);
+    auto sub_uint8 = make_shared<op::v1::Subtract>(k, k);

     auto neg_sqrt = make_shared<op::Sqrt>(c);

@@ -331,7 +337,9 @@ TEST(constant_folding, constant_unary_binary) {
                                        less_eq_autob_numpy,
                                        logical_or_autob_numpy,
                                        logical_xor_autob_numpy,
-                                       doubles_sqrt},
+                                       doubles_sqrt,
+                                       sub_int8,
+                                       sub_uint8},
                             ParameterVector{});
     auto func_error = make_shared<Function>(NodeVector{neg_sqrt}, ParameterVector{});

@@ -365,6 +373,8 @@ TEST(constant_folding, constant_unary_binary) {
     vector<char> logical_or_autob_numpy_expected{0, 1, 1, 1};
     vector<char> logical_xor_autob_numpy_expected{0, 1, 1, 0};
     vector<double> doubles_sqrt_expected{2.0, 3.0};
+    vector<int8_t> sub_int8_expected{0, 0};
+    vector<uint8_t> sub_uint8_expected{0, 0};

     ASSERT_EQ(get_result_constant<int>(func, 0), add_expected);
     ASSERT_EQ(get_result_constant<int>(func, 1), sub_expected);

@@ -392,13 +402,15 @@ TEST(constant_folding, constant_unary_binary) {
     ASSERT_EQ(get_result_constant<char>(func, 23), logical_or_autob_numpy_expected);
     ASSERT_EQ(get_result_constant<char>(func, 24), logical_xor_autob_numpy_expected);
     ASSERT_EQ(get_result_constant<double>(func, 25), doubles_sqrt_expected);
+    ASSERT_EQ(get_result_constant<int8_t>(func, 26), sub_int8_expected);
+    ASSERT_EQ(get_result_constant<uint8_t>(func, 27), sub_uint8_expected);
     ASSERT_NO_THROW(pass_manager.run_passes(func_error));
 }

-template <typename T, typename U>
+template <element::Type_t from, element::Type_t to, typename T, typename U>
 static void test_const_convert(const vector<T>& values_in, const vector<U>& values_expected) {
-    auto constant = op::Constant::create(element::from<T>(), Shape{values_in.size()}, values_in);
-    auto convert = make_shared<op::Convert>(constant, element::from<U>());
+    auto constant = op::Constant::create(from, Shape{values_in.size()}, values_in);
+    auto convert = make_shared<op::Convert>(constant, to);
     convert->set_friendly_name("test");
     auto f = make_shared<Function>(convert, ParameterVector{});

@@ -412,8 +424,8 @@ static void test_const_convert(const vector<T>& values_in, const vector<U>& valu
     auto new_const = ov::as_type_ptr<op::Constant>(f->get_results().at(0)->input_value(0).get_node_shared_ptr());
     ASSERT_TRUE(new_const);
     ASSERT_EQ(new_const->get_friendly_name(), "test");
-    ASSERT_EQ(new_const->get_output_element_type(0), element::from<U>());
-    auto values_out = new_const->template get_vector<U>();
+    ASSERT_EQ(new_const->get_output_element_type(0), to);
+    auto values_out = new_const->template cast_vector<U>();

     ASSERT_EQ(values_expected, values_out);
 }
@@ -422,47 +434,57 @@ TEST(constant_folding, const_convert) {
     {
         vector<float> in{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7};
         vector<uint64_t> expected{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7};
-        test_const_convert(in, expected);
+        test_const_convert<element::f32, element::u64>(in, expected);
     }
     {
         vector<bool> in{false, true, true, false, false, false, true};
         vector<float> expected{0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f};
-        test_const_convert(in, expected);
+        test_const_convert<element::boolean, element::f32>(in, expected);
     }
     {
         vector<float> in{1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f};
         vector<bool> expected{true, false, true, false, true, false, true};
-        test_const_convert(in, expected);
+        test_const_convert<element::f32, element::boolean>(in, expected);
     }
     {
         vector<int64_t> in{1, 2, 3, 4, 5};
         vector<double> expected{1.0, 2.0, 3.0, 4.0, 5.0};
-        test_const_convert(in, expected);
+        test_const_convert<element::i64, element::f64>(in, expected);
     }
     {
         vector<double> in{1.2, 2.1, 3.3, 4.45, 5.02};
         vector<int64_t> expected{1, 2, 3, 4, 5};
-        test_const_convert(in, expected);
+        test_const_convert<element::f64, element::i64>(in, expected);
     }
+    {
+        vector<int8_t> in{7, 0, 1, 2, 3, 4, 5, -1, -2, -8};
+        vector<float> expected{7, 0, 1, 2, 3, 4, 5, -1, -2, -8};
+        test_const_convert<element::i4, element::f32>(in, expected);
+    }
+    {
+        vector<float> in{9, 0, 1, 2, 3, 4, 5, -1, -2, -10};
+        vector<int8_t> expected{-7, 0, 1, 2, 3, 4, 5, -1, -2, 6};
+        test_const_convert<element::f32, element::i4>(in, expected);
+    }
     {
         vector<int8_t> in{-128, -2, 0, 1, 3, 127};
         vector<float> expected{-128, -2, 0, 1, 3, 127};
-        test_const_convert(in, expected);
+        test_const_convert<element::i8, element::f32>(in, expected);
     }
     {
         vector<uint8_t> in{0, 1, 3, 127, 255};
         vector<float> expected{0, 1, 3, 127, 255};
-        test_const_convert(in, expected);
+        test_const_convert<element::u8, element::f32>(in, expected);
     }
     {
         vector<float> in{-300, -128, -1, 0, 33, 127, 128};
         vector<int8_t> expected{-44, -128, -1, 0, 33, 127, -128};
-        test_const_convert(in, expected);
+        test_const_convert<element::f32, element::i8>(in, expected);
     }
     {
         vector<float> in{0, 33, 127, 255, 256};
         vector<uint8_t> expected{0, 33, 127, 255, 0};
-        test_const_convert(in, expected);
+        test_const_convert<element::f32, element::u8>(in, expected);
     }
 }
@@ -88,12 +88,12 @@ std::tuple<std::shared_ptr<ngraph::Node>, std::shared_ptr<ngraph::Node>> get_inp
     input_low =
         std::make_shared<default_opset::Multiply>(y_scale,
                                                   std::make_shared<default_opset::Subtract>(output_low, zero_point));
-    if (auto constant = get_constant_from_source(input_low))
+    if (auto constant = ov::get_constant_from_source(input_low))
         input_low = constant;
     input_high =
         std::make_shared<default_opset::Multiply>(y_scale,
                                                   std::make_shared<default_opset::Subtract>(output_high, zero_point));
-    if (auto constant = get_constant_from_source(input_high))
+    if (auto constant = ov::get_constant_from_source(input_high))
         input_high = constant;

     return std::make_tuple(input_low, input_high);
@@ -0,0 +1,179 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <gtest/gtest.h>

#include <memory>

#include <ngraph/function.hpp>
#include <ngraph/opsets/opset8.hpp>
#include <compress_quantize_weights.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/utils/utils.hpp>
#include <ngraph/pass/manager.hpp>

#include "common_test_utils/ngraph_test_utils.hpp"

using namespace testing;
using namespace ngraph;
struct CompressQuantizeWeightsParams {
    Shape shape;
    std::vector<float> weights;
    float in_low;
    float in_high;
    float out_low;
    float out_high;
    size_t levels;
    element::Type_t expected_type;
    std::vector<float> expected_weights;
    float scale_val;
    float zero_point_val;
};
class CompressQuantizeWeightsTests
    : public testing::WithParamInterface<CompressQuantizeWeightsParams>,
      public TransformationTestsF {
    void SetUp() override {
        TransformationTestsF::SetUp();
        auto param = GetParam();
        {
            auto data = opset8::Constant::create(element::f32, param.shape, param.weights);
            auto input_low = opset8::Constant::create(element::f32, Shape{}, {param.in_low});
            auto input_high = opset8::Constant::create(element::f32, Shape{}, {param.in_high});
            auto output_low = opset8::Constant::create(element::f32, Shape{}, {param.out_low});
            auto output_high = opset8::Constant::create(element::f32, Shape{}, {param.out_high});
            auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, param.levels);
            function = std::make_shared<Function>(fq, ParameterVector{});
        }

        manager.register_pass<pass::CompressQuantizeWeights>();

        {
            auto data = opset8::Constant::create(param.expected_type, param.shape, param.expected_weights);
            auto convert = std::make_shared<opset8::Convert>(data, element::f32);
            auto scale = opset8::Constant::create(element::f32, Shape{}, {param.scale_val});
            auto zero_point = opset8::Constant::create(element::f32, Shape{}, {param.zero_point_val});
            auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
            auto mul = std::make_shared<opset8::Multiply>(sub, scale);
            function_ref = std::make_shared<Function>(mul, ParameterVector{});
        }
        comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
        enable_accuracy_check();
    }
};
TEST_P(CompressQuantizeWeightsTests, FusionTest) {
}

static std::vector<CompressQuantizeWeightsParams> params = {
    {Shape{2, 3, 1, 1}, {-1, 2, 3, 4, 5, 11}, 0, 10, -1, 5, 3, element::i4, {-1, -1, 0, 0, 0, 1}, 3, -0.666667},
    {Shape{2, 3, 1, 1}, {-1, 2, 3, 4, 5, 11}, 0, 10, -1, 4, 16, element::i4, {-8, -5, -4, -2, 0, 7}, 0.333333, -5},
    {Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11}, 1, 9, -2, 6, 17, element::i8, {-8, -8, -8, -6, -4, -2, 0, 8}, 0.5, -4},
    {Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11}, 1, 9, -2, 6, 256, element::i8, {-128, -128, -128, -96, -64, -32, 0, 127}, 0.0313725, -64.25},
};

INSTANTIATE_TEST_SUITE_P(TransformationTests, CompressQuantizeWeightsTests, ::testing::ValuesIn(params));
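// Sanity check for the second row above: levels = 16 maps to the i4 range
// [-8, 7], so scale = (4 - (-1)) / (7 - (-8)) = 5 / 15 = 0.333333 and
// zero_point = -8 - (-1) / (1.0 / 3) = -8 + 3 = -5, matching the expected
// scale_val and zero_point_val.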
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithDequantizationSubgraph) {
    {
        auto data = opset8::Constant::create(element::f32, Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11});
        auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
        auto input_high = opset8::Constant::create(element::f32, Shape{}, {9});
        auto output_low = opset8::Constant::create(element::f32, Shape{}, {-128});
        auto output_high = opset8::Constant::create(element::f32, Shape{}, {127});
        auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
        auto convert = std::make_shared<opset8::Convert>(fq, element::i8);
        auto second_convert = std::make_shared<opset8::Convert>(convert, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{}, {10.0 / 255});
        auto zero_point = opset8::Constant::create(element::f32, Shape{}, {2 - 255.0 / 10});
        auto sub = std::make_shared<opset8::Subtract>(second_convert, zero_point);
        auto mul = std::make_shared<opset8::Multiply>(sub, scale);

        function = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});

        manager.register_pass<pass::CompressQuantizeWeights>();
    }
    {
        auto data = opset8::Constant::create(element::i8, Shape{2, 4, 1, 1}, {-128, -128, -128, -96, -64, -32, 0, 127});
        auto convert = std::make_shared<opset8::Convert>(data, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{}, {10.0 / 255});
        auto zero_point = opset8::Constant::create(element::f32, Shape{}, {2 - 255.0 / 10});
        auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
        auto mul = std::make_shared<opset8::Multiply>(sub, scale);
        function_ref = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});
    }
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizer) {
    {
        auto data = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.144816, 0.0858578, 0.110928});
        auto input_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
        auto input_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.380155, 0.33788});
        auto output_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
        auto output_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.380155, 0.33788});
        auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
        function = std::make_shared<Function>(NodeVector{fq}, ParameterVector{});

        manager.register_pass<pass::CompressQuantizeWeights>();
        manager.register_pass<pass::ZeroPointOptimizer>();
    }

    {
        auto data = opset8::Constant::create(element::i8, Shape{3, 1, 1, 1}, {-46, 29, 42});
        auto convert = std::make_shared<opset8::Convert>(data, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.00314577, 0.00299335, 0.00266047});
        auto mul = std::make_shared<opset8::Multiply>(convert, scale);
        function_ref = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});
    }
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsWithZeroPointOptimizer) {
    {
        auto data = opset8::Constant::create(element::f32, Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11});
        auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
        auto input_high = opset8::Constant::create(element::f32, Shape{}, {9});
        auto output_low = opset8::Constant::create(element::f32, Shape{}, {-2});
        auto output_high = opset8::Constant::create(element::f32, Shape{}, {6});
        auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
        function = std::make_shared<Function>(NodeVector{fq}, ParameterVector{});

        manager.register_pass<pass::CompressQuantizeWeights>();
        manager.register_pass<pass::ZeroPointOptimizer>();
    }
    {
        auto data = opset8::Constant::create(element::i8, Shape{2, 4, 1, 1}, {-128, -128, -128, -96, -64, -32, 0, 127});
        auto convert = std::make_shared<opset8::Convert>(data, element::f32);
        auto scale = opset8::Constant::create(element::f32, Shape{}, {0.0313725});
        auto zero_point = opset8::Constant::create(element::f32, Shape{}, {-64.25});
        auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
        auto mul = std::make_shared<opset8::Multiply>(sub, scale);
        function_ref = std::make_shared<Function>(NodeVector{mul}, ParameterVector{});
    }
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsNonConstantInput) {
    auto data = std::make_shared<opset8::Parameter>(element::f32, Shape{2, 4, 1, 1});
    auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
    auto input_high = opset8::Constant::create(element::f32, Shape{}, {9});
    auto output_low = opset8::Constant::create(element::f32, Shape{}, {-2});
    auto output_high = opset8::Constant::create(element::f32, Shape{}, {6});
    auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
    function = std::make_shared<Function>(NodeVector{fq}, ParameterVector{data});

    manager.register_pass<pass::CompressQuantizeWeights>();
    manager.register_pass<pass::ZeroPointOptimizer>();

    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    enable_accuracy_check();
}
@@ -24,6 +24,8 @@ def moc_emit_ir(ngraph_function: Model, argv: argparse.Namespace):

     apply_user_transformations(ngraph_function, parse_transform(argv.transform))
     apply_moc_transformations(ngraph_function)
+    from openvino.offline_transformations_pybind import compress_quantize_weights_transformation
+    compress_quantize_weights_transformation(ngraph_function)

     if argv.framework == "onnx":
         # set OldApi map in IR to be executed via OV API 1.x and for parity with legacy MO