CompressQuantizeWeights - fix zero point calculation (#20541)

Current implementation tries to leverage branchless approach, but it's not correct
if scale is 0. In that case - zero point can become inf or nan and multiplication
by 0 doesn't change its value. That causes another issue - infinite or NaN zero point
cannot be optimized out later.

Ticket: CVS-122931

Co-authored-by: Ivan Tikhonov <ivan.tikhonov@intel.com>
This commit is contained in:
Mateusz Tabaka 2023-10-24 10:21:06 +02:00 committed by GitHub
parent ea6922386e
commit 2668f68816
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 1 deletions

View File

@ -357,7 +357,7 @@ static void compute_scale_and_zero_point_internal(const std::shared_ptr<ov::op::
float output_high_value) mutable { float output_high_value) mutable {
float output_range = output_high_value - output_low_value; float output_range = output_high_value - output_low_value;
float scale = output_range / input_range; float scale = output_range / input_range;
float zero_point_value = (new_output_low - output_low_value / scale) * (scale != 0); float zero_point_value = (scale != 0) ? (new_output_low - output_low_value / scale) : 0;
zero_point_is_zero = zero_point_is_zero =
zero_point_is_zero && std::fabs(zero_point_value) < std::numeric_limits<float>::epsilon(); zero_point_is_zero && std::fabs(zero_point_value) < std::numeric_limits<float>::epsilon();
*zero_point++ = zero_point_value; *zero_point++ = zero_point_value;

View File

@ -232,6 +232,30 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminated) {
comparator.enable(FunctionsComparator::CmpValues::ACCURACY); comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
} }
// Regression test for the zero-scale case: channel 1 has output_low == output_high (both 0.0),
// so its derived scale is 0. The transformation must then force the zero point to 0
// (instead of computing inf/NaN via division by a zero scale), which allows the zero
// point to be eliminated, leaving only Convert + Multiply in the resulting graph.
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedZeroScale) {
{
// Original model: a per-channel FakeQuantize (3 channels, 256 levels) over constant weights.
// NOTE(review): the middle channel's output range collapses to [0, 0] — that is the
// zero-scale channel this test exercises.
auto data = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.144816, 0.0858578, 0.110928});
auto input_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
auto input_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.380155, 0.33788});
auto output_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, 0.0, -0.34054});
auto output_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.0, 0.33788});
auto fq = std::make_shared<opset8::FakeQuantize>(data, input_low, input_high, output_low, output_high, 256);
model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{});
manager.register_pass<ov::pass::CompressQuantizeWeights>();
}
{
// Expected model after the pass: i8-compressed weights dequantized by Convert + Multiply
// only — no Subtract node, because the zero point (0 for every channel, including the
// zero-scale one) was eliminated. The zero-scale channel's multiplier is exactly 0.0.
auto data = opset8::Constant::create(element::i8, Shape{3, 1, 1, 1}, {-46, 29, 42});
auto convert = std::make_shared<opset8::Convert>(data, element::f32);
auto scale = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.00314577, 0.0, 0.00266047});
auto mul = std::make_shared<opset8::Multiply>(convert, scale);
model_ref = std::make_shared<Model>(NodeVector{mul}, ParameterVector{});
}
// Compare constant payloads and numerical accuracy, not just graph topology.
comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
}
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedFP16) { TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedFP16) {
{ {
auto data = opset8::Constant::create(element::f16, Shape{3, 1, 1, 1}, {0.2, 1.2, 1.2}); auto data = opset8::Constant::create(element::f16, Shape{3, 1, 1, 1}, {0.2, 1.2, 1.2});