CompressQuantizeWeights - fix zero point calculation (#20541)
The current implementation tries to leverage a branchless approach, but it is not correct when the scale is 0. In that case the zero point can become inf or NaN, and multiplying inf/NaN by 0 does not reset it to 0 (the result is NaN). That causes a further issue: an infinite or NaN zero point cannot be optimized out later. Ticket: CVS-122931 Co-authored-by: Ivan Tikhonov <ivan.tikhonov@intel.com>
This commit is contained in:
parent
ea6922386e
commit
2668f68816
@ -357,7 +357,7 @@ static void compute_scale_and_zero_point_internal(const std::shared_ptr<ov::op::
|
|||||||
float output_high_value) mutable {
|
float output_high_value) mutable {
|
||||||
float output_range = output_high_value - output_low_value;
|
float output_range = output_high_value - output_low_value;
|
||||||
float scale = output_range / input_range;
|
float scale = output_range / input_range;
|
||||||
float zero_point_value = (new_output_low - output_low_value / scale) * (scale != 0);
|
float zero_point_value = (scale != 0) ? (new_output_low - output_low_value / scale) : 0;
|
||||||
zero_point_is_zero =
|
zero_point_is_zero =
|
||||||
zero_point_is_zero && std::fabs(zero_point_value) < std::numeric_limits<float>::epsilon();
|
zero_point_is_zero && std::fabs(zero_point_value) < std::numeric_limits<float>::epsilon();
|
||||||
*zero_point++ = zero_point_value;
|
*zero_point++ = zero_point_value;
|
||||||
|
@ -232,6 +232,30 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminated) {
|
|||||||
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
|
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Verifies that CompressQuantizeWeights handles a per-channel FakeQuantize in
// which one channel has a degenerate (zero-width) output range: output_low ==
// output_high == 0 for channel 1, so its scale is 0. The zero point for that
// channel must be treated as exactly 0 (not inf/NaN from a division by zero),
// which lets the pass eliminate the zero point entirely and emit the compact
// Convert->Multiply pattern.
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedZeroScale) {
    {
        // Original graph: f32 weights quantized by a 256-level per-channel FakeQuantize.
        auto weights = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.144816, 0.0858578, 0.110928});
        auto in_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
        auto in_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.380155, 0.33788});
        // Channel 1 collapses to the single value 0.0 -> zero scale for that channel.
        auto out_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, 0.0, -0.34054});
        auto out_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.0, 0.33788});
        auto fq = std::make_shared<opset8::FakeQuantize>(weights, in_low, in_high, out_low, out_high, 256);
        model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{});

        manager.register_pass<ov::pass::CompressQuantizeWeights>();
    }
    {
        // Expected graph: i8 weights dequantized by Convert + Multiply only —
        // no Subtract, because the zero point was proven to be zero everywhere.
        auto compressed = opset8::Constant::create(element::i8, Shape{3, 1, 1, 1}, {-46, 29, 42});
        auto to_f32 = std::make_shared<opset8::Convert>(compressed, element::f32);
        // Channel 1 keeps a literal 0.0 scale, matching its degenerate range.
        auto dequant_scale = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.00314577, 0.0, 0.00266047});
        auto dequantized = std::make_shared<opset8::Multiply>(to_f32, dequant_scale);
        model_ref = std::make_shared<Model>(NodeVector{dequantized}, ParameterVector{});
    }
    // Compare both the constant payloads and the numerical accuracy of the graphs.
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
}
|
||||||
|
|
||||||
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedFP16) {
|
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedFP16) {
|
||||||
{
|
{
|
||||||
auto data = opset8::Constant::create(element::f16, Shape{3, 1, 1, 1}, {0.2, 1.2, 1.2});
|
auto data = opset8::Constant::create(element::f16, Shape{3, 1, 1, 1}, {0.2, 1.2, 1.2});
|
||||||
|
Loading…
Reference in New Issue
Block a user