CompressQuantizeWeights - fix zero point calculation (#20541)
The current implementation tries to leverage a branchless approach, but it is not correct when the scale is 0. In that case the zero point can become inf or NaN, and multiplying inf/NaN by 0 does not reset it to 0 (the result is NaN). That causes a further issue: an infinite or NaN zero point cannot be optimized out later. Ticket: CVS-122931 Co-authored-by: Ivan Tikhonov <ivan.tikhonov@intel.com>
This commit is contained in:
parent
ea6922386e
commit
2668f68816
@ -357,7 +357,7 @@ static void compute_scale_and_zero_point_internal(const std::shared_ptr<ov::op::
|
|||||||
float output_high_value) mutable {
|
float output_high_value) mutable {
|
||||||
float output_range = output_high_value - output_low_value;
|
float output_range = output_high_value - output_low_value;
|
||||||
float scale = output_range / input_range;
|
float scale = output_range / input_range;
|
||||||
float zero_point_value = (new_output_low - output_low_value / scale) * (scale != 0);
|
float zero_point_value = (scale != 0) ? (new_output_low - output_low_value / scale) : 0;
|
||||||
zero_point_is_zero =
|
zero_point_is_zero =
|
||||||
zero_point_is_zero && std::fabs(zero_point_value) < std::numeric_limits<float>::epsilon();
|
zero_point_is_zero && std::fabs(zero_point_value) < std::numeric_limits<float>::epsilon();
|
||||||
*zero_point++ = zero_point_value;
|
*zero_point++ = zero_point_value;
|
||||||
|
@ -232,6 +232,30 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminated) {
|
|||||||
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
|
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Verifies that CompressQuantizeWeights handles a per-channel FakeQuantize in
// which one channel has a degenerate (zero-width) output range: output_low ==
// output_high == 0 for channel 1, so its scale is 0. The zero point for that
// channel must be treated as exactly 0 (not inf/NaN from a division by zero),
// which lets the pass eliminate the zero point entirely and emit the compact
// Convert->Multiply pattern.
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedZeroScale) {
    {
        // Original graph: f32 weights quantized by a 256-level per-channel FakeQuantize.
        auto weights = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.144816, 0.0858578, 0.110928});
        auto in_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
        auto in_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.380155, 0.33788});
        // Channel 1 collapses to the single value 0.0 -> zero scale for that channel.
        auto out_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, 0.0, -0.34054});
        auto out_high = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.399513, 0.0, 0.33788});
        auto fq = std::make_shared<opset8::FakeQuantize>(weights, in_low, in_high, out_low, out_high, 256);
        model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{});

        manager.register_pass<ov::pass::CompressQuantizeWeights>();
    }
    {
        // Expected graph: i8 weights dequantized by Convert + Multiply only —
        // no Subtract, because the zero point was proven to be zero everywhere.
        auto compressed = opset8::Constant::create(element::i8, Shape{3, 1, 1, 1}, {-46, 29, 42});
        auto to_f32 = std::make_shared<opset8::Convert>(compressed, element::f32);
        // Channel 1 keeps a literal 0.0 scale, matching its degenerate range.
        auto dequant_scale = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {0.00314577, 0.0, 0.00266047});
        auto dequantized = std::make_shared<opset8::Multiply>(to_f32, dequant_scale);
        model_ref = std::make_shared<Model>(NodeVector{dequantized}, ParameterVector{});
    }
    // Compare both the constant payloads and the numerical accuracy of the graphs.
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
}
|
||||||
|
|
||||||
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedFP16) {
|
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedFP16) {
|
||||||
{
|
{
|
||||||
auto data = opset8::Constant::create(element::f16, Shape{3, 1, 1, 1}, {0.2, 1.2, 1.2});
|
auto data = opset8::Constant::create(element::f16, Shape{3, 1, 1, 1}, {0.2, 1.2, 1.2});
|
||||||
|
Loading…
Reference in New Issue
Block a user