[LPT] int4 inference via 16 levels int8 (#5249)
parent 5d86cce4eb
commit 5e0daae87e
@@ -80,28 +80,17 @@ public:
             return -8.f;
         case element::i8:
             switch (levels) {
+                case 16:
+                    return -8.f;
                 case 255:
                     return -127.f;
                 case 256:
+                default:
                     return -128.f;
             }
             break;
         case element::i16:
-            switch (levels) {
-                case 65536:
-                    return -32768.f;
-                case 65535:
-                    return -32767.f;
-            }
-            break;
+            return levels == 65535 ? -32767.f : -32768.f;
         case element::i32:
-            switch (levels) {
-                case static_cast<size_t>(4294967296):
-                    return -2147483648.f;
-                case 4294967295:
-                    return -2147483647.f;
-            }
-            break;
+            return -2147483647.f; // -2147483647.f == -2147483648.f
         case element::f16:
             return -1.0e15f;
         case element::f32:

@@ -117,19 +106,29 @@ public:
         case element::u4:
             return 15.f;
         case element::u8:
-            return 255.f;
+            switch (levels) {
+                case 16:
+                    return 15.f;
+                default:
+                    return 255.f;
+            }
         case element::u16:
             return 65535.f;
         case element::u32:
-            return 4294967296.f;
+            return 4294967296.f; // 4294967296.f == 4294967295.f
         case element::i4:
             return 7.f;
         case element::i8:
-            return 127.f;
+            switch (levels) {
+                case 16:
+                    return 7.f;
+                default:
+                    return 127.f;
+            }
         case element::i16:
             return 32767.f;
         case element::i32:
-            return 2147483647.f;
+            return 2147483648.f; // 2147483648.f == 2147483647.f
         case element::f16:
             return 1.0e15f;
         case element::f32:

@@ -145,6 +144,10 @@ public:
             return 254.f;
         } else if (maxLevelsForPrecision == 256ul) {
             return 255.f;
+        } else if (maxLevelsForPrecision == 16ul) {
+            return 15.f;
+        } else if (maxLevelsForPrecision == 15ul) {
+            return 14.f;
         } else {
             THROW_TRANSFORMATION_EXCEPTION << "unexpected quantization level " << maxLevelsForPrecision;
         }

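For orientation, the net effect of the getMinValue/getMaxValue changes above: a FakeQuantize with 16 levels is mapped onto the int4 value grid while the data itself stays in an 8-bit tensor. A minimal standalone sketch of that mapping (hypothetical helper, not the OpenVINO API):

    #include <cstddef>
    #include <cstdio>
    #include <utility>

    // Hypothetical helper mirroring the switches above: with 16 levels the
    // int4 grid (-8..7 signed, 0..15 unsigned) is emulated inside int8/uint8.
    static std::pair<float, float> int8Range(bool isSigned, std::size_t levels) {
        if (levels == 16)
            return isSigned ? std::make_pair(-8.f, 7.f) : std::make_pair(0.f, 15.f);
        // default: the full 256-level int8 range
        return isSigned ? std::make_pair(-128.f, 127.f) : std::make_pair(0.f, 255.f);
    }

    int main() {
        const auto i4 = int8Range(true, 16);   // [-8, 7]
        const auto u4 = int8Range(false, 16);  // [0, 15]
        std::printf("int4-in-int8: [%g, %g], uint4-in-uint8: [%g, %g]\n",
                    i4.first, i4.second, u4.first, u4.second);
        return 0;
    }
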
@@ -29,6 +29,7 @@ FakeQuantizeDecompositionTransformation::FakeQuantizeDecompositionTransformation
         if (transformation_callback(op)) {
             return false;
         }

         return transform(*context, m);
     };

@@ -226,8 +226,9 @@ LayerTransformation::PrecisionDetails LayerTransformation::getPrecisionDetails(
             hasNegative = true;

             if (outputHighValues[i] != 0.f) {
-                const float expectedRatio = (quantizationLevels == 256 || quantizationLevels == 65536 || quantizationLevels == 4294967296) ?
-                    asymmetricIntervalSideRatio : -1.f;
+                const float expectedRatio =
+                    (quantizationLevels == 16 || quantizationLevels == 256 ||
+                     quantizationLevels == 65536 || quantizationLevels == 4294967296) ? asymmetricIntervalSideRatio : -1.f;
                 const float actualRatio = outputLowValues[i] / outputHighValues[i];
                 const float actual = std::fabs((actualRatio - expectedRatio) / std::min(actualRatio, expectedRatio));
                 if (actual > quantizationIntervalAsymmetryThreshold) {

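The relative-difference test in the context lines above carries the actual logic; read in isolation it looks like this (a sketch with the expected ratio passed in explicitly, since asymmetricIntervalSideRatio is defined elsewhere in LayerTransformation):

    #include <algorithm>
    #include <cmath>

    // Sketch of the asymmetry check above: compare the measured
    // outputLow/outputHigh ratio to the expected one and flag the interval as
    // asymmetric when the relative difference exceeds the threshold.
    static bool isAsymmetric(float outputLow, float outputHigh,
                             float expectedRatio, float threshold) {
        const float actualRatio = outputLow / outputHigh;
        const float relativeDiff =
            std::fabs((actualRatio - expectedRatio) / std::min(actualRatio, expectedRatio));
        return relativeDiff > threshold;
    }
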
@@ -273,6 +274,7 @@ LayerTransformation::PrecisionDetails LayerTransformation::getPrecisionDetails(
         switch (quantizationLevels) {
             case 256:
             case 255:
+            case 16:
                 resultPrecision = element::i8;
                 break;
             case 65536:

@@ -290,6 +292,7 @@ LayerTransformation::PrecisionDetails LayerTransformation::getPrecisionDetails(
         switch (quantizationLevels) {
             case 256:
             case 255:
+            case 16:
                 resultPrecision = element::u8;
                 break;
             case 65536:

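Taken together, the two switches above route 16-level FakeQuantize ops down the same 8-bit path as 255/256 levels, with signedness chosen from the interval signs. A rough sketch of that selection (the 16-bit case for 65535 and the 32-bit fallback are assumptions; the hunks only show the cases up to 65536):

    #include <cstddef>
    #include <ngraph/type/element_type.hpp>

    // Rough sketch of the level-count -> target precision selection above.
    // 16 levels shares the 8-bit path because int4 is carried in int8/uint8.
    static ngraph::element::Type pickPrecision(std::size_t quantizationLevels, bool hasNegative) {
        switch (quantizationLevels) {
            case 16:      // int4 emulated in an 8-bit tensor
            case 255:
            case 256:
                return hasNegative ? ngraph::element::i8 : ngraph::element::u8;
            case 65535:   // assumption: grouped with 65536 (16-bit)
            case 65536:
                return hasNegative ? ngraph::element::i16 : ngraph::element::u16;
            default:      // assumption: larger level counts fall back to 32-bit
                return hasNegative ? ngraph::element::i32 : ngraph::element::u32;
        }
    }
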
@@ -162,7 +162,7 @@ bool QuantizationDetails::empty() const noexcept {
 }

 bool QuantizationDetails::isSupportedLevel(const size_t level) {
-    static const std::unordered_set<size_t> supported_levels = { 255, 256, 65536, 65535, static_cast<size_t>(4294967296), 4294967295 };
+    static const std::unordered_set<size_t> supported_levels = { 16, 255, 256, 65536, 65535, static_cast<size_t>(4294967296), 4294967295 };
     return supported_levels.find(level) != supported_levels.end();
 }

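With 16 added to the supported set, a 16-level FakeQuantize is no longer skipped by LPT as an unexpected level count. A quick usage sketch (assuming the usual low_precision header path and namespace):

    #include <cassert>
    #include <low_precision/quantization_details.hpp>

    int main() {
        using ngraph::pass::low_precision::QuantizationDetails;
        assert(QuantizationDetails::isSupportedLevel(16));   // int4 via int8: now handled
        assert(QuantizationDetails::isSupportedLevel(256));  // int8: handled as before
        assert(!QuantizationDetails::isSupportedLevel(14));  // arbitrary level counts stay rejected
        return 0;
    }
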
@@ -166,25 +166,26 @@ void VariantWrapper<IntervalsAlignmentAttributePtr>::merge(
         const auto size = std::abs(sharedValue->minInterval.high - sharedValue->minInterval.low);
         if (resultSize > size) {
             resultSharedValue->minInterval = sharedValue->minInterval;

+            if (resultAttribute->levels != 0ul) {
-            float dequantizationMul;
-            float dequantizationSub;
-            float updatedOutputLowValue;
-            float updatedOutputHighValue;

+                float dequantizationMul;
+                float dequantizationSub;
+                float updatedOutputLowValue;
+                float updatedOutputHighValue;

-            const size_t minLevels = NetworkHelper::calculateLevels(
-                0.f,
-                DataPrecision::getMaxValue(resultAttribute->levels),
-                resultSharedValue->combinedInterval.low,
-                resultSharedValue->combinedInterval.high,
-                resultSharedValue->minInterval.low,
-                resultSharedValue->minInterval.high,
-                dequantizationMul,
-                dequantizationSub,
-                updatedOutputLowValue,
-                updatedOutputHighValue);

+                const size_t minLevels = NetworkHelper::calculateLevels(
+                    0.f,
+                    DataPrecision::getMaxValue(resultAttribute->levels),
+                    resultSharedValue->combinedInterval.low,
+                    resultSharedValue->combinedInterval.high,
+                    resultSharedValue->minInterval.low,
+                    resultSharedValue->minInterval.high,
+                    dequantizationMul,
+                    dequantizationSub,
+                    updatedOutputLowValue,
+                    updatedOutputHighValue);

-            resultSharedValue->minLevels = minLevels;
+                resultSharedValue->minLevels = minLevels;
+            }

 #ifdef LPT_DEBUG
             resultSharedValue->minLevelsOperation = sharedValue->minLevelsOperation;

@@ -967,7 +967,7 @@ const std::vector<ConcatTransformationTestValues> testValues = {
             { {element::f32}, {}, { 0.01f } },
         }
     },
-    // unexpected quantization levels, concat
+    // INT4+INT8 quantization levels, concat
     {
         LayerTransformation::createParamsU8I8(),
         false,

@@ -990,16 +990,16 @@ const std::vector<ConcatTransformationTestValues> testValues = {
             ngraph::element::f32,
             {},
         },
         false,
         true,
         false,
     },
-    // unexpected quantization levels, concat multi channels
+    // INT4+INT8 quantization levels, concat multi channels
     {
         LayerTransformation::createParamsU8I8(),
         true,
         1,
         {
-            { 16ul, {}, {0.f}, {1.5f}, {0.f}, {15.f} },
+            { 16ul, {}, {0.f}, {1.5f}, {0.f}, {1.5f} },
             {},
             {},
             { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} },

@@ -1007,16 +1007,16 @@ const std::vector<ConcatTransformationTestValues> testValues = {
             {}
         },
         {
-            { 16ul, {}, {0.f}, {1.5f}, {0.f}, {15.f} },
+            { 16ul, {}, {0.f}, {1.5f}, {0.f}, {15.f}, ngraph::element::u8 },
             {},
             {},
-            { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} },
+            { 256ul, {}, {0.f}, {2.55f}, {0.f}, {255.f}, ngraph::element::u8 },
             {},
             {},
             ngraph::element::f32,
             {},
             ngraph::element::u8,
             { ngraph::element::f32, {}, {{ 0.1f, 0.1f, 0.1f, 0.01f, 0.01f, 0.01f }} }
         },
         false,
         true,
         false
     }
 };

@@ -322,6 +322,28 @@ const std::vector<FakeQuantizeTransformationTestValues> fakeQuantizeTransformati
     //        { ngraph::element::f16, { {ngraph::element::f16}, {}, { {0.01f, 0.1f, 1.f} }} }
     //    }
     //},
+    // u4 through u8
+    {
+        LayerTransformation::createParamsU8I8(),
+        { 16ul, {}, { 0.f }, { 1.5f }, { 0.f }, { 1.5f } },
+        { 16ul, {}, { 0.f }, { 1.5f }, { 0.f }, { 15.f } },
+        ngraph::element::u8,
+        {
+            { ngraph::element::f32, { {ngraph::element::f32}, {}, { 0.1f } } },
+            { ngraph::element::f16, { {ngraph::element::f16}, {}, { 0.1f } } }
+        }
+    },
+    // i4 through i8
+    {
+        LayerTransformation::createParamsI8I8(),
+        { 16ul, {}, { -0.8f }, { 0.7f }, { -0.8f }, { 0.7f } },
+        { 16ul, {}, { -0.8f }, { 0.7f }, { -8.f }, { 7.f } },
+        ngraph::element::i8,
+        {
+            { ngraph::element::f32, { {ngraph::element::f32}, {}, { 0.1f } } },
+            { ngraph::element::f16, { {ngraph::element::f16}, {}, { 0.1f } } }
+        }
+    },
 };

 INSTANTIATE_TEST_SUITE_P(

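Each tuple above describes a FakeQuantize as (levels, constant shape, input low, input high, output low, output high). For reference, a roughly equivalent nGraph builder for the 16-level "u4 through u8" case (a sketch with an assumed input shape, not code from the test suite):

    #include <memory>
    #include <ngraph/ngraph.hpp>
    #include <ngraph/opsets/opset1.hpp>

    // A 16-level FakeQuantize with input range [0, 1.5] and output range [0, 15]:
    // the pattern LPT can now decompose into a U8 tensor plus a dequantization multiply.
    std::shared_ptr<ngraph::Function> makeInt4LikeFakeQuantize() {
        using namespace ngraph;
        const auto input = std::make_shared<opset1::Parameter>(element::f32, Shape{ 1, 3, 16, 16 });
        const auto inputLow   = opset1::Constant::create(element::f32, Shape{}, { 0.f });
        const auto inputHigh  = opset1::Constant::create(element::f32, Shape{}, { 1.5f });
        const auto outputLow  = opset1::Constant::create(element::f32, Shape{}, { 0.f });
        const auto outputHigh = opset1::Constant::create(element::f32, Shape{}, { 15.f });
        const auto fq = std::make_shared<opset1::FakeQuantize>(input, inputLow, inputHigh, outputLow, outputHigh, 16ul);
        return std::make_shared<Function>(NodeVector{ fq }, ParameterVector{ input });
    }
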
@@ -38,10 +38,20 @@ const std::vector<ConcatTransformationTestValues> testValues = {
         { 256ul, ngraph::Shape({}), {0.f}, {2.55f}, {0.f}, {2.55f} }
     },
     // FQ with unexpected quantizationLevels
     {
         { 14ul, ngraph::Shape({}), {0.f}, {15.f}, {0.f}, {1.5f} },
         { 14ul, ngraph::Shape({}), {0.f}, {15.f}, {0.f}, {1.5f} }
     },
+    // FQ with INT4 quantizationLevels
+    {
+        { 16ul, ngraph::Shape({}), {0.f}, {15.f}, {0.f}, {1.5f} },
+        { 16ul, ngraph::Shape({}), {0.f}, {15.f}, {0.f}, {1.5f} }
+    },
+    // FQ with INT4+INT8 quantizationLevels
+    {
+        { 16ul, ngraph::Shape({}), {0.f}, {15.f}, {0.f}, {1.5f} },
+        { 256ul, ngraph::Shape({}), {0.f}, {2.55f}, {0.f}, {2.55f} }
+    },
 };

 const std::vector<ngraph::PartialShape> shapes = {

@@ -54,15 +54,15 @@ const std::vector<LayerTestsDefinitions::ConvolutionTransformationParam> params
         "U8"
     },
     {
-        { 16ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 255.f }, { 0.f }, { 25.5f } },
+        { 14ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 255.f }, { 0.f }, { 25.5f } },
         false,
-        { 16ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } },
+        { 14ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } },
         false,
         "Convolution",
         "FP32"
     },
     {
-        { 16ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
+        { 14ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
         false,
         { 255ul, ngraph::Shape { 1, 1, 1, 1 }, { -12.7f }, { 12.7f }, { -12.7f }, { 12.7f } },
         false,

@@ -72,7 +72,7 @@ const std::vector<LayerTestsDefinitions::ConvolutionTransformationParam> params
     {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 255.f }, { 0.f }, { 25.5f } },
         false,
-        { 16ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } },
+        { 14ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } },
         false,
         "Convolution",
         "FP32"

@@ -46,14 +46,13 @@ const std::vector<FakeQuantizeTransformationParam> fakeQuantizeOnDataValues = {
         { 256ul, {}, { -127.5f }, { 0.f }, { -127.5f }, { 0.f } },
         "Pooling", "U8"
     },
-    // INT4 FQ's are not transformed and inferred via FP32
     {
         { 16ul, {}, { 0.f }, { 1.5f }, { 0.f }, { 1.5f } },
-        "Pooling", "FP32"
+        "Pooling", "U8"
     },
     {
-        { 16ul, {}, { -8.f }, { 7.f }, { -0.8f }, { 0.7f } },
-        "Pooling", "FP32"
+        { 16ul, {}, { -0.8f }, { 0.7f }, { -0.8f }, { 0.7f } },
+        "Pooling", "I8"
     },
     // INT16, INT32 FQ's are transformed, but updatePrecision = false for inference on CPU Plugin and inferred via FP32
     {

@@ -48,11 +48,11 @@ const std::vector<FakeQuantizeTransformationParam> fakeQuantizeOnDataValues = {
     },
     {
         { 16ul, {}, { 0.f }, { 1.5f }, { 0.f }, { 1.5f } },
-        "Pooling", "FP32"
+        "Pooling", "U8"
     },
     {
         { 16ul, {}, { -8.f }, { 7.f }, { -0.8f }, { 0.7f } },
-        "Pooling", "FP32"
+        "Pooling", "I8"
     },
     // nGraph: I8->FP32 Convert is not supported
     // { 256ul, {}, { -1.28f} , { 1.27f }, { -1.28f} , { 1.27f } },