[LPT] INT16, INT32 leftovers (#7653)

2022-01-10 21:09:10 +03:00 · 2022-01-10 21:09:10 +03:00 · b744c11b88
commit b744c11b88
parent 0c2b53eba3
7 changed files with 101 additions and 64 deletions
--- a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp
+++ b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp
@@ -38,15 +38,34 @@
 namespace ngraph {
 namespace pass {
 namespace low_precision {
-
+namespace precision_set {
+    const std::vector<element::Type> int8_support  = {
+            ngraph::element::u8,  ngraph::element::i8
+    };
+    const std::vector<element::Type> int8_int16_int32_support = {
+            ngraph::element::u8,  ngraph::element::i8,
+            ngraph::element::u16, ngraph::element::i16,
+            ngraph::element::u32, ngraph::element::i32
+    };
+}
+enum levels : size_t {
+    int4 = 16,
+    int4_narrow_range = 15,
+    int8 = 256,
+    int8_narrow_range = 255,
+    int16 = 65536,
+    int16_narrow_range = 65535,
+    int32 = size_t(4294967296),  // on ARM and ia32 platforms this value does not fit in size_t, but it is never used there
+    int32_narrow_range = 4294967295
+};
 class LP_TRANSFORMATIONS_API DataPrecision {
 public:
    DataPrecision() : precision(element::undefined), min(0.f), max(0.f), hasZeroPoint(false) {}

    explicit DataPrecision(const element::Type& precision) {
        this->precision = precision;
-        min = getMinValue(precision, 256);
-        max = getMaxValue(precision, 256);
+        min = getMinValue(precision, levels::int8);
+        max = getMaxValue(precision, levels::int8);
        hasZeroPoint = false;
    }

@@ -66,7 +85,7 @@ public:
                element::i16, element::u16,
                element::i32, element::u32
        };
-        return lowPrecision.count(precision) == 1;
+        return lowPrecision.find(precision) != lowPrecision.end();
    }

    static float getMinValue(const element::Type precision, const size_t levels) {
@@ -80,17 +99,31 @@ public:
                return -8.f;
            case element::i8:
                switch (levels) {
-                    case 16:
+                    case low_precision::levels::int4:
                        return -8.f;
-                    case 255:
-                        return -127.f;
-                    default:
+                    case low_precision::levels::int4_narrow_range:
+                        return -7.f;
+                    case low_precision::levels::int8:
                        return -128.f;
+                    case low_precision::levels::int8_narrow_range:
+                        return -127.f;
                }
            case element::i16:
-                return levels == 65535 ? -32767.f : -32768.f;
+                switch (levels) {
+                    case low_precision::levels::int16:
+                        return -32768.f;
+                    case low_precision::levels::int16_narrow_range:
+                        return -32767.f;
+                }
+                break;
            case element::i32:
-                return -2147483647.f; // -2147483647.f == -2147483648.f
+                switch (levels) {
+                    case low_precision::levels::int32:
+                        return -2147483648.f;
+                    case low_precision::levels::int32_narrow_range:
+                        return -2147483647.f;
+                }
+                break;
            case element::f16:
                return -1.0e15f;
            case element::f32:
@@ -140,14 +173,14 @@ public:

    // Return maximum value for quantization level. Quantization level is maximum value for precision.
    static float getMaxValue(const size_t maxLevelsForPrecision) {
-        if (maxLevelsForPrecision == 255ul) {
-            return 254.f;
-        } else if (maxLevelsForPrecision == 256ul) {
-            return 255.f;
-        } else if (maxLevelsForPrecision == 16ul) {
-            return 15.f;
-        } else if (maxLevelsForPrecision == 15ul) {
-            return 14.f;
+        std::set<size_t> validLevels = {
+            levels::int4,  levels::int4_narrow_range,
+            levels::int8,  levels::int8_narrow_range,
+            levels::int16, levels::int16_narrow_range,
+            levels::int32, levels::int32_narrow_range
+        };
+        if (validLevels.find(maxLevelsForPrecision) != validLevels.end()) {
+            return maxLevelsForPrecision - 1.f;
        } else {
            THROW_TRANSFORMATION_EXCEPTION << "unexpected quantization level " << maxLevelsForPrecision;
        }
--- a/src/common/low_precision_transformations/src/fake_quantize_decomposition.cpp
+++ b/src/common/low_precision_transformations/src/fake_quantize_decomposition.cpp
@@ -122,12 +122,12 @@ DataPrecision getDataPrecisionByOutputPort(std::shared_ptr<opset1::FakeQuantize>
    const auto& precisions = precisionsAttribute.as<PrecisionsAttribute>().value();
    std::vector<element::Type> precisionsForLevels{};
    switch (levels) {
-        case 65536:
-        case 65535:
+        case low_precision::levels::int16:
+        case low_precision::levels::int16_narrow_range:
            precisionsForLevels = {element::u16, element::i16};
            break;
-        case static_cast<size_t>(4294967296):
-        case 4294967295:
+        case low_precision::levels::int32:
+        case low_precision::levels::int32_narrow_range:
            precisionsForLevels = {element::u32, element::i32};
            break;
        default:
--- a/src/common/low_precision_transformations/src/layer_transformation.cpp
+++ b/src/common/low_precision_transformations/src/layer_transformation.cpp
@@ -24,7 +24,7 @@ namespace low_precision {
 constexpr char LayerTransformation::originalLayerPostfix[];

 // order defines default precision
-std::vector<ngraph::element::Type> LayerTransformation::defaultPrecisions = { ngraph::element::u8,  ngraph::element::i8 };
+std::vector<ngraph::element::Type> LayerTransformation::defaultPrecisions = precision_set::int8_support;
 std::mutex LayerTransformation::defaultPrecisionsMutex;

 LayerTransformation::LayerTransformation(const Params& params) :
@@ -210,6 +210,9 @@ LayerTransformation::PrecisionDetails LayerTransformation::getPrecisionDetails(

    bool hasZeroPoint = false;
    bool thereIsAtLeastOneNormalValue = false;
+
+    std::vector<size_t> fullRangeLevels = { levels::int4, levels::int8, levels::int16, levels::int32 };
+
    for (size_t i = 0; i < outputLowValues.size(); ++i) {
        if ((std::fabs(outputLowValues[i]) < zeroThreshold) && (std::fabs(outputHighValues[i]) < zeroThreshold)) {
            // both values are too small to identify preferable precision
@@ -226,9 +229,8 @@ LayerTransformation::PrecisionDetails LayerTransformation::getPrecisionDetails(
            hasNegative = true;

            if (outputHighValues[i] != 0.f) {
-                const float expectedRatio =
-                        (quantizationLevels == 16 || quantizationLevels == 256 ||
-                         quantizationLevels == 65536 || quantizationLevels == 4294967296) ? asymmetricIntervalSideRatio : -1.f;
+                auto it = std::find(fullRangeLevels.begin(), fullRangeLevels.end(), quantizationLevels);
+                const float expectedRatio = it != fullRangeLevels.end() ? asymmetricIntervalSideRatio : -1.f;
                const float actualRatio = outputLowValues[i] / outputHighValues[i];
                const float actual = std::fabs((actualRatio - expectedRatio) / std::min(actualRatio, expectedRatio));
                if (actual > quantizationIntervalAsymmetryThreshold) {
@@ -272,37 +274,35 @@ LayerTransformation::PrecisionDetails LayerTransformation::getPrecisionDetails(
    if (!hasZeroPoint) {
        if (signedPrecision && (!unsignedPrecision)) {
            switch (quantizationLevels) {
-                case 256:
-                case 255:
-                case 16:
+                case levels::int4:
+                case levels::int8:
+                case levels::int8_narrow_range:
                    resultPrecision = element::i8;
                    break;
-                case 65536:
-                case 65535:
+                case levels::int16:
+                case levels::int16_narrow_range:
                    resultPrecision = element::i16;
                    break;
-                case static_cast<size_t>(4294967296):
-                case 4294967295:
+                case levels::int32:
+                case levels::int32_narrow_range:
                    resultPrecision = element::i32;
-                    break;
            }
        }

        if ((!signedPrecision) && unsignedPrecision) {
            switch (quantizationLevels) {
-                case 256:
-                case 255:
-                case 16:
+                case levels::int4:
+                case levels::int8:
+                case levels::int8_narrow_range:
                    resultPrecision = element::u8;
                    break;
-                case 65536:
-                case 65535:
+                case levels::int16:
+                case levels::int16_narrow_range:
                    resultPrecision = element::u16;
                    break;
-                case static_cast<size_t>(4294967296):
-                case 4294967295:
+                case levels::int32:
+                case levels::int32_narrow_range:
                    resultPrecision = element::u32;
-                    break;
            }
        }
    }
@@ -337,16 +337,16 @@ DataPrecision LayerTransformation::getDataPrecision(
    std::vector<element::Type> resultPrecisions = precisions;
    std::vector<element::Type> FQPrecisions;
    switch (quantizationDetails.levels) {
-        case 255:
-        case 256:
+        case levels::int8:
+        case levels::int8_narrow_range:
            FQPrecisions = {element::u8, element::i8};
            break;
-        case 65535:
-        case 65536:
+        case levels::int16:
+        case levels::int16_narrow_range:
            FQPrecisions = {element::u16, element::i16};
            break;
-        case 4294967295:
-        case static_cast<size_t>(4294967296):
+        case levels::int32:
+        case levels::int32_narrow_range:
            FQPrecisions = {element::u32, element::i32};
    }
    resultPrecisions = NetworkHelper::precisionIntersection(precisions, FQPrecisions);
--- a/src/common/low_precision_transformations/src/low_precision.cpp
+++ b/src/common/low_precision_transformations/src/low_precision.cpp
@@ -292,12 +292,10 @@ bool ngraph::pass::low_precision::LowPrecision::isFQLevelsPresent(
        const std::set<size_t>& levels) {
    std::vector<std::shared_ptr<ngraph::Node>> nodes = function->get_ops();
    for (auto& node : nodes) {
-        for (size_t i = 0; i < node->inputs().size(); ++i) {
-            const auto fakeQuantize = as_type_ptr<ngraph::opset1::FakeQuantize>(node);
-            if (fakeQuantize != nullptr) {
-                if (levels.count(fakeQuantize->get_levels()) == 1) {
-                    return true;
-                }
+        const auto fakeQuantize = as_type_ptr<ngraph::opset1::FakeQuantize>(node);
+        if (fakeQuantize != nullptr) {
+            if (levels.count(fakeQuantize->get_levels()) == 1) {
+                return true;
            }
        }
    }
--- a/src/common/low_precision_transformations/src/network_helper.cpp
+++ b/src/common/low_precision_transformations/src/network_helper.cpp
@@ -22,6 +22,7 @@
 #include "low_precision/rt_info/precision_preserved_attribute.hpp"
 #include "low_precision/rt_info/intervals_alignment_attribute.hpp"
 #include "low_precision/rt_info/quantization_alignment_attribute.hpp"
+#include "ngraph/opsets/opset6.hpp"

 namespace ngraph {
 namespace pass {
@@ -61,7 +62,9 @@ bool NetworkHelper::isConstantPath(const std::shared_ptr<Node>& op) {
            ov::is_type<opset1::Convolution>(node) ||
            ov::is_type<opset1::GroupConvolution>(node) ||
            ov::is_type<opset1::MatMul>(node) ||
-            ov::is_type<opset1::ConvolutionBackpropData>(node);
+            ov::is_type<opset1::ConvolutionBackpropData>(node) ||
+            ov::is_type<opset3::ReadValue>(node) ||
+            ov::is_type<opset6::ReadValue>(node);
    };

    if (isNotConstantPathOperation(op)) {
@@ -1730,8 +1733,8 @@ bool NetworkHelper::checkZeroPoint(const std::shared_ptr<Node>& node, const Data
        const auto intNode = ov::is_type<opset1::Convert>(parent) ? parent : node;
        const auto type = intNode->get_input_element_type(0);
        if (type == element::u8 || type == element::i8) {
-            min = DataPrecision::getMinValue(type, 256) - 0.5f;
-            max = DataPrecision::getMaxValue(type, 256) + 0.5f;
+            min = DataPrecision::getMinValue(type, levels::int8) - 0.5f;
+            max = DataPrecision::getMaxValue(type, levels::int8) + 0.5f;
        } else {
            return type == element::f32 || type == element::f16;
        }
--- a/src/common/low_precision_transformations/src/quantization_details.cpp
+++ b/src/common/low_precision_transformations/src/quantization_details.cpp
@@ -19,6 +19,7 @@

 #include <low_precision/common/ie_lpt_exception.hpp>
 #include <low_precision/network_helper.hpp>
+#include <low_precision/layer_transformation.hpp>

 namespace ngraph {
 namespace pass {
@@ -162,7 +163,12 @@ bool QuantizationDetails::empty() const noexcept {
 }

 bool QuantizationDetails::isSupportedLevel(const size_t level) {
-    static const std::unordered_set<size_t> supported_levels = { 16, 255, 256, 65536, 65535, static_cast<size_t>(4294967296), 4294967295 };
+    static const std::unordered_set<size_t> supported_levels = {
+        levels::int4,  levels::int4_narrow_range,
+        levels::int8,  levels::int8_narrow_range,
+        levels::int16, levels::int16_narrow_range,
+        levels::int32, levels::int32_narrow_range
+    };
    return supported_levels.find(level) != supported_levels.end();
 }

--- a/src/plugins/intel_cpu/src/mkldnn_plugin.cpp
+++ b/src/plugins/intel_cpu/src/mkldnn_plugin.cpp
@@ -446,14 +446,11 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function>
        bool updatePrecision = true;
        bool hasINT16orINT32Levels = ngraph::pass::low_precision::LowPrecision::isFQLevelsPresent(
                nGraphFunc,
-                {65535, 65536, 4294967295, 4294967296});
+                {levels::int16, levels::int16_narrow_range,
+                 levels::int32, levels::int32_narrow_range});
        if (hasINT16orINT32Levels) {
            updatePrecision = false;
-            LowPrecision::setDefaultPrecisions({
-                ngraph::element::u8,  ngraph::element::i8,
-                ngraph::element::u16, ngraph::element::i16,
-                ngraph::element::u32, ngraph::element::i32,
-            });
+            LowPrecision::setDefaultPrecisions(precision_set::int8_int16_int32_support);

            supportedPrecisions = std::vector<OperationPrecisionRestriction>({});
        }