[GNA] Improved accuracy on model after Accuracy Aware (#7576)
* improved accuracy on model after Accuracy Aware
* refactoring + test
* removed unnecessary FakeQuantI8/I16
* added comments, moved fake_quantized from UpdateInputScaleFromNetwork(), removed _Np template param from QuantDescTmpl
@@ -16,7 +16,6 @@ struct GNAFlags {
float pwlMaxErrorPercent = 1.0f;
bool gna_openmp_multithreading = false;
bool sw_fp32 = false;
bool fake_quantized = false;
bool performance_counting = false;
bool input_low_precision = false;
};

@@ -39,14 +39,12 @@ struct QuantDescTmpl {
InferenceEngine::TPrecision<Op> _Op;
InferenceEngine::TPrecision<Wp> _Wp;
InferenceEngine::TPrecision<Bp> _Bp;
InferenceEngine::TPrecision<Np> _Np;

QuantDescTmpl() = default;
QuantDescTmpl(InferenceEngine::TPrecision<Ip> _Ip,
InferenceEngine::TPrecision<Op> _Op,
InferenceEngine::TPrecision<Wp> _Wp,
InferenceEngine::TPrecision<Bp> _Bp,
InferenceEngine::TPrecision<Np> _Np) : _Op(_Op), _Ip(_Ip), _Wp(_Wp), _Bp(_Bp), _Np(_Np) {
InferenceEngine::TPrecision<Bp> _Bp) : _Op(_Op), _Ip(_Ip), _Wp(_Wp), _Bp(_Bp) {
}

InferenceEngine::Precision getInputPrecision() const {
@@ -58,9 +56,6 @@ struct QuantDescTmpl {
InferenceEngine::Precision getBiasesPrecision() const {
return _Bp;
}
InferenceEngine::Precision getNetPrecision() const {
return _Np;
}
InferenceEngine::Precision getOutputPrecision() const {
return _Op;
}

@@ -74,23 +69,16 @@ typename InferenceEngine::PrecisionTrait<InferenceEngine::Precision::X>::value_t

struct QuantI16 : public QuantDescTmpl<PRECISION_TYPE(I16, I32, I16, I32, MIXED)> {
QuantI16() {
_Np = InferenceEngine::Precision::MIXED;
}
};
struct QuantI8 : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), gna_compound_bias_t, P_TYPE(MIXED)> {
QuantI8() {
_Np = InferenceEngine::Precision::MIXED;
}
};
// Low precision path quantizer (I8 inputs, weights, biases)
struct QuantI8_I8 : public QuantDescTmpl<PRECISION_TYPE(I8, I32, I8, I8, MIXED)> {
QuantI8_I8() {
_Np = InferenceEngine::Precision::MIXED;
}
};

// for support proper trait instantiation for quantization function callback
struct FakeQuant : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(MIXED), P_TYPE(MIXED), P_TYPE(MIXED)> {
};
struct FakeQuantI16 : public QuantI16 {};
struct FakeQuantI8 : public QuantI8 {};

@@ -654,9 +642,24 @@ class DataQuantizer<Desc, InferenceEngine::WeightableLayer *> : public DataQuant
public:
explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
bool operator()(InferenceEngine::WeightableLayer *wl) const {
quantizeWeightsBiases<typename Desc::MandatoryType>(Desc::mandatory(), wl, Quant<typename Desc::MandatoryType>());
(*this)(wl, typename Desc::MandatoryType());
return true;
}

template<typename T>
void operator()(InferenceEngine::WeightableLayer *wl, const T&) const {
quantizeWeightsBiases<T>(T(), wl, Quant<T>());
}

void operator()(InferenceEngine::WeightableLayer *wl, const FakeQuant&) const {
auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
IE_ASSERT(quantData->_weights_quant.IsStatsSet());
if (quantData->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
quantizeWeightsBiases<FakeQuantI8>(FakeQuantI8(), wl, Quant<FakeQuantI8>());
} else {
quantizeWeightsBiases<FakeQuantI16>(FakeQuantI16(), wl, Quant<FakeQuantI16>());
}
}
};

template<class Desc>
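The FakeQuant overload above is the heart of the accuracy fix: instead of always quantizing weights to I16, the quantizer inspects the statistics recorded on the FakeQuantize layer and picks an 8-bit or 16-bit descriptor per layer. The following self-contained sketch mimics that tag-dispatch pattern in plain C++; QuantI8, QuantI16 and Layer here are illustrative stand-ins, not the plugin's actual types.

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Illustrative stand-ins for the plugin's quantization descriptors.
    struct QuantI8  { static constexpr int weightsBytes = 1; };
    struct QuantI16 { static constexpr int weightsBytes = 2; };

    // Minimal stand-in for a layer carrying FakeQuantize statistics.
    struct Layer {
        std::uint32_t fqLevels;  // number of quantization levels taken from the FQ layer
    };

    template <typename Desc>
    void quantizeWeightsBiases(const Layer& l) {
        std::cout << l.fqLevels << " levels -> " << Desc::weightsBytes << "-byte weights\n";
    }

    // Levels that fit into uint8_t take the I8 path, everything else keeps I16
    // (mirrors the GetLevels() check in the diff above).
    void quantize(const Layer& l) {
        if (l.fqLevels <= std::numeric_limits<std::uint8_t>::max()) {
            quantizeWeightsBiases<QuantI8>(l);
        } else {
            quantizeWeightsBiases<QuantI16>(l);
        }
    }

    int main() {
        quantize(Layer{255});    // fits in 8 bits -> 1-byte weights
        quantize(Layer{65535});  // needs 16 bits  -> 2-byte weights
    }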
@@ -691,13 +694,18 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
}
};

/*
* The majority of layers will be executed in I16 mode,
* including most auto-generated primitives such as the one for alignment support.
* GNA 1.0 and 2.0 do not support I8 for the convolution layer.
* Some layers will be switched into I16 mode to avoid losing accuracy, while the memory and
* runtime performance of layers like scale-shifts is still OK since it is O(N).
*/
using QuantI16 = frontend::QuantPair<frontend::QuantI16, frontend::QuantI16>;
using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;
using QuantI8_I8 = frontend::QuantPair<frontend::QuantI8_I8, frontend::QuantI8_I8>;

using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
using FakeQuantI8 = frontend::QuantPair<frontend::FakeQuantI8, frontend::FakeQuantI16>;
using FakeQuant = frontend::QuantPair<frontend::FakeQuant, frontend::FakeQuantI16>;

enum class QuantizedDataType {
input,

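Each QuantPair alias above couples a mandatory descriptor with an optional one: the mandatory precision is applied to most weightable layers, while the optional (wider) precision is kept for layers that must stay at I16, such as convolutions and scale-shifts, as the comment block explains. A rough standalone sketch of that idea, with invented names rather than the plugin's QuantPair:

    #include <iostream>
    #include <string>

    // Byte widths standing in for the frontend::Quant* descriptors.
    struct DescI8  { static constexpr int weightsBytes = 1; };
    struct DescI16 { static constexpr int weightsBytes = 2; };

    // A pair of descriptors: Mandatory is the default, Optional is the fallback
    // for layers that must keep the wider precision (convolution, scale-shift).
    template <typename Mandatory, typename Optional>
    struct DescPair {
        static int weightsBytesFor(const std::string& layerType) {
            if (layerType == "Convolution" || layerType == "ScaleShift") {
                return Optional::weightsBytes;
            }
            return Mandatory::weightsBytes;
        }
    };

    // Analogue of: using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;
    using MixedI8 = DescPair<DescI8, DescI16>;

    int main() {
        std::cout << "FullyConnected weights: " << MixedI8::weightsBytesFor("FullyConnected") << " byte(s)\n";
        std::cout << "Convolution weights:    " << MixedI8::weightsBytesFor("Convolution") << " byte(s)\n";
    }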
@@ -83,9 +83,7 @@ class ModelQuantizer {
scaleIndex++;
}

bool isFakeQuantize = std::is_same<T, FakeQuantI8>() || std::is_same<T, FakeQuantI16>();
propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), T::optional().getWeightsPrecision().size(),
T::mandatory().getInputPrecision().size(), isFakeQuantize);
propagateScaleFactor(sortedNewNet);

// sorted order makes it possible to propagate quantisation along dependent layers
for (auto &&layer : sortedNewNet) {
@@ -96,9 +94,8 @@ class ModelQuantizer {
}

private :
void propagateScaleFactor(std::vector<InferenceEngine::CNNLayerPtr> & net, int mandWeightsBytesSize,
int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize) const {
ScaleFactorCalculator sf(net, mandWeightsBytesSize, optWeightsBytesSize, inputsBytesSize, fakeQuantize);
void propagateScaleFactor(std::vector<InferenceEngine::CNNLayerPtr> & net) const {
ScaleFactorCalculator<T> sf(net);

int infiniteLoopCount = 0;
std::vector<std::string> infiniteLoopPattern;

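propagateScaleFactor now needs only the topologically sorted layer list; the byte sizes and the fake-quantize flag come from the descriptor type T. The propagation itself is a fixed-point sweep: layers are visited in sorted order, and whenever a layer has to change a producer's scale factor the sweep restarts, with a guard against infinite loops. A much simplified, self-contained sketch of that loop (illustrative only; the real ScaleFactorCalculator works on CNNLayerPtr objects through ScaleFactorPerLayer):

    #include <cstddef>
    #include <iostream>
    #include <stdexcept>
    #include <vector>

    struct Layer {
        float scale = 1.0f;
        float maxAllowedScale = 4.0f;  // pretend hardware limit for the example
    };

    // Process one layer; return false if an earlier layer's scale had to change,
    // which forces the sweep over the sorted list to restart (mirrors needRestart).
    bool processLayer(std::vector<Layer>& net, std::size_t i) {
        if (net[i].scale > net[i].maxAllowedScale && i > 0) {
            net[i - 1].scale /= 2.0f;   // requantize the producer
            net[i].scale /= 2.0f;
            return false;               // restart required
        }
        return true;
    }

    void propagateScaleFactors(std::vector<Layer>& sortedNet, int maxRestarts = 100) {
        int restarts = 0;
        for (std::size_t i = 0; i < sortedNet.size();) {
            if (processLayer(sortedNet, i)) {
                ++i;                    // next layer in topological order
            } else {
                if (++restarts > maxRestarts) {
                    throw std::runtime_error("infinite loop in scale factor propagation");
                }
                i = 0;                  // restart the sweep from the beginning
            }
        }
    }

    int main() {
        std::vector<Layer> net{{2.0f}, {8.0f}, {3.0f}};
        propagateScaleFactors(net);
        for (const auto& l : net) std::cout << l.scale << ' ';
        std::cout << '\n';
    }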
@@ -22,6 +22,10 @@
#include "round_float_define.hpp"

namespace GNAPluginNS {

template<typename QUANT_DESC>
class ScaleFactorCalculator;

namespace frontend {
static const float MIN_SEARCH_WEIGHTS_VAL = 1.0f;
static const float MAX_SEARCH_WEIGHTS_VAL = 1024.0f;
@@ -133,7 +137,8 @@ static float selectBestWeightsScaleFactors(float inScale, float outScale, std::v
for (size_t j = 0; j < slopes.size(); ++j) {
auto s = gna_slope(slopes[j], inScale * weightScale, outScale);
auto slope = static_cast<uint32_t>(s.slope * s.slope_scale);
if (slope < static_cast<uint32_t>(std::numeric_limits<int16_t>::min()) && slope > static_cast<uint32_t>(std::numeric_limits<int16_t>::max())) {
if (slope < static_cast<uint32_t>(std::numeric_limits<int16_t>::min()) &&
slope > static_cast<uint32_t>(std::numeric_limits<int16_t>::max())) {
sd += std::numeric_limits<int8_t>::max();
continue;
}

@@ -206,24 +211,23 @@ static double calculateWeightsReducerFromDstStats(Quantization dst_quant) {
* @brief calculates output scale factor per layer
* @tparam T
*/
template<class T>
template<typename T, typename QUANT_DESC>
class ScaleFactorPerLayer {
public:
/**
* @brief calculates weights scale factor to fit dynamic range into target bitsize,
* also calculates output scale factor for the given layer
* @param cnnLayer
* @param weightsSize
* @param result
* @return
*/
bool operator()(T cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, bool fakeQuantize, int infiniteLoopCount) {
bool operator()(T cnnLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
return false;
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::CNNLayer*, QUANT_DESC> {
private :
const float activation_scale_factor = 2048.f;
const float low_prec_activation_scale_factor = 4.f;

@@ -450,11 +454,14 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
}

public :
bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, bool fakeQuantize,
int infiniteLoopCount) {
bool operator()(InferenceEngine::CNNLayer *cnnLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !cnnLayer ) {
IE_THROW() << "Incorrect Convolutional Layer pointer \n";
}

int inputsSize = ScaleFactorCalculator<QUANT_DESC>::GetInputsBytesSize();
bool fakeQuantize = ScaleFactorCalculator<QUANT_DESC>::IsFakeQuantize();

LayerInfo layerInfo(*cnnLayer);
// TODO: current approach set input scale factor for true input layer(s) equals to provided factor,
auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
@@ -656,8 +663,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*, QUANT_DESC> {
private:
bool requantizeEltwiseInput(InferenceEngine::EltwiseLayer* eltwiseLayer, uint8_t inputIx, int16_t maxValue,
bool fakeQuantize, ScaleFactorUpdateResult &result) {
@@ -726,11 +733,12 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
}

public:
bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result,
bool fakeQuantize, int infiniteLoopCount) {
bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !eltwiseLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
}
int inputsSize = ScaleFactorCalculator<QUANT_DESC>::GetInputsBytesSize();
bool fakeQuantize = ScaleFactorCalculator<QUANT_DESC>::IsFakeQuantize();
bool lowPrecision = (inputsSize == sizeof(int8_t));

auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0);

@@ -836,15 +844,16 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*, QUANT_DESC> {
public:
bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result,
bool fakeQuantize, int infiniteLoopCount) {
bool operator()(InferenceEngine::ConcatLayer* concatLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !concatLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n";
}

bool fakeQuantize = ScaleFactorCalculator<QUANT_DESC>::IsFakeQuantize();

if (concatLayer->insData.size() < 2) {
THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers.";
}

@@ -1061,8 +1070,8 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*, QUANT_DESC> {
private:
std::vector<std::tuple<uint16_t const, float const, float const>> thresholds {
// tuple values: scale factor threshold, scale factor reduction factor for I16 precision, for I8 precision
@@ -1074,14 +1083,15 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
};

public:
bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result,
bool fakeQuantize, int infiniteLoopCount) {
bool operator()(InferenceEngine::WeightableLayer *wl, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !wl ) {
THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
} else if (!wl->_weights) {
THROW_GNA_EXCEPTION << "Incorrect weight value for " << wl->name << ":" << wl->type << "\n";
}

int inputsSize = ScaleFactorCalculator<QUANT_DESC>::GetInputsBytesSize();
bool fakeQuantize = ScaleFactorCalculator<QUANT_DESC>::IsFakeQuantize();
auto prevLayer = CNNNetPrevLayer(wl);
auto quantDataForInputLayer =
InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());

@@ -1111,6 +1121,7 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
}

// TODO: pass 8 bits somehow
int weightsSize = ScaleFactorCalculator<QUANT_DESC>::GetMandatoryWeightsBytesSize(wl);
if (!quant->_weights_quant.IsScaleSet()) {
size_t scaleRange = 0;
if (weightsSize == 2) {
@@ -1217,19 +1228,20 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*, QUANT_DESC> :
public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*, QUANT_DESC> {
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*, QUANT_DESC> :
public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*, QUANT_DESC> {
};

template<>
class ScaleFactorPerLayer<InferenceEngine::GemmLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::GemmLayer*, QUANT_DESC> {
public:
bool operator() (InferenceEngine::GemmLayer* gemmLayer, int weightsSize, int inputSize, ScaleFactorUpdateResult &result,
bool fakeQuantize, int infiniteLoopCount) {
bool operator() (InferenceEngine::GemmLayer* gemmLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !gemmLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Gemm Layer pointer \n";
}

@@ -1278,21 +1290,16 @@ public:
* @brief scale factor calculator will calculate only output scale factors for the layer
* if scale factor propagation is not possible, it will indicate a restart condition
*/
template<typename QUANT_DESC>
class ScaleFactorCalculator {
using Cnt = std::vector<InferenceEngine::CNNLayerPtr>;
Cnt net;
mutable Cnt::const_iterator idx;
mutable bool needRestart = false;
int mandWeightsBytesSize;
int optWeightsBytesSize;
bool isFakeQuantize;
int inputsBytesSize;
int infiniteLoopCount = 0;

public:
ScaleFactorCalculator(Cnt &net, int mandWeightsBytesSize, int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize)
: net(net), mandWeightsBytesSize(mandWeightsBytesSize), optWeightsBytesSize(optWeightsBytesSize),
inputsBytesSize(inputsBytesSize), isFakeQuantize(fakeQuantize) {
ScaleFactorCalculator(Cnt &net) : net(net) {
idx = std::begin(this->net);
}
bool needToRestart() const {
@@ -1311,13 +1318,7 @@ class ScaleFactorCalculator {
bool operator()(T ptr) const {
needRestart = false;
frontend::ScaleFactorUpdateResult result;
auto weightsBytesSize = mandWeightsBytesSize;

if (LayerInfo(ptr).isConvolution() || LayerInfo(ptr).isScaleShift()) {
weightsBytesSize = optWeightsBytesSize;
}

if (!frontend::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, inputsBytesSize, result, isFakeQuantize, infiniteLoopCount)) {
if (!frontend::ScaleFactorPerLayer<T, QUANT_DESC>()(ptr, result, infiniteLoopCount)) {
return false;
}
if (result) {
@@ -1337,6 +1338,39 @@ class ScaleFactorCalculator {
needRestart = true;
return true;
}
};

template<class T>
static int GetMandatoryWeightsBytesSize(T ptr) {
auto info = LayerInfo(ptr);
if (info.isConvolution() || info.isScaleShift()) {
return GetOptionalWeightsBytesSize();
}

if (IsFakeQuantize()) {
auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*ptr);
if (quantData->_weights_quant.IsStatsSet()) {
if (quantData->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
return frontend::FakeQuantI8().getWeightsPrecision().size();
} else {
return frontend::FakeQuantI16().getWeightsPrecision().size();
}
}
}

return QUANT_DESC::mandatory().getWeightsPrecision().size();
}

static int GetOptionalWeightsBytesSize() {
return QUANT_DESC::optional().getWeightsPrecision().size();
}

static int GetInputsBytesSize() {
return QUANT_DESC::mandatory().getInputPrecision().size();
}

static bool IsFakeQuantize() {
return std::is_same<QUANT_DESC, FakeQuant>();
}
}; // class ScaleFactorCalculator

} // namespace GNAPluginNS

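The configuration that used to be threaded through constructors and operator() parameters (weights byte size, input byte size, the fakeQuantize flag) is now derived from the QUANT_DESC template parameter through the static helpers above, so every ScaleFactorPerLayer specialization can query it without extra arguments. A reduced standalone illustration of the pattern; the descriptor and calculator names below are invented for the example:

    #include <iostream>
    #include <type_traits>

    // Stand-ins for the quantization descriptors the calculator is parameterized with.
    struct I16Desc  { static constexpr int inputBytes = 2; static constexpr int weightsBytes = 2; };
    struct I8Desc   { static constexpr int inputBytes = 2; static constexpr int weightsBytes = 1; };
    struct FakeDesc { static constexpr int inputBytes = 2; static constexpr int weightsBytes = 2; };

    // The calculator exposes the configuration as static queries keyed on the descriptor type,
    // so per-layer code no longer receives weightsSize / inputsSize / fakeQuantize as arguments.
    template <typename QuantDesc>
    struct Calculator {
        static int GetInputsBytesSize()  { return QuantDesc::inputBytes; }
        static int GetWeightsBytesSize() { return QuantDesc::weightsBytes; }
        static bool IsFakeQuantize()     { return std::is_same<QuantDesc, FakeDesc>::value; }
    };

    // A per-layer routine pulls what it needs from the calculator's statics.
    template <typename QuantDesc>
    void computeLayerScaleFactor() {
        std::cout << "inputs: " << Calculator<QuantDesc>::GetInputsBytesSize()
                  << " B, weights: " << Calculator<QuantDesc>::GetWeightsBytesSize()
                  << " B, fake-quantized: " << std::boolalpha
                  << Calculator<QuantDesc>::IsFakeQuantize() << '\n';
    }

    int main() {
        computeLayerScaleFactor<I16Desc>();
        computeLayerScaleFactor<I8Desc>();
        computeLayerScaleFactor<FakeDesc>();
    }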
@@ -429,43 +429,7 @@ void GNAPlugin::InitGNADevice() {
graphCompiler.setGNAMemoryPtr(gnamem);
}

void GNAPlugin::UpdateGnaQuantModeFromNetwork(InferenceEngine::CNNNetwork & network) {
OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "UpdateGnaQuantModeFromNetwork");
// fp32 emulation mode doesn't need any modifications to the configuration
if (config.gnaFlags.sw_fp32) return;

// search for FQ layers
// only supports cases of int16 or int8
auto it = details::CNNNetworkIterator(network), end = details::CNNNetworkIterator();
for (; it != end; it++) {
if (!LayerInfo(*it).isFakeQuantize()) {
continue;
}

GNAFakeQuantizeLayer fqLayer(*it);
auto inputLayer = fqLayer.getInputLayer();

// this fake quantize represents data quantization - not weights
if (!LayerInfo(inputLayer).isConst()) {
continue;
}
// also in mixed mode i8 should be stated as target precision
if (fqLayer.getLevels() <= std::numeric_limits<uint8_t>::max()) {
config.gnaPrecision = InferenceEngine::Precision::I8;
} else if (fqLayer.getLevels() <= std::numeric_limits<uint16_t>::max()) {
config.gnaPrecision = InferenceEngine::Precision::I16;
} else {
THROW_GNA_LAYER_EXCEPTION(*it)
<< "unsupported quantisation scheme: number of levels is " << fqLayer.getLevels() << " while only up to "
<< std::numeric_limits<uint16_t>::max() << " is supported";
}

gnaFlags->fake_quantized = true;
config.gnaFlags.fake_quantized = true;
}
}

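UpdateGnaQuantModeFromNetwork maps the level count of a weight FakeQuantize to the target GNA precision: up to 255 levels selects I8, up to 65535 selects I16, and anything larger is rejected. A tiny worked example of the same thresholds, independent of the plugin:

    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <stdexcept>

    enum class Precision { I8, I16 };

    // Same thresholds as UpdateGnaQuantModeFromNetwork: the level count of a
    // weight FakeQuantize decides the target GNA precision.
    Precision precisionFromLevels(std::uint32_t levels) {
        if (levels <= std::numeric_limits<std::uint8_t>::max()) {         // <= 255
            return Precision::I8;
        } else if (levels <= std::numeric_limits<std::uint16_t>::max()) { // <= 65535
            return Precision::I16;
        }
        throw std::runtime_error("unsupported quantisation scheme: too many levels");
    }

    int main() {
        std::cout << (precisionFromLevels(255)   == Precision::I8)  << '\n';  // 1: I8 path
        std::cout << (precisionFromLevels(65535) == Precision::I16) << '\n';  // 1: I16 path
        try {
            precisionFromLevels(100000);
        } catch (const std::exception& e) {
            std::cout << e.what() << '\n';
        }
    }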
void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & network) {
void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork& network) {
OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "UpdateInputScaleFromNetwork");
// fp32 emulation mode doesn't need any modifications to the configuration
if (config.gnaFlags.sw_fp32) return;
@@ -480,6 +444,7 @@ void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & networ
if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) {
continue;
}

// replacing scale factor from this fq layer
GNAFakeQuantizeLayer fqLayer(nextToInputLayer.second);
auto inputRange = fqLayer.getInputRange();

@@ -714,12 +679,13 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
}

bool isNgraphPassesUsed = false;

bool fake_quantized = false;
if (_network.getFunction()) {
CNNNetwork clonedNetwork = InferenceEngine::cloneNetwork(_network);
const auto& graph = clonedNetwork.getFunction();
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
fake_quantized = ngraph::op::util::has_op_with_type<ngraph::opset7::FakeQuantize>(graph);
// WA: ConvertPriorBox must be executed before the 1st ConstantFolding pass
manager.register_pass<ngraph::pass::ConvertPriorBox>();
manager.register_pass<ngraph::pass::CommonOptimizations>();
@@ -783,9 +749,9 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
THROW_GNA_EXCEPTION << error.c_str();
}

// FQ networks now replace certain flags in the plugin - the flags will be overwritten
UpdateGnaQuantModeFromNetwork(network);
UpdateInputScaleFromNetwork(network);
if (fake_quantized) {
UpdateInputScaleFromNetwork(network);
}

// Set input and output information from original network
UpdateInputsAndOutputsInfoFromNetwork(network);

@@ -849,19 +815,9 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
// to run all passes, two calls to the pass manager are needed
run_passes(newNet, true, gnaFlags->input_low_precision);
run_passes(newNet, false, gnaFlags->input_low_precision);
} else if (gnaFlags->fake_quantized) {
switch (config.gnaPrecision) {
case Precision::I16:
ModelQuantizer<FakeQuantI16> q16;
newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break;
case Precision::I8:
ModelQuantizer<FakeQuantI8> q8;
newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break;
default:
THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
}
} else if (fake_quantized) {
ModelQuantizer<FakeQuant> modelQuantizer;
newNet = modelQuantizer.quantize(network, run_passes, inputsDesc->inputScaleFactors);
} else {
switch (config.gnaPrecision) {
case Precision::I16:

@@ -211,8 +211,7 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
int idx = 0);

void UpdateFieldsFromConfig();
void UpdateGnaQuantModeFromNetwork(InferenceEngine::CNNNetwork &);
void UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork &);
void UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork& network);
void UpdateInputsAndOutputsInfoFromNetwork(InferenceEngine::CNNNetwork &);
/**
* @brief Tries to init an output on the base of a layer data

@@ -0,0 +1,30 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <vector>

#include "common_test_utils/test_constants.hpp"
#include "subgraph_tests/fq_with_mixed_levels.hpp"

namespace SubgraphTestsDefinitions {
namespace {
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
};

const std::vector<std::map<std::string, std::string>> configs = {
{
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
}
};

INSTANTIATE_TEST_SUITE_P(smoke_FqWithMixedLevelsTest, FqWithMixedLevelsTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs)),
FqWithMixedLevelsTest::getTestCaseName);
} // namespace
} // namespace SubgraphTestsDefinitions

@@ -0,0 +1,18 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#ifndef TEST_FQ_WITH_MIXED_LEVELS_HPP
#define TEST_FQ_WITH_MIXED_LEVELS_HPP

#include "shared_test_classes/subgraph/fq_with_mixed_levels.hpp"

namespace SubgraphTestsDefinitions {

TEST_P(FqWithMixedLevelsTest, CompareWithRefImpl) {
Run();
};

} // namespace SubgraphTestsDefinitions

#endif // TEST_FQ_WITH_MIXED_LEVELS_HPP

@@ -0,0 +1,36 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#ifndef FQ_WITH_MIXED_LEVELS_HPP
#define FQ_WITH_MIXED_LEVELS_HPP

#include <memory>
#include <string>
#include <tuple>
#include <vector>

#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"

namespace SubgraphTestsDefinitions {

typedef std::tuple<
InferenceEngine::Precision, // Network Precision
std::string, // Target Device
std::map<std::string, std::string> // Configuration
> FqWithMixedLevelsParams;

class FqWithMixedLevelsTest : public testing::WithParamInterface<FqWithMixedLevelsParams>,
public LayerTestsUtils::LayerTestsCommon {
public:
static std::string getTestCaseName(const testing::TestParamInfo<FqWithMixedLevelsParams>& obj);

protected:
void SetUp() override;
};

} // namespace SubgraphTestsDefinitions

#endif // FQ_WITH_MIXED_LEVELS_HPP

@@ -0,0 +1,74 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "shared_test_classes/subgraph/fq_with_mixed_levels.hpp"
#include "ngraph_functions/builders.hpp"

namespace SubgraphTestsDefinitions {

std::string FqWithMixedLevelsTest::getTestCaseName(const testing::TestParamInfo<FqWithMixedLevelsParams>& obj) {
InferenceEngine::Precision netPrecision;
std::string targetDevice;
std::map<std::string, std::string> configuration;
std::tie(netPrecision, targetDevice, configuration) = obj.param;

std::ostringstream result;
result << "netPRC=" << netPrecision.name() << "_";
result << "targetDevice=" << targetDevice;
for (auto const& configItem : configuration) {
result << "_configItem=" << configItem.first << "_" << configItem.second;
}
return result.str();
}

void FqWithMixedLevelsTest::SetUp() {
InferenceEngine::Precision netPrecision;
std::map<std::string, std::string> tempConfig;
std::tie(netPrecision, targetDevice, tempConfig) = this->GetParam();
configuration.insert(tempConfig.begin(), tempConfig.end());

auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto unit = [=](const std::shared_ptr<ngraph::Node>& input,
const std::vector<std::vector<size_t>>& shapes,
float weights_min, float weights_max,
size_t level1, const std::vector<std::vector<float>>& data1,
size_t level2, const std::vector<std::vector<float>>& data2,
size_t level3, const std::vector<std::vector<float>>& data3) {
auto sigmoid = std::make_shared<ngraph::opset7::Sigmoid>(input);
auto fake1 = ngraph::builder::makeFakeQuantize(sigmoid, ngPrc, level1, { 1 }, data1[0], data1[1], data1[2], data1[3]);
std::vector<float> weights = CommonTestUtils::generate_float_numbers(shapes[1][0] * shapes[1][1], weights_min, weights_max);
auto constant = std::make_shared<ngraph::opset7::Constant>(ngPrc, ngraph::Shape{shapes[1][0], shapes[1][1]}, weights);
auto fake2 = ngraph::builder::makeFakeQuantize(constant, ngPrc, level2, { 1 }, data2[0], data2[1], data2[2], data2[3]);
auto matmul = ngraph::builder::makeMatMul(fake1, fake2, false, true);
auto bias = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{shapes[0][0], shapes[1][0]}, std::vector<float>{ 1.0 });
auto add = ngraph::builder::makeEltwise(matmul, bias, ngraph::helpers::EltwiseTypes::ADD);
return ngraph::builder::makeFakeQuantize(add, ngPrc, level3, { 1 }, data3[0], data3[1], data3[2], data3[3]);
};

auto params = ngraph::builder::makeParams(ngPrc, {{ 1, 8 }});
auto input = ngraph::builder::makeFakeQuantize(params[0], ngPrc, std::numeric_limits<uint32_t>::max(), { 1 },
{ -10. }, { 10. }, { -10. }, { 10. });
input = unit(input,
{{1, 8}, {8, 8}},
-20., 20.,
std::numeric_limits<uint16_t>::max(), {{ -1.0 }, { 1.0 }, { -1.0 }, { 1.0 }},
std::numeric_limits<uint8_t>::max(), {{ -2.5 }, { 2.5 }, { -2.5 }, { 2.5 }},
std::numeric_limits<uint32_t>::max(), {{ -5. } , { 5. }, { -5. }, { 5. }});
input = unit(input,
{{ 1, 8 }, { 8, 8 }},
-13., 13.,
std::numeric_limits<uint16_t>::max(), {{ -1.0 }, { 1.0 }, { -1.0 }, { 1.0 }},
std::numeric_limits<uint16_t>::max(), {{ -2.5 }, { 2.5 }, { -2.5 }, { 2.5 }},
std::numeric_limits<uint32_t>::max(), {{ -5. } , { 5. }, { -5. }, { 5. }});
input = unit(input,
{{1, 8}, {8, 8}},
-20., 20.,
std::numeric_limits<uint16_t>::max(), {{ -1.0 }, { 1.0 }, { -1.0 }, { 1.0 }},
std::numeric_limits<uint8_t>::max(), {{ -2.5 }, { 2.5 }, { -2.5 }, { 2.5 }},
std::numeric_limits<uint32_t>::max(), {{ -5. } , { 5. }, { -5. }, { 5. }});
auto result = std::make_shared<ngraph::opset7::Result>(input);
function = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, params, "FqWithMixedLevelsTest");
}

} // namespace SubgraphTestsDefinitions