[GNA] Fixed scale factors propagation for Eltwise with very different input ranges (#7305)
* [GNA] Fix scale factors propagation for Eltwise with very different input ranges
* [GNA] Added test
* [GNA] Added exception for scale factor <= 0
* [GNA] Disable tests with integer weights
* [GNA] Added assert for CNNLayer in getScaleFactor()
* [GNA] Added check if scale factor is inf
* [GNA] Fixed legacy tests
parent 5096fe19f1
commit 66a14f1ac3
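The commit message above summarizes the change: per-tensor scale factors are now read through a single validated helper instead of ad-hoc `quantized == nullptr ? 1 : ...` expressions scattered across the graph compiler. A minimal usage sketch (illustration only; the helper and enum names are taken from the diff below, while the surrounding call site and the `layer` variable are assumed):

// Sketch: how a GNA graph-compiler primitive obtains scale factors after this change.
// getScaleFactor() returns 1.0f for a non-quantized (sw_fp32) layer and throws for
// zero, negative, or infinite scale factors.
float weight_scale_factor = GNAPluginNS::getScaleFactor(layer, GNAPluginNS::QuantizedDataType::weights);
float output_scale_factor = GNAPluginNS::getScaleFactor(layer, GNAPluginNS::QuantizedDataType::output);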
@@ -699,5 +699,53 @@ using QuantI8_I8 = frontend::QuantPair<frontend::QuantI8_I8, frontend::QuantI8_I
using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
using FakeQuantI8 = frontend::QuantPair<frontend::FakeQuantI8, frontend::FakeQuantI16>;

enum class QuantizedDataType {
    input,
    output,
    weights,
    bias
};

/**
 * @brief Returns a scale factor for specific layer data
 * @param layer Layer to be quantized
 * @param data_type Type of data to be quantized
 * @return scale factor
 */
inline float getScaleFactor(InferenceEngine::CNNLayerPtr layer, QuantizedDataType data_type) {
    IE_ASSERT(layer != nullptr);
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
    float scale_factor;
    if (!quantized) {
        scale_factor = 1.0f;
    } else {
        switch (data_type) {
        case QuantizedDataType::input:
            scale_factor = quantized->_src_quant.GetScale();
            break;
        case QuantizedDataType::output:
            scale_factor = quantized->_dst_quant.GetScale();
            break;
        case QuantizedDataType::weights:
            scale_factor = quantized->_weights_quant.GetScale();
            break;
        case QuantizedDataType::bias:
            scale_factor = quantized->_bias_quant.GetScale();
            break;
        default:
            THROW_GNA_LAYER_EXCEPTION(layer) << "Unsupported data type for quantization: " << static_cast<int>(data_type);
        }
    }

    auto isZero = [](float p1) {
        return std::abs(p1) <= 0.00001f;
    };

    if (scale_factor < 0.0 || isZero(scale_factor) || std::isinf(scale_factor)) {
        THROW_GNA_LAYER_EXCEPTION(layer) << "Invalid scale factor: " << scale_factor;
    }

    return scale_factor;
}

}  // namespace GNAPluginNS
@@ -490,7 +490,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {

            if ((!fakeQuantize && quantSibling->_dst_quant.IsScaleSet()) ||
                (fakeQuantize && quantSibling->_dst_quant.IsScaleSet() && !fp32eq(quantSibling->_dst_quant.GetScale(), 1.0) &&
                quantSibling->_dst_quant.GetScale() < inputQuant->_dst_quant.GetScale()) || infiniteLoopCount > 0) {
                quantSibling->_dst_quant.GetScale() < inputQuant->_dst_quant.GetScale()) ||
                quantSibling->_dst_quant.IsScaleSet() && infiniteLoopCount > 0) {
                // means we already restarted propagation input memory layer
                // need to search for requantiseable layer prior memory output layer
                InferenceEngine::CNNLayerPtr restartedLayer;
@@ -657,6 +658,73 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {

template<>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 private:
    bool requantizeEltwiseInput(InferenceEngine::EltwiseLayer* eltwiseLayer, uint8_t inputIx, int16_t maxValue,
                                bool fakeQuantize, ScaleFactorUpdateResult &result) {
        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*eltwiseLayer);
        auto in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, inputIx);
        bool has8BOr16BOut = LayerInfo(in).has8BOr16BOutput();
        auto quantParams =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, inputIx));
        // trick to get opposite index (for 0 -> 1 for 1 -> 0) by inversing i.
        auto quantParamsOpposite =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !inputIx));

        while (in && !LayerInfo(in).isInput() && !LayerInfo(in).isMemory() && !LayerInfo(in).isCopy()) {
            auto info = LayerInfo(in);
            if (info.isActivation() || info.isConst()) {
                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                float newOutputScale;
                if (has8BOr16BOut) {
                    newOutputScale = quantParamsOpposite->_dst_quant.GetScale() / maxValue;
                } else {
                    newOutputScale = quantDataForInputLayer->_dst_quant.GetScale() *
                        quantParamsOpposite->_dst_quant.GetScale() * maxValue /
                        quantParams->_dst_quant.GetScale();
                }
                if (info.isActivation() && newOutputScale > static_cast<float>(std::numeric_limits<int16_t>::max()) / 2) {
                    return false;
                }
                gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
                    << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
                    << ", was " << quantDataForInputLayer->_dst_quant.GetScale() <<"\n" << std::flush;
                quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
                result = ScaleFactorUpdateResult(in.get());
                return true;
            }

            if (fakeQuantize && info.isWeightableIdentity()) {
                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                if (!fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
                    auto reducer = quantData->_weights_quant.GetScale() / maxValue;
                    reducer = std::max(1.0f, reducer);
                    auto newWeightsScale = quantDataForInputLayer->_weights_quant.GetScale() / reducer;
                    newWeightsScale = std::max(1.0f, newWeightsScale);
                    quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
                    quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
                        quantDataForInputLayer->_src_quant.GetScale());

                    result = ScaleFactorUpdateResult(in.get());
                    return true;
                }
            }

            // if we are here it means that we are in the port 1
            if (info.isFullyConnected() || info.isConvolution()) {
                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                auto newOutputScale = quantParamsOpposite->_dst_quant.GetScale() * maxValue;
                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.GetScale();
                quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
                quantDataForInputLayer->_weights_quant.SetScale(newWeightScale);
                result = ScaleFactorUpdateResult(in.get());
                return true;
            }

            in = InferenceEngine::CNNNetHasPrevLayer(in.get()) ? InferenceEngine::CNNNetPrevLayer(in) : nullptr;
        }
        return false;
    }

 public:
    bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result,
                    bool fakeQuantize, int infiniteLoopCount) {
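The requantizeEltwiseInput() helper added above factors out the rescaling walk used by the Eltwise scale-factor hunks that follow: when the derived Eltwise weights scale no longer fits the integer range, the pass walks back through the producers of one input and rescales an activation, constant, or FC/convolution output. A compact sketch of the relation it applies (names mirror the code above; this is an illustration, not the full pass):

// Sketch only: mirrors the newOutputScale computation in requantizeEltwiseInput().
// maxValue is int8_max - 1 or int16_max - 1 depending on low-precision mode;
// inputHas8BOr16BOutput refers to the Eltwise input on the branch being rescaled.
float rescaledOutputScale(float producerDst, float oppositeDst, float thisInputDst,
                          float maxValue, bool inputHas8BOr16BOutput) {
    if (inputHas8BOr16BOutput) {
        return oppositeDst / maxValue;  // tie the new scale directly to the opposite input
    }
    // otherwise scale so that the implied Eltwise weight stays within maxValue
    return producerDst * oppositeDst * maxValue / thisInputDst;
}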
@@ -734,7 +802,7 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                }
            }

            if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) {
            if (bestWeightsScale > 0.0f && !fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) {
                quantParams1->_weights_quant.SetScale(bestWeightsScale);
                quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale());
                result = ScaleFactorUpdateResult(in1.get());
@@ -746,79 +814,22 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
            quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale());

            // eltwise will work in int16 or int8 if low precision inputs are used
            auto maxValue = lowPrecision ? (std::numeric_limits<int8_t>::max() - 1) : (std::numeric_limits<int16_t>::max() - 1);
            if (quantData->_weights_quant.GetScale() > maxValue + 1) {
                // rescaling it's activation input
                // iterating thru previous layers of eltwise
                for (uint8_t i = 0; i < 2; ++i) {
                    InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i);
                    bool has8BOr16BOut = LayerInfo(in).has8BOr16BOutput();
                    auto quantParams =
                        InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i));
                    // trick to get opposite index (for 0 -> 1 for 1 -> 0) by inversing i.
                    auto quantParamsOpposite =
                        InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i));

                    for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) {
                        auto info = LayerInfo(in);
                        if (info.isSplit() || info.isSlice() || info.isConcat() || info.isNonFunctional()) {
                            continue;
                        } else if (info.has8BOr16BOutput() && info.isActivation()) {
                            auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                            float newOutputScale;
                            if (has8BOr16BOut) {
                                newOutputScale = quantParamsOpposite->_dst_quant.GetScale() / maxValue;
                            } else {
                                newOutputScale = quantDataForActivation->_dst_quant.GetScale() *
                                    quantParamsOpposite->_dst_quant.GetScale() * maxValue /
                                    quantParams->_dst_quant.GetScale();
                            }
                            if (newOutputScale > static_cast<float>(std::numeric_limits<int16_t>::max()) / 2) {
                                break;
                            }
                            gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
                                << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
                                << ", was " << quantDataForActivation->_dst_quant.GetScale() <<"\n" << std::flush;
                            quantDataForActivation->_dst_quant.SetScale(newOutputScale);
                            result = ScaleFactorUpdateResult(in.get());
                            return true;
                        } else if (info.has8BOr16BOutput()) {
                            break;
                        }

                        if (fakeQuantize && info.isWeightableIdentity()) {
                            auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                            if (!fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
                                auto reducer = quantData->_weights_quant.GetScale() / std::numeric_limits<int16_t>::max();
                                reducer = std::max(1.0f, reducer);
                                auto newWeightsScale = quantDataForInputLayer->_weights_quant.GetScale() / reducer;
                                newWeightsScale = std::max(1.0f, newWeightsScale);
                                quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
                                quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
                                    quantDataForInputLayer->_src_quant.GetScale());

                                result = ScaleFactorUpdateResult(in.get());
                                return true;
                            }
                        }

                        // if we are here it means that we are in the port 1
                        if (info.isFullyConnected() || info.isConvolution()) {
                            auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                            auto newOutputScale = quantParamsOpposite->_dst_quant.GetScale() * maxValue;
                            auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.GetScale();
                            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
                            quantDataForInputLayer->_weights_quant.SetScale(newWeightScale);
                            result = ScaleFactorUpdateResult(in.get());
                            return true;
                        }
                    }
                }
            // we unable to rescale the input - results might be bad
            gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
            auto maxValue = lowPrecision ? std::numeric_limits<int8_t>::max() : std::numeric_limits<int16_t>::max();
            if (quantData->_weights_quant.GetScale() <= maxValue) {
                return true;
            }
            break;

            // rescaling it's activation input
            // iterating thru previous layers of eltwise
            for (uint8_t i = 0; i < 2; ++i) {
                if (requantizeEltwiseInput(eltwiseLayer, i, maxValue - 1, fakeQuantize, result)) {
                    return true;
                }
            }
            // we unable to rescale the input - results might be bad
            gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
            }
            break;
        default : THROW_GNA_EXCEPTION << "Unsupported Eltwise layer for quantisation: " << eltwiseLayer->_operation;
        }
        return true;
@@ -1153,7 +1164,6 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
            }
            quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() / weights_reducer);
        }

        double tmp_dst_quant_scale = quant->_weights_quant.GetScale() * quant->_src_quant.GetScale();
        if (weightsSize == 1) {
            auto itt = thresholds.begin();
@@ -409,13 +409,9 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP
    uint32_t num_bytes_per_weight = convolution._weights->getTensorDesc().getPrecision().size();
    uint32_t num_bytes_per_bias = biasPrecision.size();

    float weight_scale_factor = 1.0f;
    float output_scale_factor = 1.0f;
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(convolution);
    if (quantized != nullptr) {
        weight_scale_factor = quantized->_weights_quant.GetScale();
        output_scale_factor = quantized->_dst_quant.GetScale();
    }
    float weight_scale_factor = getScaleFactor(layer, QuantizedDataType::weights);
    float output_scale_factor = getScaleFactor(layer, QuantizedDataType::output);

    auto& currentComponent = dnnComponents.addComponent(convolution.name, "convolution");
    dnn->InitConvolutional1DComponent(currentComponent,
        num_columns_in,
@@ -586,13 +582,8 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP
        in_height, in_width, in_channels,
        convolution._kernel_y, convolution._kernel_x, filter_n, convolution._stride_y, convolution._stride_x, inputPrec);

    float weight_scale_factor = 1.0f;
    float output_scale_factor = 1.0f;
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(convolution);
    if (quantized != nullptr) {
        weight_scale_factor = quantized->_weights_quant.GetScale();
        output_scale_factor = quantized->_dst_quant.GetScale();
    }
    float weight_scale_factor = getScaleFactor(layer, QuantizedDataType::weights);
    float output_scale_factor = getScaleFactor(layer, QuantizedDataType::output);

    auto& currentComponent = dnnComponents.addComponent(convolution.name, "convolution");
    dnn->InitConvolutional2DComponent(currentComponent,
@@ -673,9 +664,6 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP

void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto& power = dynamic_cast<PowerLayer&>(*layer.get());
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
    IE_ASSERT(gnaFlags->sw_fp32 ? (quantized == nullptr) : (quantized != nullptr));

    if (power.power < 0.0f || power.power > 2.8f) {
        IE_THROW() << "[GNA plugin] unsupported power factor, expected be in <0, 2.8> range but was " << power.power;
    }
@@ -705,6 +693,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {

    auto& currentComponent = dnnComponents.addComponent(layer->name, "power");

    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
    IE_ASSERT(gnaFlags->sw_fp32 ? (quantized == nullptr) : (quantized != nullptr));
    dnn->InitAffineComponent(currentComponent,
        num_rows_in + num_padding,
        num_columns_in,
@@ -764,8 +754,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {

        gna_pwl_segment_t* ptr_pwl_segments_target = nullptr;

        float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
        float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.GetScale() : 1.0f;
        float output_pwl_scale_factor = getScaleFactor(layer, QuantizedDataType::output);
        float input_pwl_scale_factor = getScaleFactor(layer, QuantizedDataType::input);

        if (!gnaFlags->sw_fp32) {
            if (gnaFlags->uniformPwlDesign) {
@@ -823,7 +813,6 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {

void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto& pooling = dynamic_cast<PoolingLayer&>(*layer.get());
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    IE_ASSERT(!layer->insData.empty());
    IE_ASSERT(!layer->outData.empty());
@@ -883,7 +872,7 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
        outputs->getPrecision().size(),
        { pooling._kernel[X_AXIS], pooling._kernel[Y_AXIS] },
        { pooling._stride[X_AXIS], pooling._stride[Y_AXIS] },
        quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
        getScaleFactor(layer, QuantizedDataType::output),
        ptr_inputs,
        ptr_outputs);

@@ -901,8 +890,6 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
}

void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    IE_ASSERT(!layer->insData.empty());
    IE_ASSERT(!layer->outData.empty());
    auto inputs = layer->insData.begin()->lock();
@@ -928,7 +915,7 @@ void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
        num_columns_out,
        inputs->getPrecision().size(),
        outputs->getPrecision().size(),
        quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
        getScaleFactor(layer, QuantizedDataType::output),
        num_rows_out + num_padding_out,
        num_columns_out,
        ptr_inputs,
@@ -1053,7 +1040,6 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
            << axis.size() << ".";
    }

    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
    size_t cropOffset = offset.front() * cropLayer->precision.size();
    size_t cropOutputSize = dim.front() * cropLayer->precision.size();
    const uint32_t noOfInputsDivisor = gnaFlags->input_low_precision ?
@@ -1111,6 +1097,7 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {

        auto& currentComponent = dnnComponents.addComponent(layer->name, "crop");

        auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
        dnn->InitAffineComponent(currentComponent,
            num_rows_in + num_padding,
            num_columns_in,
@@ -1119,8 +1106,8 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
            outputs->getPrecision().size(),
            quantized == nullptr ? inputs->getPrecision().size() : (gnaFlags->input_low_precision ? 1 : 2),
            gnaFlags->input_low_precision ? 1 : 4,
            quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
            quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
            getScaleFactor(layer, QuantizedDataType::weights),
            getScaleFactor(layer, QuantizedDataType::output),
            ptr_inputs,
            ptr_outputs,
            ptr_weights,
@@ -1254,8 +1241,8 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
        // TODO: only fp32 and Int16 tested
        quantized == nullptr ? inputs2Bytes->getPrecision().size() : (gnaFlags->input_low_precision ? 1 : 2),
        quantized == nullptr ? inputs4Bytes->getPrecision().size() : (gnaFlags->input_low_precision ? 1 : 4),
        quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
        quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
        getScaleFactor(layer, QuantizedDataType::weights),
        getScaleFactor(layer, QuantizedDataType::output),
        ptr_inputs,
        ptr_outputs,
        ptr_weights,
@@ -1363,8 +1350,8 @@ void GNAGraphCompiler::GemmPrimitive(InferenceEngine::CNNLayerPtr layer) {
        outputs->getPrecision().size(),
        quantized == nullptr ? input_2->getPrecision().size() : 2,
        quantized == nullptr ? input_2->getPrecision().size() : 4,
        quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
        quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
        getScaleFactor(layer, QuantizedDataType::weights),
        getScaleFactor(layer, QuantizedDataType::output),
        ptr_input_1,
        ptr_outputs,
        ptr_input_2,
@@ -1452,8 +1439,8 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
        outputs->getPrecision().size(),
        weightable._weights->getTensorDesc().getPrecision().size(),
        biasPrecisionSize,
        quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
        quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
        getScaleFactor(layer, QuantizedDataType::weights),
        getScaleFactor(layer, QuantizedDataType::output),
        ptr_inputs,
        ptr_outputs,
        ptr_weights,
@@ -1592,8 +1579,6 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
        return;
    }

    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    void* ptr_inputs = nullptr;
    void* ptr_outputs = nullptr;
    void* ptr_weights = nullptr;
@@ -1632,7 +1617,7 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
            num_columns_in,
            inputs->getPrecision().size(),
            inputs->getPrecision().size(),
            quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
            getScaleFactor(layer, QuantizedDataType::output),
            num_rows_copied,
            num_columns_in,
            ptr_inputs,
@@ -1669,8 +1654,8 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
        outputs->getPrecision().size(),
        filterLayer->_weights->getTensorDesc().getPrecision().size(),
        biasPrecisionSize,
        quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
        quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
        getScaleFactor(layer, QuantizedDataType::weights),
        getScaleFactor(layer, QuantizedDataType::output),
        ptr_inputs,
        ptr_outputs,
        ptr_weights,
@@ -1726,8 +1711,6 @@ void GNAGraphCompiler::ConvolutionFilterPrimitive(InferenceEngine::CNNLayerPtr l
        return;
    }

    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

    auto prevLayer = CNNNetPrevLayer(layer.get(), 0);
    if (!LayerInfo(prevLayer).isSplit() && !LayerInfo(prevLayer).isSlice()) {
        THROW_GNA_EXCEPTION << "Case with Affine Aligning Filter for not Split/Slice layers is not implemented yet!";
@@ -1774,8 +1757,8 @@ void GNAGraphCompiler::ConvolutionFilterPrimitive(InferenceEngine::CNNLayerPtr l
        numberOfFilters,
        filterWidth,
        convolutionStride,
        quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
        quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
        getScaleFactor(layer, QuantizedDataType::weights),
        getScaleFactor(layer, QuantizedDataType::output),
        ptr_inputs,
        ptr_outputs,
        ptr_weights,
@@ -1834,9 +1817,8 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {

    auto inputs = layer->insData.begin()->lock();
    auto outputs = *layer->outData.begin();
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
    float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
    float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.GetScale() : 1.0f;
    float output_pwl_scale_factor = getScaleFactor(layer, QuantizedDataType::output);
    float input_pwl_scale_factor = getScaleFactor(layer, QuantizedDataType::input);

    auto orientation = kDnnInterleavedOrientation;

@@ -1903,6 +1885,7 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
        }
        auto activation_type = DnnActivation::fromType(it->second);
        activation_type.fqParams.set = false;
        auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
        if (quantized != nullptr && quantized->_dst_quant.IsStatsSet()) {
            activation_type.fqParams.set = true;
            activation_type.fqParams.levels = quantized->_dst_quant.GetLevels();
@@ -2044,7 +2027,6 @@ void GNAGraphCompiler::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
        return;
    }
    auto layerOrder = layer->GetParamAsInts("order");
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
    if (layer->insData.empty()) {
        THROW_GNA_LAYER_EXCEPTION(layer) << "Input layer pointer is unexpectedly absent";
    }
@@ -2088,7 +2070,7 @@ void GNAGraphCompiler::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
            squeezedInputOrder[1],
            inputs->getPrecision().size(),
            outputs->getPrecision().size(),
            (quantized == nullptr) ? 1.0f : quantized->_dst_quant.GetScale(),
            getScaleFactor(layer, QuantizedDataType::output),
            ptr_inputs,
            ptr_outputs);
    }
@@ -2103,7 +2085,7 @@ void GNAGraphCompiler::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
            squeezedInputOrder[1],
            inputs->getPrecision().size(),
            outputs->getPrecision().size(),
            quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
            getScaleFactor(layer, QuantizedDataType::output),
            ptr_inputs,
            ptr_outputs);
    }
@@ -2595,4 +2577,4 @@ GNAGraphCompiler::transposeMatrix(uint8_t* ptr_matrix, size_t element_size, uint
        }
    }
    return temp_buffer;
}
}
@@ -0,0 +1,117 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <vector>
#include <memory>
#include <tuple>
#include <vector>
#include <string>

#include <ie_core.hpp>

#include "common_test_utils/common_utils.hpp"
#include "functional_test_utils/plugin_cache.hpp"
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
#include "ngraph_functions/builders.hpp"

#include "ngraph_functions/pass/convert_prc.hpp"

typedef std::tuple<
    InferenceEngine::Precision,          // Network Precision
    std::string,                         // Target Device
    std::map<std::string, std::string>,  // Configuration
    std::pair<float, float>,             // Input min/max values
    std::pair<float, float>              // Constant min/max values
> constInputAddParams;

namespace LayerTestsDefinitions {

class ConstInputAddTest : public testing::WithParamInterface<constInputAddParams>,
                          public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<constInputAddParams> obj) {
        InferenceEngine::Precision netPrecision;
        std::string targetDevice;
        std::map<std::string, std::string> configuration;
        std::pair<float, float> inputRange;
        std::pair<float, float> constRange;
        std::tie(netPrecision, targetDevice, configuration, inputRange, constRange) = obj.param;

        std::ostringstream result;
        result << "netPRC=" << netPrecision.name() << "_";
        result << "targetDevice=" << targetDevice << "_";
        for (auto const& configItem : configuration) {
            result << "_configItem=" << configItem.first << "_" << configItem.second;
        }
        result << "_IR=" << inputRange.first << "," << inputRange.second << "_";
        result << "IR=" << constRange.first << "," << constRange.second;
        return result.str();
    }

    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override {
        return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), inputMax - inputMin, inputMin, (inputMax - inputMin) / 10);
    }

protected:
    void SetUp() override {
        InferenceEngine::Precision netPrecision;
        std::pair<float, float> inputRange;
        std::pair<float, float> constRange;
        std::tie(netPrecision, targetDevice, configuration, inputRange, constRange) = this->GetParam();
        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
        std::tie(inputMin, inputMax) = inputRange;

        ngraph::Shape shape = {1, 72};
        auto params = ngraph::builder::makeParams(ngPrc, { shape });

        auto constant = ngraph::builder::makeConstant<float>(ngPrc, shape, {}, true, constRange.second, constRange.first);
        auto eltwise = ngraph::builder::makeEltwise(constant, params[0], ngraph::helpers::EltwiseTypes::ADD);

        ngraph::ResultVector results{ std::make_shared<ngraph::opset1::Result>(eltwise) };
        function = std::make_shared<ngraph::Function>(results, params, "InputConstAdd");
    }

private:
    float inputMin = 0.0;
    float inputMax = 0.0;
};

TEST_P(ConstInputAddTest, CompareWithRefImpl) {
    Run();
};

const std::vector<InferenceEngine::Precision> netPrecisions = {
    InferenceEngine::Precision::FP32,
    InferenceEngine::Precision::FP16
};

const std::vector<std::map<std::string, std::string>> configs = {
    {
        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
    }
};

const std::vector<std::pair<float, float>> inputRange = {
    {-10, 10},
    {-100, 100}
};

const std::vector<std::pair<float, float>> constRange = {
    {-10, 10},
    {-0.1, 0.1},
    {-1.0e-5, 1.0e-5}
};

INSTANTIATE_TEST_SUITE_P(smoke_const_input_add, ConstInputAddTest,
    ::testing::Combine(
        ::testing::ValuesIn(netPrecisions),
        ::testing::Values(CommonTestUtils::DEVICE_GNA),
        ::testing::ValuesIn(configs),
        ::testing::ValuesIn(inputRange),
        ::testing::ValuesIn(constRange)),
    ConstInputAddTest::getTestCaseName);

}  // namespace LayerTestsDefinitions
@@ -9,8 +9,9 @@ using namespace LayerTestsDefinitions;
namespace {
static const std::vector<ngraph::element::Type> precisionsGNA = {
        ngraph::element::f32,
        ngraph::element::u8,
        ngraph::element::i16,
        // integer weights are not supported by GNA so far
        // ngraph::element::u8,
        // ngraph::element::i16,
};

static const std::vector<std::size_t> batchSizesGNA = {
inference-engine/tests/unit/gna/gna_get_scale_factor.cpp (new file, 54 lines)
@@ -0,0 +1,54 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <vector>
#include <limits>

#include <gtest/gtest.h>
// to suppress deprecated definition errors
#define IMPLEMENT_INFERENCE_ENGINE_PLUGIN
#include "legacy/layer_transform.hpp"
#include "frontend/layer_quantizer.hpp"

namespace {

class GnaGetScaleFactorTest : public ::testing::Test {
protected:
    void GetScaleFactorAndCheck(float src_scale, float dst_scale, float weights_scale, float bias_scale) const {
        InferenceEngine::LayerParams params("fc", "FullyConnected", InferenceEngine::Precision::FP32);
        InferenceEngine::CNNLayerPtr layer = std::make_shared<InferenceEngine::CNNLayer>(params);
        layer = InferenceEngine::injectData<GNAPluginNS::QuantizedLayerParams>(*layer);
        auto quant = InferenceEngine::getInjectedData<GNAPluginNS::QuantizedLayerParams>(*layer);
        quant->_src_quant.SetScale(src_scale);
        quant->_dst_quant.SetScale(dst_scale);
        quant->_weights_quant.SetScale(weights_scale);
        quant->_bias_quant.SetScale(bias_scale);
        ASSERT_EQ(GNAPluginNS::getScaleFactor(layer, GNAPluginNS::QuantizedDataType::input), src_scale);
        ASSERT_EQ(GNAPluginNS::getScaleFactor(layer, GNAPluginNS::QuantizedDataType::output), dst_scale);
        ASSERT_EQ(GNAPluginNS::getScaleFactor(layer, GNAPluginNS::QuantizedDataType::weights), weights_scale);
        ASSERT_EQ(GNAPluginNS::getScaleFactor(layer, GNAPluginNS::QuantizedDataType::bias), bias_scale);
    }
};

TEST_F(GnaGetScaleFactorTest, validSF) {
    EXPECT_NO_THROW(GetScaleFactorAndCheck(100, 200, 300, 400));
}

TEST_F(GnaGetScaleFactorTest, invalidSF) {
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(0, 200, 300, 400));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(100, 0, 300, 400));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(100, 200, 0, 400));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(100, 200, 300, 0));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(-100, 200, 300, 400));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(100, -200, 300, 400));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(100, 200, -300, 400));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(100, 200, 300, -400));
    double inf = std::numeric_limits<float>::infinity();
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(inf, 200, 300, 400));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(100, inf, 300, 400));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(100, 200, inf, 400));
    EXPECT_ANY_THROW(GetScaleFactorAndCheck(100, 200, 300, inf));
}

}  // namespace