[GNA] Fake quantization layer support for int-8 mode for GNA plugin (#2937)

* [GNA] Added support for per-channel FakeQuantize layers

* [GNA] Added detection of quantization types in FQ-enabled networks and detection of input scale factors from FakeQuantize layers connected to input layers

* Added a FakeQuantize callback that is used to cast integer values stored as float in FakeQuantize layers

* Fixed per-channel multiplier calculation for the int8 case

* Precision improvements for int8 fake quantization and support for propagating scale factors to activation layers

* Added initial int16 support

* Added support for FakeQuantize layers with multiple connected output layers and for FQ data encoded as FP16

* Added support for already quantized weights

* Added shared single-layer test

* Added subgraph test

* Fixed comment

* int8

* Enabled FQ tests on GNA

Co-authored-by: Eugene Smirnov <eugene.smirnov@intel.com>
Co-authored-by: Andrey Dmitriev <andrey.dmitriev@intel.com>
Bartosz Sochacki, 2020-11-20 14:40:19 +01:00 (committed by GitHub)
parent 27be33ba53
commit fc1a3ce2f1
25 changed files with 1430 additions and 268 deletions


@@ -45,14 +45,15 @@ struct DnnActivation {
         } pow;
         struct {
             int32_t levels;
-            float input_low;
-            float input_high;
-            float output_low;
-            float output_high;
+            // if input is per-channel quantization - input pointers contains per-channel ranges
+            int8_t inputPerChannel;
+            float *input_low;
+            float *input_high;
+            // if output is per-channel quantization - output pointers contains per-channel ranges
+            int8_t outputPerChannel;
+            float *output_low;
+            float *output_high;
         } fakeQuantize;
-        struct {
-            float reserved[5];
-        };
     } args;
     operator DnnActivationType () const noexcept {
         return type;


@ -15,6 +15,7 @@ struct GNAFlags {
bool uniformPwlDesign = false; bool uniformPwlDesign = false;
bool gna_openmp_multithreading = false; bool gna_openmp_multithreading = false;
bool sw_fp32 = false; bool sw_fp32 = false;
bool fake_quantized = false;
bool performance_counting = false; bool performance_counting = false;
}; };
} // namespace GNAPluginNS } // namespace GNAPluginNS


@@ -83,6 +83,10 @@ struct QuantI8 : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), gna
     }
 };
 
+// for support proper trait instantiation for quantization function callback
+struct FakeQuantI16 : public QuantI16 {};
+struct FakeQuantI8 : public QuantI8 {};
+
 template <class A, class B>
 struct QuantPair {
     using MandatoryType = A;

@@ -115,7 +119,7 @@ inline bool shouldAlwaysAllocate<gna_compound_bias_t>() {
  */
 template <class T>
 class Quant {
  public:
     template<class ...Args>
     void operator()(Args && ... args) const { }
 };

@@ -125,7 +129,9 @@ class Quant<QuantI16> {
  public:
     template<class ...Args>
     void operator()(Args && ... args) const {
-        QuantizeAffine16(std::forward<Args>(args)...);
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
     }
 };

@@ -134,10 +140,35 @@ class Quant<QuantI8> {
  public:
     template<class ...Args>
     void operator()(Args && ... args) const {
-        QuantizeAffine8(std::forward<Args>(args)...);
+        QuantizationCallback<int8_t, gna_compound_bias_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
     }
 };
 
+template<>
+class Quant<FakeQuantI16> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
+template<>
+class Quant<FakeQuantI8> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int8_t, gna_compound_bias_t>{
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
 template <typename T>
 inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) {
     auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision,
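The two new FakeQuant trait tags reuse the int16/int8 descriptors but reroute calls into runFakeQuantize() instead of runQuantize(), purely through template specialization. Below is a minimal, self-contained sketch of that tag-dispatch pattern; Quantizer, Callback, PlainTag and FakeTag are illustrative stand-ins, not plugin types.

#include <cstdio>
#include <utility>

// Stand-in for a quantization callback: one aggregate, two entry points.
struct Callback {
    int levels;
    void runQuantize() const     { std::printf("plain quantization, levels=%d\n", levels); }
    void runFakeQuantize() const { std::printf("fake quantization, levels=%d\n", levels); }
};

struct PlainTag {};             // plays the role of QuantI8/QuantI16
struct FakeTag : PlainTag {};   // plays the role of FakeQuantI8/FakeQuantI16

template <class Tag>
struct Quantizer {              // primary template: no-op, like the generic Quant<T>
    template <class... Args> void operator()(Args&&...) const {}
};

template <>
struct Quantizer<PlainTag> {
    template <class... Args> void operator()(Args&&... args) const {
        Callback{std::forward<Args>(args)...}.runQuantize();
    }
};

template <>
struct Quantizer<FakeTag> {
    template <class... Args> void operator()(Args&&... args) const {
        Callback{std::forward<Args>(args)...}.runFakeQuantize();
    }
};

int main() {
    Quantizer<PlainTag>{}(16);   // routed to runQuantize
    Quantizer<FakeTag>{}(256);   // routed to runFakeQuantize
}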
@@ -242,7 +273,7 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
     if (InferenceEngine::CNNNetHasPrevLayer(wl)) {
         auto quantDataForInputLayer =
             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
-        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
+        input_scale_factor = quantDataForInputLayer->_dst_quant.GetScale();
         if (std::isnan(input_scale_factor) ||
             std::isinf(input_scale_factor)) {
             THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;

@@ -273,17 +304,26 @@
     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
     {
+        auto per_channel_weights = !quantData->_weights_quant.GetMinValues().empty();
+        auto weightsScale = quantData->_weights_quant.GetScale();
+        auto dstScale = quantData->_dst_quant.GetScale();
         fnc(wl->_weights->buffer().as<float *>(),
             wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
             intWeights->buffer(),
             intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
             input_scale_factor,
-            &quantData->_weights_quant.scale,
-            &quantData->_dst_quant.scale,
+            &weightsScale,
+            &dstScale,
             num_rows,
             num_columns,
             num_rows_padded,
-            num_columns_padded);
+            num_columns_padded,
+            quantData->_weights_quant.GetLevels(),
+            nullptr,
+            nullptr,
+            per_channel_weights ? &quantData->_weights_quant.GetMinValues().front(): nullptr,
+            per_channel_weights ? &quantData->_weights_quant.GetMaxValues().front(): nullptr,
+            &quantData->_weights_quantized);
     }
     wl->_weights = intWeights;
     wl->_biases = intBiases;

@@ -343,7 +383,7 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
     if (InferenceEngine::CNNNetHasPrevLayer(conv)) {
         auto quantDataForInputLayer =
             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(conv).get());
-        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
+        input_scale_factor = quantDataForInputLayer->_dst_quant.GetScale();
         if (std::isnan(input_scale_factor) ||
             std::isinf(input_scale_factor)) {
             THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;

@@ -370,13 +410,15 @@
     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*conv);
     {
+        auto weightsScale = quantData->_weights_quant.GetScale();
+        auto dstScale = quantData->_dst_quant.GetScale();
         fnc(conv->_weights->buffer().as<float *>(),
             conv->_biases ? conv->_biases->buffer().as<float *>() : nullptr,
             intWeights->buffer(),
             intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
             input_scale_factor,
-            &quantData->_weights_quant.scale,
-            &quantData->_dst_quant.scale,
+            &weightsScale,
+            &dstScale,
             num_rows,
             num_columns,
             num_rows_padded,

@@ -447,7 +489,7 @@ class DataQuantizer<Desc, InferenceEngine::CNNLayer *> : public DataQuantizerBase {
         if (cnnLayer->blobs["custom"]->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP16) {
             cnnLayer->blobs["custom"] = make_fp32_blob(cnnLayer->blobs["custom"]);
         }
-        auto const_scale_factor = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer)->_dst_quant.scale;
+        auto const_scale_factor = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer)->_dst_quant.GetScale();
         auto new_const_blob = InferenceEngine::Blob::CreateFromData(cnnLayer->outData[0]);
         auto const_blob = cnnLayer->blobs["custom"];
         if (const_blob->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) {

@@ -563,4 +605,9 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
 using QuantI16 = frontend::QuantPair<frontend::QuantI16, frontend::QuantI16>;
 using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;
+
+using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
+using FakeQuantI8 = frontend::QuantPair<frontend::FakeQuantI8, frontend::FakeQuantI16>;
+
 }  // namespace GNAPluginNS


@ -80,7 +80,7 @@ class ModelQuantizer {
THROW_GNA_EXCEPTION << "Scale factors are not set for some of the inputs"; THROW_GNA_EXCEPTION << "Scale factors are not set for some of the inputs";
} }
IE_ASSERT(quantData != nullptr); IE_ASSERT(quantData != nullptr);
quantData->_src_quant.scale = scaleFactor[scaleIndex]; quantData->_src_quant.SetScale(scaleFactor[scaleIndex]);
scaleIndex++; scaleIndex++;
} }


@@ -5,20 +5,91 @@
 #include <cstring>
 #include <iostream>
 #include <details/ie_exception.hpp>
+#include <gna_plugin_log.hpp>
+#include <limits>
 #include "backend/gna_types.h"
 #include "quantization.h"
 
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded) {
+#ifdef DEBUG
+#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
+#else
+#define QUANTWARNING(...)
+#endif
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
+    uint32_t num_saturate = 0;
+
+    for (uint32_t row = 0; row < num_rows; row++) {
+        for (uint32_t col = 0; col < num_columns; col++) {
+            float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[row * num_columns + col];
+            if (!*ptr_quantized_weights) {
+                value = value * *ptr_weight_scale_factor + rounding_value;
+            } else {
+                value -= MAX_VAL_2B_WEIGHT;
+            }
+
+            int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+
+            if (*ptr_quantized_weights &&
+                (value > std::numeric_limits<int16_t>::max() ||
+                 value < std::numeric_limits<int16_t>::min())) {
+                THROW_GNA_EXCEPTION << "unsupported weights range for I16 quantisation: " << value;
+            }
+
+            if (value > std::numeric_limits<int16_t>::max()) {
+                *ptr_weight_16 = std::numeric_limits<int16_t>::max();
+                num_saturate++;
+            } else if (value < std::numeric_limits<int16_t>::min()) {
+                *ptr_weight_16 = std::numeric_limits<int16_t>::min();
+                num_saturate++;
+            } else {
+                *ptr_weight_16 = (int16_t)value;
+            }
+        }
+        for (uint32_t col = num_columns; col < num_columns_padded; col++) {
+            int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            *ptr_weight_16 = 0;
+        }
+    }
+
+    for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+        for (uint32_t col = 0; col < num_columns_padded; col++) {
+            int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            *ptr_weight_16 = 0;
+        }
+    }
+
+    // case for element wise layer
+    if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j] = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j] = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j] = (int32_t)value;
+            }
+        }
+        for (uint32_t j = num_rows; j < num_rows_padded; j++) {
+            ptr_int_biases[j] = 0;
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine16()\n",
+                     num_saturate,
+                     num_rows * num_columns + num_rows);
+    }
+}
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
     uint32_t num_saturate = 0;
 
     if (*ptr_weight_scale_factor == 1.0) {
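In the fake-quantize path above, weights that are not already quantized are scaled, rounded half away from zero (the ±0.5 term), and saturated to the int16 range, with saturations counted for the closing warning. A small standalone sketch of just that arithmetic, using made-up input values:

#include <cstdint>
#include <cstdio>
#include <limits>

// Round half away from zero, then clamp to int16 - the same per-weight
// arithmetic as above (saturations are counted so a warning can be issued).
static int16_t quantize_to_int16(float value, float scale, uint32_t* saturations) {
    float rounding = (value > 0) ? 0.5f : -0.5f;
    float scaled = value * scale + rounding;
    if (scaled > std::numeric_limits<int16_t>::max()) { ++*saturations; return std::numeric_limits<int16_t>::max(); }
    if (scaled < std::numeric_limits<int16_t>::min()) { ++*saturations; return std::numeric_limits<int16_t>::min(); }
    return static_cast<int16_t>(scaled);
}

int main() {
    uint32_t saturations = 0;
    std::printf("%d\n", quantize_to_int16(0.37f, 16384.0f, &saturations));  // 6062
    std::printf("%d\n", quantize_to_int16(2.5f, 16384.0f, &saturations));   // clamps to 32767
    std::printf("saturations: %u\n", saturations);                          // 1
}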
@@ -149,11 +220,90 @@ void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t
     }
 }
 
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
-                     int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor,
-                     float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
-                     uint32_t num_rows_padded, uint32_t num_columns_padded) {
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const {
+    uint32_t num_saturate = 0;
+
+    if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
+        THROW_GNA_EXCEPTION << "Fake quantized output range not set";
+    }
+    if (fq_levels == 0 || fq_levels == 1) {
+        THROW_GNA_EXCEPTION << "Fake quantized levels not set";
+    }
+
+    for (uint32_t i = 0; i < num_rows; i++) {
+        uint32_t channel_multiplier = ((fq_ptr_output_high[i] - fq_ptr_output_low[i]) *
+            *ptr_weight_scale_factor) / (fq_levels - 1) + 0.5f;
+        ptr_int_biases[i].multiplier = static_cast<uint8_t> (channel_multiplier);
+        if (channel_multiplier > MAX_OUT_MULTIPLIER) {
+            THROW_GNA_EXCEPTION << "invalid channel multiplier: " << channel_multiplier;
+        }
+
+        for (uint32_t j = 0; j < num_columns; j++) {
+            auto offset = i * num_columns + j;
+            auto rounding_value = (ptr_float_weights[i * num_columns + j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[offset];
+            if (!*ptr_quantized_weights) {
+                value = value * (*ptr_weight_scale_factor / ptr_int_biases[i].multiplier) + rounding_value;
+            } else {
+                value -= MAX_VAL_1B_WEIGHT;
+            }
+
+            auto normalizedWeight = static_cast<int32_t>(value);
+
+            if (*ptr_quantized_weights &&
+                (value > std::numeric_limits<int8_t>::max() ||
+                 value < std::numeric_limits<int8_t>::min())) {
+                THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantization: " << value;
+            }
+
+            if (value > std::numeric_limits<int8_t>::max()) {
+                normalizedWeight = std::numeric_limits<int8_t>::max();
+                num_saturate++;
+            } else if (value < std::numeric_limits<int8_t>::min()) {
+                normalizedWeight = std::numeric_limits<int8_t>::min();
+                num_saturate++;
+            } else {
+                normalizedWeight = (int8_t)value;
+            }
+
+            // range checking
+            ptr_int_weights[offset] = static_cast<int8_t>(normalizedWeight);
+        }
+
+        for (uint32_t j = num_columns; j < num_columns_padded; j++) {
+            ptr_int_weights[i * num_columns + j] = 0;
+        }
+    }
+
+    for (uint32_t i = num_rows; i < num_rows_padded; i++) {
+        for (uint32_t j = 0; j < num_columns_padded; j++) {
+            ptr_int_weights[i * num_columns + j] = 0;
+        }
+        ptr_int_biases[i].multiplier = 0;
+    }
+
+    if (ptr_float_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j].bias = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j].bias = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j].bias = (int32_t) value;
+            }
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
+    }
+}
+
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
     if (ptr_int_biases == nullptr) {
         THROW_IE_EXCEPTION << "Int biases are empty";
     }


@@ -16,25 +16,34 @@
 #define MAX_VAL_2B_WEIGHT 16384
 #define MAX_VAL_2B_FEAT 16384
 #define MAX_VAL_4B_BIAS 1073741824
-#ifdef DEBUG
-#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
-#else
-#define QUANTWARNING(...)
-#endif
 
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded);
+template <class WeightsType, class BiasType>
+struct QuantizationCallback {
+    float *ptr_float_weights;
+    float *ptr_float_biases;
+    WeightsType* ptr_int_weights;
+    BiasType* ptr_int_biases;
+    float input_scale_factor;
+    float *ptr_weight_scale_factor;
+    float *ptr_output_scale_factor;
+    uint32_t num_rows;
+    uint32_t num_columns;
+    uint32_t num_rows_padded;
+    uint32_t num_columns_padded;
+
+    int32_t fq_levels;
+    const float *fq_ptr_input_low;
+    const float *fq_ptr_input_high;
+    const float *fq_ptr_output_low;
+    const float *fq_ptr_output_high;
+    const bool* ptr_quantized_weights;
+
+    void runQuantize() const;
+    void runFakeQuantize() const;
+};
+
+template class QuantizationCallback<int16_t, int32_t>;
+template class QuantizationCallback<int8_t, gna_compound_bias_t>;
+
 float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
 void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor,
-                     uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded);
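QuantizationCallback is a plain aggregate, so call sites brace-initialize it in member declaration order; reaching the per-channel range pointers therefore requires placeholders (such as the nullptr arguments passed at the weight-quantization call site earlier) for any preceding fake-quantize fields a caller does not use. A reduced illustration with a shortened, hypothetical member list:

#include <cstdint>
#include <cstdio>

// Reduced aggregate in the spirit of QuantizationCallback: brace-initialization
// fills members strictly in declaration order.
struct Callback {
    float scale;
    uint32_t rows;
    int32_t fq_levels;
    const float* fq_input_low;
    const float* fq_input_high;
    const float* fq_output_low;
    const float* fq_output_high;
};

int main() {
    float lows[] = {-1.0f}, highs[] = {1.0f};
    // The two nullptr placeholders stand in for unused input-range pointers,
    // mirroring how the plugin's call site skips fields it does not need.
    Callback cb{2048.0f, 8, 255, nullptr, nullptr, lows, highs};
    std::printf("levels=%d out_range=[%g, %g]\n",
                cb.fq_levels, cb.fq_output_low[0], cb.fq_output_high[0]);
}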


@@ -6,19 +6,57 @@
 namespace GNAPluginNS {
 
-struct Quantization {
+class Quantization {
+public:
+    void SetScale(float s) {
+        scale = s;
+        scale_set = true;
+    }
+    float GetScale() const {
+        return scale;
+    }
+    bool IsScaleSet() const {
+        return scale_set;
+    }
+    void SetLevels(int32_t l) {
+        levels = l;
+    }
+    int32_t GetLevels() const {
+        return levels;
+    }
+    void SetMinValues(const std::vector<float> &min) {
+        min_values.clear();
+        min_values.insert(min_values.end(), min.begin(), min.end());
+    }
+    const std::vector<float>& GetMinValues() const {
+        return min_values;
+    }
+    void SetMaxValues(const std::vector<float>& max) {
+        max_values.clear();
+        max_values.insert(max_values.end(), max.begin(), max.end());
+    }
+    const std::vector<float>& GetMaxValues() const {
+        return max_values;
+    }
+
+private:
     float scale = 1.0f;
-    float offset = 0.0f;
-    int shift = 0.0f;
+    bool scale_set = false;
+    int32_t levels = 0;
+    std::vector<float> min_values;
+    std::vector<float> max_values;
 };
 
 struct QuantizedLayerParams {
     Quantization _src_quant;
     Quantization _dst_quant;
 
-    // deprecate this
     Quantization _weights_quant;
+    bool _weights_quantized = false;
     Quantization _bias_quant;
 
     float _o_shift = 0.0f;
     float _b_shift = 0.0f;
 };
 }  // namespace GNAPluginNS
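One reason for wrapping the bare struct in a class: a float alone cannot distinguish a scale that was never assigned from one explicitly set to 1.0, which is why checks such as !fp32eq(scale, 1) in the scale factor calculation below are replaced with IsScaleSet(). A reduced sketch of that distinction:

#include <cstdio>

class Quantization {
public:
    void SetScale(float s) { scale = s; scale_set = true; }
    float GetScale() const { return scale; }
    bool IsScaleSet() const { return scale_set; }
private:
    float scale = 1.0f;      // default value
    bool scale_set = false;  // remembers whether SetScale was ever called
};

int main() {
    Quantization a, b;
    b.SetScale(1.0f);  // explicitly chosen scale of 1.0
    // A float-only check against 1.0 could not tell these two apart.
    std::printf("a set: %d, b set: %d\n", a.IsScaleSet(), b.IsScaleSet());  // a set: 0, b set: 1
}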


@@ -64,8 +64,9 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
     }
 
     float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer,
-                             GNAPluginNS::LayerInfo const& layer,
-                             QuantizedLayerParams const* quantizedParams) {
+                             GNAPluginNS::LayerInfo const& layer) {
+        auto quantizedParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
+
         // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
         // set the initial value
         float result = activation_scale_factor;

@@ -82,29 +83,29 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
             for (int slope_scale_index = 1; slope_scale_index != 5; slope_scale_index ++) {
                 auto slope_scale = static_cast<double>(static_cast<uint64_t>(1) << (8 * slope_scale_index));
-                auto mink = min_range * slope_scale / quantizedParams->_src_quant.scale;
-                auto maxk = max_range * slope_scale / quantizedParams->_src_quant.scale;
+                auto mink = min_range * slope_scale / quantizedParams->_src_quant.GetScale();
+                auto maxk = max_range * slope_scale / quantizedParams->_src_quant.GetScale();
                 if (mink < std::numeric_limits<int16_t>::max()) {
                     auto localMaxK = std::min(static_cast<double>(std::numeric_limits<int16_t>::max()), maxk);
                     if (localMaxK > optimalK) {
-                        result = localMaxK / slope_scale * quantizedParams->_src_quant.scale;
+                        result = localMaxK / slope_scale * quantizedParams->_src_quant.GetScale();
                         optimalK = localMaxK;
                     }
                 }
             }
 #else
             // GNA scale factor encoding might poor represent target slop scale, we are probing 2 values
-            auto s = gna_slope(1.0, quantizedParams->_src_quant.scale, identity_scale_factor);
+            auto s = gna_slope(1.0, quantizedParams->_src_quant.GetScale(), identity_scale_factor);
             auto scale_default = s.slope * s.slope_scale;
             // probing one more quite good approximation for identity
-            s = gna_slope(1.0, quantizedParams->_src_quant.scale, identity_scale_factor / 2);
+            s = gna_slope(1.0, quantizedParams->_src_quant.GetScale(), identity_scale_factor / 2);
             auto scale_extra = s.slope * s.slope_scale;
             result = fabs(scale_extra) > fabs(scale_default) ? identity_scale_factor / 2 : identity_scale_factor;
 #endif
         } else if (layer.isRelu() &&
-                   static_cast<uint64_t>(activation_scale_factor * quantizedParams->_src_quant.scale)
+                   static_cast<uint64_t>(activation_scale_factor * quantizedParams->_src_quant.GetScale())
                    > std::numeric_limits<int32_t>::max()-1) {
             // if activation is one from relu family, we need to apply heuristic to avoid activation output overflow
             result = (activation_scale_factor * 0.5);

@@ -118,10 +119,10 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
             auto input_max_value = static_cast<double>(std::numeric_limits<int32_t>::max());
             auto output_max_value = static_cast<double>(std::numeric_limits<int16_t>::max());
 
-            auto x_min = fp32eq(fmod(powerLayer->power, 1.0), 0) ? input_min_value / quantizedParams->_src_quant.scale : 0.0;
+            auto x_min = fp32eq(fmod(powerLayer->power, 1.0), 0) ? input_min_value / quantizedParams->_src_quant.GetScale() : 0.0;
             x_min = std::max(x_min, -pow_domain);
 
-            auto x_max = input_max_value / quantizedParams->_src_quant.scale;
+            auto x_max = input_max_value / quantizedParams->_src_quant.GetScale();
             x_max = std::min(x_max, pow_domain);
 
             auto val1 = pow(x_min * powerLayer->scale + powerLayer->offset, powerLayer->power);

@@ -134,6 +135,14 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
                 result = scale_val;
             }
         }
+
+        if (!quantizedParams->_dst_quant.GetMaxValues().empty()) {
+            auto min_value = quantizedParams->_dst_quant.GetMinValues().front();
+            auto max_value = quantizedParams->_dst_quant.GetMaxValues().front();
+            auto newScaleFactor = (quantizedParams->_dst_quant.GetLevels() - 1) / (max_value - min_value);
+            result = newScaleFactor < result ? newScaleFactor : result;
+        }
+
         return result;
     }
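When a FakeQuantize layer has fixed the destination range, the hunk above caps the activation scale factor at (levels - 1) / (max - min), and only ever lowers the heuristic value. A tiny numeric sketch with assumed numbers:

#include <algorithm>
#include <cstdio>

int main() {
    float result = 2048.0f;                 // scale chosen by the existing heuristics (assumed value)
    int levels = 256;                       // hypothetical FQ levels
    float min_value = -1.0f, max_value = 1.0f;
    float newScaleFactor = (levels - 1) / (max_value - min_value);  // 127.5
    result = std::min(newScaleFactor, result);                      // FQ range can only lower the scale
    std::printf("activation scale factor: %.1f\n", result);         // 127.5
}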
@@ -147,12 +156,16 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
         auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
 
         if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) {
-            if (CNNNetHasPrevLayer(cnnLayer)) {
+            if (!CNNNetHasPrevLayer(cnnLayer) && quant->_dst_quant.IsScaleSet()) {
+                quant->_src_quant = quant->_dst_quant;
+            }
+
+            if (CNNNetHasPrevLayer(cnnLayer)) {
                 auto prevLayer = CNNNetPrevLayer(cnnLayer);
                 auto prevInfo = LayerInfo(prevLayer);
                 auto inputQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
                 // locating corresponding memory layers with same ID
-                for (auto && input : CNNNetGetAllInputLayers(cnnLayer)) {
+                for (auto&& input : CNNNetGetAllInputLayers(cnnLayer)) {
                     LayerInfo ll(input);
                     if (!ll.isMemory() ||
                         !InferenceEngine::details::CaselessEq<std::string>()(input->params["id"], cnnLayer->params["id"])) {

@@ -162,35 +175,36 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
                     auto quantSibling = InferenceEngine::getInjectedData<QuantizedLayerParams>(input);
 
                     // after restarting from memory input - quant is fine
-                    if (fp32eq(quantSibling->_dst_quant.scale, inputQuant->_dst_quant.scale)) {
-                        quant->_src_quant.scale = quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
+                    if (fp32eq(quantSibling->_dst_quant.GetScale(), inputQuant->_dst_quant.GetScale())) {
+                        quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
+                        quant->_dst_quant.SetScale(inputQuant->_dst_quant.GetScale());
                         return true;
                     }
 
-                    if (!fp32eq(quantSibling->_dst_quant.scale, 1)) {
+                    if (quantSibling->_dst_quant.IsScaleSet()) {
                         // means we already restarted propagation input memory layer
                         // need to search for requantiseable layer prior memory output layer
                         InferenceEngine::CNNLayerPtr restartedLayer;
 
-                        gnalog() << "Memory layer :"<< input->name << " scale factor: " << quantSibling->_dst_quant.scale
-                                 << " doesn't match its outputs counterpart: " << cnnLayer->name << " scale factor: " << inputQuant->_dst_quant.scale << "\n";
+                        gnalog() << "Memory layer :" << input->name << " scale factor: " << quantSibling->_dst_quant.GetScale()
+                                 << " doesn't match its outputs counterpart: " << cnnLayer->name << " scale factor: " << inputQuant->_dst_quant.GetScale() << "\n";
 
                         gnalog() << "[UFS] searching for quantizeable input layer for: " << cnnLayer->name << "\n";
 
                         CNNNetDFS(InferenceEngine::CNNLayerPtr(cnnLayer, [](InferenceEngine::CNNLayer*) {}),
                                   [&restartedLayer, cnnLayer](InferenceEngine::CNNLayerPtr layer) {
                                       gnalog() << "[UFS] from : " << cnnLayer->name << " reached: " << layer->name;
                                       // found that direct input to concat is a indirect parent of align filter - so no link required
                                       auto info = LayerInfo(layer);
                                       if (!info.isWeightable() && !info.isActivation()) {
                                           gnalog() << "... skipped\n";
                                           return;
                                       }
                                       restartedLayer = layer;
                                       gnalog() << "... OK, need requantize\n";
                                   }, true, [&restartedLayer, &cnnLayer](InferenceEngine::CNNLayer* from) {
                                       // aborting UFS once found suitable layer
                                       return make_upstream_order(restartedLayer == nullptr ? from : nullptr);
                                   });
 
                         if (restartedLayer == nullptr) {
                             THROW_GNA_EXCEPTION << "cannot requantize input to " << cnnLayer->name;

@@ -201,23 +215,23 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
                         auto restarLayerInfo = LayerInfo(restartedLayer);
                         if (restarLayerInfo.isActivation()) {
                             // requantize activation by just changing it's output scale factor
-                            quantDataForMemoryOutput->_dst_quant.scale = quantSibling->_dst_quant.scale;
+                            quantDataForMemoryOutput->_dst_quant.SetScale(quantSibling->_dst_quant.GetScale());
                         } else {
-                            THROW_GNA_EXCEPTION << "quantization error : input scale factor ( " << inputQuant->_dst_quant.scale <<") "
-                                                << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesnt match : "
+                            THROW_GNA_EXCEPTION << "quantization error : input scale factor ( " << inputQuant->_dst_quant.GetScale() << ") "
+                                                << " for " << cnnLayer->name << ", that is child of " << prevLayer->name << " doesnt match : "
                                                 << activation_scale_factor;
                         }
 
                         result = ScaleFactorUpdateResult(restartedLayer.get());
                         return true;
                     }
 
-                    gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.scale <<")"
-                              << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesnt match : "
-                              << activation_scale_factor << ", restarting from corresponding memory: "<< input->name << std::endl;
+                    gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.GetScale() << ")"
+                              << " for " << cnnLayer->name << ", that is child of " << prevLayer->name << " doesnt match : "
+                              << activation_scale_factor << ", restarting from corresponding memory: " << input->name << std::endl;
 
                     // try updating memory input layer scale factor and restart from it
-                    quantSibling->_src_quant.scale = quantSibling->_dst_quant.scale = inputQuant->_dst_quant.scale;
+                    quantSibling->_src_quant = quantSibling->_dst_quant = inputQuant->_dst_quant;
                     result = ScaleFactorUpdateResult(input.get());
                     return true;
                 }
@ -226,11 +240,16 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
} }
if (cnnLayer->type == "Const") { if (cnnLayer->type == "Const") {
if (quant->_dst_quant.IsScaleSet()) {
quant->_src_quant = quant->_dst_quant;
return ScaleFactorUpdateResult();
}
auto blob = cnnLayer->blobs["custom"]; auto blob = cnnLayer->blobs["custom"];
auto blob_precision = blob->getTensorDesc().getPrecision(); auto blob_precision = blob->getTensorDesc().getPrecision();
if (blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16) { if (blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16) {
quant->_dst_quant.scale = 1.0f; quant->_dst_quant.SetScale(1.0f);
return true; return true;
} }
@ -255,16 +274,16 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
// TODO: Investigate what should be the scale in such cases (31910) // TODO: Investigate what should be the scale in such cases (31910)
if (std::isinf(scale_val)) { if (std::isinf(scale_val)) {
quant->_dst_quant.scale = quant->_src_quant.scale; quant->_dst_quant.SetScale(quant->_src_quant.GetScale());
} else { } else {
quant->_dst_quant.scale = scale_val; quant->_dst_quant.SetScale(scale_val);
} }
return ScaleFactorUpdateResult(); return ScaleFactorUpdateResult();
} }
if (!CNNNetHasPrevLayer(cnnLayer)) { if (!CNNNetHasPrevLayer(cnnLayer)) {
quant->_dst_quant.scale = quant->_src_quant.scale; quant->_dst_quant = quant->_src_quant;
return ScaleFactorUpdateResult(); return ScaleFactorUpdateResult();
} }
@@ -273,14 +292,17 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
         if (!inputQuant) {
             THROW_GNA_EXCEPTION << "layer: " << CNNNetPrevLayer(cnnLayer)->name << "not quantized";
         }
-        quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
-        quant->_src_quant.scale = inputQuant->_dst_quant.scale;
+
+        quant->_src_quant = inputQuant->_dst_quant;
         if (layerInfo.isActivation()) {
             // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
             // set the initial value
-            quant->_dst_quant.scale = getActivationScale(cnnLayer, layerInfo, quant);
+            auto scale = getActivationScale(cnnLayer, layerInfo);
+            quant->_dst_quant.SetScale(scale);
+            return true;
         }
+        quant->_dst_quant = inputQuant->_dst_quant;
 
         return true;
     }
 };
@@ -302,8 +324,8 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
         switch (eltwiseLayer->_operation) {
             case InferenceEngine::EltwiseLayer::Prod: {
-                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale;
-                quantData->_dst_quant.scale = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale;
+                quantData->_weights_quant = quantParams1->_dst_quant;
+                quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale() * quantParams1->_dst_quant.GetScale());
                 break;
             }
             case InferenceEngine::EltwiseLayer::Sub:
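For an element-wise product the output scale is simply the product of the two input scales, while the second input's quantization parameters also serve as the "weight" quantization. A one-line numeric illustration with assumed values:

#include <cstdio>

int main() {
    float scale0 = 1024.0f;  // hypothetical scale of eltwise input 0
    float scale1 = 512.0f;   // hypothetical scale of eltwise input 1 (also used as the weight scale)
    std::printf("prod output scale = %.0f\n", scale0 * scale1);  // 524288
}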
@@ -325,13 +347,13 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                 }
 
                 // this path might result in significant data loss
-                quantData->_bias_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
-                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
-                quantData->_dst_quant.scale = quantParams1->_dst_quant.scale;
+                quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
+                quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
+                quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale());
 
                 // eltwise will always work in int16
                 auto maxValue = std::numeric_limits<int16_t>::max() - 1;
-                if (quantData->_weights_quant.scale > maxValue + 1) {
+                if (quantData->_weights_quant.GetScale() > maxValue + 1) {
                     // rescaling it's activation input
                     // iterating thru previous layers of eltwise
                     for (uint8_t i = 0; i < 2; ++i) {

@@ -347,15 +369,15 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                         if (info.isSplit() || info.isSlice()) {
                             continue;
                         } else if (info.has16BOutput() && info.isActivation()) {
-                            auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
+                            auto newOutputScale = quantParams->_dst_quant.GetScale() / maxValue;
                             if (newOutputScale > static_cast<float>(std::numeric_limits<int16_t>::max()) / 2) {
                                 break;
                             }
                             auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                             gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
                                       << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
-                                      << ", was " << quantDataForActivation->_dst_quant.scale <<"\n" << std::flush;
-                            quantDataForActivation->_dst_quant.scale = newOutputScale;
+                                      << ", was " << quantDataForActivation->_dst_quant.GetScale() <<"\n" << std::flush;
+                            quantDataForActivation->_dst_quant.SetScale(newOutputScale);
                             result = ScaleFactorUpdateResult(in.get());
                             return true;
                         } else if (info.has16BOutput()) {

@@ -365,10 +387,10 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                     // if we are here it means that we are in the port 1
                     if (info.isFullyConnected() || info.isConvolution()) {
                         auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
-                        auto newOutputScale = quantParams->_dst_quant.scale * maxValue;
-                        auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale;
-                        quantDataForInputLayer->_dst_quant.scale = newOutputScale;
-                        quantDataForInputLayer->_weights_quant.scale = newWeightScale;
+                        auto newOutputScale = quantParams->_dst_quant.GetScale() * maxValue;
+                        auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.GetScale();
+                        quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+                        quantDataForInputLayer->_weights_quant.SetScale(newWeightScale);
                         result = ScaleFactorUpdateResult(in.get());
                         return true;
                     }
@@ -410,15 +432,15 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         // if all inputs have same quant value - trivial propagation
         auto in0 = inputLayers.front();
         auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
-        auto scaleFactor = quantParams0->_dst_quant.scale;
+        auto scaleFactor = quantParams0->_dst_quant.GetScale();
         auto scaleFactorCheck = [scaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
-            return fp32eq(quantParams->_dst_quant.scale, scaleFactor);
+            return fp32eq(quantParams->_dst_quant.GetScale(), scaleFactor);
         };
 
         if (std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), scaleFactorCheck) == inputLayers.end()) {
-            quantData->_dst_quant.scale = quantParams0->_dst_quant.scale;
-            quantData->_src_quant.scale = quantParams0->_dst_quant.scale;
+            quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale());
+            quantData->_src_quant.SetScale(quantParams0->_dst_quant.GetScale());
             return true;
         }

@@ -435,7 +457,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         auto nextInputIt = firstInputIt + 1;
         while ((nextInputIt = std::find_if(nextInputIt, inputLayers.end(), inputLayerCheck)) != inputLayers.end()) {
             auto quantParamsSecond = InferenceEngine::getInjectedData<QuantizedLayerParams>(*nextInputIt);
-            if (!fp32eq(quantParamsSecond->_dst_quant.scale, quantParamsFirst->_dst_quant.scale)) {
+            if (!fp32eq(quantParamsSecond->_dst_quant.GetScale(), quantParamsFirst->_dst_quant.GetScale())) {
                 THROW_GNA_EXCEPTION << "Two Input layers " << (*firstInputIt)->name
                                     << " and " << (*nextInputIt)->name << " have different scales in concat!!! \n";
             }

@@ -449,7 +471,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         auto sourceLayerCheck = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
             LayerInfo info(inputLayer);
-            return !info.isActivation() && !fp32eq(quantParams->_dst_quant.scale, 1.0f);
+            return !info.isActivation() && !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
         };
 
         static std::map<std::string, size_t> restarted_counter;

@@ -469,7 +491,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         if (sourceLayerIt == inputLayers.end()) {
             auto nonDefaultScaleFactor = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
                 auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
-                return !fp32eq(quantParams->_dst_quant.scale, 1.0f);
+                return !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
             };
 
             sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor);
@@ -478,29 +500,28 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         std::set<size_t> concatIdxToUpdate;
         if (sourceLayerIt != inputLayers.end()) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*sourceLayerIt);
-            auto scaleFactor = quantParams->_dst_quant.scale;
+            auto scaleFactor = quantParams->_dst_quant.GetScale();
             sourceQuantParams = quantParams;
 
             for (auto it = inputLayers.begin(); it != inputLayers.end(); ++it) {
                 auto quantParamsIn = InferenceEngine::getInjectedData<QuantizedLayerParams>(*it);
-                if (fp32eq(quantParamsIn->_dst_quant.scale, scaleFactor)) {
+                if (fp32eq(quantParamsIn->_dst_quant.GetScale(), scaleFactor)) {
                     continue;
                 }
 
                 // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine
-                if (!fp32eq(quantParamsIn->_dst_quant.scale, 1.0f) && !LayerInfo(*it).isActivation()) {
+                if (!fp32eq(quantParamsIn->_dst_quant.GetScale(), 1.0f) && !LayerInfo(*it).isActivation()) {
                     concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it));
                 }
 
-                quantParamsIn->_weights_quant = quantParams->_dst_quant;
-                quantParamsIn->_dst_quant = quantParams->_dst_quant;
+                quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale());
             }
         }
 
-        auto updatedScaleFactor = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0)->_dst_quant.scale;
+        auto updatedScaleFactor = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0)->_dst_quant.GetScale();
         auto equalScaleFactor = [updatedScaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
-            return fp32eq(quantParams->_dst_quant.scale, updatedScaleFactor);
+            return fp32eq(quantParams->_dst_quant.GetScale(), updatedScaleFactor);
         };
 
         auto layerIt = std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), equalScaleFactor);

@@ -508,8 +529,8 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
             THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors" << concatLayer->name;
         }
 
-        quantData->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
-        quantData->_src_quant.scale = sourceQuantParams->_dst_quant.scale;
+        quantData->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
+        quantData->_src_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
 
         if (layerIt == inputLayers.end() && concatIdxToUpdate.empty()) {
             return true;

@@ -517,7 +538,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         for (auto& layerIdToUpdate : concatIdxToUpdate) {
             auto destinationQuantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*concatLayer);
-            destinationQuantParams->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
+            destinationQuantParams->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
 
             InferenceEngine::CNNLayerPtr restartedLayer;
             // making a link activation possible without extra layer if first input to concat not a parent / indirect parent of second input

@@ -542,18 +563,18 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
             });
 
             if (restartedLayer == nullptr) {
-                THROW_GNA_EXCEPTION << "cannot requantize " << layerIdToUpdate << "input to concat: " << concatLayer->name;
+                THROW_GNA_EXCEPTION << "cannot requantize " << layerIdToUpdate << " input to concat: " << concatLayer->name;
             }
 
             auto quantDataForConCatInput = InferenceEngine::getInjectedData<QuantizedLayerParams>(*restartedLayer);
 
             auto restarLayerInfo = LayerInfo(restartedLayer);
             if (restarLayerInfo.isActivation()) {
                 // requantize activation by just changing it's output scale factor
-                quantDataForConCatInput->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
+                quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
             }
             if (restarLayerInfo.isConst()) {
                 gnalog() << "... warning const layer will be requantized\n";
-                quantDataForConCatInput->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
+                quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
             }
 
             result = ScaleFactorUpdateResult(restartedLayer.get());
         }
@@ -588,9 +609,9 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
 
         auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
-        quant->_src_quant.scale = quantDataForInputLayer->_dst_quant.scale;
+        quant->_src_quant = quantDataForInputLayer->_dst_quant;
         // TODO: pass 8 bits somehow
-        if (quant->_weights_quant.scale == 1.0f) {
+        if (quant->_weights_quant.GetScale() == 1.0f) {
             size_t scaleRange = 0;
             if (weightsSize == 2) {
                 scaleRange = MAX_VAL_2B_WEIGHT;

@@ -599,67 +620,61 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
             } else {
                 THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
             }
-            quant->_weights_quant.scale =
-                ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size());
-            if (quant->_weights_quant.scale == -1.0f) {
-                quant->_weights_quant.scale = 1.0f;
+            quant->_weights_quant.SetScale(
+                ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size()));
+            if (quant->_weights_quant.GetScale() == -1.0f) {
+                quant->_weights_quant.SetScale(1.0f);
             }
 
             if (wl->_biases) {
-                quant->_bias_quant.scale = ScaleFactorForQuantization(wl->_biases->buffer().as<float *>(),
-                                                                      MAX_VAL_4B_BIAS,
-                                                                      wl->_biases->size());
-                if (quant->_bias_quant.scale != -1.0f) {
-                    quant->_bias_quant.scale = std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
-                    quant->_weights_quant.scale = quant->_bias_quant.scale / quant->_src_quant.scale;
+                quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as<float *>(),
+                                                                       MAX_VAL_4B_BIAS,
+                                                                       wl->_biases->size()));
+                if (quant->_bias_quant.GetScale() != -1.0f) {
+                    quant->_bias_quant.SetScale(
+                        std::min(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(), quant->_bias_quant.GetScale()));
+                    quant->_weights_quant.SetScale(quant->_bias_quant.GetScale() / quant->_src_quant.GetScale());
                 }
             }
 
             // TODO: findout why ???
             if (weightsSize == 1) {
-                quant->_weights_quant.scale *= MAX_OUT_MULTIPLIER;
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * MAX_OUT_MULTIPLIER);
             }
 
             double weights_reducer = 1.0;
-            auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer*>(wl);
+            auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer *>(wl);
             if (conv) {
                 auto dims = conv->insData.front().lock()->getDims();
 
                 weights_reducer = MAX_VAL_2B_FEAT * scaleRange * dims[1] / std::numeric_limits<int32_t>::max();
                 weights_reducer = std::max(1.0, weights_reducer);
             }
-            quant->_weights_quant.scale /= weights_reducer;
+            quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() / weights_reducer);
         }
 
-        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
+        double tmp_dst_quant_scale = quant->_weights_quant.GetScale() * quant->_src_quant.GetScale();
 
         if (weightsSize == 1 &&
-            static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
-            static_cast<uint64_t>(std::numeric_limits<int32_t>::max()-1) * _scale_change_req_threshold) {
+            static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.GetScale()) >
+            static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1) * _scale_change_req_threshold) {
             gnawarn() << "Output scale for " << wl->name
                       << " too large and are being reduced. Else saturations likely will happen \n";
             // reduce weight scale according experimental heuristic
-            if (quant->_dst_quant.scale * quant->_src_quant.scale /
-                static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_100) {
-                quant->_weights_quant.scale *= _scale_reduction_50;
-                tmp_dst_quant_scale *= _scale_reduction_50;
-            } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
-                       static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_150) {
-                quant->_weights_quant.scale *= _scale_reduction_45;
-                tmp_dst_quant_scale *= _scale_reduction_45;
-            } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
-                       static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_200) {
-                quant->_weights_quant.scale *= _scale_reduction_40;
-                tmp_dst_quant_scale *= _scale_reduction_40;
+            if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
+                static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_100) {
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_50);
+            } else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
+                       static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_150) {
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_45);
+            } else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
+                       static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_200) {
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_40);
             } else {
-                quant->_weights_quant.scale *= _scale_reduction_35;
-                tmp_dst_quant_scale *= _scale_reduction_35;
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_35);
             }
         }
 
-        quant->_dst_quant.scale = tmp_dst_quant_scale;
+        quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
 
         return true;
     }
 };
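For weightable layers the destination scale ends up as the weight scale times the source scale, and the staged reductions above only trigger for int8 weights when that product, scaled once more by the input, would exceed the int32 accumulator range (the _scale_change_req_threshold factor is omitted in this sketch). A numeric illustration with made-up scales:

#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    float src_scale = 16384.0f;      // example input scale factor
    float weights_scale = 64.0f;     // example weight scale factor
    float dst_scale = weights_scale * src_scale;   // 1048576: scale of the int32 accumulator output
    // Simplified overflow check in the spirit of the heuristic above.
    bool risky = static_cast<uint64_t>(dst_scale * src_scale) >
                 static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1);
    std::printf("dst scale = %.0f, needs reduction: %s\n", dst_scale, risky ? "yes" : "no");
}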


@@ -31,6 +31,7 @@
 #include "layers/layers_builder.hpp"
 #include "layers/gna_concat_layer.hpp"
 #include "layers/gna_crop_layer.hpp"
+#include "layers/gna_fake_quantize_layer.hpp"
 #include "round_float_define.hpp"
 #include "gna_plugin_policy.hpp"

@@ -377,8 +378,8 @@ void GNAGraphCompiler::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer)
     float output_scale_factor = 1.0f;
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
     if (quantized != nullptr) {
-        weight_scale_factor = quantized->_weights_quant.scale;
-        output_scale_factor = quantized->_dst_quant.scale;
+        weight_scale_factor = quantized->_weights_quant.GetScale();
+        output_scale_factor = quantized->_dst_quant.GetScale();
     }
 
     auto& currentComponent = dnnComponents.addComponent(layer->name, "convolution");

@@ -541,8 +542,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
         // TODO: only fp32 and Int16 tested
         quantized == nullptr ? input->getPrecision().size() : 2,
         quantized == nullptr ? input->getPrecision().size() : 4,
-        quantized == nullptr ? 1 : quantized->_weights_quant.scale,
-        quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+        quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
+        quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
         ptr_inputs,
         ptr_outputs,
         ptr_weights,

@@ -558,9 +559,9 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
         gnamem->readonly().push_value(ptr_biases, power.offset, num_rows_out, 64);
     } else {
         IE_ASSERT(quantized != nullptr);
-        auto quantizedScale = FLOAT_TO_INT16(std::min(quantized->_weights_quant.scale * power.scale,
+        auto quantizedScale = FLOAT_TO_INT16(std::min(quantized->_weights_quant.GetScale() * power.scale,
             static_cast<float>(INT16_MAX)));
-        auto quantizedOffset = FLOAT_TO_INT32(std::min(quantized->_dst_quant.scale * power.offset,
+        auto quantizedOffset = FLOAT_TO_INT32(std::min(quantized->_dst_quant.GetScale() * power.offset,
             static_cast<float>(INT32_MAX)));
         gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedScale, num_rows_out, 64);
         gnamem->readonly().push_value<int32_t>(ptr_biases, quantizedOffset, num_rows_out, 64);

@@ -580,8 +581,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
     gna_pwl_segment_t* ptr_pwl_segments_target = nullptr;
 
-    float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
-    float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.scale : 1.0f;
+    float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
+    float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.GetScale() : 1.0f;
 
     if (!gnaFlags->sw_fp32) {
         if (gnaFlags->uniformPwlDesign) {
@ -687,7 +688,7 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
pooling._kernel[X_AXIS], pooling._kernel[X_AXIS],
num_columns_in, num_columns_in,
false, false,
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs); ptr_outputs);
@ -727,7 +728,7 @@ void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
num_columns_out, num_columns_out,
inputs->getPrecision().size(), inputs->getPrecision().size(),
outputs->getPrecision().size(), outputs->getPrecision().size(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
num_rows_out + num_padding_out, num_rows_out + num_padding_out,
num_columns_out, num_columns_out,
ptr_inputs, ptr_inputs,
@ -915,8 +916,8 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
4, 4,
quantized == nullptr ? inputs->getPrecision().size() : 2, quantized == nullptr ? inputs->getPrecision().size() : 2,
4, 4,
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1028,8 +1029,8 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
// TODO: only fp32 and Int16 tested // TODO: only fp32 and Int16 tested
quantized == nullptr ? inputs2Bytes->getPrecision().size() : 2, quantized == nullptr ? inputs2Bytes->getPrecision().size() : 2,
quantized == nullptr ? inputs4Bytes->getPrecision().size() : 4, quantized == nullptr ? inputs4Bytes->getPrecision().size() : 4,
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1050,7 +1051,7 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
if (quantized == nullptr) { if (quantized == nullptr) {
gnamem->readonly().push_value(ptr_weights, -1.0f, num_rows_out, 64); gnamem->readonly().push_value(ptr_weights, -1.0f, num_rows_out, 64);
} else { } else {
auto scaledIdentity = -quantized->_weights_quant.scale; auto scaledIdentity = -quantized->_weights_quant.GetScale();
auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX))); auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
@ -1062,7 +1063,7 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
if (quantized == nullptr) { if (quantized == nullptr) {
gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64); gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64);
} else { } else {
auto scaledIdentity = quantized->_weights_quant.scale; auto scaledIdentity = quantized->_weights_quant.GetScale();
auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX))); auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
@ -1132,8 +1133,8 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
outputs->getPrecision().size(), outputs->getPrecision().size(),
weightable._weights->getTensorDesc().getPrecision().size(), weightable._weights->getTensorDesc().getPrecision().size(),
biasPrecision.size(), biasPrecision.size(),
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1310,7 +1311,7 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
num_columns_in, num_columns_in,
inputs->getPrecision().size(), inputs->getPrecision().size(),
inputs->getPrecision().size(), inputs->getPrecision().size(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
num_rows_copied, num_rows_copied,
num_columns_in, num_columns_in,
ptr_inputs, ptr_inputs,
@ -1346,8 +1347,8 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
outputs->getPrecision().size(), outputs->getPrecision().size(),
filterLayer->_weights->getTensorDesc().getPrecision().size(), filterLayer->_weights->getTensorDesc().getPrecision().size(),
biasPrecision.size(), biasPrecision.size(),
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1436,8 +1437,8 @@ void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer)
outputs->getPrecision().size(), outputs->getPrecision().size(),
filterLayer->_weights->getTensorDesc().getPrecision().size(), filterLayer->_weights->getTensorDesc().getPrecision().size(),
biasPrecision.size(), biasPrecision.size(),
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1517,13 +1518,14 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
} }
} while (false); } while (false);
IE_ASSERT(!layer->insData.empty()); GNA_LAYER_ASSERT(layer, !layer->insData.empty());
IE_ASSERT(!layer->outData.empty()); GNA_LAYER_ASSERT(layer, !layer->outData.empty());
auto inputs = layer->insData.begin()->lock(); auto inputs = layer->insData.begin()->lock();
auto outputs = *layer->outData.begin(); auto outputs = *layer->outData.begin();
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer); auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.scale : 1.0f; float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.GetScale() : 1.0f;
auto orientation = kDnnInterleavedOrientation; auto orientation = kDnnInterleavedOrientation;
@ -1588,39 +1590,7 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
} }
if (it->second == kActFakeQuantize) { if (it->second == kActFakeQuantize) {
// get params from const input activation_type = GNAFakeQuantizeLayer(layer).parseAsActivation();
auto GetParamFromInputAsFloat = [](CNNLayerPtr input, size_t idx) {
if (input->insData.size() <= idx) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << "input";
}
auto iLayerData = input->insData[idx].lock();
if (!iLayerData) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << ", input: cannot dereference data weak-pointer";
}
auto iLayer = getCreatorLayer(iLayerData).lock();
if (!iLayer) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << ", input: cannot dereference creator layer weak-pointer";
}
if (!LayerInfo(iLayer).isConst()) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << ", input: expected to be of type const, but was: " << iLayer->type;
}
if (!iLayer->blobs.count("custom")) {
THROW_GNA_LAYER_EXCEPTION(iLayer) << "cannot get custom blob";
}
auto data = iLayer->blobs["custom"];
if (data->getTensorDesc().getPrecision() != Precision::FP32) {
THROW_GNA_LAYER_EXCEPTION(iLayer) << "cannot cast custom blob to type FP32, since it is of type: " << data->getTensorDesc().getPrecision();
}
return data->cbuffer().as<float*>()[0];
};
activation_type.args.fakeQuantize.levels = layer->GetParamAsInt("levels");
activation_type.args.fakeQuantize.input_low = GetParamFromInputAsFloat(layer, 1);
activation_type.args.fakeQuantize.input_high = GetParamFromInputAsFloat(layer, 2);
activation_type.args.fakeQuantize.output_low = GetParamFromInputAsFloat(layer, 3);
activation_type.args.fakeQuantize.output_high = GetParamFromInputAsFloat(layer, 4);
} }
string actName = "unknown"; string actName = "unknown";
@ -1759,7 +1729,7 @@ void GNAGraphCompiler::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
squeezedInputOrder[1], squeezedInputOrder[1],
inputs->getPrecision().size(), inputs->getPrecision().size(),
outputs->getPrecision().size(), outputs->getPrecision().size(),
(quantized == nullptr) ? 1.0f : quantized->_dst_quant.scale, (quantized == nullptr) ? 1.0f : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs); ptr_outputs);
} }
@ -1774,7 +1744,7 @@ void GNAGraphCompiler::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
squeezedInputOrder[1], squeezedInputOrder[1],
inputs->getPrecision().size(), inputs->getPrecision().size(),
outputs->getPrecision().size(), outputs->getPrecision().size(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs); ptr_outputs);
} }


@ -37,6 +37,7 @@
#include "memory/gna_memory_state.hpp" #include "memory/gna_memory_state.hpp"
#include "gna_model_serial.hpp" #include "gna_model_serial.hpp"
#include "runtime/gna_float_runtime.hpp" #include "runtime/gna_float_runtime.hpp"
#include <layers/gna_fake_quantize_layer.hpp>
#include <generic_ie.hpp> #include <generic_ie.hpp>
#include <ngraph/pass/manager.hpp> #include <ngraph/pass/manager.hpp>
@ -351,6 +352,87 @@ void GNAPlugin::InitGNADevice() {
graphCompiler.setGNAMemoryPtr(gnamem); graphCompiler.setGNAMemoryPtr(gnamem);
} }
void GNAPlugin::UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork & network) {
// fp32 emulation mode doesn't need any modifications to the configuration
if (config.gnaFlags.sw_fp32) return;
// search for FQ layers
// only supports cases of int16 or int8
auto it = details::CNNNetworkIterator(&network);
auto end = details::CNNNetworkIterator();
for (; it != end; it++) {
if (!LayerInfo(*it).isFakeQuantize()) {
continue;
}
GNAFakeQuantizeLayer fqLayer(*it);
auto inputLayer = fqLayer.getInputLayer();
// this fake quantize represents data quantization - not weights
if (!LayerInfo(inputLayer).isConst()) {
continue;
}
// also in mixed mode i8 should be stated as target precision
if (fqLayer.getLevels() <= std::numeric_limits<uint8_t>::max()) {
config.gnaPrecision = InferenceEngine::Precision::I8;
} else if (fqLayer.getLevels() <= std::numeric_limits<uint16_t>::max()) {
config.gnaPrecision = InferenceEngine::Precision::I16;
} else {
THROW_GNA_LAYER_EXCEPTION(*it)
<< "unsupported quantisation scheme: number of levels is " << fqLayer.getLevels() << " while only up to "
<< std::numeric_limits<uint16_t>::max() << " is supported";
}
gnaFlags->fake_quantized = true;
config.gnaFlags.fake_quantized = true;
}
}
void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork & network) {
// fp32 emulation mode doesn't need any modifications to the configuration
if (config.gnaFlags.sw_fp32) return;
// search for FQ layers
// only supports cases of int16 or int8
InputsDataMap inputs;
network.getInputsInfo(inputs);
for (auto && input : inputs) {
auto data = input.second->getInputData();
size_t inputIdx = 0;
for (auto && nextToInputLayer : getInputTo(data)) {
if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) {
inputIdx++;
continue;
}
// replacing scale factor from this fq layer
GNAFakeQuantizeLayer fqLayer(nextToInputLayer.second);
auto inputRange = fqLayer.getInputRange();
auto outputRange = fqLayer.getOutputRange();
if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
    outputRange.first.size() != 1 || outputRange.second.size() != 1) {
    THROW_GNA_LAYER_EXCEPTION(nextToInputLayer.second)
        << "unsupported per-channel quantization for input layer: " << input.second->name();
}
float scaleInput = (fqLayer.getLevels() - 1) / (inputRange.second[0] - inputRange.first[0]);
if (!config.inputScaleFactors.empty()) {
gnalog() << "Scale factor calculated during model quantization (" << scaleInput
<< ") will be used instead of user input (" << inputsDesc->inputScaleFactors[inputIdx] << ").\n";
if (inputsDesc->inputScaleFactors[inputIdx] < scaleInput) {
gnawarn() << "WARNING: Scale factor calculated based on input values (" << inputsDesc->inputScaleFactors[inputIdx]
<< ") is smaller than scale factor used to quantize model (" << scaleInput << "). "
<< "Input values will be clamped.\n";
}
}
config.inputScaleFactors[inputIdx] = scaleInput;
inputsDesc->inputScaleFactors[inputIdx] = scaleInput;
inputIdx++;
}
}
}
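A worked example of the scale-factor formula used in UpdateInputScaleFromNetwork above; the numbers are assumed (levels = 256, input range [-1.0, 1.0]) and are only for illustration.

    // Worked example of scale = (levels - 1) / (max - min); all values are assumed.
    inline float exampleInputScale() {
        const float levels = 256.0f;
        const float inputLow = -1.0f;
        const float inputHigh = 1.0f;
        return (levels - 1) / (inputHigh - inputLow);  // 255 / 2 = 127.5
    }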
void GNAPlugin::LoadNetwork(ICNNNetwork & _network) { void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
std::shared_ptr<InferenceEngine::details::CNNNetworkImpl> convertedNetwork; std::shared_ptr<InferenceEngine::details::CNNNetworkImpl> convertedNetwork;
if (_network.getFunction()) { if (_network.getFunction()) {
@ -390,6 +472,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
THROW_GNA_EXCEPTION << error.c_str(); THROW_GNA_EXCEPTION << error.c_str();
} }
// FQ networks now replace certain flags in the plugin - these flags will be overwritten
UpdateGnaQuantModeFromNetwork(network);
UpdateInputScaleFromNetwork(network);
// network optimisation phases // network optimisation phases
int passIdx = 0; int passIdx = 0;
auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) { auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) {
@ -401,6 +487,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
passes->registerPass<UnrollLSTMCellPass>(); passes->registerPass<UnrollLSTMCellPass>();
passes->registerPass<RemoveSingleInputConcatPass>(); passes->registerPass<RemoveSingleInputConcatPass>();
// fake quantisation aware passes
passes->registerPass<FuseFQIntoWeightsPass>();
passes->registerPass<MoveFakeQuantizeLayerIntoQuantParamsPass>();
passes->registerPass<SubstitutePReluPass>(); passes->registerPass<SubstitutePReluPass>();
passes->registerPass<SubstituteSoftSignPass>(); passes->registerPass<SubstituteSoftSignPass>();
@ -441,6 +531,19 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
// to run all passes need to have two calls to pass manager // to run all passes need to have two calls to pass manager
run_passes(newNet, true); run_passes(newNet, true);
run_passes(newNet, false); run_passes(newNet, false);
} else if (gnaFlags->fake_quantized) {
switch (config.gnaPrecision) {
case Precision::I16:
ModelQuantizer<FakeQuantI16> q16;
newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break;
case Precision::I8:
ModelQuantizer<FakeQuantI8> q8;
newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break;
default:
THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
}
} else { } else {
switch (config.gnaPrecision) { switch (config.gnaPrecision) {
case Precision::I16: case Precision::I16:
@ -452,8 +555,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors); newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break; break;
default: default:
THROW_GNA_EXCEPTION << "no mans land for GNA precision"; THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
break;
} }
} }
@ -470,7 +572,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
return; return;
} }
printed_properties.emplace_back( printed_properties.emplace_back(
"scale factor", std::to_string(quantized->_dst_quant.scale)); "scale factor", std::to_string(quantized->_dst_quant.GetScale()));
}); });
#endif #endif
@ -564,7 +666,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num); desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num);
desc.orientation = component.orientation_out; desc.orientation = component.orientation_out;
desc.num_bytes_per_element = component.num_bytes_per_output; desc.num_bytes_per_element = component.num_bytes_per_output;
desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
// TODO: this need to be fixed // TODO: this need to be fixed
desc.num_elements = component.num_rows_out; desc.num_elements = component.num_rows_out;
@ -623,7 +725,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
// TODO: what is orientation for concat // TODO: what is orientation for concat
desc.orientation = kDnnInterleavedOrientation; desc.orientation = kDnnInterleavedOrientation;
desc.num_bytes_per_element = layer->outData.front()->getPrecision().size(); desc.num_bytes_per_element = layer->outData.front()->getPrecision().size();
desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element; desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element;
// binding ptr for first infer request - then others will be setup during relocation // binding ptr for first infer request - then others will be setup during relocation


@ -219,6 +219,8 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
int idx = 0); int idx = 0);
void UpdateFieldsFromConfig(); void UpdateFieldsFromConfig();
void UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork &);
void UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork &);
}; };
} // namespace GNAPluginNS } // namespace GNAPluginNS


@ -72,5 +72,5 @@ if (!(expr)) { \
} }
#define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": " #define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": "
#define THROW_GNA_LAYER_EXCEPTION(layer) THROW_GNA_EXCEPTION << LAYER_NAME(layer) #define THROW_GNA_LAYER_EXCEPTION(layer) THROW_GNA_EXCEPTION << LAYER_NAME(layer)
#define LAYER_NAME(layer) layer->type << " layer : \"" << layer->name << "\" " #define LAYER_NAME(layer) (layer)->type << " layer : \"" << (layer)->name << "\" "


@ -0,0 +1,164 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "gna_layer_info.hpp"
#include "gna_plugin_log.hpp"
#include "gna_layer_helpers.hpp"
#include "frontend/weights_converter.hpp"
#include <ie_algorithm.hpp>
namespace GNAPluginNS {
class GNAFakeQuantizeLayer {
InferenceEngine::CNNLayerPtr fqLayer;
public :
GNAFakeQuantizeLayer(InferenceEngine::CNNLayerPtr fqLayer)
: fqLayer(fqLayer) {
if (!LayerInfo(fqLayer).isFakeQuantize()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << "cannot parse as fake quantize";
}
}
/**
* @brief convert FQ layer directly to gna-pwl activation layer
*/
DnnActivation parseAsActivation() const {
DnnActivation fqActivation;
fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels");
auto inputShape = getShapeForRange(fqLayer, 1);
auto outputShape = getShapeForRange(fqLayer, 3);
// TODO: check shapes broadcasting to shape of input at 0
auto inputRangeSize = InferenceEngine::details::product(inputShape.begin(), inputShape.end());
auto outputRangeSize = InferenceEngine::details::product(outputShape.begin(), outputShape.end());
fqActivation.args.fakeQuantize.inputPerChannel = inputRangeSize != 1;
fqActivation.args.fakeQuantize.input_low = getParamFromInputAsFloats(fqLayer, 1);
fqActivation.args.fakeQuantize.input_high = getParamFromInputAsFloats(fqLayer, 2);
fqActivation.args.fakeQuantize.outputPerChannel = outputRangeSize != 1;
fqActivation.args.fakeQuantize.output_low = getParamFromInputAsFloats(fqLayer, 3);
fqActivation.args.fakeQuantize.output_high = getParamFromInputAsFloats(fqLayer, 4);
fqActivation.type = kActFakeQuantize;
return fqActivation;
}
/**
* retrieves the input blob of the FQ layer that is connected to a const layer
*/
InferenceEngine::Blob::Ptr getConstInputData() const {
return LayerUtils::getParamFromInputAsBlob(fqLayer, 0);
}
/**
* fake quantize has 5 inputs; 4 of them are always constant layers, and 1 might be a tensor connection
*/
InferenceEngine::CNNLayerPtr getInputLayer() const {
return getInputLayerAt(fqLayer, 0);
}
int32_t getLevels() {
return fqLayer->GetParamAsInt("levels");
}
std::pair<std::vector<float>, std::vector<float>> getInputRange() {
return getRange(fqLayer, 1);
}
std::pair<std::vector<float>, std::vector<float>> getOutputRange() {
return getRange(fqLayer, 3);
}
operator InferenceEngine::CNNLayerPtr () const {
return fqLayer;
}
InferenceEngine::CNNLayerPtr operator -> () const {
return fqLayer;
}
InferenceEngine::CNNLayerPtr operator * () const {
return fqLayer;
}
protected :
static std::pair<std::vector<float>, std::vector<float>> getRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
auto shape = getShapeForRange(input, idx);
auto rangeSize = InferenceEngine::details::product(shape.begin(), shape.end());
auto dataMin = LayerUtils::getParamFromInputAsBlob(input, idx);
auto dataMax = LayerUtils::getParamFromInputAsBlob(input, idx + 1);
std::vector<float> minValues(rangeSize), maxValues(rangeSize);
switch (dataMin->getTensorDesc().getPrecision()) {
case InferenceEngine::Precision::FP32: {
memcpy(&minValues[0], dataMin->buffer().as<float*>(), rangeSize * sizeof(float));
memcpy(&maxValues[0], dataMax->buffer().as<float*>(), rangeSize * sizeof(float));
break;
}
case InferenceEngine::Precision::FP16: {
auto dataMinFP32 = make_fp32_blob(dataMin);
memcpy(&minValues[0], dataMinFP32->buffer().as<float*>(), rangeSize * sizeof(float));
auto dataMaxFP32 = make_fp32_blob(dataMax);
memcpy(&maxValues[0], dataMaxFP32->buffer().as<float*>(), rangeSize * sizeof(float));
break;
}
default:
THROW_GNA_LAYER_EXCEPTION(input) << "cannot cast custom blob to type FP32, since it is of type: "
<< dataMin->getTensorDesc().getPrecision();
break;
}
return {minValues, maxValues};
}
static float* getParamFromInputAsFloats(InferenceEngine::CNNLayerPtr input, size_t idx) {
auto data = LayerUtils::getParamFromInputAsBlob(input, idx);
if (data->getTensorDesc().getPrecision() != InferenceEngine::Precision::FP32) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot cast custom blob to type FP32, since it is of type: "
<< data->getTensorDesc().getPrecision();
}
return data->buffer().as<float*>();
}
static InferenceEngine::SizeVector getShapeFromInput(InferenceEngine::CNNLayerPtr input, size_t idx) {
auto data = LayerUtils::getParamFromInputAsBlob(input, idx);
return data->getTensorDesc().getDims();
}
static InferenceEngine::CNNLayerPtr getInputLayerAt(InferenceEngine::CNNLayerPtr input, size_t idx) {
if (input->insData.size() <= idx) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << "input";
}
auto iLayerData = input->insData[idx].lock();
if (!iLayerData) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: cannot dereference data weak-pointer";
}
auto iLayer = getCreatorLayer(iLayerData).lock();
if (!iLayer) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: cannot dereference creator layer weak-pointer";
}
return iLayer;
}
static InferenceEngine::SizeVector getShapeForRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
auto lowShape = getShapeFromInput(input, idx);
auto highShape = getShapeFromInput(input, idx + 1);
if (lowShape.size() != highShape.size()) {
THROW_GNA_LAYER_EXCEPTION(input) << "shapes mismatch for " << idx << " and " << idx + 1 << " inputs";
}
for (size_t i = 0; i != lowShape.size(); i++) {
if (lowShape[i] != highShape[i]) {
THROW_GNA_LAYER_EXCEPTION(input) << "shapes mismatch for " << idx << " and " << idx + 1 << " inputs";
}
}
return lowShape;
}
};
} // namespace GNAPluginNS
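A short usage sketch of the helper above; the wrapper function and the `layer` argument are hypothetical, and only the API defined in this header is used.

    // Hypothetical usage, assuming the plugin include paths are set up:
    #include <layers/gna_fake_quantize_layer.hpp>

    void dumpFqInfo(InferenceEngine::CNNLayerPtr layer) {
        GNAPluginNS::GNAFakeQuantizeLayer fq(layer);     // throws if `layer` is not a FakeQuantize
        auto activation = fq.parseAsActivation();        // DnnActivation of type kActFakeQuantize
        auto levels     = fq.getLevels();                // e.g. 255 for int8-style quantization
        auto inRange    = fq.getInputRange();            // {min values, max values} from inputs 1/2
        auto outRange   = fq.getOutputRange();           // {min values, max values} from inputs 3/4
        (void)activation; (void)levels; (void)inRange; (void)outRange;
    }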


@ -0,0 +1,44 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "gna_layer_info.hpp"
#include "gna_plugin_log.hpp"
namespace GNAPluginNS {
namespace LayerUtils {
/**
* @brief retrieves a blob from the const layer connected to a given layer
* @param input
* @param idx
*/
inline InferenceEngine::Blob::Ptr getParamFromInputAsBlob(InferenceEngine::CNNLayerPtr input, size_t idx) {
if (input->insData.size() <= idx) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << "input";
}
auto iLayerData = input->insData[idx].lock();
if (!iLayerData) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: cannot dereference data weak-pointer";
}
auto iLayer = getCreatorLayer(iLayerData).lock();
if (!iLayer) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: cannot dereference creator layer weak-pointer";
}
if (!LayerInfo(iLayer).isConst()) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: expected to be of type const, but was: " << iLayer->type;
}
if (!iLayer->blobs.count("custom")) {
THROW_GNA_LAYER_EXCEPTION(iLayer) << "cannot get custom blob";
}
return iLayer->blobs["custom"];
}
} // namespace LayerUtils
} // namespace GNAPluginNS
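A minimal usage sketch of this helper; the wrapper is hypothetical and mirrors how the fusion pass later fetches biases from the third input of a weightable layer.

    // Hypothetical call site: fetch the bias blob attached as the third input of a weightable layer.
    InferenceEngine::Blob::Ptr getBiasBlob(InferenceEngine::CNNLayerPtr weightableLayer) {
        return GNAPluginNS::LayerUtils::getParamFromInputAsBlob(weightableLayer, 2);
    }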


@ -205,8 +205,8 @@ class LayerInfo {
bool isConcat() const noexcept { bool isConcat() const noexcept {
return isOfType("concat"); return isOfType("concat");
} }
bool isFakeQnatize() const noexcept { bool isFakeQuantize() const noexcept {
return isOfType("FakeQnatize"); return isOfType("FakeQuantize");
} }
bool isNonFunctional() const noexcept { bool isNonFunctional() const noexcept {
return isOfType("reshape") || isOfType("squeeze") || isOfType("unsqueeze") || isTrivialPermute(); return isOfType("reshape") || isOfType("squeeze") || isOfType("unsqueeze") || isTrivialPermute();


@ -71,7 +71,7 @@ namespace memory {
case InferenceEngine::Precision::I16: { case InferenceEngine::Precision::I16: {
if (new_state_precision == InferenceEngine::Precision::FP32) { if (new_state_precision == InferenceEngine::Precision::FP32) {
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput()); auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput());
auto scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; auto scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
GNAPluginNS::ConvertToInt16(static_cast<int16_t*>(state->gna_ptr), GNAPluginNS::ConvertToInt16(static_cast<int16_t*>(state->gna_ptr),
newState->buffer().as<float*>(), newState->buffer().as<float*>(),
1, 1,
@ -97,7 +97,7 @@ namespace memory {
if (state->getInput() && state_precision == InferenceEngine::Precision::I16) { if (state->getInput() && state_precision == InferenceEngine::Precision::I16) {
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput()); auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput());
auto scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; auto scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
auto result_blob = make_blob_with_precision(InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, auto result_blob = make_blob_with_precision(InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32,
InferenceEngine::SizeVector({ 1, elements }), InferenceEngine::SizeVector({ 1, elements }),


@ -25,21 +25,25 @@
#include <legacy/net_pass.h> #include <legacy/net_pass.h>
#include <layers/gna_copy_layer.hpp> #include <layers/gna_copy_layer.hpp>
#include "backend/dnn_types.h"
#include "gna_plugin_log.hpp" #include "gna_plugin_log.hpp"
#include "frontend/quantization.h" #include "frontend/quantization.h"
#include "frontend/quantized_layer_params.hpp" #include "frontend/quantized_layer_params.hpp"
#include <layers/gna_copy_layer.hpp> #include <layers/gna_copy_layer.hpp>
#include <layers/gna_fake_quantize_layer.hpp>
#include <runtime/pwl.h>
#include "gna_graph_tools.hpp" #include "gna_graph_tools.hpp"
#include "gna_pass_manager.hpp" #include "gna_pass_manager.hpp"
#include "layers/gna_layer_info.hpp" #include "layers/gna_layer_info.hpp"
#include "gna_upstream_iterator.hpp" #include "gna_upstream_iterator.hpp"
#include "frontend/quantization.h"
using namespace InferenceEngine; using namespace InferenceEngine;
using namespace InferenceEngine::details; using namespace InferenceEngine::details;
using namespace GNAPluginNS; using namespace GNAPluginNS;
#define pass_trace() gnalog() << "[" << getName() << "]" #define pass_trace() gnalog() << "[" << getName() << "] "
std::shared_ptr<IPassManager> BasePass::getPassManager() { std::shared_ptr<IPassManager> BasePass::getPassManager() {
auto sharedMgr = mgr.lock(); auto sharedMgr = mgr.lock();
@ -1672,6 +1676,232 @@ void FuseMultipleIdentitiesPass::run() {
} }
} }
void FuseFQIntoWeightsPass::run() {
auto isNonFunctional = [](CNNLayerPtr ptr) {
return LayerInfo(ptr).isNonFunctional();
};
auto assignWeightsAndBiases = [](CNNLayerPtr layer, Blob::Ptr weights, Blob::Ptr biases) {
auto weightableLayer = std::dynamic_pointer_cast<WeightableLayer>(layer);
if (nullptr == weightableLayer) {
    THROW_GNA_LAYER_EXCEPTION(layer) << " not a weightable layer";
}
weightableLayer->_weights = weights;
weightableLayer->_biases = biases;
weightableLayer->blobs["weights"] = weights;
weightableLayer->blobs["biases"] = biases;
};
for (auto &l : *pLayers) {
if (!LayerInfo(l).isFakeQuantize()) {
continue;
}
// determine whether this FQ actually feeds into a weightable layer
auto fqLayer = l;
if (!CNNNetHasNextLayerSkipCertain(fqLayer, 0, 0, isNonFunctional)) {
continue;
}
auto weightableLayer = CNNNetGetNextLayerSkipCertain(fqLayer, 0, 0, isNonFunctional).first;
if (!LayerInfo(weightableLayer).isWeightable()) {
continue;
}
if (weightableLayer->insData.size() != 3) {
continue;
}
// check whether this FQ represents weights - it needs to be at index 1 of the weightable layer
auto prevLayerAt1 = CNNNetPrevLayerSkipCertain(weightableLayer, 1, isNonFunctional);
if (prevLayerAt1 != fqLayer) {
continue;
}
// now this FQ layer represents weights - let's apply it and fuse it into the given weightable layer.
pass_trace() << "found " << LAYER_NAME(fqLayer) << " that will be converted to weights of "
<< LAYER_NAME(weightableLayer) << "\n";
GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer);
auto biases = LayerUtils::getParamFromInputAsBlob(weightableLayer, 2);
auto quantizedWeights = gnaFakeQuantizeLayer.getConstInputData();
// 1. break existing connections by detaching the FQ subgraph from the rest of the graph
auto prevData = weightableLayer->insData[1].lock();
auto prevLayer = getCreatorLayer(prevData).lock();
auto weightDims = prevLayer->outData.front()->getDims();
prevLayer->outData.clear();
weightableLayer->insData.resize(1);
// 2. running FQ function for given layer
if (weightDims.size() != 2) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " layout of weigths not equal to NC not yet supported";
}
auto outputSize = details::product(weightDims.begin(), weightDims.end());
// depending on the compute precision, weights will be recreated;
// for integer mode weights might simply be copied to avoid further quantisation overhead
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(weightableLayer);
if (quantized) {
// assign already quantized Weights
assignWeightsAndBiases(weightableLayer, quantizedWeights, biases);
// modify scale factors for quantized component
auto levels = gnaFakeQuantizeLayer.getLevels();
auto inputRange = gnaFakeQuantizeLayer.getInputRange();
auto outputRange = gnaFakeQuantizeLayer.getOutputRange();
if (outputRange.first.size() != outputRange.second.size()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " number of min and max data must be equal, min size: "
<< outputRange.first.size() << ", max size: " << outputRange.second.size();
}
if (inputRange.first.size() != outputRange.first.size() ||
inputRange.second.size() != outputRange.second.size()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " size of input and output range differs. "
<< "input min size: " << inputRange.first.size() << ", "
<< "output min size: " << outputRange.first.size() << ", "
<< "input max size: " << inputRange.second.size() << ", "
<< "output max size: " << outputRange.second.size();
}
if (levels > std::numeric_limits<uint8_t>::max() && outputRange.first.size() > 1) {
    THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported per-channel quantization for int16 weights."
        << " Per-channel quantization is only supported for int8 weights.";
}
// check if
// - weights were float values and need to be quantized,
// - weights are integer values and quantization can be skipped
for (size_t i = 0; i < outputRange.first.size(); ++i) {
if (inputRange.first[i] > outputRange.first[i] ||
inputRange.second[i] > outputRange.second[i]) {
quantized->_weights_quantized = true;
break;
}
}
quantized->_weights_quant.SetMinValues(outputRange.first);
quantized->_weights_quant.SetMaxValues(outputRange.second);
quantized->_weights_quant.SetLevels(levels);
// let's find the minimum scale factor among channels
if (quantized->_weights_quant.GetMinValues().empty()) {
    THROW_GNA_LAYER_EXCEPTION(fqLayer) << " per-channel/per-tensor weights scales are missing";
}
auto getScale = [&quantized](size_t i) {
return (quantized->_weights_quant.GetLevels() - 1) /
(quantized->_weights_quant.GetMaxValues()[i] - quantized->_weights_quant.GetMinValues()[i]);
};
float min_channel_scale = getScale(0);
for (uint32_t i = 1; i < quantized->_weights_quant.GetMinValues().size(); i++) {
min_channel_scale = std::min(min_channel_scale, getScale(i));
}
auto multiplier = 1.0f;
if (quantized->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
// GNA supports an additional multiplier only for 8-bit weights.
// The multiplier is used to extend the dynamic range.
multiplier = MAX_OUT_MULTIPLIER;
}
// Common weights scale calculation
quantized->_weights_quant.SetScale(min_channel_scale * multiplier);
continue;
}
intel_dnn_component_t component;
component.num_columns_in = weightDims[1];
component.num_rows_in = weightDims[0];
intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component.op.pwl);
transform->func_id = gnaFakeQuantizeLayer.parseAsActivation();
auto quantizedWeightsData = quantizedWeights->buffer();
component.ptr_inputs = quantizedWeightsData.as<float*>();
auto dequantizedWeights = make_shared_blob<float>(TensorDesc(Precision::FP32, {outputSize}, Layout::C));
dequantizedWeights->allocate();
auto resultBuffer = dequantizedWeights->buffer();
component.ptr_outputs = resultBuffer.as<float*>();
PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1);
// 3. assign dequantized const blob to weightable layer
assignWeightsAndBiases(weightableLayer, dequantizedWeights, biases);
}
}
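For reference, a standalone sketch of the per-channel weight scale selection performed in the pass above: the smallest per-channel scale is chosen so no channel saturates, then the dynamic range is extended for int8 weights. The multiplier value below is an assumed placeholder for MAX_OUT_MULTIPLIER.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Illustrative only: common weights scale from per-channel min/max ranges.
    inline float commonWeightsScale(const std::vector<float>& minVals,
                                    const std::vector<float>& maxVals,
                                    int levels) {
        float minChannelScale = (levels - 1) / (maxVals[0] - minVals[0]);
        for (size_t i = 1; i < minVals.size(); ++i) {
            minChannelScale = std::min(minChannelScale, (levels - 1) / (maxVals[i] - minVals[i]));
        }
        const float multiplier = (levels <= 255) ? 16.0f : 1.0f;  // assumed MAX_OUT_MULTIPLIER for int8
        return minChannelScale * multiplier;
    }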
void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
if (!quantized) {
return;
}
auto donotSkip = [](CNNLayerPtr) {
return false;
};
for (auto &&l : *pLayers) {
if (!LayerInfo(l).isFakeQuantize()) {
continue;
}
GNAFakeQuantizeLayer fqLayer(l);
auto prevLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, donotSkip);
if (prevLayer->outData.size() != 1) {
THROW_GNA_LAYER_EXCEPTION(prevLayer) << " fake quantize input layer with more than one output is not supported";
}
auto inputRange = fqLayer.getInputRange();
auto outputRange = fqLayer.getOutputRange();
if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
    outputRange.first.size() != 1 || outputRange.second.size() != 1) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported per-channel quantisation";
}
float fqLevels = fqLayer.getLevels();
float scaleInput = (fqLevels - 1) / (inputRange.second[0] - inputRange.first[0]);
float scaleOutputs = (fqLevels - 1) / (outputRange.second[0] - outputRange.first[0]);
// Before FQ layer is removed, the previous layer has to be updated with its quantization data
auto quantParamsPrevLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
quantParamsPrevLayer->_dst_quant.SetScale(scaleOutputs);
quantParamsPrevLayer->_dst_quant.SetLevels(fqLevels);
quantParamsPrevLayer->_dst_quant.SetMinValues({ inputRange.first[0] });
quantParamsPrevLayer->_dst_quant.SetMaxValues({ inputRange.second[0] });
auto prevData = prevLayer->outData.front();
getInputTo(prevLayer->outData.front()).clear();
// Find all output layers connected to FQ
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(*fqLayer, -1, donotSkip);
if (nextLayers.empty()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " fake quantize does not have any output layers connected";
}
// Connect all next layers after FQ to the layer that is before FQ
// and propagate quantization data
for (size_t i = 0; i < nextLayers.size(); ++i) {
auto insDatas = CNNLayerFindInsDataIdxes(fqLayer->outData.front(), nextLayers[i]);
if (insDatas.size() != 1) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " fake quantize connection to layer: "
<< LAYER_NAME(nextLayers[i]) << " is not correct";
}
nextLayers[i]->insData[insDatas.front()] = prevData;
getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i];
// After the layer gets removed, let's absorb its params into the QuantParams structure
// replacing scale factor from this fq layer
auto quantParamsNextLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(nextLayers[i]);
quantParamsNextLayer->_src_quant.SetScale(scaleOutputs);
quantParamsNextLayer->_src_quant.SetLevels(fqLevels);
quantParamsNextLayer->_src_quant.SetMinValues({ outputRange.first[0] });
quantParamsNextLayer->_src_quant.SetMaxValues({ outputRange.second[0] });
}
}
}
int PassManager::run(int index) { int PassManager::run(int index) {
#ifdef PLOT #ifdef PLOT
auto dumpNetworkAfterPass = [&index, this] (std::shared_ptr<Pass> pass) { auto dumpNetworkAfterPass = [&index, this] (std::shared_ptr<Pass> pass) {


@ -199,6 +199,17 @@ DECL_PASS(FuseMultipleIdentities);
*/ */
DECL_PASS(BroadcastConst); DECL_PASS(BroadcastConst);
/**
* @brief runs static quantisation on the given floating-point weights and replaces FakeQuantize with const blobs
*/
DECL_PASS(FuseFQIntoWeights);
/**
* @brief removes all fake quantize layers while moving their settings into QuantParams of the affected layers
*/
DECL_PASS(MoveFakeQuantizeLayerIntoQuantParams);
struct PassManagerSettings { struct PassManagerSettings {
Policy policy; Policy policy;
/// @brief whether to run passes before copy /// @brief whether to run passes before copy


@ -1047,25 +1047,32 @@ void PwlApply32(intel_dnn_component_t *component,
} }
break; break;
case kActFakeQuantize: { case kActFakeQuantize: {
auto input_low = transform->func_id.args.fakeQuantize.input_low;
auto input_high = transform->func_id.args.fakeQuantize.input_high;
auto output_low = transform->func_id.args.fakeQuantize.output_low;
auto output_high = transform->func_id.args.fakeQuantize.output_high;
auto levels = transform->func_id.args.fakeQuantize.levels; auto levels = transform->func_id.args.fakeQuantize.levels;
// TODO: this special modification for spedup-compute give different result with straight FQ forulae
// but this used in referencen graph FakeQuantize implementations so we need to honor it for a while
float scaleInput = (input_high - input_low) / (levels-1);
float scaleOutputs = (output_high - output_low) / (levels-1);
for (uint32_t i = num_row_start; i <= num_row_end; i++) { for (uint32_t i = num_row_start; i <= num_row_end; i++) {
auto inputChannel = transform->func_id.args.fakeQuantize.inputPerChannel ? i : 0;
auto outputChannel = transform->func_id.args.fakeQuantize.outputPerChannel ? i : 0;
auto input_low = transform->func_id.args.fakeQuantize.input_low[inputChannel];
auto input_high = transform->func_id.args.fakeQuantize.input_high[inputChannel];
auto output_low = transform->func_id.args.fakeQuantize.output_low[outputChannel];
auto output_high = transform->func_id.args.fakeQuantize.output_high[outputChannel];
// TODO: this special modification for sped-up compute gives a different result than the straight FQ formula,
// but it is used in the reference FakeQuantize implementation so we need to honor it for a while
float scaleInput = (input_high - input_low) / (levels-1);
float scaleOutputs = (output_high - output_low) / (levels-1);
for (uint32_t j = num_col_start; j <= num_col_end; j++) { for (uint32_t j = num_col_start; j <= num_col_end; j++) {
auto x = ptr_in[i * num_columns + j]; auto offset = i * num_columns + j;
auto x = ptr_in[offset];
if (x < std::min(input_low, input_high)) { if (x < std::min(input_low, input_high)) {
ptr_out[i * num_columns + j] = output_low; ptr_out[offset] = output_low;
} else if (x > std::max(input_low, input_high)) { } else if (x > std::max(input_low, input_high)) {
ptr_out[i * num_columns + j] = output_high; ptr_out[offset] = output_high;
} else { } else {
ptr_out[i * num_columns + j] = nearbyint((x - input_low) / scaleInput) * scaleOutputs + output_low; ptr_out[offset] = nearbyint((x - input_low) / scaleInput) * scaleOutputs + output_low;
} }
} }
} }
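The per-element transform in the loop above follows the standard FakeQuantize formula; a minimal standalone version is sketched below, assuming per-tensor ranges (the per-channel indexing from the loop is omitted).

    #include <algorithm>
    #include <cmath>

    // Reference per-element FakeQuantize with per-tensor ranges (sketch of the loop body above).
    inline float fakeQuantize(float x, float inLow, float inHigh,
                              float outLow, float outHigh, int levels) {
        const float scaleIn  = (inHigh - inLow) / (levels - 1);
        const float scaleOut = (outHigh - outLow) / (levels - 1);
        if (x < std::min(inLow, inHigh)) return outLow;
        if (x > std::max(inLow, inHigh)) return outHigh;
        return std::nearbyint((x - inLow) / scaleIn) * scaleOut + outLow;
    }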


@ -41,17 +41,45 @@ const std::vector<std::pair<std::string, ConfigType>> gnaQuantModes = {
// {"sw_exact_i8", configInt8}, // {"sw_exact_i8", configInt8},
}; };
const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}}; const std::vector<std::vector<size_t>> inputShapes = {
{3, 10, 5, 6},
{1, 1, 1, 1},
{1, 8, 8, 256},
{1, 2, 2, 2},
{1, 3, 4, 5},
};
const std::vector<std::vector<size_t>> constShapes = {{1}}; const std::vector<std::vector<size_t>> constShapes = {{1}};
const std::vector<size_t> levels = {16, 255, 256}; const std::vector<size_t> levels = {16, 255, 256};
const std::vector<std::vector<float>> fqArgs = {{0, 10, 2, 5}, {}}; const std::vector<std::vector<float>> fqArgs = {{}};
const std::vector<std::vector<float>> inputParams = {{-10, 10, 0.1}, {}}; const std::vector<std::vector<float>> inputParams = {{-10, 10, 0.1}, {}};
const std::vector<float> fqInputMin = {0, 1, 2, 3, 4, 5};
const std::vector<float> fqInputMax = {10, 9, 8, 7, 6};
const std::vector<float> fqOutputMin = {1, 2, 3, 4};
const std::vector<float> fqOutputMax = {8, 7, 6, 5};
std::vector<std::vector<float>> getInputOutputShapes(const std::vector<float> inputsMin,
const std::vector<float> inputsMax,
const std::vector<float> OutputsMin,
const std::vector<float> OutputsMax,
std::vector<std::vector<float>> fqArg) {
for (const auto& inputMin : inputsMin) {
for (const auto& inputMax : inputsMax) {
for (const auto& outputMin : OutputsMin) {
for (const auto& outputMax : OutputsMax) {
fqArg.push_back({inputMin, inputMax, outputMin, outputMax});
}
}
}
}
return fqArg;
}
const auto fqParams = ::testing::Combine( const auto fqParams = ::testing::Combine(
::testing::ValuesIn(levels), ::testing::ValuesIn(levels),
::testing::ValuesIn(constShapes), ::testing::ValuesIn(constShapes),
::testing::ValuesIn(fqArgs), ::testing::ValuesIn(getInputOutputShapes(fqInputMin, fqInputMax, fqOutputMin, fqOutputMax, fqArgs)),
::testing::ValuesIn(inputParams) ::testing::ValuesIn(inputParams)
); );


@ -0,0 +1,125 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include <gna/gna_config.hpp>
#include "subgraph_tests/two_fake_quantize_to_fullyconnected.hpp"
#include "common_test_utils/test_constants.hpp"
using namespace LayerTestsDefinitions;
namespace {
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16,
};
using ConfigType = std::map<std::string, std::string>;
const ConfigType configFP32 = {
{"GNA_DEVICE_MODE", "GNA_SW_FP32"},
};
const ConfigType configSWExact = {
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
{"GNA_COMPACT_MODE", "NO"}
};
/**
* @brief specific quantisation mode to be used internally
*/
const std::vector<std::pair<std::string, ConfigType>> gnaQuantModes = {
{"sw_fp32", configFP32},
};
const std::vector<std::pair<std::string, ConfigType>> gnaQuantModes_I8 = {
{"gna_sw_exact", configSWExact},
};
const std::vector<std::vector<size_t>> inputShapes = {
{1, 440}
};
const std::vector<std::vector<std::vector<size_t>>> constShapes = {
{{1}, {2048, 1}}
};
const std::vector<std::vector<std::vector<size_t>>> constShapes_int16 = {
{{1}, {1}}
};
const std::vector<size_t> levels_fp = {255, 65535};
const std::vector<std::vector<size_t>> levels_i16 = {{65535, 65535}, {32767, 32767}, {16383, 16383}};
const std::vector<std::vector<size_t>> levels_i8 = {{255, 255}};
const std::vector<std::vector<float>> fqArgs = {{-2.0f, 2.0f, -2.0f, 2.0f}};
const std::vector<std::vector<float>> inputParams = {{-64, 64, 1}, {-10, 10, 0.1}};
const std::vector<std::vector<float>> inputParams_I8 = {{-2.0f, 2.0f, 0.1f}};
const std::vector<bool> biases = {false, true};
const auto fqParams = ::testing::Combine(
::testing::Values(levels_fp),
::testing::ValuesIn(constShapes),
::testing::ValuesIn(fqArgs),
::testing::ValuesIn(inputParams)
);
const auto fqParams_I8 = ::testing::Combine(
::testing::ValuesIn(levels_i8),
::testing::ValuesIn(constShapes),
::testing::ValuesIn(fqArgs),
::testing::ValuesIn(inputParams_I8)
);
const auto fqParams_I16 = ::testing::Combine(
::testing::ValuesIn(levels_i16),
::testing::ValuesIn(constShapes_int16),
::testing::ValuesIn(fqArgs),
::testing::ValuesIn(inputParams_I8)
);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize_subgraph, FakeQuantizeSubgraphTest,
::testing::Combine(
fqParams,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapes),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(gnaQuantModes),
::testing::ValuesIn(biases)),
FakeQuantizeSubgraphTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize_subgraph_U8, FakeQuantizeSubgraphTest,
::testing::Combine(
fqParams_I8,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapes),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(gnaQuantModes_I8),
::testing::ValuesIn(biases)),
FakeQuantizeSubgraphTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize_subgraph_I16, FakeQuantizeSubgraphTest,
::testing::Combine(
fqParams_I16,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapes),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(gnaQuantModes_I8),
::testing::ValuesIn(biases)),
FakeQuantizeSubgraphTest::getTestCaseName);
} // namespace


@ -0,0 +1,52 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <tuple>
#include <vector>
#include <string>
#include <memory>
#include "functional_test_utils/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
typedef std::tuple<
std::vector<size_t>, // levels
std::vector<std::vector<size_t>>, // const inputs shape
std::vector<float>, // fake quantize inputLow, inputHigh, outputLow, outputHigh or empty for random
std::vector<float> // input generator data: low, high, resolution
> fqSpecificParams;
typedef std::tuple<
fqSpecificParams,
InferenceEngine::Precision, // Net precision
InferenceEngine::Precision, // Input precision
InferenceEngine::Precision, // Output precision
InferenceEngine::Layout, // Input layout
InferenceEngine::Layout, // Output layout
InferenceEngine::SizeVector, // Input shapes
LayerTestsUtils::TargetDevice, // Device name
std::pair<std::string, std::map<std::string, std::string>>, // Additional backend configuration and an alias name for it
bool
> fqSubgraphTestParamsSet;
namespace LayerTestsDefinitions {
class FakeQuantizeSubgraphTest : public testing::WithParamInterface<fqSubgraphTestParamsSet>,
virtual public LayerTestsUtils::LayerTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<fqSubgraphTestParamsSet> obj);
protected:
void SetUp() override;
InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override;
protected:
float inputDataMin = 0.0;
float inputDataMax = 10.0;
float inputDataResolution = 1.0;
int32_t seed = 1;
};
} // namespace LayerTestsDefinitions


@ -111,8 +111,6 @@ void FakeQuantizeLayerTest::SetUp() {
{fqDirectArg[2]}, {fqDirectArg[2]},
{fqDirectArg[3]}); {fqDirectArg[3]});
} }
auto fq = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(fakeQNode); auto fq = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(fakeQNode);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(fq)}; ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(fq)};


@ -0,0 +1,168 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <tuple>
#include <vector>
#include <string>
#include <memory>
#include <functional>
#include <functional_test_utils/skip_tests_config.hpp>
#include "ie_core.hpp"
#include "common_test_utils/common_utils.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "functional_test_utils/plugin_cache.hpp"
#include "functional_test_utils/layer_test_utils.hpp"
#include "subgraph_tests/two_fake_quantize_to_fullyconnected.hpp"
namespace LayerTestsDefinitions {
std::string FakeQuantizeSubgraphTest::getTestCaseName(testing::TestParamInfo<fqSubgraphTestParamsSet> obj) {
fqSpecificParams fqParams;
InferenceEngine::Precision netPrecision;
InferenceEngine::Precision inPrc, outPrc;
InferenceEngine::Layout inLayout, outLayout;
InferenceEngine::SizeVector inputShapes;
std::string targetDevice;
std::pair<std::string, std::map<std::string, std::string>> config;
bool biases = false;
std::tie(fqParams, netPrecision, inPrc, outPrc, inLayout, outLayout, inputShapes, targetDevice, config, biases) = obj.param;
std::vector<size_t> levels;
std::vector<std::vector<size_t>> constShape;
std::vector<float> fqDirectArgs;
std::vector<float> inputArg;
std::tie(levels, constShape, fqDirectArgs, inputArg) = fqParams;
std::ostringstream result;
result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
result << "CS=" << CommonTestUtils::vec2str(constShape) << "_";
result << "LEVELS=" << CommonTestUtils::vec2str(levels) << "_";
result << "netPRC=" << netPrecision.name() << "_";
result << "inPRC=" << inPrc.name() << "_";
result << "outPRC=" << outPrc.name() << "_";
result << "inL=" << inLayout << "_";
result << "outL=" << outLayout << "_";
result << "biases=" << biases << "_";
result << "trgDev=" << targetDevice;
if (!config.first.empty()) {
result << "_targetConfig=" << config.first;
}
if (!fqDirectArgs.empty()) {
result << "_fqArgs=" << fqDirectArgs[0] << "_" << fqDirectArgs[1] << "_" << fqDirectArgs[2] << "_" << fqDirectArgs[3];
}
if (inputArg.size() == 3) {
result << "_inputArg=" << inputArg[0] << "_" << inputArg[1] << "_" << inputArg[2];
}
return result.str();
}
void FakeQuantizeSubgraphTest::SetUp() {
fqSpecificParams fqParams;
std::vector<size_t> inputShape;
std::pair<std::string, std::map<std::string, std::string>> config;
auto netPrecision = InferenceEngine::Precision::UNSPECIFIED;
bool biases = false;
std::tie(fqParams, netPrecision, inPrc, outPrc, inLayout, outLayout, inputShape, targetDevice, config, biases) = this->GetParam();
InferenceEngine::SizeVector kernel, stride, dilation;
std::vector<size_t> levels;
std::vector<std::vector<size_t>> constShape;
std::vector<float> fqDirectArg;
std::vector<float> inputArg;
std::tie(levels, constShape, fqDirectArg, inputArg) = fqParams;
if (inputArg.size() == 3) {
inputDataMin = inputArg[0];
inputDataMax = inputArg[1];
inputDataResolution = inputArg[2];
}
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
const int seed = 0;
std::mt19937 gen(seed);
auto generateFloatNumbers = [gen](std::size_t vec_len, float min, float max) mutable {
std::vector<float> res;
std::uniform_real_distribution<float> dist(min, max);
for (int i = 0; i < vec_len; i++)
res.emplace_back(static_cast<float>(dist(gen)));
return res;
};
auto weightsRowNum = constShape[1][0];
auto weightsColNum = inputShape[1];
auto weightsData = generateFloatNumbers(weightsRowNum * weightsColNum, inputDataMin, inputDataMax);
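// NOTE: weightsData is only used to derive the FakeQuantize ranges below; the weight constant itself
// is filled with the value 1.0f.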
auto const_param = ngraph::builder::makeConstant<float>(ngPrc, { constShape[1][0], inputShape[1] }, { 1.0f });
auto inputMinRange = std::vector<float>{};
auto inputMaxRange = std::vector<float>{};
auto channelDataSize = constShape[1];
if (channelDataSize[0] == 1) {
// Per-tensor quantization: a single min/max range is provided
inputMinRange.push_back(inputDataMin);
inputMaxRange.push_back(inputDataMax);
} else if (channelDataSize[0] == weightsRowNum) {
// Per-channel quantization: a min/max range is computed for every weights row
for (size_t i = 0; i < weightsRowNum; ++i) {
auto minChannelVal = std::numeric_limits<float>::max();
auto maxChannelVal = std::numeric_limits<float>::lowest();
for (size_t j = 0; j < weightsColNum; ++j) {
minChannelVal = std::min(minChannelVal, weightsData[i * weightsColNum + j]);
maxChannelVal = std::max(maxChannelVal, weightsData[i * weightsColNum + j]);
}
inputMinRange.push_back(minChannelVal);
inputMaxRange.push_back(maxChannelVal);
}
} else {
FAIL() << "Invalid test configuration";
}
auto lowNode = ngraph::builder::makeConstant(ngraph::element::f32, channelDataSize, inputMinRange, false);
auto highNode = ngraph::builder::makeConstant(ngraph::element::f32, channelDataSize, inputMaxRange, false);
auto inputFQNode = ngraph::builder::makeFakeQuantize(paramOuts[0], ngraph::element::f32, levels[0], constShape[0],
{ inputDataMin }, { inputDataMax }, { inputDataMin }, { inputDataMax });
auto weightsFQNode = std::make_shared<ngraph::opset1::FakeQuantize>(const_param,
lowNode, highNode, lowNode, highNode, levels[1]);
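// The first FakeQuantize clamps the activations to the per-tensor [inputDataMin, inputDataMax] range;
// the second one quantizes the weight constant, per channel when constShape[1][0] equals the number of weight rows.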
auto inputFQ = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(inputFQNode);
auto weightsFQ = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(weightsFQNode);
auto matmul = std::make_shared<ngraph::opset1::MatMul>(inputFQ, weightsFQ, false, true);
std::shared_ptr<ngraph::Node> biases_node;
if (biases) {
auto const_bias = ngraph::builder::makeConstant(ngPrc, {1, constShape[1][0]}, std::vector<float>{ -1.0f });
biases_node = std::make_shared<ngraph::opset1::Add>(matmul, const_bias);
} else {
biases_node = matmul;
}
auto sigmoid = std::make_shared<ngraph::opset1::Sigmoid>(biases_node);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(sigmoid)};
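// With biases enabled, a second Sigmoid fed directly by the input FakeQuantize is exposed as an extra
// network output, so the input FakeQuantize layer has more than one consumer.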
if (biases) {
auto sigmoid_2 = std::make_shared<ngraph::opset1::Sigmoid>(inputFQ);
results.push_back(std::make_shared<ngraph::opset1::Result>(sigmoid_2));
}
function = std::make_shared<ngraph::Function>(results, params, "fakeQuantizeSubgraph");
configuration = config.second;
}
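// Fills the input blob with reproducible values from the [inputDataMin, inputDataMax] range at the
// configured resolution, so the data stays within the range of the input FakeQuantize.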
InferenceEngine::Blob::Ptr FakeQuantizeSubgraphTest::GenerateInput(const InferenceEngine::InputInfo &info) const {
return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), inputDataMax - inputDataMin, inputDataMin, 1 / inputDataResolution,
seed);
}
TEST_P(FakeQuantizeSubgraphTest, CompareWithRefs) {
Run();
}
} // namespace LayerTestsDefinitions
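For context, the GNA-side tests enabled by this change instantiate the shared class above through GoogleTest parameter generators. A minimal sketch of such an instantiation is shown below; the suite name, value sets, input shape, and device configuration pair are illustrative assumptions, not the exact contents of the GNA plugin test file:

// Illustrative instantiation sketch: parameter values below are assumptions.
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "subgraph_tests/two_fake_quantize_to_fullyconnected.hpp"
#include "common_test_utils/test_constants.hpp"

using namespace LayerTestsDefinitions;

namespace {
// levels[0] drives the input FakeQuantize, levels[1] the weights FakeQuantize (255 -> int8 weights).
const std::vector<std::vector<size_t>> levels = {{65535, 255}};
// constShape[0] is the input FQ constant shape; constShape[1] = {5, 1} selects per-channel weights FQ with 5 rows.
const std::vector<std::vector<std::vector<size_t>>> constShapes = {{{1}, {5, 1}}};
const std::vector<std::vector<float>> fqArgs = {{-2.0f, 2.0f, -2.0f, 2.0f}};
const std::vector<std::vector<float>> inputParams = {{-2.0f, 2.0f, 0.01f}};  // min, max, resolution

const std::pair<std::string, std::map<std::string, std::string>> gnaConfig = {
    "sw_exact", {{"GNA_DEVICE_MODE", "GNA_SW_EXACT"}}};

const auto fqParams = ::testing::Combine(
    ::testing::ValuesIn(levels),
    ::testing::ValuesIn(constShapes),
    ::testing::ValuesIn(fqArgs),
    ::testing::ValuesIn(inputParams));

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizeSubgraph, FakeQuantizeSubgraphTest,
    ::testing::Combine(
        fqParams,
        ::testing::Values(InferenceEngine::Precision::FP32),
        ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
        ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
        ::testing::Values(InferenceEngine::Layout::ANY),
        ::testing::Values(InferenceEngine::Layout::ANY),
        ::testing::Values(std::vector<size_t>({1, 128})),
        ::testing::Values(CommonTestUtils::DEVICE_GNA),
        ::testing::Values(gnaConfig),
        ::testing::Values(false)),  // no bias branch
    FakeQuantizeSubgraphTest::getTestCaseName);
}  // namespace

Replacing ::testing::Values(false) with ::testing::Values(false, true) would also cover the two-output variant built when biases are enabled.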


@@ -137,8 +137,8 @@ TEST_F(I16QuantisationTest, DISABLED_outputScaleFactorForAffineIsCorrect){
     auto quantParams = getInjectedData<QuantizedLayerParams>(affineLayerPtr);
-    ASSERT_FLOAT_EQ(quantParams->_dst_quant.scale, 100);
-    ASSERT_FLOAT_EQ(quantParams->_weights_quant.scale, 100);
+    ASSERT_FLOAT_EQ(quantParams->_dst_quant.GetScale(), 100);
+    ASSERT_FLOAT_EQ(quantParams->_weights_quant.GetScale(), 100);
 }
 TEST_F(I16QuantisationTest, OnlyAffine_NoActivationInsertion) {