From ac2370b4207c2543a6af83a29bb94f077ba85862 Mon Sep 17 00:00:00 2001
From: Edward Shogulin
Date: Tue, 15 Sep 2020 09:18:58 +0300
Subject: [PATCH] [LPT] Copy constant with several outputs before blob update
 (cherry-pick to master) (#2198)

* [LPT] Copy constant implementation

* [LPT] the same Constant ops as FQ interval boundaries
---
 .../network_helper.hpp                             |  14 +-
 .../weightable_layer_transformation.hpp            |   2 +
 .../src/concat.cpp                                 |  16 +-
 .../src/concat_multi_channels.cpp                  |   4 +-
 .../src/convolution.cpp                            |   1 +
 .../src/fake_quantize.cpp                          |   6 +-
 .../src/fully_connected.cpp                        |   1 +
 .../src/network_helper.cpp                         | 157 ++++++++++++------
 .../src/weightable_layer_transformation.cpp        |  17 +-
 ..._constant_fake_quantize_transformation.cpp      |  10 +-
 ..._constant_fake_quantize_transformation.cpp      |   7 +-
 ..._constant_fake_quantize_transformation.hpp      |   2 +-
 ..._constant_fake_quantize_transformation.cpp      |  13 +-
 ...imized_constant_fake_quantize_function.hpp      |   3 +-
 ...imized_constant_fake_quantize_function.cpp      |  39 +++--
 15 files changed, 190 insertions(+), 102 deletions(-)

diff --git a/inference-engine/src/low_precision_transformations/include/low_precision_transformations/network_helper.hpp b/inference-engine/src/low_precision_transformations/include/low_precision_transformations/network_helper.hpp
index 1cc3af08381..aa422e72843 100644
--- a/inference-engine/src/low_precision_transformations/include/low_precision_transformations/network_helper.hpp
+++ b/inference-engine/src/low_precision_transformations/include/low_precision_transformations/network_helper.hpp
@@ -48,16 +48,22 @@ public:
 
     static Blob::Ptr makeNewBlobPtr(const TensorDesc& desc);
 
-    static void invertFakeQuantize(const CNNLayer& fakeQuantize);
-
-    static void updateBlobs(CNNLayer& layer, const std::string& blobName, float value);
-
     static void updateBlobs(const CNNLayer& quantizeLayer, int constLayerIndex, float value);
 
     static void updateBlobs(const CNNLayer& quantizeLayer, int constLayerIndex, const std::vector<float>& values);
 
+    static void updateBlobs(TransformationContext& context, const CNNLayer& quantizeLayer, int constLayerIndex, float value);
+
+    static void updateBlobs(TransformationContext& context, const CNNLayer& quantizeLayer, int constLayerIndex, const std::vector<float>& values);
+
     static void updateBlobs(CNNLayer& layer, const std::string& blobName, const std::vector<float>& values);
 
+    static CNNLayerPtr copyConstant(
+        TransformationContext& context,
+        const CNNLayer& quantizeLayer,
+        const CNNLayerPtr& blobLayer,
+        const size_t constLayerIndex);
+
     // return true if at least one child uses layer on weights
     static bool onWeights(const CNNLayer& layer);

diff --git a/inference-engine/src/low_precision_transformations/include/low_precision_transformations/weightable_layer_transformation.hpp b/inference-engine/src/low_precision_transformations/include/low_precision_transformations/weightable_layer_transformation.hpp
index 34af7c72ab5..d763b6706aa 100644
--- a/inference-engine/src/low_precision_transformations/include/low_precision_transformations/weightable_layer_transformation.hpp
+++ b/inference-engine/src/low_precision_transformations/include/low_precision_transformations/weightable_layer_transformation.hpp
@@ -47,6 +47,7 @@ protected:
         std::vector<float>& biasesShifts) const;
 
     void updateWeights(
+        TransformationContext& context,
        const CNNLayerPtr fakeQuantize,
        std::vector<float>& outputLowValues,
        std::vector<float>& outputHighValues) const;
@@ -68,6 +69,7 @@ protected:
         const bool onWeights) const;
 
     DataPrecision fillDequantizationsForWeightsPath(
+        TransformationContext& context,
        const CNNLayer& weightableLayer,
        const bool supportAsymmetricQuantization,
        std::vector<float>& dequantizationScales,

diff --git a/inference-engine/src/low_precision_transformations/src/concat.cpp b/inference-engine/src/low_precision_transformations/src/concat.cpp
index 378b00b0ea5..c9dae711605 100644
--- a/inference-engine/src/low_precision_transformations/src/concat.cpp
+++ b/inference-engine/src/low_precision_transformations/src/concat.cpp
@@ -148,10 +148,10 @@ void ConcatTransformation::transform(TransformationContext& context, CNNLayer& c
         switch (quantizedTensorAlignmentOnActivations) {
         case QuantizedTensorAlignment::None: {
             const float updatedOutputLowValue = quantizationDetails.outputLowValues[0] * quantizationScale + quantizationShift;
-            CNNNetworkHelper::updateBlobs(fakeQuantizeLayer, 3, updatePrecisions ? roundf(updatedOutputLowValue) : updatedOutputLowValue);
+            CNNNetworkHelper::updateBlobs(context, fakeQuantizeLayer, 3, updatePrecisions ? roundf(updatedOutputLowValue) : updatedOutputLowValue);
 
             const float updatedOutputHighValue = quantizationDetails.outputHighValues[0] * quantizationScale + quantizationShift;
-            CNNNetworkHelper::updateBlobs(fakeQuantizeLayer, 4, updatePrecisions ? roundf(updatedOutputHighValue) : updatedOutputHighValue);
+            CNNNetworkHelper::updateBlobs(context, fakeQuantizeLayer, 4, updatePrecisions ? roundf(updatedOutputHighValue) : updatedOutputHighValue);
 
             break;
         }
@@ -165,18 +165,18 @@ void ConcatTransformation::transform(TransformationContext& context, CNNLayer& c
                 (outputHighValue / quantizationDetails.outputHighValues[0])) :
                 outputHighValue;
 
-            CNNNetworkHelper::updateBlobs(fakeQuantizeLayer, 1, inputLowValue);
-            CNNNetworkHelper::updateBlobs(fakeQuantizeLayer, 2, inputHighValue);
-            CNNNetworkHelper::updateBlobs(fakeQuantizeLayer, 3, dataPrecision.min);
-            CNNNetworkHelper::updateBlobs(fakeQuantizeLayer, 4, dataPrecision.max);
+            CNNNetworkHelper::updateBlobs(context, fakeQuantizeLayer, 1, inputLowValue);
+            CNNNetworkHelper::updateBlobs(context, fakeQuantizeLayer, 2, inputHighValue);
+            CNNNetworkHelper::updateBlobs(context, fakeQuantizeLayer, 3, dataPrecision.min);
+            CNNNetworkHelper::updateBlobs(context, fakeQuantizeLayer, 4, dataPrecision.max);
 
             break;
         }
         case QuantizedTensorAlignment::UpdateLevel: {
             const float updatedOutputLowValue = quantizationDetails.outputLowValues[0] * quantizationScale + quantizationShift;
-            CNNNetworkHelper::updateBlobs(fakeQuantizeLayer, 3, updatePrecisions ? roundf(updatedOutputLowValue) : updatedOutputLowValue);
+            CNNNetworkHelper::updateBlobs(context, fakeQuantizeLayer, 3, updatePrecisions ? roundf(updatedOutputLowValue) : updatedOutputLowValue);
 
             const float updatedOutputHighValue = quantizationDetails.outputHighValues[0] * quantizationScale + quantizationShift;
-            CNNNetworkHelper::updateBlobs(fakeQuantizeLayer, 4, updatePrecisions ? roundf(updatedOutputHighValue) : updatedOutputHighValue);
+            CNNNetworkHelper::updateBlobs(context, fakeQuantizeLayer, 4, updatePrecisions ? roundf(updatedOutputHighValue) : updatedOutputHighValue);
 
             const int levels = static_cast<int>(fabs(roundf(updatedOutputHighValue) - roundf(updatedOutputLowValue)) + 1.0);
             fakeQuantizeLayer.params["levels"] = std::to_string(levels);

diff --git a/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp b/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp
index 582c86467f0..929694aa5d4 100644
--- a/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp
+++ b/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp
@@ -106,8 +106,8 @@ void ConcatMultiChannelsTransformation::transform(TransformationContext& context
         dequantizationScalesLayers[fakeQuantizeLayer->name] = dequantizationScales;
         dequantizationShiftsLayers[fakeQuantizeLayer->name] = dequantizationShifts;
 
-        CNNNetworkHelper::updateBlobs(*fakeQuantizeLayer, 3, dataPrecision.min);
-        CNNNetworkHelper::updateBlobs(*fakeQuantizeLayer, 4, dataPrecision.max);
+        CNNNetworkHelper::updateBlobs(context, *fakeQuantizeLayer, 3, dataPrecision.min);
+        CNNNetworkHelper::updateBlobs(context, *fakeQuantizeLayer, 4, dataPrecision.max);
     }
 
     if (updatePrecisions) {

diff --git a/inference-engine/src/low_precision_transformations/src/convolution.cpp b/inference-engine/src/low_precision_transformations/src/convolution.cpp
index cb1dddbac46..43b8c0a3926 100644
--- a/inference-engine/src/low_precision_transformations/src/convolution.cpp
+++ b/inference-engine/src/low_precision_transformations/src/convolution.cpp
@@ -105,6 +105,7 @@ void ConvolutionTransformation::transform(TransformationContext& context, CNNLay
     const CNNLayerPtr parentOnData = CNNNetworkHelper::getParent(layer, 0ul);
 
     const DataPrecision dataPrecisionOnWeights = fillDequantizationsForWeightsPath(
+        context,
         layer,
         supportAsymmetricQuantization,
         originalWeightsDequantizationScales,

diff --git a/inference-engine/src/low_precision_transformations/src/fake_quantize.cpp b/inference-engine/src/low_precision_transformations/src/fake_quantize.cpp
index 6113f3bdb77..6e32de23fe1 100644
--- a/inference-engine/src/low_precision_transformations/src/fake_quantize.cpp
+++ b/inference-engine/src/low_precision_transformations/src/fake_quantize.cpp
@@ -34,8 +34,6 @@ void FakeQuantizeTransformation::transform(TransformationContext& context, CNNLa
         THROW_IE_EXCEPTION << "Layer '" << layer.insData.size() << "' has invalid inputs number. 5 is expected.";
5 is expected."; } - // CNNNetworkHelper::invertFakeQuantize(layer); - // FakeQuantize on weights are used without dequantization ScaleShifts const bool onWeights = CNNNetworkHelper::onConstWeightsPath(layer) && CNNNetworkHelper::onWeights(layer); if (onWeights) { @@ -77,8 +75,8 @@ void FakeQuantizeTransformation::transform(TransformationContext& context, CNNLa printDequantizationValues(dequantizationScales, dequantizationShifts); #endif - CNNNetworkHelper::updateBlobs(layer, 3, dataPrecision.min); - CNNNetworkHelper::updateBlobs(layer, 4, dataPrecision.max); + CNNNetworkHelper::updateBlobs(context, layer, 3, dataPrecision.min); + CNNNetworkHelper::updateBlobs(context, layer, 4, dataPrecision.max); if (updatePrecisions) { CNNNetworkHelper::setOutDataPrecision(layer, dataPrecision.precision); diff --git a/inference-engine/src/low_precision_transformations/src/fully_connected.cpp b/inference-engine/src/low_precision_transformations/src/fully_connected.cpp index dcaa789990d..e75c29665c9 100644 --- a/inference-engine/src/low_precision_transformations/src/fully_connected.cpp +++ b/inference-engine/src/low_precision_transformations/src/fully_connected.cpp @@ -135,6 +135,7 @@ void FullyConnectedTransformation::transform(TransformationContext& context, CNN } fillDequantizationsForWeightsPath( + context, fullyConnected, supportAsymmetricQuantization, originalWeightsDequantizationScales, diff --git a/inference-engine/src/low_precision_transformations/src/network_helper.cpp b/inference-engine/src/low_precision_transformations/src/network_helper.cpp index 8556e23402f..ecb06cdae1b 100644 --- a/inference-engine/src/low_precision_transformations/src/network_helper.cpp +++ b/inference-engine/src/low_precision_transformations/src/network_helper.cpp @@ -183,54 +183,6 @@ Blob::Ptr CNNNetworkHelper::makeNewBlobPtr(const TensorDesc& desc) { return newBlob; } -void CNNNetworkHelper::updateBlobs(CNNLayer& layer, const std::string& blobName, float value) { - const auto existingBlobIt = layer.blobs.find(blobName); - if (existingBlobIt == layer.blobs.end()) { - THROW_IE_EXCEPTION << "blob '" << blobName << "' was not found in layer " << layer.name; - } - const auto& existingBlobTensorDesc = existingBlobIt->second->getTensorDesc(); - Blob::Ptr newBlob = makeNewBlobPtr(existingBlobTensorDesc); - - newBlob->allocate(); - fillBlobByFP32(newBlob, value); - layer.blobs[existingBlobIt->first] = newBlob; -} - -void CNNNetworkHelper::invertFakeQuantize(const CNNLayer& fakeQuantize) { - if (fakeQuantize.type != "FakeQuantize") { - THROW_IE_EXCEPTION << "invalid layer type " << fakeQuantize.type; - } - const QuantizationDetails quantizationDetails = QuantizationDetails::getDetails(fakeQuantize); - const size_t valuesCount = - std::max(quantizationDetails.inputLowValues.size(), quantizationDetails.outputLowValues.size()); - std::vector inputLowValues(valuesCount); - std::vector inputHightValues(valuesCount); - std::vector outputLowValues(valuesCount); - std::vector outputHighValues(valuesCount); - bool wasInverted = false; - for (size_t i = 0ul; i < valuesCount; ++i) { - if ((quantizationDetails.getInputLowValue(i) > quantizationDetails.getInputHighValue(i)) && - (quantizationDetails.getOutputLowValue(i) > quantizationDetails.getOutputHighValue(i))) { - inputLowValues[i] = quantizationDetails.getInputHighValue(i); - inputHightValues[i] = quantizationDetails.getInputLowValue(i); - outputLowValues[i] = quantizationDetails.getOutputHighValue(i); - outputHighValues[i] = quantizationDetails.getOutputLowValue(i); - wasInverted = 
-        } else {
-            inputLowValues[i] = quantizationDetails.getInputLowValue(i);
-            inputHightValues[i] = quantizationDetails.getInputHighValue(i);
-            outputLowValues[i] = quantizationDetails.getOutputLowValue(i);
-            outputHighValues[i] = quantizationDetails.getOutputHighValue(i);
-        }
-    }
-
-    if (wasInverted) {
-        CNNNetworkHelper::updateBlobs(fakeQuantize, 1, inputLowValues);
-        CNNNetworkHelper::updateBlobs(fakeQuantize, 2, inputHightValues);
-        CNNNetworkHelper::updateBlobs(fakeQuantize, 3, outputLowValues);
-        CNNNetworkHelper::updateBlobs(fakeQuantize, 4, outputHighValues);
-    }
-}
 
 void CNNNetworkHelper::updateBlobs(const CNNLayer& quantizeLayer, int constLayerIndex, const std::vector<float>& values) {
     CNNLayerPtr blobLayer = CNNNetworkHelper::getParent(quantizeLayer, constLayerIndex);
@@ -288,6 +240,25 @@ void CNNNetworkHelper::updateBlobs(const CNNLayer& quantizeLayer, int constLayer
     fillBlobByFP32(newBlob, values.data());
 }
 
+void CNNNetworkHelper::updateBlobs(
+    TransformationContext& context,
+    const CNNLayer& quantizeLayer,
+    int constLayerIndex,
+    const std::vector<float>& values) {
+    CNNLayerPtr blobLayer = CNNNetworkHelper::getParent(quantizeLayer, constLayerIndex);
+    if (blobLayer == nullptr) {
+        THROW_IE_EXCEPTION << "layer is absent";
+    }
+
+    const auto existingBlobIt = blobLayer->blobs.find("custom");
+    if (existingBlobIt == blobLayer->blobs.end()) {
+        THROW_IE_EXCEPTION << "custom blob was not found ";
+    }
+
+    blobLayer = copyConstant(context, quantizeLayer, blobLayer, constLayerIndex);
+    updateBlobs(quantizeLayer, constLayerIndex, values);
+}
+
 void CNNNetworkHelper::updateBlobs(CNNLayer& layer, const std::string& blobName, const std::vector<float>& values) {
     const auto existingBlobIt = layer.blobs.find(blobName);
     if (existingBlobIt == layer.blobs.end()) {
@@ -377,6 +348,96 @@ void CNNNetworkHelper::updateBlobs(const CNNLayer& quantizeLayer, int constLayer
     blobLayer->blobs[existingBlobIt->first] = newBlob;
 }
 
+void CNNNetworkHelper::updateBlobs(TransformationContext& context, const CNNLayer& quantizeLayer, int constLayerIndex, float value) {
+    auto inData = quantizeLayer.insData[constLayerIndex].lock();
+    if (inData == nullptr) {
+        THROW_IE_EXCEPTION << "data is absent";
+    }
+
+    CNNLayerPtr blobLayer = getCreatorLayer(inData).lock();
+    if (blobLayer == nullptr) {
+        THROW_IE_EXCEPTION << "layer is absent";
+    }
+
+    if (blobLayer->blobs.size() != 1) {
+        THROW_IE_EXCEPTION << "unexpected blobs size";
+    }
+
+    blobLayer = copyConstant(context, quantizeLayer, blobLayer, constLayerIndex);
+    updateBlobs(quantizeLayer, constLayerIndex, value);
+}
+
+CNNLayerPtr CNNNetworkHelper::copyConstant(
+    TransformationContext& context,
+    const CNNLayer& quantizeLayer,
+    const CNNLayerPtr& blobLayer,
+    const size_t constLayerIndex) {
+    size_t repeatsCount = 0ul;
+    for (size_t i = 0; i < quantizeLayer.insData.size(); ++i) {
+        auto parentInData = quantizeLayer.insData[i].lock();
+        if (parentInData == nullptr) {
+            continue;
+        }
+        const auto quantizeLayerParent = getCreatorLayer(parentInData).lock();
+        if (quantizeLayerParent == nullptr) {
+            continue;
+        }
+        if (quantizeLayerParent->name == blobLayer->name) {
+            repeatsCount++;
+        }
+    }
+
+    if (repeatsCount < 2ul) {
+        return blobLayer;
+    }
+
+    details::CNNNetworkImpl* networkImpl = dynamic_cast<details::CNNNetworkImpl*>(&context.network);
+    if (networkImpl == nullptr) {
+        THROW_IE_EXCEPTION << "Unexpected network type";
+    }
+
+    const DataPtr outData = blobLayer->outData[0];
+    const std::map<std::string, CNNLayerPtr>& inputTo = getInputTo(outData);
+    const auto quantizeLayerIt = inputTo.find(quantizeLayer.name);
+    if (quantizeLayerIt == inputTo.end()) {
+        THROW_IE_EXCEPTION << "Layer was not found";
+    }
+
+    const auto blobIt = blobLayer->blobs.find("custom");
+    if (blobIt == blobLayer->blobs.end()) {
+        THROW_IE_EXCEPTION << "Blob was not found";
+    }
+
+    const Blob::Ptr blob = blobIt->second;
+    Blob::Ptr newBlob = makeNewBlobPtr(blob->getTensorDesc());
+    newBlob->allocate();
+
+    const std::shared_ptr<float> blobValues = CNNNetworkHelper::getFloatData(blob);
+    fillBlobByFP32(newBlob, blobValues.get());
+
+    auto newBlobValues = CNNNetworkHelper::getFloatData(newBlob);
+
+    const std::string layerName = blobLayer->name + "/new" + std::to_string(repeatsCount);
+    CNNLayerPtr newBlobLayer = CNNLayerPtr(new CNNLayer({ layerName, "Const", blob->getTensorDesc().getPrecision() }));
+    newBlobLayer->blobs.emplace("custom", newBlob);
+
+    const TensorDesc& tensorDesc = blobLayer->outData[0]->getTensorDesc();
+    DataPtr newEdgeAfterLayer(new Data(newBlobLayer->name, tensorDesc));
+    newEdgeAfterLayer->setName(newBlobLayer->name);
+    newEdgeAfterLayer->setPrecision(blob->getTensorDesc().getPrecision());
+    quantizeLayerIt->second->insData[constLayerIndex] = newEdgeAfterLayer;
+    getInputTo(newEdgeAfterLayer)[quantizeLayer.name] = quantizeLayerIt->second;
+
+    getCreatorLayer(newEdgeAfterLayer) = newBlobLayer;
+    newBlobLayer->outData.push_back(newEdgeAfterLayer);
+
+    CNNNetworkImpl* netImpl = dynamic_cast<CNNNetworkImpl*>(&context.network);
+    netImpl->addData(newBlobLayer->name.c_str(), newEdgeAfterLayer);
+    netImpl->addLayer(newBlobLayer);
+
+    return newBlobLayer;
+}
+
 int CNNNetworkHelper::onWeightsInDepth(const CNNLayer& layer) {
     const std::vector<CNNLayerPtr> children = getChildren(layer);
     for (const CNNLayerPtr& child : children) {

diff --git a/inference-engine/src/low_precision_transformations/src/weightable_layer_transformation.cpp b/inference-engine/src/low_precision_transformations/src/weightable_layer_transformation.cpp
index d62c9e48111..8398eec611b 100644
--- a/inference-engine/src/low_precision_transformations/src/weightable_layer_transformation.cpp
+++ b/inference-engine/src/low_precision_transformations/src/weightable_layer_transformation.cpp
@@ -250,14 +250,14 @@ void WeightableLayerTransformation::updateLayerBiasesFcSpecific(
     CNNNetworkHelper::updateBlobs(*biasesLayer, "custom", biases);
 }
 
-void WeightableLayerTransformation::updateWeights(const CNNLayerPtr parent, std::vector<float>& outputLowValues,
+void WeightableLayerTransformation::updateWeights(TransformationContext& context, const CNNLayerPtr parent, std::vector<float>& outputLowValues,
                                                   std::vector<float>& outputHighValues) const {
     const QuantizationDetails quantizationDetails = QuantizationDetails::getDetails(*parent);
     // TODO: refactor: move to standalone method
     switch (quantizedTensorAlignmentOnWeights) {
    case LayerTransformation::QuantizedTensorAlignment::None: {
-        CNNNetworkHelper::updateBlobs(*parent, 3, outputLowValues);
-        CNNNetworkHelper::updateBlobs(*parent, 4, outputHighValues);
+        CNNNetworkHelper::updateBlobs(context, *parent, 3, outputLowValues);
+        CNNNetworkHelper::updateBlobs(context, *parent, 4, outputHighValues);
         break;
     }
    case LayerTransformation::QuantizedTensorAlignment::UpdateIntervals:
@@ -300,10 +300,10 @@ void WeightableLayerTransformation::updateWeights(const CNNLayerPtr parent, std:
             outputHighValues[i] = roundf(outputHighValues[i] * maxK);
         }
 
-        CNNNetworkHelper::updateBlobs(*parent, 1, inputLowValues);
-        CNNNetworkHelper::updateBlobs(*parent, 2, inputHighValues);
-        CNNNetworkHelper::updateBlobs(*parent, 3, outputLowValues);
-        CNNNetworkHelper::updateBlobs(*parent, 4, outputHighValues);
+        CNNNetworkHelper::updateBlobs(context, *parent, 1, inputLowValues);
+        CNNNetworkHelper::updateBlobs(context, *parent, 2, inputHighValues);
+        CNNNetworkHelper::updateBlobs(context, *parent, 3, outputLowValues);
+        CNNNetworkHelper::updateBlobs(context, *parent, 4, outputHighValues);
 
         const size_t levels = static_cast<size_t>(roundf(minOutputIntervalLowValue + maxOutputIntervalHighValue + 1.0));
         parent->params["levels"] = std::to_string(levels);
@@ -411,6 +411,7 @@ void WeightableLayerTransformation::createAsymmetric(TransformationContext& cont
 }
 
 DataPrecision WeightableLayerTransformation::fillDequantizationsForWeightsPath(
+    TransformationContext& context,
     const CNNLayer& weightableLayer,
     const bool supportAsymmetricQuantization,
     std::vector<float>& dequantizationScales,
@@ -461,7 +462,7 @@ DataPrecision WeightableLayerTransformation::fillDequantizationsForWeightsPath(
         }
     }
 
-    updateWeights(parent, outputLowValues, outputHighValues);
+    updateWeights(context, parent, outputLowValues, outputHighValues);
 
     return dataPrecision;
 }

diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp
index 5af9d555891..f37e7c0dd35 100644
--- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp
+++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp
@@ -11,8 +11,7 @@ using namespace LayerTestsDefinitions;
 namespace {
 const std::vector<InferenceEngine::Precision> netPrecisions = {
-    InferenceEngine::Precision::FP32,
-    InferenceEngine::Precision::FP16
+    InferenceEngine::Precision::FP32
 };
 
 const std::vector<MatMulWithOptimizedConstantFakeQuantizeTransformationTestValues> params = {
@@ -22,10 +21,15 @@
+const std::vector<std::pair<InferenceEngine::SizeVector, InferenceEngine::SizeVector>> inputShapes = {
+    std::pair<InferenceEngine::SizeVector, InferenceEngine::SizeVector>({ InferenceEngine::SizeVector({ 1, 16 }), InferenceEngine::SizeVector({ 10, 16 }) }),
+    std::pair<InferenceEngine::SizeVector, InferenceEngine::SizeVector>({ InferenceEngine::SizeVector({ 1, 16 }), InferenceEngine::SizeVector({ 16, 10 }) })
+};
+
 INSTANTIATE_TEST_CASE_P(LPT, MatMulWithOptimizedConstantFakeQuantizeTransformation,
     ::testing::Combine(
         ::testing::ValuesIn(netPrecisions),
-        ::testing::Values(InferenceEngine::SizeVector({ 1, 16 })),
+        ::testing::ValuesIn(inputShapes),
         ::testing::Values(CommonTestUtils::DEVICE_CPU),
         ::testing::ValuesIn(params)),
     MatMulWithOptimizedConstantFakeQuantizeTransformation::getTestCaseName);

diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp
index 7af10777ac5..d7d7b78c445 100644
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp
@@ -21,10 +21,15 @@
+const std::vector<std::pair<InferenceEngine::SizeVector, InferenceEngine::SizeVector>> inputShapes = {
+    std::pair<InferenceEngine::SizeVector, InferenceEngine::SizeVector>({ InferenceEngine::SizeVector({ 1, 16 }), InferenceEngine::SizeVector({ 10, 16 }) }),
+    std::pair<InferenceEngine::SizeVector, InferenceEngine::SizeVector>({ InferenceEngine::SizeVector({ 1, 16 }), InferenceEngine::SizeVector({ 16, 10 }) })
+};
+
 INSTANTIATE_TEST_CASE_P(LPT, MatMulWithOptimizedConstantFakeQuantizeTransformation,
     ::testing::Combine(
         ::testing::ValuesIn(netPrecisions),
-        ::testing::Values(InferenceEngine::SizeVector({ 1, 16 })),
+        ::testing::ValuesIn(inputShapes),
         ::testing::Values(CommonTestUtils::DEVICE_GPU),
         ::testing::ValuesIn(params)),
     MatMulWithOptimizedConstantFakeQuantizeTransformation::getTestCaseName);

diff --git a/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp b/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp
index e76243a8361..15d5e671b8c 100644
--- a/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp
+++ b/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.hpp
@@ -20,7 +20,7 @@ public:
 typedef std::tuple<
     InferenceEngine::Precision,
-    InferenceEngine::SizeVector,
+    std::pair<InferenceEngine::SizeVector, InferenceEngine::SizeVector>,
     std::string,
     MatMulWithOptimizedConstantFakeQuantizeTransformationTestValues
 > MatMulWithOptimizedConstantFakeQuantizeTransformationTransformationParams;

diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp
index 6e39ca31f50..e8c2f256b31 100644
--- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp
+++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp
@@ -23,16 +23,16 @@ namespace LayerTestsDefinitions {
 std::string MatMulWithOptimizedConstantFakeQuantizeTransformation::getTestCaseName(
     testing::TestParamInfo<MatMulWithOptimizedConstantFakeQuantizeTransformationTransformationParams> obj) {
     InferenceEngine::Precision netPrecision;
-    InferenceEngine::SizeVector inputShape;
+    std::pair<InferenceEngine::SizeVector, InferenceEngine::SizeVector> shapes;
     std::string targetDevice;
     InferenceEngine::details::LayerTransformation::Params params;
     MatMulWithOptimizedConstantFakeQuantizeTransformationTestValues param;
-    std::tie(netPrecision, inputShape, targetDevice, param) = obj.param;
+    std::tie(netPrecision, shapes, targetDevice, param) = obj.param;
 
     std::ostringstream result;
     result << netPrecision.name() << "_" <<
-        CommonTestUtils::vec2str(inputShape) << "_" <<
+        CommonTestUtils::vec2str(shapes.first) << "_" << CommonTestUtils::vec2str(shapes.second) << "_" <<
         targetDevice << "_" <<
         param.fqOnData << "_" <<
         param.fqOnWeights;
@@ -43,15 +43,16 @@ void MatMulWithOptimizedConstantFakeQuantizeTransformation::SetUp() {
     threshold = 0.01f;
 
     InferenceEngine::Precision netPrecision;
-    InferenceEngine::SizeVector inputShape;
+    std::pair<InferenceEngine::SizeVector, InferenceEngine::SizeVector> shapes;
     InferenceEngine::details::LayerTransformation::Params params;
     MatMulWithOptimizedConstantFakeQuantizeTransformationTestValues param;
-    std::tie(netPrecision, inputShape, targetDevice, param) = this->GetParam();
+    std::tie(netPrecision, shapes, targetDevice, param) = this->GetParam();
 
     auto precision = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
     function = ngraph::builder::subgraph::MatMulWithOptimizedConstantFakeQuantizeFunction::getOriginal(
         precision,
-        inputShape,
+        shapes.first,
+        shapes.second,
         param.fqOnData,
         param.fqOnWeights);
 }

diff --git a/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp b/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp
index 83983c9be0f..8b008645546 100644
--- a/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp
+++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.hpp
@@ -16,7 +16,8 @@ class MatMulWithOptimizedConstantFakeQuantizeFunction {
 public:
     static std::shared_ptr<ngraph::Function> getOriginal(
         const ngraph::element::Type precision,
-        const ngraph::Shape& inputShape,
+        const ngraph::Shape& inputShape1,
+        const ngraph::Shape& inputShape2,
         const FakeQuantizeOnData& fqOnData,
         const FakeQuantizeOnData& fqOnWeights);
 };

diff --git a/inference-engine/tests/ngraph_functions/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.cpp b/inference-engine/tests/ngraph_functions/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.cpp
index ad571d24e09..6543e0d8456 100644
--- a/inference-engine/tests/ngraph_functions/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.cpp
+++ b/inference-engine/tests/ngraph_functions/src/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_function.cpp
@@ -13,34 +13,41 @@ namespace subgraph {
 std::shared_ptr<ngraph::Function> MatMulWithOptimizedConstantFakeQuantizeFunction::getOriginal(
     const ngraph::element::Type precision,
-    const ngraph::Shape& inputShape,
+    const ngraph::Shape& inputShape1,
+    const ngraph::Shape& inputShape2,
     const FakeQuantizeOnData& fqOnData,
     const FakeQuantizeOnData& fqOnWeights) {
-    const auto input = std::make_shared<ngraph::opset1::Parameter>(precision, ngraph::Shape(inputShape));
-    const auto fakeQuantizeOnActivations = fqOnData.empty() ?
-        nullptr :
-        ngraph::builder::makeFakeQuantize(
-            input, precision, fqOnData.quantizationLevel, fqOnData.constantShape,
-            fqOnData.inputLowValues, fqOnData.inputHighValues, fqOnData.outputLowValues, fqOnData.outputHighValues);
+    const auto input = std::make_shared<ngraph::opset1::Parameter>(precision, ngraph::Shape(inputShape1));
 
-    const ngraph::Shape weightsShape = { inputShape[1], 10 };
+    const auto lowConstantOnActivations = std::make_shared<ngraph::opset1::Constant>(precision, fqOnData.constantShape, fqOnData.inputLowValues);
+    const auto highConstantOnActivations = std::make_shared<ngraph::opset1::Constant>(precision, fqOnData.constantShape, fqOnData.inputHighValues);
+    const auto fakeQuantizeOnActivations = std::make_shared<ngraph::opset1::FakeQuantize>(
+        input,
+        lowConstantOnActivations,
+        highConstantOnActivations,
+        lowConstantOnActivations,
+        highConstantOnActivations,
+        fqOnWeights.quantizationLevel);
+
+    const ngraph::Shape weightsShape = { inputShape2[0], inputShape1[1] };
     const std::vector<float> weigths(weightsShape[0] * weightsShape[1], 10.f);
+    const auto weightsConst = std::make_shared<ngraph::opset1::Constant>(precision, weightsShape, weigths);
 
-    const auto lowConstant = std::make_shared<ngraph::opset1::Constant>(precision, fqOnWeights.constantShape, fqOnWeights.inputLowValues);
-    const auto highConstant = std::make_shared<ngraph::opset1::Constant>(precision, fqOnWeights.constantShape, fqOnWeights.inputHighValues);
+    const auto lowConstantOnWeights = std::make_shared<ngraph::opset1::Constant>(precision, fqOnWeights.constantShape, fqOnWeights.inputLowValues);
+    const auto highConstantOnWeights = std::make_shared<ngraph::opset1::Constant>(precision, fqOnWeights.constantShape, fqOnWeights.inputHighValues);
     const auto fakeQuantizeOnWeights = std::make_shared<ngraph::opset1::FakeQuantize>(
         weightsConst,
-        lowConstant,
-        highConstant,
-        lowConstant,
-        highConstant,
+        lowConstantOnWeights,
+        highConstantOnWeights,
+        lowConstantOnWeights,
+        highConstantOnWeights,
         fqOnWeights.quantizationLevel);
 
     const auto matMul = std::make_shared<ngraph::opset1::MatMul>(
-        fqOnData.empty() ? input : fakeQuantizeOnActivations,
+        fakeQuantizeOnActivations,
         fakeQuantizeOnWeights,
         false,
-        false);
+        inputShape1[1] != inputShape2[0]);
 
     ngraph::ResultVector results{ std::make_shared<ngraph::opset1::Result>(matMul) };
     return std::make_shared<ngraph::Function>(results, ngraph::ParameterVector{ input }, "MatMulWithOptimizedConstantFakeQuantizeFunction");