From cccec6942eb7c67c74ffca23dd8a1533f6361d32 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Tue, 30 Nov 2021 10:36:54 +0300 Subject: [PATCH] GNA Input/Output buffers reusage (#7332) * Init implementation # Conflicts: # thirdparty/ade * Switched to shared class * Refactoring memory commit() * Added unit tests * Fixed output order * Fixed input order * Fixed split case * Fixed compiling issue in debug mode * Enabled compact mode by default * Fixed default order for inputs and outputs * Changed unit test * Enabled compact mode by default * Reverted compact_mode flag order --- .../src/gna_plugin/descriptions/gna_flags.hpp | 2 +- .../src/gna_plugin/gna_graph_compiler.cpp | 166 +++++----- .../src/gna_plugin/gna_plugin.cpp | 15 +- .../src/gna_plugin/gna_plugin_log.hpp | 2 + .../gna_plugin/memory/gna_mem_requests.hpp | 50 ++- .../memory/gna_mem_requests_queue.hpp | 73 ++++- .../src/gna_plugin/memory/gna_memory.hpp | 291 ++++++++++++------ .../unit/gna/gna_memory_compact_test.cpp | 250 +++++++++++++++ .../unit}/gna/gna_memory_test.cpp | 160 +++++----- .../tests/unit/gna/gna_plugin_config_test.cpp | 2 +- src/inference/dev_api/memory_solver.hpp | 1 + 11 files changed, 741 insertions(+), 271 deletions(-) create mode 100644 inference-engine/tests/unit/gna/gna_memory_compact_test.cpp rename inference-engine/{tests_deprecated/unit/engines => tests/unit}/gna/gna_memory_test.cpp (67%) diff --git a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp index 79b10be7944..d15d526320d 100644 --- a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp +++ b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp @@ -10,7 +10,7 @@ namespace GNAPluginNS { struct GNAFlags { uint8_t gna_lib_async_threads_num = 1; - bool compact_mode = false; + bool compact_mode = true; bool exclusive_async_requests = false; bool uniformPwlDesign = false; float pwlMaxErrorPercent = 1.0f; diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index 6f8b455e9b8..6d29a360fd2 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -208,7 +208,7 @@ void GNAGraphCompiler::ConstPrimitive(InferenceEngine::CNNLayerPtr constLayer) connectOutput(constLayer, ptr_for_const_blob, const_blob->byteSize()); // TODO: segment type for bind, bind initializer not used - need refactor to separate bind and allocation requests // dont see practical use case when bind storage type need to be different that allocation type - gnamem->readonly().bind_initializer(ptr_for_const_blob, [const_blob](void* data, size_t size) { + gnamem->bind_initializer(nullptr, ptr_for_const_blob, [const_blob](void* data, size_t size) { ie_memcpy(data, size, const_blob->buffer(), const_blob->byteSize()); }); } @@ -475,7 +475,7 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP } if (num_conv_kernel_padding == 0) { - gnamem->readonly().push_local_ptr(ptr_weights, + gnamem->readonly().push_local_ptr(layer, ptr_weights, transposedWeights.data(), convolution._weights->byteSize(), 64); @@ -502,19 +502,19 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP offset += padding_zeros.size(); } }; - gnamem->readonly().push_initializer(ptr_weights, + gnamem->readonly().push_initializer(layer, ptr_weights, paddedWeightsSize, initializer, 64); } if (convolution._biases) { -
gnamem->readonly().push_ptr(ptr_biases, + gnamem->readonly().push_ptr(layer, ptr_biases, convolution._biases->cbuffer().as(), convolution._biases->byteSize(), 64); } else { - gnamem->readonly().push_value(ptr_biases, 0.0f, out_channels, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0.0f, out_channels, 64); } } @@ -600,7 +600,6 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP ptr_outputs, ptr_weights, ptr_biases); - currentComponent.num_bytes_per_input = inputs->getPrecision().size(); currentComponent.num_bytes_per_output = outputs->getPrecision().size(); @@ -647,18 +646,18 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP transposedWeights.resize(transposedWeights.size() + kernelPad); } - gnamem->readonly().push_local_ptr(ptr_weights, + gnamem->readonly().push_local_ptr(layer, ptr_weights, transposedWeights.data(), transposedWeights.size(), 64); if (convolution._biases) { - gnamem->readonly().push_ptr(ptr_biases, + gnamem->readonly().push_ptr(layer, ptr_biases, convolution._biases->cbuffer().as(), convolution._biases->byteSize(), 64); } else { - gnamem->readonly().push_value(ptr_biases, 0.0f, out_channels, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0.0f, out_channels, 64); } } #endif @@ -712,14 +711,13 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { ptr_weights, ptr_biases, true); - connectOutput(layer, ptr_outputs, num_data_bytes_out); connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0); if (gnaFlags->sw_fp32) { IE_ASSERT(quantized == nullptr); - gnamem->readonly().push_value(ptr_weights, power.scale, num_rows_out, 64); - gnamem->readonly().push_value(ptr_biases, power.offset, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_weights, power.scale, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, power.offset, num_rows_out, 64); } else { IE_ASSERT(quantized != nullptr); if (!gnaFlags->input_low_precision) { @@ -727,15 +725,15 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { static_cast(INT16_MAX))); auto quantizedOffset = FLOAT_TO_INT32(std::min(quantized->_dst_quant.GetScale() * power.offset, static_cast(INT32_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedScale, num_rows_out, 64); - gnamem->readonly().push_value(ptr_biases, quantizedOffset, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_weights, quantizedScale, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, quantizedOffset, num_rows_out, 64); } else { auto quantizedScale = FLOAT_TO_INT8(std::min(quantized->_weights_quant.GetScale() * power.scale, static_cast(INT8_MAX))); auto quantizedOffset = FLOAT_TO_INT8(std::min(quantized->_dst_quant.GetScale() * power.offset, static_cast(INT8_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedScale, num_rows_out, 64); - gnamem->readonly().push_value(ptr_biases, quantizedOffset, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_weights, quantizedScale, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, quantizedOffset, num_rows_out, 64); } } } else { @@ -799,12 +797,11 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { ptr_pwl_input, ptr_pwl_outputs, ptr_pwl_segments_target); - connectOutput(layer, ptr_pwl_outputs, num_data_bytes_out); connectInput(layer, ptr_pwl_input, num_data_bytes_in, 0, 0); if (ptr_pwl_segments_target != nullptr) { - 
gnamem->readonly().push_local_ptr(ptr_pwl_segments_target, + gnamem->readonly().push_local_ptr(layer, ptr_pwl_segments_target, &ptr_pwl_segments.front(), ptr_pwl_segments.size() * sizeof(gna_pwl_segment_t), 64); @@ -876,7 +873,6 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) { getScaleFactor(layer, QuantizedDataType::output), ptr_inputs, ptr_outputs); - size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims())) * outputs->getPrecision().size(); @@ -921,7 +917,6 @@ void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) { num_columns_out, ptr_inputs, ptr_outputs); - size_t num_data_bytes_out = ALIGN(InferenceEngine::details::product( begin(outputs->getDims()), end(outputs->getDims())), 8) * outputs->getPrecision().size(); @@ -933,7 +928,6 @@ void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) { void GNAGraphCompiler::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) { auto concatLayer = dynamic_cast (layer.get()); - if (concatLayer == nullptr) { return; } @@ -996,13 +990,10 @@ void GNAGraphCompiler::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) { auto layerInfo = LayerInfo(concatParent); // auto layerInfo = LayerInfo(getCreatorLayer(concatLayerInput->insData[it].lock()).lock()); if (layerInfo.isInput()) { - connectInput(layer, &concatLayerInfo.gna_ptr, - inputLayer.tensorSize, inputLayer.offset, idx, false); - + connectInput(layer, &concatLayerInfo.gna_ptr, inputLayer.tensorSize, inputLayer.offset, idx, false); concatLayerInfo.input_allocated = true; } else if (layerInfo.isMemory()) { connectInput(layer, &concatLayerInfo.gna_ptr, concatLayerInfo.reserved_size, inputLayer.offset, idx, false); - concatLayerInfo.input_allocated = true; } ++idx; @@ -1114,7 +1105,6 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { ptr_weights, ptr_biases, false); - size_t num_data_bytes_out = InferenceEngine::details::product( begin(outputs->getDims()), end(outputs->getDims())) * 4; @@ -1128,8 +1118,8 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { FillWeightOfAligningFilter(layer, ptr_weights, offset.front(), (quantized == nullptr) ? false : true); (quantized == nullptr) ? 
- gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64) : - gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out, 64) : + gnamem->readonly().push_value(layer, ptr_biases, 0, num_rows_out, 64); } } @@ -1249,7 +1239,6 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { ptr_weights, ptr_biases, true); - size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims())) * outputs->getPrecision().size(); @@ -1262,36 +1251,36 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { switch (eltwise._operation) { case EltwiseLayer::Sub: if (quantized == nullptr) { - gnamem->readonly().push_value(ptr_weights, -1.0f, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_weights, -1.0f, num_rows_out, 64); } else { auto scaledIdentity = -quantized->_weights_quant.GetScale(); if (gnaFlags->input_low_precision == false) { auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_weights, quantizedIdentity, num_rows_out, 64); } else { auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast(INT8_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_weights, quantizedIdentity, num_rows_out, 64); } } connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx); break; case EltwiseLayer::Sum: if (quantized == nullptr) { - gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_weights, 1.0f, num_rows_out, 64); } else { auto scaledIdentity = quantized->_weights_quant.GetScale(); if (gnaFlags->input_low_precision == false) { auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_weights, quantizedIdentity, num_rows_out, 64); } else { auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast(INT8_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_weights, quantizedIdentity, num_rows_out, 64); } } connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx); @@ -1299,12 +1288,12 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { case EltwiseLayer::Prod: if (quantized == nullptr) { - gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out, 64); } else { if (gnaFlags->input_low_precision == false) { - gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0, num_rows_out, 64); } else { - gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0, num_rows_out, 64); } } connectInput(layer, ptr_weights, num_data_bytes_in, 0, biasesLayerIdx); @@ -1372,9 +1361,9 @@ void GNAGraphCompiler::GemmPrimitive(InferenceEngine::CNNLayerPtr layer) { connectInput(layer, ptr_input_2, num_data_bytes_in_2, 0, 1); if (gnaFlags->sw_fp32) { IE_ASSERT(quantized == nullptr); - gnamem->readonly().push_value(ptr_biases, 0.0f, 
num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out, 64); } else { - gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out, 64); } } @@ -1485,12 +1474,12 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool if (num_padding == 0) { if (!transpose) { - gnamem->readonly().push_ptr(ptr_weights, + gnamem->readonly().push_ptr(layer, ptr_weights, weightable._weights->cbuffer().as(), weightable._weights->byteSize(), 64); } else { - gnamem->readonly().push_initializer(ptr_weights, weightable._weights->byteSize(), [=](void* data, size_t size) { + gnamem->readonly().push_initializer(layer, ptr_weights, weightable._weights->byteSize(), [=](void* data, size_t size) { for (uint32_t k = 0; k < (isDiag ? 1 : num_rows_out); k++) { auto rowOffset = k * transposedRows * transposedCols * weightable.precision.size(); auto cbuffer = weightable._weights->cbuffer().as() + rowOffset; @@ -1519,7 +1508,7 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool auto paddedWeights = isDiag ? elementsIn : elementsIn * num_rows_out; auto paddedWeightsSize = paddedWeights * weightable.precision.size(); - gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void* data, size_t size) { + gnamem->readonly().push_initializer(layer, ptr_weights, paddedWeightsSize, [=](void* data, size_t size) { for (uint32_t i = 0; i < (isDiag ? 1 : num_rows_out); i++) { ie_memcpy(data, size, weightable._weights->cbuffer().as() + num_rows_in * i * weightable.precision.size(), @@ -1530,16 +1519,16 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool } if (weightable._biases) { - gnamem->readonly().push_ptr(ptr_biases, + gnamem->readonly().push_ptr(layer, ptr_biases, weightable._biases->cbuffer().as(), weightable._biases->byteSize(), 64); } else { // in that case input from previous layer goes into biases, so we have to initialize input pointer by zero if (useBiasConnection) { - gnamem->readonly().push_value(ptr_inputs, 0.0f, num_rows_in + num_padding, 64); + gnamem->readonly().push_value(layer, ptr_inputs, 0.0f, num_rows_in + num_padding, 64); } else { - gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out + num_padding_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out + num_padding_out, 64); } } } @@ -1557,7 +1546,7 @@ void GNAGraphCompiler::FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr l THROW_GNA_EXCEPTION << "Weights memory is not allocated!!!"; } - gnamem->readonly().push_initializer(ptrWeights, num_rows_out * ALIGN(num_rows_in, 8) * layer->precision.size(), [=](void* data, size_t size) { + gnamem->readonly().push_initializer(layer, ptrWeights, num_rows_out * ALIGN(num_rows_in, 8) * layer->precision.size(), [=](void* data, size_t size) { int out = 0; for (int input = offset; input < num_rows_out + offset; ++input) { auto mem_ptr = reinterpret_cast(data) + input * layer->precision.size() + out * ALIGN(num_rows_in, 8) * layer->precision.size(); @@ -1624,7 +1613,6 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l ptr_inputs, ptr_outputs); - size_t num_data_bytes_in = num_rows_copied * num_rows_copied * num_columns_in * inputs->getPrecision().size(); // need to reserve full tensor so using original size with assumption of identity activation attached to filter lateron @@ -1681,7 +1669,7 @@ void 
GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l size_t weights_stride = (num_rows_in + num_rows_copied) * weightsElementSize; size_t weights_offset = weights_stride * num_rows_copied + num_rows_copied * weightsElementSize; - gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void* data, size_t size) { + gnamem->readonly().push_initializer(layer, ptr_weights, paddedWeightsSize, [=](void* data, size_t size) { size_t roffset = weights_offset; size_t woffset = 0; for (int i = 0; i < num_rows_out && size >= woffset; i++) { @@ -1696,12 +1684,12 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l } if (filterLayer->_biases) { - gnamem->readonly().push_ptr(ptr_biases, + gnamem->readonly().push_ptr(layer, ptr_biases, filterLayer->_biases->cbuffer().as(), filterLayer->_biases->byteSize(), 64); } else { - gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out, 64); } } @@ -1774,18 +1762,18 @@ void GNAGraphCompiler::ConvolutionFilterPrimitive(InferenceEngine::CNNLayerPtr l connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0); connectOutput(layer, ptr_outputs, num_data_bytes_out); - gnamem->readonly().push_ptr(ptr_weights, + gnamem->readonly().push_ptr(layer, ptr_weights, filterLayer->_weights->cbuffer().as(), filterLayer->_weights->byteSize(), 64); if (filterLayer->_biases) { - gnamem->readonly().push_ptr(ptr_biases, + gnamem->readonly().push_ptr(layer, ptr_biases, filterLayer->_biases->cbuffer().as(), filterLayer->_biases->byteSize(), 64); } else { - gnamem->readonly().push_value(ptr_biases, 0.0f, numberOfFilters, 64); + gnamem->readonly().push_value(layer, ptr_biases, 0.0f, numberOfFilters, 64); } } @@ -2016,7 +2004,7 @@ case name:\ connectOutput(layer, ptr_outputs, num_data_bytes_out); if (ptr_pwl_segments_target != nullptr) { - gnamem->readonly().push_local_ptr(ptr_pwl_segments_target, + gnamem->readonly().push_local_ptr(layer, ptr_pwl_segments_target, &ptr_pwl_segments.front(), ptr_pwl_segments.size() * sizeof(gna_pwl_segment_t), 64); @@ -2152,8 +2140,9 @@ void GNAGraphCompiler::CreateLayerPrimitive(CNNLayerPtr layer) { } } -void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, - size_t num_data_bytes_out) { +void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, + void *ptr, + size_t num_data_bytes_out) { auto getOffsetForBinding = [](InferenceEngine::CNNLayerPtr layer) { int32_t output_offset = 0; if (layer->params.find("output_offset") != layer->params.end()) { @@ -2162,7 +2151,6 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p return output_offset; }; - gnalog() << "Connecting output " << layer->name << " ...\n"; // in case of Memory Layer it's input allocated in meminput layer if (layer->outData.size() == 1) { @@ -2179,7 +2167,6 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p if (!nextLayer.first) { gnalog() << "for layer: " << layer->name << "outData[0] has non functional connection at " << j; } - auto nextMemoryLayerIt = std::find_if(begin(memory_connection), end(memory_connection), [&](MemoryConnection::value_type &comp) { @@ -2190,14 +2177,13 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p // memory layer not yet initialized if (nextMemoryLayer.reserved_size == 0) { auto memorySize = InferenceEngine::details::product(nextMemoryLayer.getDims()) * 
nextMemoryLayer.elementSizeBytes(); - - gnamem->reserve_ptr(&nextMemoryLayer.gna_ptr, ALIGN64(memorySize), 64); - gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, getOffsetForBinding(layer)); + gnamem->reserve_ptr(nullptr, &nextMemoryLayer.gna_ptr, ALIGN64(memorySize), 64); + gnamem->bind_ptr(nullptr, ptr, &nextMemoryLayer.gna_ptr, getOffsetForBinding(layer)); nextMemoryLayer.reserved_size = ALIGN64(memorySize); } else { // We may need to extend memory buffer if connected input size is bigger, for example for concat connection - gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, getOffsetForBinding(layer), ALIGN64(num_data_bytes_out)); + gnamem->bind_ptr(nullptr, ptr, &nextMemoryLayer.gna_ptr, getOffsetForBinding(layer), ALIGN64(num_data_bytes_out)); } return; } @@ -2288,7 +2274,7 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p return it != concatItem.second.concatInputLayers.end(); }); if (included == concat_connection.end()) { - gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size), 64); + gnamem->reserve_ptr(layer, &concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size), 64); std::function allocate_input_recursively = [&allocate_input_recursively](GNAConcatLayer clayer, GNAPluginNS::InputDesc& inputDesc, ConcatConnection& concat_connection) { @@ -2321,26 +2307,24 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p if (layer->params.find("output_offset") != layer->params.end()) { output_offset = layer->GetParamAsInt("output_offset"); } - gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, output_offset); + gnamem->bind_ptr(layer, ptr, &concatLayerInfoItem.gna_ptr, output_offset); } return; } } - intel_dnn_component_t * unused_input = nullptr; - if (gnaFlags->compact_mode) { - unused_input = find_first_unused_input(layer); - if (unused_input != nullptr) { - gnamem->bind_ptr(ptr, &unused_input->ptr_inputs, 0, ALIGN64(num_data_bytes_out)); - } - } - // cannot reuse suitable input - if (unused_input == nullptr) { - gnamem->reserve_ptr(ptr, ALIGN64(num_data_bytes_out), 64); - } + auto nextLayer = CNNNetCheckNextLayerSkipCertain(layer, 0, 0, true, + [](CNNLayerPtr l) { return LayerInfo(l).isNonFunctional(); }).first; + // Check that layer will be an output + gnamem->reserve_ptr((LayerInfo(layer).isOutput() || !nextLayer) ? 
nullptr : layer, ptr, ALIGN64(num_data_bytes_out), 64); } -GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, int32_t offset, int idx, bool connectTo) { +GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, + void *ptr, + size_t num_data_bytes_in, + int32_t offset, + int idx, + bool connectTo) { // selecting particular input layers // auto prevLayer = CNNNetPrevLayer(layer, idx); auto prevLayer = CNNNetPrevLayerSkipCertain(layer, idx, [](CNNLayerPtr l) { @@ -2363,12 +2347,12 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, // real allocation pointer will be kept in ptr not in ptr_inputs_global if (!connectTo) { - gnamem->push_value(ptr, + gnamem->push_value(nullptr, ptr, static_cast(0), num_data_bytes_in, 64); } else { - gnamem->push_value(&inputDesc->getPtrInputsGlobal(prevLayer->name).front(), + gnamem->push_value(nullptr, &inputDesc->getPtrInputsGlobal(prevLayer->name).front(), static_cast(0), num_data_bytes_in, 64); @@ -2384,9 +2368,9 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, } if (connectTo) { - gnamem->bind_ptr(ptr, &inputDesc->getPtrInputsGlobal(prevLayer->name).front(), offset, ALIGN(num_data_bytes_in, 64)); + gnamem->bind_ptr(nullptr, ptr, &inputDesc->getPtrInputsGlobal(prevLayer->name).front(), offset, ALIGN(num_data_bytes_in, 64)); } else { - gnamem->bind_ptr(&inputDesc->getPtrInputsGlobal(prevLayer->name).front(), ptr, offset, ALIGN(num_data_bytes_in, 64)); + gnamem->bind_ptr(nullptr, &inputDesc->getPtrInputsGlobal(prevLayer->name).front(), ptr, offset, ALIGN(num_data_bytes_in, 64)); } return prevLayer; @@ -2394,9 +2378,9 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, // const input if (LayerInfo(prevLayer).isConst()) { if (connectTo) { - gnamem->bind_ptr(ptr, const_connections[prevLayer->name], offset); + gnamem->bind_ptr(layer, ptr, const_connections[prevLayer->name], offset); } else { - gnamem->bind_ptr(const_connections[prevLayer->name], ptr, offset); + gnamem->bind_ptr(layer, const_connections[prevLayer->name], ptr, offset); } return prevLayer; @@ -2423,6 +2407,8 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, if (it != splitLayerInfoItem.splitOutputLayers.end()) { gnalog() << "Connecting " << splitName << " input \n"; + // splitting layer should take the execution order from the connected layer + splittingLayer->userValue = layer->userValue; auto res = connectInput(splittingLayer, ptr, splitLayerInfoItem.reserved_size, it->offset + offset, 0); gnalog() << "Connected \n"; return res; @@ -2435,7 +2421,7 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, if (concatLayerInfo != concat_connection.end()) { auto & concatLayerInfoItem = concatLayerInfo->second; // dnnLayer that is input for concat layer - gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, offset); + gnamem->bind_ptr(layer, ptr, &concatLayerInfoItem.gna_ptr, offset); // return layer over concat return CNNNetPrevLayer(prevLayer); } @@ -2444,7 +2430,7 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, prevLayer->name); if (cropLayerInfo != crop_connection.end()) { auto & cropLayerInfoItem = cropLayerInfo->second; - gnamem->bind_ptr(ptr, &cropLayerInfoItem.gna_ptr, offset); + gnamem->bind_ptr(layer, ptr, &cropLayerInfoItem.gna_ptr, offset); return CNNNetPrevLayer(prevLayer); } } @@ -2452,7 +2438,7 @@ 
GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, // check for generic prev layer if (prevDnnLayer != nullptr) { - gnamem->bind_ptr(ptr, &prevDnnLayer->ptr_outputs, offset); + gnamem->bind_ptr(layer, ptr, &prevDnnLayer->ptr_outputs, offset); return prevLayer; } @@ -2470,20 +2456,20 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, // connectTo used for indicate that memory layer should be bound to given buffer if (connectTo) { memorySize = std::max(memorySize, num_data_bytes_in); - gnamem->reserve_ptr(&memoryLayer.gna_ptr, ALIGN64(memorySize), 64); - gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, offset); + gnamem->reserve_ptr(nullptr, &memoryLayer.gna_ptr, ALIGN64(memorySize), 64); + gnamem->bind_ptr(nullptr, ptr, &memoryLayer.gna_ptr, offset); } else { if (num_data_bytes_in < memorySize + offset) { THROW_GNA_LAYER_EXCEPTION(layer) <<" invalid allocation request of " << num_data_bytes_in << " is more then state tensor size of: " << memorySize + offset; } - gnamem->bind_ptr(&memoryLayer.gna_ptr, ptr, offset); + gnamem->bind_ptr(nullptr, &memoryLayer.gna_ptr, ptr, offset); } memoryLayer.reserved_size = ALIGN64(memorySize); } else { // We may need to extend memory buffer if connected input size is bigger, for example for concat connection - gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, offset, ALIGN64(num_data_bytes_in)); + gnamem->bind_ptr(nullptr, ptr, &memoryLayer.gna_ptr, offset, ALIGN64(num_data_bytes_in)); } return prevLayer; diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index 974a2a2130b..7c6dd757fea 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -524,7 +525,7 @@ bool GNAPlugin::TryToInitOutput(int portId, InferenceEngine::CNNLayerPtr layer) desc.num_elements = numElem; // binding ptr for first infer request - then others will be setup during relocation - gnamem->bind_ptr(&desc.ptrs.front(), outputPtr); + gnamem->bind_ptr(layer, &desc.ptrs.front(), outputPtr); }; // probing gna_primitives @@ -927,7 +928,11 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { } // Creating Layer primitives + uint16_t id = 0; for (auto & layer : sortedNoMem) { + IE_SUPPRESS_DEPRECATED_START + layer->userValue.v_int = id++; + IE_SUPPRESS_DEPRECATED_END graphCompiler.CreateLayerPrimitive(layer); } @@ -981,7 +986,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { // TODO: how active list will work in multioutput case // make room for active list - gnamem->reserve_ptr(nullptr, + gnamem->reserve_ptr(nullptr, nullptr, ALIGN64(outputsDesc.front().num_bytes_per_element * outputsDesc.front().num_elements), 64); void *pParallelExecutionData = nullptr; @@ -989,10 +994,10 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { // reserving more bytes for intermediate data in parallel case - TODO: this works incorrectly in compact mode at lest rwSegmentSize = gnamem->getRWBytes(); if (gnaFlags->gna_lib_async_threads_num > 1) { - gnamem->reserve_ptr(&pParallelExecutionData, gnamem->getRWBytes() * (gnaFlags->gna_lib_async_threads_num - 1), 64); + gnamem->reserve_ptr(nullptr, &pParallelExecutionData, gnamem->getRWBytes() * (gnaFlags->gna_lib_async_threads_num - 1), 64); } - gnamem->commit(); + gnamem->commit(gnaFlags->compact_mode); dnn->Init(gnamem->getBasePtr(), gnamem->getTotalBytes(), @@ -1569,7 +1574,7 @@ 
InferenceEngine::IExecutableNetworkInternal::Ptr GNAPlugin::ImportNetwork(std::i graphCompiler.setGNAMemoryPtr(gnamem); void *basePtr = nullptr; - gnamem->reserve_ptr(&basePtr, header.gnaMemSize); + gnamem->reserve_ptr(nullptr, &basePtr, header.gnaMemSize); gnamem->commit(); #if GNA_LIB_VER == 2 gnaModels.push_back(std::make_tuple(make_shared>(header.layersCount))); diff --git a/inference-engine/src/gna_plugin/gna_plugin_log.hpp b/inference-engine/src/gna_plugin/gna_plugin_log.hpp index de3b9dec8f9..6e807b6ecbc 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_log.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_log.hpp @@ -14,6 +14,8 @@ * @brief used for creating graphviz charts, and layers dump */ # define PLOT +# define MODEL_DUMP +# define GNA_HEAP_PROFILER # define gnalog() std::cout # define gnawarn() std::cerr #else diff --git a/inference-engine/src/gna_plugin/memory/gna_mem_requests.hpp b/inference-engine/src/gna_plugin/memory/gna_mem_requests.hpp index 88fc8a0278d..6332981e3f6 100644 --- a/inference-engine/src/gna_plugin/memory/gna_mem_requests.hpp +++ b/inference-engine/src/gna_plugin/memory/gna_mem_requests.hpp @@ -8,6 +8,8 @@ #include #include +#include "gna_plugin_log.hpp" + namespace GNAPluginNS { namespace memory { @@ -26,6 +28,45 @@ enum rRegion { REGION_AUTO, }; +#ifdef GNA_HEAP_PROFILER +inline const char* rRegionToStr(uint8_t region) { + const char* strRegion = "UNKNOWN"; + switch (region) { + case REGION_RO: + strRegion = "REGION_RO"; + break; + case REGION_RW: + strRegion = "REGION_RW"; + break; + case REGION_AUTO: + strRegion = "REGION_AUTO"; + break; + } + return strRegion; +} + +inline const char* rTypeToStr(uint8_t type) { + const char* strType = "UNKNOWN"; + switch (type) { + case REQUEST_STORE: + strType = "REQUEST_STORE"; + break; + case REQUEST_ALLOCATE: + strType = "REQUEST_ALLOCATE"; + break; + case REQUEST_BIND: + strType = "REQUEST_BIND"; + break; + case REQUEST_INITIALIZER | REQUEST_STORE: + case REQUEST_INITIALIZER | REQUEST_ALLOCATE: + case REQUEST_INITIALIZER | REQUEST_BIND: + strType = "INITIALIZER"; + break; + } + return strType; +} +#endif + struct MemRequest { rRegion _region; uint8_t _type; @@ -40,6 +81,10 @@ struct MemRequest { size_t _offset = 0; // expansion in bytes due to large depended layers size_t _padding = 0; + + // fields to sort regions by execution availability + std::pair _life_limits{0, UINT16_MAX}; + MemRequest(rRegion region, rType req, void *ptr_out, @@ -79,7 +124,8 @@ struct MemRequest { _data.resize(sizeof(T)); std::copy(reinterpret_cast(&element), reinterpret_cast(&element) + sizeof(T), _data.begin()); } -/** + + /** * Store initializer request * @param req * @param ptr_out @@ -103,4 +149,4 @@ struct MemRequest { } }; } // namespace memory -} // namespace GNAPluginNS \ No newline at end of file +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/memory/gna_mem_requests_queue.hpp b/inference-engine/src/gna_plugin/memory/gna_mem_requests_queue.hpp index 0faaa922e1c..b18ea9b0da8 100644 --- a/inference-engine/src/gna_plugin/memory/gna_mem_requests_queue.hpp +++ b/inference-engine/src/gna_plugin/memory/gna_mem_requests_queue.hpp @@ -8,10 +8,23 @@ #include #include #include + +#include +#include #include "gna_mem_requests.hpp" namespace GNAPluginNS { namespace memory { + +/** +* @brief get layer id from legacy CNNLayer +*/ +inline uint16_t getCNNLayerId(InferenceEngine::CNNLayerPtr layer) { + IE_SUPPRESS_DEPRECATED_START + return layer->userValue.v_int; + IE_SUPPRESS_DEPRECATED_END +} + /** * Adapter 
for requests submission and actual request queue */ @@ -26,12 +39,26 @@ public: * @param num_bytes * @param alignment */ - void push_initializer(void *ptr_out, size_t num_bytes, std::function initializer, size_t alignment = 1) { + void push_initializer(InferenceEngine::CNNLayerPtr layer, + void *ptr_out, + size_t num_bytes, + std::function initializer, + size_t alignment = 1) { futureHeap().push_back({regionType(), ptr_out, num_bytes, initializer, REQUEST_INITIALIZER, alignment}); + if (layer != nullptr) { + futureHeap().back()._life_limits = {0, getCNNLayerId(layer)}; + } } - void push_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) { + void push_ptr(InferenceEngine::CNNLayerPtr layer, + void *ptr_out, + const void *ptr_in, + size_t num_bytes, + size_t alignment = 1) { futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, ptr_in, 1, num_bytes, alignment}); + if (layer != nullptr) { + futureHeap().back()._life_limits = {0, getCNNLayerId(layer)}; + } } /** @@ -40,10 +67,17 @@ public: * @param ptr_in * @param num_bytes */ - void push_local_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) { + void push_local_ptr(InferenceEngine::CNNLayerPtr layer, + void *ptr_out, + const void *ptr_in, + size_t num_bytes, + size_t alignment = 1) { localStorage().emplace_back(reinterpret_cast(ptr_in), reinterpret_cast(ptr_in) + num_bytes); futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, &localStorage().back().front(), 1, num_bytes, alignment}); + if (layer != nullptr) { + futureHeap().back()._life_limits = {0, getCNNLayerId(layer)}; + } } /** @@ -51,8 +85,14 @@ public: * @param ptr_out * @param num_bytes */ - void reserve_ptr(void *ptr_out, size_t num_bytes, size_t alignment = 1) { + void reserve_ptr(InferenceEngine::CNNLayerPtr layer, + void *ptr_out, + size_t num_bytes, + size_t alignment = 1) { futureHeap().push_back({regionType(), REQUEST_ALLOCATE, ptr_out, nullptr, 1, num_bytes, alignment}); + if (layer != nullptr) { + futureHeap().back()._life_limits = {getCNNLayerId(layer), getCNNLayerId(layer)}; + } } /** @@ -63,8 +103,15 @@ public: * @param num_bytes - bind can request for bigger buffer that originally allocated via reserve(), * if that happens - reserved request parameters will be updated before committing memory */ - void bind_ptr(void *source, const void *dest, size_t offset = 0, size_t num_bytes = 0) { + void bind_ptr(InferenceEngine::CNNLayerPtr layer, + void *source, + const void *dest, + size_t offset = 0, + size_t num_bytes = 0) { futureHeap().push_back({regionType(), REQUEST_BIND, source, dest, 1, num_bytes, 1, offset}); + if (layer != nullptr) { + futureHeap().back()._life_limits = {getCNNLayerId(layer), getCNNLayerId(layer)}; + } } /** @@ -72,16 +119,28 @@ public: * @param ptr_out - previously requested buffer * @param initializer - initialisation routine to be called on allocated memory */ - void bind_initializer(void *ptr_out, std::function initializer) { + void bind_initializer(InferenceEngine::CNNLayerPtr layer, + void *ptr_out, + std::function initializer) { futureHeap().push_back({regionType(), ptr_out, 0, initializer, REQUEST_BIND, 1}); + if (layer != nullptr) { + futureHeap().back()._life_limits = {0, getCNNLayerId(layer)}; + } } /** * @brief allocates buffer and set all its values to T value */ template - void push_value(void *ptr_out, T value, size_t num_elements, size_t alignment = 1) { + void push_value(InferenceEngine::CNNLayerPtr layer, + void *ptr_out, + T value, + size_t num_elements, + size_t 
alignment = 1) { futureHeap().push_back({regionType(), ptr_out, value, num_elements, alignment}); + if (layer != nullptr) { + futureHeap().back()._life_limits = {0, getCNNLayerId(layer)}; + } } /** diff --git a/inference-engine/src/gna_plugin/memory/gna_memory.hpp b/inference-engine/src/gna_plugin/memory/gna_memory.hpp index cc52398b95f..8219f7918fa 100644 --- a/inference-engine/src/gna_plugin/memory/gna_memory.hpp +++ b/inference-engine/src/gna_plugin/memory/gna_memory.hpp @@ -13,7 +13,15 @@ #include #include #include +#include #include "gna_lib_ver_selector.hpp" +#include "memory_solver.hpp" +#include "gna_plugin_log.hpp" + +#ifdef GNA_HEAP_PROFILER +#include +#include +#endif namespace GNAPluginNS { namespace memory { @@ -32,6 +40,7 @@ class GNAMemory : public GNAMemRequestsQueue { Allocator _allocator; std::shared_ptr<uint8_t> heap = nullptr; size_t _page_alignment = 1; + bool _is_compact_mode = false; class GNAMemRequestsReadOnlyQueue : public GNAMemRequestsQueue { std::reference_wrapper<GNAMemory<Allocator>> _that; @@ -62,93 +71,32 @@ class GNAMemory : public GNAMemRequestsQueue { return readOnlyFrontEnd; } + /** + * @brief enables memory optimization (compact mode). This mode can be enabled in plugin configuration (COMPACT_MODE = Yes) + */ + void setCompactMode(bool isCompact) { + _is_compact_mode = isCompact; + } + /** * @brief calculates size required for all requests, allocates memory and updates pointers */ - void commit() { + void commit(bool isCompact = false) { + setCompactMode(isCompact); + // 1st stage -- looking for expandable bind requests: - for (auto &originated : _future_heap) { - if (originated._type & REQUEST_BIND) continue; - size_t offset = 0; - iterate_binded(originated, [&](MemRequest & reference, MemRequest & binded) { - if (&originated == &reference) { - offset = 0; - } - offset += binded._offset; - auto current = offset + ALIGN(binded._num_elements * binded._element_size, binded._alignment); - auto original_no_pad = ALIGN(originated._num_elements * originated._element_size, originated._alignment); - auto original_with_pad = ALIGN(originated._num_elements * originated._element_size + originated._padding, originated._alignment); + expandBindings(); - originated._padding = ALIGN(std::max(original_with_pad, current), originated._alignment) - original_no_pad; - }); - } + // 2nd stage -- setup offsets: + setRegionOffsets(REGION_RO); + setRegionOffsets(REGION_RW); - updateSectionsSizes(); + // 3rd stage -- allocate total memory, setting it to 0 internally + heap = allocate(getTotalBytes()); - _total = _rw_section_size + _ro_section_size; - - // allocation with memory setting to 0 internally - heap = allocate(_total); auto setupOffsets = [&](std::function<bool(MemRequest & request)> filter, size_t offset) { - for (auto &re : _future_heap) { - if (re._type == REQUEST_BIND) continue; - if (filter(re)) continue; - - auto sz = re._element_size * re._num_elements; - - if (re._ptr_out != nullptr) { - auto cptr = heap.get() + offset; - size_t cptr_avail_size = _total - offset; - if (re._type & REQUEST_BIND) { - cptr = reinterpret_cast<uint8_t *>(*reinterpret_cast<void **>(re._ptr_out)); - cptr_avail_size = sz; - } else { - *reinterpret_cast<void **>(re._ptr_out) = cptr; - } - // std::cout << "ALLOCATED=" << cptr << ", size=" << re._element_size * re._num_elements << "\n"; - iterate_binded(re, [](MemRequest & reference, MemRequest & binded) { - *reinterpret_cast<uint8_t **>(binded._ptr_out) = - binded._offset + reinterpret_cast<uint8_t *>(*reinterpret_cast<uint8_t **>(reference._ptr_out)); - binded._num_elements = reference._num_elements; - binded._element_size = reference._element_size; - }); - -
// std::cout << "size=" << ALIGN(sz, re._alignment) << "\n" << std::flush; - - switch (re._type & ~REQUEST_BIND) { - case REQUEST_ALLOCATE : - break; - case REQUEST_STORE : { - if (re._ptr_in != nullptr) { - ie_memcpy(cptr, cptr_avail_size, re._ptr_in, sz); - } else { - size_t of = 0; - for (int i = 0; i < re._num_elements; i++, of += re._element_size) { - std::copy(std::begin(re._data), std::end(re._data), cptr + of); - } - } - break; - } - case REQUEST_INITIALIZER : { - re._initializer(cptr, sz); - break; - } - } - } - if (!(re._type & REQUEST_BIND)) { - offset += ALIGN(sz + re._padding, re._alignment); - } - } - }; - - setupOffsets([](GNAPluginNS::memory::MemRequest & request) { - // TODO: consume bind requests separately from storage type - return !(request._type & REQUEST_BIND) && (request._region != REGION_RW); - }, 0); - - setupOffsets([](GNAPluginNS::memory::MemRequest & request) { - return (request._type & REQUEST_BIND) || request._region != REGION_RO; - }, _rw_section_size); + // 4th stage -- store data and updates pointers + allocateRegion(REGION_RW, 0); + allocateRegion(REGION_RO, _rw_section_size); } void *getBasePtr() { @@ -180,7 +128,7 @@ class GNAMemory : public GNAMemRequestsQueue { void iterate_binded(GNAPluginNS::memory::MemRequest & reference, const T & visitor) { for (auto &re : _future_heap) { if ((re._type & REQUEST_BIND) && (re._ptr_in == reference._ptr_out)) { - // std::cout << " [binded=" << re._type << ", ptr=" << re._ptr_out <<"]\n"; + // std::cout << " [binded=" << rTypeToStr(re._type) << ", ptr=" << re._ptr_out <<"]\n"; visitor(reference, re); // primitive loop check if (re._ptr_in == re._ptr_out) continue; @@ -190,7 +138,6 @@ class GNAMemory : public GNAMemRequestsQueue { } } - std::shared_ptr allocate(size_t bytes) { std::shared_ptr sp(_allocator.allocate(bytes), [=](uint8_t *p) { _allocator.deallocate(p, bytes); @@ -200,31 +147,191 @@ class GNAMemory : public GNAMemRequestsQueue { } protected: + /** + * @brief expand BIND and (BIND | ) requests. 
Align size(_padding), set execution order + */ + void expandBindings() { + for (auto &originated : _future_heap) { + // skipping bind requests to avoid duplications + if (originated._type & REQUEST_BIND) continue; + + size_t offset = 0; + iterate_binded(originated, [&](MemRequest & reference, MemRequest & binded) { + // aligning sizes + if (&originated == &reference) offset = 0; + + offset += binded._offset; + auto current = offset + ALIGN(binded._num_elements * binded._element_size, binded._alignment); + auto original_no_pad = ALIGN(originated._num_elements * originated._element_size, originated._alignment); + auto original_with_pad = ALIGN(originated._num_elements * originated._element_size + originated._padding, originated._alignment); + + originated._padding = ALIGN(std::max(original_with_pad, current), originated._alignment) - original_no_pad; + + // set execution order + originated._life_limits.first = std::min(originated._life_limits.first, binded._life_limits.first); + originated._life_limits.second = std::max(originated._life_limits.second, binded._life_limits.second); + }); + } + } + + /** + * @brief set offsets for specific region + */ + size_t setRegionOffsets(GNAPluginNS::memory::rRegion regType) { + size_t region_offset = 0; + for (auto &re : _future_heap) { + if (re._region != regType || re._type & REQUEST_BIND || re._ptr_out == nullptr) continue; + + re._offset = region_offset; + region_offset += ALIGN(re._num_elements * re._element_size + re._padding, re._alignment); + } + return region_offset; + } + + /** + * @brief allocates memory and updates pointers + */ + void allocateRegion(GNAPluginNS::memory::rRegion regType, size_t baseOffset) { + for (auto &re : _future_heap) { + // skipping Bind, crossregion and empty requests + if (re._region != regType || re._type == REQUEST_BIND || re._ptr_out == nullptr) continue; + + size_t offset = baseOffset + re._offset; + auto cptr = heap.get() + offset; + size_t cptr_avail_size = _total - offset; + + auto sz = re._element_size * re._num_elements; + if (re._type & REQUEST_BIND) { + cptr = reinterpret_cast(*reinterpret_cast(re._ptr_out)); + cptr_avail_size = sz; + } else { + *reinterpret_cast(re._ptr_out) = cptr; + } + iterate_binded(re, [](MemRequest & reference, MemRequest & binded) { + *reinterpret_cast(binded._ptr_out) = + binded._offset + reinterpret_cast(*reinterpret_cast(reference._ptr_out)); + binded._num_elements = reference._num_elements; + binded._element_size = reference._element_size; + }); + + switch (re._type & ~REQUEST_BIND) { + case REQUEST_ALLOCATE : + break; + case REQUEST_STORE : { + if (re._ptr_in != nullptr) { + ie_memcpy(cptr, cptr_avail_size, re._ptr_in, sz); + } else { + size_t of = 0; + for (int i = 0; i < re._num_elements; i++, of += re._element_size) { + std::copy(std::begin(re._data), std::end(re._data), cptr + of); + } + } + break; + } + case REQUEST_INITIALIZER : { + re._initializer(cptr, sz); + break; + } + } + } + } + + /** + * @brief optimize memory region by reusing buffers + */ + size_t getSectionSizeOptimized(GNAPluginNS::memory::rRegion regType) { + size_t memSize = 0; + switch (regType) { + case REGION_AUTO: + case REGION_RW: + case REGION_RO: { + std::vector boxes; + for (size_t i = 0; i < _future_heap.size(); ++i) { + // skipping BIND, cross-region and empty requests + if (_future_heap[i]._type & REQUEST_BIND || _future_heap[i]._region != regType || _future_heap[i]._ptr_out == nullptr) { + continue; + } + + auto original_with_pad = ALIGN(_future_heap[i]._num_elements * 
_future_heap[i]._element_size + _future_heap[i]._padding, + _future_heap[i]._alignment); + int start = _future_heap[i]._life_limits.first; + int stop = _future_heap[i]._life_limits.second; + + boxes.push_back({start, stop, static_cast<int64_t>(original_with_pad), static_cast<int64_t>(i)}); + } + MemorySolver memSolver(boxes); + memSize = memSolver.solve(); + + // setting offsets + for (auto const & box : boxes) { + _future_heap[box.id]._offset = memSolver.getOffset(box.id); + } + } + break; + + default: + break; + } + + return memSize; + } + + +#ifdef GNA_HEAP_PROFILER + void memoryDump(std::function<bool(MemRequest & re)> filter) { + std::ofstream dumpFile("gna_memory_requests.txt", std::ios::out); + + for (auto &re : _future_heap) { + if (filter(re)) continue; + dumpFile << ": " << " region: " << rRegionToStr(re._region) << ", " + << "type: " << std::setw(17) << rTypeToStr(re._type) << " " + << "ptr_in: " << std::setw(15) << re._ptr_in << " " + << "ptr_out: " << std::setw(15) << re._ptr_out << " " + << std::setw(8) << re._num_elements << ", " + << static_cast<int>(re._element_size) << ", " + << re._padding << ", " + << std::setw(3) << re._alignment << ", " + << std::setw(8) << re._offset << ", " + << "life_time: " << re._life_limits.first << ":" << re._life_limits.second << ", " + << std::endl; + } + } +#endif + void updateSectionsSizes() { // count total size and size of read/write regions _rw_section_size = 0; _ro_section_size = 0; - for (auto &re : _future_heap) { - auto current = ALIGN(re._num_elements * re._element_size + re._padding, re._alignment); #ifdef GNA_HEAP_PROFILER - std::cout << "chunk: " << " region: " << re._region << ", " << - "type: " << (re._type == REQUEST_STORE ? "store " : re._type == REQUEST_BIND ? "bind " : "alloc ") << - std::setw(10) << re._num_elements << ", " << - static_cast<int>(re._element_size) << ", " << - re._padding << ", " << - re._offset << ", " << - re._alignment << std::endl; + memoryDump([](GNAPluginNS::memory::MemRequest & request) { + return false; + }); #endif - if (re._type == REQUEST_BIND) continue; + for (auto &re : _future_heap) { + if (re._type & REQUEST_BIND || re._ptr_out == nullptr) continue; + size_t current = ALIGN(re._num_elements * re._element_size + re._padding, re._alignment); if (re._region == REGION_RW) { _rw_section_size += current; } else { _ro_section_size += current; } } + + if (_is_compact_mode) { + _rw_section_size = getSectionSizeOptimized(REGION_RW); + } + + gnalog() << "ro_section_size: " << _ro_section_size << std::endl; + gnalog() << "rw_section_size: " << _rw_section_size << std::endl; + gnalog() << "total: " << _total << std::endl; + _rw_section_size = ALIGN(_rw_section_size, _page_alignment); _ro_section_size = ALIGN(_ro_section_size, _page_alignment); + _total = _rw_section_size + _ro_section_size; + + gnalog() << "Aligned ro_section_size: " << _ro_section_size << std::endl; + gnalog() << "Aligned rw_section_size: " << _rw_section_size << std::endl; } }; } // namespace memory diff --git a/inference-engine/tests/unit/gna/gna_memory_compact_test.cpp b/inference-engine/tests/unit/gna/gna_memory_compact_test.cpp new file mode 100644 index 00000000000..7aabe0a27ff --- /dev/null +++ b/inference-engine/tests/unit/gna/gna_memory_compact_test.cpp @@ -0,0 +1,250 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include "memory/gna_memory.hpp" + +using namespace InferenceEngine; +using namespace GNAPluginNS::memory; + +class GNAMemoryCompactTest : public ::testing::Test { + protected: + GNAMemory<std::allocator<uint8_t>> mem;
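+ // Editorial note on the fixture (inferred from the queue code above): each test
+ // tags its requests with a CNNLayer whose userValue.v_int stands in for the
+ // execution step, so the requests get _life_limits ranges; commit() in compact
+ // mode then lets requests with non-overlapping life ranges share RW offsets,
+ // which is what the getRWBytes()/getTotalBytes() assertions below verify.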
+ bool isCompact = true; + + void SetUp() override { + } +}; + +TEST_F(GNAMemoryCompactTest, canOptimizeReservePtr) { + IE_SUPPRESS_DEPRECATED_START + CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32)); + CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32)); + layer1->userValue.v_int = 1; + layer2->userValue.v_int = 2; + IE_SUPPRESS_DEPRECATED_END + + float* pFuture1 = reinterpret_cast<float*>(&pFuture1); + float* pFuture2 = reinterpret_cast<float*>(&pFuture2); + + mem.reserve_ptr(layer1, pFuture1, 3 * sizeof(float)); + mem.reserve_ptr(layer2, pFuture2, 2 * sizeof(float)); + + mem.commit(isCompact); + ASSERT_EQ(mem.getRWBytes(), 3 * sizeof(float)); + ASSERT_EQ(mem.getTotalBytes(), 3 * sizeof(float)); +} + +TEST_F(GNAMemoryCompactTest, canOptimizePushValue) { + IE_SUPPRESS_DEPRECATED_START + CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32)); + CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32)); + layer1->userValue.v_int = 1; + layer2->userValue.v_int = 2; + IE_SUPPRESS_DEPRECATED_END + + float* pFuture1 = reinterpret_cast<float*>(&pFuture1); + float* pFuture2 = reinterpret_cast<float*>(&pFuture2); + + mem.push_value(layer1, pFuture1, 1.f, 2); + mem.push_value(layer2, pFuture2, 2.f, 3); + + mem.commit(isCompact); + ASSERT_EQ(mem.getRWBytes(), 5 * sizeof(float)); + ASSERT_EQ(mem.getTotalBytes(), 5 * sizeof(float)); +} + +TEST_F(GNAMemoryCompactTest, canOptimizePushValueAndReservePtr) { + IE_SUPPRESS_DEPRECATED_START + CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32)); + CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32)); + CNNLayerPtr layer3 = std::make_shared<CNNLayer>(LayerParams("layer3", "test", Precision::FP32)); + layer1->userValue.v_int = 1; + layer2->userValue.v_int = 2; + layer3->userValue.v_int = 3; + IE_SUPPRESS_DEPRECATED_END + + float* pFuture1 = reinterpret_cast<float*>(&pFuture1); + float* pFuture2 = reinterpret_cast<float*>(&pFuture2); + float* pFuture3 = reinterpret_cast<float*>(&pFuture3); + + mem.push_value(layer1, pFuture1, 3.f, 2); + mem.bind_ptr(layer2, pFuture2, pFuture1, 0, 2); + mem.reserve_ptr(layer3, pFuture3, 2 * sizeof(float)); + + mem.commit(isCompact); + ASSERT_EQ(mem.getRWBytes(), 2 * sizeof(float)); + ASSERT_EQ(mem.getTotalBytes(), 2 * sizeof(float)); +} + +TEST_F(GNAMemoryCompactTest, canOptimizeTwoPushValueAndReservePtr) { + IE_SUPPRESS_DEPRECATED_START + CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32)); + CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32)); + CNNLayerPtr layer3 = std::make_shared<CNNLayer>(LayerParams("layer3", "test", Precision::FP32)); + CNNLayerPtr layer4 = std::make_shared<CNNLayer>(LayerParams("layer4", "test", Precision::FP32)); + layer1->userValue.v_int = 1; + layer2->userValue.v_int = 2; + layer3->userValue.v_int = 3; + layer4->userValue.v_int = 4; + IE_SUPPRESS_DEPRECATED_END + + float* pFuture1 = reinterpret_cast<float*>(&pFuture1); + float* pFuture2 = reinterpret_cast<float*>(&pFuture2); + float* pFuture3 = reinterpret_cast<float*>(&pFuture3); + + mem.push_value(layer1, pFuture1, 1.f, 2); + mem.push_value(layer2, pFuture2, 2.f, 3); + mem.reserve_ptr(layer3, pFuture3, 5 * sizeof(float)); + mem.bind_ptr(layer2, pFuture2, pFuture1, 0, 2); + + mem.commit(isCompact); + ASSERT_EQ(mem.getRWBytes(), 5 * sizeof(float)); + ASSERT_EQ(mem.getTotalBytes(), 5 * sizeof(float)); +} + + +TEST_F(GNAMemoryCompactTest, canOptimizePushPtrAndReservePtr) { +
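+ // Expected packing (a sketch of the compact-mode behaviour, not part of the
+ // original test): the push_ptr buffer is last read at step 1 (layer1) while
+ // the reserve_ptr buffer only becomes live at step 2 (layer2), so the memory
+ // solver may assign both requests the same offset and the RW section stays
+ // at a single input_size, as asserted below.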
IE_SUPPRESS_DEPRECATED_START + CNNLayerPtr layer1 = std::make_shared(LayerParams("layer1", "test", Precision::FP32)); + CNNLayerPtr layer2 = std::make_shared(LayerParams("layer2", "test", Precision::FP32)); + CNNLayerPtr layer3 = std::make_shared(LayerParams("layer3", "test", Precision::FP32)); + layer1->userValue.v_int = 1; + layer2->userValue.v_int = 2; + layer3->userValue.v_int = 3; + IE_SUPPRESS_DEPRECATED_END + + float input[] = {1, 2, 3}; + size_t input_size = sizeof(input); + + float* pFuture1 = reinterpret_cast(&pFuture1); + float* pFuture2 = reinterpret_cast(&pFuture2); + float* pFuture3 = reinterpret_cast(&pFuture3); + + mem.push_ptr(layer1, pFuture1, input, input_size); + mem.reserve_ptr(layer2, pFuture2, input_size); + mem.bind_ptr(layer3, pFuture3, pFuture2, 0, input_size); + + mem.commit(isCompact); + ASSERT_EQ(mem.getRWBytes(), input_size); + ASSERT_EQ(mem.getTotalBytes(), input_size); +} + +TEST_F(GNAMemoryCompactTest, canOptimizePushLocalPtrAndReservePtr) { + IE_SUPPRESS_DEPRECATED_START + CNNLayerPtr layer1 = std::make_shared(LayerParams("layer1", "test", Precision::FP32)); + CNNLayerPtr layer2 = std::make_shared(LayerParams("layer2", "test", Precision::FP32)); + CNNLayerPtr layer3 = std::make_shared(LayerParams("layer3", "test", Precision::FP32)); + layer1->userValue.v_int = 1; + layer2->userValue.v_int = 2; + layer3->userValue.v_int = 3; + IE_SUPPRESS_DEPRECATED_END + + float* pFuture1 = reinterpret_cast(&pFuture1); + float* pFuture2 = reinterpret_cast(&pFuture2); + float* pFuture3 = reinterpret_cast(&pFuture3); + + size_t input_size; + { + std::vector input = {1.0f, 2.0f, 3.0f, 4.0f}; + input_size = input.size() * sizeof(float); + mem.push_local_ptr(layer1, pFuture1, &*input.begin(), input_size); + } + + mem.reserve_ptr(layer2, pFuture2, input_size); + mem.bind_ptr(layer3, pFuture3, pFuture2, 0, input_size); + + mem.commit(isCompact); + ASSERT_EQ(mem.getRWBytes(), input_size); + ASSERT_EQ(mem.getTotalBytes(), input_size); +} + +TEST_F(GNAMemoryCompactTest, canOptimizePushInitilizerPtrAndReservePtr) { + IE_SUPPRESS_DEPRECATED_START + CNNLayerPtr layer1 = std::make_shared(LayerParams("layer1", "test", Precision::FP32)); + CNNLayerPtr layer2 = std::make_shared(LayerParams("layer2", "test", Precision::FP32)); + CNNLayerPtr layer3 = std::make_shared(LayerParams("layer3", "test", Precision::FP32)); + layer1->userValue.v_int = 1; + layer2->userValue.v_int = 2; + layer3->userValue.v_int = 3; + IE_SUPPRESS_DEPRECATED_END + + float* pFuture1 = reinterpret_cast(&pFuture1); + float* pFuture2 = reinterpret_cast(&pFuture2); + float* pFuture3 = reinterpret_cast(&pFuture3); + + size_t input_size; + { + std::vector input = {1.0f, 2.0f, 3.0f}; + input_size = input.size() * sizeof(float); + mem.push_initializer(layer1, pFuture1, input_size, [=](void* data, size_t size){ + ie_memcpy(data, size, &input[0], input.size()); + }); + } + + mem.reserve_ptr(layer2, pFuture2, 2 * input_size); + mem.bind_ptr(layer3, pFuture3, pFuture2, 0, input_size); + + mem.commit(isCompact); + ASSERT_EQ(mem.getRWBytes(), 2 * input_size); + ASSERT_EQ(mem.getTotalBytes(), 2 * input_size); +} + +TEST_F(GNAMemoryCompactTest, canOptimizeBindInitilizerPtrAndReservePtr) { + IE_SUPPRESS_DEPRECATED_START + CNNLayerPtr layer1 = std::make_shared(LayerParams("layer1", "test", Precision::FP32)); + CNNLayerPtr layer2 = std::make_shared(LayerParams("layer2", "test", Precision::FP32)); + CNNLayerPtr layer3 = std::make_shared(LayerParams("layer3", "test", Precision::FP32)); + CNNLayerPtr layer4 = 
std::make_shared(LayerParams("layer4", "test", Precision::FP32)); + layer1->userValue.v_int = 1; + layer2->userValue.v_int = 2; + layer3->userValue.v_int = 3; + layer4->userValue.v_int = 4; + IE_SUPPRESS_DEPRECATED_END + + float* pFuture1 = reinterpret_cast(&pFuture1); + float* pFuture2 = reinterpret_cast(&pFuture2); + float* pFuture3 = reinterpret_cast(&pFuture3); + float* pFuture4 = reinterpret_cast(&pFuture4); + + { + std::vector input = {1.0f, 2.0f, 3.0f}; + mem.bind_initializer(layer2, pFuture1, [=](void* data, size_t size){ + ie_memcpy(data, size, &input[0], input.size()); + }); + } + + mem.reserve_ptr(layer1, pFuture1, 4 * sizeof(float)); + mem.reserve_ptr(layer3, pFuture3, 2 * sizeof(float)); + mem.bind_ptr(layer4, pFuture4, pFuture3, 0, 2 * sizeof(float)); + + mem.commit(isCompact); + ASSERT_EQ(mem.getRWBytes(), 4 * sizeof(float)); + ASSERT_EQ(mem.getTotalBytes(), 4 * sizeof(float)); +} + +TEST_F(GNAMemoryCompactTest, canOptimizeReservePtrWithOffset) { + IE_SUPPRESS_DEPRECATED_START + CNNLayerPtr layer1 = std::make_shared(LayerParams("layer1", "test", Precision::FP32)); + CNNLayerPtr layer2 = std::make_shared(LayerParams("layer2", "test", Precision::FP32)); + CNNLayerPtr layer3 = std::make_shared(LayerParams("layer3", "test", Precision::FP32)); + layer1->userValue.v_int = 1; + layer2->userValue.v_int = 2; + layer3->userValue.v_int = 3; + IE_SUPPRESS_DEPRECATED_END + + float* pFuture1 = reinterpret_cast(&pFuture1); + float* pFuture2 = reinterpret_cast(&pFuture2); + float* pFuture3 = reinterpret_cast(&pFuture3); + + mem.reserve_ptr(layer1, pFuture1, 2 * sizeof(float)); + mem.reserve_ptr(layer2, pFuture2, 2 * sizeof(float)); + mem.bind_ptr(layer3, pFuture3, pFuture2, 2 * sizeof(float), 2 * sizeof(float)); + + mem.commit(isCompact); + ASSERT_EQ(mem.getRWBytes(), 4 * sizeof(float)); + ASSERT_EQ(mem.getTotalBytes(), 4 * sizeof(float)); +} \ No newline at end of file diff --git a/inference-engine/tests_deprecated/unit/engines/gna/gna_memory_test.cpp b/inference-engine/tests/unit/gna/gna_memory_test.cpp similarity index 67% rename from inference-engine/tests_deprecated/unit/engines/gna/gna_memory_test.cpp rename to inference-engine/tests/unit/gna/gna_memory_test.cpp index 6dfa38fc27d..d400a2f2a26 100644 --- a/inference-engine/tests_deprecated/unit/engines/gna/gna_memory_test.cpp +++ b/inference-engine/tests/unit/gna/gna_memory_test.cpp @@ -9,7 +9,6 @@ using namespace GNAPluginNS::memory; class GNAMemoryTest : public ::testing::Test { - protected: GNAMemory> mem; @@ -17,12 +16,12 @@ class GNAMemoryTest : public ::testing::Test { } }; -TEST_F(GNAMemoryTest, canStoreActualBlob){ - float input [] = {1,2,3}; +TEST_F(GNAMemoryTest, canStoreActualBlob) { + float input[] = {1, 2, 3}; float* pFuture = nullptr; size_t len = sizeof(input); - mem.push_ptr(&pFuture, input, len); + mem.push_ptr(nullptr, &pFuture, input, len); mem.commit(); ASSERT_NE(pFuture, nullptr); @@ -33,12 +32,12 @@ TEST_F(GNAMemoryTest, canStoreActualBlob){ } TEST_F(GNAMemoryTest, canStore2Blobs) { - float input [] = {1,2,3,4}; + float input[] = {1, 2, 3, 4}; float* pFuture = nullptr; float* pFuture2 = nullptr; - mem.push_ptr(&pFuture, input, 3*4); - mem.push_ptr(&pFuture2, input+1, 3*4); + mem.push_ptr(nullptr, &pFuture, input, 3*4); + mem.push_ptr(nullptr, &pFuture2, input+1, 3*4); mem.commit(); ASSERT_NE(pFuture, input); @@ -54,10 +53,10 @@ TEST_F(GNAMemoryTest, canStore2Blobs) { } TEST_F(GNAMemoryTest, canStoreBlobsALIGNED) { - float input [] = {1,2,3,4,5,6,7,8}; + float input[] = {1, 2, 3, 4, 5, 6, 7, 8}; float* pFuture = 
diff --git a/inference-engine/tests_deprecated/unit/engines/gna/gna_memory_test.cpp b/inference-engine/tests/unit/gna/gna_memory_test.cpp
similarity index 67%
rename from inference-engine/tests_deprecated/unit/engines/gna/gna_memory_test.cpp
rename to inference-engine/tests/unit/gna/gna_memory_test.cpp
index 6dfa38fc27d..d400a2f2a26 100644
--- a/inference-engine/tests_deprecated/unit/engines/gna/gna_memory_test.cpp
+++ b/inference-engine/tests/unit/gna/gna_memory_test.cpp
@@ -9,7 +9,6 @@
 using namespace GNAPluginNS::memory;
 
 class GNAMemoryTest : public ::testing::Test {
-
 protected:
     GNAMemory<std::allocator<uint8_t>> mem;
 
@@ -17,12 +16,12 @@ class GNAMemoryTest : public ::testing::Test {
     }
 };
 
-TEST_F(GNAMemoryTest, canStoreActualBlob){
-    float input [] = {1,2,3};
+TEST_F(GNAMemoryTest, canStoreActualBlob) {
+    float input[] = {1, 2, 3};
     float* pFuture = nullptr;
     size_t len = sizeof(input);
 
-    mem.push_ptr(&pFuture, input, len);
+    mem.push_ptr(nullptr, &pFuture, input, len);
     mem.commit();
 
     ASSERT_NE(pFuture, nullptr);
@@ -33,12 +32,12 @@
 }
 
 TEST_F(GNAMemoryTest, canStore2Blobs) {
-    float input [] = {1,2,3,4};
+    float input[] = {1, 2, 3, 4};
     float* pFuture = nullptr;
     float* pFuture2 = nullptr;
 
-    mem.push_ptr(&pFuture, input, 3*4);
-    mem.push_ptr(&pFuture2, input+1, 3*4);
+    mem.push_ptr(nullptr, &pFuture, input, 3*4);
+    mem.push_ptr(nullptr, &pFuture2, input+1, 3*4);
     mem.commit();
 
     ASSERT_NE(pFuture, input);
@@ -54,10 +53,10 @@
 }
 
 TEST_F(GNAMemoryTest, canStoreBlobsALIGNED) {
-    float input [] = {1,2,3,4,5,6,7,8};
+    float input[] = {1, 2, 3, 4, 5, 6, 7, 8};
     float* pFuture = nullptr;
 
-    mem.push_ptr(&pFuture, input, 3*4, 8);
+    mem.push_ptr(nullptr, &pFuture, input, 3*4, 8);
     mem.commit();
 
     ASSERT_EQ(16 , mem.getTotalBytes());
@@ -73,12 +72,12 @@
 }
 
 TEST_F(GNAMemoryTest, canStore2BlobsALIGNED) {
-    float input [] = {1,2,3,4,5,6,7,8};
+    float input[] = {1, 2, 3, 4, 5, 6, 7, 8};
     float* pFuture = nullptr;
     float* pFuture2 = nullptr;
 
-    mem.push_ptr(&pFuture, input, 3*4, 8);
-    mem.push_ptr(&pFuture2, input, 3*4, 16);
+    mem.push_ptr(nullptr, &pFuture, input, 3*4, 8);
+    mem.push_ptr(nullptr, &pFuture2, input, 3*4, 16);
     mem.commit();
 
     ASSERT_EQ(32 , mem.getTotalBytes());
@@ -92,33 +91,30 @@
     ASSERT_EQ(pFuture[4], 1);
     ASSERT_EQ(pFuture[5], 2);
     ASSERT_EQ(pFuture[6], 3);
-
 }
 
 TEST_F(GNAMemoryTest, canReserveData) {
-
     float* pFuture = nullptr;
-    mem.reserve_ptr(&pFuture, 3*4);
+    mem.reserve_ptr(nullptr, &pFuture, 3*4);
     mem.commit();
 
     ASSERT_NE(pFuture, nullptr);
 }
 
 TEST_F(GNAMemoryTest, canReserveDataByVoid) {
-    mem.reserve_ptr(nullptr, 3*4);
+    mem.reserve_ptr(nullptr, nullptr, 3*4);
     ASSERT_NO_THROW(mem.commit());
 }
 
 TEST_F(GNAMemoryTest, canReserveAndPushData) {
-
     float input[] = {1, 2, 3};
     float *pFuture = nullptr;
     float* pFuture2 = nullptr;
-    size_t len = sizeof(input) ;
+    size_t len = sizeof(input);
 
-    mem.push_ptr(&pFuture, input, len);
-    mem.reserve_ptr(&pFuture2, 3*4);
+    mem.push_ptr(nullptr, &pFuture, input, len);
+    mem.reserve_ptr(nullptr, &pFuture2, 3*4);
     mem.commit();
 
     ASSERT_NE(pFuture, nullptr);
@@ -136,16 +132,15 @@
 }
 
 TEST_F(GNAMemoryTest, canBindAndResolve) {
-
     float input[] = {1, 2, 3};
     float *pFuture = nullptr;
     float *pFuture2 = nullptr;
     float *pFuture3 = nullptr;
     size_t len = sizeof(input);
 
-    mem.bind_ptr(&pFuture3, &pFuture);
-    mem.push_ptr(&pFuture, input, len);
-    mem.bind_ptr(&pFuture2, &pFuture);
+    mem.bind_ptr(nullptr, &pFuture3, &pFuture);
+    mem.push_ptr(nullptr, &pFuture, input, len);
+    mem.bind_ptr(nullptr, &pFuture2, &pFuture);
 
     mem.commit();
 
@@ -160,16 +155,15 @@
 }
 
 TEST_F(GNAMemoryTest, canBindTransitevlyAndResolve) {
-
     float input[] = {1, 2, 3};
     float *pFuture = nullptr;
     float *pFuture3 = nullptr;
     float *pFuture4 = nullptr;
     size_t len = sizeof(input);
 
-    mem.bind_ptr(&pFuture4, &pFuture3);
-    mem.bind_ptr(&pFuture3, &pFuture);
-    mem.push_ptr(&pFuture, input, len);
+    mem.bind_ptr(nullptr, &pFuture4, &pFuture3);
+    mem.bind_ptr(nullptr, &pFuture3, &pFuture);
+    mem.push_ptr(nullptr, &pFuture, input, len);
 
     mem.commit();
 
@@ -185,16 +179,15 @@
 }
 
 TEST_F(GNAMemoryTest, canBindTransitevlyWithOffsetsAndResolve) {
-
     float input[] = {1, 2, 3};
     float *pFuture = nullptr;
     float *pFuture3 = nullptr;
     float *pFuture4 = nullptr;
     size_t len = sizeof(input);
 
-    mem.bind_ptr(&pFuture4, &pFuture3, 4);
-    mem.bind_ptr(&pFuture3, &pFuture, 4);
-    mem.push_ptr(&pFuture, input, len);
+    mem.bind_ptr(nullptr, &pFuture4, &pFuture3, 4);
+    mem.bind_ptr(nullptr, &pFuture3, &pFuture, 4);
+    mem.push_ptr(nullptr, &pFuture, input, len);
 
     mem.commit();
 
@@ -210,16 +203,15 @@
 }
 
 TEST_F(GNAMemoryTest, canBindWithOffsetAndResolve) {
-
     float input[] = {1, 2, 3};
     float *pFuture = nullptr;
     float *pFuture2 = nullptr;
     float *pFuture3 = nullptr;
     size_t len = sizeof(input);
 
-    mem.bind_ptr(&pFuture3, &pFuture, 4);
-    mem.push_ptr(&pFuture, input, len);
-    mem.bind_ptr(&pFuture2, &pFuture);
+    mem.bind_ptr(nullptr, &pFuture3, &pFuture, 4);
+    mem.push_ptr(nullptr, &pFuture, input, len);
+    mem.bind_ptr(nullptr, &pFuture2, &pFuture);
 
     mem.commit();
 
@@ -237,12 +229,11 @@
 }
 
 TEST_F(GNAMemoryTest, canPushLocal) {
-
-    float* pFuture = (float*)&pFuture;
+    float* pFuture = reinterpret_cast<float*>(&pFuture);
 
     {
         std::vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f};
-        mem.push_local_ptr(pFuture, &*input.begin(), 4 * 4, 1);
+        mem.push_local_ptr(nullptr, pFuture, &*input.begin(), 4 * 4, 1);
     }
 
     //poison stack
     mem.commit();
@@ -255,13 +246,12 @@
 }
 
 TEST_F(GNAMemoryTest, canPushValue) {
-
-    float* pFuture = (float*)&pFuture;
-    float* pFuture2 = (float*)&pFuture2;
+    float* pFuture = reinterpret_cast<float*>(&pFuture);
+    float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
 
     {
-        mem.push_value(pFuture, 3.f, 2);
-        mem.push_value(pFuture2, 13.f, 2);
+        mem.push_value(nullptr, pFuture, 3.f, 2);
+        mem.push_value(nullptr, pFuture2, 13.f, 2);
     }
 
     mem.commit();
@@ -273,13 +263,12 @@
 }
 
 TEST_F(GNAMemoryTest, canPushReadOnlyValue) {
-
-    float* pFuture = (float*)&pFuture;
-    float* pFuture2 = (float*)&pFuture2;
+    float* pFuture = reinterpret_cast<float*>(&pFuture);
+    float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
 
     {
-        mem.push_value(pFuture, 3.f, 2);
-        mem.readonly().push_value(pFuture2, 13.f, 2);
+        mem.push_value(nullptr, pFuture, 3.f, 2);
+        mem.readonly().push_value(nullptr, pFuture2, 13.f, 2);
     }
 
     mem.commit();
@@ -290,10 +279,37 @@
     ASSERT_FLOAT_EQ(pFuture[3], 13);
 }
 
-TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSize) {
+TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSizeEmptyReqs) {
+    mem.push_value(nullptr, nullptr, 3.f, 2);
+    mem.readonly().push_value(nullptr, nullptr, 13.f, 2);
+    mem.commit();
 
-    mem.push_value(nullptr, 3.f, 2);
-    mem.readonly().push_value(nullptr, 13.f, 2);
+    ASSERT_EQ(mem.getTotalBytes(), 0);
+    ASSERT_EQ(mem.getRWBytes(), 0);
+}
+
+TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSizeWithEmptyReqs) {
+    // empty request before
+    mem.push_value(nullptr, nullptr, 3.f, 2);
+    // not empty requests
+    float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
+    float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
+    mem.push_value(nullptr, pFuture1, 3.f, 2);
+    mem.readonly().push_value(nullptr, pFuture2, 13.f, 2);
+    // empty request after
+    mem.readonly().push_value(nullptr, nullptr, 13.f, 2);
+
+    mem.commit();
+
+    ASSERT_EQ(mem.getTotalBytes(), 4 * sizeof(float));
+    ASSERT_EQ(mem.getRWBytes(), 2 * sizeof(float));
+}
+
+TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSize) {
+    float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
+    float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
+    mem.push_value(nullptr, pFuture1, 3.f, 2);
+    mem.readonly().push_value(nullptr, pFuture2, 13.f, 2);
     mem.commit();
 
     ASSERT_EQ(mem.getTotalBytes(), 4 * sizeof(float));
@@ -301,11 +317,12 @@
 }
 
 TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSizeWithAlignment) {
-
     GNAMemory<std::allocator<uint8_t>> memAligned(64);
+    float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
+    float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
 
-    memAligned.push_value(nullptr, 3.f, 2);
-    memAligned.readonly().push_value(nullptr, 13.f, 2);
+    memAligned.push_value(nullptr, pFuture1, 3.f, 2);
+    memAligned.readonly().push_value(nullptr, pFuture2, 13.f, 2);
     memAligned.commit();
 
     ASSERT_EQ(memAligned.getTotalBytes(), 128);
@@ -313,15 +330,13 @@
 }
 
 TEST_F(GNAMemoryTest, canSetUpReadWriteSectionPtr) {
+    float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
+    float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
+    float* pFuture3 = reinterpret_cast<float*>(&pFuture3);
-
-    float* pFuture2 = (float*)&pFuture2;
-    float* pFuture1 = (float*)&pFuture1;
-    float* pFuture3 = (float*)&pFuture3;
-
-
-    mem.readonly().push_value(pFuture1, 3.f, 2);
-    mem.push_value(pFuture2, 13.f, 3);
-    mem.readonly().push_value(pFuture3, 32.f, 4);
+    mem.readonly().push_value(nullptr, pFuture1, 3.f, 2);
+    mem.push_value(nullptr, pFuture2, 13.f, 3);
+    mem.readonly().push_value(nullptr, pFuture3, 32.f, 4);
     mem.commit();
 
     ASSERT_EQ(mem.getTotalBytes(), (2+3+4) * sizeof(float));
@@ -346,16 +361,15 @@ TEST_F(GNAMemoryTest, canUpdateSizeOfPushRequestWithBindRequest) {
     float input[] = {1, 2, 3};
-
     float *pFuture = nullptr;
     float *pFuture2 = nullptr;
     float *pFuture3 = nullptr;
 
     size_t len = sizeof(input);
 
-    mem.push_ptr(&pFuture, input, len);
-    mem.bind_ptr(&pFuture2, &pFuture, len, len);
-    mem.bind_ptr(&pFuture3, &pFuture2, 2 * len, len);
+    mem.push_ptr(nullptr, &pFuture, input, len);
+    mem.bind_ptr(nullptr, &pFuture2, &pFuture, len, len);
+    mem.bind_ptr(nullptr, &pFuture3, &pFuture2, 2 * len, len);
 
     mem.commit();
 
@@ -385,9 +399,9 @@ TEST_F(GNAMemoryTest, canUpdateSizeOfPushRequestWithBindRequestWhenPush) {
 
     size_t len = sizeof(input);
 
-    mem.push_ptr(&pFuture, input, len);
-    mem.bind_ptr(&pFuture2, &pFuture, len, len);
-    mem.push_ptr(&pFutureInput2, input2, len);
+    mem.push_ptr(nullptr, &pFuture, input, len);
+    mem.bind_ptr(nullptr, &pFuture2, &pFuture, len, len);
+    mem.push_ptr(nullptr, &pFutureInput2, input2, len);
 
     mem.commit();
 
@@ -416,9 +430,9 @@ TEST_F(GNAMemoryTest, canUpdateSizeOfPushRequestWithBindRequestWhenAlloc) {
 
     size_t len = sizeof(input);
 
-    mem.reserve_ptr(&pFuture, len);
-    mem.bind_ptr(&pFuture2, &pFuture, len, len);
-    mem.push_ptr(&pFutureInput, input, len);
+    mem.reserve_ptr(nullptr, &pFuture, len);
+    mem.bind_ptr(nullptr, &pFuture2, &pFuture, len, len);
+    mem.push_ptr(nullptr, &pFutureInput, input, len);
 
     mem.commit();
 
diff --git a/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp b/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp
index 7fa12a42825..d2d2112c5ad 100644
--- a/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp
+++ b/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp
@@ -19,7 +19,7 @@ const std::map<std::string, std::string> supportedConfigKeysWithDefaults = {
     {GNA_CONFIG_KEY(EXEC_TARGET), ""},
     {GNA_CONFIG_KEY(COMPILE_TARGET), ""},
     {GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW_EXACT},
-    {GNA_CONFIG_KEY(COMPACT_MODE), CONFIG_VALUE(NO)},
+    {GNA_CONFIG_KEY(COMPACT_MODE), CONFIG_VALUE(YES)},
    {CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(NO)},
     {GNA_CONFIG_KEY(PRECISION), Precision(Precision::I16).name()},
     {GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), CONFIG_VALUE(NO)},
diff --git a/src/inference/dev_api/memory_solver.hpp b/src/inference/dev_api/memory_solver.hpp
index a1261c0497a..b9b81698c00 100644
--- a/src/inference/dev_api/memory_solver.hpp
+++ b/src/inference/dev_api/memory_solver.hpp
@@ -8,6 +8,7 @@
  */
 
 #pragma once
 
+#include <cstdint>
 #include <map>
 #include <vector>
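[Editor's note, not part of the patch: the memory_solver.hpp touched above is the dev-API helper that compact mode can lean on to pack buffers. A rough sketch of that packing step follows; it assumes the MemorySolver interface declared in that header (a Box aggregate with start, finish, size, and id fields, plus solve() and getOffset()), and the box values are purely illustrative.]

    // Sketch: two 12-byte requests whose live ranges do not overlap can
    // share one 12-byte region instead of occupying 24 bytes.
    #include <iostream>
    #include <vector>
    #include "memory_solver.hpp"

    int main() {
        using InferenceEngine::MemorySolver;
        std::vector<MemorySolver::Box> boxes = {
            {1, 2, 12, 0},  // e.g. a push_ptr buffer written at step 1, last read at step 2
            {3, 3, 12, 1},  // e.g. a reserve_ptr buffer first used at step 3
        };
        MemorySolver solver(boxes);
        std::cout << "heap size: " << solver.solve() << " bytes\n";       // 12, not 24
        std::cout << "offset of box 1: " << solver.getOffset(1) << "\n";  // reuses box 0's bytes
        return 0;
    }

This mirrors what the compact-mode unit tests assert indirectly: getTotalBytes() after commit(isCompact) equals the packed size rather than the sum of all requests.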