GNA Input/Output buffer reuse (#7332)

* Init implementation

# Conflicts:
#	thirdparty/ade

* Switched to shared class

* Refactoring memory commit()

* Added unit tests

* Fixed output order

* Fixed input order

* Fixed split case

* Fixed compilation issue in debug mode

* Enabled compact mode by default

* Fixed default order for inputs and outputs

* Changed unit test

* Enabled compact mode by default

* Reverted compact_mode flag order
Mikhail Ryzhov 2021-11-30 10:36:54 +03:00 committed by GitHub
parent caa7d853b3
commit cccec6942e
11 changed files with 741 additions and 271 deletions

View File

@ -10,7 +10,7 @@ namespace GNAPluginNS {
struct GNAFlags {
uint8_t gna_lib_async_threads_num = 1;
bool compact_mode = false;
bool compact_mode = true;
bool exclusive_async_requests = false;
bool uniformPwlDesign = false;
float pwlMaxErrorPercent = 1.0f;
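
Context for the change above: compact mode becomes the default allocation strategy, so intermediate buffers are reused whenever their lifetimes allow it. A minimal opt-out sketch from the application side, assuming the 2021.x Inference Engine API and the GNA_COMPACT_MODE config key (YES/NO) that this flag is normally populated from:

    InferenceEngine::Core core;
    auto network = core.ReadNetwork("model.xml");  // hypothetical model path
    // revert to dedicated per-layer buffers if the old layout is required
    auto execNet = core.LoadNetwork(network, "GNA", {{"GNA_COMPACT_MODE", "NO"}});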

View File

@ -208,7 +208,7 @@ void GNAGraphCompiler::ConstPrimitive(InferenceEngine::CNNLayerPtr constLayer)
connectOutput(constLayer, ptr_for_const_blob, const_blob->byteSize());
// TODO: segment type for bind; bind initializer not used - needs refactoring to separate bind and allocation requests
// we don't see a practical use case where the bind storage type needs to differ from the allocation type
gnamem->readonly().bind_initializer(ptr_for_const_blob, [const_blob](void* data, size_t size) {
gnamem->bind_initializer(nullptr, ptr_for_const_blob, [const_blob](void* data, size_t size) {
ie_memcpy(data, size, const_blob->buffer(), const_blob->byteSize());
});
}
@ -475,7 +475,7 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP
}
if (num_conv_kernel_padding == 0) {
gnamem->readonly().push_local_ptr(ptr_weights,
gnamem->readonly().push_local_ptr(layer, ptr_weights,
transposedWeights.data(),
convolution._weights->byteSize(),
64);
@ -502,19 +502,19 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP
offset += padding_zeros.size();
}
};
gnamem->readonly().push_initializer(ptr_weights,
gnamem->readonly().push_initializer(layer, ptr_weights,
paddedWeightsSize,
initializer,
64);
}
if (convolution._biases) {
gnamem->readonly().push_ptr(ptr_biases,
gnamem->readonly().push_ptr(layer, ptr_biases,
convolution._biases->cbuffer().as<const void*>(),
convolution._biases->byteSize(),
64);
} else {
gnamem->readonly().push_value(ptr_biases, 0.0f, out_channels, 64);
gnamem->readonly().push_value(layer, ptr_biases, 0.0f, out_channels, 64);
}
}
@ -600,7 +600,6 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP
ptr_outputs,
ptr_weights,
ptr_biases);
currentComponent.num_bytes_per_input = inputs->getPrecision().size();
currentComponent.num_bytes_per_output = outputs->getPrecision().size();
@ -647,18 +646,18 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP
transposedWeights.resize(transposedWeights.size() + kernelPad);
}
gnamem->readonly().push_local_ptr(ptr_weights,
gnamem->readonly().push_local_ptr(layer, ptr_weights,
transposedWeights.data(),
transposedWeights.size(),
64);
if (convolution._biases) {
gnamem->readonly().push_ptr(ptr_biases,
gnamem->readonly().push_ptr(layer, ptr_biases,
convolution._biases->cbuffer().as<const void*>(),
convolution._biases->byteSize(),
64);
} else {
gnamem->readonly().push_value(ptr_biases, 0.0f, out_channels, 64);
gnamem->readonly().push_value(layer, ptr_biases, 0.0f, out_channels, 64);
}
}
#endif
@ -712,14 +711,13 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
ptr_weights,
ptr_biases,
true);
connectOutput(layer, ptr_outputs, num_data_bytes_out);
connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
if (gnaFlags->sw_fp32) {
IE_ASSERT(quantized == nullptr);
gnamem->readonly().push_value(ptr_weights, power.scale, num_rows_out, 64);
gnamem->readonly().push_value(ptr_biases, power.offset, num_rows_out, 64);
gnamem->readonly().push_value(layer, ptr_weights, power.scale, num_rows_out, 64);
gnamem->readonly().push_value(layer, ptr_biases, power.offset, num_rows_out, 64);
} else {
IE_ASSERT(quantized != nullptr);
if (!gnaFlags->input_low_precision) {
@ -727,15 +725,15 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
static_cast<float>(INT16_MAX)));
auto quantizedOffset = FLOAT_TO_INT32(std::min(quantized->_dst_quant.GetScale() * power.offset,
static_cast<float>(INT32_MAX)));
gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedScale, num_rows_out, 64);
gnamem->readonly().push_value<int32_t>(ptr_biases, quantizedOffset, num_rows_out, 64);
gnamem->readonly().push_value<int16_t>(layer, ptr_weights, quantizedScale, num_rows_out, 64);
gnamem->readonly().push_value<int32_t>(layer, ptr_biases, quantizedOffset, num_rows_out, 64);
} else {
auto quantizedScale = FLOAT_TO_INT8(std::min(quantized->_weights_quant.GetScale() * power.scale,
static_cast<float>(INT8_MAX)));
auto quantizedOffset = FLOAT_TO_INT8(std::min(quantized->_dst_quant.GetScale() * power.offset,
static_cast<float>(INT8_MAX)));
gnamem->readonly().push_value<int8_t>(ptr_weights, quantizedScale, num_rows_out, 64);
gnamem->readonly().push_value<int8_t>(ptr_biases, quantizedOffset, num_rows_out, 64);
gnamem->readonly().push_value<int8_t>(layer, ptr_weights, quantizedScale, num_rows_out, 64);
gnamem->readonly().push_value<int8_t>(layer, ptr_biases, quantizedOffset, num_rows_out, 64);
}
}
} else {
@ -799,12 +797,11 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
ptr_pwl_input,
ptr_pwl_outputs,
ptr_pwl_segments_target);
connectOutput(layer, ptr_pwl_outputs, num_data_bytes_out);
connectInput(layer, ptr_pwl_input, num_data_bytes_in, 0, 0);
if (ptr_pwl_segments_target != nullptr) {
gnamem->readonly().push_local_ptr(ptr_pwl_segments_target,
gnamem->readonly().push_local_ptr(layer, ptr_pwl_segments_target,
&ptr_pwl_segments.front(),
ptr_pwl_segments.size() * sizeof(gna_pwl_segment_t),
64);
@ -876,7 +873,6 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
getScaleFactor(layer, QuantizedDataType::output),
ptr_inputs,
ptr_outputs);
size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims()))
* outputs->getPrecision().size();
@ -921,7 +917,6 @@ void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
num_columns_out,
ptr_inputs,
ptr_outputs);
size_t num_data_bytes_out = ALIGN(InferenceEngine::details::product(
begin(outputs->getDims()), end(outputs->getDims())), 8)
* outputs->getPrecision().size();
@ -933,7 +928,6 @@ void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
void GNAGraphCompiler::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) {
auto concatLayer = dynamic_cast<InferenceEngine::ConcatLayer *> (layer.get());
if (concatLayer == nullptr) {
return;
}
@ -996,13 +990,10 @@ void GNAGraphCompiler::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) {
auto layerInfo = LayerInfo(concatParent);
// auto layerInfo = LayerInfo(getCreatorLayer(concatLayerInput->insData[it].lock()).lock());
if (layerInfo.isInput()) {
connectInput(layer, &concatLayerInfo.gna_ptr,
inputLayer.tensorSize, inputLayer.offset, idx, false);
connectInput(layer, &concatLayerInfo.gna_ptr, inputLayer.tensorSize, inputLayer.offset, idx, false);
concatLayerInfo.input_allocated = true;
} else if (layerInfo.isMemory()) {
connectInput(layer, &concatLayerInfo.gna_ptr, concatLayerInfo.reserved_size, inputLayer.offset, idx, false);
concatLayerInfo.input_allocated = true;
}
++idx;
@ -1114,7 +1105,6 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
ptr_weights,
ptr_biases,
false);
size_t num_data_bytes_out =
InferenceEngine::details::product(
begin(outputs->getDims()), end(outputs->getDims())) * 4;
@ -1128,8 +1118,8 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
FillWeightOfAligningFilter(layer, ptr_weights, offset.front(), (quantized == nullptr) ? false : true);
(quantized == nullptr) ?
gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64) :
gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out, 64) :
gnamem->readonly().push_value<int32_t>(layer, ptr_biases, 0, num_rows_out, 64);
}
}
@ -1249,7 +1239,6 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
ptr_weights,
ptr_biases,
true);
size_t num_data_bytes_out =
InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims())) * outputs->getPrecision().size();
@ -1262,36 +1251,36 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
switch (eltwise._operation) {
case EltwiseLayer::Sub:
if (quantized == nullptr) {
gnamem->readonly().push_value(ptr_weights, -1.0f, num_rows_out, 64);
gnamem->readonly().push_value(layer, ptr_weights, -1.0f, num_rows_out, 64);
} else {
auto scaledIdentity = -quantized->_weights_quant.GetScale();
if (gnaFlags->input_low_precision == false) {
auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
gnamem->readonly().push_value<int16_t>(layer, ptr_weights, quantizedIdentity, num_rows_out, 64);
} else {
auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast<float>(INT8_MAX)));
gnamem->readonly().push_value<int8_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
gnamem->readonly().push_value<int8_t>(layer, ptr_weights, quantizedIdentity, num_rows_out, 64);
}
}
connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx);
break;
case EltwiseLayer::Sum:
if (quantized == nullptr) {
gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64);
gnamem->readonly().push_value(layer, ptr_weights, 1.0f, num_rows_out, 64);
} else {
auto scaledIdentity = quantized->_weights_quant.GetScale();
if (gnaFlags->input_low_precision == false) {
auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
gnamem->readonly().push_value<int16_t>(layer, ptr_weights, quantizedIdentity, num_rows_out, 64);
} else {
auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast<float>(INT8_MAX)));
gnamem->readonly().push_value<int8_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
gnamem->readonly().push_value<int8_t>(layer, ptr_weights, quantizedIdentity, num_rows_out, 64);
}
}
connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx);
@ -1299,12 +1288,12 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
case EltwiseLayer::Prod:
if (quantized == nullptr) {
gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out, 64);
} else {
if (gnaFlags->input_low_precision == false) {
gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
gnamem->readonly().push_value<int32_t>(layer, ptr_biases, 0, num_rows_out, 64);
} else {
gnamem->readonly().push_value<int8_t>(ptr_biases, 0, num_rows_out, 64);
gnamem->readonly().push_value<int8_t>(layer, ptr_biases, 0, num_rows_out, 64);
}
}
connectInput(layer, ptr_weights, num_data_bytes_in, 0, biasesLayerIdx);
@ -1372,9 +1361,9 @@ void GNAGraphCompiler::GemmPrimitive(InferenceEngine::CNNLayerPtr layer) {
connectInput(layer, ptr_input_2, num_data_bytes_in_2, 0, 1);
if (gnaFlags->sw_fp32) {
IE_ASSERT(quantized == nullptr);
gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out, 64);
} else {
gnamem->readonly().push_value<int32_t>(ptr_biases, 0.0f, num_rows_out, 64);
gnamem->readonly().push_value<int32_t>(layer, ptr_biases, 0.0f, num_rows_out, 64);
}
}
@ -1485,12 +1474,12 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
if (num_padding == 0) {
if (!transpose) {
gnamem->readonly().push_ptr(ptr_weights,
gnamem->readonly().push_ptr(layer, ptr_weights,
weightable._weights->cbuffer().as<const void*>(),
weightable._weights->byteSize(),
64);
} else {
gnamem->readonly().push_initializer(ptr_weights, weightable._weights->byteSize(), [=](void* data, size_t size) {
gnamem->readonly().push_initializer(layer, ptr_weights, weightable._weights->byteSize(), [=](void* data, size_t size) {
for (uint32_t k = 0; k < (isDiag ? 1 : num_rows_out); k++) {
auto rowOffset = k * transposedRows * transposedCols * weightable.precision.size();
auto cbuffer = weightable._weights->cbuffer().as<const uint8_t*>() + rowOffset;
@ -1519,7 +1508,7 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
auto paddedWeights = isDiag ? elementsIn : elementsIn * num_rows_out;
auto paddedWeightsSize = paddedWeights * weightable.precision.size();
gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void* data, size_t size) {
gnamem->readonly().push_initializer(layer, ptr_weights, paddedWeightsSize, [=](void* data, size_t size) {
for (uint32_t i = 0; i < (isDiag ? 1 : num_rows_out); i++) {
ie_memcpy(data, size,
weightable._weights->cbuffer().as<const uint8_t*>() + num_rows_in * i * weightable.precision.size(),
@ -1530,16 +1519,16 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
}
if (weightable._biases) {
gnamem->readonly().push_ptr(ptr_biases,
gnamem->readonly().push_ptr(layer, ptr_biases,
weightable._biases->cbuffer().as<const void*>(),
weightable._biases->byteSize(),
64);
} else {
// in that case the input from the previous layer goes into biases, so we have to zero-initialize the input pointer
if (useBiasConnection) {
gnamem->readonly().push_value(ptr_inputs, 0.0f, num_rows_in + num_padding, 64);
gnamem->readonly().push_value(layer, ptr_inputs, 0.0f, num_rows_in + num_padding, 64);
} else {
gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out + num_padding_out, 64);
gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out + num_padding_out, 64);
}
}
}
@ -1557,7 +1546,7 @@ void GNAGraphCompiler::FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr l
THROW_GNA_EXCEPTION << "Weights memory is not allocated!!!";
}
gnamem->readonly().push_initializer(ptrWeights, num_rows_out * ALIGN(num_rows_in, 8) * layer->precision.size(), [=](void* data, size_t size) {
gnamem->readonly().push_initializer(layer, ptrWeights, num_rows_out * ALIGN(num_rows_in, 8) * layer->precision.size(), [=](void* data, size_t size) {
int out = 0;
for (int input = offset; input < num_rows_out + offset; ++input) {
auto mem_ptr = reinterpret_cast<uint8_t*>(data) + input * layer->precision.size() + out * ALIGN(num_rows_in, 8) * layer->precision.size();
@ -1624,7 +1613,6 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
ptr_inputs,
ptr_outputs);
size_t num_data_bytes_in = num_rows_copied * num_rows_copied * num_columns_in
* inputs->getPrecision().size();
// need to reserve the full tensor, so using the original size, assuming an identity activation is attached to the filter later on
@ -1681,7 +1669,7 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
size_t weights_stride = (num_rows_in + num_rows_copied) * weightsElementSize;
size_t weights_offset = weights_stride * num_rows_copied + num_rows_copied * weightsElementSize;
gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void* data, size_t size) {
gnamem->readonly().push_initializer(layer, ptr_weights, paddedWeightsSize, [=](void* data, size_t size) {
size_t roffset = weights_offset;
size_t woffset = 0;
for (int i = 0; i < num_rows_out && size >= woffset; i++) {
@ -1696,12 +1684,12 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
}
if (filterLayer->_biases) {
gnamem->readonly().push_ptr(ptr_biases,
gnamem->readonly().push_ptr(layer, ptr_biases,
filterLayer->_biases->cbuffer().as<const void*>(),
filterLayer->_biases->byteSize(),
64);
} else {
gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
gnamem->readonly().push_value(layer, ptr_biases, 0.0f, num_rows_out, 64);
}
}
@ -1774,18 +1762,18 @@ void GNAGraphCompiler::ConvolutionFilterPrimitive(InferenceEngine::CNNLayerPtr l
connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
connectOutput(layer, ptr_outputs, num_data_bytes_out);
gnamem->readonly().push_ptr(ptr_weights,
gnamem->readonly().push_ptr(layer, ptr_weights,
filterLayer->_weights->cbuffer().as<const void*>(),
filterLayer->_weights->byteSize(),
64);
if (filterLayer->_biases) {
gnamem->readonly().push_ptr(ptr_biases,
gnamem->readonly().push_ptr(layer, ptr_biases,
filterLayer->_biases->cbuffer().as<const void*>(),
filterLayer->_biases->byteSize(),
64);
} else {
gnamem->readonly().push_value(ptr_biases, 0.0f, numberOfFilters, 64);
gnamem->readonly().push_value(layer, ptr_biases, 0.0f, numberOfFilters, 64);
}
}
@ -2016,7 +2004,7 @@ case name:\
connectOutput(layer, ptr_outputs, num_data_bytes_out);
if (ptr_pwl_segments_target != nullptr) {
gnamem->readonly().push_local_ptr(ptr_pwl_segments_target,
gnamem->readonly().push_local_ptr(layer, ptr_pwl_segments_target,
&ptr_pwl_segments.front(),
ptr_pwl_segments.size() * sizeof(gna_pwl_segment_t),
64);
@ -2152,8 +2140,9 @@ void GNAGraphCompiler::CreateLayerPrimitive(CNNLayerPtr layer) {
}
}
void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr,
size_t num_data_bytes_out) {
void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer,
void *ptr,
size_t num_data_bytes_out) {
auto getOffsetForBinding = [](InferenceEngine::CNNLayerPtr layer) {
int32_t output_offset = 0;
if (layer->params.find("output_offset") != layer->params.end()) {
@ -2162,7 +2151,6 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p
return output_offset;
};
gnalog() << "Connecting output " << layer->name << " ...\n";
// in case of a Memory Layer, its input is allocated in the meminput layer
if (layer->outData.size() == 1) {
@ -2179,7 +2167,6 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p
if (!nextLayer.first) {
gnalog() << "for layer: " << layer->name << "outData[0] has non functional connection at " << j;
}
auto nextMemoryLayerIt =
std::find_if(begin(memory_connection), end(memory_connection),
[&](MemoryConnection::value_type &comp) {
@ -2190,14 +2177,13 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p
// memory layer not yet initialized
if (nextMemoryLayer.reserved_size == 0) {
auto memorySize = InferenceEngine::details::product(nextMemoryLayer.getDims()) * nextMemoryLayer.elementSizeBytes();
gnamem->reserve_ptr(&nextMemoryLayer.gna_ptr, ALIGN64(memorySize), 64);
gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, getOffsetForBinding(layer));
gnamem->reserve_ptr(nullptr, &nextMemoryLayer.gna_ptr, ALIGN64(memorySize), 64);
gnamem->bind_ptr(nullptr, ptr, &nextMemoryLayer.gna_ptr, getOffsetForBinding(layer));
nextMemoryLayer.reserved_size = ALIGN64(memorySize);
} else {
// We may need to extend memory buffer if connected input size is bigger, for example for concat connection
gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, getOffsetForBinding(layer), ALIGN64(num_data_bytes_out));
gnamem->bind_ptr(nullptr, ptr, &nextMemoryLayer.gna_ptr, getOffsetForBinding(layer), ALIGN64(num_data_bytes_out));
}
return;
}
@ -2288,7 +2274,7 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p
return it != concatItem.second.concatInputLayers.end();
});
if (included == concat_connection.end()) {
gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size), 64);
gnamem->reserve_ptr(layer, &concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size), 64);
std::function<void(GNAConcatLayer, GNAPluginNS::InputDesc&, ConcatConnection&)> allocate_input_recursively =
[&allocate_input_recursively](GNAConcatLayer clayer, GNAPluginNS::InputDesc& inputDesc, ConcatConnection& concat_connection) {
@ -2321,26 +2307,24 @@ void GNAGraphCompiler::connectOutput(InferenceEngine::CNNLayerPtr layer, void *p
if (layer->params.find("output_offset") != layer->params.end()) {
output_offset = layer->GetParamAsInt("output_offset");
}
gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, output_offset);
gnamem->bind_ptr(layer, ptr, &concatLayerInfoItem.gna_ptr, output_offset);
}
return;
}
}
intel_dnn_component_t * unused_input = nullptr;
if (gnaFlags->compact_mode) {
unused_input = find_first_unused_input(layer);
if (unused_input != nullptr) {
gnamem->bind_ptr(ptr, &unused_input->ptr_inputs, 0, ALIGN64(num_data_bytes_out));
}
}
// cannot reuse suitable input
if (unused_input == nullptr) {
gnamem->reserve_ptr(ptr, ALIGN64(num_data_bytes_out), 64);
}
auto nextLayer = CNNNetCheckNextLayerSkipCertain(layer, 0, 0, true,
[](CNNLayerPtr l) { return LayerInfo(l).isNonFunctional(); }).first;
// Check that layer will be an output
gnamem->reserve_ptr((LayerInfo(layer).isOutput() || !nextLayer) ? nullptr : layer, ptr, ALIGN64(num_data_bytes_out), 64);
}
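
The rewritten tail of connectOutput above carries the core reuse idea: in compact mode, try to alias this layer's output onto the input buffer of an already-executed component that nobody will read again; only fall back to a fresh reserve_ptr when no such buffer exists, and pass nullptr (no lifetime) for real network outputs so their buffers are never reclaimed. A self-contained toy of the selection step, under the simplifying assumption that each component records the execution order of its last reader (not the plugin's actual find_first_unused_input):

    #include <cstdint>
    #include <vector>

    struct Component {
        void*    ptr_inputs;        // candidate buffer to overwrite
        uint16_t last_read_order;   // execution order of its last consumer
    };

    // Return the first input buffer that is dead by the time 'current_order' runs.
    Component* find_first_unused_input(std::vector<Component>& comps,
                                       uint16_t current_order) {
        for (auto& c : comps) {
            if (c.last_read_order < current_order)
                return &c;          // safe to reuse: no future reads
        }
        return nullptr;             // caller falls back to reserve_ptr()
    }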
GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, int32_t offset, int idx, bool connectTo) {
GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
void *ptr,
size_t num_data_bytes_in,
int32_t offset,
int idx,
bool connectTo) {
// selecting particular input layers
// auto prevLayer = CNNNetPrevLayer(layer, idx);
auto prevLayer = CNNNetPrevLayerSkipCertain(layer, idx, [](CNNLayerPtr l) {
@ -2363,12 +2347,12 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
// real allocation pointer will be kept in ptr not in ptr_inputs_global
if (!connectTo) {
gnamem->push_value(ptr,
gnamem->push_value(nullptr, ptr,
static_cast<uint8_t>(0),
num_data_bytes_in,
64);
} else {
gnamem->push_value(&inputDesc->getPtrInputsGlobal(prevLayer->name).front(),
gnamem->push_value(nullptr, &inputDesc->getPtrInputsGlobal(prevLayer->name).front(),
static_cast<uint8_t>(0),
num_data_bytes_in,
64);
@ -2384,9 +2368,9 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
}
if (connectTo) {
gnamem->bind_ptr(ptr, &inputDesc->getPtrInputsGlobal(prevLayer->name).front(), offset, ALIGN(num_data_bytes_in, 64));
gnamem->bind_ptr(nullptr, ptr, &inputDesc->getPtrInputsGlobal(prevLayer->name).front(), offset, ALIGN(num_data_bytes_in, 64));
} else {
gnamem->bind_ptr(&inputDesc->getPtrInputsGlobal(prevLayer->name).front(), ptr, offset, ALIGN(num_data_bytes_in, 64));
gnamem->bind_ptr(nullptr, &inputDesc->getPtrInputsGlobal(prevLayer->name).front(), ptr, offset, ALIGN(num_data_bytes_in, 64));
}
return prevLayer;
@ -2394,9 +2378,9 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
// const input
if (LayerInfo(prevLayer).isConst()) {
if (connectTo) {
gnamem->bind_ptr(ptr, const_connections[prevLayer->name], offset);
gnamem->bind_ptr(layer, ptr, const_connections[prevLayer->name], offset);
} else {
gnamem->bind_ptr(const_connections[prevLayer->name], ptr, offset);
gnamem->bind_ptr(layer, const_connections[prevLayer->name], ptr, offset);
}
return prevLayer;
@ -2423,6 +2407,8 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
if (it != splitLayerInfoItem.splitOutputLayers.end()) {
gnalog() << "Connecting " << splitName << " input \n";
// splitting layer should take the execution order from the connected layer
splittingLayer->userValue = layer->userValue;
auto res = connectInput(splittingLayer, ptr, splitLayerInfoItem.reserved_size, it->offset + offset, 0);
gnalog() << "Connected \n";
return res;
@ -2435,7 +2421,7 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
if (concatLayerInfo != concat_connection.end()) {
auto & concatLayerInfoItem = concatLayerInfo->second;
// dnnLayer that is input for concat layer
gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, offset);
gnamem->bind_ptr(layer, ptr, &concatLayerInfoItem.gna_ptr, offset);
// return layer over concat
return CNNNetPrevLayer(prevLayer);
}
@ -2444,7 +2430,7 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
prevLayer->name);
if (cropLayerInfo != crop_connection.end()) {
auto & cropLayerInfoItem = cropLayerInfo->second;
gnamem->bind_ptr(ptr, &cropLayerInfoItem.gna_ptr, offset);
gnamem->bind_ptr(layer, ptr, &cropLayerInfoItem.gna_ptr, offset);
return CNNNetPrevLayer(prevLayer);
}
}
@ -2452,7 +2438,7 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
// check for generic prev layer
if (prevDnnLayer != nullptr) {
gnamem->bind_ptr(ptr, &prevDnnLayer->ptr_outputs, offset);
gnamem->bind_ptr(layer, ptr, &prevDnnLayer->ptr_outputs, offset);
return prevLayer;
}
@ -2470,20 +2456,20 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
// connectTo is used to indicate that the memory layer should be bound to the given buffer
if (connectTo) {
memorySize = std::max(memorySize, num_data_bytes_in);
gnamem->reserve_ptr(&memoryLayer.gna_ptr, ALIGN64(memorySize), 64);
gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, offset);
gnamem->reserve_ptr(nullptr, &memoryLayer.gna_ptr, ALIGN64(memorySize), 64);
gnamem->bind_ptr(nullptr, ptr, &memoryLayer.gna_ptr, offset);
} else {
if (num_data_bytes_in < memorySize + offset) {
THROW_GNA_LAYER_EXCEPTION(layer) << " invalid allocation request of "
<< num_data_bytes_in << " is more than the state tensor size of: " << memorySize + offset;
}
gnamem->bind_ptr(&memoryLayer.gna_ptr, ptr, offset);
gnamem->bind_ptr(nullptr, &memoryLayer.gna_ptr, ptr, offset);
}
memoryLayer.reserved_size = ALIGN64(memorySize);
} else {
// We may need to extend memory buffer if connected input size is bigger, for example for concat connection
gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, offset, ALIGN64(num_data_bytes_in));
gnamem->bind_ptr(nullptr, ptr, &memoryLayer.gna_ptr, offset, ALIGN64(num_data_bytes_in));
}
return prevLayer;

View File

@ -17,6 +17,7 @@
#include <utility>
#include <limits>
#include <ie_common.h>
#include <legacy/graph_tools.hpp>
#include <legacy/net_pass.h>
#include <debug.h>
@ -524,7 +525,7 @@ bool GNAPlugin::TryToInitOutput(int portId, InferenceEngine::CNNLayerPtr layer)
desc.num_elements = numElem;
// binding ptr for first infer request - then others will be setup during relocation
gnamem->bind_ptr(&desc.ptrs.front(), outputPtr);
gnamem->bind_ptr(layer, &desc.ptrs.front(), outputPtr);
};
// probing gna_primitives
@ -927,7 +928,11 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
}
// Creating Layer primitives
uint16_t id = 0;
for (auto & layer : sortedNoMem) {
IE_SUPPRESS_DEPRECATED_START
layer->userValue.v_int = id++;
IE_SUPPRESS_DEPRECATED_END
graphCompiler.CreateLayerPrimitive(layer);
}
@ -981,7 +986,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
// TODO: how active list will work in multioutput case
// make room for active list
gnamem->reserve_ptr(nullptr,
gnamem->reserve_ptr(nullptr, nullptr,
ALIGN64(outputsDesc.front().num_bytes_per_element * outputsDesc.front().num_elements), 64);
void *pParallelExecutionData = nullptr;
@ -989,10 +994,10 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
// reserving more bytes for intermediate data in parallel case - TODO: this works incorrectly in compact mode at least
rwSegmentSize = gnamem->getRWBytes();
if (gnaFlags->gna_lib_async_threads_num > 1) {
gnamem->reserve_ptr(&pParallelExecutionData, gnamem->getRWBytes() * (gnaFlags->gna_lib_async_threads_num - 1), 64);
gnamem->reserve_ptr(nullptr, &pParallelExecutionData, gnamem->getRWBytes() * (gnaFlags->gna_lib_async_threads_num - 1), 64);
}
gnamem->commit();
gnamem->commit(gnaFlags->compact_mode);
dnn->Init(gnamem->getBasePtr(),
gnamem->getTotalBytes(),
@ -1569,7 +1574,7 @@ InferenceEngine::IExecutableNetworkInternal::Ptr GNAPlugin::ImportNetwork(std::i
graphCompiler.setGNAMemoryPtr(gnamem);
void *basePtr = nullptr;
gnamem->reserve_ptr(&basePtr, header.gnaMemSize);
gnamem->reserve_ptr(nullptr, &basePtr, header.gnaMemSize);
gnamem->commit();
#if GNA_LIB_VER == 2
gnaModels.push_back(std::make_tuple(make_shared<CPPWrapper<Gna2Model>>(header.layersCount)));

View File

@ -14,6 +14,8 @@
* @brief used for creating graphviz charts, and layers dump
*/
# define PLOT
# define MODEL_DUMP
# define GNA_HEAP_PROFILER
# define gnalog() std::cout
# define gnawarn() std::cerr
#else

View File

@ -8,6 +8,8 @@
#include <vector>
#include <algorithm>
#include "gna_plugin_log.hpp"
namespace GNAPluginNS {
namespace memory {
@ -26,6 +28,45 @@ enum rRegion {
REGION_AUTO,
};
#ifdef GNA_HEAP_PROFILER
inline const char* rRegionToStr(uint8_t region) {
const char* strRegion = "UNKNOWN";
switch (region) {
case REGION_RO:
strRegion = "REGION_RO";
break;
case REGION_RW:
strRegion = "REGION_RW";
break;
case REGION_AUTO:
strRegion = "REGION_AUTO";
break;
}
return strRegion;
}
inline const char* rTypeToStr(uint8_t type) {
const char* strType = "UNKNOWN";
switch (type) {
case REQUEST_STORE:
strType = "REQUEST_STORE";
break;
case REQUEST_ALLOCATE:
strType = "REQUEST_ALLOCATE";
break;
case REQUEST_BIND:
strType = "REQUEST_BIND";
break;
case REQUEST_INITIALIZER | REQUEST_STORE:
case REQUEST_INITIALIZER | REQUEST_ALLOCATE:
case REQUEST_INITIALIZER | REQUEST_BIND:
strType = "INITIALIZER";
break;
}
return strType;
}
#endif
struct MemRequest {
rRegion _region;
uint8_t _type;
@ -40,6 +81,10 @@ struct MemRequest {
size_t _offset = 0;
// expansion in bytes due to large dependent layers
size_t _padding = 0;
// fields to sort regions by execution availability
std::pair<uint16_t, uint16_t> _life_limits{0, UINT16_MAX};
MemRequest(rRegion region,
rType req,
void *ptr_out,
@ -79,7 +124,8 @@ struct MemRequest {
_data.resize(sizeof(T));
std::copy(reinterpret_cast<uint8_t *>(&element), reinterpret_cast<uint8_t *>(&element) + sizeof(T), _data.begin());
}
/**
/**
* Store initializer request
* @param req
* @param ptr_out

View File

@ -8,10 +8,23 @@
#include <vector>
#include <algorithm>
#include <functional>
#include <ie_api.h>
#include <legacy/ie_layers.h>
#include "gna_mem_requests.hpp"
namespace GNAPluginNS {
namespace memory {
/**
* @brief get layer id from legacy CNNLayer
*/
inline uint16_t getCNNLayerId(InferenceEngine::CNNLayerPtr layer) {
IE_SUPPRESS_DEPRECATED_START
return layer->userValue.v_int;
IE_SUPPRESS_DEPRECATED_END
}
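
This helper reads back the execution-order id that GNAPlugin::LoadNetwork writes into the deprecated userValue field (see the gna_plugin.cpp hunk above). A hypothetical round-trip, mirroring the style of the new unit tests:

    CNNLayerPtr layer = std::make_shared<CNNLayer>(LayerParams("fc1", "Test", Precision::FP32));
    IE_SUPPRESS_DEPRECATED_START
    layer->userValue.v_int = 42;          // assigned in topological execution order
    IE_SUPPRESS_DEPRECATED_END
    uint16_t id = getCNNLayerId(layer);   // 42 (note: v_int is truncated to uint16_t)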
/**
* Adapter for requests submission and actual request queue
*/
@ -26,12 +39,26 @@ public:
* @param num_bytes
* @param alignment
*/
void push_initializer(void *ptr_out, size_t num_bytes, std::function<void(void * data, size_t size)> initializer, size_t alignment = 1) {
void push_initializer(InferenceEngine::CNNLayerPtr layer,
void *ptr_out,
size_t num_bytes,
std::function<void(void * data, size_t size)> initializer,
size_t alignment = 1) {
futureHeap().push_back({regionType(), ptr_out, num_bytes, initializer, REQUEST_INITIALIZER, alignment});
if (layer != nullptr) {
futureHeap().back()._life_limits = {0, getCNNLayerId(layer)};
}
}
void push_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) {
void push_ptr(InferenceEngine::CNNLayerPtr layer,
void *ptr_out,
const void *ptr_in,
size_t num_bytes,
size_t alignment = 1) {
futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, ptr_in, 1, num_bytes, alignment});
if (layer != nullptr) {
futureHeap().back()._life_limits = {0, getCNNLayerId(layer)};
}
}
/**
@ -40,10 +67,17 @@ public:
* @param ptr_in
* @param num_bytes
*/
void push_local_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) {
void push_local_ptr(InferenceEngine::CNNLayerPtr layer,
void *ptr_out,
const void *ptr_in,
size_t num_bytes,
size_t alignment = 1) {
localStorage().emplace_back(reinterpret_cast<const uint8_t *>(ptr_in),
reinterpret_cast<const uint8_t *>(ptr_in) + num_bytes);
futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, &localStorage().back().front(), 1, num_bytes, alignment});
if (layer != nullptr) {
futureHeap().back()._life_limits = {0, getCNNLayerId(layer)};
}
}
/**
@ -51,8 +85,14 @@ public:
* @param ptr_out
* @param num_bytes
*/
void reserve_ptr(void *ptr_out, size_t num_bytes, size_t alignment = 1) {
void reserve_ptr(InferenceEngine::CNNLayerPtr layer,
void *ptr_out,
size_t num_bytes,
size_t alignment = 1) {
futureHeap().push_back({regionType(), REQUEST_ALLOCATE, ptr_out, nullptr, 1, num_bytes, alignment});
if (layer != nullptr) {
futureHeap().back()._life_limits = {getCNNLayerId(layer), getCNNLayerId(layer)};
}
}
/**
@ -63,8 +103,15 @@ public:
* @param num_bytes - bind can request for bigger buffer that originally allocated via reserve(),
* if that happens - reserved request parameters will be updated before committing memory
*/
void bind_ptr(void *source, const void *dest, size_t offset = 0, size_t num_bytes = 0) {
void bind_ptr(InferenceEngine::CNNLayerPtr layer,
void *source,
const void *dest,
size_t offset = 0,
size_t num_bytes = 0) {
futureHeap().push_back({regionType(), REQUEST_BIND, source, dest, 1, num_bytes, 1, offset});
if (layer != nullptr) {
futureHeap().back()._life_limits = {getCNNLayerId(layer), getCNNLayerId(layer)};
}
}
/**
@ -72,16 +119,28 @@ public:
* @param ptr_out - previously requested buffer
* @param initializer - initialisation routine to be called on allocated memory
*/
void bind_initializer(void *ptr_out, std::function<void(void * data, size_t size)> initializer) {
void bind_initializer(InferenceEngine::CNNLayerPtr layer,
void *ptr_out,
std::function<void(void * data, size_t size)> initializer) {
futureHeap().push_back({regionType(), ptr_out, 0, initializer, REQUEST_BIND, 1});
if (layer != nullptr) {
futureHeap().back()._life_limits = {0, getCNNLayerId(layer)};
}
}
/**
* @brief allocates buffer and set all its values to T value
*/
template<class T>
void push_value(void *ptr_out, T value, size_t num_elements, size_t alignment = 1) {
void push_value(InferenceEngine::CNNLayerPtr layer,
void *ptr_out,
T value,
size_t num_elements,
size_t alignment = 1) {
futureHeap().push_back({regionType(), ptr_out, value, num_elements, alignment});
if (layer != nullptr) {
futureHeap().back()._life_limits = {0, getCNNLayerId(layer)};
}
}
/**
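
Taken together, the queue changes above attach an execution-order lifetime to every request: store and initializer requests (weights, constants) get {0, consumerId}, since their payload must survive from model load until the consuming layer runs, while reserve and bind requests get {layerId, layerId}, live only while that layer executes. Two requests whose intervals never overlap may later share the same bytes. A minimal sketch of that invariant (simplified; the actual packing is delegated to MemorySolver in gna_memory.hpp):

    #include <cstddef>
    #include <cstdint>
    #include <utility>

    struct Request {
        std::pair<uint16_t, uint16_t> life_limits;  // {first_use, last_use}
        size_t size;
    };

    // Two requests may occupy the same address range iff their live
    // intervals are disjoint.
    bool can_share(const Request& a, const Request& b) {
        return a.life_limits.second < b.life_limits.first ||
               b.life_limits.second < a.life_limits.first;
    }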

View File

@ -13,7 +13,15 @@
#include <list>
#include <algorithm>
#include <functional>
#include <iostream>
#include "gna_lib_ver_selector.hpp"
#include "memory_solver.hpp"
#include "gna_plugin_log.hpp"
#ifdef GNA_HEAP_PROFILER
#include <iomanip>
#include <fstream>
#endif
namespace GNAPluginNS {
namespace memory {
@ -32,6 +40,7 @@ class GNAMemory : public GNAMemRequestsQueue {
Allocator _allocator;
std::shared_ptr<uint8_t> heap = nullptr;
size_t _page_alignment = 1;
bool _is_compact_mode = false;
class GNAMemRequestsReadOnlyQueue : public GNAMemRequestsQueue {
std::reference_wrapper<GNAMemRequestsQueue> _that;
@ -62,93 +71,32 @@ class GNAMemory : public GNAMemRequestsQueue {
return readOnlyFrontEnd;
}
/**
* @brief enables memory optimization (compact mode). This mode can be enabled in the plugin configuration (COMPACT_MODE = Yes)
*/
void setCompactMode(bool isCompact) {
_is_compact_mode = isCompact;
}
/**
* @brief calculates size required for all requests, allocates memory and updates pointers
*/
void commit() {
void commit(bool isCompact = false) {
setCompactMode(isCompact);
// 1st stage -- looking for expandable bind requests:
for (auto &originated : _future_heap) {
if (originated._type & REQUEST_BIND) continue;
size_t offset = 0;
iterate_binded(originated, [&](MemRequest & reference, MemRequest & binded) {
if (&originated == &reference) {
offset = 0;
}
offset += binded._offset;
auto current = offset + ALIGN(binded._num_elements * binded._element_size, binded._alignment);
auto original_no_pad = ALIGN(originated._num_elements * originated._element_size, originated._alignment);
auto original_with_pad = ALIGN(originated._num_elements * originated._element_size + originated._padding, originated._alignment);
expandBindings();
originated._padding = ALIGN(std::max(original_with_pad, current), originated._alignment) - original_no_pad;
});
}
// 2nd stage -- setup offsets:
setRegionOffsets(REGION_RO);
setRegionOffsets(REGION_RW);
updateSectionsSizes();
// 3rd stage -- allocation total memory setting to 0 internally
heap = allocate(getTotalBytes());
_total = _rw_section_size + _ro_section_size;
// allocation with memory setting to 0 internally
heap = allocate(_total);
auto setupOffsets = [&](std::function<bool(MemRequest & request)> filter, size_t offset) {
for (auto &re : _future_heap) {
if (re._type == REQUEST_BIND) continue;
if (filter(re)) continue;
auto sz = re._element_size * re._num_elements;
if (re._ptr_out != nullptr) {
auto cptr = heap.get() + offset;
size_t cptr_avail_size = _total - offset;
if (re._type & REQUEST_BIND) {
cptr = reinterpret_cast<uint8_t*>(*reinterpret_cast<void **>(re._ptr_out));
cptr_avail_size = sz;
} else {
*reinterpret_cast<void **>(re._ptr_out) = cptr;
}
// std::cout << "ALLOCATED=" << cptr << ", size=" << re._element_size * re._num_elements << "\n";
iterate_binded(re, [](MemRequest & reference, MemRequest & binded) {
*reinterpret_cast<void **>(binded._ptr_out) =
binded._offset + reinterpret_cast<uint8_t *>(*reinterpret_cast<void **>(reference._ptr_out));
binded._num_elements = reference._num_elements;
binded._element_size = reference._element_size;
});
// std::cout << "size=" << ALIGN(sz, re._alignment) << "\n" << std::flush;
switch (re._type & ~REQUEST_BIND) {
case REQUEST_ALLOCATE :
break;
case REQUEST_STORE : {
if (re._ptr_in != nullptr) {
ie_memcpy(cptr, cptr_avail_size, re._ptr_in, sz);
} else {
size_t of = 0;
for (int i = 0; i < re._num_elements; i++, of += re._element_size) {
std::copy(std::begin(re._data), std::end(re._data), cptr + of);
}
}
break;
}
case REQUEST_INITIALIZER : {
re._initializer(cptr, sz);
break;
}
}
}
if (!(re._type & REQUEST_BIND)) {
offset += ALIGN(sz + re._padding, re._alignment);
}
}
};
setupOffsets([](GNAPluginNS::memory::MemRequest & request) {
// TODO: consume bind requests separately from storage type
return !(request._type & REQUEST_BIND) && (request._region != REGION_RW);
}, 0);
setupOffsets([](GNAPluginNS::memory::MemRequest & request) {
return (request._type & REQUEST_BIND) || request._region != REGION_RO;
}, _rw_section_size);
// 4th stage -- store data and updates pointers
allocateRegion(REGION_RW, 0);
allocateRegion(REGION_RO, _rw_section_size);
}
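
commit() is now staged explicitly: expandBindings() propagates sizes and lifetimes through bind chains, setRegionOffsets() lays out each region, a single allocation follows, and allocateRegion() patches pointers and stores payloads. A usage sketch mirroring the new unit tests further below (layer1 and layer2 stand for CNNLayer handles carrying execution ids 1 and 2, as in those tests):

    GNAMemory<std::allocator<uint8_t>> mem;
    float* a = nullptr;
    float* b = nullptr;
    mem.reserve_ptr(layer1, &a, 3 * sizeof(float));  // live only while layer1 runs
    mem.reserve_ptr(layer2, &b, 2 * sizeof(float));  // live only while layer2 runs
    mem.commit(/*isCompact=*/true);  // RW section is max(12, 8) = 12 bytes, not 20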
void *getBasePtr() {
@ -180,7 +128,7 @@ class GNAMemory : public GNAMemRequestsQueue {
void iterate_binded(GNAPluginNS::memory::MemRequest & reference, const T & visitor) {
for (auto &re : _future_heap) {
if ((re._type & REQUEST_BIND) && (re._ptr_in == reference._ptr_out)) {
// std::cout << " [binded=" << re._type << ", ptr=" << re._ptr_out <<"]\n";
// std::cout << " [binded=" << rTypeToStr(re._type) << ", ptr=" << re._ptr_out <<"]\n";
visitor(reference, re);
// primitive loop check
if (re._ptr_in == re._ptr_out) continue;
@ -190,7 +138,6 @@ class GNAMemory : public GNAMemRequestsQueue {
}
}
std::shared_ptr<uint8_t> allocate(size_t bytes) {
std::shared_ptr<uint8_t> sp(_allocator.allocate(bytes), [=](uint8_t *p) {
_allocator.deallocate(p, bytes);
@ -200,31 +147,191 @@ class GNAMemory : public GNAMemRequestsQueue {
}
protected:
/**
* @brief expand BIND and compound (BIND | *) requests: align sizes (_padding) and set execution order
*/
void expandBindings() {
for (auto &originated : _future_heap) {
// skipping bind requests to avoid duplications
if (originated._type & REQUEST_BIND) continue;
size_t offset = 0;
iterate_binded(originated, [&](MemRequest & reference, MemRequest & binded) {
// aligning sizes
if (&originated == &reference) offset = 0;
offset += binded._offset;
auto current = offset + ALIGN(binded._num_elements * binded._element_size, binded._alignment);
auto original_no_pad = ALIGN(originated._num_elements * originated._element_size, originated._alignment);
auto original_with_pad = ALIGN(originated._num_elements * originated._element_size + originated._padding, originated._alignment);
originated._padding = ALIGN(std::max(original_with_pad, current), originated._alignment) - original_no_pad;
// set execution order
originated._life_limits.first = std::min(originated._life_limits.first, binded._life_limits.first);
originated._life_limits.second = std::max(originated._life_limits.second, binded._life_limits.second);
});
}
}
/**
* @brief set offsets for specific region
*/
size_t setRegionOffsets(GNAPluginNS::memory::rRegion regType) {
size_t region_offset = 0;
for (auto &re : _future_heap) {
if (re._region != regType || re._type & REQUEST_BIND || re._ptr_out == nullptr) continue;
re._offset = region_offset;
region_offset += ALIGN(re._num_elements * re._element_size + re._padding, re._alignment);
}
return region_offset;
}
/**
* @brief allocates memory and updates pointers
*/
void allocateRegion(GNAPluginNS::memory::rRegion regType, size_t baseOffset) {
for (auto &re : _future_heap) {
// skipping Bind, cross-region and empty requests
if (re._region != regType || re._type == REQUEST_BIND || re._ptr_out == nullptr) continue;
size_t offset = baseOffset + re._offset;
auto cptr = heap.get() + offset;
size_t cptr_avail_size = _total - offset;
auto sz = re._element_size * re._num_elements;
if (re._type & REQUEST_BIND) {
cptr = reinterpret_cast<uint8_t*>(*reinterpret_cast<void **>(re._ptr_out));
cptr_avail_size = sz;
} else {
*reinterpret_cast<void **>(re._ptr_out) = cptr;
}
iterate_binded(re, [](MemRequest & reference, MemRequest & binded) {
*reinterpret_cast<void **>(binded._ptr_out) =
binded._offset + reinterpret_cast<uint8_t *>(*reinterpret_cast<void **>(reference._ptr_out));
binded._num_elements = reference._num_elements;
binded._element_size = reference._element_size;
});
switch (re._type & ~REQUEST_BIND) {
case REQUEST_ALLOCATE :
break;
case REQUEST_STORE : {
if (re._ptr_in != nullptr) {
ie_memcpy(cptr, cptr_avail_size, re._ptr_in, sz);
} else {
size_t of = 0;
for (int i = 0; i < re._num_elements; i++, of += re._element_size) {
std::copy(std::begin(re._data), std::end(re._data), cptr + of);
}
}
break;
}
case REQUEST_INITIALIZER : {
re._initializer(cptr, sz);
break;
}
}
}
}
/**
* @brief optimize memory region by reusing buffers
*/
size_t getSectionSizeOptimized(GNAPluginNS::memory::rRegion regType) {
size_t memSize = 0;
switch (regType) {
case REGION_AUTO:
case REGION_RW:
case REGION_RO: {
std::vector<MemorySolver::Box> boxes;
for (size_t i = 0; i < _future_heap.size(); ++i) {
// skipping BIND, cross-region and empty requests
if (_future_heap[i]._type & REQUEST_BIND || _future_heap[i]._region != regType || _future_heap[i]._ptr_out == nullptr) {
continue;
}
auto original_with_pad = ALIGN(_future_heap[i]._num_elements * _future_heap[i]._element_size + _future_heap[i]._padding,
_future_heap[i]._alignment);
int start = _future_heap[i]._life_limits.first;
int stop = _future_heap[i]._life_limits.second;
boxes.push_back({start, stop, static_cast<int64_t>(original_with_pad), static_cast<int64_t>(i)});
}
MemorySolver memSolver(boxes);
memSize = memSolver.solve();
// setting offsets
for (auto const & box : boxes) {
_future_heap[box.id]._offset = memSolver.getOffset(box.id);
}
}
break;
default:
break;
}
return memSize;
}
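
getSectionSizeOptimized() reuses the interval-packing MemorySolver shared with other plugins: each non-bind request becomes a Box whose start/stop come from _life_limits, solve() returns the minimal section size, and getOffset(id) yields the placement written back into _offset. An isolated illustration of that contract (field order {start, finish, size, id} as used above):

    std::vector<MemorySolver::Box> boxes = {
        {0, 1, 64, 0},  // e.g. weights: live from load until layer 1
        {2, 2, 64, 1},  // e.g. scratch buffer of layer 2
    };
    MemorySolver solver(boxes);
    int64_t totalBytes = solver.solve();    // 64: the intervals are disjoint
    int64_t offset0 = solver.getOffset(0);  // both boxes may land at offset 0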
#ifdef GNA_HEAP_PROFILER
void memoryDump(std::function<bool(MemRequest & re)> filter) {
std::ofstream dumpFile("gna_memory_requests.txt", std::ios::out);
for (auto &re : _future_heap) {
if (filter(re)) continue;
dumpFile << ": " << " region: " << rRegionToStr(re._region) << ", "
<< "type: " << std::setw(17) << rTypeToStr(re._type) << " "
<< "ptr_in: " << std::setw(15) << re._ptr_in << " "
<< "ptr_out: " << std::setw(15) << re._ptr_out << " "
<< std::setw(8) << re._num_elements << ", "
<< static_cast<int>(re._element_size) << ", "
<< re._padding << ", "
<< std::setw(3) << re._alignment << ", "
<< std::setw(8) << re._offset << ", "
<< "life_time: " << re._life_limits.first << ":" << re._life_limits.second << ", "
<< std::endl;
}
}
#endif
void updateSectionsSizes() {
// count total size and size of read/write regions
_rw_section_size = 0;
_ro_section_size = 0;
for (auto &re : _future_heap) {
auto current = ALIGN(re._num_elements * re._element_size + re._padding, re._alignment);
#ifdef GNA_HEAP_PROFILER
std::cout << "chunk: " << " region: " << re._region << ", " <<
"type: " << (re._type == REQUEST_STORE ? "store " : re._type == REQUEST_BIND ? "bind " : "alloc ") <<
std::setw(10) << re._num_elements << ", " <<
static_cast<int>(re._element_size) << ", " <<
re._padding << ", " <<
re._offset << ", " <<
re._alignment << std::endl;
memoryDump([](GNAPluginNS::memory::MemRequest & request) {
return false;
});
#endif
if (re._type == REQUEST_BIND) continue;
for (auto &re : _future_heap) {
if (re._type & REQUEST_BIND || re._ptr_out == nullptr) continue;
size_t current = ALIGN(re._num_elements * re._element_size + re._padding, re._alignment);
if (re._region == REGION_RW) {
_rw_section_size += current;
} else {
_ro_section_size += current;
}
}
if (_is_compact_mode) {
_rw_section_size = getSectionSizeOptimized(REGION_RW);
}
gnalog() << "ro_section_size: " << _ro_section_size << std::endl;
gnalog() << "rw_section_size: " << _rw_section_size << std::endl;
gnalog() << "total: " << _total << std::endl;
_rw_section_size = ALIGN(_rw_section_size, _page_alignment);
_ro_section_size = ALIGN(_ro_section_size, _page_alignment);
_total = _rw_section_size + _ro_section_size;
gnalog() << "Aligned ro_section_size: " << _ro_section_size << std::endl;
gnalog() << "Aligned rw_section_size: " << _rw_section_size << std::endl;
}
};
} // namespace memory

View File

@ -0,0 +1,250 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include <gtest/gtest.h>
#include <legacy/ie_layers.h>
#include "memory/gna_memory.hpp"
using namespace InferenceEngine;
using namespace GNAPluginNS::memory;
class GNAMemoryCompactTest : public ::testing::Test {
protected:
GNAMemory<std::allocator<uint8_t>> mem;
bool isCompact = true;
void SetUp() override {
}
};
TEST_F(GNAMemoryCompactTest, canOptimizeReservePtr) {
IE_SUPPRESS_DEPRECATED_START
CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32));
CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32));
layer1->userValue.v_int = 1;
layer2->userValue.v_int = 2;
IE_SUPPRESS_DEPRECATED_END
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
mem.reserve_ptr(layer1, pFuture1, 3 * sizeof(float));
mem.reserve_ptr(layer2, pFuture2, 2 * sizeof(float));
mem.commit(isCompact);
ASSERT_EQ(mem.getRWBytes(), 3 * sizeof(float));
ASSERT_EQ(mem.getTotalBytes(), 3 * sizeof(float));
}
TEST_F(GNAMemoryCompactTest, canOptimizePushValue) {
IE_SUPPRESS_DEPRECATED_START
CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32));
CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32));
layer1->userValue.v_int = 1;
layer2->userValue.v_int = 2;
IE_SUPPRESS_DEPRECATED_END
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
mem.push_value(layer1, pFuture1, 1.f, 2);
mem.push_value(layer2, pFuture2, 2.f, 3);
mem.commit(isCompact);
ASSERT_EQ(mem.getRWBytes(), 5 * sizeof(float));
ASSERT_EQ(mem.getTotalBytes(), 5 * sizeof(float));
}
TEST_F(GNAMemoryCompactTest, canOptimizePushValueAndReservePtr) {
IE_SUPPRESS_DEPRECATED_START
CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32));
CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32));
CNNLayerPtr layer3 = std::make_shared<CNNLayer>(LayerParams("layer3", "test", Precision::FP32));
layer1->userValue.v_int = 1;
layer2->userValue.v_int = 2;
layer3->userValue.v_int = 3;
IE_SUPPRESS_DEPRECATED_END
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
float* pFuture3 = reinterpret_cast<float*>(&pFuture3);
mem.push_value(layer1, pFuture1, 3.f, 2);
mem.bind_ptr(layer2, pFuture2, pFuture1, 0, 2);
mem.reserve_ptr(layer3, pFuture3, 2 * sizeof(float));
mem.commit(isCompact);
ASSERT_EQ(mem.getRWBytes(), 2 * sizeof(float));
ASSERT_EQ(mem.getTotalBytes(), 2 * sizeof(float));
}
TEST_F(GNAMemoryCompactTest, canOptimizeTwoPushValueAndReservePtr) {
IE_SUPPRESS_DEPRECATED_START
CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32));
CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32));
CNNLayerPtr layer3 = std::make_shared<CNNLayer>(LayerParams("layer3", "test", Precision::FP32));
CNNLayerPtr layer4 = std::make_shared<CNNLayer>(LayerParams("layer4", "test", Precision::FP32));
layer1->userValue.v_int = 1;
layer2->userValue.v_int = 2;
layer3->userValue.v_int = 3;
layer4->userValue.v_int = 4;
IE_SUPPRESS_DEPRECATED_END
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
float* pFuture3 = reinterpret_cast<float*>(&pFuture3);
mem.push_value(layer1, pFuture1, 1.f, 2);
mem.push_value(layer2, pFuture2, 2.f, 3);
mem.reserve_ptr(layer3, pFuture3, 5 * sizeof(float));
mem.bind_ptr(layer2, pFuture2, pFuture1, 0, 2);
mem.commit(isCompact);
ASSERT_EQ(mem.getRWBytes(), 5 * sizeof(float));
ASSERT_EQ(mem.getTotalBytes(), 5 * sizeof(float));
}
TEST_F(GNAMemoryCompactTest, canOptimizePushPtrAndReservePtr) {
IE_SUPPRESS_DEPRECATED_START
CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32));
CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32));
CNNLayerPtr layer3 = std::make_shared<CNNLayer>(LayerParams("layer3", "test", Precision::FP32));
layer1->userValue.v_int = 1;
layer2->userValue.v_int = 2;
layer3->userValue.v_int = 3;
IE_SUPPRESS_DEPRECATED_END
float input[] = {1, 2, 3};
size_t input_size = sizeof(input);
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
float* pFuture3 = reinterpret_cast<float*>(&pFuture3);
mem.push_ptr(layer1, pFuture1, input, input_size);
mem.reserve_ptr(layer2, pFuture2, input_size);
mem.bind_ptr(layer3, pFuture3, pFuture2, 0, input_size);
mem.commit(isCompact);
ASSERT_EQ(mem.getRWBytes(), input_size);
ASSERT_EQ(mem.getTotalBytes(), input_size);
}
TEST_F(GNAMemoryCompactTest, canOptimizePushLocalPtrAndReservePtr) {
IE_SUPPRESS_DEPRECATED_START
CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32));
CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32));
CNNLayerPtr layer3 = std::make_shared<CNNLayer>(LayerParams("layer3", "test", Precision::FP32));
layer1->userValue.v_int = 1;
layer2->userValue.v_int = 2;
layer3->userValue.v_int = 3;
IE_SUPPRESS_DEPRECATED_END
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
float* pFuture3 = reinterpret_cast<float*>(&pFuture3);
size_t input_size;
{
std::vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f};
input_size = input.size() * sizeof(float);
mem.push_local_ptr(layer1, pFuture1, &*input.begin(), input_size);
}
mem.reserve_ptr(layer2, pFuture2, input_size);
mem.bind_ptr(layer3, pFuture3, pFuture2, 0, input_size);
mem.commit(isCompact);
ASSERT_EQ(mem.getRWBytes(), input_size);
ASSERT_EQ(mem.getTotalBytes(), input_size);
}
TEST_F(GNAMemoryCompactTest, canOptimizePushInitilizerPtrAndReservePtr) {
IE_SUPPRESS_DEPRECATED_START
CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32));
CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32));
CNNLayerPtr layer3 = std::make_shared<CNNLayer>(LayerParams("layer3", "test", Precision::FP32));
layer1->userValue.v_int = 1;
layer2->userValue.v_int = 2;
layer3->userValue.v_int = 3;
IE_SUPPRESS_DEPRECATED_END
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
float* pFuture3 = reinterpret_cast<float*>(&pFuture3);
size_t input_size;
{
std::vector<float> input = {1.0f, 2.0f, 3.0f};
input_size = input.size() * sizeof(float);
mem.push_initializer(layer1, pFuture1, input_size, [=](void* data, size_t size){
ie_memcpy(data, size, &input[0], input_size);  // copy bytes, not element count
});
}
mem.reserve_ptr(layer2, pFuture2, 2 * input_size);
mem.bind_ptr(layer3, pFuture3, pFuture2, 0, input_size);
mem.commit(isCompact);
ASSERT_EQ(mem.getRWBytes(), 2 * input_size);
ASSERT_EQ(mem.getTotalBytes(), 2 * input_size);
}
TEST_F(GNAMemoryCompactTest, canOptimizeBindInitilizerPtrAndReservePtr) {
IE_SUPPRESS_DEPRECATED_START
CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32));
CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32));
CNNLayerPtr layer3 = std::make_shared<CNNLayer>(LayerParams("layer3", "test", Precision::FP32));
CNNLayerPtr layer4 = std::make_shared<CNNLayer>(LayerParams("layer4", "test", Precision::FP32));
layer1->userValue.v_int = 1;
layer2->userValue.v_int = 2;
layer3->userValue.v_int = 3;
layer4->userValue.v_int = 4;
IE_SUPPRESS_DEPRECATED_END
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
float* pFuture3 = reinterpret_cast<float*>(&pFuture3);
float* pFuture4 = reinterpret_cast<float*>(&pFuture4);
{
std::vector<float> input = {1.0f, 2.0f, 3.0f};
mem.bind_initializer(layer2, pFuture1, [=](void* data, size_t size){
ie_memcpy(data, size, &input[0], input.size() * sizeof(float));  // copy bytes, not element count
});
}
mem.reserve_ptr(layer1, pFuture1, 4 * sizeof(float));
mem.reserve_ptr(layer3, pFuture3, 2 * sizeof(float));
mem.bind_ptr(layer4, pFuture4, pFuture3, 0, 2 * sizeof(float));
mem.commit(isCompact);
ASSERT_EQ(mem.getRWBytes(), 4 * sizeof(float));
ASSERT_EQ(mem.getTotalBytes(), 4 * sizeof(float));
}
TEST_F(GNAMemoryCompactTest, canOptimizeReservePtrWithOffset) {
IE_SUPPRESS_DEPRECATED_START
CNNLayerPtr layer1 = std::make_shared<CNNLayer>(LayerParams("layer1", "test", Precision::FP32));
CNNLayerPtr layer2 = std::make_shared<CNNLayer>(LayerParams("layer2", "test", Precision::FP32));
CNNLayerPtr layer3 = std::make_shared<CNNLayer>(LayerParams("layer3", "test", Precision::FP32));
layer1->userValue.v_int = 1;
layer2->userValue.v_int = 2;
layer3->userValue.v_int = 3;
IE_SUPPRESS_DEPRECATED_END
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
float* pFuture3 = reinterpret_cast<float*>(&pFuture3);
mem.reserve_ptr(layer1, pFuture1, 2 * sizeof(float));
mem.reserve_ptr(layer2, pFuture2, 2 * sizeof(float));
mem.bind_ptr(layer3, pFuture3, pFuture2, 2 * sizeof(float), 2 * sizeof(float));
mem.commit(isCompact);
ASSERT_EQ(mem.getRWBytes(), 4 * sizeof(float));
ASSERT_EQ(mem.getTotalBytes(), 4 * sizeof(float));
}

View File

@ -9,7 +9,6 @@
using namespace GNAPluginNS::memory;
class GNAMemoryTest : public ::testing::Test {
protected:
GNAMemory<std::allocator<uint8_t>> mem;
@ -17,12 +16,12 @@ class GNAMemoryTest : public ::testing::Test {
}
};
TEST_F(GNAMemoryTest, canStoreActualBlob){
float input [] = {1,2,3};
TEST_F(GNAMemoryTest, canStoreActualBlob) {
float input[] = {1, 2, 3};
float* pFuture = nullptr;
size_t len = sizeof(input);
mem.push_ptr(&pFuture, input, len);
mem.push_ptr(nullptr, &pFuture, input, len);
mem.commit();
ASSERT_NE(pFuture, nullptr);
@ -33,12 +32,12 @@ TEST_F(GNAMemoryTest, canStoreActualBlob){
}
TEST_F(GNAMemoryTest, canStore2Blobs) {
float input [] = {1,2,3,4};
float input[] = {1, 2, 3, 4};
float* pFuture = nullptr;
float* pFuture2 = nullptr;
mem.push_ptr(&pFuture, input, 3*4);
mem.push_ptr(&pFuture2, input+1, 3*4);
mem.push_ptr(nullptr, &pFuture, input, 3*4);
mem.push_ptr(nullptr, &pFuture2, input+1, 3*4);
mem.commit();
ASSERT_NE(pFuture, input);
@ -54,10 +53,10 @@ TEST_F(GNAMemoryTest, canStore2Blobs) {
}
TEST_F(GNAMemoryTest, canStoreBlobsALIGNED) {
float input [] = {1,2,3,4,5,6,7,8};
float input[] = {1, 2, 3, 4, 5, 6, 7, 8};
float* pFuture = nullptr;
mem.push_ptr(&pFuture, input, 3*4, 8);
mem.push_ptr(nullptr, &pFuture, input, 3*4, 8);
mem.commit();
ASSERT_EQ(16 , mem.getTotalBytes());
@ -73,12 +72,12 @@ TEST_F(GNAMemoryTest, canStoreBlobsALIGNED) {
}
TEST_F(GNAMemoryTest, canStore2BlobsALIGNED) {
float input [] = {1,2,3,4,5,6,7,8};
float input[] = {1, 2, 3, 4, 5, 6, 7, 8};
float* pFuture = nullptr;
float* pFuture2 = nullptr;
mem.push_ptr(&pFuture, input, 3*4, 8);
mem.push_ptr(&pFuture2, input, 3*4, 16);
mem.push_ptr(nullptr, &pFuture, input, 3*4, 8);
mem.push_ptr(nullptr, &pFuture2, input, 3*4, 16);
mem.commit();
ASSERT_EQ(32 , mem.getTotalBytes());
@ -92,33 +91,30 @@ TEST_F(GNAMemoryTest, canStore2BlobsALIGNED) {
ASSERT_EQ(pFuture[4], 1);
ASSERT_EQ(pFuture[5], 2);
ASSERT_EQ(pFuture[6], 3);
}
TEST_F(GNAMemoryTest, canReserveData) {
float* pFuture = nullptr;
mem.reserve_ptr(&pFuture, 3*4);
mem.reserve_ptr(nullptr, &pFuture, 3*4);
mem.commit();
ASSERT_NE(pFuture, nullptr);
}
TEST_F(GNAMemoryTest, canReserveDataByVoid) {
mem.reserve_ptr(nullptr, 3*4);
mem.reserve_ptr(nullptr, nullptr, 3*4);
ASSERT_NO_THROW(mem.commit());
}
TEST_F(GNAMemoryTest, canReserveAndPushData) {
float input[] = {1, 2, 3};
float *pFuture = nullptr;
float* pFuture2 = nullptr;
size_t len = sizeof(input) ;
size_t len = sizeof(input);
mem.push_ptr(&pFuture, input, len);
mem.reserve_ptr(&pFuture2, 3*4);
mem.push_ptr(nullptr, &pFuture, input, len);
mem.reserve_ptr(nullptr, &pFuture2, 3*4);
mem.commit();
ASSERT_NE(pFuture, nullptr);
@ -136,16 +132,15 @@ TEST_F(GNAMemoryTest, canReserveAndPushData) {
}
TEST_F(GNAMemoryTest, canBindAndResolve) {
float input[] = {1, 2, 3};
float *pFuture = nullptr;
float *pFuture2 = nullptr;
float *pFuture3 = nullptr;
size_t len = sizeof(input);
mem.bind_ptr(&pFuture3, &pFuture);
mem.push_ptr(&pFuture, input, len);
mem.bind_ptr(&pFuture2, &pFuture);
mem.bind_ptr(nullptr, &pFuture3, &pFuture);
mem.push_ptr(nullptr, &pFuture, input, len);
mem.bind_ptr(nullptr, &pFuture2, &pFuture);
mem.commit();
@ -160,16 +155,15 @@ TEST_F(GNAMemoryTest, canBindAndResolve) {
}
TEST_F(GNAMemoryTest, canBindTransitevlyAndResolve) {
float input[] = {1, 2, 3};
float *pFuture = nullptr;
float *pFuture3 = nullptr;
float *pFuture4 = nullptr;
size_t len = sizeof(input);
mem.bind_ptr(&pFuture4, &pFuture3);
mem.bind_ptr(&pFuture3, &pFuture);
mem.push_ptr(&pFuture, input, len);
mem.bind_ptr(nullptr, &pFuture4, &pFuture3);
mem.bind_ptr(nullptr, &pFuture3, &pFuture);
mem.push_ptr(nullptr, &pFuture, input, len);
mem.commit();
@ -185,16 +179,15 @@ TEST_F(GNAMemoryTest, canBindTransitevlyAndResolve) {
}
TEST_F(GNAMemoryTest, canBindTransitevlyWithOffsetsAndResolve) {
float input[] = {1, 2, 3};
float *pFuture = nullptr;
float *pFuture3 = nullptr;
float *pFuture4 = nullptr;
size_t len = sizeof(input);
mem.bind_ptr(&pFuture4, &pFuture3, 4);
mem.bind_ptr(&pFuture3, &pFuture, 4);
mem.push_ptr(&pFuture, input, len);
mem.bind_ptr(nullptr, &pFuture4, &pFuture3, 4);
mem.bind_ptr(nullptr, &pFuture3, &pFuture, 4);
mem.push_ptr(nullptr, &pFuture, input, len);
mem.commit();
@ -210,16 +203,15 @@ TEST_F(GNAMemoryTest, canBindTransitevlyWithOffsetsAndResolve) {
}
TEST_F(GNAMemoryTest, canBindWithOffsetAndResolve) {
float input[] = {1, 2, 3};
float *pFuture = nullptr;
float *pFuture2 = nullptr;
float *pFuture3 = nullptr;
size_t len = sizeof(input);
mem.bind_ptr(&pFuture3, &pFuture, 4);
mem.push_ptr(&pFuture, input, len);
mem.bind_ptr(&pFuture2, &pFuture);
mem.bind_ptr(nullptr, &pFuture3, &pFuture, 4);
mem.push_ptr(nullptr, &pFuture, input, len);
mem.bind_ptr(nullptr, &pFuture2, &pFuture);
mem.commit();
@ -237,12 +229,11 @@ TEST_F(GNAMemoryTest, canBindWithOffsetAndResolve) {
TEST_F(GNAMemoryTest, canPushLocal) {
float* pFuture = (float*)&pFuture;
float* pFuture = reinterpret_cast<float*>(&pFuture);
{
std::vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f};
mem.push_local_ptr(pFuture, &*input.begin(), 4 * 4, 1);
mem.push_local_ptr(nullptr, pFuture, &*input.begin(), 4 * 4, 1);
}
//poison stack
@ -255,13 +246,12 @@ TEST_F(GNAMemoryTest, canPushLocal) {
}
TEST_F(GNAMemoryTest, canPushValue) {
float* pFuture = (float*)&pFuture;
float* pFuture2 = (float*)&pFuture2;
float* pFuture = reinterpret_cast<float*>(&pFuture);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
{
mem.push_value(pFuture, 3.f, 2);
mem.push_value(pFuture2, 13.f, 2);
mem.push_value(nullptr, pFuture, 3.f, 2);
mem.push_value(nullptr, pFuture2, 13.f, 2);
}
mem.commit();
@ -273,13 +263,12 @@ TEST_F(GNAMemoryTest, canPushValue) {
}
TEST_F(GNAMemoryTest, canPushReadOnlyValue) {
float* pFuture = (float*)&pFuture;
float* pFuture2 = (float*)&pFuture2;
float* pFuture = reinterpret_cast<float*>(&pFuture);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
{
mem.push_value(pFuture, 3.f, 2);
mem.readonly().push_value(pFuture2, 13.f, 2);
mem.push_value(nullptr, pFuture, 3.f, 2);
mem.readonly().push_value(nullptr, pFuture2, 13.f, 2);
}
mem.commit();
@ -290,10 +279,37 @@ TEST_F(GNAMemoryTest, canPushReadOnlyValue) {
ASSERT_FLOAT_EQ(pFuture[3], 13);
}
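
The assertion on pFuture[3] encodes the pool layout: commit() places the read-write section first and the read-only section directly behind it, so RO values land right after the RW floats. In miniature:

#include <vector>

int main() {
    // mem.push_value(nullptr, pFuture, 3.f, 2)              -> RW section
    // mem.readonly().push_value(nullptr, pFuture2, 13.f, 2) -> RO section
    const std::vector<float> rw = {3.f, 3.f};
    const std::vector<float> ro = {13.f, 13.f};
    std::vector<float> pool;
    pool.insert(pool.end(), rw.begin(), rw.end());
    pool.insert(pool.end(), ro.begin(), ro.end());
    // Matches ASSERT_FLOAT_EQ(pFuture[3], 13): index 3 is the second RO value.
    return pool[3] == 13.f ? 0 : 1;
}
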
TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSize) {
TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSizeEmptyReqs) {
mem.push_value(nullptr, nullptr, 3.f, 2);
mem.readonly().push_value(nullptr, nullptr, 13.f, 2);
mem.commit();
mem.push_value(nullptr, 3.f, 2);
mem.readonly().push_value(nullptr, 13.f, 2);
ASSERT_EQ(mem.getTotalBytes(), 0);
ASSERT_EQ(mem.getRWBytes(), 0);
}
TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSizeWithEmptyReqs) {
// empty request before
mem.push_value(nullptr, nullptr, 3.f, 2);
// not empty requests
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
mem.push_value(nullptr, pFuture1, 3.f, 2);
mem.readonly().push_value(nullptr, pFuture2, 13.f, 2);
// empty request after
mem.readonly().push_value(nullptr, nullptr, 13.f, 2);
mem.commit();
ASSERT_EQ(mem.getTotalBytes(), 4 * sizeof(float));
ASSERT_EQ(mem.getRWBytes(), 2 * sizeof(float));
}
TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSize) {
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
mem.push_value(nullptr, pFuture1, 3.f, 2);
mem.readonly().push_value(nullptr, pFuture2, 13.f, 2);
mem.commit();
ASSERT_EQ(mem.getTotalBytes(), 4 * sizeof(float));
@ -301,11 +317,12 @@ TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSize) {
}
TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSizeWithAlignment) {
GNAMemory<std::allocator<uint8_t>> memAligned(64);
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
memAligned.push_value(nullptr, 3.f, 2);
memAligned.readonly().push_value(nullptr, 13.f, 2);
memAligned.push_value(nullptr, pFuture1, 3.f, 2);
memAligned.readonly().push_value(nullptr, pFuture2, 13.f, 2);
memAligned.commit();
ASSERT_EQ(memAligned.getTotalBytes(), 128);
@ -313,15 +330,13 @@ TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSizeWithAlignment) {
}
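
The 128-byte expectation follows from per-section rounding: with a 64-byte pool alignment, each two-float section (8 bytes) is padded up to 64 bytes. A one-line helper, assumed to mirror whatever alignment macro the plugin uses internally:

#include <cstddef>

// Round a section size up to the pool alignment.
constexpr size_t alignUp(size_t bytes, size_t alignment) {
    return ((bytes + alignment - 1) / alignment) * alignment;
}

static_assert(alignUp(2 * sizeof(float), 64) == 64, "one padded section");
static_assert(alignUp(2 * sizeof(float), 64) * 2 == 128, "RW + RO total");

int main() { return 0; }
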
TEST_F(GNAMemoryTest, canSetUpReadWriteSectionPtr) {
float* pFuture1 = reinterpret_cast<float*>(&pFuture1);
float* pFuture2 = reinterpret_cast<float*>(&pFuture2);
float* pFuture3 = reinterpret_cast<float*>(&pFuture3);
float* pFuture2 = (float*)&pFuture2;
float* pFuture1 = (float*)&pFuture1;
float* pFuture3 = (float*)&pFuture3;
mem.readonly().push_value(pFuture1, 3.f, 2);
mem.push_value(pFuture2, 13.f, 3);
mem.readonly().push_value(pFuture3, 32.f, 4);
mem.readonly().push_value(nullptr, pFuture1, 3.f, 2);
mem.push_value(nullptr, pFuture2, 13.f, 3);
mem.readonly().push_value(nullptr, pFuture3, 32.f, 4);
mem.commit();
ASSERT_EQ(mem.getTotalBytes(), (2+3+4) * sizeof(float));
@ -346,16 +361,15 @@ TEST_F(GNAMemoryTest, canSetUpReadWriteSectionPtr) {
TEST_F(GNAMemoryTest, canUpdateSizeOfPushRequestWithBindRequest) {
float input[] = {1, 2, 3};
float *pFuture = nullptr;
float *pFuture2 = nullptr;
float *pFuture3 = nullptr;
size_t len = sizeof(input);
mem.push_ptr(&pFuture, input, len);
mem.bind_ptr(&pFuture2, &pFuture, len, len);
mem.bind_ptr(&pFuture3, &pFuture2, 2 * len, len);
mem.push_ptr(nullptr, &pFuture, input, len);
mem.bind_ptr(nullptr, &pFuture2, &pFuture, len, len);
mem.bind_ptr(nullptr, &pFuture3, &pFuture2, 2 * len, len);
mem.commit();
@ -385,9 +399,9 @@ TEST_F(GNAMemoryTest, canUpdateSizeOfPushRequestWithBindRequestWhenPush) {
size_t len = sizeof(input);
mem.push_ptr(&pFuture, input, len);
mem.bind_ptr(&pFuture2, &pFuture, len, len);
mem.push_ptr(&pFutureInput2, input2, len);
mem.push_ptr(nullptr, &pFuture, input, len);
mem.bind_ptr(nullptr, &pFuture2, &pFuture, len, len);
mem.push_ptr(nullptr, &pFutureInput2, input2, len);
mem.commit();
@ -416,9 +430,9 @@ TEST_F(GNAMemoryTest, canUpdateSizeOfPushRequestWithBindRequestWhenAlloc) {
size_t len = sizeof(input);
mem.reserve_ptr(&pFuture, len);
mem.bind_ptr(&pFuture2, &pFuture, len, len);
mem.push_ptr(&pFutureInput, input, len);
mem.reserve_ptr(nullptr, &pFuture, len);
mem.bind_ptr(nullptr, &pFuture2, &pFuture, len, len);
mem.push_ptr(nullptr, &pFutureInput, input, len);
mem.commit();
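
Every test in this file leans on the same contract: the caller hands over the address of a null (or self-pointing) pointer, and commit() patches it once the pool exists, copying any pushed payload into place. A self-contained mock of that contract, hypothetical names only, not the GNAMemory implementation:

#include <cstring>
#include <vector>

class FuturePool {
public:
    // Remember where to write the resolved address; copy src on commit.
    void push_ptr(void** dest, const void* src, size_t bytes) {
        reqs_.push_back({dest, src, bytes, 0});
    }
    void reserve_ptr(void** dest, size_t bytes) {
        reqs_.push_back({dest, nullptr, bytes, 0});
    }
    void commit() {
        size_t total = 0;
        for (auto& r : reqs_) { r.offset = total; total += r.bytes; }
        pool_.resize(total);
        for (auto& r : reqs_) {
            if (r.src) std::memcpy(pool_.data() + r.offset, r.src, r.bytes);
            if (r.dest) *r.dest = pool_.data() + r.offset;  // resolve future
        }
    }
private:
    struct Req { void** dest; const void* src; size_t bytes; size_t offset; };
    std::vector<Req> reqs_;
    std::vector<unsigned char> pool_;
};

int main() {
    float input[] = {1, 2, 3};
    float* pFuture = nullptr;
    FuturePool mem;
    // The cast mirrors how the tests pass &pFuture into the request queue.
    mem.push_ptr(reinterpret_cast<void**>(&pFuture), input, sizeof(input));
    mem.commit();
    return (pFuture != nullptr && pFuture[2] == 3.f) ? 0 : 1;
}
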

View File

@ -19,7 +19,7 @@ const std::map<std::string, std::string> supportedConfigKeysWithDefaults = {
{GNA_CONFIG_KEY(EXEC_TARGET), ""},
{GNA_CONFIG_KEY(COMPILE_TARGET), ""},
{GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW_EXACT},
{GNA_CONFIG_KEY(COMPACT_MODE), CONFIG_VALUE(NO)},
{GNA_CONFIG_KEY(COMPACT_MODE), CONFIG_VALUE(YES)},
{CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(NO)},
{GNA_CONFIG_KEY(PRECISION), Precision(Precision::I16).name()},
{GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), CONFIG_VALUE(NO)},
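
With the default flipped to YES, applications that depended on the old, non-overlapping layout must now opt out explicitly. A sketch of how that looks, assuming the Inference Engine 2021.x Core API; "model.xml" is a placeholder path:

#include <gna/gna_config.hpp>
#include <ie_core.hpp>

int main() {
    InferenceEngine::Core ie;
    auto network = ie.ReadNetwork("model.xml");  // placeholder model
    // Compact mode is now on by default; disable it per network if needed.
    auto exeNetwork = ie.LoadNetwork(network, "GNA",
        {{GNA_CONFIG_KEY(COMPACT_MODE), CONFIG_VALUE(NO)}});
    return 0;
}
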

View File

@ -8,6 +8,7 @@
*/
#pragma once
#include <ie_common.h>
#include <stdint.h>
#include <algorithm>