openvino/inference-engine/src/gna_plugin/gna_plugin.cpp

// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#define NOMINMAX

#include <cstdlib>
#include <iostream>
#include <vector>
#include <cstring>
#include <list>
#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <memory>
#include <utility>
#include <limits>

#include <legacy/graph_tools.hpp>
#include <cpp_interfaces/exception2status.hpp>
#include <legacy/net_pass.h>
#include <debug.h>
#include <gna/gna_config.hpp>
#include "gna_plugin_config.hpp"
#include <legacy/ie_util_internal.hpp>
#include "gna_plugin.hpp"
#include "optimizer/gna_pass_manager.hpp"
#include "layers/gna_layer_type.hpp"
#include "preprocessing.hpp"
#include "frontend/weights_converter.hpp"
#include "frontend/model_quantizer.hpp"
#include "gna_fused_iterator.hpp"
#include "backend/am_intel_dnn.hpp"
#include "memory/gna_allocator.hpp"
#include "memory/gna_memory_state.hpp"
#include "gna_model_serial.hpp"
#include "runtime/gna_float_runtime.hpp"

#if GNA_LIB_VER == 2
#include <gna2-model-api.h>

uint32_t ToByteSize(const Gna2DataType type) {
    switch (type) {
    case Gna2DataTypeInt8:
    case Gna2DataTypeUint8:
        return 1;
    case Gna2DataTypeInt16:
    case Gna2DataTypeUint16:
        return 2;
    case Gna2DataTypeInt32:
    case Gna2DataTypeUint32:
        return 4;
    case Gna2DataTypeInt64:
    case Gna2DataTypeUint64:
        return 8;
    default:
        return 0;
    }
}

constexpr uint32_t GNAPluginNS::GNAPlugin::FAKE_REQUEST_CONFIG_ID;
#endif
using namespace InferenceEngine;
using namespace std;
using namespace GNAPluginNS;
using namespace InferenceEngine::details;

#ifdef __clang__
namespace InferenceEngine {
    template<>
    InferenceEngine::TBlob<intel_compound_bias_t, std::enable_if<true, void> >::~TBlob() { free(); }
}
#endif  // __clang__

template <typename T, typename U>
void GNAPlugin::copyInputData(T *dst,
                const U *src,
                uint32_t num_frames,
                uint32_t num_group,
                uint32_t num_vector_elements,
                uint32_t num_vector_stride,
                intel_dnn_orientation_t orientation,
                float scaleFactor) {
    if (!dst || !src) {
        return;
    }
    if (orientation == kDnnInterleavedOrientation) {
        for (uint32_t i = 0; i < num_frames; i++) {
            for (uint32_t j = 0; j < num_vector_elements; j++) {
                if (!std::is_same<T, U>::value) {
                    dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * scaleFactor);
                } else {
                    dst[j * num_group + i] = src[i * num_vector_elements + j];
                }
            }
            // pad to meet weight matrix row length requirement
            for (uint32_t j = num_vector_elements; j < num_vector_stride; j++) {
                dst[j * num_group + i] = 0;
            }
        }
        // pad partial group
        for (uint32_t i = num_frames; i < num_group; i++) {
            for (uint32_t j = 0; j < num_vector_stride; j++) {
                dst[j * num_group + i] = 0;
            }
        }
    } else {
        if (!std::is_same<T, U>::value) {
            for (uint32_t i = 0; i < num_frames; i++) {
                T *ptr_dst_vec = reinterpret_cast<T *>(dst) + i * num_vector_stride;
                const U *ptr_src_vec = reinterpret_cast<const U *>(src) + i * num_vector_elements;
                std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
                for (int j=0; j < num_vector_elements; j++) {
                    ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * scaleFactor);
                }
            }

        } else {
            for (uint32_t i = 0; i < num_frames; i++) {
                void *ptr_dst_vec = reinterpret_cast<uint8_t *>(dst) + i * num_vector_stride * sizeof(T);
                const void *ptr_src_vec = reinterpret_cast<const uint8_t *>(src) + i * num_vector_elements * sizeof(U);
                std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
                ie_memcpy(ptr_dst_vec, num_vector_elements * sizeof(T),
                    ptr_src_vec, num_vector_elements * sizeof(T));
            }
        }

        for (uint32_t i = num_frames; i < num_group; i++) {
            void *ptr_dst_vec = reinterpret_cast<uint8_t *>(dst) + i * num_vector_stride * sizeof(T);
            std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
        }
    }
}

template <typename T, typename U>
void GNAPlugin::copyInputDataWithSplit(T *const dst,
                const U *src,
                const GNASplitLayer& splitInfo,
                size_t precision_size,
                int idx) {
    if (!dst || !src) {
        return;
    }
    T *dst_ptr = dst;
    const U *src_ptr = src;
    precision_size = sizeof(T);
    // we found split/slice layer connected to Input
    for (auto&& outputLayer : splitInfo.splitOutputLayers) {
        uint32_t begin = outputLayer.offset / precision_size;
        uint32_t end = (outputLayer.offset + outputLayer.pure_size)/precision_size;
        if (dst_ptr - dst >= end) {
            // output layer with bind pointer as previous one. Skip
            continue;
        }
        for (uint32_t i = begin; i < end; ++i) {
            if (!std::is_same<T, U>::value) {
                *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * inputsDesc->getScaleFactor(idx));
            } else {
                *(dst_ptr++) = *(src_ptr++);
            }
        }
        begin = end;
        end = (outputLayer.offset + ALIGN64(outputLayer.pure_size))/precision_size;
        std::memset(dst_ptr, 0, (end - begin )* sizeof(uint16_t));
        dst_ptr += end - begin;
    }
}

void GNAPlugin::ExportScores(void *ptr_dst,
                  const void *ptr_src,
                  intel_dnn_orientation_t orientation,
                  uint32_t num_frames,
                  uint32_t num_group,
                  uint32_t num_vector_elements,
                  uint32_t num_active_elements,
                  uint32_t num_vector_stride,
                  uint32_t num_bytes_per_element_input,
                  uint32_t num_bytes_per_element) {
    // source scores are possibly padded to multiple of 8 and possibly interleaved
    // rotate if necessary and only copy actual scores (not padding)
    if (orientation == kDnnInterleavedOrientation) {
        if (num_bytes_per_element == 2) {
            int16_t *dst = reinterpret_cast<int16_t *>(ptr_dst);
            const int16_t *src = reinterpret_cast<const int16_t *>(ptr_src);
            for (uint32_t i = 0; i < num_frames; i++) {
                for (uint32_t j = 0; j < num_active_elements; j++) {
                    dst[i * num_vector_elements + j] = src[j * num_group + i];
                }
                for (uint32_t j = num_active_elements; j < num_vector_elements; j++) {
                    dst[i * num_vector_elements + j] = 0;
                }
            }
        } else if (num_bytes_per_element == 4) {  // should work for both int and float
            int32_t *dst = reinterpret_cast<int32_t *>(ptr_dst);
            const int8_t *src = reinterpret_cast<const int8_t*>(ptr_src);
            for (uint32_t i = 0; i < num_frames; i++) {
                for (uint32_t j = 0; j < num_active_elements; j++) {
                    auto input_ptr = src + (j * num_group + i) * num_bytes_per_element_input;
                    auto dst_ptr = dst + (i * num_vector_elements + j);

                    switch (num_bytes_per_element_input) {
                        case 2 : {
                            *dst_ptr  = static_cast<int32_t>(*reinterpret_cast<const int16_t*>(input_ptr));
                            break;
                        }
                        case 4 : {
                            *dst_ptr = *reinterpret_cast<const int32_t *>(input_ptr);
                            break;
                        }
                        default:
                            THROW_GNA_EXCEPTION << "Unsupported output layer precision: " << num_bytes_per_element_input << "bytes";
                    }
                }
                for (uint32_t j = num_active_elements; j < num_vector_elements; j++) {
                    dst[i * num_vector_elements + j] = 0;
                }
            }
        } else {
            THROW_GNA_EXCEPTION << "Unsupported target precision for infer : " << num_bytes_per_element << "bytes";
        }
    } else {
        if (num_bytes_per_element == 2) {
            for (uint32_t i = 0; i < num_frames; i++) {
                auto ptr_dst_vec = reinterpret_cast<uint8_t *>(ptr_dst) + i * num_vector_elements * sizeof(int16_t);
                auto ptr_src_vec = reinterpret_cast<const uint8_t *>(ptr_src) + i * num_vector_stride * sizeof(int16_t);
                memset(ptr_dst_vec, 0, num_vector_elements * sizeof(int16_t));
                ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(int16_t),
                    ptr_src_vec, num_active_elements * sizeof(int16_t));
            }
        } else if (num_bytes_per_element == 4) {  // should work for both int and float
            for (uint32_t i = 0; i < num_frames; i++) {
                void *ptr_dst_vec = reinterpret_cast<uint8_t *>(ptr_dst) + i * num_vector_elements * sizeof(float);
                const void *ptr_src_vec = reinterpret_cast<const uint8_t *>(ptr_src) + i * num_vector_stride * sizeof(float);
                memset(ptr_dst_vec, 0, num_vector_elements * sizeof(float));
                ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(float),
                    ptr_src_vec, num_active_elements * sizeof(float));
            }
        } else {
            THROW_GNA_EXCEPTION << "Unsupported target precision for infer : " << num_bytes_per_element << "bytes";
        }
    }
}

void GNAPlugin::ImportFrames(
                  void *ptr_dst,
                  const void *ptr_src,
                  Precision input_precision,
                  float scaleFactor,
                  intel_dnn_orientation_t orientation,
                  uint32_t num_frames,
                  uint32_t num_group,
                  uint32_t num_vector_elements,
                  uint32_t num_vector_stride) {
    if (orientation == kDnnInterleavedOrientation) {
        // TODO : fix that as well
        if (input_precision == Precision::U8) {
            auto src = reinterpret_cast<const uint8_t *>(ptr_src);
            auto dst = reinterpret_cast<int16_t *>(ptr_dst);
            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
        } else if (input_precision.size() == 2) {
            auto dst = reinterpret_cast<int16_t *>(ptr_dst);
            auto src = reinterpret_cast<const int16_t *>(ptr_src);
            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
        } else if (input_precision.size() == 4) {
            if (!gnadevice) {
                auto dst = reinterpret_cast<float *>(ptr_dst);
                auto src = reinterpret_cast<const float *>(ptr_src);
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
            } else {
                auto dst = reinterpret_cast<int16_t *>(ptr_dst);
                auto src = reinterpret_cast<const float *>(ptr_src);
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
            }
        }
    } else {
        if (input_precision == Precision::U8) {
            auto src = reinterpret_cast<const uint8_t *>(ptr_src);
            if (!gnadevice) {
                auto dst = reinterpret_cast<float *>(ptr_dst);
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
            } else {
                auto dst = reinterpret_cast<int16_t *>(ptr_dst);
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
            }

        } else if (input_precision.size()== 2) {
            auto dst = reinterpret_cast<int16_t *>(ptr_dst);
            auto src = reinterpret_cast<const int16_t *>(ptr_src);
            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
        } else if (input_precision.size() == 4) {
            if (!gnadevice) {
                auto dst = reinterpret_cast<float *>(ptr_dst);
                auto src = reinterpret_cast<const float *>(ptr_src);
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
            } else {
                auto dst = reinterpret_cast<uint16_t *>(ptr_dst);
                auto src = reinterpret_cast<const float *>(ptr_src);
                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
            }
        }
    }
}

GNAPlugin::GNAPlugin() {
    Init();
    UpdateFieldsFromConfig();
}

GNAPlugin::GNAPlugin(const std::map<std::string, std::string>& configMap) {
    Init();
    SetConfig(configMap);
}

void GNAPlugin::Init() {
    dnn = std::make_shared<backend::AMIntelDNN>(backend::AMIntelDNN());
    inputsDesc = std::make_shared<GNAPluginNS::InputDesc>(GNAPluginNS::InputDesc());
    gnaFlags = std::make_shared<GNAPluginNS::GNAFlags>(GNAPluginNS::GNAFlags());

    graphCompiler.setDNNPtr(dnn);
    graphCompiler.setInputDescPtr(inputsDesc);
    graphCompiler.setGNAFlagsPtr(gnaFlags);
}

void GNAPlugin::InitGNADevice() {
#if GNA_LIB_VER == 1
    gnadevice = std::make_shared<GNADeviceHelper>(gnaFlags->gna_lib_async_threads_num,
                                                  gnaFlags->gna_openmp_multithreading,
                                                  gnaFlags->performance_counting);
#else
    gnadevice = std::make_shared<GNADeviceHelper>(config.pluginGna2DeviceConsistent,
                gnaFlags->gna_lib_async_threads_num,
                gnaFlags->gna_openmp_multithreading,
                gnaFlags->performance_counting);
#endif
    size_t page_size_bytes = 4096;
    gnamem = std::make_shared<gna_memory_type>(memory::make_polymorph<memory::GNAAllocator>(gnadevice), page_size_bytes);
    graphCompiler.setGNAMemoryPtr(gnamem);
}

void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
    std::shared_ptr<InferenceEngine::details::CNNNetworkImpl> convertedNetwork;
    if (_network.getFunction()) {
        convertedNetwork = std::make_shared<InferenceEngine::details::CNNNetworkImpl>(_network);
    }
    InferenceEngine::ICNNNetwork &network = convertedNetwork ? *convertedNetwork : _network;

    NetPass::ConvertPrecision(network, Precision::I64, Precision::I32);
    NetPass::ConvertPrecision(network, Precision::U64, Precision::I32);
    NetPass::ConvertPrecision(network, Precision::U32, Precision::I32);

    //  Check the input network
    std::string error;
    if (!AreLayersSupported(network, error)) {
        THROW_GNA_EXCEPTION << error.c_str();
    }

    // network optimisation phases
    int passIdx = 0;
    auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) {
        auto passes = make_shared<PassManager>(PassManagerSettings{policy, runBeforeCopy}, network);
        passes->registerPass<RemoveConstPass>();
        passes->registerPass<UnrollTIPass>();
        passes->registerPass<RemoveConstPass>();
        passes->registerPass<InsertIdentityToLSTMCellPass>();
        passes->registerPass<UnrollLSTMCellPass>();
        passes->registerPass<RemoveSingleInputConcatPass>();

        passes->registerPass<SubstitutePReluPass>();
        passes->registerPass<SubstituteSoftSignPass>();

        passes->registerPass<ReorderMaxPoolPass>();
        passes->registerPass<EltwiseSplitOverChannelsPass>();
        passes->registerPass<InsertSplitAligningFilterPass>();

        passes->registerPass<InsertConcatAligningFilterPass>();
        passes->registerPass<ReorderConcatInputsPass>();
        if (policy.PermutePolicy != Policy::Permute::DISABLED) {
            passes->registerPass<ReversePermutationsPass>();
        }
        if (policy.NHWCToNCHWPolicy != Policy::NHWCToNCHW::DISABLED) {
            passes->registerPass<RemovePermutationsNHWCToNCHWPass>();
        }
        passes->registerPass<InsertIdentityLayerPass>();
        passes->registerPass<InsertCopyLayerPass>();
        passes->registerPass<InsertDiagonalLayerPass>();
        passes->registerPass<HandleMultipleActivationsForTheLayerPass>();
        passes->registerPass<SubstituteScaleShiftBroadCastPass>();
        passes->registerPass<FuseMultipleIdentitiesPass>();
        passes->registerPass<BroadcastConstPass>();
        passIdx = passes->run(passIdx);
    };

    ICNNNetwork::Ptr newNet;
    if (gnaFlags->sw_fp32) {
        auto visitor = [&](InferenceEngine::CNNLayerPtr lp) {
            transformLayer(lp, WeightsConverter());
            return lp;
        };
        newNet = InferenceEngine::CNNNetCopy(network, visitor);
        // to run all passes need to have two calls to pass manager
        run_passes(newNet, true);
        run_passes(newNet, false);
    } else {
        switch (config.gnaPrecision) {
            case Precision::I16:
                ModelQuantizer<QuantI16> q16;
                newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
                break;
            case Precision::I8:
                ModelQuantizer<QuantI8> q8;
                newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
                break;
            default:
                THROW_GNA_EXCEPTION << "no mans land for GNA precision";
                break;
        }
    }

    auto inputLayers = CNNNetGetAllInputLayers(*newNet);

#ifdef PLOT
    std::ofstream file("gna_passes.dot");
    saveGraphToDot(*newNet, file, [](const CNNLayerPtr layer,
                                           ordered_properties &printed_properties,
                                           ordered_properties &node_properties) {
        // printing quantized params
        auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
        if (!quantized) {
            return;
        }
        printed_properties.emplace_back(
            "scale factor", std::to_string(quantized->_dst_quant.scale));
    });
#endif

    auto sortedNet = CNNNetSortTopologicallyEx(*newNet, make_fuzed_order);

    // passing policy to compiler
    graphCompiler.setPolicy(policy);

    if (sortedNet.empty()) {
        THROW_GNA_EXCEPTION << "Sorted network is empty";
    }

    std::vector<CNNLayerPtr> sortedNoMem;
    std::unordered_map<std::string, std::vector<InferenceEngine::CNNLayerPtr>> memoryPairs;
    // find all memory layers pairs and mark which one used as outputs
    for (auto &layer : sortedNet) {
        auto generic = dynamic_cast<GenericLayer *>(layer.get());
        if (generic == nullptr) {
            sortedNoMem.push_back(layer);
            continue;
        }
        LayerInfo layerInfo(layer);
        if (layerInfo.isMemory()) {
            // collect all memory pairs
            auto id = generic->GetParamAsString("id");
            memoryPairs[id].resize(generic->GetParamAsInt("size"));
            memoryPairs[id][generic->GetParamAsInt("index")] = layer;
            continue;
        } else if (layerInfo.isConcat()) {
            graphCompiler.fillConcatConnections(layer);
        } else if (layerInfo.isSplit() || layerInfo.isSlice()) {
            graphCompiler.fillSplitConnections(layer);
        }
        sortedNoMem.push_back(layer);
    }

    // fill in extra storage with memory layers
    graphCompiler.fillMemoryConnections(memoryPairs);

    if (!graphCompiler.memory_connection.empty()) {
        gnaFlags->gna_lib_async_threads_num = 1;
    }

    if (gnaFlags->sw_fp32) {
        gnamem.reset(new gna_memory_type(memory::make_polymorph<std::allocator<uint8_t>>()));
        graphCompiler.setGNAMemoryPtr(gnamem);
    } else {
        InitGNADevice();
    }

    // keep inputs information and create input primitives
    newNet->getInputsInfo(inputsDataMap);
    if (inputsDataMap.empty()) {
        THROW_GNA_EXCEPTION << " No inputs for the topology";
    }

    // keep output dims
    newNet->getOutputsInfo(outputsDataMap);
    if (outputsDataMap.empty()) {
        THROW_GNA_EXCEPTION << "No outputs for the topology";
    }

    for (auto && input : inputsDataMap) {
        inputsDesc->getPtrInputsGlobal(input.first).resize(gnaFlags->gna_lib_async_threads_num);
    }

    // CreatingLayer primitives
    for (auto & layer : sortedNoMem) {
        graphCompiler.CreateLayerPrimitive(layer);
    }
    for (auto& inputLayer : inputLayers) {
        auto layerInfo = LayerInfo(inputLayer);
        if (layerInfo.isInput() && 0 == inputsDesc->bytes_allocated_for_input[inputLayer->name]) {
            graphCompiler.connectOutput(inputLayer, &inputsDesc->getPtrInputsGlobal(inputLayer->name).front(), 0);
        }
    }
    // TODO: graph might be static - should we support that
    if (graphCompiler.dnnComponents.components.empty()) {
        THROW_GNA_EXCEPTION << "No GNA primitives created based on topology. This might indicate trivial topology";
    }

    /// setting-up output layers information
    outputsDesc.resize(outputsDataMap.size());

    auto initOutput = [this]
            (int idx, const intel_dnn_component_t & component, CNNLayerPtr layer) {
        // auto idx = std::distance(outputsDataMap.begin(), outputPort);
        auto & desc = outputsDesc[idx];
        auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

        desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num);
        desc.orientation = component.orientation_out;
        desc.num_bytes_per_element = component.num_bytes_per_output;
        desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
        // TODO: this need to be fixed
        desc.num_elements = component.num_rows_out;

        // binding ptr for first infer request - then others will be setup during relocation
        gnamem->bind_ptr(&desc.ptrs.front(), &component.ptr_outputs);
    };

    int portId = 0;
    for (auto && outPort : outputsDataMap) {
        // gets output layer pointer in original topology not in cloned
        auto outLayer = getCreatorLayer(outPort.second).lock();

        // Memory layers are not dnnComponents hence we need to make switch with identity layer
        if (outLayer->type == "Memory") {
            // traverse memory connection to find corresponding output_memory
            for (auto && memConnection : graphCompiler.memory_connection) {
                if (memConnection.second.getInput()->name == outLayer->name) {
                    // if connection is found, replace memory input layer with memory output layer
                    outLayer = memConnection.second.getOutput();
                    break;
                }
            }
        }

        // searching for outData represented in GNA blob
        // using ufs - upper first search
        gnalog() << "[UFS] searching for : "<< outPort.first << " representation in GNA\n";
        bool stopSearching = false;

        CNNNetDFS(outLayer, [this, &outPort, portId, &stopSearching, &initOutput](CNNLayerPtr layer) {
            auto irLayerAvatar = std::find_if(
                graphCompiler.dnnComponents.components.begin(),
                graphCompiler.dnnComponents.components.end(),
                [&layer](const backend::DnnComponents::storage_type::value_type & value) {
                    return value.name == layer->name;
            });

            gnalog() << "[UFS] from : "<< outPort.first <<" reached: " << layer->name << "\n";

            // probing gna_primitives
            if (irLayerAvatar != graphCompiler.dnnComponents.components.end()) {
                initOutput(portId, irLayerAvatar->dnnComponent, layer);
                stopSearching = true;
            }

            // probing concatInfo
            if (!stopSearching && LayerInfo(layer).isConcat()) {
                auto concatConnection  = graphCompiler.concat_connection.find(layer->name);
                if (concatConnection != graphCompiler.concat_connection.end()) {
                    //initOutput(portId, irLayerAvatar->second, layer);

                    auto &desc = outputsDesc[portId];
                    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);

                    desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num);
                    // TODO: what is orientation for concat
                    desc.orientation = kDnnInterleavedOrientation;
                    desc.num_bytes_per_element = layer->outData.front()->getPrecision().size();
                    desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
                    desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element;

                    // binding ptr for first infer request - then others will be setup during relocation
                    gnamem->bind_ptr(&desc.ptrs.front(), &concatConnection->second.gna_ptr);
                    stopSearching = true;
                }
            }
        }, true, [&stopSearching](InferenceEngine::CNNLayer* from) {
            return make_upstream_order(!stopSearching ? from : nullptr);
        });
        if (!stopSearching) {
            THROW_GNA_EXCEPTION << "unsupported topology: cannot locate " << outPort.first
                                << " after compiling GNA graph";
        }
        portId++;
    }

    // TODO: how active list will work in multioutput case
    // make room for active list
    gnamem->reserve_ptr(nullptr,
        ALIGN64(outputsDesc.front().num_bytes_per_element * outputsDesc.front().num_elements), 64);

    void *pParallelExecutionData  = nullptr;

    // reserving more bytes for intermediate data in parallel case - TODO: this works incorrectly in compact mode at lest
    rwSegmentSize = gnamem->getRWBytes();
    if (gnaFlags->gna_lib_async_threads_num > 1) {
        gnamem->reserve_ptr(&pParallelExecutionData, gnamem->getRWBytes() * (gnaFlags->gna_lib_async_threads_num - 1), 64);
    }

    gnamem->commit();

    dnn->Init(gnamem->getBasePtr(),
             gnamem->getTotalBytes(),
             gnaFlags->sw_fp32 ? kDnnFloat : kDnnInt,
             1);

    // TODO: this copy is unneeded; in fact, we can directly create gna structs from list
    auto execOrder = graphCompiler.dnnComponents.getExecutionOrder();
    dnn->component.insert(dnn->component.begin(), execOrder.begin(), execOrder.end());

    // in fp32 mode last PWL cannot be computed without that
    dnn->InitActiveList(NULL);

#if GNA_LIB_VER == 2
    gnaModels.push_back(std::make_tuple(make_shared<CPPWrapper<Gna2Model>>()));
#else
    nnets.emplace_back(make_shared<CPPWrapper<intel_nnet_type_t>>(), -1, InferenceEngine::BlobMap());
#endif
    if (!gnaFlags->sw_fp32) {
        // number of layer gets calculated inside that InitGNAStruct function
#if GNA_LIB_VER == 2
        dnn->InitGNAStruct(&std::get<0>(gnaModels.front())->obj);
#else
        dnn->InitGNAStruct(&std::get<0>(nnets.front())->obj);
#endif
    }

    // creating same gna RW segment for parallel infer requests
    for (int i = 1; i != gnaFlags->gna_lib_async_threads_num; i++) {
#if GNA_LIB_VER == 2
        gnaModels.push_back(std::make_tuple(make_shared<CPPWrapper<Gna2Model>>()));
        // this can be improved by just copy all structures, but we are too lazy
        dnn->InitGNAStruct(&std::get<0>(gnaModels.back())->obj);
#else
        nnets.emplace_back(make_shared<CPPWrapper<intel_nnet_type_t>>(), -1, InferenceEngine::BlobMap());
        dnn->InitGNAStruct(&std::get<0>(nnets.back())->obj);
#endif
        // relocate rw pointers to new offset
        auto basePtr = reinterpret_cast<uint8_t*>(pParallelExecutionData) + rwSegmentSize * (i - 1);

        auto relocate = [basePtr, this](void *& ptr_out, void * ptr_in) {
            if (ptr_in == nullptr) {
                ptr_out = nullptr;
            } else {
                auto offset = reinterpret_cast<uint8_t *>(ptr_in) - reinterpret_cast<uint8_t *>(gnamem->getBasePtr());
                ptr_out = basePtr + offset;
            }
        };

        for (auto &&input : inputsDesc->ptr_inputs_global_storage) {
            relocate(input[i], input[0]);
        }

        // relocating all output pointers
        for (int j = 0; j < outputsDesc.size(); ++j) {
            relocate(outputsDesc[j].ptrs[i], outputsDesc[j].ptrs[0]);
        }

#if GNA_LIB_VER == 2
        for (int j = 0; j != std::get<0>(gnaModels.front())->obj.NumberOfOperations; j++) {
            auto & gnaOperation = std::get<0>(gnaModels[i])->obj.Operations[j];
            relocate(const_cast<Gna2Tensor*>(gnaOperation.Operands[0])->Data, gnaOperation.Operands[0]->Data);
            relocate(const_cast<Gna2Tensor*>(gnaOperation.Operands[1])->Data, gnaOperation.Operands[1]->Data);
#else
        for (int j = 0; j != std::get<0>(nnets.front())->obj.nLayers; j++) {
            auto & layer = std::get<0>(nnets[i])->obj.pLayers[j];
            relocate(layer.pInputs, layer.pInputs);
            relocate(layer.pOutputs, layer.pOutputs);
            relocate(layer.pOutputsIntermediate, layer.pOutputsIntermediate);
#endif
        }
    }

    // calculating input orientation without memory layers, since their orientation not changed during infer right now
    std::unordered_map<string, std::vector<string>> skippedLayers;

    bool withConv = false;
    for (auto &layer : sortedNet) {
        auto layerInfo = LayerInfo(layer);
        if (layerInfo.isConvolution()) {
            withConv = true;
            break;
        }
    }
    if (withConv) {
        for (auto &inputLayer : sortedNet) {
            if (!LayerInfo(inputLayer).isInput()) {
                continue;
            }
            auto doesntHaveGnaMapping = [this] (CNNLayerPtr l) {
                auto dnnLayer = graphCompiler.dnnComponents.findComponent(l);
                return dnnLayer == nullptr;
            };

            auto nextLayers = CNNNetGetAllNextLayersSkipCertain(inputLayer, -1, doesntHaveGnaMapping);

            for (auto &nextLayer : nextLayers) {
                auto dnnLayer = graphCompiler.dnnComponents.findComponent(nextLayer);
                // non functional layer - skipped by gna
                if (nullptr == dnnLayer) {
                    THROW_GNA_LAYER_EXCEPTION(inputLayer) << " gna mapped layer search connection failed";
                }
                // input orientation might be already initialized, thus verify that it matches
                if (!inputsDesc->orientation_in.count(inputLayer->name)) {
                    inputsDesc->orientation_in[inputLayer->name] = dnnLayer->orientation_in;
                } else {
                    if (inputsDesc->orientation_in[inputLayer->name] != dnnLayer->orientation_in) {
                        THROW_GNA_EXCEPTION << "orientation for input layer: " << inputLayer->name << "cannot be calculated";
                    }
                }
            }
        }
    } else {
        for (auto &inputLayer : inputLayers) {
            inputsDesc->orientation_in[inputLayer->name] = kDnnInterleavedOrientation;
        }
    }

    do_rotate_input = dnn->do_rotate_input;
    num_rotate_rows = dnn->num_rotate_rows;
    num_rotate_columns = dnn->num_rotate_columns;

    DumpXNNToFile();

#ifdef PLOT
    dnn->WriteGraphWizModel("gna-blob.dot");
#endif
#if GNA_LIB_VER == 2
    createRequestConfigsForGnaModels();
#endif
}

#if GNA_LIB_VER == 2
void GNAPlugin::createRequestConfigsForGnaModels() {
    if (!gnadevice) {
        gnaRequestConfigToRequestIdMap.push_back(std::make_tuple(FAKE_REQUEST_CONFIG_ID, -1, InferenceEngine::BlobMap()));
        return;
    }
    for (auto& model : gnaModels) {
        const auto& gnaNnet = std::get<0>(model).get()->obj;
        const auto modelId = gnadevice->createModel(gnaNnet);
        const auto requestConfigId = gnadevice->createRequestConfig(modelId);
        gnaRequestConfigToRequestIdMap.push_back(std::make_tuple(requestConfigId, -1, InferenceEngine::BlobMap()));
    }
}

#endif

int GNAPlugin::GetDeviceVersionFromString(const std::string deviceString) {
    constexpr uint32_t embeddedSuffix = 0xE;
    if (deviceString.empty())
        return 0x100 + embeddedSuffix;
    if (deviceString.size() == 4 && deviceString.substr(0, 3) == "GNA") {
        int version = deviceString[3] - '0';
            if (version > 0) {
            version <<= 8;
            version += embeddedSuffix;
            return version;
        }
    }
    THROW_GNA_EXCEPTION << "Wrong GNA generation for embedded model dump: " << deviceString;
}

void GNAPlugin::DumpXNNToFile() const {
    // TODO: output  precision as well as pointer might be incorrect, LSTM for sure
    // gna looks automatically set layer 0 as output and adjust it's pointer / precision/ size respectively
    if (config.dumpXNNPath.empty()) {
        return;
    }

    const auto versionInt = GetDeviceVersionFromString(config.dumpXNNGeneration);

    if (!gnadevice) {
        THROW_GNA_EXCEPTION << "Cannot generate XNNDump for float network";
    }
    std::ofstream dumpStream(config.dumpXNNPath, std::ios::out | std::ios::binary);
#if GNA_LIB_VER == 1
    if (versionInt != 0x10E)
        THROW_GNA_EXCEPTION << "Wrong GNA version for embedded model dump: " << config.dumpXNNGeneration;
    auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices);
    dump.header.rw_region_size = gnamem->getRWBytes();
    dump.header.input_scaling_factor = inputsDesc->inputScaleFactors.front();
    dump.header.output_scaling_factor = outputsDesc.front().scale_factor;
    dumpStream.write(reinterpret_cast<char*>(&dump.header), sizeof(intel_gna_model_header));
    dumpStream.write(reinterpret_cast<char*>(dump.model.get()), dump.header.model_size);
#else
    auto const modelId = gnadevice->createModel(std::get<0>(gnaModels.front())->obj);
    if (versionInt == Gna2DeviceVersionEmbedded1_0) {
        auto dump = gnadevice->dumpXnn(modelId);
        dump.header.RwRegionSize = gnamem->getRWBytes();
        dump.header.InputScalingFactor = inputsDesc->inputScaleFactors.front();
        dump.header.OutputScalingFactor = outputsDesc.front().scale_factor;
        dumpStream.write(reinterpret_cast<char*>(&dump.header), sizeof(Gna2ModelSueCreekHeader));
        dumpStream.write(reinterpret_cast<char*>(dump.model.get()), dump.header.ModelSize);
    } else {
        static_assert(sizeof(versionInt) >= sizeof(Gna2DeviceVersion), "");
        gnadevice->dumpXnnForDeviceVersion(modelId, dumpStream,
            *reinterpret_cast<const Gna2DeviceVersion*>(&versionInt));
    }
    gnadevice->releaseModel(modelId);
#endif
}

void RotateFeatures(uint8_t *ptr_feat,
                    size_t element_size,
                    uint32_t num_feature_vectors,
                    uint32_t num_feature_vector_elements,
                    uint32_t num_rotate_rows,
                    uint32_t num_rotate_columns) {
    if (num_feature_vector_elements == num_rotate_rows * num_rotate_columns) {
        std::vector<uint8_t> temp(num_feature_vector_elements * element_size);
        for (uint32_t k = 0; k < num_feature_vectors; k++) {
            uint8_t *ptr_in = ptr_feat + k * num_feature_vector_elements * element_size;
            for (uint32_t i = 0; i < num_rotate_rows; i++) {
                for (uint32_t j = 0; j < num_rotate_columns; j++) {
                    ie_memcpy(&temp.front() + (j * num_rotate_rows + i)*element_size,
                              temp.size() - (i * num_rotate_columns + j)*element_size,
                              ptr_in + (i * num_rotate_columns + j)*element_size,
                              element_size);
                }
            }
            ie_memcpy(ptr_in, num_feature_vector_elements * element_size,
                &temp.front(), num_feature_vector_elements * element_size);
        }
    } else {
        THROW_GNA_EXCEPTION << "Rotate dimensions (" << num_rotate_rows << "," << num_rotate_columns
                           <<") do not match buffer length of "<< num_feature_vector_elements <<" in RotateFeatures()!";
    }
}

uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, InferenceEngine::BlobMap &result) {
#if GNA_LIB_VER == 2
    auto& nnets = gnaRequestConfigToRequestIdMap;
#endif
    auto freeNnet = std::find_if(std::begin(nnets), std::end(nnets), [](decltype(nnets.front()) & item) {
        return std::get<1>(item) == -1;
    });

    if (freeNnet == nnets.end()) {
        if (!graphCompiler.memory_connection.empty()) {
            Wait(0);
            freeNnet = nnets.begin();
        } else {
            THROW_IE_EXCEPTION << as_status << REQUEST_BUSY
                               << "GNA executable network has max of "
                               << static_cast<uint32_t >(gnaFlags->gna_lib_async_threads_num)
                               << " parallel infer requests, please sync one of already running";
        }
    }

    auto idx = static_cast<uint32_t>(std::distance(std::begin(nnets), freeNnet));

    int inputNum = 0;
    for (auto &input : inputs) {
        auto inputLayout = input.second->getTensorDesc().getLayout();
        if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) {
            THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: "
                                << input.second->getTensorDesc().getLayout();
        }
        if (inputLayout == NCHW) {
            inputLayout = NC;
        }
        auto is2D = input.second->getTensorDesc().getLayout() == Layout::NC || input.second->getTensorDesc().getLayout() == Layout::CN;

        if (!inputsDesc->ptr_inputs_global_id.count(input.first)) {
            // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
            THROW_GNA_EXCEPTION << "network not loaded : input pointer for " << input.first << " not set";
        }

        if (inputsDesc->getPtrInputsGlobal(input.first)[idx] == nullptr) {
            // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
            THROW_GNA_EXCEPTION << "network not loaded : input pointer for (" << input.first << " at inferRequest #"
                                << idx << " not set";
        }

        if (inputsDesc->getOrientation(input.first) == kDnnUnknownOrientation) {
            // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
            THROW_GNA_EXCEPTION << "network not loaded : input orientation for " << input.first << " not set";
        }

        for (auto& outputDesc : outputsDesc) {
            if (outputDesc.orientation == kDnnUnknownOrientation) {
                // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance
                THROW_GNA_EXCEPTION << "network not loaded : output orientation not set";
            }
        }

        auto dims = input.second->getTensorDesc().getDims();

        auto  importedElements = is2D ? dims[dims.size() - 1] : dims[dims.size() - 1] * dims[dims.size() - 2] * dims[dims.size() - 3];
        auto  importedFrames = dims[0];
        auto  targetGroups = is2D ? dims[dims.size() - 2] : dims[0]; // TODO: no proper support for groups yet

        auto  importedElementSizeBytes = gnaFlags->sw_fp32 ? 4 : 2;
        auto  importedBytes = importedElements * importedFrames * importedElementSizeBytes;

        if (inputsDesc->bytes_allocated_for_input[input.first] < importedBytes) {
            THROW_GNA_EXCEPTION << "Cannot import input frames for :" << input.first
                                  << ", allocated size: " << inputsDesc->bytes_allocated_for_input[input.first]
                                  << ", but input blob size: " << importedBytes;
        }

        ImportFrames(inputsDesc->getPtrInputsGlobal(input.first)[idx],
                     input.second->cbuffer().as<float *>(),
                     input.second->getTensorDesc().getPrecision(),
                     gnaFlags->sw_fp32 ? 1.0f : inputsDesc->getScaleFactor(inputNum),
                     inputsDesc->getOrientation(input.first),
                     importedFrames,
                     targetGroups,
                     importedElements,
                     importedElements);

        bool isOneChannel = input.second->getTensorDesc().getDims()[1] == 1;
        if (do_rotate_input && ((inputLayout == Layout::NC || inputLayout == Layout::NCHW)
            != (inputsDesc->getOrientation(input.first) == kDnnInterleavedOrientation))
            && !isOneChannel) {
            RotateFeatures(reinterpret_cast<uint8_t *>(inputsDesc->getPtrInputsGlobal(input.first)[idx]),
                           gnadevice ? 2 : 4,
                           // TODO: only works for cnn4a and google command so far
                           dims[0],
                           is2D ? dims[dims.size() - 1] : dims[dims.size() - 1] * dims[dims.size() - 3],  // num_feature_vectors looks batch should be there
                           num_rotate_rows,
                           num_rotate_columns);
        }
        ++inputNum;
    }

    if (!gnadevice) {
        auto runtime = runtime::FP(dnn);
        runtime.infer();
        if (freeNnet != nnets.end()) {
            std::get<1>(*freeNnet) = 1;
        }
    } else {
#if GNA_LIB_VER == 1
        auto nnet = std::get<0>(*freeNnet).get();
        std::get<1>(*freeNnet) = gnadevice->propagate(&nnet->obj, ptr_active_indices, num_active_indices, config.gna_proc_type);
#else
        const auto reqConfigId = std::get<0>(*freeNnet);
        if (ptr_active_indices != nullptr && num_active_indices > 0 && activeLayerIndex != 0xffffffff)
            gnadevice->setUpActiveList(reqConfigId, activeLayerIndex, ptr_active_indices, num_active_indices);
        std::get<1>(*freeNnet) = gnadevice->propagate(reqConfigId, config.pluginGna2AccMode);
#endif
    }

#ifdef PLOT
    dnn->BeginNewWrite(dnn_dump_write_index);
    if (dnn->num_components() != 0) {
        dnn->WriteDnnText("Net_.txt", kDnnFloat);
    }
    dnn_dump_write_index++;
#endif
    if (freeNnet != nnets.end()) {
        // TODO: GNA2: Substitute properly when using GNA 2.0 Library setting and CPU
        std::get<2>(*freeNnet) = result;
    }
    return idx;
}

bool GNAPlugin::Wait(uint32_t request_idx) {
    return WaitFor(request_idx, MAX_TIMEOUT);
}

bool GNAPlugin::WaitFor(uint32_t request_idx, int64_t millisTimeout) {
#if GNA_LIB_VER == 2
    auto& nnets = gnaRequestConfigToRequestIdMap;
#endif
    if (nnets.size() <= request_idx) return true;    // TODO: GNA2: check whether necessary
    // already synced TODO: might be copy required ???
    if (std::get<1>(nnets[request_idx]) == -1) return true;

    if (gnadevice) {
        if (!gnadevice->wait(std::get<1>(nnets[request_idx]), millisTimeout)) {
            std::get<1>(nnets[request_idx]) = -1;
            return false;
        }
    }

    std::get<1>(nnets[request_idx]) = -1;
    auto &request = std::get<2>(nnets[request_idx]);
#ifdef PLOT
    if (dnn->num_components() != 0) {
        dnn->WriteInputAndOutputText();
    }
#if GNA_LIB_VER == 1
    dnn->WriteInputAndOutputTextGNA(&std::get<0>(nnets[request_idx])->obj);
#else
    dnn->WriteInputAndOutputTextGNA(std::get<0>(gnaModels[request_idx])->obj);
#endif
#endif
    int output_idx = 0;
    for (auto && outputBlobIt : request) {
        auto & outputBlob = outputBlobIt.second;
        auto & outputDesc = outputsDesc[output_idx];
        if (outputBlob->getTensorDesc().getLayout() == Layout::NC || outputBlob->getTensorDesc().getLayout() == Layout::CN
            || outputBlob->getTensorDesc().getLayout() == Layout::NCHW || outputBlob->getTensorDesc().getLayout() == Layout::NHWC) {
            // TODO: rotate can be incorporated with exporting - used only in unit tests so far
            // TODO: restore:
//        if (orientation_out != kDnnInterleavedOrientation) {
//            if (inputs.size() != 1) {
//                THROW_GNA_EXCEPTION << "Invalid number of inputs for  for deinterleave " << inputs.size()
//                                    << ", only 1 supported";
//            }
//            auto dims = inputs.begin()->second->dims();
//            RotateFeatures(reinterpret_cast<uint8_t*>(ptr_outputs_global),
//                           gnadevice ? 2 : 4,
//                           dims[dims.size() - 1],
//                           dims[0],  // num_feature_vectors looks batch should be there
//                           dims[0],
//                           dims[dims.size() - 1]);
//        }
            auto is2D = outputBlob->getTensorDesc().getLayout() == Layout::NC || outputBlob->getTensorDesc().getLayout() == Layout::CN;
            auto& exportOutputDims = outputBlob->getTensorDesc().getDims();
            auto batchSize = exportOutputDims[0];
            auto elementsPerBatch = is2D ? exportOutputDims[exportOutputDims.size() - 1]
                : exportOutputDims[exportOutputDims.size() - 1]
                  * exportOutputDims[exportOutputDims.size() - 2]
                  * exportOutputDims[exportOutputDims.size() - 3];

            ExportScores(outputBlob->buffer(),
                         outputDesc.ptrs[request_idx],
                         outputDesc.orientation,
                         batchSize,
                         batchSize,
                         elementsPerBatch,
                         elementsPerBatch,
                         elementsPerBatch,
                         outputDesc.num_bytes_per_element,
                         sizeof(float));

            if (gnadevice) {
#ifdef PLOT
                FILE* f = nullptr;
                static int num_infers = 0;
                {
                    f = fopen("ex_scores.txt", "w");
                }
                num_infers++;
                if (f) {
                    auto dims = outputBlob->getTensorDesc().getDims();
                    for (int i = 0; i < dims[dims.size() - 2]; i++) {
                        for (int j = 0; j < dims[dims.size() - 1]; j++) {
                            fprintf(f, "%d ", outputBlob->cbuffer().as<int32_t*>()[dims[dims.size() - 1] * i + j]);
                        }
                        fprintf(f, "\n");
                    }
                    fprintf(f, "\n\n");
                }
#endif
                ConvertToFloat(outputBlob->buffer(),
                    outputBlob->buffer(),
                    elementsPerBatch,
                    batchSize,
                    outputDesc.scale_factor);
#ifdef PLOT
                if (f) {
                    auto dims = outputBlob->getTensorDesc().getDims();
                    for (int i = 0; i < dims[dims.size() - 2]; i++) {
                        for (int j = 0; j < dims[dims.size() - 1]; j++) {
                            fprintf(f, "%.2f ", outputBlob->cbuffer().as<float*>()[dims[dims.size() - 1] * i + j]);
                        }
                        fprintf(f, "\n");
                    }
                    fclose(f);
                }
#endif
            }
        } else {
            THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC, Layout::CN, Layout::NCHW or Layout::NHWC. But was "
                << outputBlob->getTensorDesc().getLayout();
        }

        output_idx++;
    }
    return true;
}

void GNAPlugin::Reset() {
    graphCompiler.Reset();
}

bool GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) {
    BlobMap bmInput;
    BlobMap bmOutput;
    if (inputsDataMap.size() != 1) {
        THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&)"<< "model accepts " << inputsDataMap.size() << " inputs";
    }

    IE_ASSERT(!inputsDataMap.empty());
    bmInput[inputsDataMap.begin()->first] = std::shared_ptr<Blob>(const_cast<Blob*>(&input), [](Blob*){});
    IE_ASSERT(!outputsDataMap.empty());
    bmOutput[outputsDataMap.begin()->first] = std::shared_ptr<Blob>(&output, [](Blob*){});
    return Infer(bmInput, bmOutput);
}

bool GNAPlugin::Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) {
    return  Wait(QueueInference(input, result));
}

Blob::Ptr GNAPlugin::GetOutputBlob(const std::string& name, InferenceEngine::Precision precision) {
    // need to have intermediate blob for interleave conversion
    InferenceEngine::Blob::Ptr outputBlob;
    auto outputDims = outputsDataMap[name]->getTensorDesc().getDims();
    outputBlob = make_blob_with_precision(TensorDesc(precision, outputDims, outputDims.size() == 2 ? NC : NCHW));
    outputBlob->allocate();
    return outputBlob;
}

Blob::Ptr GNAPlugin::GetInputBlob(const std::string& name, InferenceEngine::Precision precision) {
    InferenceEngine::Blob::Ptr inputBlob;
    // need to have intermediate blob for interleave conversion
    // TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not
    auto inputDims = inputsDataMap[name]->getTensorDesc().getDims();
    inputBlob = make_blob_with_precision(TensorDesc(precision, inputDims, inputDims.size() == 2 ? NC : NCHW));
    inputBlob->allocate();
    return inputBlob;
}

std::vector<InferenceEngine::MemoryStateInternal::Ptr>  GNAPlugin::QueryState() {
    if (memoryStates.size() != graphCompiler.memory_connection.size()) {
        memoryStates.clear();
        for (auto& connection : graphCompiler.memory_connection) {
            auto state = std::make_shared<memory::GNAMemoryState>(connection.first, std::make_shared <GNAMemoryLayer>(connection.second));
            memoryStates.emplace_back(state);
        }
    }
    return memoryStates;
}

std::string GNAPlugin::GetName() const noexcept {
    return _pluginName;
}

void GNAPlugin::SetName(const std::string & pluginName) noexcept {
    _pluginName = pluginName;
}

InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(std::istream& networkModel) {
    auto header = GNAModelSerial::ReadHeader(networkModel);

    InitGNADevice();

    graphCompiler.setGNAMemoryPtr(gnamem);
    void *basePtr = nullptr;
    gnamem->reserve_ptr(&basePtr, header.gnaMemSize);
    gnamem->commit();
#if GNA_LIB_VER == 2
    gnaModels.push_back(std::make_tuple(make_shared<CPPWrapper<Gna2Model>>(header.layersCount)));
#else
    nnets.emplace_back(make_shared<CPPWrapper<intel_nnet_type_t>>(header.layersCount), -1, InferenceEngine::BlobMap());
    std::get<0>(nnets.back())->obj.nGroup = header.nGroup;
#endif
    GNAModelSerial::MemoryType  mt;
#if GNA_LIB_VER == 2
    auto serial = GNAModelSerial(&std::get<0>(gnaModels.back())->obj, mt);
#else
    auto serial = GNAModelSerial(&std::get<0>(nnets.back())->obj, mt);
#endif

    serial.setHeader(header);
    serial.Import(basePtr,
            header.gnaMemSize,
            networkModel,
            inputsDesc,
            outputsDesc,
            inputsDataMap,
            outputsDataMap);

#if GNA_LIB_VER == 2
    auto getOrientation = [](Gna2Operation & gnaOperation) {
        return gnaOperation.Type == Gna2OperationTypeConvolution ?
            kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;
    };
#else
    auto getOrientation = [](intel_nnet_layer_t & layer) {
        return layer.nLayerKind == INTEL_CONVOLUTIONAL ?
           kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;
    };
#endif

#if GNA_LIB_VER == 1
    inputsDesc->orientation_in["input"] = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]);
    outputsDesc[0].orientation = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers - 1]);
#endif

    do_rotate_input = header.doRotateInput;
    num_rotate_rows = header.nRotateRows;
    num_rotate_columns = header.nRotateColumns;

    for (auto && memory : mt) {
        GNAMemoryLayer memoryLayer(nullptr, nullptr, gnaFlags->sw_fp32 ? 4 : 2);
        memoryLayer.gna_ptr = memory.first;
        memoryLayer.reserved_size = memory.second;

        graphCompiler.memory_connection.emplace_back(make_pair(std::string("noname"), memoryLayer));
    }

    DumpXNNToFile();

#ifdef PLOT
    dnn->WriteGraphWizModel("gna-blob-imported.dot");
#endif
#if GNA_LIB_VER == 2
    createRequestConfigsForGnaModels();
#endif
    return nullptr;
}

void GNAPlugin::Export(const std::string &fileName) {
    if (inputsDesc->ptr_inputs_global_id.empty() || outputsDesc.empty()) {
        THROW_GNA_EXCEPTION << " network not loaded";
    }

#if GNA_LIB_VER == 1
    if (inputsDesc->ptr_inputs_global_id.size() != 1) {
        THROW_GNA_EXCEPTION << " exporting network with multiple inputs not supported";
    }
#endif

    std::fstream outStream(fileName, ios_base::out | ios_base::binary);

    // TODO: nnet group parameter looks only used in application - so can we move this line into load network.
    IE_ASSERT(!inputsDataMap.empty());
    auto inputDims = inputsDataMap.begin()->second->getTensorDesc().getDims();
    if (inputDims.size() == 2) {
#if GNA_LIB_VER == 1
        std::get<0>(nnets.front())->obj.nGroup = inputDims[0];
#endif
    }
#if GNA_LIB_VER == 2
    Gna2Model* modelToSerial = &std::get<0>(gnaModels.front())->obj;
#else
    intel_nnet_type_t* modelToSerial = &std::get<0>(nnets.front())->obj;
#endif
    auto serial = GNAModelSerial(modelToSerial,
                                 inputsDesc,
                                 outputsDesc,
                                 inputsDataMap,
                                 outputsDataMap)
                    .SetInputRotation(dnn->num_rotate_rows, dnn->num_rotate_columns, dnn->do_rotate_input);

    for (auto && memoryConnection : graphCompiler.memory_connection) {
        serial.AddState(memoryConnection.second.gna_ptr, memoryConnection.second.reserved_size);
    }

    serial.Export(gnamem->getBasePtr(), gnamem->getTotalBytes(), outStream);
}

void GNAPlugin::GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) {
    if (gnaFlags->performance_counting) {
        gnadevice->getGnaPerfCounters(perfMap);
    }
}

void GNAPlugin::AddExtension(InferenceEngine::IExtensionPtr extension) {}

void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config_map) {
    config.UpdateFromMap(config_map);
    UpdateFieldsFromConfig();
}

void GNAPlugin::UpdateFieldsFromConfig() {
    inputsDesc->inputScaleFactors = config.inputScaleFactors;
    *gnaFlags = config.gnaFlags;
}

void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network,
                             const std::map<std::string, std::string>& config,
                             InferenceEngine::QueryNetworkResult& res) const {
    if (network.getFunction()) {
        THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str << " ngraph::Function is not supported natively";
    }

    std::unordered_set<CNNLayer *> allLayers;
    InferenceEngine::InputsDataMap inputs;

    network.getInputsInfo(inputs);
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);

    if (inputs.empty()) {
        THROW_GNA_EXCEPTION << "Network is empty (GNA)\n";
    }

    auto const & secondLayers = getInputTo(inputs.begin()->second->getInputData());
    if (secondLayers.empty()) {
        THROW_GNA_EXCEPTION << "Network consists of input layer only (GNA)\n";
    }

    InferenceEngine::details::UnorderedDFS(allLayers,
                                           secondLayers.begin()->second,
                                           [&](CNNLayerPtr const& layer) {
                                                if (LayerTypeFromStr(layer->type) != LayerType::NO_TYPE) {
                                                    res.supportedLayersMap.insert({ layer->name, GetName() });
                                                }
                                            }, false);
}