[GPU] Implement dynamic shape case support for one dimension only (batch) via legacy dynamic batch functionality (#9314)

Mikhail Letavin 2022-01-17 12:02:52 +03:00, committed by GitHub
parent 98cbaf0f08
commit f2be2c915f
8 changed files with 433 additions and 92 deletions
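
Editor's note: the diff below only touches GPU plugin internals, so here is a minimal usage sketch (not part of the commit) of what the change enables. It assumes the 2022.1-era ov::runtime API that the new test file at the end of this diff also uses; "model.xml" and the tensor name "input_tensor" are placeholders. A model whose batch dimension is declared as a bounded dynamic interval is compiled for "GPU" and then run at several batch sizes, which the plugin serves through its legacy dynamic batch path.

#include <openvino/runtime/core.hpp>
#include <map>
#include <string>

int main() {
    ov::runtime::Core core;
    auto model = core.read_model("model.xml");  // placeholder path

    // Declare the batch dimension as dynamic with an upper bound of 19;
    // only this one dimension may be dynamic, and the bound must be defined.
    std::map<std::string, ov::PartialShape> dyn_shape;
    dyn_shape["input_tensor"] = ov::PartialShape{{1, 19}, 4, 20, 20};
    model->reshape(dyn_shape);

    auto compiled = core.compile_model(model, "GPU");
    auto request = compiled.create_infer_request();

    // Any batch within the declared interval can be fed per request.
    for (size_t batch : {1, 7, 17}) {
        ov::runtime::Tensor input(ov::element::f32, ov::Shape{batch, 4, 20, 20});
        request.set_input_tensor(input);
        request.infer();
    }
    return 0;
}

Models with more than one dynamic dimension or with a dynamic rank are rejected ("Only dynamic batch is supported!"), as the program.cpp changes below show.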

View File

@@ -53,6 +53,10 @@ public:
     std::shared_ptr<cldnn::engine> GetEngine() const { return getContextImpl(m_context)->GetEngine(); }
     int GetMaxDynamicBatchSize() const { return getConfig().max_dynamic_batch; }
     const std::map<std::string, cldnn::layout>& GetInputLayouts() const { return m_program->GetInputLayouts(); }
+    const InferenceEngine::InputsDataMap GetNetworkInputs() const { return m_program->GetNetworkInputs(); }
+    const InferenceEngine::OutputsDataMap GetNetworkOutputs() const { return m_program->GetNetworkOutputs(); }
+    std::map<std::string, std::pair<int64_t, int64_t>> GetInputDynBatchDims() { return m_program->m_input_batch_dim; }
+    std::map<std::string, int64_t> GetOutputDynBatchDims() { return m_program->m_output_batch_dim; }
     size_t GetNetworksCount() const { return m_networks.size(); }
     std::shared_ptr<cldnn::network> GetNetwork(size_t idx = 0) const;
     InferenceEngine::SizeVector GetOutputSize(std::string outName) const;

View File

@@ -93,6 +93,8 @@ public:
     int m_max_batch;
     int m_curBatch;
+    std::map<std::string, std::pair<int64_t, int64_t>> m_input_batch_dim;
+    std::map<std::string, int64_t> m_output_batch_dim;
 
     std::shared_ptr<cldnn::program> GetCompiledProgram(int program_id = 0);
     const std::map<std::string, cldnn::layout>& GetInputLayouts() const { return inputLayouts; }
@@ -104,6 +106,9 @@ public:
     int GetMaxBatchSizeForSingleProgram();
 
     bool IsOpSupported(const InferenceEngine::CNNNetwork& network, const std::shared_ptr<ngraph::Node>& op);
+    bool IsDynBatchModel(const std::shared_ptr<ov::Model>& model,
+                         std::map<std::string, ov::PartialShape>& shapes,
+                         std::map<std::string, std::pair<int64_t, int64_t>>& batch_dim);
 
     // Profiling utils
     void InitProfileInfo(const std::string& layerName,
@@ -170,7 +175,6 @@ private:
                       bool createTopologyOnly = false, bool partialBuild = false);
     void CreateSingleLayerPrimitive(cldnn::topology& topology, const std::shared_ptr<ngraph::Node>& op);
-    bool CanProcessDynBatch(std::vector<std::shared_ptr<ngraph::Node>> ops, InferenceEngine::InputsDataMap networkInputs) const;
     void ChangeInputBatch(int batch);
 };

View File

@@ -49,6 +49,8 @@ Graph::Graph(InferenceEngine::CNNNetwork& network, gpu::ClContext::Ptr context,
     , m_stream_id(stream_id)
     , m_state(0) {
     m_program = std::make_shared<Program>(network, GetEngine(), m_config);
+    if (m_program->m_max_batch > 1)
+        m_config.max_dynamic_batch = m_program->m_max_batch;
     Build();
 }

View File

@@ -171,14 +171,12 @@ bool same_host_mem(cldnn::memory::ptr memPtr, uint8_t* hostPtr) {
     }
     return bufferMem == hostPtr;
 }
 }  // namespace
 
 namespace ov {
 namespace runtime {
 namespace intel_gpu {
 
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- IE API impl ------------------------------------------------ //
 // ----------------------------------------------------------------------------------------- //
@@ -188,6 +186,8 @@ Blob::Ptr InferRequest::GetBlob(const std::string& name) {
     InputInfo::Ptr foundInput;
     DataPtr foundOutput;
     bool is_input = findInputAndOutputBlobByName(name, foundInput, foundOutput);
+    auto node = is_input ? findInputByNodeName(name) : findOutputByNodeName(name);
+    bool isDynamic = (node && node->get_output_partial_shape(0).is_dynamic());
 
     if (is_input) {
         // ROI blob is returned only if it was set previously. Otherwise default blob is returned.
@@ -196,12 +196,21 @@ Blob::Ptr InferRequest::GetBlob(const std::string& name) {
             data = it->second->getRoiBlob();
         } else {
             data = _inputs[name];
+            if (!isDynamic)
                 checkInputBlob(data, name, foundInput);
         }
     } else {
         data = _outputs[name];
+        if (isDynamic) {
+            if (m_graph->GetMaxDynamicBatchSize() > 1) {
+                SizeVector outDims = data->getTensorDesc().getDims();
+                outDims[m_graph->GetOutputDynBatchDims()[name]] = m_curBatch;
+                data->getTensorDesc().setDims(outDims);
+            }
+        } else {
             checkOutputBlob(data, name, foundOutput);
         }
+    }
     return data;
 }
@@ -243,13 +252,16 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
                                          desc.getPrecision().size(),
                                          std::multiplies<size_t>());
     bool preProcResize = false;
+    auto node = is_input ? findInputByNodeName(name) : findOutputByNodeName(name);
+    bool isDynamic = (node && node->get_output_partial_shape(0).is_dynamic());
     if (is_input) {
         preProcResize = foundInput->getPreProcess().getResizeAlgorithm() != ResizeAlgorithm::NO_RESIZE;
         const auto inputColorFormat = foundInput->getPreProcess().getColorFormat();
         preProcResize |= (inputColorFormat != ColorFormat::RAW) && (inputColorFormat != ColorFormat::BGR);
     }
-    if (dataBinSize != netReqBinSize && !compoundBlobPassed && !preProcResize) {
+    if (!isDynamic &&
+        dataBinSize != netReqBinSize && !compoundBlobPassed && !preProcResize) {
         IE_THROW() << "Incorrect binary data size for " << (is_input ? "input" : "output") <<
                       " blob with name: \'" << name << "\' " <<
                       "Current: " << dataBinSize << " Required: " << netReqBinSize;
@@ -292,7 +304,9 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
             auto y_ptr = nv12_ptr->y()->as<gpu::ClBlob>();
             if (y_ptr) {
                 auto y_impl = getBlobImpl(y_ptr);
+                if (!y_impl->is_allocated()) {
                     y_impl->allocate();
+                }
                 _deviceInputs[y_name] = nv12_ptr->y();
                 is_remote = true;
             }
@@ -300,7 +314,9 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
             auto uv_ptr = nv12_ptr->uv()->as<gpu::ClBlob>();
             if (uv_ptr) {
                 auto uv_impl = getBlobImpl(uv_ptr);
+                if (!uv_impl->is_allocated()) {
                     uv_impl->allocate();
+                }
                 _deviceInputs[uv_name] = nv12_ptr->uv();
                 is_remote = true;
             }
@@ -326,13 +342,22 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
             if (compoundBlobPassed) {
                 IE_THROW(NotImplemented) << cannot_set_compound;
             }
+            if (isDynamic) {
+                // extract new batch size from blob
+                if (m_graph->GetMaxDynamicBatchSize() > 1) {
+                    const auto batch_idx = m_graph->GetInputDynBatchDims()[name].first;
+                    if (batch_idx >= 0)
+                        SetBatch(blobDesc.getDims()[batch_idx]);
+                }
+            } else {
                 size_t blobSize = desc.getLayout() != SCALAR
                     ? details::product(desc.getDims())
                     : 1;
                 if (dataSize != blobSize) {
-                    IE_THROW() << "Input blob size is not equal network input size ("
+                    IE_THROW() << "Input blob size is not equal to network input size ("
                                << dataSize << "!=" << blobSize << ").";
                 }
+            }
 
             if (data->buffer() == nullptr)
                 IE_THROW(NotAllocated) << str_input_not_allocated << " Input name: \'" << name << "\'";
@@ -347,15 +372,17 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) {
         if (is_remote) {
             _deviceOutputs[name] = data;
         } else {
+            if (!isDynamic) {
                 size_t outputSize = desc.getLayout() != SCALAR
                     ? details::product(desc.getDims())
                     : 1;
                 if (dataSize != outputSize) {
-                    IE_THROW() << "Output blob size is not equal network output size (" << dataSize
+                    IE_THROW() << "Output blob size is not equal to network output size (" << dataSize
                                << "!=" << outputSize << ").";
                 }
                 if (data->buffer() == nullptr)
-                    IE_THROW(NotAllocated) << str_input_not_allocated << " Input name: \'" << name << "\'";
+                    IE_THROW(NotAllocated) << str_output_not_allocated << " Output name: \'" << name << "\'";
+            }
         }
         _outputs[name] = data;
     }
@@ -457,6 +484,9 @@ void InferRequest::checkBlobs() {
         } else {
             IE_THROW(NotFound) << "Failed to find input with name: \'" << input.first << "\'";
         }
+        auto node = findInputByNodeName(input.first);
+        bool is_dynamic = (node && node->get_output_partial_shape(0).is_dynamic());
+        if (!is_dynamic)
             checkInputBlob(input.second, input.first, foundInput, m_graph->getConfig().nv12_two_inputs);
     }
     for (auto const &output : _outputs) {
@@ -470,6 +500,9 @@ void InferRequest::checkBlobs() {
         } else {
             IE_THROW(NotFound) << "Failed to find output with name: \'" << output.first << "\'";
         }
+        auto node = findOutputByNodeName(output.first);
+        bool is_dynamic = (node && node->get_output_partial_shape(0).is_dynamic());
+        if (!is_dynamic)
             checkOutputBlob(output.second, output.first, foundOutput);
     }
 }
@@ -509,9 +542,12 @@ void InferRequest::SetBatch(int new_batch) {
     batchOutputs.clear();
 
     // tune expected inputs
-    for (auto &input : m_graph->GetInputLayouts()) {
-        cldnn::tensor dims = input.second.size;
-        const SizeVector sz = { 1, size_t(dims.feature[0]), size_t(dims.spatial[1]), size_t(dims.spatial[0]) };
+    for (auto& input : m_graph->GetNetworkInputs()) {
+        auto sz = input.second->getTensorDesc().getDims();
+        const auto batch_idx = m_graph->GetInputDynBatchDims()[input.first].first;
+        if (batch_idx >= 0)
+            sz[batch_idx] = 1;
         size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies<size_t>());
         std::vector<buf_info> in_buf;
@@ -534,9 +570,11 @@ void InferRequest::SetBatch(int new_batch) {
     }
 
     // tune expected outputs
-    for (auto& no : _networkOutputs) {
-        auto sz = m_graph->GetOutputSize(no.first);
-        sz.front() = 1;
+    for (auto& no : m_graph->GetNetworkOutputs()) {
+        auto sz = no.second->getTensorDesc().getDims();
+        const auto batch_idx = m_graph->GetInputDynBatchDims()[no.first].first;
+        if (batch_idx >= 0)
+            sz[batch_idx] = 1;
         size_t single_batch = std::accumulate(std::begin(sz), std::end(sz), (size_t)1, std::multiplies<size_t>());
         std::vector<buf_info> out_buf;
@@ -816,6 +854,21 @@ void InferRequest::setup_stream_graph() {
         streamID = streamID % numGraphs;
     }
     m_graph = streamGraphs[streamID];
+    // in case of dynamic batch, check all input blobs and set new batch
+    if (m_graph->GetMaxDynamicBatchSize() > 1) {
+        for (auto& input : _networkInputs) {
+            auto node = findInputByNodeName(input.first);
+            bool is_dynamic = (node && node->get_output_partial_shape(0).is_dynamic());
+            if (!is_dynamic)
+                continue;
+            // extract new batch size from blob
+            const auto batch_idx = m_graph->GetInputDynBatchDims()[input.first].first;
+            if (batch_idx >= 0) {
+                SetBatch(_inputs[input.first]->getTensorDesc().getDims()[batch_idx]);
+                break;
+            }
+        }
+    }
 }
 
 Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, std::shared_ptr<InferenceEngine::IAllocator> alloc) {
@@ -968,16 +1021,9 @@ void InferRequest::allocate_inputs() {
 void InferRequest::allocate_inputs_dynamic() {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_inputs_dynamic");
     // allocate inputs
-    for (auto &input : m_graph->GetInputLayouts()) {
+    for (auto &input : m_graph->GetNetworkInputs()) {
         InputInfo::Ptr ni = _networkInputs.at(input.first);
-        TensorDesc desc = ni->getTensorDesc();
-        SizeVector& dims = desc.getDims();
-
-        if (!dims.empty()) {
-            *dims.begin() = static_cast<size_t>(m_graph->GetMaxDynamicBatchSize());
-        } else {
-            IE_THROW() << "Empty dimensions for input blob " << input.first;
-        }
+        TensorDesc desc = input.second->getTensorDesc();
 
         Blob::Ptr inputBlob = create_host_blob(desc);
         if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
@@ -1020,17 +1066,10 @@ void InferRequest::allocate_outputs() {
 void InferRequest::allocate_outputs_dynamic() {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::allocate_outputs_dynamic");
     // allocate outputs
-    for (auto& no : _networkOutputs) {
+    for (auto& no : m_graph->GetNetworkOutputs()) {
         std::string outputID = m_graph->MapOutputName(no.first);
         DataPtr oi = no.second;
         TensorDesc desc = oi->getTensorDesc();
-        SizeVector& dims = desc.getDims();
-
-        if (!dims.empty()) {
-            *dims.begin() = static_cast<size_t>(m_graph->GetMaxDynamicBatchSize());
-        } else {
-            IE_THROW() << "Empty dimensions for output blob " << no.first;
-        }
 
         Blob::Ptr outputBlob = create_host_blob(desc);
         _outputs[no.first] = outputBlob;

View File

@@ -376,16 +376,45 @@ QueryNetworkResult Plugin::QueryNetwork(const CNNNetwork& network,
     }
 
     auto clonedNetwork = CloneAndTransformNetwork(network, conf);
-    auto ops = clonedNetwork.getFunction()->get_ordered_ops();
+    auto func = clonedNetwork.getFunction();
+    auto ops = func->get_ordered_ops();
     std::unordered_set<std::string> supported;
     std::unordered_set<std::string> unsupported;
 
     std::unordered_set<std::string> constantsNames;
     std::vector<std::shared_ptr<ngraph::Node>> constants;
 
+    std::map<std::string, ngraph::PartialShape> shapes;
+    std::map<std::string, std::pair<int64_t, int64_t>> batch_dim;
+    bool dyn_shape_batch_found = prog.IsDynBatchModel(func, shapes, batch_dim);
+
     auto layerIsSupported = [&](std::shared_ptr<ngraph::Node> node) {
         if (node->is_dynamic()) {
+            if (!dyn_shape_batch_found)
                 return false;
+
+            auto pshape = node->get_output_partial_shape(0);
+            if (pshape.rank().is_dynamic())
+                return false;
+
+            int dynCount = 0;
+            int64_t batch_idx = -1;
+            for (size_t i = 0; i < pshape.size(); i++) {
+                if (pshape[i].is_dynamic()) {
+                    dynCount++;
+                    if (batch_idx < 0) {
+                        batch_idx = i;
+                    }
+                }
+            }
+
+            if (dynCount != 1)
+                return false; // more than one dimension is dynamic
+
+            int64_t max_batch = pshape[batch_idx].get_max_length();
+            if (max_batch <= 1)
+                return false;
+
+            return true;
         }
         if (ngraph::is_type<const ngraph::op::v0::PriorBox>(node) ||
             ngraph::is_type<const ngraph::op::v0::PriorBoxClustered>(node) ||
@@ -637,6 +666,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
     auto closest_pow_of_2 = [] (float x) {
         return pow(2, floor(log(x)/log(2)));
     };
+    GPU_DEBUG_GET_INSTANCE(debug_config);
     auto model_param = options.find("MODEL_PTR");
     if (model_param == options.end()) {
         GPU_DEBUG_IF(debug_config->verbose >= 1) {

View File

@@ -5,6 +5,7 @@
 #include "intel_gpu/plugin/program.hpp"
 #include "ngraph/ops.hpp"
 #include "ngraph_ops/nms_ie_internal.hpp"
+#include "openvino/core/graph_util.hpp"
 #include "intel_gpu/plugin/itt.hpp"
 #include "intel_gpu/runtime/debug_configuration.hpp"
@@ -56,43 +57,74 @@ void Program::ValidateInputs(const std::shared_ptr<ngraph::Node>& op, std::vecto
                    << " op::v" << op->get_type_info().version << ")";
 }
 
-bool Program::CanProcessDynBatch(std::vector<std::shared_ptr<ngraph::Node>> ops, InferenceEngine::InputsDataMap networkInputs) const {
-    if (networkInputs.empty())
-        return false;
-
-    for (auto op : ops) {
-        // TODO: do we have any other exception cases?
-        if (std::dynamic_pointer_cast<ngraph::op::v1::Reshape>(op)) {
-            if (op->get_input_shape(0)[0] == op->get_output_shape(0)[0])
-                continue;
-        }
-
-        // List of the operations which can lead to invalid dynamic batch processing
-        if (std::dynamic_pointer_cast<ngraph::op::internal::NonMaxSuppressionIEInternal>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v5::NonMaxSuppression>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v4::NonMaxSuppression>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v3::NonMaxSuppression>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v1::NonMaxSuppression>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v0::PSROIPooling>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v0::ROIPooling>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v0::PriorBox>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v0::DetectionOutput>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v1::Reshape>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v0::Squeeze>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v0::Unsqueeze>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v1::Transpose>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v4::Proposal>(op) ||
-            std::dynamic_pointer_cast<ngraph::op::v0::Proposal>(op)) {
-            return false;
-        }
-
-        auto customLayer = m_config.customLayers.find(op->get_type_name());
-        if (customLayer != m_config.customLayers.end()) {
-            return false;
-        }
-    }
-
-    return true;
-}
+auto getParamName = [](const std::shared_ptr<ov::Node>& param) -> std::string {
+    const auto& names = param->get_output_tensor(0).get_names();
+    if (!names.empty())
+        return *names.begin();
+    else
+        return param->get_friendly_name();
+};
+
+// detect the only supported dynamic shape case -
+// exactly one dimension is dynamic in input params with defined min/max interval
+bool Program::IsDynBatchModel(const std::shared_ptr<ov::Model>& model,
+                              std::map<std::string, ov::PartialShape>& shapes,
+                              std::map<std::string, std::pair<int64_t, int64_t>>& batch_dim) {
+    for (const auto& param : model->get_parameters()) {
+        auto pname = getParamName(param);
+        batch_dim[pname] = { -1, -1 };
+        if (param->get_output_partial_shape(0).rank().is_dynamic()) {
+            return false;
+        }
+        ov::PartialShape pshape = param->get_output_partial_shape(0);
+        int dynCount = 0;
+        int64_t batch_idx = -1;
+        for (size_t i = 0; i < pshape.size(); i++) {
+            if (pshape[i].is_dynamic()) {
+                dynCount++;
+                if (batch_idx < 0) {
+                    batch_idx = i;
+                }
+            }
+        }
+        switch (dynCount) {
+        case 1:
+            // exactly one dynamic dim
+            {
+                int64_t max_b = pshape[batch_idx].get_max_length();
+                if (max_b > 1) {
+                    batch_dim[pname].first = batch_idx;
+                    batch_dim[pname].second = max_b;
+                    pshape[batch_idx] = 1;
+                }
+            }
+        case 0:
+            // no dynamic dims - possible legacy case
+            shapes[pname] = pshape;
+            break;
+        default:
+            break;
+        }
+    }
+    if (batch_dim.empty())
+        return false;
+
+    bool dyn_shape_batch_found = false;
+    // detect 1st dyn dim, mark it and continue
+    auto bitr = batch_dim.begin();
+    dyn_shape_batch_found = bitr->second.first >= 0;
+    auto batch_val_1st = bitr->second.second;
+    bitr++;
+    for (; bitr != batch_dim.end(); bitr++) {
+        if (bitr->second.first >= 0) {
+            if (bitr->second.second != batch_val_1st) {
+                dyn_shape_batch_found = false;
+                break;
+            } else {
+                dyn_shape_batch_found = true;
+            }
+        }
+    }
+    return dyn_shape_batch_found;
+}
 
 Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::engine> engine, const Config& config,
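
Editor's note: a small standalone sketch (not part of the commit) of the shape patterns that IsDynBatchModel() above distinguishes, using only the public ov::PartialShape / ov::Dimension API.

#include <openvino/core/partial_shape.hpp>
#include <iostream>

int main() {
    // Accepted: exactly one dynamic dimension with a defined upper bound > 1.
    // It is treated as the batch dimension; its maximum (19) becomes max_dynamic_batch.
    ov::PartialShape dyn_batch{ov::Dimension(1, 19), 4, 20, 20};

    // Rejected: dynamic rank - the parameter loop above bails out immediately.
    ov::PartialShape dyn_rank = ov::PartialShape::dynamic();

    // Rejected: more than one dynamic dimension (dynCount != 1 in the checks above).
    ov::PartialShape two_dyn{ov::Dimension(1, 19), ov::Dimension(1, 8), 20, 20};

    std::cout << dyn_batch << " | " << dyn_rank << " | " << two_dyn << std::endl;
    return 0;
}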
@@ -112,26 +144,136 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::en
     auto ops = func->get_ordered_ops();
 
-    if (m_config.max_dynamic_batch > 1) {
-        // check topology for applicability
-        if (!CanProcessDynBatch(ops, networkInputs)) {
-            IE_THROW() << "Such topology cannot be compiled for dynamic batch!";
-        }
-    }
+    bool dyn_shape_batch_found = false;
+    std::map<std::string, ngraph::PartialShape> shapes;
+    std::map<std::string, std::pair<int64_t, int64_t>> batch_dim;
+    if (m_config.enableDynamicBatch) {
+        // in case of legacy dynamic batch,
+        // we assume 4D input with 0 batch dim
+        auto param = func->get_parameters().front();
+        auto pname = getParamName(param);
+        shapes[pname] = param->get_output_partial_shape(0);
+        batch_dim[pname].first = 0;
+        batch_dim[pname].second = m_config.max_dynamic_batch;
+    } else {
+        dyn_shape_batch_found = IsDynBatchModel(func, shapes, batch_dim);
+        if (dyn_shape_batch_found) {
+            m_config.max_dynamic_batch = batch_dim.begin()->second.second;
+        } else {
+            if (!batch_dim.empty() && shapes.empty()) {
+                // more than on dynamic dim or dynamic rank
+                IE_THROW() << "Only dynamic batch is supported!";
+            }
+        }
+    }
 
     int m_bv_sz = GetMaxBatchSizeForSingleProgram();
+    m_max_batch = m_config.max_dynamic_batch;
 
-    m_max_batch = config.max_dynamic_batch;
-
-    if (config.max_dynamic_batch > 1) {
+    if (dyn_shape_batch_found || config.max_dynamic_batch > 1) {
+        // compile log2 networks to serve dynamic batch requests
         for (int b = m_bv_sz - 1; b >= 0; b--) {
             inputLayouts.clear();
             outputDims.clear();
             primitiveIDs.clear();
             blobMemCache.clear();
 
-            ChangeInputBatch(1U << static_cast<unsigned>(b));
-            m_programs.insert(m_programs.begin(), BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly, partialBuild));
+            auto new_batch = 1U << static_cast<unsigned>(b);
+            ChangeInputBatch(new_batch);
+
+            // clone the source model, find the batch dim
+            // and reshape the model to next batch size
+            auto new_func = ov::clone_model(*func);
+            std::map<ov::Output<ov::Node>, ngraph::PartialShape> new_shapes;
+            for (const auto& param : new_func->get_parameters()) {
+                ov::PartialShape pshape = param->get_output_partial_shape(0);
+                auto pname = getParamName(param);
+                auto batch_idx = batch_dim[pname].first;
+                if (batch_idx >= 0) {
+                    auto pshape = shapes[pname];
+                    pshape[batch_idx] = new_batch;
+                    new_shapes[param->output(0)] = pshape;
+                }
+            }
+            new_func->reshape(new_shapes);
+
+            // reshape network input/output maps accordingly
+            // for correct network compilation
+            for (auto& new_input : new_func->inputs()) {
+                auto iname = new_input.get_node()->get_friendly_name();
+                auto it = networkInputs.find(iname);
+                if (it != networkInputs.end()) {
+                    auto shape = new_input.get_shape();
+                    auto l = it->second->getTensorDesc().getLayout();
+                    it->second->getInputData()->reshape(shape, l);
+                }
+            }
+
+            for (auto& new_output : new_func->outputs()) {
+                auto iname = new_output.get_node_shared_ptr()->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name();
+                auto it = networkOutputs.find(iname);
+                if (it != networkOutputs.end()) {
+                    auto shape = new_output.get_shape();
+                    auto l = it->second->getTensorDesc().getLayout();
+                    it->second->reshape(shape, l);
+                }
+            }
+            m_programs.insert(m_programs.begin(), BuildProgram(new_func->get_ordered_ops(), networkInputs, networkOutputs,
+                                                               createTopologyOnly, partialBuild));
         }
+        {
+            // recompute maximal dynamic batch inputs/outputs for infer request
+            // and store them into internal maps
+            // same operations as above, but for maximum batch
+            auto new_func = ov::clone_model(*func);
+            std::map<ov::Output<ov::Node>, ngraph::PartialShape> new_shapes;
+            for (const auto& param : new_func->get_parameters()) {
+                ov::PartialShape pshape = param->get_output_partial_shape(0);
+                auto pname = getParamName(param);
+                auto batch_idx = batch_dim[pname].first;
+                if (batch_idx >= 0) {
+                    auto pshape = shapes[pname];
+                    pshape[batch_idx] = m_max_batch;
+                    new_shapes[param->output(0)] = pshape;
+                }
+            }
+            new_func->reshape(new_shapes);
+
+            for (auto& new_input : new_func->inputs()) {
+                auto iname = new_input.get_node()->get_friendly_name();
+                auto it = networkInputs.find(iname);
+                if (it != networkInputs.end()) {
+                    auto shape = new_input.get_shape();
+                    auto l = it->second->getTensorDesc().getLayout();
+                    it->second->getInputData()->reshape(shape, l);
+                }
+            }
+
+            for (auto& new_output : new_func->outputs()) {
+                auto iname = new_output.get_node_shared_ptr()->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name();
+                auto it = networkOutputs.find(iname);
+                if (it != networkOutputs.end()) {
+                    auto shape = new_output.get_shape();
+                    auto l = it->second->getTensorDesc().getLayout();
+                    SizeVector old_shape = it->second->getTensorDesc().getDims();
+                    it->second->reshape(shape, l);
+                    // detect changed output batch dimension
+                    SizeVector new_shape = it->second->getTensorDesc().getDims();
+                    for (int64_t i = 0; i < old_shape.size(); i++) {
+                        if (old_shape[i] != new_shape[i]) {
+                            m_output_batch_dim[iname] = i;
+                            break;
+                        }
+                    }
+                }
+            }
+            m_networkInputs = networkInputs;
+            m_networkOutputs = networkOutputs;
+            m_input_batch_dim = batch_dim;
+        }
     } else {
         m_programs.emplace_back(BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly, partialBuild));
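
Editor's note: the "compile log2 networks" loop above builds one program per power-of-two batch size, from 1 up to the largest power of two needed to cover max_dynamic_batch. The standalone sketch below (not the plugin's code) only illustrates that set; it assumes GetMaxBatchSizeForSingleProgram() returns roughly floor(log2(max_batch)) + 1, which is not shown in this diff.

#include <cmath>
#include <cstdio>
#include <vector>

// List the batch sizes a dynamic-batch compilation would cover, mirroring the
// "1U << b" loop above under the assumed m_bv_sz formula.
std::vector<unsigned> compiled_batches(unsigned max_dynamic_batch) {
    int bv_sz = static_cast<int>(std::floor(std::log2(static_cast<double>(max_dynamic_batch)))) + 1;  // assumption
    std::vector<unsigned> batches;
    for (int b = 0; b < bv_sz; b++)
        batches.push_back(1u << b);  // 1, 2, 4, ...
    return batches;
}

int main() {
    // For max_dynamic_batch = 19 this prints: 1 2 4 8 16
    for (unsigned b : compiled_batches(19))
        std::printf("%u ", b);
    std::printf("\n");
    return 0;
}

The per-chunk buf_info bookkeeping in InferRequest::SetBatch() above suggests that a request with, say, batch 17 is then served by splitting it across these precompiled chunk sizes rather than by recompiling.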

View File

@@ -0,0 +1,118 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/runtime/core.hpp"

#include <common_test_utils/test_common.hpp>
#include "common_test_utils/common_utils.hpp"
#include "functional_test_utils/skip_tests_config.hpp"
#include "ngraph_functions/subgraph_builders.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"

using namespace ::testing;
using namespace ov::test;

using OVDynamicBatchParams = std::tuple<
    std::vector<InputShape>,              // dynamic and static case sizes
    ElementType,                          // Network precision
    std::string,                          // Device name
    std::map<std::string, std::string>    // Config
>;

class OVDynamicBatchShape_Tests : public WithParamInterface<OVDynamicBatchParams>,
                                  virtual public ov::test::SubgraphBaseTest {
public:
    static std::string getTestCaseName(TestParamInfo<OVDynamicBatchParams> obj) {
        std::vector<InputShape> inputShapes;
        ElementType netPrecision;
        std::string targetDevice;
        std::map<std::string, std::string> configuration;
        std::tie(inputShapes, netPrecision, targetDevice, configuration) = obj.param;

        std::ostringstream result;
        result << "IS=";
        for (const auto& shape : inputShapes) {
            result << CommonTestUtils::partialShape2str({ shape.first }) << "_";
        }
        result << "TS=";
        for (const auto& shape : inputShapes) {
            result << "(";
            if (!shape.second.empty()) {
                for (const auto& itr : shape.second) {
                    result << CommonTestUtils::vec2str(itr);
                }
            }
            result << ")_";
        }
        result << "netPRC=" << netPrecision << "_";
        result << "targetDevice=" << targetDevice;
        if (!configuration.empty()) {
            for (auto& configItem : configuration) {
                result << "configItem=" << configItem.first << "_" << configItem.second << "_";
            }
        }
        return result.str();
    }

    void TearDown() override {
        core.reset();
    }

protected:
    void SetUp() override {
        if (core)
            core.reset();
        std::tie(inputShape, netPrecision, targetDevice, configuration) = this->GetParam();

        init_input_shapes(inputShape);
        //TODO: think how we can switch between several input topologies in the future
        // function = ngraph::builder::subgraph::makeSplitConvConcat(inputShape.front().first.get_min_shape(), netPrecision);
        function = ngraph::builder::subgraph::makeSplitMultiConvConcat(inputShape.front().first.get_min_shape(), netPrecision);

        // make topology dynamic
        std::map<std::string, ov::PartialShape> dynShape;
        dynShape["input_tensor"] = inputShape.front().first;
        function->reshape(dynShape);
    }

    std::shared_ptr<ov::Model> src_func;
    // std::map<std::string, std::string> configuration;
    std::vector<InputShape> inputShape;
    ElementType netPrecision;
};

TEST_P(OVDynamicBatchShape_Tests, InferDynamicBatchBound) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()
    core = std::make_shared<ov::runtime::Core>();
    run();
}

namespace {
const std::map<std::string, std::string> config = {};
const std::map<std::string, std::string> hetero_config = {
    {"TARGET_FALLBACK", CommonTestUtils::DEVICE_GPU}
};

const std::vector<InputShape> inputShapes = {
    { { {1, 19}, 4, 20, 20}, { {1, 4, 20, 20}, {7, 4, 20, 20}, {17, 4, 20, 20} } }
};

const std::vector<ElementType> netPrecisions = {
    ElementType::f16,
    ElementType::f32
};

INSTANTIATE_TEST_SUITE_P(smoke_GPU_DynBatch, OVDynamicBatchShape_Tests,
    ::testing::Combine(
        ::testing::Values(inputShapes),
        ::testing::ValuesIn(netPrecisions),
        ::testing::Values(CommonTestUtils::DEVICE_GPU),
        ::testing::Values(config)),
    OVDynamicBatchShape_Tests::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_GPU_DynBatchHetero, OVDynamicBatchShape_Tests,
    ::testing::Combine(
        ::testing::Values(inputShapes),
        ::testing::ValuesIn(netPrecisions),
        ::testing::Values(CommonTestUtils::DEVICE_HETERO),
        ::testing::Values(hetero_config)),
    OVDynamicBatchShape_Tests::getTestCaseName);
}  // namespace

View File

@@ -189,9 +189,11 @@ inline std::shared_ptr<ngraph::Function> makeKSOFunction(std::vector<size_t> inp
     return fnPtr;
 }
 
-inline std::shared_ptr<ngraph::Function> makeSplitMultiConvConcat(std::vector<size_t> inputShape = {1, 4, 20, 20}) {
-    auto ngPrc = ngraph::element::Type_t::f32;
+inline std::shared_ptr<ngraph::Function> makeSplitMultiConvConcat(std::vector<size_t> inputShape = {1, 4, 20, 20},
+                                                                  ngraph::element::Type_t ngPrc = ngraph::element::Type_t::f32) {
     auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+    params.front()->set_friendly_name("Param_1");
+    params.front()->get_output_tensor(0).set_names({ "input_tensor" });
     auto split = ngraph::builder::makeSplit(params[0], ngPrc, 2, 1);
     auto conv1_0 = ngraph::builder::makeConvolution(split->output(0), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},