[GPU] Add GetMaxBatchSize functionality (#8226)

2021-11-24 15:30:14 +09:00
parent 8f64b8e2e8
commit abbf0384ae
22 changed files with 416 additions and 73 deletions
--- a/docs/IE_DG/supported_plugins/GPU.md
+++ b/docs/IE_DG/supported_plugins/GPU.md
@@ -127,6 +127,10 @@ When specifying key values as raw strings (that is, when using Python API), omit

@snippet snippets/GPU_Metric0.cpp part0

+* MAX_BATCH_SIZE : Returns the maximum batch size for a given network that is not only executable but also does not lose performance due to memory swapping. Note that the returned value may not be aligned to a power of 2. MODEL_PTR is a required option for this metric, since the available maximum batch size depends on the model size. If MODEL_PTR is not given, the metric returns 1. Example code that sets the required and optional options for this metric is available in the following snippet:
+
+@snippet snippets/GPU_Metric1.cpp part1
+
 ## GPU Context and Video Memory Sharing RemoteBlob API

 See [RemoteBlob API of GPU Plugin](GPU_RemoteBlob_API.md)
--- a/docs/snippets/GPU_Metric1.cpp
+++ b/docs/snippets/GPU_Metric1.cpp
@@ -0,0 +1,17 @@
+#include <ie_core.hpp>
+
+int main() {
+using namespace InferenceEngine;
+//! [part1]
+InferenceEngine::Core core;
+CNNNetwork cnnNetwork = core.ReadNetwork("network.xml");
+uint32_t n_streams = 2;
+int64_t available_device_mem_size = 3221225472; // 3 GiB, in bytes
+
+std::map<std::string, Parameter> options = {{"MODEL_PTR", cnnNetwork.getFunction()}}; // Required. The function (model) of the target network. If this is not set, MAX_BATCH_SIZE returns 1.
+options.insert(std::make_pair("GPU_THROUGHPUT_STREAMS", n_streams)); // Optional. Set only when you want to estimate the max batch size for a specific number of throughput streams. Default is 1 or the throughput streams set by SetConfig.
+options.insert(std::make_pair("AVAILABLE_DEVICE_MEM_SIZE", available_device_mem_size)); // Optional. Set only when you want to limit the available device memory size.
+
+auto max_batch_size = core.GetMetric("GPU", GPU_METRIC_KEY(MAX_BATCH_SIZE), options).as<uint32_t>();
+//! [part1]
+}
--- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp
@@ -11,7 +11,6 @@
 #include <tuple>
 #include <cctype>
 #include <memory>
-
 #include "ie_metric_helpers.hpp"
 #include "ie_plugin_config.hpp"
 #include <ie_ngraph_utils.hpp>
@@ -29,7 +28,6 @@

 #include "cldnn/runtime/device_query.hpp"
 #include "cldnn/runtime/debug_configuration.hpp"
-
 #ifdef __linux__
 # include <dlfcn.h>
 #endif
@@ -677,11 +675,11 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
        metrics.push_back(METRIC_KEY(RANGE_FOR_STREAMS));
        metrics.push_back(METRIC_KEY(DEVICE_TYPE));
        metrics.push_back(METRIC_KEY(DEVICE_GOPS));
+        metrics.push_back(GPU_METRIC_KEY(MAX_BATCH_SIZE));
        metrics.push_back(GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE));
        metrics.push_back(GPU_METRIC_KEY(UARCH_VERSION));
        metrics.push_back(GPU_METRIC_KEY(EXECUTION_UNITS_COUNT));
        metrics.push_back(GPU_METRIC_KEY(MEMORY_STATISTICS));
-
        IE_SET_METRIC_RETURN(SUPPORTED_METRICS, metrics);
    } else if (name == METRIC_KEY(AVAILABLE_DEVICES)) {
        std::vector<std::string> availableDevices = { };
@@ -755,11 +753,138 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
            }
        }
        IE_SET_METRIC_RETURN(GPU_MEMORY_STATISTICS, statistics);
+    } else if (name == GPU_METRIC_KEY(MAX_BATCH_SIZE)) {  // Estimate the largest batch that fits in the free device memory
+        GPU_DEBUG_GET_INSTANCE(debug_config);
+        const auto& config = _impl->m_configs.GetConfig(device_id);
+        uint32_t n_streams = static_cast<uint32_t>(config.throughput_streams);
+        uint64_t occupied_device_mem = 0;
+        auto statistic_result = GetMetric(GPU_METRIC_KEY(MEMORY_STATISTICS), options).as<std::map<std::string, uint64_t>>();
+        auto occupied_usm_dev = statistic_result.find("usm_device_current");
+        if (occupied_usm_dev != statistic_result.end()) {
+            occupied_device_mem = occupied_usm_dev->second;
+        }
+        // Free memory = total global memory minus what MEMORY_STATISTICS reports as "usm_device_current".
+        int64_t available_device_mem = device_info.max_global_mem_size - occupied_device_mem;
+        GPU_DEBUG_IF(debug_config->verbose >= 2) {
+            GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is " << available_device_mem
+                           << " (occupied: " << occupied_device_mem << ")" << std::endl;
+        }
+
+        int64_t max_batch_size = 0;
+        // MODEL_PTR is mandatory: without a model there is nothing to estimate, so the metric reports 1.
+        if (options.find("MODEL_PTR") == options.end()) {
+            GPU_DEBUG_IF(debug_config->verbose >= 1) {
+                GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] MODEL_PTR is not set: return 1" << std::endl;
+            }
+            IE_SET_METRIC_RETURN(GPU_MAX_BATCH_SIZE, static_cast<int32_t>(1));
+        }
+        if (options.find("GPU_THROUGHPUT_STREAMS") != options.end()) {
+            try {
+                n_streams = options.find("GPU_THROUGHPUT_STREAMS")->second.as<uint32_t>();
+            } catch (...) {
+                IE_THROW() << "[GPU] bad casting: GPU_THROUGHPUT_STREAMS should be uint32_t type";
+            }
+        }
+        GPU_DEBUG_IF(debug_config->verbose >= 2) {
+            GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] n_streams : " << n_streams << std::endl;
+        }
+        // Optional cap: never assume more memory than the caller allows via AVAILABLE_DEVICE_MEM_SIZE.
+        if (options.find("AVAILABLE_DEVICE_MEM_SIZE") != options.end()) {
+            try {
+                available_device_mem = std::min(static_cast<int64_t>(available_device_mem), options.find("AVAILABLE_DEVICE_MEM_SIZE")->second.as<int64_t>());
+                GPU_DEBUG_IF(debug_config->verbose >= 2) {
+                    GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is reset by user " << available_device_mem << std::endl;
+                }
+            } catch (...) {
+                IE_THROW() << "[GPU] bad casting: AVAILABLE_DEVICE_MEM_SIZE should be int64_t type";
+            }
+            if (available_device_mem < 0) {
+                IE_THROW() << "[GPU] AVAILABLE_DEVICE_MEM_SIZE value should be greater than 0 for max batch size calculation";
+            }
+        }
+        // MODEL_PTR must carry a std::shared_ptr<ngraph::Function>; any other payload is rejected.
+        std::shared_ptr<ngraph::Function> model;
+        auto model_param = options.find("MODEL_PTR")->second;
+        try {
+            model = model_param.as<std::shared_ptr<ngraph::Function>>();
+        } catch (...) {
+            IE_THROW() << "[GPU] MODEL_PTR should be std::shared_ptr<ngraph::Function> type";
+        }
+
+        InferenceEngine::CNNNetwork network(model);
+        size_t base_batch_size = 16; // empirically decided for DG1
+        auto engine_params = clDNNEngine::GetEngineParams(config, iter->second, nullptr);
+        auto engine = cldnn::engine::create(engine_params.engine_type, engine_params.runtime_type, iter->second,
+                                cldnn::engine_configuration(false, engine_params.queue_type, std::string(),
+                                config.queuePriority, config.queueThrottle, config.memory_pool_on,
+                                engine_params.use_unified_shared_memory, std::string(), config.throughput_streams),
+                                engine_params.task_executor);
+
+        std::shared_ptr<Program> program;
+        // The base batch size can be overridden through the debug configuration.
+        GPU_DEBUG_IF(debug_config->base_batch_for_memory_estimation > 0) {
+            const int32_t user_specified_base_batch_size = debug_config->base_batch_for_memory_estimation;
+            base_batch_size = static_cast<size_t>(user_specified_base_batch_size);  // positive by the check above
+        }
+        // Reshape a clone of the network to the base batch size; the batch axis is derived from each input layout.
+        auto cloned_network = InferenceEngine::details::cloneNetwork(network);
+        auto inputs_info = cloned_network.getInputsInfo();
+        ICNNNetwork::InputShapes new_shapes;
+        // new_shapes: input name -> dims with the batch dimension replaced by base_batch_size
+        bool batch_detected = false;
+        for (auto& info : inputs_info) {
+            if (!info.second)
+                continue;
+            Layout layout = info.second->getLayout();
+            auto data = info.second->getInputData();
+            if (!data)
+                continue;
+            std::string name = info.second->getInputData()->getName();
+            auto shape = data->getTensorDesc().getDims();
+            if (layout == InferenceEngine::Layout::NCHW ||
+                layout == InferenceEngine::Layout::NHWC ||
+                layout == InferenceEngine::Layout::NCDHW ||
+                layout == InferenceEngine::Layout::NDHWC ||
+                layout == InferenceEngine::Layout::NC)  {
+                shape[0] = base_batch_size;
+                batch_detected = true;
+            } else if (layout == InferenceEngine::Layout::CN) {
+                shape[1] = base_batch_size;
+                batch_detected = true;
+            }
+            new_shapes[name] = shape;
+        }
+        if (batch_detected) { // reshape only for batched layout
+            cloned_network.reshape(new_shapes);
+            GPU_DEBUG_IF(debug_config->verbose >= 1) {
+                GPU_DEBUG_COUT << "Reshaped base batch size to " << base_batch_size << std::endl;
+            }
+        } else {
+            base_batch_size = 1;
+            GPU_DEBUG_IF(debug_config->verbose >= 1) {
+                GPU_DEBUG_COUT << "Batch dimension is not used in inputs." << std::endl;
+            }
+        }
+        // Build the reshaped clone with partialBuild enabled and read its estimated (const, general) memory usage.
+        auto nGraphFunc = cloned_network.getFunction();
+        TransformationsPipeline transformations(config, device_info);
+        transformations.apply(nGraphFunc);
+        program = std::make_shared<Program>(cloned_network, engine, config, false, true);
+        std::pair<int64_t, int64_t> device_memory_usage =  program->GetCompiledProgram(0)->get_estimated_device_mem_usage();
+        int64_t mem_for_general = std::max(static_cast<int64_t>(1L),
+                                  static_cast<int64_t>(static_cast<int64_t>(available_device_mem) - device_memory_usage.first));  // left after const allocations
+        int64_t mem_per_batch = std::max(static_cast<int64_t>(1L), (device_memory_usage.second / static_cast<int64_t>(base_batch_size)));  // general usage per single batch
+        max_batch_size = mem_for_general / (mem_per_batch * std::max(static_cast<int64_t>(1L), static_cast<int64_t>(n_streams)));  // 0 streams counts as 1: no division by zero
+        GPU_DEBUG_IF(debug_config->verbose >= 1) {
+            GPU_DEBUG_COUT << "Base batch size: " << base_batch_size  << std::endl;
+            GPU_DEBUG_COUT << "Const mem usage: " << device_memory_usage.first  << std::endl;
+            GPU_DEBUG_COUT << "General mem usage: " << device_memory_usage.second  << std::endl;
+        }
+        IE_SET_METRIC_RETURN(GPU_MAX_BATCH_SIZE, static_cast<int32_t>(max_batch_size));
    } else {
        IE_THROW() << "Unsupported metric key " << name;
    }
 }
-
 };  // namespace CLDNNPlugin

 static const Version version = { {2, 1}, CI_BUILD_NUMBER, "clDNNPlugin" };
--- a/inference-engine/src/cldnn_engine/cldnn_engine.h
+++ b/inference-engine/src/cldnn_engine/cldnn_engine.h
@@ -60,6 +60,31 @@ public:

    std::shared_ptr<InferenceEngine::RemoteContext> CreateContext(const InferenceEngine::ParamMap& params) override;
    std::shared_ptr<InferenceEngine::RemoteContext> GetDefaultContext(const InferenceEngine::ParamMap& params) override;
+
+    struct clDNNEngineParams {  // Settings produced by GetEngineParams() and passed on to cldnn::engine::create()
+        cldnn::queue_types queue_type;  // detected from an external queue, else in_order or out_of_order
+        cldnn::engine_types engine_type;  // GetEngineParams() sets this to ocl
+        cldnn::runtime_types runtime_type;  // GetEngineParams() sets this to ocl
+        bool use_unified_shared_memory;  // GetEngineParams() sets this to true
+        InferenceEngine::ITaskExecutor::Ptr task_executor;  // executor handed to the created engine
+    };
+
+    static clDNNEngineParams GetEngineParams(const Config& config, const cldnn::device::ptr& dev,  // Derives engine creation settings from config and device
+                                                InferenceEngine::gpu_handle_param external_queue = nullptr) {  // external_queue: optional user queue, nullptr if none
+        clDNNEngineParams params;
+        params.engine_type = cldnn::engine_types::ocl;
+        params.runtime_type = cldnn::runtime_types::ocl;
+        if (external_queue) {  // a user-supplied queue dictates the queue type
+            params.queue_type = cldnn::stream::detect_queue_type(params.engine_type, external_queue);
+        } else if (dev->get_info().supports_immad) {  // devices reporting supports_immad get an in-order queue
+            params.queue_type = cldnn::queue_types::in_order;
+        } else {  // every other device gets an out-of-order queue
+            params.queue_type = cldnn::queue_types::out_of_order;
+        }
+        params.use_unified_shared_memory = true;
+        params.task_executor = std::make_shared<InferenceEngine::CPUStreamsExecutor>(config.task_exec_config);  // a new executor on every call
+        return params;
+    }
 };

 };  // namespace CLDNNPlugin
--- a/inference-engine/src/cldnn_engine/cldnn_program.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp
@@ -93,7 +93,8 @@ bool Program::CanProcessDynBatch(std::vector<std::shared_ptr<ngraph::Node>> ops,
    return true;
 }

-Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::engine> engine, const Config& config, bool createTopologyOnly)
+Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::engine> engine, const Config& config,
+    bool createTopologyOnly, bool partialBuild)
    : m_config(config)
    , m_engine(engine)
    , m_curBatch(-1)
@@ -128,10 +129,10 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::en
            blobMemCache.clear();

            ChangeInputBatch(1U << static_cast<unsigned>(b));
-            m_programs.insert(m_programs.begin(), BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly));
+            m_programs.insert(m_programs.begin(), BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly, partialBuild));
        }
    } else {
-        m_programs.emplace_back(BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly));
+        m_programs.emplace_back(BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly, partialBuild));
    }
 }

@@ -175,7 +176,7 @@ void Program::CleanupBuild() {
 std::shared_ptr<cldnn::program> Program::BuildProgram(const std::vector<std::shared_ptr<ngraph::Node>>& ops,
                                                      InferenceEngine::InputsDataMap networkInputs,
                                                      InferenceEngine::OutputsDataMap networkOutputs,
-                                                      bool createTopologyOnly) {
+                                                      bool createTopologyOnly, bool partialBuild) {
    OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Program::BuildProgram");
    cldnn::build_options options;

@@ -185,7 +186,9 @@ std::shared_ptr<cldnn::program> Program::BuildProgram(const std::vector<std::sha

    options.set_option(cldnn::build_option::optimize_data(true));
    options.set_option(cldnn::build_option::tuning_config(m_config.tuningConfig));
-
+    if (partialBuild) {
+        options.set_option(cldnn::build_option::partial_build_program(true));
+    }
    PrepareBuild(networkInputs, networkOutputs);
    for (const auto& op : ops) {
        CreateSingleLayerPrimitive(*m_topology, op);
--- a/inference-engine/src/cldnn_engine/cldnn_program.h
+++ b/inference-engine/src/cldnn_engine/cldnn_program.h
@@ -66,7 +66,8 @@ public:

 class Program {
 public:
-    Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::engine> engine, const Config& config, bool createTopologyOnly = false);
+    Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::engine> engine, const Config& config,
+            bool createTopologyOnly = false, bool partialBuild = false);
    Program(std::shared_ptr<cldnn::engine> engine, const Config& config) : m_config(config), m_engine(engine),
            m_curBatch(-1), queryMode(false), m_max_batch(1) {}
    Program() : m_config(), m_engine(nullptr), m_curBatch(-1), queryMode(false), m_max_batch(1) {}
@@ -164,7 +165,7 @@ private:
    std::shared_ptr<cldnn::program> BuildProgram(const std::vector<std::shared_ptr<ngraph::Node>>& ops,
                                                 InferenceEngine::InputsDataMap networkInputs,
                                                 InferenceEngine::OutputsDataMap networkOutputs,
-                                                 bool createTopologyOnly = false);
+                                                 bool createTopologyOnly = false, bool partialBuild = false);

    void CreateSingleLayerPrimitive(cldnn::topology& topology, const std::shared_ptr<ngraph::Node>& op);
    bool CanProcessDynBatch(std::vector<std::shared_ptr<ngraph::Node>> ops, InferenceEngine::InputsDataMap networkInputs) const;
--- a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp
@@ -5,7 +5,7 @@
 #include <memory>
 #include "cldnn_remote_context.h"
 #include "cldnn_itt.h"
-
+#include "cldnn_engine.h"
 #include "cldnn/runtime/device_query.hpp"

 using namespace InferenceEngine;
@@ -274,34 +274,23 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
    auto iter = device_map.find(m_config.device_id);
    auto& dev = iter != device_map.end() ? iter->second : device_map.begin()->second;

-    {
-        OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNExecutionContextImpl::Create");
-        bool enable_profiling = (m_config.useProfiling ||
-                (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_tune_and_cache) ||
-                (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_retune_and_cache));
-        cldnn::queue_types queue_type;
-        if (m_external_queue) {
-            queue_type = cldnn::stream::detect_queue_type(engine_type, m_external_queue);
-        } else if (dev->get_info().supports_immad) {
-            queue_type = cldnn::queue_types::in_order;
-        } else {
-            queue_type = cldnn::queue_types::out_of_order;
-        }
+    bool enable_profiling = (m_config.useProfiling ||
+                            (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_tune_and_cache) ||
+                            (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_retune_and_cache));

-
-        ITaskExecutor::Ptr task_executor = std::make_shared<CPUStreamsExecutor>(m_config.task_exec_config);
-        bool use_unified_shared_memory = true;
-        m_engine = cldnn::engine::create(engine_type, runtime_type, dev,
-                                                    cldnn::engine_configuration(enable_profiling,
-                                                                                queue_type,
-                                                                                m_config.sources_dumps_dir,
-                                                                                m_config.queuePriority,
-                                                                                m_config.queueThrottle,
-                                                                                m_config.memory_pool_on,
-                                                                                use_unified_shared_memory,
-                                                                                m_config.kernels_cache_dir,
-                                                                                m_config.throughput_streams), task_executor);
-    }
+    auto engine_params = clDNNEngine::GetEngineParams(m_config, dev, m_external_queue);
+    m_engine = cldnn::engine::create(engine_params.engine_type,
+                                     engine_params.runtime_type, dev,
+                                     cldnn::engine_configuration(enable_profiling,
+                                         engine_params.queue_type,
+                                         m_config.sources_dumps_dir,
+                                         m_config.queuePriority,
+                                         m_config.queueThrottle,
+                                         m_config.memory_pool_on,
+                                         engine_params.use_unified_shared_memory,
+                                         m_config.kernels_cache_dir,
+                                         m_config.throughput_streams),
+                                     engine_params.task_executor);
 }

 ParamMap CLDNNExecutionContextImpl::getParams() const {
--- a/inference-engine/src/inference_engine/include/ie/gpu/gpu_config.hpp
+++ b/inference-engine/src/inference_engine/include/ie/gpu/gpu_config.hpp
@@ -51,6 +51,11 @@ DECLARE_GPU_METRIC_KEY(EXECUTION_UNITS_COUNT, int);
 */
 DECLARE_GPU_METRIC_KEY(MEMORY_STATISTICS, std::map<std::string, uint64_t>);

+/**
+ * @brief Metric to get maximum batch size which does not cause performance degradation due to memory swap impact.
+ */
+DECLARE_GPU_METRIC_KEY(MAX_BATCH_SIZE, uint32_t);
+
 /**
 * @brief Possible return value for OPTIMIZATION_CAPABILITIES metric
 *  - "HW_MATMUL" - Defines if device has hardware block for matrix multiplication
--- a/inference-engine/src/inference_engine/include/ie/ie_core.hpp
+++ b/inference-engine/src/inference_engine/include/ie/ie_core.hpp
@@ -242,9 +242,10 @@ public:
     *
     * @param deviceName - A name of a device to get a metric value.
     * @param name - metric name to request.
+     * @param options - optional parameters to get a metric value
     * @return Metric value corresponding to metric key.
     */
-    Parameter GetMetric(const std::string& deviceName, const std::string& name) const;
+    Parameter GetMetric(const std::string& deviceName, const std::string& name, const ParamMap& options = {}) const;

    /**
     * @brief Returns devices available for neural networks inference
--- a/inference-engine/src/inference_engine/src/ie_core.cpp
+++ b/inference-engine/src/inference_engine/src/ie_core.cpp
@@ -668,7 +668,9 @@ public:
        return res;
    }

-    ie::Parameter GetMetric(const std::string& deviceName, const std::string& name) const override {
+    ie::Parameter GetMetric(const std::string& deviceName,
+                            const std::string& name,
+                            const ie::ParamMap& options = {}) const override {
        // HETERO case
        {
            if (deviceName.find("HETERO:") == 0) {
@@ -697,6 +699,9 @@ public:
        }

        auto parsed = parseDeviceNameIntoConfig(deviceName);
+        for (auto o : options) {
+            parsed._config.insert(o);
+        }

        // we need to return a copy of Parameter object which is created on Core side,
        // not in InferenceEngine plugin side, which can be unloaded from Core in a parallel thread
@@ -1385,8 +1390,8 @@ Parameter Core::GetConfig(const std::string& deviceName, const std::string& name
        _impl->GetCPPPluginByName(parsed._deviceName).get_config(name, parsed._config));
 }

-Parameter Core::GetMetric(const std::string& deviceName, const std::string& name) const {
-    return _impl->GetMetric(deviceName, name);
+Parameter Core::GetMetric(const std::string& deviceName, const std::string& name, const ParamMap& options) const {
+    return _impl->GetMetric(deviceName, name, options);
 }

 std::vector<std::string> Core::GetAvailableDevices() const {
--- a/inference-engine/src/plugin_api/ie_icore.hpp
+++ b/inference-engine/src/plugin_api/ie_icore.hpp
@@ -110,7 +110,9 @@ public:
     * @param name - metric name to request.
     * @return Metric value corresponding to metric key.
     */
-    virtual Parameter GetMetric(const std::string& deviceName, const std::string& name) const = 0;
+    virtual Parameter GetMetric(const std::string& deviceName,
+                                const std::string& name,
+                                const ParamMap& options = {}) const = 0;

    /**
     * @brief Gets configuration dedicated to device behaviour.
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/core_integration.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/core_integration.cpp
@@ -115,6 +115,51 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values("GPU")
 );

+using IEClassGetMetricTest_GPU_MAX_BATCH_SIZE_DEFAULT = BehaviorTestsUtils::IEClassBaseTestP;
+TEST_P(IEClassGetMetricTest_GPU_MAX_BATCH_SIZE_DEFAULT, GetMetricAndPrintNoThrow) {  // MAX_BATCH_SIZE with only the required MODEL_PTR option
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+    InferenceEngine::Core ie;
+    InferenceEngine::Parameter p;
+
+    std::map<std::string, InferenceEngine::Parameter> _options = {{"MODEL_PTR", simpleCnnNetwork.getFunction()}};  // no optional keys set
+    ASSERT_NO_THROW(p = ie.GetMetric(deviceName, GPU_METRIC_KEY(MAX_BATCH_SIZE), _options).as<uint32_t>());  // the query and the uint32_t cast must both succeed
+    uint32_t t = p;
+
+    std::cout << "GPU device max available batch size: " << t << std::endl;  // printed only; the value itself is not asserted
+
+    ASSERT_METRIC_SUPPORTED_IE(GPU_METRIC_KEY(MAX_BATCH_SIZE));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+        nightly_IEClassExecutableNetworkGetMetricTest, IEClassGetMetricTest_GPU_MAX_BATCH_SIZE_DEFAULT,
+        ::testing::Values("GPU")
+);
+
+using IEClassGetMetricTest_GPU_MAX_BATCH_SIZE_STREAM_DEVICE_MEM = BehaviorTestsUtils::IEClassBaseTestP;
+TEST_P(IEClassGetMetricTest_GPU_MAX_BATCH_SIZE_STREAM_DEVICE_MEM, GetMetricAndPrintNoThrow) {  // MAX_BATCH_SIZE with both optional keys set
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+    InferenceEngine::Core ie;
+    InferenceEngine::Parameter p;
+    uint32_t n_streams = 2;
+    int64_t available_device_mem_size = 1073741824;  // 1 GiB, in bytes
+    std::map<std::string, InferenceEngine::Parameter> _options = {{"MODEL_PTR", simpleCnnNetwork.getFunction()}};
+    _options.insert(std::make_pair("GPU_THROUGHPUT_STREAMS", n_streams));  // passed as uint32_t, the type the plugin casts to
+    _options.insert(std::make_pair("AVAILABLE_DEVICE_MEM_SIZE", available_device_mem_size));  // passed as int64_t, the type the plugin casts to
+
+    ASSERT_NO_THROW(p = ie.GetMetric(deviceName, GPU_METRIC_KEY(MAX_BATCH_SIZE), _options).as<uint32_t>());  // the query and the uint32_t cast must both succeed
+
+    uint32_t t = p;
+
+    std::cout << "GPU device max available batch size: " << t << std::endl;  // printed only; the value itself is not asserted
+
+    ASSERT_METRIC_SUPPORTED_IE(GPU_METRIC_KEY(MAX_BATCH_SIZE));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+        nightly_IEClassExecutableNetworkGetMetricTest, IEClassGetMetricTest_GPU_MAX_BATCH_SIZE_STREAM_DEVICE_MEM,
+        ::testing::Values("GPU")
+);
+
 using IEClassGetMetricTest_GPU_UARCH_VERSION = BehaviorTestsUtils::IEClassBaseTestP;
 TEST_P(IEClassGetMetricTest_GPU_UARCH_VERSION, GetMetricAndPrintNoThrow) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()
--- a/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/interface/mock_icore.hpp
+++ b/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/interface/mock_icore.hpp
@@ -29,7 +29,7 @@ public:
    MOCK_CONST_METHOD3(QueryNetwork, InferenceEngine::QueryNetworkResult(
        const InferenceEngine::CNNNetwork&, const std::string&, const std::map<std::string, std::string>&));

-    MOCK_CONST_METHOD2(GetMetric, InferenceEngine::Parameter(const std::string&, const std::string&));
+    MOCK_CONST_METHOD3(GetMetric, InferenceEngine::Parameter(const std::string&, const std::string&, const std::map<std::string, InferenceEngine::Parameter>&));
    MOCK_CONST_METHOD2(GetConfig, InferenceEngine::Parameter(const std::string&, const std::string&));
    MOCK_CONST_METHOD0(GetAvailableDevices, std::vector<std::string>());
    MOCK_CONST_METHOD1(DeviceSupportsImportExport, bool(const std::string&)); // NOLINT not a cast to bool
--- a/inference-engine/thirdparty/clDNN/api/cldnn/graph/build_options.hpp
+++ b/inference-engine/thirdparty/clDNN/api/cldnn/graph/build_options.hpp
@@ -59,7 +59,8 @@ enum class build_option_type {
    /// @brief Name for serialization process
    serialize_network,
    load_program,
-    force_implementations
+    force_implementations,
+    partial_build_program
 };

 /// @brief Tuning mode.
@@ -141,6 +142,7 @@ struct build_option {
    /// @brief Specifies user defined implementation details to use.
    static std::shared_ptr<const build_option> force_implementations(implementation_forcing_map forcing);

+    static std::shared_ptr<const build_option> partial_build_program(bool set = false);
    virtual ~build_option() = default;

 private:
@@ -362,6 +364,12 @@ struct build_option_traits<build_option_type::force_implementations> {
    static std::shared_ptr<const build_option> make_default() { return build_option::force_implementations({}); }
 };

+template <>
+struct build_option_traits<build_option_type::partial_build_program> {  // Traits for the partial_build_program option
+    typedef build_option_bool<build_option_type::partial_build_program> object_type;  // the option is stored as a boolean option object
+    static std::shared_ptr<const build_option> make_default() { return build_option::partial_build_program(); }  // uses the factory's default argument
+};
+
 #endif
 }  // namespace detail

@@ -410,6 +418,11 @@ inline std::shared_ptr<const build_option> build_option::load_program(const std:
 inline std::shared_ptr<const build_option> build_option::force_implementations(implementation_forcing_map forcing) {
    return std::make_shared<build_option_force_implementations>(std::move(forcing));
 }
+
+inline std::shared_ptr<const build_option> build_option::partial_build_program(bool enable) {  // Factory for the partial_build_program option
+    return std::make_shared<build_option_bool<build_option_type::partial_build_program>>(enable);  // wraps the flag in a boolean option object
+}
+
 #endif

 /// @brief Represents program build options list.
--- a/inference-engine/thirdparty/clDNN/api/cldnn/graph/network.hpp
+++ b/inference-engine/thirdparty/clDNN/api/cldnn/graph/network.hpp
@@ -181,6 +181,9 @@ public:
                                    std::set<primitive_id> dependencies,
                                    allocation_type type,
                                    bool reusable = true);
+    memory_pool& get_memory_pool() {  // Returns a reference to the object _memory_pool refers to
+        return *_memory_pool;  // dereferenced without a null check
+    }

 private:
    using output_chains_map = std::map<primitive_id, std::vector<std::shared_ptr<primitive_inst>>>;
--- a/inference-engine/thirdparty/clDNN/api/cldnn/graph/program.hpp
+++ b/inference-engine/thirdparty/clDNN/api/cldnn/graph/program.hpp
@@ -233,6 +233,8 @@ public:
    void load_tuning_cache();
    std::shared_ptr<kernel_selector::TuningCache> get_tuning_cache() const { return tuning_cache; }

+    std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();
+
 private:
    uint32_t prog_id = 0;
    engine& _engine;
--- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
+++ b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/debug_configuration.hpp
@@ -34,6 +34,7 @@ public:
    std::string dump_layers;        // Dump intermediate buffers of specified layers only, separated by space
    std::string dry_run_path;       // Dry run and serialize execution graph into the specified path
    int dump_layers_dst_only;       // Dump only output of layers
+    int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
    static const debug_configuration *get_instance();
 };

--- a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp
@@ -109,7 +109,8 @@ debug_configuration::debug_configuration()
        , dump_layers(std::string())
        , dump_layers_dst_only(0)
        , dry_run_path(std::string())
-        , disable_onednn(0) {
+        , disable_onednn(0)
+        , base_batch_for_memory_estimation(-1) {
 #ifdef GPU_DEBUG_CONFIG
    get_common_debug_env_var("Verbose", verbose);
    get_gpu_debug_env_var("PrintMultiKernelPerf", print_multi_kernel_perf);
@@ -121,6 +122,7 @@ debug_configuration::debug_configuration()
    get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
    get_gpu_debug_env_var("DisableOnednn", disable_onednn);
    get_gpu_debug_env_var("DryRunPath", dry_run_path);
+    get_gpu_debug_env_var("BaseBatchForMemEstimation", base_batch_for_memory_estimation);

    if (dump_layers_path.length() > 0 && !disable_usm) {
        disable_usm = 1;
--- a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h
+++ b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h
@@ -151,6 +151,8 @@ public:
    }

    void allocate_internal_buffers();
+    static memory::ptr allocate_output(engine& engine, memory_pool& pool,
+                                        const program_node& _node, bool is_internal);

    std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }

--- a/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp
+++ b/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp
@@ -258,10 +258,16 @@ void primitive_inst::allocate_internal_buffers(void) {
            _intermediates_memory.push_back(engine.allocate_memory(layout, allocation_type::usm_host));
    }
 }
+memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, const program_node& _node,
+        bool is_internal) {
+    // Goes through the pool's dependency-aware lookup only when the engine enables use_memory_pool.
+    auto get_memory_from_pool = [&pool](engine& eng, const layout& layout, const primitive_id& id,
+            const std::set<primitive_id>& dependencies, allocation_type type, bool reusable) {
+        return eng.configuration().use_memory_pool ? pool.get_memory(layout, id, 0, dependencies, type, reusable)
+                                                   : pool.get_memory(layout, type);
+    };

-memory::ptr primitive_inst::allocate_output() {
    auto layout = _node.get_output_layout();
-    auto& engine = get_network().get_engine();
    // TODO: Add a preprocessing step to do  alloc_type check before actual allocation
    const auto& node_deps = _node.get_dependencies();
    auto device_mem_acc = [&](size_t a, program_node* b) {
@@ -270,54 +276,59 @@ memory::ptr primitive_inst::allocate_output() {

    bool usm_device_allocatable = true;
    const auto& total_device_input_mem_size = std::accumulate(node_deps.begin(), node_deps.end(), (uint64_t)0, device_mem_acc);
-    if (total_device_input_mem_size > engine.get_device_info().max_global_mem_size)
+    if (total_device_input_mem_size > _engine.get_device_info().max_global_mem_size)
        usm_device_allocatable = false;

    // For outputs, cpu prim we want to have lockable alloc type
    // Also if the successor of a node is an cpu, then memory needs to be lockable.
    auto use_lockable_memory = is_output_buffer(_node) || _node.get_selected_impl()->is_cpu() || is_any_user_cpu(_node.get_users()) ||
-                               !engine.supports_allocation(allocation_type::usm_device);
+                               !_engine.supports_allocation(allocation_type::usm_device);

    GPU_DEBUG_GET_INSTANCE(debug_config);
-    const auto& lockable_mem_type = engine.get_lockable_preffered_memory_allocation_type(layout.format.is_image_2d());
+    const auto& lockable_mem_type = _engine.get_lockable_preffered_memory_allocation_type(layout.format.is_image_2d());
    const auto& alloc_type = use_lockable_memory ? lockable_mem_type
-                             : usm_device_allocatable ? allocation_type::usm_device : lockable_mem_type;
+        : usm_device_allocatable ? allocation_type::usm_device : lockable_mem_type;

-    if (!_network.is_internal() && (_node.can_be_optimized() || _node.is_type<generic_layer>())) {
+    // Networks that are not internal take optimized-out nodes and generic_layer outputs from the pool as
+    // non-reusable buffers. The test has to be !is_internal: with a plain is_internal check every internal
+    // generic_layer would be caught here and the usm_device branch right below could never run.
+    if (!is_internal && (_node.can_be_optimized() || _node.is_type<generic_layer>())) {
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
        }
-        return _network.get_memory_from_pool(layout,
-                                             _node.id(),
-                                             _node.get_memory_dependencies(),
-                                             alloc_type,
-                                             false);
-    } else if (_network.is_internal() && _node.is_output() && _node.is_type<generic_layer>() &&
-               engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
+        return get_memory_from_pool(_engine, layout, _node.id(),
+                _node.get_memory_dependencies(), alloc_type, false);
+    } else if (is_internal && _node.is_output() && _node.is_type<generic_layer>() &&
+            _engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
+        // Device USM buffer, allocated with the same trailing false as the input_layout branch below (no memory reset).
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
        }
-        return engine.allocate_memory(layout, allocation_type::usm_device, false);
-    } else if (_network.is_internal() && !_node.is_output() && _node.is_type<input_layout>()) {
+        return _engine.allocate_memory(layout, allocation_type::usm_device, false);
+    } else if (is_internal && !_node.is_output() && _node.is_type<input_layout>()) {
        // Skip memory reset for input_layout primitives, since data will be copied from cldnn::data primitive
        // or just reuse primitive's memory
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": constant]" << std::endl;
        }
-        return engine.allocate_memory(layout, alloc_type, false);
-    } else if (_network.is_internal() || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
+        return _engine.allocate_memory(layout, alloc_type, false);
+    } else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
+        // Dedicated allocation straight from the engine, bypassing the pool.
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
        }
-        return engine.allocate_memory(layout, alloc_type);
+        return _engine.allocate_memory(layout, alloc_type);
    } else {
-        return _network.get_memory_from_pool(layout,
-                                             _node.id(),
-                                             _node.get_memory_dependencies(),
-                                             alloc_type,
-                                             true);
+        // Remaining nodes request a reusable buffer through the pool helper.
+        return get_memory_from_pool(_engine, layout, _node.id(),
+                _node.get_memory_dependencies(), alloc_type, true);
    }
 }
+
+// Instance overload: supplies this primitive's engine, network memory pool, node and internal flag.
+memory::ptr primitive_inst::allocate_output() {
+    return allocate_output(get_network().get_engine(), _network.get_memory_pool(), _node, _network.is_internal());
+}

 std::vector<std::shared_ptr<primitive_inst>> primitive_inst::build_exec_deps(
    std::vector<std::shared_ptr<primitive_inst>> const& deps) {
@@ -353,5 +364,4 @@ std::string primitive_inst::generic_to_string(program_node const& node, const ch

    return primitive_description.str();
 }
-
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/program.cpp
+++ b/inference-engine/thirdparty/clDNN/src/program.cpp
@@ -84,6 +84,7 @@
 #include <utility>
 #include <vector>
 #include <stdexcept>
+#include <unordered_set>

 program::program(engine& engine_ref,
                 topology const& topology,
@@ -434,6 +435,11 @@ void program::build_program(bool is_internal) {
    {
 #endif
        prepare_memory_dependencies();
+
+        if (options.get<build_option_type::partial_build_program>()->enabled()) {
+            return;
+        }
+
        compile();
        init_kernels();
    }
@@ -551,7 +557,7 @@ void program::post_optimize_graph(bool is_internal) {

    apply_opt_pass<remove_redundant_reorders>(lo, false, true);  // TODO: do we need it at this place also?

-    if (!is_internal) {
+    if (!is_internal && !options.get<build_option_type::partial_build_program>()->enabled()) {
        // ToDo remove hidden dependencies from propagate_constants pass
        apply_opt_pass<propagate_constants>();
    }
@@ -1394,3 +1400,44 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
        lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 1);
 #endif
 }
+
+// Estimates device memory use of this program: {bytes of constant outputs, engine-reported usm_device bytes in use}.
+std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
+    auto& eng = get_engine();
+    const auto alloc_limit = eng.get_device_info().max_alloc_mem_size;
+    memory_pool scratch_pool(eng);
+    int64_t constants_bytes = 0;
+
+    // Candidate nodes, ordered from the largest output down to the smallest.
+    std::vector<program_node*> candidates{};
+    for (auto n : processing_order)
+        candidates.push_back(n);
+    std::sort(candidates.begin(), candidates.end(), [](program_node* const& a, program_node* const& b) {
+        return a->get_output_layout().bytes_count() > b->get_output_layout().bytes_count();
+    });
+
+    // Keeps every trial buffer alive so nothing is freed while allocations are still being made.
+    std::unordered_set<memory::ptr> live_buffers;
+    for (const auto& n : candidates) {
+        const auto bytes = n->get_output_layout().bytes_count();
+        // Outputs larger than max_alloc_mem_size are left out: to be allocated to host.
+        // to consider: if the base batch size is > 1, should we allow this single output allocation to host?
+        if (bytes > alloc_limit || n->can_be_optimized())
+            continue;
+        if (n->is_type<data>() && n->get_users().size() == 1 && n->have_user_with_type<generic_layer>())
+            continue;
+        const bool counted_as_constant =
+            n->is_type<data>() || (n->is_type<generic_layer>() && n->get_dependency(0).is_type<data>());
+        if (counted_as_constant) {
+            constants_bytes += bytes;
+            continue;
+        }
+        // A node whose only user is a concatenation that can be optimized gets no buffer of its own.
+        const bool sole_user_is_optimized_concat = n->have_user_with_type<concatenation>() &&
+            n->get_users().size() == 1 && n->get_users().front()->can_be_optimized();
+        if (!sole_user_is_optimized_concat)
+            live_buffers.insert(primitive_inst::allocate_output(eng, scratch_pool, *n, false));
+    }
+
+    return std::make_pair(constants_bytes, eng.get_used_device_memory(allocation_type::usm_device));
+}
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/test_device_mem_usage_estimation.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/test_device_mem_usage_estimation.cpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cstddef>
+
+#include "test_utils.h"
+#include <cldnn/primitives/permute.hpp>
+#include <cldnn/primitives/eltwise.hpp>
+
+using namespace cldnn;
+using namespace tests;
+
+// Compares the program-level estimate with the usm_device usage reported after building the real network.
+TEST(test_device_mem_usage_estimation, basic) {
+    // Engine on which the program is built and the estimate is taken.
+    std::shared_ptr<cldnn::engine> estimation_engine = create_test_engine(cldnn::queue_types::out_of_order);
+    auto est_in_a = estimation_engine->allocate_memory({ data_types::f16, format::bfyx,{ 2, 2, 256, 256} });
+    auto est_in_b = estimation_engine->allocate_memory({ data_types::f16, format::bfyx,{ 2, 2, 256, 256} });
+
+    topology topo(
+        input_layout("input1", est_in_a->get_layout()),
+        input_layout("input2", est_in_b->get_layout()),
+        permute("permute1", "input1", { 0, 2, 3, 1 }),
+        permute("permute2", "input2", { 0, 3, 2, 1 }),
+        eltwise("eltw", {"permute1", "permute2"}, eltwise_mode::sum, data_types::f16),
+        reorder("output", "eltw", format::bfyx, data_types::f32));
+    auto prog = program::build_program(*estimation_engine, topo, build_options());
+    const std::pair<int64_t, int64_t> estimate = prog->get_estimated_device_mem_usage();
+
+    // Engine on which the actual network is built; its usm_device usage is the reference value.
+    std::shared_ptr<cldnn::engine> runtime_engine = create_test_engine(cldnn::queue_types::out_of_order);
+    auto run_in_a = runtime_engine->allocate_memory({ data_types::f16, format::bfyx,{ 2, 2, 256, 256} });
+    auto run_in_b = runtime_engine->allocate_memory({ data_types::f16, format::bfyx,{ 2, 2, 256, 256} });
+    network net(*runtime_engine, topo);
+    net.set_input_data("input1", run_in_a);
+    net.set_input_data("input2", run_in_b);
+    ASSERT_EQ(estimate.first + estimate.second, runtime_engine->get_used_device_memory(allocation_type::usm_device));
+}