WA for MAX_BATCH_SIZE failure; remove batch 4 as the minimum for auto-batching

myshevts 2021-12-07 14:32:59 +03:00
parent 71c7986bcc
commit c3a98fa6a1

@@ -732,7 +732,6 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
         unsigned int closest = pow(2, floor(log(max_batch_size)/log(2)));
         std::cout << "!!!!!!!!!!!!!! (CLOSEST):" << closest << std::endl;
         batch = std::min(closest, batch);
-        batch = std::max(4u, batch); //batch 4 is a min
         batch = std::min(256u, batch); //batch 256 is a max
         std::cout << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
         IE_SET_METRIC_RETURN(OPTIMAL_BATCH, batch);
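
With the minimum of 4 removed, OPTIMAL_BATCH is now simply the earlier max_batch_size value rounded down to a power of two, clamped to the requested batch and capped at 256. A minimal standalone sketch of that clamping logic; the helper name and main() are illustrative only, not part of the plugin:

#include <algorithm>
#include <cmath>
#include <iostream>

// Illustrative helper mirroring the OPTIMAL_BATCH clamping above: round the
// limit down to a power of two, never exceed the requested batch, cap at 256.
// The former lower bound of 4 is gone.
unsigned int clamp_optimal_batch(unsigned int max_batch_size, unsigned int requested_batch) {
    unsigned int closest = static_cast<unsigned int>(
        std::pow(2.0, std::floor(std::log(static_cast<double>(max_batch_size)) / std::log(2.0))));
    unsigned int batch = std::min(closest, requested_batch);
    batch = std::min(256u, batch);  // batch 256 is a max
    return batch;
}

int main() {
    std::cout << clamp_optimal_batch(100, 32) << std::endl;  // 32 (closest power of two is 64)
    std::cout << clamp_optimal_batch(3, 32) << std::endl;    // 2, no longer bumped up to 4
}
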
@@ -796,7 +795,7 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
                            << " (occupied: " << occupied_device_mem << ")" << std::endl;
         }
-        int64_t max_batch_size = 0;
+        int64_t max_batch_size = 1;
         if (options.find("MODEL_PTR") == options.end()) {
             GPU_DEBUG_IF(debug_config->verbose >= 1) {
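
Initializing max_batch_size to 1 instead of 0 pairs with the try/catch added in the next hunk: if the memory-based estimation throws, GPU_MAX_BATCH_SIZE now reports a usable value of 1 rather than 0. A compilable sketch of that fallback pattern; estimate_max_batch() here is a stand-in, not the plugin's API:

#include <cstdint>
#include <iostream>
#include <stdexcept>

// Stand-in for the memory-based estimation performed in the next hunk.
int64_t estimate_max_batch() {
    throw std::runtime_error("estimation failed");  // simulate the failure case
}

int32_t reported_max_batch() {
    int64_t max_batch_size = 1;  // workaround: start from 1, not 0
    try {
        max_batch_size = estimate_max_batch();
    } catch (...) {
        // keep the safe default of 1 on any failure
    }
    return static_cast<int32_t>(max_batch_size);
}

int main() {
    std::cout << reported_max_batch() << std::endl;  // prints 1
}
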
@@ -856,55 +855,61 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
             auto cloned_network = InferenceEngine::details::cloneNetwork(network);
             auto inputs_info = cloned_network.getInputsInfo();
             ICNNNetwork::InputShapes new_shapes;
-            //std::map<std::string, SizeVector>;
-            bool batch_detected = false;
-            for (auto& info : inputs_info) {
-                if (!info.second)
-                    continue;
-                Layout layout = info.second->getLayout();
-                auto data = info.second->getInputData();
-                if (!data)
-                    continue;
-                std::string name = info.second->getInputData()->getName();
-                auto shape = data->getTensorDesc().getDims();
-                if (layout == InferenceEngine::Layout::NCHW ||
-                    layout == InferenceEngine::Layout::NHWC ||
-                    layout == InferenceEngine::Layout::NCDHW ||
-                    layout == InferenceEngine::Layout::NDHWC ||
-                    layout == InferenceEngine::Layout::NC) {
-                    shape[0] = base_batch_size;
-                    batch_detected = true;
-                } else if (layout == InferenceEngine::Layout::CN) {
-                    shape[1] = base_batch_size;
-                    batch_detected = true;
-                }
-                new_shapes[name] = shape;
-            }
-            if (batch_detected) { // reshape only for batched layout
-                cloned_network.reshape(new_shapes);
-                GPU_DEBUG_IF(debug_config->verbose >= 1) {
-                    GPU_DEBUG_COUT << "Reshaped base batch size to " << base_batch_size << std::endl;
-                }
-            } else {
-                base_batch_size = 1;
-                GPU_DEBUG_IF(debug_config->verbose >= 1) {
-                    GPU_DEBUG_COUT << "Batch dimension is not used in inputs." << std::endl;
-                }
-            }
-            auto nGraphFunc = cloned_network.getFunction();
-            TransformationsPipeline transformations(config, device_info);
-            transformations.apply(nGraphFunc);
-            program = std::make_shared<Program>(cloned_network, engine, config, false, true);
-            std::pair<int64_t, int64_t> device_memory_usage = program->GetCompiledProgram(0)->get_estimated_device_mem_usage();
-            int64_t mem_for_general = std::max(static_cast<int64_t>(1L),
-                    static_cast<int64_t>(static_cast<int64_t>(available_device_mem) - device_memory_usage.first));
-            int64_t mem_per_batch = std::max(static_cast<int64_t>(1L), (device_memory_usage.second / static_cast<int64_t>(base_batch_size)));
-            max_batch_size = mem_for_general / (mem_per_batch * static_cast<int64_t>(n_streams));
-            GPU_DEBUG_IF(debug_config->verbose >= 1) {
-                GPU_DEBUG_COUT << "Base batch size: " << base_batch_size << std::endl;
-                GPU_DEBUG_COUT << "Const mem usage: " << device_memory_usage.first << std::endl;
-                GPU_DEBUG_COUT << "General mem usage: " << device_memory_usage.second << std::endl;
-            }
+            try {
+                //std::map<std::string, SizeVector>;
+                bool batch_detected = false;
+                for (auto &info : inputs_info) {
+                    if (!info.second)
+                        continue;
+                    Layout layout = info.second->getLayout();
+                    auto data = info.second->getInputData();
+                    if (!data)
+                        continue;
+                    std::string name = info.second->getInputData()->getName();
+                    auto shape = data->getTensorDesc().getDims();
+                    if (layout == InferenceEngine::Layout::NCHW ||
+                        layout == InferenceEngine::Layout::NHWC ||
+                        layout == InferenceEngine::Layout::NCDHW ||
+                        layout == InferenceEngine::Layout::NDHWC ||
+                        layout == InferenceEngine::Layout::NC) {
+                        shape[0] = base_batch_size;
+                        batch_detected = true;
+                    } else if (layout == InferenceEngine::Layout::CN) {
+                        shape[1] = base_batch_size;
+                        batch_detected = true;
+                    }
+                    new_shapes[name] = shape;
+                }
+                if (batch_detected) { // reshape only for batched layout
+                    cloned_network.reshape(new_shapes);
+                    GPU_DEBUG_IF(debug_config->verbose >= 1) {
+                        GPU_DEBUG_COUT << "Reshaped base batch size to " << base_batch_size << std::endl;
+                    }
+                } else {
+                    base_batch_size = 1;
+                    GPU_DEBUG_IF(debug_config->verbose >= 1) {
+                        GPU_DEBUG_COUT << "Batch dimension is not used in inputs." << std::endl;
+                    }
+                }
+                auto nGraphFunc = cloned_network.getFunction();
+                TransformationsPipeline transformations(config, device_info);
+                transformations.apply(nGraphFunc);
+                program = std::make_shared<Program>(cloned_network, engine, config, false, true);
+                std::pair<int64_t, int64_t> device_memory_usage = program->GetCompiledProgram(
+                        0)->get_estimated_device_mem_usage();
+                int64_t mem_for_general = std::max(static_cast<int64_t>(1L),
+                        static_cast<int64_t>(static_cast<int64_t>(available_device_mem) -
+                                             device_memory_usage.first));
+                int64_t mem_per_batch = std::max(static_cast<int64_t>(1L),
+                        (device_memory_usage.second / static_cast<int64_t>(base_batch_size)));
+                max_batch_size = mem_for_general / (mem_per_batch * static_cast<int64_t>(n_streams));
+                GPU_DEBUG_IF(debug_config->verbose >= 1) {
+                    GPU_DEBUG_COUT << "Base batch size: " << base_batch_size << std::endl;
+                    GPU_DEBUG_COUT << "Const mem usage: " << device_memory_usage.first << std::endl;
+                    GPU_DEBUG_COUT << "General mem usage: " << device_memory_usage.second << std::endl;
+                }
+            } catch (...) {
+            }
             IE_SET_METRIC_RETURN(GPU_MAX_BATCH_SIZE, static_cast<int32_t>(max_batch_size));
         } else {
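
The estimation inside the try block reshapes a clone of the network to base_batch_size, compiles it, reads the estimated device memory usage (the constant portion in .first, logged as "Const mem usage", and the batch-dependent portion in .second, logged as "General mem usage"), and then derives max_batch_size as (available_device_mem - const_mem) / (per_batch_mem * n_streams). A worked example with purely illustrative numbers; the variable names mirror the code above, the values are made up:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
    // Illustrative numbers only, not measured values.
    int64_t available_device_mem = 8LL * 1024 * 1024 * 1024;  // 8 GiB usable on the device
    int64_t const_mem   = 1LL * 1024 * 1024 * 1024;           // constants/weights (usage .first)
    int64_t dynamic_mem = 2LL * 1024 * 1024 * 1024;           // activations at base batch (usage .second)
    int64_t base_batch_size = 16;
    int64_t n_streams = 2;

    int64_t mem_for_general = std::max<int64_t>(1, available_device_mem - const_mem);  // 7 GiB
    int64_t mem_per_batch   = std::max<int64_t>(1, dynamic_mem / base_batch_size);     // 128 MiB
    int64_t max_batch_size  = mem_for_general / (mem_per_batch * n_streams);           // 28
    std::cout << "estimated max batch: " << max_batch_size << std::endl;
}
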