From 0b0202b90c4af3e59e955b43b77ee14b00e56d82 Mon Sep 17 00:00:00 2001
From: Evgeny Talanin
Date: Tue, 2 Nov 2021 17:35:37 +0300
Subject: [PATCH] Revert "[GPU] Fix some performance degradations from
 breaking GPU pipeline into explicit stages (#8084)" (#8372)

This reverts commit 03106e0cd9363bfc83b9183dcfd18fc05e4631c7.
---
 .../cldnn_async_infer_request.cpp             | 20 ++++----
 .../cldnn_engine/cldnn_executable_network.cpp |  3 +-
 .../src/cldnn_engine/cldnn_infer_request.cpp  | 50 +++++++------------
 .../src/cldnn_engine/cldnn_infer_request.h    |  6 ---
 4 files changed, 27 insertions(+), 52 deletions(-)

diff --git a/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp
index 358d8c993a4..9e69ddeb0c8 100644
--- a/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp
@@ -19,29 +19,27 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ
                         OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::PreprocessingAndStartPipeline");
                         _inferRequest->preprocess();
                         _inferRequest->enqueue();
-                        _inferRequest->wait();
                     } });
-    } else {
-        _pipeline.push_back({ _waitExecutor,
-                        [this] {
-                            OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
-                            _inferRequest->wait_notify();
-                        } });
     }
+    _pipeline.push_back({_waitExecutor,
+                    [this] {
+                        OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
+                        _inferRequest->wait();
+                    }});
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::Infer_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess_notify();
-        _inferRequest->enqueue_notify();
+        _inferRequest->preprocess();
+        _inferRequest->enqueue();
     }
     Parent::Infer_ThreadUnsafe();
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::StartAsync_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess_notify();
-        _inferRequest->enqueue_notify();
+        _inferRequest->preprocess();
+        _inferRequest->enqueue();
     }
     Parent::StartAsync_ThreadUnsafe();
 }
diff --git a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp
index c554ee1c42f..7e465f0f257 100644
--- a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp
@@ -48,8 +48,7 @@ CLDNNExecNetwork::CLDNNExecNetwork(InferenceEngine::CNNNetwork &network, std::sh
     }()},
     m_config(config),
     m_taskExecutor{ _taskExecutor },
-    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor",
-                                                                                                config.throughput_streams > 1 ? config.throughput_streams : 1 })) {
+    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor" })) {
     auto casted_context = std::dynamic_pointer_cast<gpu::ClContext>(context);
     if (nullptr == casted_context) {
diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
index 92ba0ee3fdd..c71acf4e6d9 100644
--- a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
@@ -471,32 +471,28 @@ CLDNNInferRequest::CLDNNInferRequest(const std::vector<std::shared_ptr<const ov
 }
 
-void CLDNNInferRequest::preprocess_notify() {
+void CLDNNInferRequest::preprocess() {
+    int streamID = 0;
+    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
+    if (nullptr != streamExecutor) {
+        streamID = streamExecutor->GetStreamId();
+        int numGraphs = streamGraphs.size();
+        streamID = streamID % numGraphs;
+    }
+    m_graph = streamGraphs[streamID];
+
     m_graph->wait(CLDNNGraph::Stage::PREPROC);
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         preprocess_dynamic();
-    } else {
-        execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
+        return;
     }
+    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
     m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
-void CLDNNInferRequest::preprocess() {
-    setStreamGraph();
-    if (m_graph->GetMaxDynamicBatchSize() > 1) {
-        preprocess_dynamic();
-    } else {
-        execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
-    }
-}
-
-void CLDNNInferRequest::enqueue_notify() {
-    m_graph->wait(CLDNNGraph::Stage::EXECUTE);
-    enqueue();
-}
-
 void CLDNNInferRequest::enqueue() {
+    m_graph->wait(CLDNNGraph::Stage::EXECUTE);
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         enqueue_dynamic();
         return;
@@ -545,11 +541,6 @@ void CLDNNInferRequest::enqueue() {
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
 }
 
-void CLDNNInferRequest::wait_notify() {
-    wait();
-    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
-}
-
 void CLDNNInferRequest::wait() {
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         wait_dynamic();
@@ -577,11 +568,13 @@ void CLDNNInferRequest::wait() {
     if (m_useProfiling) {
         m_graph->UpdatePerfStatistics();
     }
+    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 void CLDNNInferRequest::preprocess_dynamic() {
     // execute input pre-processing.
     execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
+    m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
 void CLDNNInferRequest::enqueue_dynamic() {
@@ -626,21 +619,12 @@ void CLDNNInferRequest::wait_dynamic() {
             }
         }
     }
+    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- internal utils --------- ----------------------------------- //
 // ----------------------------------------------------------------------------------------- //
-void CLDNNInferRequest::setStreamGraph() {
-    int streamID = 0;
-    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
-    if (nullptr != streamExecutor) {
-        streamID = streamExecutor->GetStreamId();
-        int numGraphs = streamGraphs.size();
-        streamID = streamID % numGraphs;
-    }
-    m_graph = streamGraphs[streamID];
-}
 
 Blob::Ptr CLDNNInferRequest::create_host_blob(const TensorDesc& desc, uint8_t* mem_ptr) {
     OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::create_host_blob");
diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.h b/inference-engine/src/cldnn_engine/cldnn_infer_request.h
index a4eff5b0c91..72c924b015e 100644
--- a/inference-engine/src/cldnn_engine/cldnn_infer_request.h
+++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.h
@@ -49,10 +49,6 @@ public:
     void EnableProfiling() { m_useProfiling = true; }
     void EnableStreams() { m_useStreams = true; }
 
-    void preprocess_notify();
-    void enqueue_notify();
-    void wait_notify();
-
     void preprocess();
     void enqueue();
     void wait();
@@ -96,8 +92,6 @@ private:
     void allocate_inputs_dynamic();
     void allocate_outputs_dynamic();
 
-    void setStreamGraph();
-
     std::map<cldnn::primitive_id, cldnn::network_output> internal_outputs;
     std::vector<std::map<cldnn::primitive_id, cldnn::network_output>> internal_outputs_dynamic;
 };
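
For readers outside the cldnn plugin, the CLDNNGraph::Stage wait()/notify() pairs that this revert moves back into preprocess(), wait(), and the *_dynamic() variants act as per-stage gates on a graph shared by several infer requests: a request blocks in wait(Stage) until the stage is free, and notify(Stage) releases it for the next request. The sketch below illustrates only that gating pattern; StageGate and all of its members are hypothetical names invented for this illustration, not CLDNNGraph's actual implementation, which may differ.

#include <condition_variable>
#include <cstddef>
#include <mutex>

// Hypothetical sketch of per-stage gating in the spirit of
// CLDNNGraph::wait()/notify() as used in the diff above.
class StageGate {
public:
    enum class Stage { PREPROC = 0, EXECUTE = 1 };

    // Block until the stage is free, then claim it for the calling request.
    void wait(Stage s) {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait(lock, [&] { return !m_busy[idx(s)]; });
        m_busy[idx(s)] = true;
    }

    // Release the stage and wake any request blocked in wait().
    void notify(Stage s) {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_busy[idx(s)] = false;
        }
        m_cv.notify_all();
    }

private:
    static std::size_t idx(Stage s) { return static_cast<std::size_t>(s); }

    std::mutex m_mutex;
    std::condition_variable m_cv;
    bool m_busy[2] = {false, false};  // one flag per pipeline stage
};

Under this reading, the net effect of the revert is that the synchronous methods bracket the stages themselves again: preprocess() claims and releases PREPROC, enqueue() claims EXECUTE, and wait()/wait_dynamic() release it, instead of the separate *_notify() entry points that #8084 had introduced for the async pipeline.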