[GPU] Fix some performance degradations from breaking GPU pipeline into explicit stages (#8084)

Mikhail Letavin 2021-11-01 10:57:29 +03:00 committed by GitHub
parent 4122ef50d6
commit 03106e0cd9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 53 additions and 28 deletions

View File

@@ -19,27 +19,29 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ
                     OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::PreprocessingAndStartPipeline");
                     _inferRequest->preprocess();
                     _inferRequest->enqueue();
-        } });
-    }
-    _pipeline.push_back({_waitExecutor,
-                [this] {
-                    OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
                     _inferRequest->wait();
-                }});
+        } });
+    } else {
+        _pipeline.push_back({ _waitExecutor,
+                [this] {
+                    OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
+                    _inferRequest->wait_notify();
+        } });
+    }
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::Infer_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess();
-        _inferRequest->enqueue();
+        _inferRequest->preprocess_notify();
+        _inferRequest->enqueue_notify();
     }
     Parent::Infer_ThreadUnsafe();
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::StartAsync_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess();
-        _inferRequest->enqueue();
+        _inferRequest->preprocess_notify();
+        _inferRequest->enqueue_notify();
     }
     Parent::StartAsync_ThreadUnsafe();
 }
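With this change the regular (internal-queue) request runs preprocess, enqueue and wait as one pipeline stage on the stream executor, instead of handing off to the separate GPUWaitExecutor between enqueue and wait; only the external-queue path keeps a dedicated wait_notify stage. Below is a standalone sketch of that stage/executor model — simplified stand-ins, not InferenceEngine's real ITaskExecutor pipeline types:

#include <functional>
#include <future>
#include <iostream>
#include <utility>
#include <vector>

// Toy model: a pipeline is a list of (executor, task) stages, and every
// stage boundary re-schedules the request onto that stage's executor.
using Task = std::function<void()>;
struct Executor {
    void run(const Task& task) { std::async(std::launch::async, task).wait(); }
};
using Stage = std::pair<Executor*, Task>;

int main() {
    Executor streamExecutor, waitExecutor;
    // Before: two stages, so every request pays a hand-off to the wait executor.
    std::vector<Stage> before = {
        { &streamExecutor, [] { std::cout << "preprocess + enqueue\n"; } },
        { &waitExecutor,   [] { std::cout << "wait\n"; } },
    };
    // After (internal queue): one merged stage, no hand-off between enqueue and wait.
    std::vector<Stage> after = {
        { &streamExecutor, [] { std::cout << "preprocess + enqueue + wait\n"; } },
    };
    for (auto& s : before) s.first->run(s.second);
    for (auto& s : after)  s.first->run(s.second);
}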

View File

@@ -48,7 +48,8 @@ CLDNNExecNetwork::CLDNNExecNetwork(InferenceEngine::CNNNetwork &network, std::sh
     }()},
     m_config(config),
     m_taskExecutor{ _taskExecutor },
-    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor" })) {
+    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor",
+                                                                                                config.throughput_streams > 1 ? config.throughput_streams : 1 })) {
     auto casted_context = std::dynamic_pointer_cast<gpu::ClContext>(context);
     if (nullptr == casted_context) {
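The braced initializer here builds an InferenceEngine::IStreamsExecutor::Config, whose second element is the executor's stream count, so the shared wait executor now scales with config.throughput_streams instead of staying single-streamed. A rough field-by-field equivalent, as a sketch only — it assumes the 2021.x plugin-API headers and the Config's public _name/_streams members, and reuses the constructor's config variable from the hunk above:

#include <threading/ie_executor_manager.hpp>
#include <threading/ie_istreams_executor.hpp>

// Same executor, spelled out: one wait stream per throughput stream, so
// wait() calls from concurrent infer requests stop serializing on one thread.
InferenceEngine::IStreamsExecutor::Config waitCfg;
waitCfg._name = "GPUWaitExecutor";
waitCfg._streams = config.throughput_streams > 1 ? config.throughput_streams : 1;
auto waitExecutor = InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor(waitCfg);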

View File

@@ -471,28 +471,32 @@ CLDNNInferRequest::CLDNNInferRequest(const std::vector<std::shared_ptr<const ov:
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- internal pipeline stages ----------------------------------- //
 // ----------------------------------------------------------------------------------------- //
-void CLDNNInferRequest::preprocess() {
-    int streamID = 0;
-    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
-    if (nullptr != streamExecutor) {
-        streamID = streamExecutor->GetStreamId();
-        int numGraphs = streamGraphs.size();
-        streamID = streamID % numGraphs;
-    }
-    m_graph = streamGraphs[streamID];
+void CLDNNInferRequest::preprocess_notify() {
+    setStreamGraph();
     m_graph->wait(CLDNNGraph::Stage::PREPROC);
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         preprocess_dynamic();
-        return;
+    } else {
+        execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
     }
-    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
     m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
-void CLDNNInferRequest::enqueue() {
+void CLDNNInferRequest::preprocess() {
+    setStreamGraph();
+    if (m_graph->GetMaxDynamicBatchSize() > 1) {
+        preprocess_dynamic();
+    } else {
+        execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
+    }
+}
+
+void CLDNNInferRequest::enqueue_notify() {
     m_graph->wait(CLDNNGraph::Stage::EXECUTE);
+    enqueue();
+}
+
+void CLDNNInferRequest::enqueue() {
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         enqueue_dynamic();
         return;
@@ -541,6 +545,11 @@ void CLDNNInferRequest::enqueue() {
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
 }
 
+void CLDNNInferRequest::wait_notify() {
+    wait();
+    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
+}
+
 void CLDNNInferRequest::wait() {
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         wait_dynamic();
@@ -568,13 +577,11 @@ void CLDNNInferRequest::wait() {
     if (m_useProfiling) {
         m_graph->UpdatePerfStatistics();
     }
-    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 void CLDNNInferRequest::preprocess_dynamic() {
     // execute input pre-processing.
     execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
-    m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
 void CLDNNInferRequest::enqueue_dynamic() {
@@ -619,12 +626,21 @@ void CLDNNInferRequest::wait_dynamic() {
             }
         }
     }
-    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- internal utils --------- ----------------------------------- //
 // ----------------------------------------------------------------------------------------- //
+void CLDNNInferRequest::setStreamGraph() {
+    int streamID = 0;
+    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
+    if (nullptr != streamExecutor) {
+        streamID = streamExecutor->GetStreamId();
+        int numGraphs = streamGraphs.size();
+        streamID = streamID % numGraphs;
+    }
+    m_graph = streamGraphs[streamID];
+}
+
 Blob::Ptr CLDNNInferRequest::create_host_blob(const TensorDesc& desc, uint8_t* mem_ptr) {
     OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::create_host_blob");
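Net effect in this file: the Stage::PREPROC/Stage::EXECUTE wait/notify pairs now live only in the new preprocess_notify/enqueue_notify/wait_notify wrappers, the plain preprocess/enqueue/wait used by the merged async pipeline stage skip that gating entirely, and the stream-to-graph lookup is factored out into setStreamGraph(). Below is a standalone model of what such a stage gate does — illustrative only, CLDNNGraph's real wait/notify may well be implemented differently:

#include <condition_variable>
#include <mutex>

// One gate per pipeline stage: wait() blocks until the previous request has
// left the stage, notify() releases the stage for the next request.
class StageGate {
public:
    void wait() {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait(lock, [this] { return !m_busy; });
        m_busy = true;
    }
    void notify() {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_busy = false;
        }
        m_cv.notify_one();
    }

private:
    std::mutex m_mutex;
    std::condition_variable m_cv;
    bool m_busy = false;
};

int main() {
    StageGate executeStage;
    executeStage.wait();    // as in enqueue_notify(): take the EXECUTE stage
    executeStage.notify();  // as in wait_notify(): release it after wait()
}

Keeping this gate off the internal-queue path removes two lock/notify round-trips per inference, which is presumably where part of the reported regression came from.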

View File

@@ -49,6 +49,10 @@ public:
     void EnableProfiling() { m_useProfiling = true; }
     void EnableStreams() { m_useStreams = true; }
 
+    void preprocess_notify();
+    void enqueue_notify();
+    void wait_notify();
+
     void preprocess();
     void enqueue();
     void wait();
@@ -92,6 +96,8 @@ private:
     void allocate_inputs_dynamic();
     void allocate_outputs_dynamic();
 
+    void setStreamGraph();
+
     std::map<cldnn::primitive_id, cldnn::network_output> internal_outputs;
     std::vector<std::map<cldnn::primitive_id, cldnn::network_output>> internal_outputs_dynamic;
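Resulting call map after this commit: the merged async pipeline stage calls the plain preprocess()/enqueue()/wait(); Infer_ThreadUnsafe() and StartAsync_ThreadUnsafe() call preprocess_notify()/enqueue_notify() only when use_external_queue() is true, and the external-queue pipeline stage finishes via wait_notify().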