[GPU] Fix some performance degradations from breaking GPU pipeline into explicit stages (#8084)

Mikhail Letavin 2021-11-01 10:57:29 +03:00 committed by GitHub
parent 4122ef50d6
commit 03106e0cd9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 53 additions and 28 deletions

View File

@@ -19,27 +19,29 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ
                     OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::PreprocessingAndStartPipeline");
                     _inferRequest->preprocess();
                     _inferRequest->enqueue();
-        } });
-    }
-    _pipeline.push_back({_waitExecutor,
-                [this] {
-                    OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
                     _inferRequest->wait();
-                }});
+        } });
+    } else {
+        _pipeline.push_back({ _waitExecutor,
+                [this] {
+                    OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
+                    _inferRequest->wait_notify();
+        } });
+    }
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::Infer_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess();
-        _inferRequest->enqueue();
+        _inferRequest->preprocess_notify();
+        _inferRequest->enqueue_notify();
     }
     Parent::Infer_ThreadUnsafe();
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::StartAsync_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess();
-        _inferRequest->enqueue();
+        _inferRequest->preprocess_notify();
+        _inferRequest->enqueue_notify();
     }
     Parent::StartAsync_ThreadUnsafe();
 }
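With this change the regular (internal-queue) request runs preprocess, enqueue and wait as one pipeline stage on the stream executor, instead of handing off to the separate GPUWaitExecutor between enqueue and wait; only the external-queue path keeps a dedicated wait_notify stage. Below is a standalone sketch of that stage/executor model — simplified stand-ins, not InferenceEngine's real ITaskExecutor pipeline types:

#include <functional>
#include <future>
#include <iostream>
#include <utility>
#include <vector>

// Toy model: a pipeline is a list of (executor, task) stages, and every
// stage boundary re-schedules the request onto that stage's executor.
using Task = std::function<void()>;
struct Executor {
    void run(const Task& task) { std::async(std::launch::async, task).wait(); }
};
using Stage = std::pair<Executor*, Task>;

int main() {
    Executor streamExecutor, waitExecutor;
    // Before: two stages, so every request pays a hand-off to the wait executor.
    std::vector<Stage> before = {
        { &streamExecutor, [] { std::cout << "preprocess + enqueue\n"; } },
        { &waitExecutor,   [] { std::cout << "wait\n"; } },
    };
    // After (internal queue): one merged stage, no hand-off between enqueue and wait.
    std::vector<Stage> after = {
        { &streamExecutor, [] { std::cout << "preprocess + enqueue + wait\n"; } },
    };
    for (auto& s : before) s.first->run(s.second);
    for (auto& s : after)  s.first->run(s.second);
}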

View File

@@ -48,7 +48,8 @@ CLDNNExecNetwork::CLDNNExecNetwork(InferenceEngine::CNNNetwork &network, std::sh
     }()},
     m_config(config),
     m_taskExecutor{ _taskExecutor },
-    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor" })) {
+    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor",
+                                                                                                config.throughput_streams > 1 ? config.throughput_streams : 1 })) {
     auto casted_context = std::dynamic_pointer_cast<gpu::ClContext>(context);
     if (nullptr == casted_context) {
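The braced initializer here builds an InferenceEngine::IStreamsExecutor::Config, whose second element is the executor's stream count, so the shared wait executor now scales with config.throughput_streams instead of staying single-streamed. A rough field-by-field equivalent, as a sketch only — it assumes the 2021.x plugin-API headers and the Config's public _name/_streams members, and reuses the constructor's config variable from the hunk above:

#include <threading/ie_executor_manager.hpp>
#include <threading/ie_istreams_executor.hpp>

// Same executor, spelled out: one wait stream per throughput stream, so
// wait() calls from concurrent infer requests stop serializing on one thread.
InferenceEngine::IStreamsExecutor::Config waitCfg;
waitCfg._name = "GPUWaitExecutor";
waitCfg._streams = config.throughput_streams > 1 ? config.throughput_streams : 1;
auto waitExecutor = InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor(waitCfg);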

View File

@@ -471,28 +471,32 @@ CLDNNInferRequest::CLDNNInferRequest(const std::vector<std::shared_ptr<const ov:
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- internal pipeline stages ----------------------------------- //
 // ----------------------------------------------------------------------------------------- //
-void CLDNNInferRequest::preprocess() {
-    int streamID = 0;
-    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
-    if (nullptr != streamExecutor) {
-        streamID = streamExecutor->GetStreamId();
-        int numGraphs = streamGraphs.size();
-        streamID = streamID % numGraphs;
-    }
-    m_graph = streamGraphs[streamID];
+void CLDNNInferRequest::preprocess_notify() {
+    setStreamGraph();
     m_graph->wait(CLDNNGraph::Stage::PREPROC);
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         preprocess_dynamic();
-        return;
+    } else {
+        execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
     }
-    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
     m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
-void CLDNNInferRequest::enqueue() {
+void CLDNNInferRequest::preprocess() {
+    setStreamGraph();
+    if (m_graph->GetMaxDynamicBatchSize() > 1) {
+        preprocess_dynamic();
+    } else {
+        execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
+    }
+}
+
+void CLDNNInferRequest::enqueue_notify() {
     m_graph->wait(CLDNNGraph::Stage::EXECUTE);
+    enqueue();
+}
+
+void CLDNNInferRequest::enqueue() {
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         enqueue_dynamic();
         return;
@@ -541,6 +545,11 @@ void CLDNNInferRequest::enqueue() {
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
 }
 
+void CLDNNInferRequest::wait_notify() {
+    wait();
+    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
+}
+
 void CLDNNInferRequest::wait() {
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         wait_dynamic();
@@ -568,13 +577,11 @@ void CLDNNInferRequest::wait() {
     if (m_useProfiling) {
         m_graph->UpdatePerfStatistics();
     }
-    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 void CLDNNInferRequest::preprocess_dynamic() {
     // execute input pre-processing.
     execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
-    m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
 void CLDNNInferRequest::enqueue_dynamic() {
@@ -619,12 +626,21 @@ void CLDNNInferRequest::wait_dynamic() {
             }
         }
     }
-    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- internal utils --------- ----------------------------------- //
 // ----------------------------------------------------------------------------------------- //
+void CLDNNInferRequest::setStreamGraph() {
+    int streamID = 0;
+    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
+    if (nullptr != streamExecutor) {
+        streamID = streamExecutor->GetStreamId();
+        int numGraphs = streamGraphs.size();
+        streamID = streamID % numGraphs;
+    }
+    m_graph = streamGraphs[streamID];
+}
+
 Blob::Ptr CLDNNInferRequest::create_host_blob(const TensorDesc& desc, uint8_t* mem_ptr) {
     OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::create_host_blob");
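Net effect in this file: the Stage::PREPROC/Stage::EXECUTE wait/notify pairs now live only in the new preprocess_notify/enqueue_notify/wait_notify wrappers, the plain preprocess/enqueue/wait used by the merged async pipeline stage skip that gating entirely, and the stream-to-graph lookup is factored out into setStreamGraph(). Below is a standalone model of what such a stage gate does — illustrative only, CLDNNGraph's real wait/notify may well be implemented differently:

#include <condition_variable>
#include <mutex>

// One gate per pipeline stage: wait() blocks until the previous request has
// left the stage, notify() releases the stage for the next request.
class StageGate {
public:
    void wait() {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait(lock, [this] { return !m_busy; });
        m_busy = true;
    }
    void notify() {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_busy = false;
        }
        m_cv.notify_one();
    }

private:
    std::mutex m_mutex;
    std::condition_variable m_cv;
    bool m_busy = false;
};

int main() {
    StageGate executeStage;
    executeStage.wait();    // as in enqueue_notify(): take the EXECUTE stage
    executeStage.notify();  // as in wait_notify(): release it after wait()
}

Keeping this gate off the internal-queue path removes two lock/notify round-trips per inference, which is presumably where part of the reported regression came from.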

View File

@@ -49,6 +49,10 @@ public:
     void EnableProfiling() { m_useProfiling = true; }
     void EnableStreams() { m_useStreams = true; }
 
+    void preprocess_notify();
+    void enqueue_notify();
+    void wait_notify();
+
     void preprocess();
     void enqueue();
     void wait();
@@ -92,6 +96,8 @@ private:
     void allocate_inputs_dynamic();
     void allocate_outputs_dynamic();
 
+    void setStreamGraph();
+
     std::map<cldnn::primitive_id, cldnn::network_output> internal_outputs;
     std::vector<std::map<cldnn::primitive_id, cldnn::network_output>> internal_outputs_dynamic;
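Resulting call map after this commit: the merged async pipeline stage calls the plain preprocess()/enqueue()/wait(); Infer_ThreadUnsafe() and StartAsync_ThreadUnsafe() call preprocess_notify()/enqueue_notify() only when use_external_queue() is true, and the external-queue pipeline stage finishes via wait_notify().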