[GPU] Fix some performance degradations from breaking GPU pipeline into explicit stages (#8084)
parent 4122ef50d6
commit 03106e0cd9
@@ -19,27 +19,29 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ
                 OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::PreprocessingAndStartPipeline");
                 _inferRequest->preprocess();
                 _inferRequest->enqueue();
-        } });
-    }
-    _pipeline.push_back({_waitExecutor,
-        [this] {
-            OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
                 _inferRequest->wait();
-    }});
+        } });
+    } else {
+        _pipeline.push_back({ _waitExecutor,
+            [this] {
+                OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
+                _inferRequest->wait_notify();
+            } });
+    }
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::Infer_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess();
-        _inferRequest->enqueue();
+        _inferRequest->preprocess_notify();
+        _inferRequest->enqueue_notify();
     }
     Parent::Infer_ThreadUnsafe();
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::StartAsync_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess();
-        _inferRequest->enqueue();
+        _inferRequest->preprocess_notify();
+        _inferRequest->enqueue_notify();
     }
     Parent::StartAsync_ThreadUnsafe();
 }
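For context on why folding wait() into the first task helps: each entry in `_pipeline` is a task bound to an executor, so every stage boundary is a cross-thread handoff between threads. A minimal sketch of that cost model, with illustrative types only (not the plugin's real classes), the "executor" modeled as a fresh thread:

#include <functional>
#include <thread>
#include <vector>

// Illustrative only: a pipeline as a list of tasks, each handed to its own
// "executor". Every stage boundary costs one cross-thread handoff; merging
// preprocess/enqueue/wait into a single stage, as the constructor above now
// apparently does for the plugin-managed queue path, removes one handoff per
// inference.
struct PipelineStage {
    std::function<void()> task;
};

void run_pipeline(const std::vector<PipelineStage>& stages) {
    for (const auto& stage : stages) {
        std::thread worker(stage.task);  // handoff to the stage's executor
        worker.join();                   // wait for the stage to finish
    }
}

int main() {
    run_pipeline({ {[] { /* preprocess + enqueue + wait in one hop */ }} });
}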
|
@@ -48,7 +48,8 @@ CLDNNExecNetwork::CLDNNExecNetwork(InferenceEngine::CNNNetwork &network, std::sh
     }()},
     m_config(config),
     m_taskExecutor{ _taskExecutor },
-    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor" })) {
+    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor",
+                                                                                                config.throughput_streams > 1 ? config.throughput_streams : 1 })) {
     auto casted_context = std::dynamic_pointer_cast<gpu::ClContext>(context);
 
     if (nullptr == casted_context) {
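The braced initializer here is an IStreamsExecutor::Config; assuming (as the diff suggests) its first two fields are the executor name and its stream count, the change reads as below. This is a restating sketch, not new plugin code:

#include <threading/ie_istreams_executor.hpp>

// Sketch restating the hunk above: with the default single stream, the wait
// executor serialized every request's WaitPipeline task behind one thread.
// Sized to the GPU throughput_streams, it can service one wait per stream
// concurrently.
const int throughput_streams = 4;  // hypothetical plugin config value
InferenceEngine::IStreamsExecutor::Config waitExecutorConfig{
    "GPUWaitExecutor",
    throughput_streams > 1 ? throughput_streams : 1  // one wait thread per GPU stream
};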
|
@@ -471,28 +471,32 @@ CLDNNInferRequest::CLDNNInferRequest(const std::vector<std::shared_ptr<const ov:
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- internal pipeline stages ----------------------------------- //
 // ----------------------------------------------------------------------------------------- //
-void CLDNNInferRequest::preprocess() {
-    int streamID = 0;
-    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
-    if (nullptr != streamExecutor) {
-        streamID = streamExecutor->GetStreamId();
-        int numGraphs = streamGraphs.size();
-        streamID = streamID % numGraphs;
-    }
-    m_graph = streamGraphs[streamID];
-
+void CLDNNInferRequest::preprocess_notify() {
+    setStreamGraph();
     m_graph->wait(CLDNNGraph::Stage::PREPROC);
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         preprocess_dynamic();
-        return;
+    } else {
+        execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP
     }
-    execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP
     m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
-void CLDNNInferRequest::enqueue() {
+void CLDNNInferRequest::preprocess() {
+    setStreamGraph();
+    if (m_graph->GetMaxDynamicBatchSize() > 1) {
+        preprocess_dynamic();
+    } else {
+        execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP
+    }
+}
+
+void CLDNNInferRequest::enqueue_notify() {
     m_graph->wait(CLDNNGraph::Stage::EXECUTE);
+    enqueue();
+}
+
+void CLDNNInferRequest::enqueue() {
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         enqueue_dynamic();
         return;
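CLDNNGraph::Stage::PREPROC and EXECUTE act as gates bounding how many requests may occupy a stage at once: m_graph->wait() blocks until a slot frees, m_graph->notify() returns one. The split above makes the convention explicit: the *_notify variants own the gate handshake, the plain variants do only the work, so the external-queue path, which drives the stages itself, can reuse them without double-acquiring. One plausible shape for such a gate, offered purely as a guess for illustration (the real CLDNNGraph bookkeeping may differ):

#include <condition_variable>
#include <mutex>

// Counting gate with the same wait()/notify() contract as
// m_graph->wait(stage) / m_graph->notify(stage).
class StageGate {
public:
    explicit StageGate(int slots) : free_(slots) {}

    void wait() {   // block until the stage has a free slot, then take it
        std::unique_lock<std::mutex> lock(mutex_);
        cv_.wait(lock, [this] { return free_ > 0; });
        --free_;
    }

    void notify() { // return a slot and wake one waiter
        {
            std::lock_guard<std::mutex> lock(mutex_);
            ++free_;
        }
        cv_.notify_one();
    }

private:
    std::mutex mutex_;
    std::condition_variable cv_;
    int free_;
};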
@@ -541,6 +545,11 @@ void CLDNNInferRequest::enqueue() {
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
 }
 
+void CLDNNInferRequest::wait_notify() {
+    wait();
+    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
+}
+
 void CLDNNInferRequest::wait() {
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         wait_dynamic();
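Note the asymmetry: the EXECUTE gate is taken in enqueue_notify() but released only here in wait_notify(), so a request holds its slot for the whole enqueue-to-completion window. Restated with the hypothetical StageGate from the sketch above:

// Hypothetical restatement: the EXECUTE slot spans two calls.
StageGate execute_gate(1);

void enqueue_notify_like() {
    execute_gate.wait();     // taken in enqueue_notify() ...
    // ... submit work to the GPU queue (enqueue()) ...
}

void wait_notify_like() {
    // ... block until the results are ready (wait()) ...
    execute_gate.notify();   // ... released only in wait_notify()
}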
@@ -568,13 +577,11 @@ void CLDNNInferRequest::wait() {
     if (m_useProfiling) {
         m_graph->UpdatePerfStatistics();
     }
-    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 void CLDNNInferRequest::preprocess_dynamic() {
     // execute input pre-processing.
     execDataPreprocessing(_inputs, true); // "true" stands for serial preprocessing in case of OpenMP
-    m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
 void CLDNNInferRequest::enqueue_dynamic() {
@@ -619,12 +626,21 @@ void CLDNNInferRequest::wait_dynamic() {
             }
         }
     }
-    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- internal utils --------- ----------------------------------- //
 // ----------------------------------------------------------------------------------------- //
+void CLDNNInferRequest::setStreamGraph() {
+    int streamID = 0;
+    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
+    if (nullptr != streamExecutor) {
+        streamID = streamExecutor->GetStreamId();
+        int numGraphs = streamGraphs.size();
+        streamID = streamID % numGraphs;
+    }
+    m_graph = streamGraphs[streamID];
+}
+
 Blob::Ptr CLDNNInferRequest::create_host_blob(const TensorDesc& desc, uint8_t* mem_ptr) {
     OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::create_host_blob");
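setStreamGraph() hoists the former preprocess() prologue into a helper: each executor stream is pinned to one of the compiled graphs round-robin. The mapping, isolated as a small sketch mirroring the hunk above:

// Stream IDs wrap around the number of compiled graphs, so with
// num_graphs >= num_streams each stream effectively owns a private graph.
int pick_graph(int stream_id, int num_graphs) {
    return (num_graphs > 0) ? stream_id % num_graphs : 0;
}
// e.g. pick_graph(5, 4) == 1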
|
@@ -49,6 +49,10 @@ public:
     void EnableProfiling() { m_useProfiling = true; }
     void EnableStreams() { m_useStreams = true; }
 
+    void preprocess_notify();
+    void enqueue_notify();
+    void wait_notify();
+
     void preprocess();
     void enqueue();
     void wait();
@@ -92,6 +96,8 @@ private:
     void allocate_inputs_dynamic();
     void allocate_outputs_dynamic();
 
+    void setStreamGraph();
+
     std::map<cldnn::primitive_id, cldnn::network_output> internal_outputs;
     std::vector<std::map<cldnn::primitive_id, cldnn::network_output>> internal_outputs_dynamic;
 };
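Read together, the hunks give the new API a clear division of labor. The summary below is an editorial reading of the diffs above, not code from the commit:

// Who calls what after this change:
//   plugin-managed queue  -> one merged pipeline task:
//        preprocess(); enqueue(); wait();
//   application-managed (external) queue:
//        Infer_ThreadUnsafe() / StartAsync_ThreadUnsafe():
//             preprocess_notify(); enqueue_notify();   // take PREPROC/EXECUTE
//        WaitPipeline task on m_waitExecutor:
//             wait_notify();                           // release EXECUTE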
|