diff --git a/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp
index 9e69ddeb0c8..358d8c993a4 100644
--- a/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_async_infer_request.cpp
@@ -19,27 +19,29 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ
                 OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::PreprocessingAndStartPipeline");
                 _inferRequest->preprocess();
                 _inferRequest->enqueue();
-        } });
-    }
-    _pipeline.push_back({_waitExecutor,
-            [this] {
-                OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
                 _inferRequest->wait();
-    }});
+        } });
+    } else {
+        _pipeline.push_back({ _waitExecutor,
+            [this] {
+                OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
+                _inferRequest->wait_notify();
+        } });
+    }
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::Infer_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess();
-        _inferRequest->enqueue();
+        _inferRequest->preprocess_notify();
+        _inferRequest->enqueue_notify();
     }
     Parent::Infer_ThreadUnsafe();
 }
 
 void CLDNNPlugin::CLDNNAsyncInferRequest::StartAsync_ThreadUnsafe() {
     if (_inferRequest->use_external_queue()) {
-        _inferRequest->preprocess();
-        _inferRequest->enqueue();
+        _inferRequest->preprocess_notify();
+        _inferRequest->enqueue_notify();
     }
     Parent::StartAsync_ThreadUnsafe();
 }
diff --git a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp
index 7e465f0f257..c554ee1c42f 100644
--- a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp
@@ -48,7 +48,8 @@ CLDNNExecNetwork::CLDNNExecNetwork(InferenceEngine::CNNNetwork &network, std::sh
     }()},
     m_config(config),
     m_taskExecutor{ _taskExecutor },
-    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor" })) {
+    m_waitExecutor(InferenceEngine::ExecutorManager::getInstance()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor",
+                                                                                                config.throughput_streams > 1 ? config.throughput_streams : 1 })) {
     auto casted_context = std::dynamic_pointer_cast<gpu::ClContext>(context);
 
     if (nullptr == casted_context) {
diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
index c71acf4e6d9..92ba0ee3fdd 100644
--- a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
@@ -471,28 +471,32 @@ CLDNNInferRequest::CLDNNInferRequest(const std::vector<std::shared_ptr<const ov
     streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(execNetwork->m_taskExecutor.get());
 }
 
-void CLDNNInferRequest::preprocess() {
-    int streamID = 0;
-    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
-    if (nullptr != streamExecutor) {
-        streamID = streamExecutor->GetStreamId();
-        int numGraphs = streamGraphs.size();
-        streamID = streamID % numGraphs;
-    }
-    m_graph = streamGraphs[streamID];
-
+void CLDNNInferRequest::preprocess_notify() {
+    setStreamGraph();
     m_graph->wait(CLDNNGraph::Stage::PREPROC);
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         preprocess_dynamic();
-        return;
+    } else {
+        execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
     }
-
-    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
     m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
-void CLDNNInferRequest::enqueue() {
+void CLDNNInferRequest::preprocess() {
+    setStreamGraph();
+    if (m_graph->GetMaxDynamicBatchSize() > 1) {
+        preprocess_dynamic();
+    } else {
+        execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
+    }
+}
+
+void CLDNNInferRequest::enqueue_notify() {
     m_graph->wait(CLDNNGraph::Stage::EXECUTE);
+    enqueue();
+}
+
+void CLDNNInferRequest::enqueue() {
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         enqueue_dynamic();
         return;
@@ -541,6 +545,11 @@
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
 }
 
+void CLDNNInferRequest::wait_notify() {
+    wait();
+    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
+}
+
 void CLDNNInferRequest::wait() {
     if (m_graph->GetMaxDynamicBatchSize() > 1) {
         wait_dynamic();
@@ -568,13 +577,11 @@
     if (m_useProfiling) {
         m_graph->UpdatePerfStatistics();
     }
-    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 void CLDNNInferRequest::preprocess_dynamic() {
     // execute input pre-processing.
     execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
-    m_graph->notify(CLDNNGraph::Stage::PREPROC);
 }
 
 void CLDNNInferRequest::enqueue_dynamic() {
@@ -619,12 +626,21 @@
             }
         }
     }
-    m_graph->notify(CLDNNGraph::Stage::EXECUTE);
 }
 
 // ----------------------------------------------------------------------------------------- //
 // ---------------------------- internal utils --------- ----------------------------------- //
 // ----------------------------------------------------------------------------------------- //
+void CLDNNInferRequest::setStreamGraph() {
+    int streamID = 0;
+    auto& streamGraphs = static_cast<CLDNNExecNetwork*>(_exeNetwork.get())->m_graphs;
+    if (nullptr != streamExecutor) {
+        streamID = streamExecutor->GetStreamId();
+        int numGraphs = streamGraphs.size();
+        streamID = streamID % numGraphs;
+    }
+    m_graph = streamGraphs[streamID];
+}
 
 Blob::Ptr CLDNNInferRequest::create_host_blob(const TensorDesc& desc, uint8_t* mem_ptr) {
     OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::create_host_blob");
diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.h b/inference-engine/src/cldnn_engine/cldnn_infer_request.h
index 72c924b015e..a4eff5b0c91 100644
--- a/inference-engine/src/cldnn_engine/cldnn_infer_request.h
+++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.h
@@ -49,6 +49,10 @@ public:
     void EnableProfiling() { m_useProfiling = true; }
     void EnableStreams() { m_useStreams = true; }
 
+    void preprocess_notify();
+    void enqueue_notify();
+    void wait_notify();
+
     void preprocess();
     void enqueue();
     void wait();
@@ -92,6 +96,8 @@ private:
     void allocate_inputs_dynamic();
     void allocate_outputs_dynamic();
 
+    void setStreamGraph();
+
     std::map<std::string, cldnn::network_output> internal_outputs;
     std::vector<std::map<std::string, cldnn::network_output>> internal_outputs_dynamic;
 };