[GPU] Extended remote context to accept user queue (#6235)

* [GPU] extended remote context to accept user queues for each stream

* [GPU] OV2.0 API for queue sharing. Removed deviceName arg for context creation
Vladimir Paramuzov 2021-10-21 10:45:25 +03:00 committed by GitHub
parent f8439eeed8
commit 6f754052cf
26 changed files with 467 additions and 56 deletions


@ -68,6 +68,28 @@ and returns a smart pointer to it, which is cast to `Blob::Ptr`.
To ensure that the plugin generates the correct execution graph for the NV12 dual-plane input, set
the `CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS` plugin configuration flag to `PluginConfigParams::YES`.
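For example, the flag can be passed through the `LoadNetwork` configuration map (a minimal sketch; the model path is a placeholder, and the key and value constants are the ones named above):

```cpp
using namespace InferenceEngine;
Core ie;
auto net = ie.ReadNetwork("model.xml");  // placeholder model path
// Ask the plugin to treat the NV12 input as two surfaces (Y and UV planes)
auto exec_net = ie.LoadNetwork(net, "GPU",
    {{CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, PluginConfigParams::YES}});
```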
## Context & queue sharing
GPU plugin supports creation of a shared context from a `cl_command_queue` handle. In that case,
the OpenCL context handle is extracted from the given queue via the OpenCL™ API, and the queue itself is used inside
the plugin for further execution of inference primitives. Sharing the queue changes the behavior of the `StartAsync()`
method: it guarantees that submission of inference primitives into the given queue is finished before
control is returned to the calling thread.
This sharing mechanism allows pipeline synchronization to be done on the application side and avoids blocking the host thread
while waiting for inference completion. Pseudocode may look as follows:
@snippet snippets/GPU_RemoteBlob_API3.cpp part0
### Limitations
- Some primitives in the GPU plugin may block the host thread while waiting for previous primitives before adding their kernels
to the command queue. In such cases the `StartAsync()` call takes much more time to return control to the calling thread,
as internally it waits for partial or full network completion.
Examples of such operations: Loop, TensorIterator, DetectionOutput, NonMaxSuppression
- Synchronization of pre/post-processing jobs and the inference pipeline inside the shared queue is the user's responsibility
- Throughput mode is not available when queue sharing is used, i.e. only a single stream can be used for each executable network (see the sketch below)
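For instance, the single-stream limitation shows up as a `LoadNetwork` failure when multiple streams are requested for a queue-shared context (a sketch continuing the pseudocode above; the `GPU_THROUGHPUT_STREAMS` key name is assumed from the public GPU plugin configuration):

```cpp
auto remote_context = gpu::make_shared_context(ie, "GPU", queue.get());
try {
    // Requesting more than one stream together with a shared queue is rejected by the plugin
    auto exec_net = ie.LoadNetwork(net, remote_context, {{"GPU_THROUGHPUT_STREAMS", "2"}});
} catch (const std::exception& ex) {
    std::cerr << ex.what() << std::endl;  // e.g. "Throughput streams can't be used with shared queue!"
}
```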
## Low-Level Methods and Their Parameter Description
The high-level wrappers above bring a direct dependency on native APIs to the user program.
@ -84,6 +106,7 @@ descriptor, which sets the expected structure and possible parameter values of
|----------------|---------------------------------------------------------------------|
| `CONTEXT_TYPE` | Describes the type of the shared context in a map. Can be `OCL` (for pure OpenCL context) or `VA_SHARED` (for context shared with a video decoding device). |
| `OCL_CONTEXT` | Contains the OpenCL context handle. |
| `OCL_QUEUE` | Contains the OpenCL queue handle if queue sharing is needed (see the example below). |
| `VA_DEVICE` | Contains the native video decoding device handle. Can be `VADisplay` or `ID3D11Device` (a pointer). |
| `SHARED_MEM_TYPE` | Describes the type of the shared memory buffer in a map. Can be `OCL_BUFFER` (clBuffer), `OCL_IMAGE2D` (clImage2D), `VA_SURFACE()`, or `DX_BUFFER`. |
| `MEM_HANDLE` | Contains the OpenCL memory handle. |
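For illustration, queue sharing through the low-level interface boils down to the following parameter map (a sketch mirroring what the `make_shared_context` helper added in this change does internally; `ie` and `queue` are assumed to exist):

```cpp
// Extract the OpenCL context from the user queue and describe the shared context
cl_context ctx;
auto err = clGetCommandQueueInfo(queue.get(), CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr);
if (err != CL_SUCCESS)
    IE_THROW() << "Can't get context from given OpenCL queue";
ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)},
                           {GPU_PARAM_KEY(OCL_CONTEXT), static_cast<gpu_handle_param>(ctx)},
                           {GPU_PARAM_KEY(OCL_QUEUE), static_cast<gpu_handle_param>(queue.get())}};
auto remote_context = ie.CreateContext("GPU", context_params);
```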


@ -11,7 +11,8 @@ file(GLOB SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
if(NOT CLDNN__IOCL_ICD_INCDIRS OR TRUE)
list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API0.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API1.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API2.cpp")
"${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API2.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/GPU_RemoteBlob_API3.cpp")
endif()
# remove OpenCV related sources


@ -0,0 +1,76 @@
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <ie_core.hpp>
#include <CL/cl2.hpp>
#include <gpu/gpu_context_api_ocl.hpp>
int main() {
using namespace InferenceEngine;
//! [part0]
// ...
// initialize the core and read the network
InferenceEngine::Core ie;
auto net = ie.ReadNetwork("network.xml");
// initialize opencl context and create queue
cl::Context ctx = get_my_OpenCL_context();
cl::CommandQueue queue = get_my_OpenCL_queue();
// share the queue with GPU plugin and compile ExecutableNetwork
auto remote_context = gpu::make_shared_context(ie, "GPU", queue.get());
auto exec_net_shared = ie.LoadNetwork(net, remote_context);
// create the OpenCL buffers within the context
cl::Buffer shared_in_buffer(ctx, CL_MEM_READ_WRITE, image_size * num_channels, NULL, &err);
cl::Buffer shared_out_buffer(ctx, CL_MEM_READ_WRITE, image_size * num_channels, NULL, &err);
// wrap in and out buffers into RemoteBlob and set them to infer request
auto shared_in_blob = gpu::make_shared_blob(input_info->getTensorDesc(), remote_context, shared_in_buffer);
auto shared_out_blob = gpu::make_shared_blob(out_data->getTensorDesc(), remote_context, shared_out_buffer);
auto infer_request = exec_net_shared.CreateInferRequest();
infer_request.SetBlob(input_name, shared_in_blob);
infer_request.SetBlob(output_name, shared_out_blob);
// ...
// execute user kernel
cl::Kernel kernel_preproc(program, kernel_name_preproc.c_str());
kernel_preproc.setArg(0, shared_in_buffer);
queue.enqueueNDRangeKernel(kernel_preproc,
cl::NDRange(0),
cl::NDRange(image_size),
cl::NDRange(1),
nullptr, // wait events *
&profileEvent);
// Blocking clFinish() call is not required, but this barrier is added to the queue to guarantee that user kernel is finished
// before any inference primitive is started
queue.enqueueBarrierWithWaitList(nullptr, nullptr);
// ...
// pass results to the inference
// since the remote context is created with queue sharing, StartAsync() guarantees that scheduling is finished
infer_request.StartAsync();
// execute some postprocessing kernel.
// infer_request.Wait() is not called; synchronization between inference and post-processing is done via the
// enqueueBarrierWithWaitList call.
cl::Kernel kernel_postproc(program, kernel_name_postproc.c_str());
kernel_postproc.setArg(0, shared_out_buffer);
queue.enqueueBarrierWithWaitList(nullptr, nullptr);
queue.enqueueNDRangeKernel(kernel_postproc,
cl::NDRange(0),
cl::NDRange(image_size),
cl::NDRange(1),
nullptr, // wait events *
&profileEvent);
// Wait for pipeline completion
queue.finish();
//! [part0]
return 0;
}


@ -13,12 +13,14 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ
: AsyncInferRequestThreadSafeDefault(inferRequest, taskExecutor, callbackExecutor), _inferRequest(inferRequest), _waitExecutor(waitExecutor) {
_pipeline = {};
if (!_inferRequest->use_external_queue()) {
_pipeline.push_back({taskExecutor,
[this] {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::PreprocessingAndStartPipeline");
_inferRequest->preprocess();
_inferRequest->enqueue();
} });
}
_pipeline.push_back({_waitExecutor,
[this] {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNAsyncInferRequest::WaitPipeline");
@ -26,6 +28,22 @@ CLDNNPlugin::CLDNNAsyncInferRequest::CLDNNAsyncInferRequest(const CLDNNInferRequ
}});
}
void CLDNNPlugin::CLDNNAsyncInferRequest::Infer_ThreadUnsafe() {
if (_inferRequest->use_external_queue()) {
_inferRequest->preprocess();
_inferRequest->enqueue();
}
Parent::Infer_ThreadUnsafe();
}
void CLDNNPlugin::CLDNNAsyncInferRequest::StartAsync_ThreadUnsafe() {
if (_inferRequest->use_external_queue()) {
_inferRequest->preprocess();
_inferRequest->enqueue();
}
Parent::StartAsync_ThreadUnsafe();
}
CLDNNPlugin::CLDNNAsyncInferRequest::~CLDNNAsyncInferRequest() {
StopAndWait();
}


@ -21,6 +21,9 @@ public:
~CLDNNAsyncInferRequest();
void Infer_ThreadUnsafe() override;
void StartAsync_ThreadUnsafe() override;
private:
CLDNNInferRequest::Ptr _inferRequest;
InferenceEngine::ITaskExecutor::Ptr _waitExecutor;


@ -89,6 +89,10 @@ IInferRequestInternal::Ptr CLDNNExecNetwork::CreateInferRequestImpl(const std::v
}
if (m_config.useProfiling)
ptr->EnableProfiling();
if (m_graphs.front()->use_external_queue()) {
ptr->enable_external_queue();
}
ptr->SetGraph(m_graphs.front());
return ptr;


@ -94,9 +94,26 @@ void CLDNNGraph::Build() {
}
}
bool CLDNNGraph::use_external_queue() const {
auto impl = getContextImpl(m_context);
return impl->GetExternalQueue() != nullptr;
}
std::shared_ptr<cldnn::network> CLDNNGraph::BuildNetwork(std::shared_ptr<cldnn::program> program) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::BuildNetwork");
auto network = std::make_shared<cldnn::network>(program, m_stream_id);
std::shared_ptr<cldnn::network> network = nullptr;
auto impl = getContextImpl(m_context);
auto externalQueue = impl->GetExternalQueue();
if (externalQueue) {
if (m_config.throughput_streams != 1)
IE_THROW(ParameterMismatch) << "Throughput streams can't be used with shared queue!\n";
auto &engine = m_program->GetEngine();
network = std::make_shared<cldnn::network>(program, engine.create_stream(externalQueue), m_stream_id);
} else {
network = std::make_shared<cldnn::network>(program, m_stream_id);
}
if (!m_config.graph_dumps_dir.empty() && m_stream_id == 0) {
static int net_id = 0;


@ -71,6 +71,8 @@ public:
m_cv.notify_one();
}
bool use_external_queue() const;
protected:
uint32_t m_state;
std::condition_variable m_cv;


@ -452,7 +452,8 @@ CLDNNInferRequest::CLDNNInferRequest(InputsDataMap networkInputs, OutputsDataMap
const CLDNNExecNetwork::Ptr& execNetwork)
: IInferRequestInternal(networkInputs, networkOutputs)
, m_useProfiling(false)
, m_useStreams(false) {
, m_useStreams(false)
, m_useExternalQueue(false) {
IE_ASSERT(nullptr != execNetwork);
streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(execNetwork->m_taskExecutor.get());
}


@ -57,6 +57,9 @@ public:
void enqueue_dynamic();
void wait_dynamic();
bool use_external_queue() const { return m_useExternalQueue; }
void enable_external_queue() { m_useExternalQueue = true; }
private:
InferenceEngine::BlobMap _deviceOutputs;
std::map<std::string, cldnn::primitive_id> inputsMap;
@ -64,6 +67,7 @@ private:
bool m_useProfiling;
bool m_useStreams;
bool m_useExternalQueue;
std::shared_ptr<CLDNNGraph> m_graph;
// dynamic batch stuff


@ -199,6 +199,7 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
m_plugin(plugin),
m_type(ContextType::OCL),
m_config(config),
m_external_queue(nullptr),
m_va_display(nullptr) {
lock.clear(std::memory_order_relaxed);
gpu_handle_param _context_id = nullptr;
@ -221,6 +222,9 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
if (tile_id_itr != params.end()) {
target_tile_id = tile_id_itr->second.as<int>();
}
if (params.find(GPU_PARAM_KEY(OCL_QUEUE)) != params.end())
m_external_queue = _ObjFromParamSimple<gpu_handle_param>(params, GPU_PARAM_KEY(OCL_QUEUE));
}
// TODO: Parameterize this based on plugin config and compilation options


@ -224,6 +224,7 @@ public:
std::shared_ptr<cldnn::engine> GetEngine() const { return m_engine; }
Config& GetConfig() { return m_config; }
ContextType GetType() const { return m_type; }
InferenceEngine::gpu_handle_param GetExternalQueue() const { return m_external_queue; }
const std::weak_ptr<InferenceEngine::IInferencePlugin> GetPlugin() const { return m_plugin; }
void acquire_lock() {
@ -238,6 +239,7 @@ protected:
// TODO: refactor to unique_ptr
std::shared_ptr<cldnn::engine> m_engine;
InferenceEngine::gpu_handle_param m_va_display;
InferenceEngine::gpu_handle_param m_external_queue;
Config m_config;
ContextType m_type;


@ -232,6 +232,25 @@ static inline RemoteContext::Ptr make_shared_context(Core& core,
return core.CreateContext(deviceName, contextParams);
}
/**
* @brief This function is used to obtain a remote context object from a user-supplied OpenCL command queue handle
* @param core A reference to Inference Engine Core object
* @param deviceName A name of device to create a remote context for
* @param queue An OpenCL queue to be used to create the shared remote context. The queue will be reused inside the plugin.
* @note Only latency mode is supported for such a context sharing case.
* @return A shared remote context instance
*/
static inline RemoteContext::Ptr make_shared_context(Core& core, std::string deviceName, cl_command_queue queue) {
cl_context ctx;
auto res = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr);
if (res != CL_SUCCESS)
IE_THROW() << "Can't get context from given opencl queue";
ParamMap contextParams = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)},
{GPU_PARAM_KEY(OCL_CONTEXT), static_cast<gpu_handle_param>(ctx)},
{GPU_PARAM_KEY(OCL_QUEUE), static_cast<gpu_handle_param>(queue)}};
return core.CreateContext(deviceName, contextParams);
}
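// A minimal usage sketch for this overload (illustrative names; get_my_OpenCL_queue is a hypothetical helper owned by the application):
//   cl::CommandQueue queue = get_my_OpenCL_queue();
//   auto remote_context = gpu::make_shared_context(ie, "GPU", queue.get());
//   auto exec_net_shared = ie.LoadNetwork(net, remote_context);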
/**
* @brief This function is used to create remote blob object within default GPU plugin OpenCL context
* @param desc A tensor descriptor object representing remote blob configuration


@ -68,6 +68,11 @@ DECLARE_GPU_PARAM_KEY(OCL_CONTEXT, gpu_handle_param);
*/
DECLARE_GPU_PARAM_KEY(TILE_ID, int);
/**
* @brief This key identifies OpenCL queue handle in a shared context
*/
DECLARE_GPU_PARAM_KEY(OCL_QUEUE, gpu_handle_param);
/**
* @brief This key identifies video acceleration device/display handle
* in a shared context or shared memory blob parameter map


@ -94,6 +94,7 @@ public:
*/
class D3DContext : public ClContext {
using RemoteContext::create_tensor;
static constexpr const char* device_name = "GPU";
public:
/**
@ -117,16 +118,15 @@ public:
/**
* @brief Constructs D3DContext remote context object from ID3D11Device
* @param core OpenVINO Runtime Core object instance
* @param deviceName A name of to create a remote context for
* @param device A pointer to ID3D11Device to be used to create a remote context
*/
D3DContext(Core& core, std::string deviceName, ID3D11Device* device) {
D3DContext(Core& core, ID3D11Device* device) {
// clang-format off
ParamMap context_params = {
{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(VA_SHARED)},
{GPU_PARAM_KEY(VA_DEVICE), static_cast<gpu_handle_param>(device)}
};
*this = core.create_context(deviceName, context_params);
*this = core.create_context(device_name, context_params);
}
/**


@ -124,6 +124,7 @@ public:
*/
class ClContext : public RemoteContext {
using RemoteContext::create_tensor;
static constexpr const char* device_name = "GPU";
public:
/**
@ -139,13 +140,29 @@ public:
/**
* @brief Constructs context object from user-supplied OpenCL context handle
* @param core A reference to OpenVINO Runtime Core object
* @param deviceName A name of device to create a remote context for
* @param ctx A OpenCL context to be used to create shared remote context
*/
ClContext(Core& core, std::string deviceName, cl_context ctx) {
ClContext(Core& core, cl_context ctx) {
ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)},
{GPU_PARAM_KEY(OCL_CONTEXT), static_cast<gpu_handle_param>(ctx)}};
*this = core.create_context(deviceName, context_params);
*this = core.create_context(device_name, context_params);
}
/**
* @brief Constructs context object from a user-supplied OpenCL command queue handle
* @param core A reference to OpenVINO Runtime Core object
* @param queue An OpenCL queue to be used to create the shared remote context. The queue will be reused inside the plugin.
* @note Only latency mode is supported for such a context sharing case.
*/
ClContext(Core& core, cl_command_queue queue) {
cl_context ctx;
auto res = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr);
if (res != CL_SUCCESS)
IE_THROW() << "Can't get context from given opencl queue";
ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(OCL)},
{GPU_PARAM_KEY(OCL_CONTEXT), static_cast<gpu_handle_param>(ctx)},
{GPU_PARAM_KEY(OCL_QUEUE), static_cast<gpu_handle_param>(queue)}};
*this = core.create_context(device_name, context_params);
}
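// A minimal usage sketch for the queue-based constructor (illustrative names; assumes core is an ov::runtime::Core
// and model was read beforehand, while get_my_OpenCL_queue is a hypothetical helper owned by the application):
//   cl::CommandQueue queue = get_my_OpenCL_queue();
//   auto remote_context = ov::runtime::gpu::ClContext(core, queue.get());
//   auto compiled_model = core.compile_model(model, remote_context);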
/**


@ -68,6 +68,7 @@ public:
*/
class VAContext : public ClContext {
using RemoteContext::create_tensor;
static constexpr const char* device_name = "GPU";
public:
/**
@ -91,13 +92,12 @@ public:
/**
* @brief Constructs remote context object from VA display handle
* @param core OpenVINO Runtime Core object
* @param deviceName A device name to create a remote context for
* @param device A `VADisplay` to create remote context from
*/
VAContext(Core& core, std::string deviceName, VADisplay device) {
VAContext(Core& core, VADisplay device) {
ParamMap context_params = {{GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(VA_SHARED)},
{GPU_PARAM_KEY(VA_DEVICE), static_cast<gpu_handle_param>(device)}};
*this = core.create_context(deviceName, context_params);
*this = core.create_context(device_name, context_params);
}
/**


@ -123,6 +123,186 @@ TEST_F(RemoteBlob_Test, smoke_canInferOnUserContext) {
}
}
TEST_F(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
#if defined _WIN32
GTEST_SKIP();
#endif
auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
CNNNetwork net(fn_ptr);
net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
auto exec_net_regular = ie->LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
auto fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
inf_req_regular.Infer();
auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
// inference using remote blob
auto ocl_instance = std::make_shared<OpenCL>();
cl_int err;
auto in_desc = net.getInputsInfo().begin()->second->getTensorDesc();
auto out_desc = net.getOutputsInfo().begin()->second->getTensorDesc();
auto in_dims = in_desc.getDims();
auto out_dims = out_desc.getDims();
size_t in_size = in_dims[1] * in_dims[2] * in_dims[3];
size_t out_size = out_dims[1] * out_dims[2] * out_dims[3] * sizeof(float);
// In this scenario we create a shared OCL queue and run simple pre-processing and post-processing actions (buffer copies in both cases)
// without blocking the calling thread
auto remote_context = make_shared_context(*ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_queue.get());
auto exec_net_shared = ie->LoadNetwork(net, remote_context);
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
cl::Buffer shared_input_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, in_size, NULL, &err);
cl::Buffer shared_output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err);
// Allocate output buffer where inference result will be put as a post-processing step
cl::Buffer output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err);
// Wrap buffers above with IE blobs
Blob::Ptr shared_input_blob = make_shared_blob(in_desc, remote_context, shared_input_buffer);
Blob::Ptr shared_output_blob = make_shared_blob(out_desc, remote_context, shared_output_buffer);
Blob::Ptr output_blob = make_shared_blob(out_desc, remote_context, output_buffer);
// Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl
// TODO: Why do we need to call it explicitly? Consider doing it internally
output_blob->allocate();
// Pass shared blobs to infer request
inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_input_blob);
inf_req_shared.SetBlob(net.getOutputsInfo().begin()->first, shared_output_blob);
// 1. Pre-processing. Enqueue a non-blocking copy from host ptr to the shared device input buffer and a barrier to ensure that the copy is finished before
// inference primitives start execution
{
void *buffer = fakeImageData->buffer();
ocl_instance->_queue.enqueueWriteBuffer(shared_input_buffer, false, 0, in_size, buffer);
ocl_instance->_queue.enqueueBarrierWithWaitList(nullptr, nullptr);
}
// 2. Enqueue inference primitives. With shared queue this call ensures that all kernels are scheduled to the corresponding queue
// before giving the control back
inf_req_shared.StartAsync();
// 3. Post-processing. Enqueue copy from shared blob with inference result to another output blob
// Enqueue barrier with empty wait list is needed to ensure that previous kernels are finished before copying the data. It's needed here since we
// create OOO queue.
// Note: inf_req_shared.Wait() can be dropped in some cases, but if plugin-side post-processing is required,
// then the result may be incorrect without Wait().
{
ocl_instance->_queue.enqueueBarrierWithWaitList(nullptr, nullptr);
ocl_instance->_queue.enqueueCopyBuffer(shared_output_buffer, output_buffer, 0, 0, output_blob->byteSize());
}
// 4. Wait for infer request and post-processing completion
ocl_instance->_queue.finish();
// compare results
{
ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
ASSERT_EQ(outputBlob_regular->size(), output_blob->size());
auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
FuncTestUtils::compareBlobs(outputBlob_regular, output_blob, thr);
}
}
TEST_F(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
#if defined _WIN32
GTEST_SKIP();
#endif
auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
CNNNetwork net(fn_ptr);
net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
auto exec_net_regular = ie->LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
auto fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
inf_req_regular.Infer();
auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
// inference using remote blob
auto ocl_instance = std::make_shared<OpenCL>();
ocl_instance->_queue = cl::CommandQueue(ocl_instance->_context, ocl_instance->_device);
cl_int err;
auto in_desc = net.getInputsInfo().begin()->second->getTensorDesc();
auto out_desc = net.getOutputsInfo().begin()->second->getTensorDesc();
auto in_dims = in_desc.getDims();
auto out_dims = out_desc.getDims();
size_t in_size = in_dims[1] * in_dims[2] * in_dims[3];
size_t out_size = out_dims[1] * out_dims[2] * out_dims[3] * sizeof(float);
// In this scenario we create a shared OCL queue and run simple pre-processing and post-processing actions (buffer copies in both cases)
// without blocking the calling thread
auto remote_context = make_shared_context(*ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_queue.get());
auto exec_net_shared = ie->LoadNetwork(net, remote_context);
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
cl::Buffer shared_input_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, in_size, NULL, &err);
cl::Buffer shared_output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err);
// Allocate output buffer where inference result will be put as a post-processing step
cl::Buffer output_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, out_size, NULL, &err);
// Wrap buffers above with IE blobs
Blob::Ptr shared_input_blob = make_shared_blob(in_desc, remote_context, shared_input_buffer);
Blob::Ptr shared_output_blob = make_shared_blob(out_desc, remote_context, shared_output_buffer);
Blob::Ptr output_blob = make_shared_blob(out_desc, remote_context, output_buffer);
// Allocate is needed to actually trigger memory handle sharing. For other buffers it's called inside SetBlob impl
// TODO: Why do we need to call it explicitly? Consider doing it internally
output_blob->allocate();
// Pass shared blobs to infer request
inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_input_blob);
inf_req_shared.SetBlob(net.getOutputsInfo().begin()->first, shared_output_blob);
// 1. Pre-processing. Enqueue non-blocking copy from host ptr to shared device input buffer
{
void *buffer = fakeImageData->buffer();
ocl_instance->_queue.enqueueWriteBuffer(shared_input_buffer, false, 0, in_size, buffer);
}
// 2. Enqueue inference primitives. With shared queue this call ensures that all kernels are scheduled to the corresponding queue
// before giving the control back
inf_req_shared.StartAsync();
// 3. Post-processing. Enqueue copy from shared blob with inference result to another output blob
// Note: inf_req_shared.Wait() can be dropped in some cases, but if plugin-side post-processing is required,
// then the result may be incorrect without Wait().
{
ocl_instance->_queue.enqueueCopyBuffer(shared_output_buffer, output_buffer, 0, 0, output_blob->byteSize());
}
// 4. Wait for infer request and post-processing completion
ocl_instance->_queue.finish();
// compare results
{
ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
ASSERT_EQ(outputBlob_regular->size(), output_blob->size());
auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
FuncTestUtils::compareBlobs(outputBlob_regular, output_blob, thr);
}
}
class BatchedBlob_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<size_t> {
void SetUp() override {
num_batch = this->GetParam();


@ -113,7 +113,7 @@ TEST_F(OVRemoteTensor_Test, DISABLED_smoke_canInferOnUserContext) {
// inference using remote tensor
auto ocl_instance = std::make_shared<OpenCL>();
auto remote_context = ov::runtime::gpu::ClContext(ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_context.get());
auto remote_context = ov::runtime::gpu::ClContext(ie, ocl_instance->_context.get());
auto exec_net_shared = ie.compile_model(function, remote_context);
auto inf_req_shared = exec_net_shared.create_infer_request();
inf_req_shared.set_tensor(input->get_friendly_name(), fakeImageData);


@ -62,8 +62,9 @@ public:
const build_options& options,
bool is_internal);
network(program::ptr program,
uint16_t stream_id = 0);
network(program::ptr program, uint16_t stream_id = 0);
network(program::ptr program, stream::ptr stream, uint16_t stream_id);
~network();


@ -125,6 +125,9 @@ public:
/// Create stream object for current engine
virtual stream_ptr create_stream() const = 0;
/// Creates stream object from user handle
virtual stream_ptr create_stream(void *handle) const = 0;
/// Returns service stream which can be used during program build and optimizations
virtual stream& get_program_stream() const = 0;


@ -213,6 +213,10 @@ stream::ptr ocl_engine::create_stream() const {
return std::make_shared<ocl_stream>(*this);
}
stream::ptr ocl_engine::create_stream(void* handle) const {
return std::make_shared<ocl_stream>(*this, handle);
}
stream& ocl_engine::get_program_stream() const {
return *_program_stream;
}


@ -40,6 +40,7 @@ public:
bool extension_supported(std::string extension) const;
stream_ptr create_stream() const override;
stream_ptr create_stream(void *handle) const override;
stream& get_program_stream() const override;
#ifdef ENABLE_ONEDNN_FOR_GPU


@ -253,9 +253,18 @@ void set_arguments_impl(ocl_kernel_type& kernel,
}
}
}
sync_methods get_expected_sync_method(const engine_configuration &config) {
return config.enable_profiling ? sync_methods::events : config.queue_type == queue_types::out_of_order ? sync_methods::barriers
: sync_methods::none;
}
} // namespace
ocl_stream::ocl_stream(const ocl_engine& engine) : stream(engine.configuration().queue_type), _engine(engine) {
ocl_stream::ocl_stream(const ocl_engine &engine)
: stream(engine.configuration().queue_type)
, _engine(engine)
, sync_method(get_expected_sync_method(engine.configuration())) {
auto context = engine.get_cl_context();
auto device = engine.get_cl_device();
auto config = engine.configuration();
@ -263,9 +272,6 @@ ocl_stream::ocl_stream(const ocl_engine& engine) : stream(engine.configuration()
queue_builder.set_profiling(config.enable_profiling);
queue_builder.set_out_of_order((config.queue_type == queue_types::out_of_order));
sync_method = _engine.configuration().enable_profiling ? sync_methods::events :
config.queue_type == queue_types::out_of_order ? sync_methods::barriers : sync_methods::none;
if (sync_method == sync_methods::none && config.queue_type == queue_types::out_of_order) {
throw std::runtime_error("[CLDNN] Unexpected sync method (none) is specified for out_of_order queue");
}
@ -288,6 +294,22 @@ ocl_stream::ocl_stream(const ocl_engine& engine) : stream(engine.configuration()
#endif
}
ocl_stream::ocl_stream(const ocl_engine &engine, void *handle)
: stream(engine.configuration().queue_type)
, _engine(engine)
, sync_method(get_expected_sync_method(engine.configuration())) {
auto casted_handle = static_cast<cl_command_queue>(handle);
_command_queue = ocl_queue_type(casted_handle, true);
#ifdef ENABLE_ONEDNN_FOR_GPU
auto config = engine.configuration();
if (config.queue_type == queue_types::in_order) {
auto onednn_engine = engine.get_onednn_engine();
_onednn_stream = std::make_shared<dnnl::stream>(dnnl::ocl_interop::make_stream(engine.get_onednn_engine(), _command_queue.get()));
}
#endif
}
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::stream& ocl_stream::get_onednn_stream() {
if (!_onednn_stream)


@ -50,6 +50,7 @@ public:
const ocl_queue_type& get_cl_queue() const { return _command_queue; }
explicit ocl_stream(const ocl_engine& engine);
ocl_stream(const ocl_engine &engine, void *handle);
ocl_stream(ocl_stream&& other)
: stream(other._engine.configuration().queue_type)
, _engine(other._engine)


@ -236,6 +236,9 @@ network::network(engine& engine,
network::network(program::ptr program, uint16_t stream_id)
: network(program, program->get_engine().create_stream(), false, stream_id == 0) {}
network::network(program::ptr program, stream::ptr stream, uint16_t stream_id)
: network(program, stream, false, stream_id == 0) {}
network::~network() {
_memory_pool->clear_pool_for_network(net_id);
}