Files
openvino/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_stream.cpp

480 lines
21 KiB
C++

// Copyright (C) 2019-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "ocl_stream.hpp"
#include "ocl_event.hpp"
#include "ocl_user_event.hpp"
#include "ocl_command_queues_builder.hpp"
#include "ocl_kernel.hpp"
#include "ocl_common.hpp"
#include <cassert>
#include <iomanip>
#include <ios>
#include <fstream>
#include <thread>
#include <string>
#include <vector>
#include <memory>
// NOTE: Due to buggy scope transition of warnings, we need to disable this warning at the place of use/instantiation
// of some types (even though we already disabled it in the scope of the definition of these types).
// Moreover, this warning is by now little more than an annoyance: it is generated due to the lack
// of proper support for mangling custom GCC attributes into type names (usually when used
// with templates, even from the standard library).
#if defined __GNUC__ && __GNUC__ >= 6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
#ifdef ENABLE_ONEDNN_FOR_GPU
#include <oneapi/dnnl/dnnl_ocl.hpp>
#endif
namespace cldnn {
namespace ocl {
namespace {
// Converts a list of work sizes into an OpenCL NDRange.
// Lists with one, two or three entries produce an NDRange built from those entries;
// any other length (including an empty list) yields cl::NullRange.
inline cl::NDRange toNDRange(const std::vector<size_t>& v) {
    const size_t rank = v.size();
    if (rank == 1)
        return cl::NDRange(v[0]);
    if (rank == 2)
        return cl::NDRange(v[0], v[1]);
    if (rank == 3)
        return cl::NDRange(v[0], v[1], v[2]);
    return cl::NullRange;
}
void set_arguments_impl(ocl_kernel_type& kernel,
const arguments_desc& args,
const kernel_arguments_data& data) {
using args_t = argument_desc::Types;
using scalar_t = scalar_desc::Types;
for (uint32_t i = 0; i < static_cast<uint32_t>(args.size()); i++) {
cl_int status = CL_INVALID_ARG_VALUE;
switch (args[i].t) {
case args_t::INPUT:
if (args[i].index < data.inputs.size() && data.inputs[args[i].index]) {
const auto& input_mem = data.inputs[args[i].index];
if (input_mem) {
if (input_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(input_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
}
break;
case args_t::INPUT_OF_FUSED_PRIMITIVE:
if (args[i].index < data.fused_op_inputs.size() && data.fused_op_inputs[args[i].index]) {
const auto& input_mem = data.fused_op_inputs[args[i].index];
if (input_mem) {
if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
}
break;
case args_t::INTERNAL_BUFFER:
if (args[i].index < data.intermediates.size() && data.intermediates[args[i].index]) {
const auto& input_mem = data.intermediates[args[i].index];
if (input_mem) {
if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
}
break;
case args_t::OUTPUT:
if (data.output) {
if (data.output->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(data.output)->get_buffer());
else if (memory_capabilities::is_usm_type(data.output->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.output)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.output)->get_buffer());
}
break;
case args_t::WEIGHTS:
if (data.weights) {
if (data.weights->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(data.weights)->get_buffer());
else if (memory_capabilities::is_usm_type(data.weights->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.weights)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.weights)->get_buffer());
}
break;
case args_t::BIAS:
if (data.bias) {
if (memory_capabilities::is_usm_type(data.bias->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.bias)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.bias)->get_buffer());
}
break;
case args_t::WEIGHTS_ZERO_POINTS:
if (data.weights_zero_points) {
if (memory_capabilities::is_usm_type(data.weights_zero_points->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.weights_zero_points)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.weights_zero_points)->get_buffer());
}
break;
case args_t::ACTIVATIONS_ZERO_POINTS:
if (data.activations_zero_points) {
if (memory_capabilities::is_usm_type(data.activations_zero_points->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.activations_zero_points)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.activations_zero_points)->get_buffer());
}
break;
case args_t::COMPENSATION:
if (data.compensation) {
if (memory_capabilities::is_usm_type(data.compensation->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.compensation)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.compensation)->get_buffer());
}
break;
case args_t::SCALE_TABLE:
if (data.scale_table) {
if (memory_capabilities::is_usm_type(data.scale_table->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.scale_table)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.scale_table)->get_buffer());
}
break;
case args_t::SLOPE:
if (data.slope) {
if (memory_capabilities::is_usm_type(data.slope->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.slope)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.slope)->get_buffer());
}
break;
case args_t::SPLIT:
status = kernel.setArg(i, data.split);
break;
case args_t::SCALAR:
if (data.scalars && args[i].index < data.scalars->size()) {
const auto& scalar = (*data.scalars)[args[i].index];
switch (scalar.t) {
case scalar_t::UINT8:
status = kernel.setArg(i, scalar.v.u8);
break;
case scalar_t::UINT16:
status = kernel.setArg(i, scalar.v.u16);
break;
case scalar_t::UINT32:
status = kernel.setArg(i, scalar.v.u32);
break;
case scalar_t::UINT64:
status = kernel.setArg(i, scalar.v.u64);
break;
case scalar_t::INT8:
status = kernel.setArg(i, scalar.v.s8);
break;
case scalar_t::INT16:
status = kernel.setArg(i, scalar.v.s16);
break;
case scalar_t::INT32:
status = kernel.setArg(i, scalar.v.s32);
break;
case scalar_t::INT64:
status = kernel.setArg(i, scalar.v.s64);
break;
case scalar_t::FLOAT32:
status = kernel.setArg(i, scalar.v.f32);
break;
case scalar_t::FLOAT64:
status = kernel.setArg(i, scalar.v.f64);
break;
default:
break;
}
}
break;
case args_t::RECURRENT: // RNN/LSTM/GRU layers
if (data.recurrent) {
if (data.recurrent->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.recurrent).get_buffer());
else if (memory_capabilities::is_usm_type(data.recurrent->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.recurrent).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.recurrent).get_buffer());
}
break;
case args_t::HIDDEN: // RNN/LSTM/GRU layers
if (data.hidden) {
if (data.hidden->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.hidden).get_buffer());
else if (memory_capabilities::is_usm_type(data.hidden->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.hidden).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.hidden).get_buffer());
}
break;
case args_t::CELL: // LSTMlayers
if (data.cell) {
if (data.cell->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.cell).get_buffer());
else if (memory_capabilities::is_usm_type(data.cell->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.cell).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.cell).get_buffer());
}
break;
default:
break;
}
if (status != CL_SUCCESS) {
throw std::runtime_error("Error set arg " + std::to_string(i) + ", error code: " + std::to_string(status) + "\n");
}
}
}
// Chooses how the stream synchronizes work for the given engine configuration:
//  - events   when profiling is enabled,
//  - barriers when profiling is disabled and the queue is out-of-order,
//  - none     otherwise.
sync_methods get_expected_sync_method(const engine_configuration &config) {
    if (config.enable_profiling)
        return sync_methods::events;
    if (config.queue_type == queue_types::out_of_order)
        return sync_methods::barriers;
    return sync_methods::none;
}
} // namespace
// Creates a stream that owns a newly built command queue for `engine`.
//
// The queue is configured from the engine configuration (profiling, out-of-order execution,
// priority and throttle modes) and from the device capabilities (priority/throttle hint
// extensions, queue families support).
// Throws std::runtime_error if the selected sync method is `none` while the queue is out-of-order.
// When built with ENABLE_ONEDNN_FOR_GPU and the queue is in-order, a oneDNN stream is additionally
// created on top of the same command queue.
ocl_stream::ocl_stream(const ocl_engine &engine)
    : stream(engine.configuration().queue_type)
    , _engine(engine)
    , sync_method(get_expected_sync_method(engine.configuration())) {
    auto context = engine.get_cl_context();
    auto device = engine.get_cl_device();
    const auto& config = engine.configuration();

    ocl::command_queues_builder queue_builder;
    queue_builder.set_profiling(config.enable_profiling);
    queue_builder.set_out_of_order((config.queue_type == queue_types::out_of_order));

    if (sync_method == sync_methods::none && config.queue_type == queue_types::out_of_order) {
        throw std::runtime_error("[CLDNN] Unexpected sync method (none) is specified for out_of_order queue");
    }

    // Both hint extensions are only reported as usable together with cl_khr_create_command_queue.
    const bool create_queue_extension = engine.extension_supported("cl_khr_create_command_queue");

    const bool priority_extensions = engine.extension_supported("cl_khr_priority_hints") && create_queue_extension;
    queue_builder.set_priority_mode(config.priority_mode, priority_extensions);

    const bool throttle_extensions = engine.extension_supported("cl_khr_throttle_hints") && create_queue_extension;
    queue_builder.set_throttle_mode(config.throttle_mode, throttle_extensions);

    const bool queue_families_extension = engine.get_device_info().supports_queue_families;
    queue_builder.set_supports_queue_families(queue_families_extension);

    _command_queue = queue_builder.build(context, device);
#ifdef ENABLE_ONEDNN_FOR_GPU
    if (config.queue_type == queue_types::in_order) {
        _onednn_stream = std::make_shared<dnnl::stream>(dnnl::ocl_interop::make_stream(engine.get_onednn_engine(), _command_queue.get()));
    }
#endif
}
// Creates a stream on top of an externally provided command queue.
//
// `handle` is interpreted as a cl_command_queue and wrapped into the stream's queue object.
// NOTE(review): the `true` passed to ocl_queue_type presumably asks the wrapper to retain the
// user's handle - confirm against the ocl_queue_type constructor.
// Throws std::runtime_error if the queue type detected from the handle differs from the queue type
// in the engine configuration.
// When built with ENABLE_ONEDNN_FOR_GPU and the queue is in-order, a oneDNN stream is additionally
// created on top of the same command queue.
ocl_stream::ocl_stream(const ocl_engine &engine, void *handle)
    : stream(engine.configuration().queue_type)
    , _engine(engine)
    , sync_method(get_expected_sync_method(engine.configuration())) {
    auto casted_handle = static_cast<cl_command_queue>(handle);
    _command_queue = ocl_queue_type(casted_handle, true);

    if (ocl_stream::detect_queue_type(handle) != engine.configuration().queue_type)
        throw std::runtime_error("Inconsistent engine config and external user queue are passed to ocl_stream");
#ifdef ENABLE_ONEDNN_FOR_GPU
    if (engine.configuration().queue_type == queue_types::in_order) {
        _onednn_stream = std::make_shared<dnnl::stream>(dnnl::ocl_interop::make_stream(engine.get_onednn_engine(), _command_queue.get()));
    }
#endif
}
#ifdef ENABLE_ONEDNN_FOR_GPU
// Gives access to the oneDNN stream held by this stream.
// Throws std::runtime_error when no oneDNN stream has been created.
dnnl::stream& ocl_stream::get_onednn_stream() {
    if (_onednn_stream)
        return *_onednn_stream;
    throw std::runtime_error("[GPU] onednn stream is nullptr");
}
#endif
// Determines whether the command queue behind `queue_handle` is in-order or out-of-order.
// The handle is interpreted as a cl_command_queue and its CL_QUEUE_PROPERTIES are queried.
// Throws std::runtime_error if the properties query does not return CL_SUCCESS.
queue_types ocl_stream::detect_queue_type(void *queue_handle) {
    const auto user_queue = static_cast<cl_command_queue>(queue_handle);

    cl_command_queue_properties queue_props;
    const auto query_status =
        clGetCommandQueueInfo(user_queue, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &queue_props, nullptr);
    if (query_status != CL_SUCCESS) {
        throw std::runtime_error("Can't get queue properties for user handle\n");
    }

    if (queue_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
        return queue_types::out_of_order;
    return queue_types::in_order;
}
// Sets the arguments described by `args_desc` on `kernel`, using the values stored in `args`.
// A function-local static mutex is held for the whole call, so calls are executed one at a time.
// A cl::Error raised while the arguments are being set is rethrown as ocl_error.
void ocl_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args_desc, const kernel_arguments_data& args) {
    static std::mutex args_mutex;
    std::lock_guard<std::mutex> args_lock(args_mutex);

    auto& cl_kern = downcast<ocl::ocl_kernel>(kernel).get_handle();
    try {
        set_arguments_impl(cl_kern, args_desc.arguments, args);
    } catch (cl::Error const& cl_err) {
        throw ocl_error(cl_err);
    }
}
// Enqueues `kernel` on the stream's command queue with the work sizes taken from `args_desc`.
//
// Dependency handling follows the stream's sync method:
//  - events:   the non-null OpenCL events found in `deps` are passed as the wait list;
//  - barriers: sync_events() is called for `deps` before the kernel is enqueued;
//  - otherwise nothing is done with `deps`.
// An output event is requested from the queue only in events mode or when `is_output` is set;
// the returned event wraps it (or a default-constructed cl::Event) together with a new queue stamp.
// A cl::Error raised by the enqueue call is rethrown as ocl_error.
event::ptr ocl_stream::enqueue_kernel(kernel& kernel,
                                      const kernel_arguments_desc& args_desc,
                                      const kernel_arguments_data& /* args */,
                                      std::vector<event::ptr> const& deps,
                                      bool is_output) {
    auto& cl_kern = downcast<ocl::ocl_kernel>(kernel).get_handle();
    auto global_range = toNDRange(args_desc.workGroups.global);
    auto local_range = toNDRange(args_desc.workGroups.local);

    std::vector<cl::Event> wait_list;
    std::vector<cl::Event>* wait_list_ptr = nullptr;
    switch (sync_method) {
        case sync_methods::events:
            for (auto& dep : deps) {
                auto base_ev = std::dynamic_pointer_cast<ocl_base_event>(dep);
                if (base_ev && base_ev->get().get() != nullptr)
                    wait_list.push_back(base_ev->get());
            }
            wait_list_ptr = &wait_list;
            break;
        case sync_methods::barriers:
            sync_events(deps, is_output);
            break;
        default:
            break;
    }

    cl::Event kernel_ev;
    const bool wants_event = is_output || sync_method == sync_methods::events;
    cl::Event* kernel_ev_ptr = wants_event ? &kernel_ev : nullptr;
    try {
        _command_queue.enqueueNDRangeKernel(cl_kern, cl::NullRange, global_range, local_range, wait_list_ptr, kernel_ev_ptr);
    } catch (cl::Error const& cl_err) {
        throw ocl_error(cl_err);
    }
    return std::make_shared<ocl_event>(kernel_ev, ++_queue_counter);
}
void ocl_stream::enqueue_barrier() {
_command_queue.enqueueBarrierWithWaitList(nullptr, nullptr);
}
// Produces an event that stands for the completion of `deps`.
//
//  - No dependencies: an already-set user event is returned.
//  - Barriers mode: sync_events() is called and the last barrier event/stamp is returned.
//  - Events mode: a marker waiting on the non-null OpenCL events from `deps` is enqueued and its
//    event is returned with a new queue stamp; if no such events exist, an already-set user event
//    is returned instead.
//  - Any other sync method: an already-set user event is returned.
// A cl::Error raised in events mode is rethrown as ocl_error.
event::ptr ocl_stream::enqueue_marker(std::vector<event::ptr> const& deps, bool is_output) {
    if (deps.empty())
        return std::make_shared<ocl_user_event>(_engine.get_cl_context(), true);

    if (sync_method == sync_methods::barriers) {
        sync_events(deps, is_output);
        return std::make_shared<ocl_event>(_last_barrier_ev, _last_barrier);
    }

    if (sync_method != sync_methods::events)
        return std::make_shared<ocl_user_event>(_engine.get_cl_context(), true);

    cl::Event marker_ev;
    std::vector<cl::Event> wait_list;
    for (auto& dep : deps) {
        auto* base_ev = dynamic_cast<ocl_base_event*>(dep.get());
        if (base_ev != nullptr && base_ev->get().get() != nullptr)
            wait_list.push_back(base_ev->get());
    }

    try {
        if (wait_list.empty()) {
            return create_user_event(true);
        }
        _command_queue.enqueueMarkerWithWaitList(&wait_list, &marker_ev);
    } catch (cl::Error const& cl_err) {
        throw ocl_error(cl_err);
    }
    return std::make_shared<ocl_event>(marker_ev, ++_queue_counter);
}
// Wraps the given events into a single ocl_events object and returns it.
event::ptr ocl_stream::group_events(std::vector<event::ptr> const& deps) {
    event::ptr grouped = std::make_shared<ocl_events>(deps);
    return grouped;
}
// Creates an ocl_user_event in the engine's OpenCL context; `set` is forwarded to its constructor.
event::ptr ocl_stream::create_user_event(bool set) {
    event::ptr user_ev = std::make_shared<ocl_user_event>(_engine.get_cl_context(), set);
    return user_ev;
}
// Creates an ocl_event around a default-constructed cl::Event and assigns it a new queue stamp
// (the queue counter is incremented by this call).
event::ptr ocl_stream::create_base_event() {
    cl::Event empty_ev;
    event::ptr base_ev = std::make_shared<ocl_event>(empty_ev, ++_queue_counter);
    return base_ev;
}
void ocl_stream::flush() const { get_cl_queue().flush(); }
void ocl_stream::finish() const { get_cl_queue().finish(); }
// Blocks until the given events have completed.
//
// Events whose underlying OpenCL handle is null cannot be passed to cl::WaitForEvents. Such events
// are produced in this file by create_base_event() and by enqueue_kernel() when no output event is
// requested. For these, a barrier with an output event is enqueued on the stream's queue and waited
// on instead; the barrier is enqueued after everything already submitted to the queue.
// Does nothing when `events` is empty or when there is nothing left to wait for.
// A cl::Error raised by the barrier or the wait is rethrown as ocl_error.
void ocl_stream::wait_for_events(const std::vector<event::ptr>& events) {
    if (events.empty())
        return;

    bool has_event_without_handle = false;
    std::vector<cl::Event> clevents;
    clevents.reserve(events.size() + 1);
    for (auto& ev : events) {
        if (auto ocl_base_ev = downcast<ocl_base_event>(ev.get())) {
            if (ocl_base_ev->get().get() != nullptr)
                clevents.push_back(ocl_base_ev->get());
            else
                has_event_without_handle = true;
        }
    }

    try {
        if (has_event_without_handle) {
            cl::Event barrier_ev;
            _command_queue.enqueueBarrierWithWaitList(nullptr, &barrier_ev);
            clevents.push_back(barrier_ev);
        }
        // Waiting on an empty list is reported as an error by OpenCL, so skip the call entirely.
        if (clevents.empty())
            return;
        cl::WaitForEvents(clevents);
    } catch (cl::Error const& err) {
        throw ocl_error(err);
    }
}
// Enqueues a barrier if any dependency in `deps` carries a queue stamp newer than the last barrier.
//
// Dependencies that are null or are not ocl_base_event instances are ignored, the same way
// enqueue_kernel() and enqueue_marker() ignore them, instead of being dereferenced.
// When a barrier is enqueued and `is_output` is set, the barrier's event is stored in
// _last_barrier_ev; in either case _last_barrier receives a new queue stamp.
// A cl::Error raised by the enqueue call is rethrown as ocl_error.
void ocl_stream::sync_events(std::vector<event::ptr> const& deps, bool is_output) {
    bool needs_barrier = false;
    for (auto& dep : deps) {
        auto* ocl_base_ev = dynamic_cast<ocl_base_event*>(dep.get());
        if (ocl_base_ev == nullptr)
            continue;
        if (ocl_base_ev->get_queue_stamp() > _last_barrier) {
            needs_barrier = true;
            break;  // one newer dependency is enough to require the barrier
        }
    }

    if (!needs_barrier)
        return;

    try {
        if (is_output)
            _command_queue.enqueueBarrierWithWaitList(nullptr, &_last_barrier_ev);
        else
            _command_queue.enqueueBarrierWithWaitList(nullptr, nullptr);
    } catch (cl::Error const& err) {
        throw ocl_error(err);
    }
    _last_barrier = ++_queue_counter;
}
} // namespace ocl
} // namespace cldnn