[IE CLDNN] USM refactoring (#6230)

This commit is contained in:
Vladimir Paramuzov
2021-06-24 13:05:49 +03:00
committed by GitHub
parent d49405a0e8
commit 730577294f
9 changed files with 526 additions and 557 deletions

View File

@@ -60,6 +60,10 @@ add_library("${CLDNN_BUILD__PROJ}" STATIC
set_property(TARGET "${CLDNN_BUILD__PROJ}" PROPERTY PROJECT_LABEL "${CLDNN_BUILD__PROJ_LABEL}")
set_property(TARGET "${CLDNN_BUILD__PROJ}" PROPERTY OUTPUT_NAME "${CLDNN_BUILD__PROJ_OUTPUT_NAME}")
if(COMMAND add_cpplint_target)
add_cpplint_target("${CLDNN_BUILD__PROJ}_cpplint" FOR_TARGETS "${CLDNN_BUILD__PROJ}")
endif()
if(COMMAND set_ie_threading_interface_for)
set_ie_threading_interface_for("${CLDNN_BUILD__PROJ}")
endif()

View File

@@ -18,7 +18,7 @@ typedef CL_API_ENTRY cl_command_queue(CL_API_CALL* pfn_clCreateCommandQueueWithP
const cl_queue_properties* properties,
cl_int* errcodeRet);
using ocl_queue_type = cl::CommandQueueIntel;
using ocl_queue_type = cl::CommandQueue;
using ocl_kernel_type = cl::KernelIntel;
class ocl_error : public std::runtime_error {

View File

@@ -47,6 +47,7 @@ ocl_engine::ocl_engine(const device::ptr dev, runtime_types runtime_type, const
casted->get_device().getInfo(CL_DEVICE_EXTENSIONS, &_extensions);
_program_stream.reset(new ocl_stream(*this));
_usm_helper.reset(new cl::UsmHelper(get_cl_context(), get_cl_device(), use_unified_shared_memory()));
}
const cl::Context& ocl_engine::get_cl_context() const {
@@ -63,6 +64,10 @@ const cl::Device& ocl_engine::get_cl_device() const {
return cl_device->get_device();
}
// Returns a const reference to the cl::UsmHelper owned by this engine through _usm_helper.
// NOTE(review): _usm_helper is dereferenced without a null check — presumably it is always
// populated by the ocl_engine constructor before this accessor is reachable; confirm.
const cl::UsmHelper& ocl_engine::get_usm_helper() const {
return *_usm_helper;
}
memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) {
if (layout.bytes_count() > get_device_info().max_alloc_mem_size) {
throw std::runtime_error("exceeded max size of memory object allocation");

View File

@@ -35,6 +35,7 @@ public:
const cl::Context& get_cl_context() const;
const cl::Device& get_cl_device() const;
const cl::UsmHelper& get_usm_helper() const;
bool extension_supported(std::string extension) const;
@@ -42,9 +43,11 @@ public:
stream& get_program_stream() const override;
static std::shared_ptr<cldnn::engine> create(const device::ptr device, runtime_types runtime_type, const engine_configuration& configuration);
private:
std::string _extensions;
std::unique_ptr<stream> _program_stream;
std::unique_ptr<cl::UsmHelper> _usm_helper;
};
} // namespace ocl

File diff suppressed because it is too large Load Diff

View File

@@ -3,6 +3,7 @@
//
#include "ocl_kernel.hpp"
#include "ocl_engine.hpp"
#include "kernels_factory.hpp"
#include <memory>
@@ -14,8 +15,9 @@ namespace ocl {
// Wraps a raw cl_kernel handle in a shared ocl::ocl_kernel tagged with the given entry point name.
// The cl_context argument is unused (its parameter name is commented out).
// NOTE(review): two return statements appear below. This listing looks like a rendered diff whose
// +/- markers were stripped, so the first return (use_unified_shared_memory) is presumably the
// removed pre-change line and the dynamic_cast/get_usm_helper pair its replacement — confirm
// against the real file. Read literally, everything after the first return is unreachable.
std::shared_ptr<kernel> create_ocl_kernel(engine& engine, cl_context /* context */, cl_kernel kernel, std::string entry_point) {
// Retain kernel to keep it valid
cl::Kernel k(kernel, true);
return std::make_shared<ocl::ocl_kernel>(ocl::ocl_kernel_type(k, engine.use_unified_shared_memory()), entry_point);
// A failing dynamic_cast to a reference type throws std::bad_cast, so a non-OCL engine is rejected here.
ocl_engine& cl_engine = dynamic_cast<ocl_engine&>(engine);
return std::make_shared<ocl::ocl_kernel>(ocl::ocl_kernel_type(k, cl_engine.get_usm_helper()), entry_point);
}
} // namespace kernels_factory
} // namespace ocl
} // namespace cldnn

View File

@@ -241,9 +241,7 @@ shared_mem_params gpu_dx_buffer::get_internal_params() const {
}
#endif
gpu_usm::gpu_usm(ocl_engine* engine,
const layout& new_layout, const cl::UsmMemory& buffer,
allocation_type type)
gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& buffer, allocation_type type)
: lockable_gpu_mem()
, memory(engine, new_layout, type, true)
, _buffer(buffer) {
@@ -252,17 +250,16 @@ gpu_usm::gpu_usm(ocl_engine* engine,
gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type)
: lockable_gpu_mem()
, memory(engine, layout, type, false)
, _buffer(engine->get_cl_context()) {
auto device = engine->get_cl_device();
, _buffer(engine->get_usm_helper()) {
switch (get_allocation_type()) {
case allocation_type::usm_host:
_buffer.allocateHost(_bytes_count);
break;
case allocation_type::usm_shared:
_buffer.allocateShared(device, _bytes_count);
_buffer.allocateShared(_bytes_count);
break;
case allocation_type::usm_device:
_buffer.allocateDevice(device, _bytes_count);
_buffer.allocateDevice(_bytes_count);
break;
default:
CLDNN_ERROR_MESSAGE("gpu_usm allocation type",
@@ -294,12 +291,12 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) {
auto ev = stream.create_base_event();
cl::Event ev_ocl = downcast<base_event>(ev.get())->get();
// enqueueFillUsm call will never finish. Driver bug? Uncomment when fixed. Some older drivers don't support the enqueueFillUsm call at all.
// cl_stream.get_cl_queue().enqueueFillUsm<unsigned char>(_buffer, pattern, _bytes_count, nullptr, &ev_ocl)
// cl_stream.get_usm_helper().enqueue_fill_mem<unsigned char>(cl_stream.get_cl_queue(), _buffer.get(), pattern, _bytes_count, nullptr, &ev_ocl)
// Worked around with enqueue_memcpy. ToDo: Remove the code below and uncomment the call above.
std::vector<unsigned char> temp_buffer(_bytes_count, pattern);
// TODO: Do we really need blocking call here? Non-blocking one causes accuracy issues right now, but hopefully it can be fixed in more performant way.
const bool blocking = true;
cl::usm::enqueue_memcpy(cl_stream.get_cl_queue(), _buffer.get(), temp_buffer.data(), _bytes_count, blocking, nullptr, &ev_ocl);
cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), _buffer.get(), temp_buffer.data(), _bytes_count, blocking, nullptr, &ev_ocl);
return ev;
}
@@ -317,7 +314,13 @@ event::ptr gpu_usm::fill(stream& stream) {
// Copies _bytes_count bytes from the USM buffer of `other` into this object's USM buffer and
// returns the event produced by stream.create_user_event(true).
// `stream` is downcast to ocl_stream and `other` to gpu_usm before use.
// NOTE(review): both an enqueueCopyUsm call and an enqueue_memcpy call appear below. This listing
// looks like a rendered diff with the +/- markers stripped, so the enqueueCopyUsm line is
// presumably the removed pre-change call — confirm against the real file. Read literally, the
// copy would be issued twice.
event::ptr gpu_usm::copy_from(stream& stream, const memory& other) {
auto& cl_stream = downcast<const ocl_stream>(stream);
auto& casted = downcast<const gpu_usm>(other);
cl_stream.get_cl_queue().enqueueCopyUsm(casted.get_buffer(), get_buffer(), _bytes_count, true);
// Raw USM pointers: destination is this object's buffer, source is the other memory's buffer.
auto dst_ptr = get_buffer().get();
auto src_ptr = casted.get_buffer().get();
// The trailing `true` sits where gpu_usm::fill passes its `blocking` flag — presumably a blocking copy; confirm.
cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(),
dst_ptr,
src_ptr,
_bytes_count,
true);
return stream.create_user_event(true);
}

View File

@@ -82,6 +82,8 @@ public:
event::ptr create_base_event() override;
void release_events_pool() override;
// Convenience accessor: forwards to _engine.get_usm_helper() and returns the same reference.
const cl::UsmHelper& get_usm_helper() const { return _engine.get_usm_helper(); }
private:
void sync_events(std::vector<event::ptr> const& deps, bool is_output = false);

View File

@@ -78,26 +78,24 @@ TEST_P(ctor_test, basic) {
return;
}
try {
cl::UsmMemory mem(_device->get_context());
auto cl_dev = _device->get_device();
cl::UsmMemory mem(_engine->get_usm_helper());
switch (p.type) {
case allocation_type::usm_host: {
mem.allocateHost(1);
break;
}
case allocation_type::usm_shared: {
mem.allocateShared(cl_dev, 1);
mem.allocateShared(1);
break;
}
case allocation_type::usm_device: {
mem.allocateDevice(cl_dev, 1);
mem.allocateDevice(1);
break;
}
default:
FAIL() << "Not supported allocation type!";
}
ASSERT_NE(nullptr, mem.get());
ASSERT_EQ(mem.use_count(), 1);
}
catch (...) {
FAIL() << "Test failed, ctor of usm mems failed.";
@@ -192,19 +190,20 @@ TEST_P(fill_buffer, DISABLED_basic) {
try {
ocl::ocl_stream stream(*_engine);
auto queue = stream.get_cl_queue();
auto usm_helper = stream.get_usm_helper();
size_t values_count = 100;
size_t values_bytes_count = values_count * sizeof(float);
cl::UsmMemory mem(_device->get_context());
cl::UsmMemory mem(usm_helper);
switch (p.type) {
case allocation_type::usm_host:
mem.allocateHost(values_bytes_count);
break;
case allocation_type::usm_shared:
mem.allocateShared(_device->get_device(), values_bytes_count);
mem.allocateShared(values_bytes_count);
break;
case allocation_type::usm_device:
mem.allocateDevice(_device->get_device(), values_bytes_count);
mem.allocateDevice(values_bytes_count);
break;
default:
FAIL() << "Not supported allocation type!";
@@ -212,9 +211,11 @@ TEST_P(fill_buffer, DISABLED_basic) {
// Fill buffer !! This can fail with old driver, which does not support fill usm api.
cl::Event ev;
unsigned char pattern = 0;
queue.enqueueFillUsm<unsigned char>(
mem,
pattern,
usm_helper.enqueue_fill_mem(
queue,
mem.get(),
static_cast<const void*>(&pattern),
sizeof(unsigned char),
values_bytes_count,
nullptr,
&ev
@@ -232,11 +233,12 @@ TEST_P(fill_buffer, DISABLED_basic) {
break;
}
case allocation_type::usm_device: {
cl::UsmMemory host_mem(_device->get_context());
cl::UsmMemory host_mem(usm_helper);
host_mem.allocateHost(values_bytes_count);
queue.enqueueCopyUsm(
mem,
host_mem,
usm_helper.enqueue_memcpy(
queue,
host_mem.get(),
mem.get(),
values_bytes_count,
true
);