[IE CLDNN] USM refactoring (#6230)
This commit is contained in:
committed by
GitHub
parent
d49405a0e8
commit
730577294f
@@ -60,6 +60,10 @@ add_library("${CLDNN_BUILD__PROJ}" STATIC
|
||||
set_property(TARGET "${CLDNN_BUILD__PROJ}" PROPERTY PROJECT_LABEL "${CLDNN_BUILD__PROJ_LABEL}")
|
||||
set_property(TARGET "${CLDNN_BUILD__PROJ}" PROPERTY OUTPUT_NAME "${CLDNN_BUILD__PROJ_OUTPUT_NAME}")
|
||||
|
||||
if(COMMAND add_cpplint_target)
|
||||
add_cpplint_target("${CLDNN_BUILD__PROJ}_cpplint" FOR_TARGETS "${CLDNN_BUILD__PROJ}")
|
||||
endif()
|
||||
|
||||
if(COMMAND set_ie_threading_interface_for)
|
||||
set_ie_threading_interface_for("${CLDNN_BUILD__PROJ}")
|
||||
endif()
|
||||
|
||||
@@ -18,7 +18,7 @@ typedef CL_API_ENTRY cl_command_queue(CL_API_CALL* pfn_clCreateCommandQueueWithP
|
||||
const cl_queue_properties* properties,
|
||||
cl_int* errcodeRet);
|
||||
|
||||
using ocl_queue_type = cl::CommandQueueIntel;
|
||||
using ocl_queue_type = cl::CommandQueue;
|
||||
using ocl_kernel_type = cl::KernelIntel;
|
||||
|
||||
class ocl_error : public std::runtime_error {
|
||||
|
||||
@@ -47,6 +47,7 @@ ocl_engine::ocl_engine(const device::ptr dev, runtime_types runtime_type, const
|
||||
casted->get_device().getInfo(CL_DEVICE_EXTENSIONS, &_extensions);
|
||||
|
||||
_program_stream.reset(new ocl_stream(*this));
|
||||
_usm_helper.reset(new cl::UsmHelper(get_cl_context(), get_cl_device(), use_unified_shared_memory()));
|
||||
}
|
||||
|
||||
const cl::Context& ocl_engine::get_cl_context() const {
|
||||
@@ -63,6 +64,10 @@ const cl::Device& ocl_engine::get_cl_device() const {
|
||||
return cl_device->get_device();
|
||||
}
|
||||
|
||||
const cl::UsmHelper& ocl_engine::get_usm_helper() const {
|
||||
return *_usm_helper;
|
||||
}
|
||||
|
||||
memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) {
|
||||
if (layout.bytes_count() > get_device_info().max_alloc_mem_size) {
|
||||
throw std::runtime_error("exceeded max size of memory object allocation");
|
||||
|
||||
@@ -35,6 +35,7 @@ public:
|
||||
|
||||
const cl::Context& get_cl_context() const;
|
||||
const cl::Device& get_cl_device() const;
|
||||
const cl::UsmHelper& get_usm_helper() const;
|
||||
|
||||
bool extension_supported(std::string extension) const;
|
||||
|
||||
@@ -42,9 +43,11 @@ public:
|
||||
stream& get_program_stream() const override;
|
||||
|
||||
static std::shared_ptr<cldnn::engine> create(const device::ptr device, runtime_types runtime_type, const engine_configuration& configuration);
|
||||
|
||||
private:
|
||||
std::string _extensions;
|
||||
std::unique_ptr<stream> _program_stream;
|
||||
std::unique_ptr<cl::UsmHelper> _usm_helper;
|
||||
};
|
||||
|
||||
} // namespace ocl
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -3,6 +3,7 @@
|
||||
//
|
||||
|
||||
#include "ocl_kernel.hpp"
|
||||
#include "ocl_engine.hpp"
|
||||
#include "kernels_factory.hpp"
|
||||
|
||||
#include <memory>
|
||||
@@ -14,8 +15,9 @@ namespace ocl {
|
||||
std::shared_ptr<kernel> create_ocl_kernel(engine& engine, cl_context /* context */, cl_kernel kernel, std::string entry_point) {
|
||||
// Retain kernel to keep it valid
|
||||
cl::Kernel k(kernel, true);
|
||||
return std::make_shared<ocl::ocl_kernel>(ocl::ocl_kernel_type(k, engine.use_unified_shared_memory()), entry_point);
|
||||
ocl_engine& cl_engine = dynamic_cast<ocl_engine&>(engine);
|
||||
return std::make_shared<ocl::ocl_kernel>(ocl::ocl_kernel_type(k, cl_engine.get_usm_helper()), entry_point);
|
||||
}
|
||||
|
||||
} // namespace kernels_factory
|
||||
} // namespace ocl
|
||||
} // namespace cldnn
|
||||
|
||||
@@ -241,9 +241,7 @@ shared_mem_params gpu_dx_buffer::get_internal_params() const {
|
||||
}
|
||||
#endif
|
||||
|
||||
gpu_usm::gpu_usm(ocl_engine* engine,
|
||||
const layout& new_layout, const cl::UsmMemory& buffer,
|
||||
allocation_type type)
|
||||
gpu_usm::gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& buffer, allocation_type type)
|
||||
: lockable_gpu_mem()
|
||||
, memory(engine, new_layout, type, true)
|
||||
, _buffer(buffer) {
|
||||
@@ -252,17 +250,16 @@ gpu_usm::gpu_usm(ocl_engine* engine,
|
||||
gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type)
|
||||
: lockable_gpu_mem()
|
||||
, memory(engine, layout, type, false)
|
||||
, _buffer(engine->get_cl_context()) {
|
||||
auto device = engine->get_cl_device();
|
||||
, _buffer(engine->get_usm_helper()) {
|
||||
switch (get_allocation_type()) {
|
||||
case allocation_type::usm_host:
|
||||
_buffer.allocateHost(_bytes_count);
|
||||
break;
|
||||
case allocation_type::usm_shared:
|
||||
_buffer.allocateShared(device, _bytes_count);
|
||||
_buffer.allocateShared(_bytes_count);
|
||||
break;
|
||||
case allocation_type::usm_device:
|
||||
_buffer.allocateDevice(device, _bytes_count);
|
||||
_buffer.allocateDevice(_bytes_count);
|
||||
break;
|
||||
default:
|
||||
CLDNN_ERROR_MESSAGE("gpu_usm allocation type",
|
||||
@@ -294,12 +291,12 @@ event::ptr gpu_usm::fill(stream& stream, unsigned char pattern) {
|
||||
auto ev = stream.create_base_event();
|
||||
cl::Event ev_ocl = downcast<base_event>(ev.get())->get();
|
||||
// enqueueFillUsm call will never finish. Driver bug? Uncomment when fixed. Some older drivers don't support the enqueueFillUsm call at all.
|
||||
// cl_stream.get_cl_queue().enqueueFillUsm<unsigned char>(_buffer, pattern, _bytes_count, nullptr, &ev_ocl)
|
||||
// cl_stream.get_usm_helper().enqueue_fill_mem<unsigned char>(cl_stream.get_cl_queue(), _buffer.get(), pattern, _bytes_count, nullptr, &ev_ocl)
|
||||
// Worked around with enqueue_memcpy. TODO: Remove the code below and uncomment the call above.
|
||||
std::vector<unsigned char> temp_buffer(_bytes_count, pattern);
|
||||
// TODO: Do we really need a blocking call here? The non-blocking one causes accuracy issues right now, but hopefully that can be fixed in a more performant way.
|
||||
const bool blocking = true;
|
||||
cl::usm::enqueue_memcpy(cl_stream.get_cl_queue(), _buffer.get(), temp_buffer.data(), _bytes_count, blocking, nullptr, &ev_ocl);
|
||||
cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), _buffer.get(), temp_buffer.data(), _bytes_count, blocking, nullptr, &ev_ocl);
|
||||
|
||||
return ev;
|
||||
}
|
||||
@@ -317,7 +314,13 @@ event::ptr gpu_usm::fill(stream& stream) {
|
||||
event::ptr gpu_usm::copy_from(stream& stream, const memory& other) {
|
||||
auto& cl_stream = downcast<const ocl_stream>(stream);
|
||||
auto& casted = downcast<const gpu_usm>(other);
|
||||
cl_stream.get_cl_queue().enqueueCopyUsm(casted.get_buffer(), get_buffer(), _bytes_count, true);
|
||||
auto dst_ptr = get_buffer().get();
|
||||
auto src_ptr = casted.get_buffer().get();
|
||||
cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(),
|
||||
dst_ptr,
|
||||
src_ptr,
|
||||
_bytes_count,
|
||||
true);
|
||||
return stream.create_user_event(true);
|
||||
}
|
||||
|
||||
|
||||
@@ -82,6 +82,8 @@ public:
|
||||
event::ptr create_base_event() override;
|
||||
void release_events_pool() override;
|
||||
|
||||
const cl::UsmHelper& get_usm_helper() const { return _engine.get_usm_helper(); }
|
||||
|
||||
private:
|
||||
void sync_events(std::vector<event::ptr> const& deps, bool is_output = false);
|
||||
|
||||
|
||||
@@ -78,26 +78,24 @@ TEST_P(ctor_test, basic) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
cl::UsmMemory mem(_device->get_context());
|
||||
auto cl_dev = _device->get_device();
|
||||
cl::UsmMemory mem(_engine->get_usm_helper());
|
||||
switch (p.type) {
|
||||
case allocation_type::usm_host: {
|
||||
mem.allocateHost(1);
|
||||
break;
|
||||
}
|
||||
case allocation_type::usm_shared: {
|
||||
mem.allocateShared(cl_dev, 1);
|
||||
mem.allocateShared(1);
|
||||
break;
|
||||
}
|
||||
case allocation_type::usm_device: {
|
||||
mem.allocateDevice(cl_dev, 1);
|
||||
mem.allocateDevice(1);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
FAIL() << "Not supported allocation type!";
|
||||
}
|
||||
ASSERT_NE(nullptr, mem.get());
|
||||
ASSERT_EQ(mem.use_count(), 1);
|
||||
}
|
||||
catch (...) {
|
||||
FAIL() << "Test failed, ctor of usm mems failed.";
|
||||
@@ -192,19 +190,20 @@ TEST_P(fill_buffer, DISABLED_basic) {
|
||||
try {
|
||||
ocl::ocl_stream stream(*_engine);
|
||||
auto queue = stream.get_cl_queue();
|
||||
auto usm_helper = stream.get_usm_helper();
|
||||
|
||||
size_t values_count = 100;
|
||||
size_t values_bytes_count = values_count * sizeof(float);
|
||||
cl::UsmMemory mem(_device->get_context());
|
||||
cl::UsmMemory mem(usm_helper);
|
||||
switch (p.type) {
|
||||
case allocation_type::usm_host:
|
||||
mem.allocateHost(values_bytes_count);
|
||||
break;
|
||||
case allocation_type::usm_shared:
|
||||
mem.allocateShared(_device->get_device(), values_bytes_count);
|
||||
mem.allocateShared(values_bytes_count);
|
||||
break;
|
||||
case allocation_type::usm_device:
|
||||
mem.allocateDevice(_device->get_device(), values_bytes_count);
|
||||
mem.allocateDevice(values_bytes_count);
|
||||
break;
|
||||
default:
|
||||
FAIL() << "Not supported allocation type!";
|
||||
@@ -212,9 +211,11 @@ TEST_P(fill_buffer, DISABLED_basic) {
|
||||
// Fill buffer. Note: this can fail with an old driver that does not support the USM fill API.
|
||||
cl::Event ev;
|
||||
unsigned char pattern = 0;
|
||||
queue.enqueueFillUsm<unsigned char>(
|
||||
mem,
|
||||
pattern,
|
||||
usm_helper.enqueue_fill_mem(
|
||||
queue,
|
||||
mem.get(),
|
||||
static_cast<const void*>(&pattern),
|
||||
sizeof(unsigned char),
|
||||
values_bytes_count,
|
||||
nullptr,
|
||||
&ev
|
||||
@@ -232,11 +233,12 @@ TEST_P(fill_buffer, DISABLED_basic) {
|
||||
break;
|
||||
}
|
||||
case allocation_type::usm_device: {
|
||||
cl::UsmMemory host_mem(_device->get_context());
|
||||
cl::UsmMemory host_mem(usm_helper);
|
||||
host_mem.allocateHost(values_bytes_count);
|
||||
queue.enqueueCopyUsm(
|
||||
mem,
|
||||
host_mem,
|
||||
usm_helper.enqueue_memcpy(
|
||||
queue,
|
||||
host_mem.get(),
|
||||
mem.get(),
|
||||
values_bytes_count,
|
||||
true
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user