[GPU] Perform memory transfer from usm_host to usm_device for dGPU only (#15263)

* [GPU] Perform memory transfer from usm_host to usm_device only for dGPU

* [GPU] Allocate new memory buffer for biases fusions to avoid original buffer modification since it may be used by other primitives
This commit is contained in:
Sergey Shlyapnikov 2023-01-26 10:44:38 +04:00 committed by GitHub
parent b44b4fcf2c
commit 6c22f06d07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 24 additions and 12 deletions

View File

@@ -329,31 +329,39 @@ void prepare_primitive_fusing::fuse_bias(program &p) {
     new_node.recalc_output_layout();
 };
-auto recalculate_biases = [&](data_node& original_node, data_node& new_node) -> bool {
+auto recalculate_biases = [&](data_node& original_node, data_node& second_node) -> bool {
 auto original_mem = original_node.get_attached_memory_ptr();
-auto new_mem = new_node.get_attached_memory_ptr();
-if (original_mem->count() != new_mem->count() || original_mem->get_layout().data_type != new_mem->get_layout().data_type)
+auto second_mem = second_node.get_attached_memory_ptr();
+if (original_mem->count() != second_mem->count() || original_mem->get_layout().data_type != second_mem->get_layout().data_type)
 return false;
 switch (original_mem->get_layout().data_type) {
 case data_types::f32: {
-mem_lock<float, mem_lock_type::write> original_bias_mem(original_mem, p.get_stream());
-mem_lock<float, mem_lock_type::read> new_bias_mem(new_mem, p.get_stream());
+cldnn::memory_ptr new_mem = p.get_engine().allocate_memory(original_mem->get_layout());
+mem_lock<float, mem_lock_type::write> new_bias_mem(new_mem, p.get_stream());
+mem_lock<float, mem_lock_type::read> original_bias_mem(original_mem, p.get_stream());
+mem_lock<float, mem_lock_type::read> second_bias_mem(second_mem, p.get_stream());
 float* original_data = original_bias_mem.data();
-float* new_data = new_bias_mem.data();
+float* new_data = second_bias_mem.data();
 for (size_t i = 0; i < original_bias_mem.size(); i++)
-original_data[i] += new_data[i];
+new_bias_mem[i] = original_data[i] + new_data[i];
+original_node.attach_memory(new_mem);
 break;
 }
 case data_types::f16: {
-mem_lock<uint16_t, mem_lock_type::write> original_bias_mem(original_mem, p.get_stream());
-mem_lock<uint16_t, mem_lock_type::read> new_bias_mem(new_mem, p.get_stream());
+cldnn::memory_ptr new_mem = p.get_engine().allocate_memory(original_mem->get_layout());
+mem_lock<uint16_t, mem_lock_type::write> new_bias_mem(new_mem, p.get_stream());
+mem_lock<uint16_t, mem_lock_type::read> original_bias_mem(original_mem, p.get_stream());
+mem_lock<uint16_t, mem_lock_type::read> second_bias_mem(second_mem, p.get_stream());
 uint16_t* original_data = original_bias_mem.data();
-uint16_t* new_data = new_bias_mem.data();
+uint16_t* new_data = second_bias_mem.data();
 for (size_t i = 0; i < original_bias_mem.size(); i++) {
 float new_val = half_to_float(original_data[i]) + half_to_float(new_data[i]);
-original_data[i] = float_to_half(new_val);
+new_bias_mem[i] = float_to_half(new_val);
 }
+original_node.attach_memory(new_mem);
 break;
 }
 default:

View File

@@ -1197,6 +1197,9 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
 if (!get_engine().supports_allocation(allocation_type::usm_device))
 return;
+if (get_engine().get_device_info().dev_type != device_type::discrete_gpu)
+return;
 if (alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) {
 // Allocate and transfer memory
 auto device_mem = inst_mem.get_engine()->allocate_memory(inst_mem.get_layout(), allocation_type::usm_device, false);

View File

@@ -563,7 +563,8 @@ void program::build_program(bool is_internal) {
 if (!is_internal) {
 prim_info = get_current_stage_info();
-transfer_memory_to_device();
+if (get_engine().get_device_info().dev_type == device_type::discrete_gpu)
+transfer_memory_to_device();
 }
 cleanup();