[GPU] Perform memory transfer from usm_host to usm_device for dGPU only (#15263)
* [GPU] Perform memory transfer from usm_host to usm_device only for dGPU
* [GPU] Allocate a new memory buffer for bias fusions to avoid modifying the original buffer, since it may be used by other primitives
This commit is contained in:
parent
b44b4fcf2c
commit
6c22f06d07
@ -329,31 +329,39 @@ void prepare_primitive_fusing::fuse_bias(program &p) {
|
||||
new_node.recalc_output_layout();
|
||||
};
|
||||
|
||||
auto recalculate_biases = [&](data_node& original_node, data_node& new_node) -> bool {
|
||||
auto recalculate_biases = [&](data_node& original_node, data_node& second_node) -> bool {
|
||||
auto original_mem = original_node.get_attached_memory_ptr();
|
||||
auto new_mem = new_node.get_attached_memory_ptr();
|
||||
if (original_mem->count() != new_mem->count() || original_mem->get_layout().data_type != new_mem->get_layout().data_type)
|
||||
auto second_mem = second_node.get_attached_memory_ptr();
|
||||
if (original_mem->count() != second_mem->count() || original_mem->get_layout().data_type != second_mem->get_layout().data_type)
|
||||
return false;
|
||||
|
||||
switch (original_mem->get_layout().data_type) {
|
||||
case data_types::f32: {
|
||||
mem_lock<float, mem_lock_type::write> original_bias_mem(original_mem, p.get_stream());
|
||||
mem_lock<float, mem_lock_type::read> new_bias_mem(new_mem, p.get_stream());
|
||||
cldnn::memory_ptr new_mem = p.get_engine().allocate_memory(original_mem->get_layout());
|
||||
mem_lock<float, mem_lock_type::write> new_bias_mem(new_mem, p.get_stream());
|
||||
mem_lock<float, mem_lock_type::read> original_bias_mem(original_mem, p.get_stream());
|
||||
mem_lock<float, mem_lock_type::read> second_bias_mem(second_mem, p.get_stream());
|
||||
float* original_data = original_bias_mem.data();
|
||||
float* new_data = new_bias_mem.data();
|
||||
float* new_data = second_bias_mem.data();
|
||||
for (size_t i = 0; i < original_bias_mem.size(); i++)
|
||||
original_data[i] += new_data[i];
|
||||
new_bias_mem[i] = original_data[i] + new_data[i];
|
||||
|
||||
original_node.attach_memory(new_mem);
|
||||
break;
|
||||
}
|
||||
case data_types::f16: {
|
||||
mem_lock<uint16_t, mem_lock_type::write> original_bias_mem(original_mem, p.get_stream());
|
||||
mem_lock<uint16_t, mem_lock_type::read> new_bias_mem(new_mem, p.get_stream());
|
||||
cldnn::memory_ptr new_mem = p.get_engine().allocate_memory(original_mem->get_layout());
|
||||
mem_lock<uint16_t, mem_lock_type::write> new_bias_mem(new_mem, p.get_stream());
|
||||
mem_lock<uint16_t, mem_lock_type::read> original_bias_mem(original_mem, p.get_stream());
|
||||
mem_lock<uint16_t, mem_lock_type::read> second_bias_mem(second_mem, p.get_stream());
|
||||
uint16_t* original_data = original_bias_mem.data();
|
||||
uint16_t* new_data = new_bias_mem.data();
|
||||
uint16_t* new_data = second_bias_mem.data();
|
||||
for (size_t i = 0; i < original_bias_mem.size(); i++) {
|
||||
float new_val = half_to_float(original_data[i]) + half_to_float(new_data[i]);
|
||||
original_data[i] = float_to_half(new_val);
|
||||
new_bias_mem[i] = float_to_half(new_val);
|
||||
}
|
||||
|
||||
original_node.attach_memory(new_mem);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
@ -1197,6 +1197,9 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
|
||||
if (!get_engine().supports_allocation(allocation_type::usm_device))
|
||||
return;
|
||||
|
||||
if (get_engine().get_device_info().dev_type != device_type::discrete_gpu)
|
||||
return;
|
||||
|
||||
if (alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) {
|
||||
// Allocate and transfer memory
|
||||
auto device_mem = inst_mem.get_engine()->allocate_memory(inst_mem.get_layout(), allocation_type::usm_device, false);
|
||||
|
@ -563,7 +563,8 @@ void program::build_program(bool is_internal) {
|
||||
|
||||
if (!is_internal) {
|
||||
prim_info = get_current_stage_info();
|
||||
transfer_memory_to_device();
|
||||
if (get_engine().get_device_info().dev_type == device_type::discrete_gpu)
|
||||
transfer_memory_to_device();
|
||||
}
|
||||
|
||||
cleanup();
|
||||
|
Loading…
Reference in New Issue
Block a user