[GPU] Allocate internal buffer to usm_device (#7109)
* Allocate internal buffers to usm_device when one of the input tensors is from usm_device. Allocate output tensors to usm_device if there is no user with a cpu impl. * Move intermediate buffer allocation to primitive_inst * Allocate to usm_host when the internal buffer allocation would come close to the device memory limit * Remove internal_buffer_info and replace it with a vector of layouts. Updated conditions to use alloc_type w.r.t. its availability. * Allocate internal buffers within primitive_inst construction * Fixed the device_mem allocation condition to align with the driver team's guidance - A single allocation should be less than CL_DEVICE_MAX_MEM_ALLOC_SIZE - The total allocation for a kernel should be less than CL_DEVICE_GLOBAL_MEM_SIZE * Apply review comments
This commit is contained in:
parent
702633073e
commit
30ddd06159
@ -73,6 +73,12 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
|
|||||||
throw std::runtime_error("exceeded max size of memory object allocation");
|
throw std::runtime_error("exceeded max size of memory object allocation");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (type != allocation_type::cl_mem && !supports_allocation(type)) {
|
||||||
|
std::ostringstream type_str;
|
||||||
|
type_str << type;
|
||||||
|
throw std::runtime_error("Unsupported allocation type " + type_str.str());
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
memory::ptr res = nullptr;
|
memory::ptr res = nullptr;
|
||||||
if (layout.format.is_image_2d()) {
|
if (layout.format.is_image_2d()) {
|
||||||
|
@ -23,6 +23,7 @@ public:
|
|||||||
|
|
||||||
void init_kernels() override {}
|
void init_kernels() override {}
|
||||||
void set_arguments(primitive_inst& /*instance*/) override {}
|
void set_arguments(primitive_inst& /*instance*/) override {}
|
||||||
|
// Reports an empty list of internal buffer layouts for this implementation.
std::vector<layout> get_internal_buffer_layouts() const override {
    return std::vector<layout>{};
}
|
||||||
|
|
||||||
event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
|
event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
|
||||||
auto& stream = instance.get_network().get_stream();
|
auto& stream = instance.get_network().get_stream();
|
||||||
|
@ -1,32 +0,0 @@
|
|||||||
// Copyright (C) 2018-2021 Intel Corporation
|
|
||||||
// SPDX-License-Identifier: Apache-2.0
|
|
||||||
//
|
|
||||||
|
|
||||||
#include "primitive_base.hpp"
|
|
||||||
#include <list>
|
|
||||||
|
|
||||||
namespace cldnn {
|
|
||||||
namespace ocl {
|
|
||||||
|
|
||||||
bool is_user_cpu(const program_node* user) {
|
|
||||||
if (user->can_be_optimized()) {
|
|
||||||
auto users = user->get_users();
|
|
||||||
for (const auto& u : users) {
|
|
||||||
if (is_user_cpu(u)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return user->get_selected_impl()->is_cpu();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool is_any_user_cpu(const std::list<const program_node*>& users) {
|
|
||||||
for (const auto& user : users) {
|
|
||||||
if (is_user_cpu(user))
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} // namespace ocl
|
|
||||||
} // namespace cldnn
|
|
@ -20,9 +20,6 @@
|
|||||||
namespace cldnn {
|
namespace cldnn {
|
||||||
namespace ocl {
|
namespace ocl {
|
||||||
|
|
||||||
// checks if any user in a list is a cpu primitive
|
|
||||||
bool is_any_user_cpu(const std::list<const program_node*>& users);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Base class for all GPU implementation of specified primitive type.
|
Base class for all GPU implementation of specified primitive type.
|
||||||
For example, all gpu convolution implementations should derive from typed_primitive_impl_ocl<convolution>.
|
For example, all gpu convolution implementations should derive from typed_primitive_impl_ocl<convolution>.
|
||||||
@ -33,28 +30,17 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
|
|||||||
kernel_selector::kernel_data _kernel_data;
|
kernel_selector::kernel_data _kernel_data;
|
||||||
std::vector<kernel_id> _kernel_ids;
|
std::vector<kernel_id> _kernel_ids;
|
||||||
std::vector<kernel::ptr> _kernels;
|
std::vector<kernel::ptr> _kernels;
|
||||||
std::vector<memory::cptr> _intermediates_memory;
|
|
||||||
|
|
||||||
typed_primitive_impl_ocl(const typed_primitive_impl_ocl<PType>& other)
|
typed_primitive_impl_ocl(const typed_primitive_impl_ocl<PType>& other)
|
||||||
: typed_primitive_impl<PType>(other._weights_reorder_params, other._kernel_name)
|
: typed_primitive_impl<PType>(other._weights_reorder_params, other._kernel_name)
|
||||||
, _outer(other._outer)
|
, _outer(other._outer)
|
||||||
, _kernel_data(other._kernel_data)
|
, _kernel_data(other._kernel_data)
|
||||||
, _kernel_ids(other._kernel_ids)
|
, _kernel_ids(other._kernel_ids)
|
||||||
, _kernels({})
|
, _kernels({}) {
|
||||||
, _intermediates_memory({}) {
|
|
||||||
_kernels.reserve(other._kernels.size());
|
_kernels.reserve(other._kernels.size());
|
||||||
for (size_t k = 0; k < other._kernels.size(); ++k) {
|
for (size_t k = 0; k < other._kernels.size(); ++k) {
|
||||||
_kernels.emplace_back(other._kernels[k]->clone());
|
_kernels.emplace_back(other._kernels[k]->clone());
|
||||||
}
|
}
|
||||||
for (auto& mem : other._intermediates_memory) {
|
|
||||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
|
||||||
GPU_DEBUG_IF(debug_config->verbose >= 2) {
|
|
||||||
GPU_DEBUG_COUT << "[" << _kernel_data.params->layerID << ": internal buf]" << std::endl;
|
|
||||||
}
|
|
||||||
auto& engine = _outer.get_program().get_engine();
|
|
||||||
auto new_mem = engine.allocate_memory(mem->get_layout(), mem->get_allocation_type());
|
|
||||||
_intermediates_memory.push_back(new_mem);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
typed_primitive_impl_ocl(const typed_program_node<PType>& arg, const kernel_selector::kernel_data& kd)
|
typed_primitive_impl_ocl(const typed_program_node<PType>& arg, const kernel_selector::kernel_data& kd)
|
||||||
@ -71,22 +57,8 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
|
|||||||
for (size_t i = 0; i < kd.kernels.size(); ++i) {
|
for (size_t i = 0; i < kd.kernels.size(); ++i) {
|
||||||
_kernel_ids.emplace_back(_outer.get_program().add_kernel(kd.kernels[i].code.kernelString));
|
_kernel_ids.emplace_back(_outer.get_program().add_kernel(kd.kernels[i].code.kernelString));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto size : kd.internalBufferSizes) {
|
|
||||||
auto dtype = from_data_type(kd.internalBufferDataType);
|
|
||||||
const auto bpp = data_type_traits::size_of(dtype);
|
|
||||||
layout expected_layout = {dtype,
|
|
||||||
format::bfyx, // simple linear format (flatten to x channel)
|
|
||||||
{1, 1, 1, (tensor::value_type)(size / bpp)}};
|
|
||||||
|
|
||||||
auto& eimpl = arg.get_program().get_engine();
|
|
||||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
|
||||||
GPU_DEBUG_IF(debug_config->verbose >= 2) {
|
|
||||||
GPU_DEBUG_COUT << "[" << _kernel_data.params->layerID << ": internal buf]" << std::endl;
|
|
||||||
}
|
|
||||||
_intermediates_memory.push_back(eimpl.allocate_memory(expected_layout));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_cpu() const override { return false; }
|
bool is_cpu() const override { return false; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
@ -137,6 +109,21 @@ protected:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<layout> get_internal_buffer_layouts_impl() const override {
|
||||||
|
if (_kernel_data.internalBufferSizes.empty())
|
||||||
|
return {};
|
||||||
|
|
||||||
|
std::vector<layout> layouts;
|
||||||
|
auto dtype = from_data_type(_kernel_data.internalBufferDataType);
|
||||||
|
const auto bpp = data_type_traits::size_of(dtype);
|
||||||
|
for (auto size : _kernel_data.internalBufferSizes) {
|
||||||
|
layout inbuf_layout = {dtype, format::bfyx, // simple linear format (flattern to x channel)
|
||||||
|
{1, 1, 1, (tensor::value_type)(size / bpp)}};
|
||||||
|
layouts.push_back(inbuf_layout);
|
||||||
|
}
|
||||||
|
return layouts;
|
||||||
|
}
|
||||||
|
|
||||||
void set_arguments_impl(typed_primitive_inst<PType>& instance) override {
|
void set_arguments_impl(typed_primitive_inst<PType>& instance) override {
|
||||||
if (optimized_out(instance) || is_cpu()) {
|
if (optimized_out(instance) || is_cpu()) {
|
||||||
return;
|
return;
|
||||||
@ -153,7 +140,7 @@ protected:
|
|||||||
args.scalars = &_kernel_data.kernels[k].params.scalars;
|
args.scalars = &_kernel_data.kernels[k].params.scalars;
|
||||||
args.split = i;
|
args.split = i;
|
||||||
|
|
||||||
for (const auto& m : _intermediates_memory) {
|
for (const auto& m : instance.get_intermediates_memories()) {
|
||||||
args.intermediates.push_back(m);
|
args.intermediates.push_back(m);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -188,7 +175,7 @@ protected:
|
|||||||
args.scalars = &_kernel_data.kernels[k].params.scalars;
|
args.scalars = &_kernel_data.kernels[k].params.scalars;
|
||||||
args.split = i;
|
args.split = i;
|
||||||
|
|
||||||
for (const auto& m : _intermediates_memory) {
|
for (const auto& m : instance.get_intermediates_memories()) {
|
||||||
args.intermediates.push_back(m);
|
args.intermediates.push_back(m);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -21,6 +21,9 @@
|
|||||||
|
|
||||||
namespace cldnn {
|
namespace cldnn {
|
||||||
|
|
||||||
|
// checks if any user in a list is a cpu primitive
|
||||||
|
bool is_any_user_cpu(const std::list<const program_node*>& users);
|
||||||
|
|
||||||
class primitive_inst;
|
class primitive_inst;
|
||||||
|
|
||||||
template <class PType>
|
template <class PType>
|
||||||
@ -43,6 +46,7 @@ struct primitive_impl {
|
|||||||
: _weights_reorder_params(params), _kernel_name(kernel_name) {}
|
: _weights_reorder_params(params), _kernel_name(kernel_name) {}
|
||||||
virtual ~primitive_impl() = default;
|
virtual ~primitive_impl() = default;
|
||||||
|
|
||||||
|
virtual std::vector<layout> get_internal_buffer_layouts() const = 0;
|
||||||
virtual void set_arguments(primitive_inst& instance) = 0;
|
virtual void set_arguments(primitive_inst& instance) = 0;
|
||||||
virtual event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) = 0;
|
virtual event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) = 0;
|
||||||
virtual bool validate(const primitive_inst& instance) const = 0;
|
virtual bool validate(const primitive_inst& instance) const = 0;
|
||||||
@ -111,6 +115,7 @@ public:
|
|||||||
event::ptr execute(const std::vector<event::ptr>& events);
|
event::ptr execute(const std::vector<event::ptr>& events);
|
||||||
void init_kernels();
|
void init_kernels();
|
||||||
void set_arguments();
|
void set_arguments();
|
||||||
|
|
||||||
bool validate() const {
|
bool validate() const {
|
||||||
if (_impl == nullptr)
|
if (_impl == nullptr)
|
||||||
throw std::invalid_argument("[Internal cldnn error]. Validation method for nullptr impl is not allowed.");
|
throw std::invalid_argument("[Internal cldnn error]. Validation method for nullptr impl is not allowed.");
|
||||||
@ -141,6 +146,14 @@ public:
|
|||||||
return _node.is_output();
|
return _node.is_output();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool mem_allocated() const {
|
||||||
|
return _mem_allocated;
|
||||||
|
}
|
||||||
|
|
||||||
|
void allocate_internal_buffers();
|
||||||
|
|
||||||
|
std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
primitive_inst(network& network, program_node const& node, bool allocate_memory);
|
primitive_inst(network& network, program_node const& node, bool allocate_memory);
|
||||||
|
|
||||||
@ -167,10 +180,13 @@ protected:
|
|||||||
// depending on reshape_node.is_in_place())
|
// depending on reshape_node.is_in_place())
|
||||||
memory::ptr _output;
|
memory::ptr _output;
|
||||||
|
|
||||||
|
std::vector<memory::cptr> _intermediates_memory;
|
||||||
|
|
||||||
bool _output_changed; // todo: implement output reuse if neither of inputs has changed
|
bool _output_changed; // todo: implement output reuse if neither of inputs has changed
|
||||||
bool _has_valid_input =
|
bool _has_valid_input =
|
||||||
true; // by default all primitives has valid inputs, exception is input_layout (see input_layout_inst)
|
true; // by default all primitives has valid inputs, exception is input_layout (see input_layout_inst)
|
||||||
bool _has_mutable_input = false;
|
bool _has_mutable_input = false;
|
||||||
|
bool _mem_allocated = false;
|
||||||
|
|
||||||
memory::ptr allocate_output();
|
memory::ptr allocate_output();
|
||||||
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
|
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
|
||||||
@ -207,6 +223,14 @@ private:
|
|||||||
return execute_impl(event, reinterpret_cast<typed_primitive_inst<PType>&>(instance));
|
return execute_impl(event, reinterpret_cast<typed_primitive_inst<PType>&>(instance));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<layout> get_internal_buffer_layouts() const override {
|
||||||
|
return get_internal_buffer_layouts_impl();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default behaviour: no internal buffers are requested. Derived
// implementations may override this to describe the buffers they need.
virtual std::vector<layout> get_internal_buffer_layouts_impl() const {
    return std::vector<layout>{};
}
|
||||||
|
|
||||||
void set_arguments(primitive_inst& instance) override {
|
void set_arguments(primitive_inst& instance) override {
|
||||||
if (instance.type() != PType::type_id())
|
if (instance.type() != PType::type_id())
|
||||||
throw std::invalid_argument("Implementation type does not match primitive type");
|
throw std::invalid_argument("Implementation type does not match primitive type");
|
||||||
@ -217,7 +241,6 @@ private:
|
|||||||
return set_arguments_impl(reinterpret_cast<typed_primitive_inst<PType>&>(instance));
|
return set_arguments_impl(reinterpret_cast<typed_primitive_inst<PType>&>(instance));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/) {}
|
virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/) {}
|
||||||
virtual event::ptr execute_impl(const std::vector<event::ptr>& event,
|
virtual event::ptr execute_impl(const std::vector<event::ptr>& event,
|
||||||
typed_primitive_inst<PType>& instance) = 0;
|
typed_primitive_inst<PType>& instance) = 0;
|
||||||
|
@ -470,12 +470,6 @@ void network::allocate_primitives() {
|
|||||||
for (auto node : _program->get_processing_order()) {
|
for (auto node : _program->get_processing_order()) {
|
||||||
nodes_to_allocate.push_back(_program->get_node_ptr(node->id()));
|
nodes_to_allocate.push_back(_program->get_node_ptr(node->id()));
|
||||||
}
|
}
|
||||||
std::sort(nodes_to_allocate.begin(),
|
|
||||||
nodes_to_allocate.end(),
|
|
||||||
[](std::shared_ptr<program_node> const& lhs, std::shared_ptr<program_node> const& rhs) {
|
|
||||||
return (lhs->get_output_layout().bytes_count() > rhs->get_output_layout().bytes_count());
|
|
||||||
});
|
|
||||||
|
|
||||||
for (auto const& node : nodes_to_allocate) {
|
for (auto const& node : nodes_to_allocate) {
|
||||||
allocate_primitive_instance(*node);
|
allocate_primitive_instance(*node);
|
||||||
}
|
}
|
||||||
|
@ -25,6 +25,27 @@
|
|||||||
|
|
||||||
namespace cldnn {
|
namespace cldnn {
|
||||||
|
|
||||||
|
bool is_user_cpu(const program_node* user) {
|
||||||
|
if (user->can_be_optimized()) {
|
||||||
|
auto users = user->get_users();
|
||||||
|
for (const auto& u : users) {
|
||||||
|
if (is_user_cpu(u)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return user->get_selected_impl()->is_cpu();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true as soon as is_user_cpu() holds for one entry of `users`;
// returns false for an empty list or when no entry matches.
bool is_any_user_cpu(const std::list<const program_node*>& users) {
    return std::any_of(users.begin(), users.end(), [](const program_node* candidate) {
        return is_user_cpu(candidate);
    });
}
|
||||||
|
|
||||||
uint32_t primitive_inst::get_network_id() const { return _network.get_id(); }
|
uint32_t primitive_inst::get_network_id() const { return _network.get_id(); }
|
||||||
|
|
||||||
void primitive_inst::check_memory_to_set(const memory& mem, const layout& layout) const {
|
void primitive_inst::check_memory_to_set(const memory& mem, const layout& layout) const {
|
||||||
@ -128,7 +149,8 @@ void primitive_inst::build_deps() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
primitive_inst::primitive_inst(network& network, program_node const& node, bool allocate_memory)
|
primitive_inst::primitive_inst(network& network, program_node const& node, bool allocate_memory)
|
||||||
: _network(network), _node(node), _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr), _output(), _output_changed(false) {
|
: _network(network), _node(node), _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr),
|
||||||
|
_output(), _output_changed(false), _mem_allocated(allocate_memory) {
|
||||||
if (allocate_memory) {
|
if (allocate_memory) {
|
||||||
// In case when output is mutable_data primitive, and other users dependencies are only used for
|
// In case when output is mutable_data primitive, and other users dependencies are only used for
|
||||||
// suychronization, The output memory of such primitive will be fused with mutable_data
|
// suychronization, The output memory of such primitive will be fused with mutable_data
|
||||||
@ -159,23 +181,92 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
|
|||||||
} else {
|
} else {
|
||||||
_output = allocate_output();
|
_output = allocate_output();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Allocate internal buffer
|
||||||
|
allocate_internal_buffers();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void primitive_inst::allocate_internal_buffers(void) {
|
||||||
|
if (_impl == nullptr) return;
|
||||||
|
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
|
||||||
|
if (ibuf_layouts.empty()) return;
|
||||||
|
|
||||||
|
auto device_mem_acc = [&](size_t a, std::shared_ptr<primitive_inst> b) {
|
||||||
|
if (!b->mem_allocated()) return a;
|
||||||
|
if (b->output_memory().get_allocation_type() == allocation_type::usm_device ||
|
||||||
|
b->output_memory().get_allocation_type() == allocation_type::cl_mem)
|
||||||
|
return a + b->output_memory().size();
|
||||||
|
else
|
||||||
|
return a;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto& engine = get_network().get_engine();
|
||||||
|
bool input_device_mem = false;
|
||||||
|
|
||||||
|
// NOTE: Currently the ocl driver aborts at runtime when there are layers using device memory close to max size within multiple streams.
|
||||||
|
// Decided the limitation as 85 % empirically, but still it needs further investigation.
|
||||||
|
const auto& inst_deps = _network.get_primitives(_node.get_dependencies());
|
||||||
|
|
||||||
|
auto total_device_mem_size = std::accumulate(inst_deps.begin(), inst_deps.end(), 0, device_mem_acc);
|
||||||
|
if (_output->get_allocation_type() == allocation_type::usm_device) {
|
||||||
|
total_device_mem_size += _output->size();
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t available_device_mem_size = engine.get_device_info().max_global_mem_size - total_device_mem_size;
|
||||||
|
// check if there is any device mem input
|
||||||
|
if (engine.supports_allocation(allocation_type::usm_device)) {
|
||||||
|
for (const auto& dep : inst_deps) {
|
||||||
|
if (dep->output_memory().get_allocation_type() == allocation_type::usm_device) {
|
||||||
|
input_device_mem = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto layout : ibuf_layouts) {
|
||||||
|
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||||
|
GPU_DEBUG_IF(debug_config->verbose >= 2) {
|
||||||
|
GPU_DEBUG_COUT << "[" << _node.id() << ": internal buf]" << std::endl;
|
||||||
|
}
|
||||||
|
if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0))
|
||||||
|
_intermediates_memory.push_back(engine.allocate_memory(layout, allocation_type::usm_device));
|
||||||
|
else
|
||||||
|
_intermediates_memory.push_back(engine.allocate_memory(layout, allocation_type::usm_host));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
memory::ptr primitive_inst::allocate_output() {
|
memory::ptr primitive_inst::allocate_output() {
|
||||||
auto layout = _node.get_output_layout();
|
auto layout = _node.get_output_layout();
|
||||||
auto& engine = get_network().get_engine();
|
auto& engine = get_network().get_engine();
|
||||||
|
const auto& inst_deps = _network.get_primitives(_node.get_dependencies());
|
||||||
|
auto device_mem_acc = [&](size_t a, std::shared_ptr<primitive_inst> b) {
|
||||||
|
if (!b->mem_allocated()) return a;
|
||||||
|
if (b->output_memory().get_allocation_type() == allocation_type::usm_device
|
||||||
|
|| b->output_memory().get_allocation_type() == allocation_type::cl_mem)
|
||||||
|
return a + b->output_memory().size();
|
||||||
|
else
|
||||||
|
return a;
|
||||||
|
};
|
||||||
|
|
||||||
|
bool usm_device_allocatable = true;
|
||||||
|
const auto& total_device_input_mem_size = std::accumulate(inst_deps.begin(), inst_deps.end(), 0, device_mem_acc);
|
||||||
|
if (total_device_input_mem_size > engine.get_device_info().max_global_mem_size)
|
||||||
|
usm_device_allocatable = false;
|
||||||
|
|
||||||
// For outputs, cpu prim we want to have lockable alloc type
|
// For outputs, cpu prim we want to have lockable alloc type
|
||||||
// Also if the successor of a node is an cpu, then memory needs to be lockable.
|
// Also if the successor of a node is an cpu, then memory needs to be lockable.
|
||||||
auto use_lockable_memory = _node.is_output() || _node.get_selected_impl()->is_cpu()
|
auto use_lockable_memory = _node.is_output() || _node.get_selected_impl()->is_cpu()
|
||||||
|| std::any_of(_node.get_users().begin(), _node.get_users().end(),
|
|| std::any_of(_node.get_users().begin(), _node.get_users().end(),
|
||||||
[](const program_node* n) {return n->get_selected_impl()->is_cpu() || n->can_be_optimized(); })
|
[](const program_node* n) {
|
||||||
|| !engine.supports_allocation(allocation_type::usm_device);
|
return n->get_selected_impl()->is_cpu() || is_any_user_cpu(n->get_users());
|
||||||
allocation_type alloc_type = use_lockable_memory ?
|
}) || !engine.supports_allocation(allocation_type::usm_device);
|
||||||
engine.get_lockable_preffered_memory_allocation_type(layout.format.is_image_2d())
|
|
||||||
: allocation_type::usm_device;
|
|
||||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||||
|
const auto& lockable_mem_type = engine.get_lockable_preffered_memory_allocation_type(layout.format.is_image_2d());
|
||||||
|
const auto& alloc_type = use_lockable_memory ? lockable_mem_type
|
||||||
|
: usm_device_allocatable ? allocation_type::usm_device : lockable_mem_type;
|
||||||
|
|
||||||
if (!_network.is_internal() && (_node.can_be_optimized() || _node.is_type<generic_layer>())) {
|
if (!_network.is_internal() && (_node.can_be_optimized() || _node.is_type<generic_layer>())) {
|
||||||
GPU_DEBUG_IF(debug_config->verbose >= 2) {
|
GPU_DEBUG_IF(debug_config->verbose >= 2) {
|
||||||
GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
|
GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
|
||||||
@ -186,7 +277,7 @@ memory::ptr primitive_inst::allocate_output() {
|
|||||||
alloc_type,
|
alloc_type,
|
||||||
false);
|
false);
|
||||||
} else if (_network.is_internal() && _node.is_output() && _node.is_type<generic_layer>() &&
|
} else if (_network.is_internal() && _node.is_output() && _node.is_type<generic_layer>() &&
|
||||||
engine.supports_allocation(allocation_type::usm_device)) {
|
engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
|
||||||
GPU_DEBUG_IF(debug_config->verbose >= 2) {
|
GPU_DEBUG_IF(debug_config->verbose >= 2) {
|
||||||
GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
|
GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
|
||||||
}
|
}
|
||||||
|
@ -100,7 +100,7 @@ TEST_P(ctor_test, basic) {
|
|||||||
|
|
||||||
INSTANTIATE_TEST_SUITE_P(cldnn_usm, ctor_test, ::testing::ValuesIn(std::vector<usm_test_params>{
|
INSTANTIATE_TEST_SUITE_P(cldnn_usm, ctor_test, ::testing::ValuesIn(std::vector<usm_test_params>{
|
||||||
usm_test_params{ allocation_type::usm_host},
|
usm_test_params{ allocation_type::usm_host},
|
||||||
usm_test_params{ allocation_type::usm_shared},
|
// usm_test_params{ allocation_type::usm_shared}, // Unsupported
|
||||||
usm_test_params{ allocation_type::usm_device},
|
usm_test_params{ allocation_type::usm_device},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
@ -173,7 +173,7 @@ TEST_P(copy_and_read_buffer, basic) {
|
|||||||
|
|
||||||
INSTANTIATE_TEST_SUITE_P(cldnn_usm, copy_and_read_buffer, ::testing::ValuesIn(std::vector<usm_test_params>{
|
INSTANTIATE_TEST_SUITE_P(cldnn_usm, copy_and_read_buffer, ::testing::ValuesIn(std::vector<usm_test_params>{
|
||||||
usm_test_params{ allocation_type::usm_host },
|
usm_test_params{ allocation_type::usm_host },
|
||||||
usm_test_params{ allocation_type::usm_shared },
|
// usm_test_params{ allocation_type::usm_shared }, // Unsupported
|
||||||
usm_test_params{ allocation_type::usm_device },
|
usm_test_params{ allocation_type::usm_device },
|
||||||
}));
|
}));
|
||||||
|
|
||||||
@ -256,6 +256,6 @@ TEST_P(fill_buffer, DISABLED_basic) {
|
|||||||
|
|
||||||
INSTANTIATE_TEST_SUITE_P(cldnn_usm, fill_buffer, ::testing::ValuesIn(std::vector<usm_test_params>{
|
INSTANTIATE_TEST_SUITE_P(cldnn_usm, fill_buffer, ::testing::ValuesIn(std::vector<usm_test_params>{
|
||||||
usm_test_params{ allocation_type::usm_host },
|
usm_test_params{ allocation_type::usm_host },
|
||||||
usm_test_params{ allocation_type::usm_shared },
|
// usm_test_params{ allocation_type::usm_shared }, // Unsupported
|
||||||
usm_test_params{ allocation_type::usm_device },
|
usm_test_params{ allocation_type::usm_device },
|
||||||
}));
|
}));
|
||||||
|
Loading…
Reference in New Issue
Block a user