From 30ddd061598cb546346582f08230c37052010035 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Sat, 18 Sep 2021 13:50:36 +0900 Subject: [PATCH] [GPU] Allocate internal buffer to usm_device (#7109) * Allocate internal buffer to usm_device when one of the input tensor is from usm_device. Allocate output tensors if there is no user which is cpu impl. * Move intermediate buffer allocation to primitive_inst * Allocate to usm_host when the internal buffer is allocated close to limitation of device memory * Remove internal_buffer_info and replace it with vector of layout. Updated conditions to use alloc_type w.r.t the availability. * Allocate internal buffer within primitive_inst construction * Fixed device_mem allocation condition aligned with driver team - Single allocation should be less than CL_DEVICE_MAX_MEM_ALLOC_SIZE - Total allocation for a kernel should be less than CL_DEVICE_GLOBAL_MEM_SIZE * Apply review comment --- .../clDNN/runtime/ocl/ocl_engine.cpp | 6 + .../src/impls/common/wait_for_events.cpp | 1 + .../clDNN/src/impls/ocl/primitive_base.cpp | 32 ------ .../clDNN/src/impls/ocl/primitive_base.hpp | 51 ++++----- .../clDNN/src/include/primitive_inst.h | 25 ++++- .../thirdparty/clDNN/src/network.cpp | 6 - .../thirdparty/clDNN/src/primitive_inst.cpp | 105 ++++++++++++++++-- .../tests/module_tests/usm_memory_test.cpp | 6 +- 8 files changed, 151 insertions(+), 81 deletions(-) delete mode 100644 inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.cpp diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp index a980a90f40b..e6422a3ec96 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp @@ -73,6 +73,12 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty throw std::runtime_error("exceeded max size of memory object allocation"); } + if (type != allocation_type::cl_mem && !supports_allocation(type)) { + std::ostringstream type_str; + type_str << type; + throw std::runtime_error("Unsupported allocation type " + type_str.str()); + } + try { memory::ptr res = nullptr; if (layout.format.is_image_2d()) { diff --git a/inference-engine/thirdparty/clDNN/src/impls/common/wait_for_events.cpp b/inference-engine/thirdparty/clDNN/src/impls/common/wait_for_events.cpp index 5f3d5e59651..0f23d1769f5 100644 --- a/inference-engine/thirdparty/clDNN/src/impls/common/wait_for_events.cpp +++ b/inference-engine/thirdparty/clDNN/src/impls/common/wait_for_events.cpp @@ -23,6 +23,7 @@ public: void init_kernels() override {} void set_arguments(primitive_inst& /*instance*/) override {} + std::vector get_internal_buffer_layouts() const override { return {}; } event::ptr execute(const std::vector& events, primitive_inst& instance) override { auto& stream = instance.get_network().get_stream(); diff --git a/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.cpp b/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.cpp deleted file mode 100644 index 63e1e8bd0bb..00000000000 --- a/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "primitive_base.hpp" -#include - -namespace cldnn { -namespace ocl { - -bool is_user_cpu(const program_node* user) { - if (user->can_be_optimized()) { - auto users = user->get_users(); - for (const auto& u : users) { - if (is_user_cpu(u)) { - return true; - } - } - return false; - } - return user->get_selected_impl()->is_cpu(); -} - -bool is_any_user_cpu(const std::list& users) { - for (const auto& user : users) { - if (is_user_cpu(user)) - return true; - } - return false; -} -} // namespace ocl -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.hpp b/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.hpp index 8d44482a89b..1fcbccfabf3 100644 --- a/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.hpp +++ b/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.hpp @@ -20,9 +20,6 @@ namespace cldnn { namespace ocl { -// checks if any user in a list is a cpu primitive -bool is_any_user_cpu(const std::list& users); - /* Base class for all GPU implementation of specified primitive type. For example, all gpu convolution implementations should derive from typed_primitive_impl_ocl. @@ -33,28 +30,17 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { kernel_selector::kernel_data _kernel_data; std::vector _kernel_ids; std::vector _kernels; - std::vector _intermediates_memory; typed_primitive_impl_ocl(const typed_primitive_impl_ocl& other) : typed_primitive_impl(other._weights_reorder_params, other._kernel_name) , _outer(other._outer) , _kernel_data(other._kernel_data) , _kernel_ids(other._kernel_ids) - , _kernels({}) - , _intermediates_memory({}) { + , _kernels({}) { _kernels.reserve(other._kernels.size()); for (size_t k = 0; k < other._kernels.size(); ++k) { _kernels.emplace_back(other._kernels[k]->clone()); } - for (auto& mem : other._intermediates_memory) { - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << _kernel_data.params->layerID << ": internal buf]" << std::endl; - } - auto& engine = _outer.get_program().get_engine(); - auto new_mem = engine.allocate_memory(mem->get_layout(), mem->get_allocation_type()); - _intermediates_memory.push_back(new_mem); - } } typed_primitive_impl_ocl(const typed_program_node& arg, const kernel_selector::kernel_data& kd) @@ -71,22 +57,8 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { for (size_t i = 0; i < kd.kernels.size(); ++i) { _kernel_ids.emplace_back(_outer.get_program().add_kernel(kd.kernels[i].code.kernelString)); } - - for (auto size : kd.internalBufferSizes) { - auto dtype = from_data_type(kd.internalBufferDataType); - const auto bpp = data_type_traits::size_of(dtype); - layout expected_layout = {dtype, - format::bfyx, // simple linear format (flatten to x channel) - {1, 1, 1, (tensor::value_type)(size / bpp)}}; - - auto& eimpl = arg.get_program().get_engine(); - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << _kernel_data.params->layerID << ": internal buf]" << std::endl; - } - _intermediates_memory.push_back(eimpl.allocate_memory(expected_layout)); - } } + bool is_cpu() const override { return false; } protected: @@ -137,6 +109,21 @@ protected: } } + std::vector get_internal_buffer_layouts_impl() const override { + if (_kernel_data.internalBufferSizes.empty()) + return {}; + + std::vector layouts; + auto dtype = from_data_type(_kernel_data.internalBufferDataType); + const auto bpp = data_type_traits::size_of(dtype); + for (auto size : _kernel_data.internalBufferSizes) { + layout inbuf_layout = {dtype, format::bfyx, // simple linear format (flattern to x channel) + {1, 1, 1, (tensor::value_type)(size / bpp)}}; + layouts.push_back(inbuf_layout); + } + return layouts; + } + void set_arguments_impl(typed_primitive_inst& instance) override { if (optimized_out(instance) || is_cpu()) { return; @@ -153,7 +140,7 @@ protected: args.scalars = &_kernel_data.kernels[k].params.scalars; args.split = i; - for (const auto& m : _intermediates_memory) { + for (const auto& m : instance.get_intermediates_memories()) { args.intermediates.push_back(m); } @@ -188,7 +175,7 @@ protected: args.scalars = &_kernel_data.kernels[k].params.scalars; args.split = i; - for (const auto& m : _intermediates_memory) { + for (const auto& m : instance.get_intermediates_memories()) { args.intermediates.push_back(m); } diff --git a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h index 6b69286d8c3..d65f92f50fc 100644 --- a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h @@ -21,6 +21,9 @@ namespace cldnn { +// checks if any user in a list is a cpu primitive +bool is_any_user_cpu(const std::list& users); + class primitive_inst; template @@ -43,6 +46,7 @@ struct primitive_impl { : _weights_reorder_params(params), _kernel_name(kernel_name) {} virtual ~primitive_impl() = default; + virtual std::vector get_internal_buffer_layouts() const = 0; virtual void set_arguments(primitive_inst& instance) = 0; virtual event::ptr execute(const std::vector& events, primitive_inst& instance) = 0; virtual bool validate(const primitive_inst& instance) const = 0; @@ -111,6 +115,7 @@ public: event::ptr execute(const std::vector& events); void init_kernels(); void set_arguments(); + bool validate() const { if (_impl == nullptr) throw std::invalid_argument("[Internal cldnn error]. Validation method for nullptr impl is not allowed."); @@ -141,6 +146,14 @@ public: return _node.is_output(); } + bool mem_allocated() const { + return _mem_allocated; + } + + void allocate_internal_buffers(); + + std::vector get_intermediates_memories() const { return _intermediates_memory; } + protected: primitive_inst(network& network, program_node const& node, bool allocate_memory); @@ -167,10 +180,13 @@ protected: // depending on reshape_node.is_in_place()) memory::ptr _output; + std::vector _intermediates_memory; + bool _output_changed; // todo: implement output reuse if neither of inputs has changed bool _has_valid_input = true; // by default all primitives has valid inputs, exception is input_layout (see input_layout_inst) bool _has_mutable_input = false; + bool _mem_allocated = false; memory::ptr allocate_output(); static std::vector> build_exec_deps( @@ -207,6 +223,14 @@ private: return execute_impl(event, reinterpret_cast&>(instance)); } + std::vector get_internal_buffer_layouts() const override { + return get_internal_buffer_layouts_impl(); + } + + virtual std::vector get_internal_buffer_layouts_impl() const { + return {}; + } + void set_arguments(primitive_inst& instance) override { if (instance.type() != PType::type_id()) throw std::invalid_argument("Implementation type does not match primitive type"); @@ -217,7 +241,6 @@ private: return set_arguments_impl(reinterpret_cast&>(instance)); } - virtual void set_arguments_impl(typed_primitive_inst& /*instance*/) {} virtual event::ptr execute_impl(const std::vector& event, typed_primitive_inst& instance) = 0; diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp index 6b6f20f697a..5c60a442992 100644 --- a/inference-engine/thirdparty/clDNN/src/network.cpp +++ b/inference-engine/thirdparty/clDNN/src/network.cpp @@ -470,12 +470,6 @@ void network::allocate_primitives() { for (auto node : _program->get_processing_order()) { nodes_to_allocate.push_back(_program->get_node_ptr(node->id())); } - std::sort(nodes_to_allocate.begin(), - nodes_to_allocate.end(), - [](std::shared_ptr const& lhs, std::shared_ptr const& rhs) { - return (lhs->get_output_layout().bytes_count() > rhs->get_output_layout().bytes_count()); - }); - for (auto const& node : nodes_to_allocate) { allocate_primitive_instance(*node); } diff --git a/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp b/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp index dcaea49163a..082dc3e8f2a 100644 --- a/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp +++ b/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp @@ -25,6 +25,27 @@ namespace cldnn { +bool is_user_cpu(const program_node* user) { + if (user->can_be_optimized()) { + auto users = user->get_users(); + for (const auto& u : users) { + if (is_user_cpu(u)) { + return true; + } + } + return false; + } + return user->get_selected_impl()->is_cpu(); +} + +bool is_any_user_cpu(const std::list& users) { + for (const auto& user : users) { + if (is_user_cpu(user)) + return true; + } + return false; +} + uint32_t primitive_inst::get_network_id() const { return _network.get_id(); } void primitive_inst::check_memory_to_set(const memory& mem, const layout& layout) const { @@ -128,7 +149,8 @@ void primitive_inst::build_deps() { } primitive_inst::primitive_inst(network& network, program_node const& node, bool allocate_memory) - : _network(network), _node(node), _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr), _output(), _output_changed(false) { + : _network(network), _node(node), _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr), + _output(), _output_changed(false), _mem_allocated(allocate_memory) { if (allocate_memory) { // In case when output is mutable_data primitive, and other users dependencies are only used for // suychronization, The output memory of such primitive will be fused with mutable_data @@ -159,23 +181,92 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool } else { _output = allocate_output(); } + + // Allocate internal buffer + allocate_internal_buffers(); + } +} + +void primitive_inst::allocate_internal_buffers(void) { + if (_impl == nullptr) return; + const auto& ibuf_layouts = _impl->get_internal_buffer_layouts(); + if (ibuf_layouts.empty()) return; + + auto device_mem_acc = [&](size_t a, std::shared_ptr b) { + if (!b->mem_allocated()) return a; + if (b->output_memory().get_allocation_type() == allocation_type::usm_device || + b->output_memory().get_allocation_type() == allocation_type::cl_mem) + return a + b->output_memory().size(); + else + return a; + }; + + auto& engine = get_network().get_engine(); + bool input_device_mem = false; + + // NOTE: Currently the ocl driver aborts at runtime when there are layers using device memory close to max size within multiple streams. + // Decided the limitation as 85 % empirically, but still it needs further investigation. + const auto& inst_deps = _network.get_primitives(_node.get_dependencies()); + + auto total_device_mem_size = std::accumulate(inst_deps.begin(), inst_deps.end(), 0, device_mem_acc); + if (_output->get_allocation_type() == allocation_type::usm_device) { + total_device_mem_size += _output->size(); + } + + int64_t available_device_mem_size = engine.get_device_info().max_global_mem_size - total_device_mem_size; + // check if there is any device mem input + if (engine.supports_allocation(allocation_type::usm_device)) { + for (const auto& dep : inst_deps) { + if (dep->output_memory().get_allocation_type() == allocation_type::usm_device) { + input_device_mem = true; + break; + } + } + } + + for (auto layout : ibuf_layouts) { + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->verbose >= 2) { + GPU_DEBUG_COUT << "[" << _node.id() << ": internal buf]" << std::endl; + } + if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) + _intermediates_memory.push_back(engine.allocate_memory(layout, allocation_type::usm_device)); + else + _intermediates_memory.push_back(engine.allocate_memory(layout, allocation_type::usm_host)); } } memory::ptr primitive_inst::allocate_output() { auto layout = _node.get_output_layout(); auto& engine = get_network().get_engine(); + const auto& inst_deps = _network.get_primitives(_node.get_dependencies()); + auto device_mem_acc = [&](size_t a, std::shared_ptr b) { + if (!b->mem_allocated()) return a; + if (b->output_memory().get_allocation_type() == allocation_type::usm_device + || b->output_memory().get_allocation_type() == allocation_type::cl_mem) + return a + b->output_memory().size(); + else + return a; + }; + + bool usm_device_allocatable = true; + const auto& total_device_input_mem_size = std::accumulate(inst_deps.begin(), inst_deps.end(), 0, device_mem_acc); + if (total_device_input_mem_size > engine.get_device_info().max_global_mem_size) + usm_device_allocatable = false; // For outputs, cpu prim we want to have lockable alloc type // Also if the successor of a node is an cpu, then memory needs to be lockable. auto use_lockable_memory = _node.is_output() || _node.get_selected_impl()->is_cpu() || std::any_of(_node.get_users().begin(), _node.get_users().end(), - [](const program_node* n) {return n->get_selected_impl()->is_cpu() || n->can_be_optimized(); }) - || !engine.supports_allocation(allocation_type::usm_device); - allocation_type alloc_type = use_lockable_memory ? - engine.get_lockable_preffered_memory_allocation_type(layout.format.is_image_2d()) - : allocation_type::usm_device; + [](const program_node* n) { + return n->get_selected_impl()->is_cpu() || is_any_user_cpu(n->get_users()); + }) || !engine.supports_allocation(allocation_type::usm_device); + GPU_DEBUG_GET_INSTANCE(debug_config); + const auto& lockable_mem_type = engine.get_lockable_preffered_memory_allocation_type(layout.format.is_image_2d()); + const auto& alloc_type = use_lockable_memory ? lockable_mem_type + : usm_device_allocatable ? allocation_type::usm_device : lockable_mem_type; + if (!_network.is_internal() && (_node.can_be_optimized() || _node.is_type())) { GPU_DEBUG_IF(debug_config->verbose >= 2) { GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl; @@ -186,7 +277,7 @@ memory::ptr primitive_inst::allocate_output() { alloc_type, false); } else if (_network.is_internal() && _node.is_output() && _node.is_type() && - engine.supports_allocation(allocation_type::usm_device)) { + engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) { GPU_DEBUG_IF(debug_config->verbose >= 2) { GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl; } diff --git a/inference-engine/thirdparty/clDNN/tests/module_tests/usm_memory_test.cpp b/inference-engine/thirdparty/clDNN/tests/module_tests/usm_memory_test.cpp index dd414e9a0c8..6e9fd52da2e 100644 --- a/inference-engine/thirdparty/clDNN/tests/module_tests/usm_memory_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/module_tests/usm_memory_test.cpp @@ -100,7 +100,7 @@ TEST_P(ctor_test, basic) { INSTANTIATE_TEST_SUITE_P(cldnn_usm, ctor_test, ::testing::ValuesIn(std::vector{ usm_test_params{ allocation_type::usm_host}, - usm_test_params{ allocation_type::usm_shared}, +// usm_test_params{ allocation_type::usm_shared}, // Unsupported usm_test_params{ allocation_type::usm_device}, })); @@ -173,7 +173,7 @@ TEST_P(copy_and_read_buffer, basic) { INSTANTIATE_TEST_SUITE_P(cldnn_usm, copy_and_read_buffer, ::testing::ValuesIn(std::vector{ usm_test_params{ allocation_type::usm_host }, - usm_test_params{ allocation_type::usm_shared }, +// usm_test_params{ allocation_type::usm_shared }, // Unsupported usm_test_params{ allocation_type::usm_device }, })); @@ -256,6 +256,6 @@ TEST_P(fill_buffer, DISABLED_basic) { INSTANTIATE_TEST_SUITE_P(cldnn_usm, fill_buffer, ::testing::ValuesIn(std::vector{ usm_test_params{ allocation_type::usm_host }, - usm_test_params{ allocation_type::usm_shared }, +// usm_test_params{ allocation_type::usm_shared }, // Unsupported usm_test_params{ allocation_type::usm_device }, }));