From 30ddd061598cb546346582f08230c37052010035 Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee <taylor.lee@intel.com>
Date: Sat, 18 Sep 2021 13:50:36 +0900
Subject: [PATCH] [GPU] Allocate internal buffer to usm_device  (#7109)

* Allocate internal buffer to usm_device when one of the input tensors is from usm_device.
Allocate output tensors to usm_device if none of the users is a cpu impl.

* Move intermediate buffer allocation to primitive_inst

* Allocate to usm_host when allocating the internal buffer would come close to the device memory limit

* Remove internal_buffer_info and replace it with a vector of layouts.
Update conditions to choose alloc_type according to its availability.

* Allocate internal buffer within primitive_inst construction

* Fix device_mem allocation condition as aligned with the driver team
- Single allocation should be less than CL_DEVICE_MAX_MEM_ALLOC_SIZE
- Total allocation for a kernel should be less than CL_DEVICE_GLOBAL_MEM_SIZE

* Apply review comment
---
 .../clDNN/runtime/ocl/ocl_engine.cpp          |   6 +
 .../src/impls/common/wait_for_events.cpp      |   1 +
 .../clDNN/src/impls/ocl/primitive_base.cpp    |  32 ------
 .../clDNN/src/impls/ocl/primitive_base.hpp    |  51 ++++-----
 .../clDNN/src/include/primitive_inst.h        |  25 ++++-
 .../thirdparty/clDNN/src/network.cpp          |   6 -
 .../thirdparty/clDNN/src/primitive_inst.cpp   | 105 ++++++++++++++++--
 .../tests/module_tests/usm_memory_test.cpp    |   6 +-
 8 files changed, 151 insertions(+), 81 deletions(-)
 delete mode 100644 inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.cpp
diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp
index a980a90f40b..e6422a3ec96 100644
--- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_engine.cpp
@@ -73,6 +73,12 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
         throw std::runtime_error("exceeded max size of memory object allocation");
     }
 
+    if (type != allocation_type::cl_mem && !supports_allocation(type)) {
+        std::ostringstream type_str;
+        type_str << type;
+        throw std::runtime_error("Unsupported allocation type " + type_str.str());
+    }
+
     try {
         memory::ptr res = nullptr;
         if (layout.format.is_image_2d()) {
diff --git a/inference-engine/thirdparty/clDNN/src/impls/common/wait_for_events.cpp b/inference-engine/thirdparty/clDNN/src/impls/common/wait_for_events.cpp
index 5f3d5e59651..0f23d1769f5 100644
--- a/inference-engine/thirdparty/clDNN/src/impls/common/wait_for_events.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/common/wait_for_events.cpp
@@ -23,6 +23,7 @@ public:
 
     void init_kernels() override {}
     void set_arguments(primitive_inst& /*instance*/) override {}
+    std::vector<layout> get_internal_buffer_layouts() const override { return {}; }  // no internal buffers requested
 
     event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
         auto& stream = instance.get_network().get_stream();
diff --git a/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.cpp b/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.cpp
deleted file mode 100644
index 63e1e8bd0bb..00000000000
--- a/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (C) 2018-2021 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "primitive_base.hpp"
-#include <list>
-
-namespace cldnn {
-namespace ocl {
-
-bool is_user_cpu(const program_node* user) {
-    if (user->can_be_optimized()) {
-        auto users = user->get_users();
-        for (const auto& u : users) {
-            if (is_user_cpu(u)) {
-                return true;
-            }
-        }
-        return false;
-    }
-    return user->get_selected_impl()->is_cpu();
-}
-
-bool is_any_user_cpu(const std::list<const program_node*>& users) {
-    for (const auto& user : users) {
-        if (is_user_cpu(user))
-            return true;
-    }
-    return false;
-}
-}  // namespace ocl
-}  // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.hpp b/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.hpp
index 8d44482a89b..1fcbccfabf3 100644
--- a/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.hpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/ocl/primitive_base.hpp
@@ -20,9 +20,6 @@
 namespace cldnn {
 namespace ocl {
 
-// checks if any user in a list is a cpu primitive
-bool is_any_user_cpu(const std::list<const program_node*>& users);
-
 /*
 Base class for all GPU implementation of specified primitive type.
 For example, all gpu convolution implementations should derive from typed_primitive_impl_ocl<convolution>.
@@ -33,28 +30,17 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     kernel_selector::kernel_data _kernel_data;
     std::vector<kernel_id> _kernel_ids;
     std::vector<kernel::ptr> _kernels;
-    std::vector<memory::cptr> _intermediates_memory;
 
     typed_primitive_impl_ocl(const typed_primitive_impl_ocl<PType>& other)
     : typed_primitive_impl<PType>(other._weights_reorder_params, other._kernel_name)
     , _outer(other._outer)
     , _kernel_data(other._kernel_data)
     , _kernel_ids(other._kernel_ids)
-    , _kernels({})
-    , _intermediates_memory({}) {
+    , _kernels({}) {
         _kernels.reserve(other._kernels.size());
         for (size_t k = 0; k < other._kernels.size(); ++k) {
             _kernels.emplace_back(other._kernels[k]->clone());
         }
-        for (auto& mem : other._intermediates_memory) {
-            GPU_DEBUG_GET_INSTANCE(debug_config);
-            GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                GPU_DEBUG_COUT << "[" << _kernel_data.params->layerID << ": internal buf]" << std::endl;
-            }
-            auto& engine = _outer.get_program().get_engine();
-            auto new_mem = engine.allocate_memory(mem->get_layout(), mem->get_allocation_type());
-            _intermediates_memory.push_back(new_mem);
-        }
     }
 
     typed_primitive_impl_ocl(const typed_program_node<PType>& arg, const kernel_selector::kernel_data& kd)
@@ -71,22 +57,8 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
         for (size_t i = 0; i < kd.kernels.size(); ++i) {
             _kernel_ids.emplace_back(_outer.get_program().add_kernel(kd.kernels[i].code.kernelString));
         }
-
-        for (auto size : kd.internalBufferSizes) {
-            auto dtype = from_data_type(kd.internalBufferDataType);
-            const auto bpp = data_type_traits::size_of(dtype);
-            layout expected_layout = {dtype,
-                                      format::bfyx,  // simple linear format (flatten to x channel)
-                                      {1, 1, 1, (tensor::value_type)(size / bpp)}};
-
-            auto& eimpl = arg.get_program().get_engine();
-            GPU_DEBUG_GET_INSTANCE(debug_config);
-            GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                GPU_DEBUG_COUT << "[" << _kernel_data.params->layerID << ": internal buf]" << std::endl;
-            }
-            _intermediates_memory.push_back(eimpl.allocate_memory(expected_layout));
-        }
     }
+
     bool is_cpu() const override { return false; }
 
 protected:
@@ -137,6 +109,21 @@ protected:
         }
     }
 
+    // Returns one flat bfyx layout per entry of _kernel_data.internalBufferSizes (element count = entry / bpp).
+    std::vector<layout> get_internal_buffer_layouts_impl() const override {
+        if (_kernel_data.internalBufferSizes.empty())
+            return {};  // nothing requested; also skips from_data_type() on the buffer data type
+        const auto dtype = from_data_type(_kernel_data.internalBufferDataType);
+        const auto bpp = data_type_traits::size_of(dtype);
+        std::vector<layout> layouts;
+        layouts.reserve(_kernel_data.internalBufferSizes.size());
+        for (const auto size : _kernel_data.internalBufferSizes) {
+            // simple linear format (flatten to x channel)
+            layouts.push_back(layout{dtype, format::bfyx, {1, 1, 1, static_cast<tensor::value_type>(size / bpp)}});
+        }
+        return layouts;
+    }
+
     void set_arguments_impl(typed_primitive_inst<PType>& instance) override {
         if (optimized_out(instance) || is_cpu()) {
             return;
@@ -153,7 +140,7 @@ protected:
                 args.scalars = &_kernel_data.kernels[k].params.scalars;
                 args.split = i;
 
-                for (const auto& m : _intermediates_memory) {
+                for (const auto& m : instance.get_intermediates_memories()) {
                     args.intermediates.push_back(m);
                 }
 
@@ -188,7 +175,7 @@ protected:
                 args.scalars = &_kernel_data.kernels[k].params.scalars;
                 args.split = i;
 
-                for (const auto& m : _intermediates_memory) {
+                for (const auto& m : instance.get_intermediates_memories()) {
                     args.intermediates.push_back(m);
                 }
 
diff --git a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h
index 6b69286d8c3..d65f92f50fc 100644
--- a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h
+++ b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h
@@ -21,6 +21,9 @@
 
 namespace cldnn {
 
+// checks if any user in a list is a cpu primitive
+bool is_any_user_cpu(const std::list<const program_node*>& users);
+
 class primitive_inst;
 
 template <class PType>
@@ -43,6 +46,7 @@ struct primitive_impl {
         : _weights_reorder_params(params), _kernel_name(kernel_name) {}
     virtual ~primitive_impl() = default;
 
+    virtual std::vector<layout> get_internal_buffer_layouts() const = 0;
     virtual void set_arguments(primitive_inst& instance) = 0;
     virtual event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) = 0;
     virtual bool validate(const primitive_inst& instance) const = 0;
@@ -111,6 +115,7 @@ public:
     event::ptr execute(const std::vector<event::ptr>& events);
     void init_kernels();
     void set_arguments();
+
     bool validate() const {
         if (_impl == nullptr)
             throw std::invalid_argument("[Internal cldnn error].  Validation method for nullptr impl is not allowed.");
@@ -141,6 +146,14 @@ public:
         return _node.is_output();
     }
 
+    // Reports the allocate_memory flag this instance was constructed with. The device-memory
+    // accounting in primitive_inst.cpp skips dependencies for which this returns false.
+    bool mem_allocated() const { return _mem_allocated; }
+
+    void allocate_internal_buffers();
+    // Internal buffers held by this instance; returned by reference so callers do not copy the vector per call.
+    const std::vector<memory::cptr>& get_intermediates_memories() const { return _intermediates_memory; }
+
 protected:
     primitive_inst(network& network, program_node const& node, bool allocate_memory);
 
@@ -167,10 +180,13 @@ protected:
     // depending on reshape_node.is_in_place())
     memory::ptr _output;
 
+    std::vector<memory::cptr> _intermediates_memory;
+
     bool _output_changed;  // todo: implement output reuse if neither of inputs has changed
     bool _has_valid_input =
         true;  // by default all primitives has valid inputs, exception is input_layout (see input_layout_inst)
     bool _has_mutable_input = false;
+    bool _mem_allocated = false;
 
     memory::ptr allocate_output();
     static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
@@ -207,6 +223,14 @@ private:
         return execute_impl(event, reinterpret_cast<typed_primitive_inst<PType>&>(instance));
     }
 
+    // Implements the untyped primitive_impl interface by forwarding to the
+    // overridable get_internal_buffer_layouts_impl() hook.
+    std::vector<layout> get_internal_buffer_layouts() const override { return get_internal_buffer_layouts_impl(); }
+
+    // Default hook: no internal buffers are requested unless a derived
+    // implementation overrides this and returns the layouts it needs.
+    virtual std::vector<layout> get_internal_buffer_layouts_impl() const { return {}; }
+
     void set_arguments(primitive_inst& instance) override {
         if (instance.type() != PType::type_id())
             throw std::invalid_argument("Implementation type does not match primitive type");
@@ -217,7 +241,6 @@ private:
         return set_arguments_impl(reinterpret_cast<typed_primitive_inst<PType>&>(instance));
     }
 
-
     virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/) {}
     virtual event::ptr execute_impl(const std::vector<event::ptr>& event,
                                          typed_primitive_inst<PType>& instance) = 0;
diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp
index 6b6f20f697a..5c60a442992 100644
--- a/inference-engine/thirdparty/clDNN/src/network.cpp
+++ b/inference-engine/thirdparty/clDNN/src/network.cpp
@@ -470,12 +470,6 @@ void network::allocate_primitives() {
     for (auto node : _program->get_processing_order()) {
         nodes_to_allocate.push_back(_program->get_node_ptr(node->id()));
     }
-    std::sort(nodes_to_allocate.begin(),
-              nodes_to_allocate.end(),
-              [](std::shared_ptr<program_node> const& lhs, std::shared_ptr<program_node> const& rhs) {
-                  return (lhs->get_output_layout().bytes_count() > rhs->get_output_layout().bytes_count());
-              });
-
     for (auto const& node : nodes_to_allocate) {
         allocate_primitive_instance(*node);
     }
diff --git a/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp b/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp
index dcaea49163a..082dc3e8f2a 100644
--- a/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp
+++ b/inference-engine/thirdparty/clDNN/src/primitive_inst.cpp
@@ -25,6 +25,27 @@
 
 namespace cldnn {
 
+// Tells whether `user` is effectively consumed by a cpu implementation.
+// A node that can be optimized out is looked through: the answer is then taken
+// from its own users, recursively. Otherwise the node's selected impl decides.
+bool is_user_cpu(const program_node* user) {
+    if (!user->can_be_optimized())
+        return user->get_selected_impl()->is_cpu();
+
+    const auto& next_users = user->get_users();
+    return std::any_of(next_users.begin(), next_users.end(), [](const program_node* next) {
+        return is_user_cpu(next);
+    });
+}
+
+// checks if any user in a list is a cpu primitive (optimized-out users are
+// looked through, see is_user_cpu)
+bool is_any_user_cpu(const std::list<const program_node*>& users) {
+    return std::any_of(users.begin(), users.end(), [](const program_node* candidate) {
+        return is_user_cpu(candidate);
+    });
+}
+
 uint32_t primitive_inst::get_network_id() const { return _network.get_id(); }
 
 void primitive_inst::check_memory_to_set(const memory& mem, const layout& layout) const {
@@ -128,7 +149,8 @@ void primitive_inst::build_deps() {
 }
 
 primitive_inst::primitive_inst(network& network, program_node const& node, bool allocate_memory)
-    : _network(network), _node(node), _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr), _output(), _output_changed(false) {
+    : _network(network), _node(node), _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr),
+      _output(), _output_changed(false), _mem_allocated(allocate_memory) {
     if (allocate_memory) {
         // In case when output is mutable_data primitive, and other users dependencies are only used for
         // suychronization, The output memory of such primitive will be fused with mutable_data
@@ -159,23 +181,92 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
         } else {
             _output = allocate_output();
         }
+
+        // Allocate internal buffer
+        allocate_internal_buffers();
+    }
+}
+
+// Allocates the scratch buffers requested by the selected impl and keeps them in _intermediates_memory.
+// A buffer is placed in usm_device only if some dependency's output already lives in usm_device and the
+// buffer fits into the device memory left after the inputs and the output; otherwise usm_host is used.
+void primitive_inst::allocate_internal_buffers() {
+    if (_impl == nullptr) return;
+    const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
+    if (ibuf_layouts.empty()) return;
+
+    auto& engine = get_network().get_engine();
+    const auto& inst_deps = _network.get_primitives(_node.get_dependencies());
+
+    // Adds up the outputs of already allocated dependencies that reside in device memory.
+    auto device_mem_acc = [](size_t acc, const std::shared_ptr<primitive_inst>& dep) {
+        if (!dep->mem_allocated()) return acc;
+        const auto dep_type = dep->output_memory().get_allocation_type();
+        const bool on_device = dep_type == allocation_type::usm_device || dep_type == allocation_type::cl_mem;
+        return on_device ? acc + dep->output_memory().size() : acc;
+    };
+    // The init value fixes the accumulator type of std::accumulate: a plain 0 would sum (and truncate) in int.
+    size_t total_device_mem_size = std::accumulate(inst_deps.begin(), inst_deps.end(), size_t(0), device_mem_acc);
+    if (_output->get_allocation_type() == allocation_type::usm_device) {
+        total_device_mem_size += _output->size();
+    }
+
+    // NOTE: Currently the ocl driver aborts at runtime when there are layers using device memory close to max size within multiple streams.
+    // The budget below is the global memory size minus the memory counted above; no extra safety margin is applied.
+    const int64_t available_device_mem_size = static_cast<int64_t>(engine.get_device_info().max_global_mem_size) -
+                                              static_cast<int64_t>(total_device_mem_size);
+
+    // check if there is any device mem input
+    bool input_device_mem = false;
+    if (engine.supports_allocation(allocation_type::usm_device)) {
+        input_device_mem = std::any_of(inst_deps.begin(), inst_deps.end(),
+                                       [](const std::shared_ptr<primitive_inst>& dep) {
+            return dep->output_memory().get_allocation_type() == allocation_type::usm_device;
+        });
+    }
+
+    for (const auto& ibuf_layout : ibuf_layouts) {
+        GPU_DEBUG_GET_INSTANCE(debug_config);
+        GPU_DEBUG_IF(debug_config->verbose >= 2) {
+            GPU_DEBUG_COUT << "[" << _node.id() << ": internal buf]" << std::endl;
+        }
+        const bool fits = available_device_mem_size - static_cast<int64_t>(ibuf_layout.bytes_count()) >= 0;
+        const auto alloc_type = (input_device_mem && fits) ? allocation_type::usm_device : allocation_type::usm_host;
+        _intermediates_memory.push_back(engine.allocate_memory(ibuf_layout, alloc_type));
     }
 }
 
 memory::ptr primitive_inst::allocate_output() {
     auto layout = _node.get_output_layout();
     auto& engine = get_network().get_engine();
+    const auto& inst_deps = _network.get_primitives(_node.get_dependencies());
+    // Adds up the outputs of already allocated dependencies that reside in device memory.
+    auto device_mem_acc = [](size_t acc, const std::shared_ptr<primitive_inst>& dep) {
+        if (!dep->mem_allocated()) return acc;
+        const auto dep_type = dep->output_memory().get_allocation_type();
+        const bool on_device = dep_type == allocation_type::usm_device || dep_type == allocation_type::cl_mem;
+        return on_device ? acc + dep->output_memory().size() : acc;
+    };
+
+    // The init value fixes the accumulator type of std::accumulate: a plain 0 would sum (and truncate) in int.
+    const size_t total_device_input_mem_size =
+        std::accumulate(inst_deps.begin(), inst_deps.end(), size_t(0), device_mem_acc);
+    // usm_device is ruled out once the device-resident inputs alone exceed the global memory size.
+    const bool usm_device_allocatable = total_device_input_mem_size <= engine.get_device_info().max_global_mem_size;
 
     // For outputs, cpu prim we want to have lockable alloc type
     // Also if the successor of a node is an cpu, then memory needs to be lockable.
     auto use_lockable_memory = _node.is_output() || _node.get_selected_impl()->is_cpu()
                                || std::any_of(_node.get_users().begin(), _node.get_users().end(),
-                                              [](const program_node* n) {return n->get_selected_impl()->is_cpu() || n->can_be_optimized(); })
-                               || !engine.supports_allocation(allocation_type::usm_device);
-    allocation_type alloc_type = use_lockable_memory ?
-                                 engine.get_lockable_preffered_memory_allocation_type(layout.format.is_image_2d())
-                                                     : allocation_type::usm_device;
+                                              [](const program_node* n) {
+                                     return n->get_selected_impl()->is_cpu() || is_any_user_cpu(n->get_users());
+                                  }) || !engine.supports_allocation(allocation_type::usm_device);
+
     GPU_DEBUG_GET_INSTANCE(debug_config);
+    const auto& lockable_mem_type = engine.get_lockable_preffered_memory_allocation_type(layout.format.is_image_2d());
+    const auto& alloc_type = use_lockable_memory ? lockable_mem_type
+                             : usm_device_allocatable ? allocation_type::usm_device : lockable_mem_type;
+
     if (!_network.is_internal() && (_node.can_be_optimized() || _node.is_type<generic_layer>())) {
         GPU_DEBUG_IF(debug_config->verbose >= 2) {
             GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
@@ -186,7 +277,7 @@ memory::ptr primitive_inst::allocate_output() {
                                              alloc_type,
                                              false);
     } else if (_network.is_internal() && _node.is_output() && _node.is_type<generic_layer>() &&
-               engine.supports_allocation(allocation_type::usm_device)) {
+               engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
         GPU_DEBUG_IF(debug_config->verbose >= 2) {
             GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
         }
diff --git a/inference-engine/thirdparty/clDNN/tests/module_tests/usm_memory_test.cpp b/inference-engine/thirdparty/clDNN/tests/module_tests/usm_memory_test.cpp
index dd414e9a0c8..6e9fd52da2e 100644
--- a/inference-engine/thirdparty/clDNN/tests/module_tests/usm_memory_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/module_tests/usm_memory_test.cpp
@@ -100,7 +100,7 @@ TEST_P(ctor_test, basic) {
 
 INSTANTIATE_TEST_SUITE_P(cldnn_usm, ctor_test, ::testing::ValuesIn(std::vector<usm_test_params>{
     usm_test_params{ allocation_type::usm_host},
-    usm_test_params{ allocation_type::usm_shared},
+//    usm_test_params{ allocation_type::usm_shared}, // Unsupported
     usm_test_params{ allocation_type::usm_device},
 }));
 
@@ -173,7 +173,7 @@ TEST_P(copy_and_read_buffer, basic) {
 
 INSTANTIATE_TEST_SUITE_P(cldnn_usm, copy_and_read_buffer, ::testing::ValuesIn(std::vector<usm_test_params>{
         usm_test_params{ allocation_type::usm_host },
-        usm_test_params{ allocation_type::usm_shared },
+//        usm_test_params{ allocation_type::usm_shared }, // Unsupported
         usm_test_params{ allocation_type::usm_device },
 }));
 
@@ -256,6 +256,6 @@ TEST_P(fill_buffer, DISABLED_basic) {
 
 INSTANTIATE_TEST_SUITE_P(cldnn_usm, fill_buffer, ::testing::ValuesIn(std::vector<usm_test_params>{
     usm_test_params{ allocation_type::usm_host },
-        usm_test_params{ allocation_type::usm_shared },
+//        usm_test_params{ allocation_type::usm_shared }, // Unsupported
         usm_test_params{ allocation_type::usm_device },
 }));