[GPU] UsmHostTensor implementation (#20518)

Co-authored-by: Vladimir Paramuzov <vladimir.paramuzov@intel.com>
This commit is contained in:
Sergey Shlyapnikov
2023-10-18 16:12:15 +04:00
committed by GitHub
parent 90ad4c618d
commit 6e97b91a77
12 changed files with 518 additions and 152 deletions

View File

@@ -1,43 +0,0 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/runtime/so_ptr.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
class RemoteTensorImpl;
class RemoteContextImpl;
/**
 * @brief Host allocator backed by GPU USM (unified shared memory) host buffers.
 *
 * Allocations are fulfilled by creating a USM-host remote tensor through the
 * associated RemoteContextImpl; the allocator keeps that tensor (and therefore
 * the memory) alive until deallocate() is called.
 */
class USMHostAllocator final {
private:
    // Remote tensor backing the most recent allocation; {nullptr, nullptr} when none.
    ov::SoPtr<RemoteTensorImpl> _usm_host_tensor = { nullptr, nullptr };
    // Context used to create USM host tensors.
    std::shared_ptr<RemoteContextImpl> _context = nullptr;

public:
    using Ptr = std::shared_ptr<USMHostAllocator>;

    explicit USMHostAllocator(std::shared_ptr<RemoteContextImpl> context) : _context(context) { }

    /**
     * @brief Allocates memory
     * @param bytes The size in bytes to allocate
     * @param alignment Requested alignment (currently ignored by the implementation)
     * @return Handle to the allocated resource, or nullptr on failure
     */
    void* allocate(const size_t bytes, const size_t alignment = alignof(max_align_t)) noexcept;

    /**
     * @brief Releases handle and all associated memory resources which invalidates the handle.
     * @return false if handle cannot be released, otherwise - true.
     */
    bool deallocate(void* handle, const size_t bytes, size_t alignment = alignof(max_align_t)) noexcept;

    /**
     * @brief Compares two allocators.
     * @return true only when both allocators hold the same non-null backing tensor.
     */
    bool is_equal(const USMHostAllocator& other) const;
};
} // namespace intel_gpu
} // namespace ov

View File

@@ -15,6 +15,7 @@
#endif
#include "openvino/runtime/iremote_tensor.hpp"
#include "intel_gpu/runtime/memory_caps.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
@@ -56,6 +57,8 @@ public:
cldnn::memory::ptr get_memory() const;
cldnn::memory::ptr get_original_memory() const;
void set_memory(cldnn::memory::ptr memory, size_t actual_size);
std::shared_ptr<RemoteContextImpl> get_context() const;
private:
@@ -76,8 +79,11 @@ private:
size_t m_hash = 0;
bool supports_caching() const;
void update_hash();
void update_strides();
void init_properties();
void update_properties();
static TensorType allocation_type_to_tensor_type(cldnn::allocation_type t);
};
} // namespace intel_gpu

View File

@@ -90,9 +90,7 @@ private:
bool need_lockable_mem) const;
std::shared_ptr<ov::ITensor> reinterpret_device_tensor(std::shared_ptr<RemoteTensorImpl> tensor, const ov::Shape new_shape) const;
std::shared_ptr<ov::ITensor> create_host_tensor(const ov::PartialShape& port_shape, const ov::element::Type& port_element_type) const;
std::shared_ptr<ov::ITensor> create_device_tensor(const ov::Shape& pshape, ov::element::Type element_type,
bool need_lockable_memory = false, void* mem_ptr = nullptr) const;
std::shared_ptr<ov::ITensor> create_shared_device_tensor(const ov::Shape& pshape, ov::element::Type element_type, void* usm_host_mem) const;
std::shared_ptr<ov::ITensor> create_device_tensor(const ov::PartialShape& pshape, ov::element::Type element_type, bool need_lockable_memory = false) const;
void allocate_inputs();
void allocate_outputs();

View File

@@ -0,0 +1,42 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/runtime/itensor.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
class RemoteContextImpl;
class RemoteTensorImpl;
/**
 * @brief ov::ITensor implementation whose storage is a USM host allocation.
 *
 * Wraps a RemoteTensorImpl created with a usm_host allocation so the same
 * buffer is accessible from the host while remaining usable by the GPU plugin.
 */
class USMHostTensor : public ov::ITensor {
public:
    // Allocates a new USM-host remote tensor of the given type/shape in `context`.
    USMHostTensor(std::shared_ptr<RemoteContextImpl> context, const element::Type element_type, const Shape& shape);
    // Wraps an existing remote tensor without allocating.
    explicit USMHostTensor(std::shared_ptr<RemoteTensorImpl> tensor);
    ~USMHostTensor() override = default;

    void* data(const element::Type& element_type) const override;
    const element::Type& get_element_type() const override;
    const Shape& get_shape() const override;
    const Strides& get_strides() const override;
    void set_shape(ov::Shape new_shape) override;

    // Replaces the underlying remote tensor (must be a usm_host allocation).
    void set_memory(std::shared_ptr<RemoteTensorImpl> tensor);
    // Access to the underlying RemoteTensorImpl.
    std::shared_ptr<RemoteTensorImpl> get_impl() const;

private:
    // Remote tensor that owns the actual USM host memory.
    std::shared_ptr<RemoteTensorImpl> m_impl;
};
} // namespace intel_gpu
} // namespace ov

View File

@@ -1,37 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
void* USMHostAllocator::allocate(const size_t bytes, const size_t /* alignment */) noexcept {
try {
ov::AnyMap params = { ov::intel_gpu::shared_mem_type(ov::intel_gpu::SharedMemType::USM_HOST_BUFFER) };
_usm_host_tensor = _context->create_tensor(ov::element::u8, {bytes}, params);
if (auto casted = std::dynamic_pointer_cast<RemoteTensorImpl>(_usm_host_tensor._ptr)) {
return casted->get_original_memory()->get_internal_params().mem;
}
return nullptr;
} catch (std::exception&) {
return nullptr;
}
}
bool USMHostAllocator::deallocate(void* /* handle */, const size_t /* bytes */, size_t /* alignment */) noexcept {
try {
_usm_host_tensor = {nullptr, nullptr};
} catch (std::exception&) { }
return true;
}
/// Two allocators compare equal only when both currently hold the same
/// (non-null) backing tensor object.
bool USMHostAllocator::is_equal(const USMHostAllocator& other) const {
    if (_usm_host_tensor == nullptr || other._usm_host_tensor == nullptr)
        return false;
    return _usm_host_tensor._ptr == other._usm_host_tensor._ptr;
}
} // namespace intel_gpu
} // namespace ov

View File

@@ -6,7 +6,7 @@
#include "openvino/runtime/make_tensor.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/usm_host_tensor.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/runtime/device_query.hpp"
#include <memory>
@@ -111,8 +111,7 @@ std::shared_ptr<RemoteContextImpl> RemoteContextImpl::get_this_shared_ptr() {
ov::SoPtr<ov::ITensor> RemoteContextImpl::create_host_tensor(const ov::element::Type type, const ov::Shape& shape) {
if (m_engine->use_unified_shared_memory()) {
USMHostAllocator allocator(get_this_shared_ptr());
return { ov::make_tensor(type, shape, allocator), nullptr };
return { std::make_shared<USMHostTensor>(get_this_shared_ptr(), type, shape), nullptr };
} else {
return { ov::make_tensor(type, shape), nullptr };
}

View File

@@ -2,17 +2,29 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/common_utils.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/plugin.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/runtime/memory_caps.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
/// @brief Maps a low-level cldnn allocation type to the plugin's TensorType.
/// @param t Allocation type reported by the GPU runtime.
/// @return Matching internal tensor type, or TensorType::BT_EMPTY for any
///         allocation type not handled here.
TensorType RemoteTensorImpl::allocation_type_to_tensor_type(cldnn::allocation_type t) {
    // The default case already returns, so no code after the switch is needed
    // (the original trailing return was unreachable dead code).
    switch (t) {
    case cldnn::allocation_type::cl_mem: return TensorType::BT_BUF_INTERNAL;
    case cldnn::allocation_type::usm_host: return TensorType::BT_USM_HOST_INTERNAL;
    case cldnn::allocation_type::usm_device: return TensorType::BT_USM_DEVICE_INTERNAL;
    default: return TensorType::BT_EMPTY;
    }
}
RemoteTensorImpl::RemoteTensorImpl(RemoteContextImpl::Ptr context,
const ov::Shape& shape,
const ov::element::Type& element_type,
@@ -28,20 +40,8 @@ RemoteTensorImpl::RemoteTensorImpl(RemoteContextImpl::Ptr context,
, m_mem(mem)
, m_surf(surf)
, m_plane(plane) {
if (supports_caching()) {
m_hash = cldnn::hash_combine(0, m_mem);
m_hash = cldnn::hash_combine(m_hash, m_surf);
m_hash = cldnn::hash_combine(m_hash, plane);
m_hash = cldnn::hash_combine(m_hash, m_shape.size());
m_hash = cldnn::hash_combine(m_hash, element_type.hash());
for (const auto& d : m_shape) {
m_hash = cldnn::hash_combine(m_hash, d);
}
}
update_strides();
update_hash();
allocate();
init_properties();
}
RemoteTensorImpl::~RemoteTensorImpl() {
@@ -82,12 +82,15 @@ const AnyMap& RemoteTensorImpl::get_properties() const {
m_shape = shape;
if (ov::shape_size(shape) > m_memory_object->count()) {
OPENVINO_ASSERT(!is_shared(), "Cannot call setShape for Tensor created on top of preallocated memory if shape was increased.");
GPU_DEBUG_TRACE_DETAIL << "Remote realloc" << std::endl;
OPENVINO_ASSERT(!is_shared(), "Cannot call set_shape for Tensor created on top of preallocated memory if shape was increased.");
if (!deallocate()) {
OPENVINO_THROW("Cannot deallocate tensor while an attempt to enlarge tensor area in setShape.");
OPENVINO_THROW("Cannot deallocate tensor while an attempt to enlarge tensor area in set_shape.");
}
allocate();
} else {
update_strides();
}
}
@@ -108,23 +111,39 @@ void RemoteTensorImpl::allocate() {
if (enable_caching) {
m_memory_object = context->try_get_cached_memory(m_hash);
if (m_memory_object)
if (m_memory_object) {
update_properties();
update_strides();
return;
}
}
auto& engine = context->get_engine();
// Currently, clDeviceMemAllocINTEL returns memory address allocated to other input blob if the current blob is empty
// W/A for this issue:
// Allocate with non-empty shape and then reinterpret with the original shape
auto shape_copy = m_shape;
for (auto &i : shape_copy) {
if (i == 0)
i = 1;
}
m_layout.set_partial_shape(shape_copy);
const bool reset = false;
switch (m_mem_type) {
case TensorType::BT_BUF_INTERNAL: {
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem);
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset);
break;
}
case TensorType::BT_USM_HOST_INTERNAL: {
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host);
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host, reset);
break;
}
case TensorType::BT_USM_DEVICE_INTERNAL: {
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_device);
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_device, reset);
break;
}
case TensorType::BT_BUF_SHARED: {
@@ -161,6 +180,9 @@ void RemoteTensorImpl::allocate() {
m_memory_object.reset();
}
update_properties();
update_strides();
if (enable_caching)
context->add_to_cache(m_hash, m_memory_object);
}
@@ -181,6 +203,19 @@ bool RemoteTensorImpl::supports_caching() const {
return is_shared();
}
/// Recomputes the memory-caching hash from the shared handles (mem/surface/
/// plane), the element type and the shape. Leaves m_hash untouched when this
/// tensor does not support caching.
void RemoteTensorImpl::update_hash() {
    if (!supports_caching())
        return;

    size_t seed = cldnn::hash_combine(0, m_mem);
    seed = cldnn::hash_combine(seed, m_surf);
    seed = cldnn::hash_combine(seed, m_plane);
    seed = cldnn::hash_combine(seed, m_shape.size());
    seed = cldnn::hash_combine(seed, m_element_type.hash());
    for (const auto& dim : m_shape)
        seed = cldnn::hash_combine(seed, dim);
    m_hash = seed;
}
bool RemoteTensorImpl::is_surface() const noexcept {
return m_mem_type == TensorType::BT_SURF_SHARED ||
m_mem_type == TensorType::BT_IMG_SHARED ||
@@ -196,11 +231,24 @@ cldnn::memory::ptr RemoteTensorImpl::get_original_memory() const {
return m_memory_object;
}
// Rebinds this tensor to the given memory object.
// The logical layout/shape are taken from `memory`, while the stored buffer is
// reinterpreted with a 1-D layout of `actual_size` elements so the real
// (possibly larger, preallocated) capacity stays tracked by the memory object.
void RemoteTensorImpl::set_memory(cldnn::memory::ptr memory, size_t actual_size) {
    auto engine = m_memory_object->get_engine();
    m_layout = memory->get_layout();
    m_shape = m_layout.get_shape();

    // Flat layout sized by the actual allocation, not the logical shape.
    auto actual_layout = m_layout;
    actual_layout.set_partial_shape({ov::Dimension(actual_size)});
    m_memory_object = engine->reinterpret_buffer(*memory, actual_layout);

    update_properties();
    update_strides();
}
// Returns the remote context this tensor was created with.
std::shared_ptr<RemoteContextImpl> RemoteTensorImpl::get_context() const {
    return m_context;
}
void RemoteTensorImpl::init_properties() {
void RemoteTensorImpl::update_properties() {
OPENVINO_ASSERT(is_allocated(), "[GPU] Can't initialize RemoteTensorImpl parameters as memory was not allocated");
auto params = m_memory_object->get_internal_params();

View File

@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/usm_host_tensor.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include "openvino/core/preprocess/input_tensor_info.hpp"
#include "openvino/core/parallel.hpp"
@@ -10,7 +11,6 @@
#include "intel_gpu/plugin/sync_infer_request.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/compiled_model.hpp"
#include "intel_gpu/plugin/variable_state.hpp"
@@ -41,6 +41,15 @@ inline bool can_use_usm_host(const cldnn::engine& engine) {
return can_use_usm;
}
// Converts a partial shape into a static ov::Shape, substituting 0 for every
// dynamic dimension.
inline ov::Shape get_tensor_shape(const ov::PartialShape& pshape) {
    ov::Shape shape;
    shape.reserve(pshape.size());
    for (const auto& dim : pshape)
        shape.push_back(dim.is_dynamic() ? 0 : dim.get_length());
    return shape;
}
inline std::string get_port_name(const ov::Output<const ov::Node>& port, const bool is_legacy_api) {
std::string name;
// TODO: Should use tensor name as the port name, but many legacy tests still use legacy name
@@ -72,7 +81,7 @@ void convert_and_copy(const void* src_ptr, ov::element::Type src_et, void* dst_p
return;
if (src_et == dst_et) {
std::memcpy(dst_ptr, src_ptr, size);
std::memcpy(dst_ptr, src_ptr, size * src_et.size());
return;
}
@@ -425,6 +434,7 @@ void SyncInferRequest::wait() {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::wait::reinterpret_memory");
OPENVINO_ASSERT(!output_memory->get_layout().data_padding, "[GPU] Unexpected padding in output buffer");
output_memory = m_graph->get_engine().reinterpret_buffer(*output_memory, output_layout);
GPU_DEBUG_TRACE_DETAIL << name << " model output: " << output_memory->buffer_ptr() << std::endl;
}
OPENVINO_ASSERT(m_user_outputs.count(name) > 0, "[GPU] Output ", name, " is not found in output tensors map");
@@ -433,6 +443,12 @@ void SyncInferRequest::wait() {
auto remote_ptr = std::dynamic_pointer_cast<RemoteTensorImpl>(output_tensor);
bool is_remote = remote_ptr != nullptr;
if (is_remote) {
GPU_DEBUG_TRACE_DETAIL << name << " handle output tensor (remote): " << remote_ptr->get_original_memory()->buffer_ptr() << std::endl;
} else {
GPU_DEBUG_TRACE_DETAIL << name << " handle output tensor (host): " << output_tensor->data() << std::endl;
}
bool need_output_update = output_layout.bytes_count() == 0 || (output_memory && output_tensor->get_byte_size() != output_memory->size());
if (need_output_update) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::wait::update_output");
@@ -444,6 +460,19 @@ void SyncInferRequest::wait() {
OPENVINO_ASSERT(ov::shape_size(port.get_shape()) == ov::shape_size(mem_shape), "[GPU] Unexpected elements count for output tensor");
mem_shape = port.get_shape();
}
if (port.get_partial_shape().is_dynamic()) {
bool need_reallocate = true;
auto usm_host_tensor = std::dynamic_pointer_cast<USMHostTensor>(output_tensor);
if (usm_host_tensor && output_memory)
need_reallocate = usm_host_tensor->get_impl()->get_original_memory()->size() < output_memory->size();
if (need_reallocate) {
auto& shape_predictor = m_graph->get_network()->get_shape_predictor();
auto actual_memory_shape = predict_shape(name, mem_shape, output_tensor->get_element_type(), shape_predictor);
output_tensor->set_shape(actual_memory_shape);
}
}
output_tensor->set_shape(mem_shape);
}
@@ -453,6 +482,8 @@ void SyncInferRequest::wait() {
auto dst_ptr = static_cast<uint8_t*>(output_tensor->data());
bool same_mem = same_host_mem(output_memory, dst_ptr);
if (!same_mem && output_memory->size()) {
GPU_DEBUG_TRACE_DETAIL << name << " copy from: " << output_memory->buffer_ptr() << " to "
<< (!is_remote ? output_tensor->data() : remote_ptr->get_original_memory()->buffer_ptr()) << std::endl;
if (auto ev = copy_output_data(output_memory, *output_tensor)) {
copy_events.push_back(ev);
}
@@ -492,22 +523,13 @@ void SyncInferRequest::setup_stream_graph() {
std::shared_ptr<ov::ITensor> SyncInferRequest::create_host_tensor(const ov::PartialShape& port_shape, const ov::element::Type& port_element_type) const {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::create_host_tensor");
// Disable USM usage as USMHostAllocator may fail for attempt to allocate 0 bytes
// If we add WA for such case to avoid driver call, then deallocate method will return false and Blob::setShape call will throw an exception
bool use_usm = m_graph->get_engine().use_unified_shared_memory() && !port_shape.is_dynamic();
auto shape = port_shape.is_static() ? port_shape.to_shape() : ov::Shape(port_shape.size(), 0);
auto usm_allocator = USMHostAllocator(m_context);
return use_usm ? ov::make_tensor(port_element_type, shape, usm_allocator)
: ov::make_tensor(port_element_type, shape);
return m_context->create_host_tensor(port_element_type, get_tensor_shape(port_shape))._ptr;
}
std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::Shape& shape, ov::element::Type element_type,
bool need_lockable_memory, void* mem_ptr) const {
std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::PartialShape& port_shape, ov::element::Type element_type,
bool need_lockable_memory) const {
TensorType tensor_type = TensorType::BT_EMPTY;
if (mem_ptr) {
tensor_type = TensorType::BT_USM_SHARED;
} else if (m_graph->get_engine().use_unified_shared_memory()) {
if (m_graph->get_engine().use_unified_shared_memory()) {
tensor_type = need_lockable_memory ? TensorType::BT_USM_HOST_INTERNAL : TensorType::BT_USM_DEVICE_INTERNAL;
} else {
tensor_type = TensorType::BT_BUF_INTERNAL;
@@ -517,24 +539,10 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::Sh
if (!can_use_usm_host(m_graph->get_engine()) && need_lockable_memory)
tensor_type = TensorType::BT_BUF_INTERNAL;
// Currently, clDeviceMemAllocINTEL returns memory address allocated to other input blob if the current blob is empty
// W/A for this issue:
// Allocate with non-empty shape and then reinterpret with the original shape
auto shape_copy = shape;
for (auto &i : shape_copy) {
if (i == 0)
i = 1;
}
return std::make_shared<RemoteTensorImpl>(m_context,
shape_copy,
get_tensor_shape(port_shape),
element_type,
tensor_type,
mem_ptr);
}
std::shared_ptr<ov::ITensor> SyncInferRequest::create_shared_device_tensor(const ov::Shape& shape, ov::element::Type element_type, void* usm_host_mem) const {
return create_device_tensor(shape, element_type, false, usm_host_mem);
tensor_type);
}
TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrapper& user_tensor_wrapper,
@@ -546,17 +554,12 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
auto tensor_shape = user_tensor->get_shape();
bool is_dynamic = port_pshape.is_dynamic();
OPENVINO_ASSERT(std::dynamic_pointer_cast<RemoteTensorImpl>(user_tensor) == nullptr, "[GPU] Unexpected remote tensor");
auto input_ptr = user_tensor->data();
const auto alloc_type = m_graph->get_engine().detect_usm_allocation_type(input_ptr);
const auto is_usm_host = alloc_type == cldnn::allocation_type::usm_host;
bool can_share = is_usm_host &&
!is_convert_required(user_tensor->get_element_type(), element_type) &&
auto usm_host_tensor = std::dynamic_pointer_cast<USMHostTensor>(user_tensor);
bool can_share = usm_host_tensor != nullptr && !is_convert_required(user_tensor->get_element_type(), element_type) &&
can_use_usm_host(m_graph->get_engine());
if (can_share) {
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
return { create_shared_device_tensor(tensor_shape, element_type, input_ptr), user_tensor_wrapper.owner };
return { usm_host_tensor->get_impl(), user_tensor_wrapper.owner };
}
auto actual_memory_shape = tensor_shape;
@@ -689,13 +692,17 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_batched_input(const std
std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string& name,
const ov::Output<const ov::Node>& port,
const TensorWrapper& user_tensor_wrapper) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::prepare_input");
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, openvino::itt::handle("SyncInferRequest::prepare_input: " + name));
auto pshape = port.get_partial_shape();
auto is_dynamic = pshape.is_dynamic();
auto user_tensor = user_tensor_wrapper.ptr;
auto element_type = user_tensor->get_element_type();
auto remote_ptr = std::dynamic_pointer_cast<RemoteTensorImpl>(user_tensor);
auto usm_host_ptr = std::dynamic_pointer_cast<USMHostTensor>(user_tensor);
bool is_remote = remote_ptr != nullptr;
bool is_usm_host_tensor = usm_host_ptr != nullptr;
GPU_DEBUG_TRACE_DETAIL << "Prepare input for " << name << " ( is_remote ? " << is_remote << ")" << std::endl;
GPU_DEBUG_TRACE_DETAIL << " port shape : " << pshape.to_string() << std::endl;
GPU_DEBUG_TRACE_DETAIL << " user_tensor shape: " << user_tensor->get_shape().to_string() << std::endl;
@@ -713,12 +720,16 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
user_tensor->get_shape(),
") are incompatible");
if (is_remote) {
m_plugin_inputs[name] = user_tensor_wrapper;
}
auto device_tensor_et = convert_to_supported_device_type(element_type);
bool convert_needed = is_convert_required(element_type, device_tensor_et);
if (is_remote) {
m_plugin_inputs[name] = user_tensor_wrapper;
} else if (is_usm_host_tensor && !convert_needed) {
m_plugin_inputs[name] = {usm_host_ptr->get_impl(), user_tensor_wrapper.owner};
is_remote = true;
}
bool update_device_tensor = m_plugin_inputs.count(name) == 0 || (m_plugin_inputs[name].owner == TensorOwner::USER && !is_remote);
if (update_device_tensor) {
@@ -780,6 +791,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
}
}
GPU_DEBUG_TRACE_DETAIL << name << " prepare input: " << memory->buffer_ptr() << std::endl;
const cldnn::primitive_id internal_name = "parameter:" + name;
network->set_input_data(internal_name, memory);
@@ -839,6 +851,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_output(const std::strin
auto output_tensor = std::dynamic_pointer_cast<RemoteTensorImpl>(m_plugin_outputs.at(name).ptr);
auto output_memory = output_tensor->get_memory();
GPU_DEBUG_TRACE_DETAIL << name << " prepare output: " << output_memory->buffer_ptr() << std::endl;
return network->set_output_memory(internal_name, output_memory);
}

View File

@@ -0,0 +1,50 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/usm_host_tensor.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
// Creates a fresh USM-host remote tensor of the given element type and shape.
USMHostTensor::USMHostTensor(std::shared_ptr<RemoteContextImpl> context, const element::Type element_type, const Shape& shape)
    : m_impl(std::make_shared<RemoteTensorImpl>(context, shape, element_type, TensorType::BT_USM_HOST_INTERNAL)) {}

// Wraps an already allocated remote tensor without reallocating.
USMHostTensor::USMHostTensor(std::shared_ptr<RemoteTensorImpl> tensor)
    : m_impl(tensor) {}

// Returns a host-accessible pointer to the underlying USM buffer.
// NOTE(review): `element_type` is part of the ov::ITensor interface but is not
// validated here — confirm whether a type check is expected.
void* USMHostTensor::data(const element::Type& element_type) const {
    return m_impl->get_original_memory()->buffer_ptr();
}
// Element type is forwarded from the underlying remote tensor.
const element::Type& USMHostTensor::get_element_type() const {
    return m_impl->get_element_type();
}

// Shape is forwarded from the underlying remote tensor.
const Shape& USMHostTensor::get_shape() const {
    return m_impl->get_shape();
}

// Strides are forwarded from the underlying remote tensor.
const Strides& USMHostTensor::get_strides() const {
    return m_impl->get_strides();
}

// Resizes the underlying remote tensor; may reallocate its memory when the
// new shape does not fit into the existing buffer.
void USMHostTensor::set_shape(ov::Shape new_shape) {
    m_impl->set_shape(new_shape);
}
// Replaces the backing tensor; only usm_host allocations are accepted so the
// buffer stays directly accessible from the host.
void USMHostTensor::set_memory(std::shared_ptr<RemoteTensorImpl> tensor) {
    OPENVINO_ASSERT(tensor->get_original_memory()->get_allocation_type() == cldnn::allocation_type::usm_host, "[GPU] Unexpected allocation type");
    m_impl = tensor;
}

// Exposes the underlying RemoteTensorImpl.
std::shared_ptr<RemoteTensorImpl> USMHostTensor::get_impl() const {
    return m_impl;
}
} // namespace intel_gpu
} // namespace ov

View File

@@ -0,0 +1,50 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <memory>
#include "openvino/core/dimension.hpp"
#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/transpose.hpp"
#include "openvino/op/result.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/concat.hpp"
namespace tests {
// Builds a minimal LLM-style KV-cache subgraph:
//   past_key_values [B, H, L, F] ─┐
//                                 Concat(axis=2) ─► Convert ─► "present_key_values"
//   new_token [B, L, H, F] ─► Transpose(0,2,1,3) ─┘      └─► MatMul(in_matmul, ·) ─► "matmul_out"
inline std::shared_ptr<ov::Model> make_llm_kv_cache_pattern(ov::Dimension batch = ov::Dimension::dynamic(),
                                                            ov::Dimension n_heads = ov::Dimension::dynamic(),
                                                            ov::Dimension n_features = ov::Dimension::dynamic(),
                                                            ov::element::Type_t element_type = ov::element::f32) {
    const ov::PartialShape kv_cache_shape{batch, n_heads, -1, n_features};
    const ov::PartialShape new_token_shape{batch, -1, n_heads, n_features};
    const ov::PartialShape matmul_in_shape{batch, n_heads, -1, -1};

    auto past_kv = std::make_shared<ov::op::v0::Parameter>(element_type, kv_cache_shape);
    past_kv->set_friendly_name("past_key_values");
    auto new_token = std::make_shared<ov::op::v0::Parameter>(element_type, new_token_shape);
    new_token->set_friendly_name("new_token_input");
    auto matmul_in = std::make_shared<ov::op::v0::Parameter>(element_type, matmul_in_shape);
    matmul_in->set_friendly_name("in_matmul");

    // Bring the new token to [B, H, L, F] so it can be appended along the sequence axis.
    auto perm = ov::op::v0::Constant::create(ov::element::i32, {new_token_shape.size()}, {0, 2, 1, 3});
    auto token_transposed = std::make_shared<ov::op::v1::Transpose>(new_token, perm);
    auto updated_kv = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{past_kv, token_transposed}, 2);
    auto kv_converted = std::make_shared<ov::op::v0::Convert>(updated_kv, element_type);

    auto kv_present = std::make_shared<ov::op::v0::Result>(kv_converted);
    kv_present->set_friendly_name("present_key_values");

    auto matmul = std::make_shared<ov::op::v0::MatMul>(matmul_in, updated_kv, false, false);
    auto matmul_out = std::make_shared<ov::op::v0::Result>(matmul);
    matmul_out->set_friendly_name("matmul_out");

    ov::ParameterVector params{past_kv, new_token, matmul_in};
    ov::ResultVector results{kv_present, matmul_out};
    return std::make_shared<ov::Model>(results, params, "LLM-KV-Cache");
}
} // namespace tests

View File

@@ -0,0 +1,23 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <limits.h>
#include "behavior/ov_infer_request/iteration_chaining.hpp"
#include "common_test_utils/test_constants.hpp"
#include "openvino/runtime/properties.hpp"
using namespace ov::test::behavior;
namespace {

// Single test configuration: force f32 inference precision.
const std::vector<ov::AnyMap> configs = {
        { ov::hint::inference_precision(ov::element::f32) }
};

// Runs the shared OVIterationChaining behavior test suite on the GPU device.
INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVIterationChaining,
                         ::testing::Combine(
                                 ::testing::Values(ov::test::utils::DEVICE_GPU),
                                 ::testing::ValuesIn(configs)),
                         OVIterationChaining::getTestCaseName);

} // namespace

View File

@@ -0,0 +1,217 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "common_test_utils/ov_tensor_utils.hpp"
#include "openvino/core/node_vector.hpp"
#include "openvino/core/partial_shape.hpp"
#include "openvino/core/preprocess/pre_post_process.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/transpose.hpp"
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"
#include "shared_test_classes/base/utils/compare_results.hpp"
#include "transformations/rt_info/decompression.hpp"
#include "subgraphs_builders.hpp"
using namespace ov::test;
namespace SubgraphTestsDefinitions {
using KVCacheTestParams = std::tuple<std::vector<InputShape>, // input shapes
ov::element::Type, // in/out precision
std::map<std::string, std::string>>; // additional config
// Parameterized smoke test for the LLM KV-cache subgraph: compiles the pattern
// for GPU with dynamic shapes and compares inference results with references.
class KVCacheTest : public testing::WithParamInterface<KVCacheTestParams>, public SubgraphBaseTest {
public:
    // Encodes input shapes, precision and extra config into the test name.
    static std::string get_test_case_name(testing::TestParamInfo<KVCacheTestParams> obj) {
        std::vector<InputShape> input_shapes;
        ov::element::Type element_type;
        std::map<std::string, std::string> additional_config;
        std::tie(input_shapes, element_type, additional_config) = obj.param;

        std::ostringstream name;
        for (const auto& shape : input_shapes)
            name << ov::test::utils::partialShape2str({shape.first}) << "_";
        name << "TS=";
        for (const auto& shape : input_shapes) {
            name << "(";
            // Static target shapes, "_"-separated.
            for (auto itr = shape.second.begin(); itr != shape.second.end(); ++itr) {
                if (itr != shape.second.begin())
                    name << "_";
                name << ov::test::utils::vec2str(*itr);
            }
            name << ")_";
        }
        name << "precision=" << element_type << "_";
        name << "config=(";
        for (const auto& entry : additional_config)
            name << entry.first << ", " << entry.second << ":";
        name << ")";
        return name.str();
    }

protected:
    void SetUp() override {
        targetDevice = ov::test::utils::DEVICE_GPU;

        std::vector<InputShape> input_shapes;
        ov::element::Type element_type;
        std::map<std::string, std::string> additional_config;
        std::tie(input_shapes, element_type, additional_config) = GetParam();

        configuration.insert(additional_config.begin(), additional_config.end());
        init_input_shapes(input_shapes);
        inType = outType = element_type;
        // Dims of the first (KV-cache) input: [0] batch, [1] n_heads, [3] n_features.
        function = tests::make_llm_kv_cache_pattern(inputDynamicShapes[0][0], inputDynamicShapes[0][1], inputDynamicShapes[0][3], element_type);
    }
};
// Executes the parameterized KV-cache subgraph on GPU and compares outputs
// against the reference implementation.
TEST_P(KVCacheTest, CompareWithRefs) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()
    run();
}
namespace {

// Both supported inference precisions are covered.
const std::vector<ov::element::Type> precisions = {ov::element::f32, ov::element::f16};

// Dynamic shapes: the KV-cache input grows along the sequence axis between
// the two target shapes of each input.
const std::vector<std::vector<InputShape>> input_shapes_basic = {
    {
        {{-1, 32, -1, 80}, { {1, 32, 0, 80}, {1, 32, 20, 80} }},
        {{-1, -1, 32, 80}, { {1, 20, 32, 80}, {1, 1, 32, 80} }},
        {{-1, 32, -1, -1}, { {1, 32, 1, 20}, {1, 32, 1, 21} }}
    },
};

INSTANTIATE_TEST_SUITE_P(smoke_GPU_Dynamic,
                         KVCacheTest,
                         ::testing::Combine(::testing::ValuesIn(input_shapes_basic),
                                            ::testing::ValuesIn(precisions),
                                            ::testing::Values(std::map<std::string, std::string>())),
                         KVCacheTest::get_test_case_name);
} // namespace
// End-to-end KV-cache test: performs one initial "prompt" inference and then
// several single-token iterations, feeding the produced KV-cache output back
// as the next iteration's cache input, checking every result against the
// reference interpreter.
TEST(KVCacheTest, smoke_multipleIterations) {
#if defined(ANDROID)
    GTEST_SKIP();
#endif
    auto core = ov::Core();

    const size_t batch = 1;
    const size_t n_heads = 32;
    const size_t n_features = 80;
    const size_t context_size = 20;
    size_t cache_size = 0;

    ov::element::Type element_type = ov::element::f16;

    auto model = tests::make_llm_kv_cache_pattern(batch, n_heads, n_features, element_type);
    auto compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16));

    auto input0 = model->get_parameters().at(0);
    auto input1 = model->get_parameters().at(1);
    auto input2 = model->get_parameters().at(2);
    auto output0 = model->get_results().at(0);
    auto output1 = model->get_results().at(1);

    auto infer_request = compiled_model.create_infer_request();
    // NOTE(review): return values unused on purpose — presumably these calls
    // force creation of the remote I/O tensors up front; confirm.
    auto input0_tensor_remote_io = infer_request.get_tensor(input0);
    auto input1_tensor_remote_io = infer_request.get_tensor(input1);
    auto input2_tensor_remote_io = infer_request.get_tensor(input2);
    auto output0_tensor_remote_io = infer_request.get_tensor(output0);
    auto output1_tensor_remote_io = infer_request.get_tensor(output1);

    // Compares actual outputs with reference results per model output.
    // Fix: take `expected` by const reference — the original signature copied
    // the whole vector of tensors by value on every call (and was inconsistent
    // with the by-reference `actual` parameter).
    auto compare_tensors = [&model](const std::vector<ov::Tensor>& expected, const std::vector<ov::Tensor>& actual) {
            ASSERT_EQ(expected.size(), actual.size());
            ASSERT_EQ(expected.size(), model->get_results().size());
            auto compareMap = ov::test::utils::getCompareMap();
            const auto& results = model->get_results();
            for (size_t j = 0; j < results.size(); j++) {
                const auto result = results[j];
                for (size_t i = 0; i < result->get_input_size(); ++i) {
                    std::shared_ptr<ov::Node> inputNode = result->get_input_node_shared_ptr(i);
                    // Look through Convert nodes inserted before the Result.
                    if (std::dynamic_pointer_cast<ov::op::v0::Convert>(inputNode)) {
                        std::shared_ptr<ov::Node> nextNodePtr = inputNode->get_input_node_shared_ptr(0);
                        if (!ngraph::is_type<ov::op::v0::Result>(nextNodePtr)) {
                            inputNode = nextNodePtr;
                        }
                    }
                    auto it = compareMap.find(inputNode->get_type_info());
                    ASSERT_NE(it, compareMap.end());
                    it->second(inputNode, i, expected[j], actual[j], 1e-4f, 1e-4f);
                }
            }
    };

    {
        // Initial "prompt" inference: empty cache + context_size new tokens.
        const ov::Shape kv_cache_size_initial = {batch, n_heads, cache_size, n_features};
        const ov::Shape new_token_size_initial = {batch, context_size, n_heads, n_features};
        const ov::Shape matmul_in_size_initial = {batch, n_heads, context_size, context_size};

        auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size_initial);
        auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_initial);

        auto kv_cache_input = infer_request.get_tensor(input0);
        kv_cache_input.set_shape(kv_cache_size_initial);

        // Reference results from the interpreter on a resized clone.
        auto ref_model = model->clone();
        ngraph::helpers::resize_function(ref_model, {kv_cache_input.get_shape(), new_token_data.get_shape(), matmul_data.get_shape()});
        auto results = ngraph::helpers::interpretFunction(ref_model, {{input0, kv_cache_input}, {input1, new_token_data}, {input2, matmul_data}});

        infer_request.set_tensor(input0, kv_cache_input);
        infer_request.set_tensor(input1, new_token_data);
        infer_request.set_tensor(input2, matmul_data);

        infer_request.infer();

        compare_tensors(results, {infer_request.get_tensor(output0), infer_request.get_tensor(output1)});

        cache_size += context_size;
    }

    // Token-by-token iterations: the previous KV-cache output becomes the
    // next iteration's cache input; context grows by one token per step.
    const size_t input_tokens = 1;
    const size_t niters = 10;
    const ov::Shape new_token_size = {batch, input_tokens, n_heads, n_features};
    size_t context_length = cache_size + input_tokens;
    for (size_t i = 0; i < niters; i++, context_length += input_tokens) {
        ov::Shape matmul_in_size_loop = {batch, n_heads, input_tokens, context_length};
        auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size);
        auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_loop);

        // Chain the produced KV cache back into the model.
        auto kv_cache_input = infer_request.get_tensor(output0);
        auto kv_shape = kv_cache_input.get_shape();

        auto ref_model = model->clone();
        ngraph::helpers::resize_function(ref_model, {kv_shape, new_token_data.get_shape(), matmul_data.get_shape()});
        auto results = ngraph::helpers::interpretFunction(ref_model, {{input0, kv_cache_input}, {input1, new_token_data}, {input2, matmul_data}});

        auto new_token_input = infer_request.get_tensor(input1);
        new_token_input.set_shape(new_token_data.get_shape());
        auto matmul_input = infer_request.get_tensor(input2);
        matmul_input.set_shape(matmul_data.get_shape());

        new_token_data.copy_to(new_token_input);
        matmul_data.copy_to(matmul_input);

        infer_request.set_tensor(input0, kv_cache_input);
        infer_request.set_tensor(input1, new_token_input);
        infer_request.set_tensor(input2, matmul_input);

        infer_request.infer();

        compare_tensors(results, {infer_request.get_tensor(output0), infer_request.get_tensor(output1)});
    }
}
} // namespace SubgraphTestsDefinitions