[GPU] UsmHostTensor implementation (#20518)

Co-authored-by: Vladimir Paramuzov <vladimir.paramuzov@intel.com>
This commit is contained in:
Sergey Shlyapnikov
2023-10-18 16:12:15 +04:00
committed by GitHub
parent 90ad4c618d
commit 6e97b91a77
12 changed files with 518 additions and 152 deletions

View File

@@ -1,43 +0,0 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/runtime/so_ptr.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
class RemoteTensorImpl;
class RemoteContextImpl;
/**
 * @brief Host allocator backed by GPU USM (unified shared memory) host buffers.
 *
 * Allocations are fulfilled by creating a USM-host remote tensor through the
 * associated RemoteContextImpl; the allocator keeps that tensor (and therefore
 * the memory) alive until deallocate() is called.
 */
class USMHostAllocator final {
private:
    // Remote tensor backing the most recent allocation; {nullptr, nullptr} when none.
    ov::SoPtr<RemoteTensorImpl> _usm_host_tensor = { nullptr, nullptr };
    // Context used to create USM host tensors.
    std::shared_ptr<RemoteContextImpl> _context = nullptr;

public:
    using Ptr = std::shared_ptr<USMHostAllocator>;

    explicit USMHostAllocator(std::shared_ptr<RemoteContextImpl> context) : _context(context) { }

    /**
     * @brief Allocates memory
     * @param bytes The size in bytes to allocate
     * @param alignment Requested alignment (currently ignored by the implementation)
     * @return Handle to the allocated resource, or nullptr on failure
     */
    void* allocate(const size_t bytes, const size_t alignment = alignof(max_align_t)) noexcept;

    /**
     * @brief Releases handle and all associated memory resources which invalidates the handle.
     * @return false if handle cannot be released, otherwise - true.
     */
    bool deallocate(void* handle, const size_t bytes, size_t alignment = alignof(max_align_t)) noexcept;

    /**
     * @brief Compares two allocators.
     * @return true only when both allocators hold the same non-null backing tensor.
     */
    bool is_equal(const USMHostAllocator& other) const;
};
} // namespace intel_gpu
} // namespace ov

View File

@@ -15,6 +15,7 @@
#endif
#include "openvino/runtime/iremote_tensor.hpp"
#include "intel_gpu/runtime/memory_caps.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
@@ -56,6 +57,8 @@ public:
cldnn::memory::ptr get_memory() const;
cldnn::memory::ptr get_original_memory() const;
void set_memory(cldnn::memory::ptr memory, size_t actual_size);
std::shared_ptr<RemoteContextImpl> get_context() const;
private:
@@ -76,8 +79,11 @@ private:
size_t m_hash = 0;
bool supports_caching() const;
void update_hash();
void update_strides();
void init_properties();
void update_properties();
static TensorType allocation_type_to_tensor_type(cldnn::allocation_type t);
};
} // namespace intel_gpu

View File

@@ -90,9 +90,7 @@ private:
bool need_lockable_mem) const;
std::shared_ptr<ov::ITensor> reinterpret_device_tensor(std::shared_ptr<RemoteTensorImpl> tensor, const ov::Shape new_shape) const;
std::shared_ptr<ov::ITensor> create_host_tensor(const ov::PartialShape& port_shape, const ov::element::Type& port_element_type) const;
std::shared_ptr<ov::ITensor> create_device_tensor(const ov::Shape& pshape, ov::element::Type element_type,
bool need_lockable_memory = false, void* mem_ptr = nullptr) const;
std::shared_ptr<ov::ITensor> create_shared_device_tensor(const ov::Shape& pshape, ov::element::Type element_type, void* usm_host_mem) const;
std::shared_ptr<ov::ITensor> create_device_tensor(const ov::PartialShape& pshape, ov::element::Type element_type, bool need_lockable_memory = false) const;
void allocate_inputs();
void allocate_outputs();

View File

@@ -0,0 +1,42 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/runtime/itensor.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
class RemoteContextImpl;
class RemoteTensorImpl;
/**
 * @brief ov::ITensor implementation whose storage is a USM host allocation.
 *
 * Wraps a RemoteTensorImpl created with a usm_host allocation so the same
 * buffer is accessible from the host while remaining usable by the GPU plugin.
 */
class USMHostTensor : public ov::ITensor {
public:
    // Allocates a new USM-host remote tensor of the given type/shape in `context`.
    USMHostTensor(std::shared_ptr<RemoteContextImpl> context, const element::Type element_type, const Shape& shape);
    // Wraps an existing remote tensor without allocating.
    explicit USMHostTensor(std::shared_ptr<RemoteTensorImpl> tensor);
    ~USMHostTensor() override = default;

    void* data(const element::Type& element_type) const override;
    const element::Type& get_element_type() const override;
    const Shape& get_shape() const override;
    const Strides& get_strides() const override;
    void set_shape(ov::Shape new_shape) override;

    // Replaces the underlying remote tensor (must be a usm_host allocation).
    void set_memory(std::shared_ptr<RemoteTensorImpl> tensor);
    // Access to the underlying RemoteTensorImpl.
    std::shared_ptr<RemoteTensorImpl> get_impl() const;

private:
    // Remote tensor that owns the actual USM host memory.
    std::shared_ptr<RemoteTensorImpl> m_impl;
};
} // namespace intel_gpu
} // namespace ov

View File

@@ -1,37 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
void* USMHostAllocator::allocate(const size_t bytes, const size_t /* alignment */) noexcept {
try {
ov::AnyMap params = { ov::intel_gpu::shared_mem_type(ov::intel_gpu::SharedMemType::USM_HOST_BUFFER) };
_usm_host_tensor = _context->create_tensor(ov::element::u8, {bytes}, params);
if (auto casted = std::dynamic_pointer_cast<RemoteTensorImpl>(_usm_host_tensor._ptr)) {
return casted->get_original_memory()->get_internal_params().mem;
}
return nullptr;
} catch (std::exception&) {
return nullptr;
}
}
bool USMHostAllocator::deallocate(void* /* handle */, const size_t /* bytes */, size_t /* alignment */) noexcept {
try {
_usm_host_tensor = {nullptr, nullptr};
} catch (std::exception&) { }
return true;
}
/// Two allocators compare equal only when both currently hold the same
/// (non-null) backing tensor object.
bool USMHostAllocator::is_equal(const USMHostAllocator& other) const {
    if (_usm_host_tensor == nullptr || other._usm_host_tensor == nullptr)
        return false;
    return _usm_host_tensor._ptr == other._usm_host_tensor._ptr;
}
} // namespace intel_gpu
} // namespace ov

View File

@@ -6,7 +6,7 @@
#include "openvino/runtime/make_tensor.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/usm_host_tensor.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/runtime/device_query.hpp"
#include <memory>
@@ -111,8 +111,7 @@ std::shared_ptr<RemoteContextImpl> RemoteContextImpl::get_this_shared_ptr() {
ov::SoPtr<ov::ITensor> RemoteContextImpl::create_host_tensor(const ov::element::Type type, const ov::Shape& shape) {
if (m_engine->use_unified_shared_memory()) {
USMHostAllocator allocator(get_this_shared_ptr());
return { ov::make_tensor(type, shape, allocator), nullptr };
return { std::make_shared<USMHostTensor>(get_this_shared_ptr(), type, shape), nullptr };
} else {
return { ov::make_tensor(type, shape), nullptr };
}

View File

@@ -2,17 +2,29 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/common_utils.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/plugin.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/runtime/memory_caps.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
/// @brief Maps a low-level cldnn allocation type to the plugin's TensorType.
/// @param t Allocation type reported by the GPU runtime.
/// @return Matching internal tensor type, or TensorType::BT_EMPTY for any
///         allocation type not handled here.
TensorType RemoteTensorImpl::allocation_type_to_tensor_type(cldnn::allocation_type t) {
    // The default case already returns, so no code after the switch is needed
    // (the original trailing return was unreachable dead code).
    switch (t) {
    case cldnn::allocation_type::cl_mem: return TensorType::BT_BUF_INTERNAL;
    case cldnn::allocation_type::usm_host: return TensorType::BT_USM_HOST_INTERNAL;
    case cldnn::allocation_type::usm_device: return TensorType::BT_USM_DEVICE_INTERNAL;
    default: return TensorType::BT_EMPTY;
    }
}
RemoteTensorImpl::RemoteTensorImpl(RemoteContextImpl::Ptr context,
const ov::Shape& shape,
const ov::element::Type& element_type,
@@ -28,20 +40,8 @@ RemoteTensorImpl::RemoteTensorImpl(RemoteContextImpl::Ptr context,
, m_mem(mem)
, m_surf(surf)
, m_plane(plane) {
if (supports_caching()) {
m_hash = cldnn::hash_combine(0, m_mem);
m_hash = cldnn::hash_combine(m_hash, m_surf);
m_hash = cldnn::hash_combine(m_hash, plane);
m_hash = cldnn::hash_combine(m_hash, m_shape.size());
m_hash = cldnn::hash_combine(m_hash, element_type.hash());
for (const auto& d : m_shape) {
m_hash = cldnn::hash_combine(m_hash, d);
}
}
update_strides();
update_hash();
allocate();
init_properties();
}
RemoteTensorImpl::~RemoteTensorImpl() {
@@ -82,12 +82,15 @@ const AnyMap& RemoteTensorImpl::get_properties() const {
m_shape = shape;
if (ov::shape_size(shape) > m_memory_object->count()) {
OPENVINO_ASSERT(!is_shared(), "Cannot call setShape for Tensor created on top of preallocated memory if shape was increased.");
GPU_DEBUG_TRACE_DETAIL << "Remote realloc" << std::endl;
OPENVINO_ASSERT(!is_shared(), "Cannot call set_shape for Tensor created on top of preallocated memory if shape was increased.");
if (!deallocate()) {
OPENVINO_THROW("Cannot deallocate tensor while an attempt to enlarge tensor area in setShape.");
OPENVINO_THROW("Cannot deallocate tensor while an attempt to enlarge tensor area in set_shape.");
}
allocate();
} else {
update_strides();
}
}
@@ -108,23 +111,39 @@ void RemoteTensorImpl::allocate() {
if (enable_caching) {
m_memory_object = context->try_get_cached_memory(m_hash);
if (m_memory_object)
if (m_memory_object) {
update_properties();
update_strides();
return;
}
}
auto& engine = context->get_engine();
// Currently, clDeviceMemAllocINTEL returns memory address allocated to other input blob if the current blob is empty
// W/A for this issue:
// Allocate with non-empty shape and then reinterpret with the original shape
auto shape_copy = m_shape;
for (auto &i : shape_copy) {
if (i == 0)
i = 1;
}
m_layout.set_partial_shape(shape_copy);
const bool reset = false;
switch (m_mem_type) {
case TensorType::BT_BUF_INTERNAL: {
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem);
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem, reset);
break;
}
case TensorType::BT_USM_HOST_INTERNAL: {
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host);
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host, reset);
break;
}
case TensorType::BT_USM_DEVICE_INTERNAL: {
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_device);
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_device, reset);
break;
}
case TensorType::BT_BUF_SHARED: {
@@ -161,6 +180,9 @@ void RemoteTensorImpl::allocate() {
m_memory_object.reset();
}
update_properties();
update_strides();
if (enable_caching)
context->add_to_cache(m_hash, m_memory_object);
}
@@ -181,6 +203,19 @@ bool RemoteTensorImpl::supports_caching() const {
return is_shared();
}
/// Recomputes the memory-caching hash from the shared handles (mem/surface/
/// plane), the element type and the shape. Leaves m_hash untouched when this
/// tensor does not support caching.
void RemoteTensorImpl::update_hash() {
    if (!supports_caching())
        return;

    size_t seed = cldnn::hash_combine(0, m_mem);
    seed = cldnn::hash_combine(seed, m_surf);
    seed = cldnn::hash_combine(seed, m_plane);
    seed = cldnn::hash_combine(seed, m_shape.size());
    seed = cldnn::hash_combine(seed, m_element_type.hash());
    for (const auto& dim : m_shape)
        seed = cldnn::hash_combine(seed, dim);
    m_hash = seed;
}
bool RemoteTensorImpl::is_surface() const noexcept {
return m_mem_type == TensorType::BT_SURF_SHARED ||
m_mem_type == TensorType::BT_IMG_SHARED ||
@@ -196,11 +231,24 @@ cldnn::memory::ptr RemoteTensorImpl::get_original_memory() const {
return m_memory_object;
}
// Rebinds this tensor to the given memory object.
// The logical layout/shape are taken from `memory`, while the stored buffer is
// reinterpreted with a 1-D layout of `actual_size` elements so the real
// (possibly larger, preallocated) capacity stays tracked by the memory object.
void RemoteTensorImpl::set_memory(cldnn::memory::ptr memory, size_t actual_size) {
    auto engine = m_memory_object->get_engine();
    m_layout = memory->get_layout();
    m_shape = m_layout.get_shape();

    // Flat layout sized by the actual allocation, not the logical shape.
    auto actual_layout = m_layout;
    actual_layout.set_partial_shape({ov::Dimension(actual_size)});
    m_memory_object = engine->reinterpret_buffer(*memory, actual_layout);

    update_properties();
    update_strides();
}
// Returns the remote context this tensor was created with.
std::shared_ptr<RemoteContextImpl> RemoteTensorImpl::get_context() const {
    return m_context;
}
void RemoteTensorImpl::init_properties() {
void RemoteTensorImpl::update_properties() {
OPENVINO_ASSERT(is_allocated(), "[GPU] Can't initialize RemoteTensorImpl parameters as memory was not allocated");
auto params = m_memory_object->get_internal_params();

View File

@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/usm_host_tensor.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include "openvino/core/preprocess/input_tensor_info.hpp"
#include "openvino/core/parallel.hpp"
@@ -10,7 +11,6 @@
#include "intel_gpu/plugin/sync_infer_request.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/compiled_model.hpp"
#include "intel_gpu/plugin/variable_state.hpp"
@@ -41,6 +41,15 @@ inline bool can_use_usm_host(const cldnn::engine& engine) {
return can_use_usm;
}
// Converts a partial shape into a static ov::Shape, substituting 0 for every
// dynamic dimension.
inline ov::Shape get_tensor_shape(const ov::PartialShape& pshape) {
    ov::Shape shape;
    shape.reserve(pshape.size());
    for (const auto& dim : pshape)
        shape.push_back(dim.is_dynamic() ? 0 : dim.get_length());
    return shape;
}
inline std::string get_port_name(const ov::Output<const ov::Node>& port, const bool is_legacy_api) {
std::string name;
// TODO: Should use tensor name as the port name, but many legacy tests still use legacy name
@@ -72,7 +81,7 @@ void convert_and_copy(const void* src_ptr, ov::element::Type src_et, void* dst_p
return;
if (src_et == dst_et) {
std::memcpy(dst_ptr, src_ptr, size);
std::memcpy(dst_ptr, src_ptr, size * src_et.size());
return;
}
@@ -425,6 +434,7 @@ void SyncInferRequest::wait() {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::wait::reinterpret_memory");
OPENVINO_ASSERT(!output_memory->get_layout().data_padding, "[GPU] Unexpected padding in output buffer");
output_memory = m_graph->get_engine().reinterpret_buffer(*output_memory, output_layout);
GPU_DEBUG_TRACE_DETAIL << name << " model output: " << output_memory->buffer_ptr() << std::endl;
}
OPENVINO_ASSERT(m_user_outputs.count(name) > 0, "[GPU] Output ", name, " is not found in output tensors map");
@@ -433,6 +443,12 @@ void SyncInferRequest::wait() {
auto remote_ptr = std::dynamic_pointer_cast<RemoteTensorImpl>(output_tensor);
bool is_remote = remote_ptr != nullptr;
if (is_remote) {
GPU_DEBUG_TRACE_DETAIL << name << " handle output tensor (remote): " << remote_ptr->get_original_memory()->buffer_ptr() << std::endl;
} else {
GPU_DEBUG_TRACE_DETAIL << name << " handle output tensor (host): " << output_tensor->data() << std::endl;
}
bool need_output_update = output_layout.bytes_count() == 0 || (output_memory && output_tensor->get_byte_size() != output_memory->size());
if (need_output_update) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::wait::update_output");
@@ -444,6 +460,19 @@ void SyncInferRequest::wait() {
OPENVINO_ASSERT(ov::shape_size(port.get_shape()) == ov::shape_size(mem_shape), "[GPU] Unexpected elements count for output tensor");
mem_shape = port.get_shape();
}
if (port.get_partial_shape().is_dynamic()) {
bool need_reallocate = true;
auto usm_host_tensor = std::dynamic_pointer_cast<USMHostTensor>(output_tensor);
if (usm_host_tensor && output_memory)
need_reallocate = usm_host_tensor->get_impl()->get_original_memory()->size() < output_memory->size();
if (need_reallocate) {
auto& shape_predictor = m_graph->get_network()->get_shape_predictor();
auto actual_memory_shape = predict_shape(name, mem_shape, output_tensor->get_element_type(), shape_predictor);
output_tensor->set_shape(actual_memory_shape);
}
}
output_tensor->set_shape(mem_shape);
}
@@ -453,6 +482,8 @@ void SyncInferRequest::wait() {
auto dst_ptr = static_cast<uint8_t*>(output_tensor->data());
bool same_mem = same_host_mem(output_memory, dst_ptr);
if (!same_mem && output_memory->size()) {
GPU_DEBUG_TRACE_DETAIL << name << " copy from: " << output_memory->buffer_ptr() << " to "
<< (!is_remote ? output_tensor->data() : remote_ptr->get_original_memory()->buffer_ptr()) << std::endl;
if (auto ev = copy_output_data(output_memory, *output_tensor)) {
copy_events.push_back(ev);
}
@@ -492,22 +523,13 @@ void SyncInferRequest::setup_stream_graph() {
std::shared_ptr<ov::ITensor> SyncInferRequest::create_host_tensor(const ov::PartialShape& port_shape, const ov::element::Type& port_element_type) const {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::create_host_tensor");
// Disable USM usage as USMHostAllocator may fail for attempt to allocate 0 bytes
// If we add WA for such case to avoid driver call, then deallocate method will return false and Blob::setShape call will throw an exception
bool use_usm = m_graph->get_engine().use_unified_shared_memory() && !port_shape.is_dynamic();
auto shape = port_shape.is_static() ? port_shape.to_shape() : ov::Shape(port_shape.size(), 0);
auto usm_allocator = USMHostAllocator(m_context);
return use_usm ? ov::make_tensor(port_element_type, shape, usm_allocator)
: ov::make_tensor(port_element_type, shape);
return m_context->create_host_tensor(port_element_type, get_tensor_shape(port_shape))._ptr;
}
std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::Shape& shape, ov::element::Type element_type,
bool need_lockable_memory, void* mem_ptr) const {
std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::PartialShape& port_shape, ov::element::Type element_type,
bool need_lockable_memory) const {
TensorType tensor_type = TensorType::BT_EMPTY;
if (mem_ptr) {
tensor_type = TensorType::BT_USM_SHARED;
} else if (m_graph->get_engine().use_unified_shared_memory()) {
if (m_graph->get_engine().use_unified_shared_memory()) {
tensor_type = need_lockable_memory ? TensorType::BT_USM_HOST_INTERNAL : TensorType::BT_USM_DEVICE_INTERNAL;
} else {
tensor_type = TensorType::BT_BUF_INTERNAL;
@@ -517,24 +539,10 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::Sh
if (!can_use_usm_host(m_graph->get_engine()) && need_lockable_memory)
tensor_type = TensorType::BT_BUF_INTERNAL;
// Currently, clDeviceMemAllocINTEL returns memory address allocated to other input blob if the current blob is empty
// W/A for this issue:
// Allocate with non-empty shape and then reinterpret with the original shape
auto shape_copy = shape;
for (auto &i : shape_copy) {
if (i == 0)
i = 1;
}
return std::make_shared<RemoteTensorImpl>(m_context,
shape_copy,
get_tensor_shape(port_shape),
element_type,
tensor_type,
mem_ptr);
}
std::shared_ptr<ov::ITensor> SyncInferRequest::create_shared_device_tensor(const ov::Shape& shape, ov::element::Type element_type, void* usm_host_mem) const {
return create_device_tensor(shape, element_type, false, usm_host_mem);
tensor_type);
}
TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrapper& user_tensor_wrapper,
@@ -546,17 +554,12 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
auto tensor_shape = user_tensor->get_shape();
bool is_dynamic = port_pshape.is_dynamic();
OPENVINO_ASSERT(std::dynamic_pointer_cast<RemoteTensorImpl>(user_tensor) == nullptr, "[GPU] Unexpected remote tensor");
auto input_ptr = user_tensor->data();
const auto alloc_type = m_graph->get_engine().detect_usm_allocation_type(input_ptr);
const auto is_usm_host = alloc_type == cldnn::allocation_type::usm_host;
bool can_share = is_usm_host &&
!is_convert_required(user_tensor->get_element_type(), element_type) &&
auto usm_host_tensor = std::dynamic_pointer_cast<USMHostTensor>(user_tensor);
bool can_share = usm_host_tensor != nullptr && !is_convert_required(user_tensor->get_element_type(), element_type) &&
can_use_usm_host(m_graph->get_engine());
if (can_share) {
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
return { create_shared_device_tensor(tensor_shape, element_type, input_ptr), user_tensor_wrapper.owner };
return { usm_host_tensor->get_impl(), user_tensor_wrapper.owner };
}
auto actual_memory_shape = tensor_shape;
@@ -689,13 +692,17 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_batched_input(const std
std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string& name,
const ov::Output<const ov::Node>& port,
const TensorWrapper& user_tensor_wrapper) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::prepare_input");
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, openvino::itt::handle("SyncInferRequest::prepare_input: " + name));
auto pshape = port.get_partial_shape();
auto is_dynamic = pshape.is_dynamic();
auto user_tensor = user_tensor_wrapper.ptr;
auto element_type = user_tensor->get_element_type();
auto remote_ptr = std::dynamic_pointer_cast<RemoteTensorImpl>(user_tensor);
auto usm_host_ptr = std::dynamic_pointer_cast<USMHostTensor>(user_tensor);
bool is_remote = remote_ptr != nullptr;
bool is_usm_host_tensor = usm_host_ptr != nullptr;
GPU_DEBUG_TRACE_DETAIL << "Prepare input for " << name << " ( is_remote ? " << is_remote << ")" << std::endl;
GPU_DEBUG_TRACE_DETAIL << " port shape : " << pshape.to_string() << std::endl;
GPU_DEBUG_TRACE_DETAIL << " user_tensor shape: " << user_tensor->get_shape().to_string() << std::endl;
@@ -713,12 +720,16 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
user_tensor->get_shape(),
") are incompatible");
if (is_remote) {
m_plugin_inputs[name] = user_tensor_wrapper;
}
auto device_tensor_et = convert_to_supported_device_type(element_type);
bool convert_needed = is_convert_required(element_type, device_tensor_et);
if (is_remote) {
m_plugin_inputs[name] = user_tensor_wrapper;
} else if (is_usm_host_tensor && !convert_needed) {
m_plugin_inputs[name] = {usm_host_ptr->get_impl(), user_tensor_wrapper.owner};
is_remote = true;
}
bool update_device_tensor = m_plugin_inputs.count(name) == 0 || (m_plugin_inputs[name].owner == TensorOwner::USER && !is_remote);
if (update_device_tensor) {
@@ -780,6 +791,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
}
}
GPU_DEBUG_TRACE_DETAIL << name << " prepare input: " << memory->buffer_ptr() << std::endl;
const cldnn::primitive_id internal_name = "parameter:" + name;
network->set_input_data(internal_name, memory);
@@ -839,6 +851,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_output(const std::strin
auto output_tensor = std::dynamic_pointer_cast<RemoteTensorImpl>(m_plugin_outputs.at(name).ptr);
auto output_memory = output_tensor->get_memory();
GPU_DEBUG_TRACE_DETAIL << name << " prepare output: " << output_memory->buffer_ptr() << std::endl;
return network->set_output_memory(internal_name, output_memory);
}

View File

@@ -0,0 +1,50 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/usm_host_tensor.hpp"
#include "intel_gpu/plugin/remote_tensor.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include <memory>
namespace ov {
namespace intel_gpu {
// Creates a fresh USM-host remote tensor of the given element type and shape.
USMHostTensor::USMHostTensor(std::shared_ptr<RemoteContextImpl> context, const element::Type element_type, const Shape& shape)
    : m_impl(std::make_shared<RemoteTensorImpl>(context, shape, element_type, TensorType::BT_USM_HOST_INTERNAL)) {}

// Wraps an already allocated remote tensor without reallocating.
USMHostTensor::USMHostTensor(std::shared_ptr<RemoteTensorImpl> tensor)
    : m_impl(tensor) {}

// Returns a host-accessible pointer to the underlying USM buffer.
// NOTE(review): `element_type` is part of the ov::ITensor interface but is not
// validated here — confirm whether a type check is expected.
void* USMHostTensor::data(const element::Type& element_type) const {
    return m_impl->get_original_memory()->buffer_ptr();
}
// Element type is forwarded from the underlying remote tensor.
const element::Type& USMHostTensor::get_element_type() const {
    return m_impl->get_element_type();
}

// Shape is forwarded from the underlying remote tensor.
const Shape& USMHostTensor::get_shape() const {
    return m_impl->get_shape();
}

// Strides are forwarded from the underlying remote tensor.
const Strides& USMHostTensor::get_strides() const {
    return m_impl->get_strides();
}

// Resizes the underlying remote tensor; may reallocate its memory when the
// new shape does not fit into the existing buffer.
void USMHostTensor::set_shape(ov::Shape new_shape) {
    m_impl->set_shape(new_shape);
}
// Replaces the backing tensor; only usm_host allocations are accepted so the
// buffer stays directly accessible from the host.
void USMHostTensor::set_memory(std::shared_ptr<RemoteTensorImpl> tensor) {
    OPENVINO_ASSERT(tensor->get_original_memory()->get_allocation_type() == cldnn::allocation_type::usm_host, "[GPU] Unexpected allocation type");
    m_impl = tensor;
}

// Exposes the underlying RemoteTensorImpl.
std::shared_ptr<RemoteTensorImpl> USMHostTensor::get_impl() const {
    return m_impl;
}
} // namespace intel_gpu
} // namespace ov

View File

@@ -0,0 +1,50 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <memory>
#include "openvino/core/dimension.hpp"
#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/transpose.hpp"
#include "openvino/op/result.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/concat.hpp"
namespace tests {
// Builds a minimal LLM-style KV-cache subgraph:
//   past_key_values [B, H, L, F] ─┐
//                                 Concat(axis=2) ─► Convert ─► "present_key_values"
//   new_token [B, L, H, F] ─► Transpose(0,2,1,3) ─┘      └─► MatMul(in_matmul, ·) ─► "matmul_out"
inline std::shared_ptr<ov::Model> make_llm_kv_cache_pattern(ov::Dimension batch = ov::Dimension::dynamic(),
                                                            ov::Dimension n_heads = ov::Dimension::dynamic(),
                                                            ov::Dimension n_features = ov::Dimension::dynamic(),
                                                            ov::element::Type_t element_type = ov::element::f32) {
    const ov::PartialShape kv_cache_shape{batch, n_heads, -1, n_features};
    const ov::PartialShape new_token_shape{batch, -1, n_heads, n_features};
    const ov::PartialShape matmul_in_shape{batch, n_heads, -1, -1};

    auto past_kv = std::make_shared<ov::op::v0::Parameter>(element_type, kv_cache_shape);
    past_kv->set_friendly_name("past_key_values");
    auto new_token = std::make_shared<ov::op::v0::Parameter>(element_type, new_token_shape);
    new_token->set_friendly_name("new_token_input");
    auto matmul_in = std::make_shared<ov::op::v0::Parameter>(element_type, matmul_in_shape);
    matmul_in->set_friendly_name("in_matmul");

    // Bring the new token to [B, H, L, F] so it can be appended along the sequence axis.
    auto perm = ov::op::v0::Constant::create(ov::element::i32, {new_token_shape.size()}, {0, 2, 1, 3});
    auto token_transposed = std::make_shared<ov::op::v1::Transpose>(new_token, perm);
    auto updated_kv = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{past_kv, token_transposed}, 2);
    auto kv_converted = std::make_shared<ov::op::v0::Convert>(updated_kv, element_type);

    auto kv_present = std::make_shared<ov::op::v0::Result>(kv_converted);
    kv_present->set_friendly_name("present_key_values");

    auto matmul = std::make_shared<ov::op::v0::MatMul>(matmul_in, updated_kv, false, false);
    auto matmul_out = std::make_shared<ov::op::v0::Result>(matmul);
    matmul_out->set_friendly_name("matmul_out");

    ov::ParameterVector params{past_kv, new_token, matmul_in};
    ov::ResultVector results{kv_present, matmul_out};
    return std::make_shared<ov::Model>(results, params, "LLM-KV-Cache");
}
} // namespace tests

View File

@@ -0,0 +1,23 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <limits.h>
#include "behavior/ov_infer_request/iteration_chaining.hpp"
#include "common_test_utils/test_constants.hpp"
#include "openvino/runtime/properties.hpp"
using namespace ov::test::behavior;
namespace {

// Single test configuration: force f32 inference precision.
const std::vector<ov::AnyMap> configs = {
        { ov::hint::inference_precision(ov::element::f32) }
};

// Runs the shared OVIterationChaining behavior test suite on the GPU device.
INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVIterationChaining,
                         ::testing::Combine(
                                 ::testing::Values(ov::test::utils::DEVICE_GPU),
                                 ::testing::ValuesIn(configs)),
                         OVIterationChaining::getTestCaseName);

} // namespace

View File

@@ -0,0 +1,217 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "common_test_utils/ov_tensor_utils.hpp"
#include "openvino/core/node_vector.hpp"
#include "openvino/core/partial_shape.hpp"
#include "openvino/core/preprocess/pre_post_process.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/transpose.hpp"
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"
#include "shared_test_classes/base/utils/compare_results.hpp"
#include "transformations/rt_info/decompression.hpp"
#include "subgraphs_builders.hpp"
using namespace ov::test;
namespace SubgraphTestsDefinitions {
using KVCacheTestParams = std::tuple<std::vector<InputShape>, // input shapes
ov::element::Type, // in/out precision
std::map<std::string, std::string>>; // additional config
// Parameterized smoke test for the LLM KV-cache subgraph: compiles the pattern
// for GPU with dynamic shapes and compares inference results with references.
class KVCacheTest : public testing::WithParamInterface<KVCacheTestParams>, public SubgraphBaseTest {
public:
    // Encodes input shapes, precision and extra config into the test name.
    static std::string get_test_case_name(testing::TestParamInfo<KVCacheTestParams> obj) {
        std::vector<InputShape> input_shapes;
        ov::element::Type element_type;
        std::map<std::string, std::string> additional_config;
        std::tie(input_shapes, element_type, additional_config) = obj.param;

        std::ostringstream name;
        for (const auto& shape : input_shapes)
            name << ov::test::utils::partialShape2str({shape.first}) << "_";
        name << "TS=";
        for (const auto& shape : input_shapes) {
            name << "(";
            // Static target shapes, "_"-separated.
            for (auto itr = shape.second.begin(); itr != shape.second.end(); ++itr) {
                if (itr != shape.second.begin())
                    name << "_";
                name << ov::test::utils::vec2str(*itr);
            }
            name << ")_";
        }
        name << "precision=" << element_type << "_";
        name << "config=(";
        for (const auto& entry : additional_config)
            name << entry.first << ", " << entry.second << ":";
        name << ")";
        return name.str();
    }

protected:
    void SetUp() override {
        targetDevice = ov::test::utils::DEVICE_GPU;

        std::vector<InputShape> input_shapes;
        ov::element::Type element_type;
        std::map<std::string, std::string> additional_config;
        std::tie(input_shapes, element_type, additional_config) = GetParam();

        configuration.insert(additional_config.begin(), additional_config.end());
        init_input_shapes(input_shapes);
        inType = outType = element_type;
        // Dims of the first (KV-cache) input: [0] batch, [1] n_heads, [3] n_features.
        function = tests::make_llm_kv_cache_pattern(inputDynamicShapes[0][0], inputDynamicShapes[0][1], inputDynamicShapes[0][3], element_type);
    }
};
// Executes the parameterized KV-cache subgraph on GPU and compares outputs
// against the reference implementation.
TEST_P(KVCacheTest, CompareWithRefs) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()
    run();
}
namespace {

// Both supported inference precisions are covered.
const std::vector<ov::element::Type> precisions = {ov::element::f32, ov::element::f16};

// Dynamic shapes: the KV-cache input grows along the sequence axis between
// the two target shapes of each input.
const std::vector<std::vector<InputShape>> input_shapes_basic = {
    {
        {{-1, 32, -1, 80}, { {1, 32, 0, 80}, {1, 32, 20, 80} }},
        {{-1, -1, 32, 80}, { {1, 20, 32, 80}, {1, 1, 32, 80} }},
        {{-1, 32, -1, -1}, { {1, 32, 1, 20}, {1, 32, 1, 21} }}
    },
};

INSTANTIATE_TEST_SUITE_P(smoke_GPU_Dynamic,
                         KVCacheTest,
                         ::testing::Combine(::testing::ValuesIn(input_shapes_basic),
                                            ::testing::ValuesIn(precisions),
                                            ::testing::Values(std::map<std::string, std::string>())),
                         KVCacheTest::get_test_case_name);
} // namespace
// End-to-end KV-cache test: performs one initial "prompt" inference and then
// several single-token iterations, feeding the produced KV-cache output back
// as the next iteration's cache input, checking every result against the
// reference interpreter.
TEST(KVCacheTest, smoke_multipleIterations) {
#if defined(ANDROID)
    GTEST_SKIP();
#endif
    auto core = ov::Core();

    const size_t batch = 1;
    const size_t n_heads = 32;
    const size_t n_features = 80;
    const size_t context_size = 20;
    size_t cache_size = 0;

    ov::element::Type element_type = ov::element::f16;

    auto model = tests::make_llm_kv_cache_pattern(batch, n_heads, n_features, element_type);
    auto compiled_model = core.compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16));

    auto input0 = model->get_parameters().at(0);
    auto input1 = model->get_parameters().at(1);
    auto input2 = model->get_parameters().at(2);
    auto output0 = model->get_results().at(0);
    auto output1 = model->get_results().at(1);

    auto infer_request = compiled_model.create_infer_request();
    // NOTE(review): return values unused on purpose — presumably these calls
    // force creation of the remote I/O tensors up front; confirm.
    auto input0_tensor_remote_io = infer_request.get_tensor(input0);
    auto input1_tensor_remote_io = infer_request.get_tensor(input1);
    auto input2_tensor_remote_io = infer_request.get_tensor(input2);
    auto output0_tensor_remote_io = infer_request.get_tensor(output0);
    auto output1_tensor_remote_io = infer_request.get_tensor(output1);

    // Compares actual outputs with reference results per model output.
    // Fix: take `expected` by const reference — the original signature copied
    // the whole vector of tensors by value on every call (and was inconsistent
    // with the by-reference `actual` parameter).
    auto compare_tensors = [&model](const std::vector<ov::Tensor>& expected, const std::vector<ov::Tensor>& actual) {
            ASSERT_EQ(expected.size(), actual.size());
            ASSERT_EQ(expected.size(), model->get_results().size());
            auto compareMap = ov::test::utils::getCompareMap();
            const auto& results = model->get_results();
            for (size_t j = 0; j < results.size(); j++) {
                const auto result = results[j];
                for (size_t i = 0; i < result->get_input_size(); ++i) {
                    std::shared_ptr<ov::Node> inputNode = result->get_input_node_shared_ptr(i);
                    // Look through Convert nodes inserted before the Result.
                    if (std::dynamic_pointer_cast<ov::op::v0::Convert>(inputNode)) {
                        std::shared_ptr<ov::Node> nextNodePtr = inputNode->get_input_node_shared_ptr(0);
                        if (!ngraph::is_type<ov::op::v0::Result>(nextNodePtr)) {
                            inputNode = nextNodePtr;
                        }
                    }
                    auto it = compareMap.find(inputNode->get_type_info());
                    ASSERT_NE(it, compareMap.end());
                    it->second(inputNode, i, expected[j], actual[j], 1e-4f, 1e-4f);
                }
            }
    };

    {
        // Initial "prompt" inference: empty cache + context_size new tokens.
        const ov::Shape kv_cache_size_initial = {batch, n_heads, cache_size, n_features};
        const ov::Shape new_token_size_initial = {batch, context_size, n_heads, n_features};
        const ov::Shape matmul_in_size_initial = {batch, n_heads, context_size, context_size};

        auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size_initial);
        auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_initial);

        auto kv_cache_input = infer_request.get_tensor(input0);
        kv_cache_input.set_shape(kv_cache_size_initial);

        // Reference results from the interpreter on a resized clone.
        auto ref_model = model->clone();
        ngraph::helpers::resize_function(ref_model, {kv_cache_input.get_shape(), new_token_data.get_shape(), matmul_data.get_shape()});
        auto results = ngraph::helpers::interpretFunction(ref_model, {{input0, kv_cache_input}, {input1, new_token_data}, {input2, matmul_data}});

        infer_request.set_tensor(input0, kv_cache_input);
        infer_request.set_tensor(input1, new_token_data);
        infer_request.set_tensor(input2, matmul_data);

        infer_request.infer();

        compare_tensors(results, {infer_request.get_tensor(output0), infer_request.get_tensor(output1)});

        cache_size += context_size;
    }

    // Token-by-token iterations: the previous KV-cache output becomes the
    // next iteration's cache input; context grows by one token per step.
    const size_t input_tokens = 1;
    const size_t niters = 10;
    const ov::Shape new_token_size = {batch, input_tokens, n_heads, n_features};
    size_t context_length = cache_size + input_tokens;
    for (size_t i = 0; i < niters; i++, context_length += input_tokens) {
        ov::Shape matmul_in_size_loop = {batch, n_heads, input_tokens, context_length};
        auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size);
        auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_loop);

        // Chain the produced KV cache back into the model.
        auto kv_cache_input = infer_request.get_tensor(output0);
        auto kv_shape = kv_cache_input.get_shape();

        auto ref_model = model->clone();
        ngraph::helpers::resize_function(ref_model, {kv_shape, new_token_data.get_shape(), matmul_data.get_shape()});
        auto results = ngraph::helpers::interpretFunction(ref_model, {{input0, kv_cache_input}, {input1, new_token_data}, {input2, matmul_data}});

        auto new_token_input = infer_request.get_tensor(input1);
        new_token_input.set_shape(new_token_data.get_shape());
        auto matmul_input = infer_request.get_tensor(input2);
        matmul_input.set_shape(matmul_data.get_shape());

        new_token_data.copy_to(new_token_input);
        matmul_data.copy_to(matmul_input);

        infer_request.set_tensor(input0, kv_cache_input);
        infer_request.set_tensor(input1, new_token_input);
        infer_request.set_tensor(input2, matmul_input);

        infer_request.infer();

        compare_tensors(results, {infer_request.get_tensor(output0), infer_request.get_tensor(output1)});
    }
}
} // namespace SubgraphTestsDefinitions