From 3a7acbb5cc6cbbbe758a6762a583693493d343a6 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Fri, 15 Dec 2023 13:57:49 +0400
Subject: [PATCH] [GPU] Allow precision conversion on get_state() call (#21658)

---
 .../include/intel_gpu/plugin/common_utils.hpp |   4 +
 .../intel_gpu/plugin/variable_state.hpp       |   9 +-
 .../intel_gpu/src/plugin/common_utils.cpp     | 152 ++++++++++++++++++
 .../src/plugin/sync_infer_request.cpp         |  96 -----------
 .../intel_gpu/src/plugin/variable_state.cpp   |  17 +-
 .../intel_gpu/tests/unit/CMakeLists.txt       |   1 +
 6 files changed, 176 insertions(+), 103 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/plugin/common_utils.cpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
index b69ada3946f..0e8b92e5d63 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include "intel_gpu/runtime/layout.hpp"
+#include "intel_gpu/runtime/memory.hpp"
 #include "intel_gpu/runtime/shape_predictor.hpp"
 #include "openvino/core/layout.hpp"
 #include "openvino/core/type/element_type.hpp"
@@ -102,6 +103,9 @@ inline void ForceExit() {
     std::_Exit(-1);
 }
 
+void convert_and_copy(const cldnn::memory::ptr src, ov::ITensor const* dst, const cldnn::stream& stream);
+void convert_and_copy(const ov::ITensor* src, ov::ITensor const* dst, const cldnn::stream& stream);
+
 }  // namespace intel_gpu
 
 inline std::ostream& operator<<(std::ostream& os, const ov::AnyMap& params) {
diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/variable_state.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/variable_state.hpp
index 68604a1889f..ffce17acb2a 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/variable_state.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/variable_state.hpp
@@ -3,6 +3,7 @@
 //
 
 #pragma once
+#include "openvino/core/type/element_type.hpp"
 #include "openvino/runtime/ivariable_state.hpp"
 #include "intel_gpu/runtime/layout.hpp"
 #include "intel_gpu/runtime/shape_predictor.hpp"
@@ -15,10 +16,14 @@ namespace intel_gpu {
 class RemoteContextImpl;
 
 struct VariableStateInfo {
-    VariableStateInfo(const std::string& id, const cldnn::layout& layout) : m_id(id), m_layout(layout) {}
+    VariableStateInfo(const std::string& id, const cldnn::layout& layout, ov::element::Type_t user_specified_type = ov::element::undefined)
+        : m_id(id)
+        , m_layout(layout)
+        , m_user_specified_type(user_specified_type) {}
 
     std::string m_id;
     cldnn::layout m_layout;
+    ov::element::Type m_user_specified_type;
 };
 
 class VariableState : public ov::IVariableState {
@@ -38,6 +43,7 @@ public:
 
 private:
     cldnn::layout m_layout;
+    ov::element::Type m_user_specified_type;
    std::shared_ptr<RemoteContextImpl> m_context;
    std::shared_ptr<cldnn::ShapePredictor> m_shape_predictor;
     bool m_is_set = false;
@@ -45,6 +51,7 @@ private:
     size_t actual_size = 0;
 
     void update_device_buffer();
+    ov::element::Type get_user_specified_type() const;
 };
 
 using VariablesMap = std::unordered_map<std::string, VariableState::Ptr>;
diff --git a/src/plugins/intel_gpu/src/plugin/common_utils.cpp b/src/plugins/intel_gpu/src/plugin/common_utils.cpp
new file mode 100644
index 00000000000..0375aa495f2
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/common_utils.cpp
@@ -0,0 +1,152 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "intel_gpu/plugin/common_utils.hpp"
+#include "intel_gpu/plugin/remote_tensor.hpp"
+#include "intel_gpu/runtime/layout.hpp"
+#include "intel_gpu/runtime/memory.hpp"
+#include "intel_gpu/runtime/memory_caps.hpp"
+
+#include "openvino/core/type/element_type.hpp"
+#include "openvino/runtime/tensor.hpp"
+#include "openvino/op/util/op_types.hpp"
+
+#include <cstring>
+#include <memory>
+
+namespace {
+
+template <typename src_t, typename dst_t>
+void convert_and_copy_no_pad(const src_t* src, dst_t* dst, size_t size) {
+    OPENVINO_ASSERT(src && dst, "[GPU] Src or Dst ptr is null");
+    for (size_t i = 0; i < size; i++)
+        dst[i] = static_cast<dst_t>(src[i]);
+}
+
+template <typename src_t, typename dst_t>
+void convert_and_copy_padded_source(const src_t* src, dst_t* dst, cldnn::layout layout) {
+    cldnn::tensor size = layout.get_tensor();
+    for (int64_t b = 0; b < size.batch[0]; b++) {
+        for (int64_t f = 0; f < size.feature[0]; f++) {
+            for (int64_t w = 0; w < size.spatial[3]; w++) {
+                for (int64_t z = 0; z < size.spatial[2]; z++) {
+                    for (int64_t y = 0; y < size.spatial[1]; y++) {
+                        for (int64_t x = 0; x < size.spatial[0]; x++) {
+                            *dst++ = static_cast<dst_t>(src[layout.get_linear_offset(cldnn::tensor(b, f, x, y, z, w))]);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+void convert_and_copy(const void* src_ptr, ov::element::Type src_et, void* dst_ptr, ov::element::Type dst_et, size_t size, cldnn::layout layout) {
+    if (size == 0)
+        return;
+
+    if (src_et == dst_et && !layout.data_padding) {
+        std::memcpy(dst_ptr, src_ptr, size * src_et.size());
+        return;
+    }
+
+    #define CASE(s_et, d_et, s_type, d_type)                                                                                        \
+    if (src_et == s_et && dst_et == d_et) {                                                                                         \
+        if (static_cast<bool>(layout.data_padding)) {                                                                               \
+            return convert_and_copy_padded_source(static_cast<const s_type*>(src_ptr), static_cast<d_type*>(dst_ptr), layout);      \
+        } else {                                                                                                                    \
+            return convert_and_copy_no_pad(static_cast<const s_type*>(src_ptr), static_cast<d_type*>(dst_ptr), size);               \
+        }                                                                                                                           \
+    }
+
+    // For unsupported inputs
+    CASE(ov::element::f64, ov::element::f32, double, float);
+    CASE(ov::element::i16, ov::element::f32, int16_t, float);
+    CASE(ov::element::u16, ov::element::f32, uint16_t, float);
+    CASE(ov::element::u64, ov::element::i32, uint64_t, int32_t);
+    CASE(ov::element::i64, ov::element::i32, int64_t, int32_t);
+    CASE(ov::element::u32, ov::element::i32, uint32_t, int32_t);
+
+    // For unsupported outputs
+    CASE(ov::element::f32, ov::element::f64, float, double);
+    CASE(ov::element::i32, ov::element::i64, int32_t, int64_t);
+    CASE(ov::element::i32, ov::element::u64, int32_t, uint64_t);
+    CASE(ov::element::i32, ov::element::u32, int32_t, uint32_t);
+    CASE(ov::element::f32, ov::element::i16, float, int16_t);
+    CASE(ov::element::f32, ov::element::u16, float, uint16_t);
+
+    // TODO: Need instances below?
+    CASE(ov::element::u32, ov::element::i64, uint32_t, int64_t);
+    CASE(ov::element::u32, ov::element::u64, uint32_t, uint64_t);
+
+    // For state conversions
+    CASE(ov::element::f32, ov::element::f32, float, float);
+    CASE(ov::element::f16, ov::element::f16, ov::float16, ov::float16);
+    CASE(ov::element::f32, ov::element::f16, float, ov::float16);
+    CASE(ov::element::f16, ov::element::f32, ov::float16, float);
+
+    OPENVINO_THROW("[GPU] Unsupported element types combination for copy: ", src_et, " -> ", dst_et);
+}
+
+}  // namespace
+
+namespace ov {
+namespace intel_gpu {
+
+void convert_and_copy(const cldnn::memory::ptr src, ov::ITensor const* dst, const cldnn::stream& stream) {
+    auto src_et = src->get_layout().data_type;
+    auto dst_et = dst->get_element_type();
+
+    size_t size = ov::shape_size(dst->get_shape());
+
+    cldnn::mem_lock<uint8_t> src_lock(src, stream);
+    std::unique_ptr<cldnn::mem_lock<uint8_t>> dst_lock = nullptr;
+
+    const void* src_ptr = src_lock.data();
+    void* dst_ptr = nullptr;
+
+    if (auto remote = dynamic_cast<const RemoteTensorImpl*>(dst)) {
+        auto mem = remote->get_original_memory();
+        dst_lock.reset(new cldnn::mem_lock<uint8_t>(mem, stream));
+        dst_ptr = dst_lock->data();
+    } else {
+        dst_ptr = dst->data();
+    }
+
+    return ::convert_and_copy(src_ptr, src_et, dst_ptr, dst_et, size, src->get_layout());
+}
+
+void convert_and_copy(const ov::ITensor* src, ov::ITensor const* dst, const cldnn::stream& stream) {
+    auto src_et = src->get_element_type();
+    auto dst_et = dst->get_element_type();
+
+    size_t size = ov::shape_size(dst->get_shape());
+
+    const void* src_ptr = nullptr;
+    void* dst_ptr = nullptr;
+
+    std::unique_ptr<cldnn::mem_lock<uint8_t>> src_lock = nullptr;
+    std::unique_ptr<cldnn::mem_lock<uint8_t>> dst_lock = nullptr;
+
+    if (auto remote = dynamic_cast<const RemoteTensorImpl*>(src)) {
+        auto mem = remote->get_original_memory();
+        src_lock.reset(new cldnn::mem_lock<uint8_t>(mem, stream));
+        src_ptr = src_lock->data();
+    } else {
+        src_ptr = src->data();
+    }
+
+    if (auto remote = dynamic_cast<const RemoteTensorImpl*>(dst)) {
+        auto mem = remote->get_original_memory();
+        dst_lock.reset(new cldnn::mem_lock<uint8_t>(mem, stream));
+        dst_ptr = dst_lock->data();
+    } else {
+        dst_ptr = dst->data();
+    }
+
+    return ::convert_and_copy(src_ptr, src_et, dst_ptr, dst_et, size, cldnn::layout({}, ov::element::undefined, cldnn::format::bfyx, cldnn::padding()));
+}
+
+}  // namespace intel_gpu
+}  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
index fde0b40085f..eb84b99f042 100644
--- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -62,106 +62,10 @@ inline std::string get_port_name(const ov::Output<const ov::Node>& port, const b
     return name;
 }
 
-template <typename src_t, typename dst_t>
-void convert_any_copy(const src_t* src, dst_t* dst, size_t size) {
-    OPENVINO_ASSERT(src && dst, "[GPU] Src or Dst ptr is null");
-    for (size_t i = 0; i < size; i++)
-        dst[i] = static_cast<dst_t>(src[i]);
-}
-
-void convert_and_copy(const void* src_ptr, ov::element::Type src_et, void* dst_ptr, ov::element::Type dst_et, size_t size) {
-    if (size == 0)
-        return;
-
-    if (src_et == dst_et) {
-        std::memcpy(dst_ptr, src_ptr, size * src_et.size());
-        return;
-    }
-
-    #define CASE(s_et, d_et, s_type, d_type) \
-    if (src_et == s_et && dst_et == d_et) return convert_any_copy(static_cast<const s_type*>(src_ptr), static_cast<d_type*>(dst_ptr), size)
-
-    // For unsupported inputs
-    CASE(ov::element::f64, ov::element::f32, double, float);
-    CASE(ov::element::i16, ov::element::f32, int16_t, float);
-    CASE(ov::element::u16, ov::element::f32, uint16_t, float);
-    CASE(ov::element::u64, ov::element::i32, uint64_t, int32_t);
-    CASE(ov::element::i64, ov::element::i32, int64_t, int32_t);
-    CASE(ov::element::u32, ov::element::i32, uint32_t, int32_t);
-
-    // For unsupported outputs
-    CASE(ov::element::f32, ov::element::f64, float, double);
-    CASE(ov::element::i32, ov::element::i64, int32_t, int64_t);
-    CASE(ov::element::i32, ov::element::u64, int32_t, uint64_t);
-    CASE(ov::element::i32, ov::element::u32, int32_t, uint32_t);
-    CASE(ov::element::f32, ov::element::i16, float, int16_t);
-    CASE(ov::element::f32, ov::element::u16, float, uint16_t);
-
-    // TODO: Need instances below?
-    CASE(ov::element::u32, ov::element::i64, uint32_t, int64_t);
-    CASE(ov::element::u32, ov::element::u64, uint32_t, uint64_t);
-
-    OPENVINO_THROW("[GPU] Unsupported element types combination for copy: ", src_et, " -> ", dst_et);
-}
-
 bool is_convert_required(ov::element::Type src_et, ov::element::Type dst_et) {
     return src_et != dst_et && !(dst_et == ov::element::boolean && src_et == ov::element::u8);
 }
 
-void convert_and_copy(const cldnn::memory::ptr src, ov::ITensor const* dst, const cldnn::stream& stream) {
-    auto src_et = src->get_layout().data_type;
-    auto dst_et = dst->get_element_type();
-
-    size_t size = ov::shape_size(dst->get_shape());
-
-    cldnn::mem_lock<uint8_t> src_lock(src, stream);
-    std::unique_ptr<cldnn::mem_lock<uint8_t>> dst_lock = nullptr;
-
-    const void* src_ptr = src_lock.data();
-    void* dst_ptr = nullptr;
-
-    if (auto remote = dynamic_cast<const RemoteTensorImpl*>(dst)) {
-        auto mem = remote->get_original_memory();
-        dst_lock.reset(new cldnn::mem_lock<uint8_t>(mem, stream));
-        dst_ptr = dst_lock->data();
-    } else {
-        dst_ptr = dst->data();
-    }
-
-    return convert_and_copy(src_ptr, src_et, dst_ptr, dst_et, size);
-}
-
-void convert_and_copy(const ov::ITensor* src, ov::ITensor const* dst, const cldnn::stream& stream) {
-    auto src_et = src->get_element_type();
-    auto dst_et = dst->get_element_type();
-
-    size_t size = ov::shape_size(dst->get_shape());
-
-    const void* src_ptr = nullptr;
-    void* dst_ptr = nullptr;
-
-    std::unique_ptr<cldnn::mem_lock<uint8_t>> src_lock = nullptr;
-    std::unique_ptr<cldnn::mem_lock<uint8_t>> dst_lock = nullptr;
-
-    if (auto remote = dynamic_cast<const RemoteTensorImpl*>(src)) {
-        auto mem = remote->get_original_memory();
-        src_lock.reset(new cldnn::mem_lock<uint8_t>(mem, stream));
-        src_ptr = src_lock->data();
-    } else {
-        src_ptr = src->data();
-    }
-
-    if (auto remote = dynamic_cast<const RemoteTensorImpl*>(dst)) {
-        auto mem = remote->get_original_memory();
-        dst_lock.reset(new cldnn::mem_lock<uint8_t>(mem, stream));
-        dst_ptr = dst_lock->data();
-    } else {
-        dst_ptr = dst->data();
-    }
-
-    return convert_and_copy(src_ptr, src_et, dst_ptr, dst_et, size);
-}
-
 bool same_host_mem(cldnn::memory::cptr memory, const uint8_t* host_ptr) {
     const uint8_t* device_ptr = memory->get_allocation_type() == cldnn::allocation_type::usm_host ?
                                 static_cast<uint8_t*>(memory->get_internal_params().mem) : nullptr;
diff --git a/src/plugins/intel_gpu/src/plugin/variable_state.cpp b/src/plugins/intel_gpu/src/plugin/variable_state.cpp
index cdd551b5ca8..78c3479eeb7 100644
--- a/src/plugins/intel_gpu/src/plugin/variable_state.cpp
+++ b/src/plugins/intel_gpu/src/plugin/variable_state.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include "openvino/core/type/element_type.hpp"
 #include "openvino/runtime/make_tensor.hpp"
 #include "intel_gpu/plugin/remote_context.hpp"
 #include "intel_gpu/plugin/common_utils.hpp"
@@ -19,9 +20,9 @@ namespace intel_gpu {
 
 VariableState::VariableState(const VariableStateInfo& info, RemoteContextImpl::Ptr context, std::shared_ptr<cldnn::ShapePredictor> shape_predictor)
     : ov::IVariableState {info.m_id}
     , m_layout(info.m_layout)
+    , m_user_specified_type(info.m_user_specified_type)
     , m_context(context)
     , m_shape_predictor(shape_predictor) {
-    m_state = m_context->create_host_tensor(m_layout.data_type, get_tensor_shape(m_layout.get_partial_shape()));
     update_device_buffer();
 }
 
@@ -80,12 +81,16 @@ void VariableState::update_device_buffer() {
     m_memory = m_context->get_engine().reinterpret_buffer(*m_memory, m_layout);
 }
 
-ov::SoPtr<ov::ITensor> VariableState::get_state() const {
-    const bool blocking = true;
-    m_state->set_shape(m_memory->get_layout().get_shape());
-    m_memory->copy_to(m_context->get_engine().get_service_stream(), m_state->data(), blocking);
+ov::element::Type VariableState::get_user_specified_type() const {
+    return m_user_specified_type != ov::element::undefined ? m_user_specified_type : ov::element::Type(m_layout.data_type);
+}
 
-    return m_state;
+ov::SoPtr<ov::ITensor> VariableState::get_state() const {
+    auto tensor = m_context->create_host_tensor(get_user_specified_type(), m_memory->get_layout().get_shape());
+
+    convert_and_copy(m_memory, tensor._ptr.get(), m_context->get_engine().get_service_stream());
+
+    return tensor;
 }
 
 }  // namespace intel_gpu
diff --git a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt
index 887594590de..458280bbeae 100644
--- a/src/plugins/intel_gpu/tests/unit/CMakeLists.txt
+++ b/src/plugins/intel_gpu/tests/unit/CMakeLists.txt
@@ -24,6 +24,7 @@ file(GLOB_RECURSE SOURCES_MAIN
   "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/remote_context.cpp"
   "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp"
   "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/usm_host_tensor.cpp"
+  "${CMAKE_HOME_DIRECTORY}/src/plugins/intel_gpu/src/plugin/common_utils.cpp"
 )
 
 if (NOT ENABLE_ONEDNN_FOR_GPU)
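
For reviewers trying the change out: the user-visible effect is that ov::VariableState::get_state() on GPU now returns a host tensor in the precision requested for the state, converting from the plugin's internal storage precision via convert_and_copy() instead of copying raw bytes. Below is a minimal sketch of how this surfaces through the public OpenVINO 2.0 API; the helper name print_states, the "GPU" target, and the f16-storage/f32-model pairing are illustrative assumptions, not taken from this patch:

    #include <iostream>
    #include <openvino/openvino.hpp>

    // Illustrative helper (not part of this patch): runs one inference and
    // dumps every variable state of a stateful model compiled for GPU.
    void print_states(ov::CompiledModel& compiled) {
        ov::InferRequest request = compiled.create_infer_request();
        request.infer();
        for (auto&& state : request.query_state()) {
            ov::Tensor t = state.get_state();
            // With this patch applied, the returned tensor's element type is
            // the user-specified state precision (e.g. f32) even when the
            // plugin stores the state internally in another type (e.g. f16);
            // get_state() converts during the copy rather than exposing the
            // internal storage type.
            std::cout << state.get_name() << ": " << t.get_element_type()
                      << " " << t.get_shape() << std::endl;
        }
    }

This assumes `compiled` came from ov::Core::compile_model(model, "GPU") for a model containing ReadValue/Assign pairs; before the change, the returned tensor was always created with the internal layout's data type.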