[GPU] Enable state precision conversion to infer precision (#21729)

This commit is contained in:
Vladimir Paramuzov
2023-12-19 09:36:03 +04:00
committed by GitHub
parent 0e92469330
commit 63b23a1ebb
14 changed files with 128 additions and 69 deletions

View File

@@ -277,7 +277,7 @@ private:
void add_default_output_chains();
void calculate_weights_cache_capacity();
output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);
void set_variables_state_info(const std::string& variable_id, const layout& variable_layout);
void set_variables_state_info(const std::string& variable_id, const layout& variable_layout, ov::element::Type user_specified_type);
#ifdef GPU_DEBUG_CONFIG
int64_t iteration = 0;

View File

@@ -50,6 +50,8 @@ private:
cldnn::memory::ptr m_memory = nullptr;
size_t actual_size = 0;
const cldnn::layout m_initial_layout;
void update_device_buffer();
ov::element::Type get_user_specified_type() const;
};

View File

@@ -25,13 +25,16 @@ struct assign : public primitive_base<assign> {
assign(const primitive_id &id,
const std::vector<input_info>& inputs,
const std::string& variable_id,
const layout& output_layout)
const layout& output_layout,
const ov::element::Type& user_specified_type = ov::element::undefined)
: primitive_base(id, inputs, {padding()}, {optional_data_type{output_layout.data_type}}),
variable_id{variable_id},
output_layout{output_layout} {}
output_layout{output_layout},
user_specified_type(user_specified_type) {}
std::string variable_id;
layout output_layout;
ov::element::Type user_specified_type;
bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
@@ -39,19 +42,25 @@ struct assign : public primitive_base<assign> {
auto rhs_casted = downcast<const assign>(rhs);
return variable_id == rhs_casted.variable_id;
return variable_id == rhs_casted.variable_id &&
user_specified_type == rhs_casted.user_specified_type;
}
void save(BinaryOutputBuffer& ob) const override {
primitive_base<assign>::save(ob);
ov::element::Type_t data_type = user_specified_type;
ob << variable_id;
ob << output_layout;
ob << make_data(&data_type, sizeof(ov::element::Type_t));
}
void load(BinaryInputBuffer& ib) override {
primitive_base<assign>::load(ib);
ov::element::Type_t data_type;
ib >> variable_id;
ib >> output_layout;
ib >> make_data(&data_type, sizeof(ov::element::Type_t));
user_specified_type = data_type;
}
};
} // namespace cldnn

View File

@@ -6,6 +6,7 @@
#include <vector>
#include "openvino/core/type/element_type.hpp"
#include "primitive.hpp"
#include "intel_gpu/runtime/memory.hpp"
@@ -25,13 +26,16 @@ struct read_value : public primitive_base<read_value> {
read_value(const primitive_id& id,
const std::vector<input_info>& inputs,
const std::string& variable_id,
const layout& output_layout)
const layout& output_layout,
const ov::element::Type& user_specified_type = ov::element::undefined)
: primitive_base(id, inputs, {padding()}, {optional_data_type{output_layout.data_type}}),
variable_id{variable_id},
output_layout{output_layout} {}
output_layout{output_layout},
user_specified_type(user_specified_type) {}
std::string variable_id;
layout output_layout;
ov::element::Type user_specified_type;
bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
@@ -39,19 +43,25 @@ struct read_value : public primitive_base<read_value> {
auto rhs_casted = downcast<const read_value>(rhs);
return variable_id == rhs_casted.variable_id;
return variable_id == rhs_casted.variable_id &&
user_specified_type == rhs_casted.user_specified_type;
}
void save(BinaryOutputBuffer& ob) const override {
primitive_base<read_value>::save(ob);
ov::element::Type_t data_type = user_specified_type;
ob << variable_id;
ob << output_layout;
ob << make_data(&data_type, sizeof(ov::element::Type_t));
}
void load(BinaryInputBuffer& ib) override {
primitive_base<read_value>::load(ib);
ov::element::Type_t data_type;
ib >> variable_id;
ib >> output_layout;
ib >> make_data(&data_type, sizeof(ov::element::Type_t));
user_specified_type = data_type;
}
};
} // namespace cldnn

View File

@@ -12,7 +12,7 @@ GPU_DEFINE_PRIMITIVE_TYPE_ID(assign)
assign_inst::typed_primitive_inst(network& network, const assign_node& node) :
parent{network, node, false},
memory_state::variable{node.get_primitive()->variable_id} {
memory_state::variable{node.get_primitive()->variable_id, node.get_primitive()->user_specified_type} {
}
layout assign_inst::calc_output_layout(const assign_node& node, kernel_impl_params const& impl_param) {

View File

@@ -6,23 +6,9 @@
#include "intel_gpu/primitives/assign.hpp"
#include "primitive_inst.h"
#include "variable.hpp"
namespace cldnn {
namespace memory_state {
class variable {
public:
explicit variable(const std::string& variable_id) : variable_id_ {variable_id} {}
const std::string& variable_id() const { return variable_id_; }
void set_variable_id(const std::string& variable_id) { variable_id_ = variable_id; }
private:
std::string variable_id_;
};
} // namespace memory_state
template <>
struct typed_program_node<assign> : public typed_program_node_base<assign> {
private:

View File

@@ -4,9 +4,9 @@
#pragma once
#include "assign_inst.h"
#include "intel_gpu/primitives/read_value.hpp"
#include "primitive_inst.h"
#include "variable.hpp"
namespace cldnn {

View File

@@ -0,0 +1,28 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <string>
#include "openvino/core/type/element_type.hpp"
namespace cldnn {
namespace memory_state {
// Mixin that carries the identity of a GPU-plugin state variable together with
// the element type the user originally specified for it. Elsewhere in this
// commit, assign_inst and read_value_inst inherit from this class and the
// network reads both values via variable_id()/get_user_specified_type() when
// registering VariableStateInfo (see the network.cpp hunk), presumably so the
// runtime can convert state precision back to the original model precision —
// consistent with the commit title, but confirm against VariableState usage.
class variable {
public:
    // variable_id: unique name identifying the state variable.
    // user_specified_type: the precision originally specified for the state;
    // defaults to ov::element::undefined when no explicit precision is known.
    explicit variable(const std::string& variable_id, ov::element::Type user_specified_type = ov::element::undefined)
    : m_variable_id {variable_id}
    , m_user_specified_type(user_specified_type) {}
    // Returns the variable's unique id.
    const std::string& variable_id() const { return m_variable_id; }
    // Returns the user-specified element type (undefined if none was given).
    ov::element::Type get_user_specified_type() const { return m_user_specified_type; }
private:
    std::string m_variable_id;
    ov::element::Type m_user_specified_type;
};
} // namespace memory_state
} // namespace cldnn

View File

@@ -1327,7 +1327,7 @@ void network::allocate_primitive_instance(program_node const& node) {
_data_outputs.push_back(inst);
}
if (auto state_prim = std::dynamic_pointer_cast<memory_state::variable>(inst)) {
set_variables_state_info(state_prim->variable_id(), node.get_output_layout(0));
set_variables_state_info(state_prim->variable_id(), node.get_output_layout(0), state_prim->get_user_specified_type());
}
if (node.is_constant())
transfer_memory_to_device(inst, node);
@@ -1393,8 +1393,8 @@ const ov::intel_gpu::VariablesInfoMap& network::get_variables_info() const {
return _variables_state_info;
}
void network::set_variables_state_info(const std::string& variable_id, const layout& variable_layout) {
_variables_state_info.emplace(variable_id, ov::intel_gpu::VariableStateInfo{variable_id, variable_layout});
void network::set_variables_state_info(const std::string& variable_id, const layout& variable_layout, ov::element::Type user_specified_type) {
_variables_state_info.emplace(variable_id, ov::intel_gpu::VariableStateInfo{variable_id, variable_layout, user_specified_type});
}
} // namespace cldnn

View File

@@ -12,7 +12,7 @@ GPU_DEFINE_PRIMITIVE_TYPE_ID(read_value)
read_value_inst::typed_primitive_inst(network& network, const read_value_node& node) :
parent(network, node, !node.can_be_optimized() && (node.get_output_layout().is_static() || node.get_output_layout().has_upper_bound())),
memory_state::variable{node.get_primitive()->variable_id} {
memory_state::variable{node.get_primitive()->variable_id, node.get_primitive()->user_specified_type} {
}
layout read_value_inst::calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param) {

View File

@@ -4,8 +4,10 @@
#include "intel_gpu/plugin/program_builder.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
#include "openvino/core/type/element_type.hpp"
#include "openvino/op/assign.hpp"
#include "openvino/op/read_value.hpp"
#include "transformations/rt_info/original_precision_attribute.hpp"
#include "intel_gpu/primitives/assign.hpp"
#include "intel_gpu/primitives/read_value.hpp"
@@ -24,10 +26,12 @@ void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr<ov::
const auto variable_layout = cldnn::layout{ output_pshape, output_dtype, output_format };
auto inputs = p.GetInputInfo(op);
auto user_specified_type = get_original_precision(op);
const auto prim = T_PRIMITIVE{layer_type_name_ID(op),
inputs,
variable_id,
variable_layout};
variable_layout,
user_specified_type};
p.add_primitive(*op, prim);
}

View File

@@ -292,10 +292,12 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
const bool keep_precision_sensitive_in_fp32_1 = true;
const bool convert_input_output_precision = false;
const bool store_original_precision_as_rt_attribute = true;
manager.register_pass<ov::pass::ConvertPrecision>(fp_convert_precision_map,
empty_fuse_map,
keep_precision_sensitive_in_fp32_1,
convert_input_output_precision);
convert_input_output_precision,
store_original_precision_as_rt_attribute);
manager.register_pass<ov::pass::CommonOptimizations>();

View File

@@ -22,12 +22,14 @@ VariableState::VariableState(const VariableStateInfo& info, RemoteContextImpl::P
, m_layout(info.m_layout)
, m_user_specified_type(info.m_user_specified_type)
, m_context(context)
, m_shape_predictor(shape_predictor) {
, m_shape_predictor(shape_predictor)
, m_initial_layout(info.m_layout) {
update_device_buffer();
}
void VariableState::reset() {
m_is_set = false;
set_layout(m_initial_layout);
}
cldnn::memory::ptr VariableState::get_memory() const {

View File

@@ -251,7 +251,10 @@ class KVCacheTests: public ::testing::Test {
}
}
void test_smoke_multipleIterations_stateful(bool is_caching_test, bool fuse_cache_reorder, bool build_state_initializer) {
void test_smoke_multipleIterations_stateful(bool is_caching_test,
bool fuse_cache_reorder,
bool build_state_initializer,
ov::element::Type model_element_type = ov::element::f16) {
#if defined(ANDROID)
GTEST_SKIP();
#endif
@@ -276,11 +279,10 @@ class KVCacheTests: public ::testing::Test {
const size_t batch = 1;
const size_t n_heads = 32;
const size_t n_features = 80;
const size_t n_features = 10;
const size_t context_size = 20;
size_t cache_size = 0;
ov::element::Type element_type = ov::element::f16;
ov::element::Type element_type = model_element_type;
const bool stateful = true;
@@ -368,53 +370,62 @@ class KVCacheTests: public ::testing::Test {
if (fuse_cache_reorder) {
infer_request.set_tensor(input2, beam_idx_data);
}
ov::Tensor ref_kv_cache;
{
const ov::Shape new_token_size_initial = {batch, context_size, n_heads, n_features};
const ov::Shape kv_cache_size_initial = {batch, n_heads, cache_size, n_features};
const ov::Shape matmul_in_size_initial = {batch, n_heads, context_size, context_size};
for (size_t num_repeats = 0; num_repeats < 2; num_repeats++) {
ov::Tensor ref_kv_cache;
size_t cache_size = 0;
{
const ov::Shape new_token_size_initial = {batch, context_size, n_heads, n_features};
const ov::Shape kv_cache_size_initial = {batch, n_heads, cache_size, n_features};
const ov::Shape matmul_in_size_initial = {batch, n_heads, context_size, context_size};
auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size_initial);
auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_initial);
auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size_initial);
auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_initial);
new_token_input.set_shape(new_token_data.get_shape());
matmul_input.set_shape(matmul_data.get_shape());
new_token_input.set_shape(new_token_data.get_shape());
matmul_input.set_shape(matmul_data.get_shape());
new_token_data.copy_to(new_token_input);
matmul_data.copy_to(matmul_input);
new_token_data.copy_to(new_token_input);
matmul_data.copy_to(matmul_input);
ref_kv_cache = ov::Tensor(element_type, kv_cache_size_initial);
ref_kv_cache = ov::Tensor(element_type, kv_cache_size_initial);
auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data);
ref_kv_cache = ref_results[0];
auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data);
ref_kv_cache = ref_results[0];
infer_request.infer();
infer_request.infer();
compare_tensors({ ref_results[1] }, {matmul_out});
compare_tensors({ ref_results[1] }, {matmul_out});
cache_size += context_size;
}
cache_size += context_size;
}
const size_t input_tokens = 1;
const size_t niters = 10;
const ov::Shape new_token_size = {batch, input_tokens, n_heads, n_features};
size_t context_length = cache_size + input_tokens;
for (size_t i = 0; i < niters; i++, context_length += input_tokens) {
ov::Shape matmul_in_size_loop = {batch, n_heads, input_tokens, context_length};
auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size);
auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_loop);
auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data);
ref_kv_cache = ref_results[0];
const size_t input_tokens = 1;
const size_t niters = 10;
const ov::Shape new_token_size = {batch, input_tokens, n_heads, n_features};
size_t context_length = cache_size + input_tokens;
for (size_t i = 0; i < niters; i++, context_length += input_tokens) {
ov::Shape matmul_in_size_loop = {batch, n_heads, input_tokens, context_length};
auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size);
auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_loop);
auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data);
ref_kv_cache = ref_results[0];
new_token_input.set_shape(new_token_data.get_shape());
matmul_input.set_shape(matmul_data.get_shape());
new_token_data.copy_to(new_token_input);
matmul_data.copy_to(matmul_input);
new_token_input.set_shape(new_token_data.get_shape());
matmul_input.set_shape(matmul_data.get_shape());
new_token_data.copy_to(new_token_input);
matmul_data.copy_to(matmul_input);
infer_request.infer();
infer_request.infer();
compare_tensors({ ref_results[1] }, {matmul_out});
compare_tensors({ ref_results[1] }, {matmul_out});
}
auto state = infer_request.query_state()[0].get_state();
ASSERT_EQ(state.get_element_type(), element_type);
compare_tensors({ ref_kv_cache }, {state});
infer_request.reset_state();
}
if (is_caching_test) {
@@ -448,4 +459,9 @@ TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer)
TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer_cached) {
this->test_smoke_multipleIterations_stateful(true, true, true);
}
TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer_f32) {
this->test_smoke_multipleIterations_stateful(false, true, true, ov::element::f32);
}
} // namespace