[GPU] Enable state precision conversion to infer precision (#21729)
This commit is contained in:
committed by
GitHub
parent
0e92469330
commit
63b23a1ebb
@@ -277,7 +277,7 @@ private:
|
||||
void add_default_output_chains();
|
||||
void calculate_weights_cache_capacity();
|
||||
output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);
|
||||
void set_variables_state_info(const std::string& variable_id, const layout& variable_layout);
|
||||
void set_variables_state_info(const std::string& variable_id, const layout& variable_layout, ov::element::Type user_specified_type);
|
||||
|
||||
#ifdef GPU_DEBUG_CONFIG
|
||||
int64_t iteration = 0;
|
||||
|
||||
@@ -50,6 +50,8 @@ private:
|
||||
cldnn::memory::ptr m_memory = nullptr;
|
||||
size_t actual_size = 0;
|
||||
|
||||
const cldnn::layout m_initial_layout;
|
||||
|
||||
void update_device_buffer();
|
||||
ov::element::Type get_user_specified_type() const;
|
||||
};
|
||||
|
||||
@@ -25,13 +25,16 @@ struct assign : public primitive_base<assign> {
|
||||
assign(const primitive_id &id,
|
||||
const std::vector<input_info>& inputs,
|
||||
const std::string& variable_id,
|
||||
const layout& output_layout)
|
||||
const layout& output_layout,
|
||||
const ov::element::Type& user_specified_type = ov::element::undefined)
|
||||
: primitive_base(id, inputs, {padding()}, {optional_data_type{output_layout.data_type}}),
|
||||
variable_id{variable_id},
|
||||
output_layout{output_layout} {}
|
||||
output_layout{output_layout},
|
||||
user_specified_type(user_specified_type) {}
|
||||
|
||||
std::string variable_id;
|
||||
layout output_layout;
|
||||
ov::element::Type user_specified_type;
|
||||
|
||||
bool operator==(const primitive& rhs) const override {
|
||||
if (!compare_common_params(rhs))
|
||||
@@ -39,19 +42,25 @@ struct assign : public primitive_base<assign> {
|
||||
|
||||
auto rhs_casted = downcast<const assign>(rhs);
|
||||
|
||||
return variable_id == rhs_casted.variable_id;
|
||||
return variable_id == rhs_casted.variable_id &&
|
||||
user_specified_type == rhs_casted.user_specified_type;
|
||||
}
|
||||
|
||||
/// Serializes this primitive into `ob`.
///
/// Written in order: the common primitive_base<assign> fields, variable_id,
/// output_layout, and finally user_specified_type as a raw ov::element::Type_t
/// value. load() consumes the fields in the same order.
void save(BinaryOutputBuffer& ob) const override {
    primitive_base<assign>::save(ob);
    ob << variable_id;
    ob << output_layout;

    // make_data() takes an address, so the enum value is copied into a local first.
    ov::element::Type_t type_tag = user_specified_type;
    ob << make_data(&type_tag, sizeof(type_tag));
}
|
||||
|
||||
/// Restores this primitive from `ib`.
///
/// Reads, in order: the common primitive_base<assign> fields, variable_id,
/// output_layout, and the raw ov::element::Type_t value that becomes
/// user_specified_type. The order mirrors what save() writes.
void load(BinaryInputBuffer& ib) override {
    primitive_base<assign>::load(ib);
    ib >> variable_id;
    ib >> output_layout;

    // Start from a defined value instead of an indeterminate one; the read
    // below is expected to overwrite it with the stored enum value.
    ov::element::Type_t data_type = ov::element::undefined;
    ib >> make_data(&data_type, sizeof(data_type));
    user_specified_type = data_type;
}
|
||||
};
|
||||
} // namespace cldnn
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "openvino/core/type/element_type.hpp"
|
||||
#include "primitive.hpp"
|
||||
#include "intel_gpu/runtime/memory.hpp"
|
||||
|
||||
@@ -25,13 +26,16 @@ struct read_value : public primitive_base<read_value> {
|
||||
read_value(const primitive_id& id,
|
||||
const std::vector<input_info>& inputs,
|
||||
const std::string& variable_id,
|
||||
const layout& output_layout)
|
||||
const layout& output_layout,
|
||||
const ov::element::Type& user_specified_type = ov::element::undefined)
|
||||
: primitive_base(id, inputs, {padding()}, {optional_data_type{output_layout.data_type}}),
|
||||
variable_id{variable_id},
|
||||
output_layout{output_layout} {}
|
||||
output_layout{output_layout},
|
||||
user_specified_type(user_specified_type) {}
|
||||
|
||||
std::string variable_id;
|
||||
layout output_layout;
|
||||
ov::element::Type user_specified_type;
|
||||
|
||||
bool operator==(const primitive& rhs) const override {
|
||||
if (!compare_common_params(rhs))
|
||||
@@ -39,19 +43,25 @@ struct read_value : public primitive_base<read_value> {
|
||||
|
||||
auto rhs_casted = downcast<const read_value>(rhs);
|
||||
|
||||
return variable_id == rhs_casted.variable_id;
|
||||
return variable_id == rhs_casted.variable_id &&
|
||||
user_specified_type == rhs_casted.user_specified_type;
|
||||
}
|
||||
|
||||
/// Serializes this primitive into `ob`.
///
/// Written in order: the common primitive_base<read_value> fields, variable_id,
/// output_layout, and finally user_specified_type as a raw ov::element::Type_t
/// value. load() consumes the fields in the same order.
void save(BinaryOutputBuffer& ob) const override {
    primitive_base<read_value>::save(ob);
    ob << variable_id;
    ob << output_layout;

    // make_data() takes an address, so the enum value is copied into a local first.
    ov::element::Type_t type_tag = user_specified_type;
    ob << make_data(&type_tag, sizeof(type_tag));
}
|
||||
|
||||
/// Restores this primitive from `ib`.
///
/// Reads, in order: the common primitive_base<read_value> fields, variable_id,
/// output_layout, and the raw ov::element::Type_t value that becomes
/// user_specified_type. The order mirrors what save() writes.
void load(BinaryInputBuffer& ib) override {
    primitive_base<read_value>::load(ib);
    ib >> variable_id;
    ib >> output_layout;

    // Start from a defined value instead of an indeterminate one; the read
    // below is expected to overwrite it with the stored enum value.
    ov::element::Type_t data_type = ov::element::undefined;
    ib >> make_data(&data_type, sizeof(data_type));
    user_specified_type = data_type;
}
|
||||
};
|
||||
} // namespace cldnn
|
||||
|
||||
@@ -12,7 +12,7 @@ GPU_DEFINE_PRIMITIVE_TYPE_ID(assign)
|
||||
|
||||
assign_inst::typed_primitive_inst(network& network, const assign_node& node) :
|
||||
parent{network, node, false},
|
||||
memory_state::variable{node.get_primitive()->variable_id} {
|
||||
memory_state::variable{node.get_primitive()->variable_id, node.get_primitive()->user_specified_type} {
|
||||
}
|
||||
|
||||
layout assign_inst::calc_output_layout(const assign_node& node, kernel_impl_params const& impl_param) {
|
||||
|
||||
@@ -6,23 +6,9 @@
|
||||
|
||||
#include "intel_gpu/primitives/assign.hpp"
|
||||
#include "primitive_inst.h"
|
||||
#include "variable.hpp"
|
||||
|
||||
namespace cldnn {
|
||||
namespace memory_state {
|
||||
|
||||
/// Holds the identifier of the state variable an object is bound to.
class variable {
    std::string variable_id_;

public:
    /// Binds to the variable named `variable_id` (the string is copied).
    explicit variable(const std::string& variable_id) : variable_id_(variable_id) {}

    /// Returns the identifier currently stored.
    const std::string& variable_id() const {
        return variable_id_;
    }

    /// Replaces the stored identifier with `variable_id`.
    void set_variable_id(const std::string& variable_id) {
        variable_id_ = variable_id;
    }
};
|
||||
|
||||
} // namespace memory_state
|
||||
|
||||
template <>
|
||||
struct typed_program_node<assign> : public typed_program_node_base<assign> {
|
||||
private:
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "assign_inst.h"
|
||||
#include "intel_gpu/primitives/read_value.hpp"
|
||||
#include "primitive_inst.h"
|
||||
#include "variable.hpp"
|
||||
|
||||
namespace cldnn {
|
||||
|
||||
|
||||
28
src/plugins/intel_gpu/src/graph/include/variable.hpp
Normal file
28
src/plugins/intel_gpu/src/graph/include/variable.hpp
Normal file
@@ -0,0 +1,28 @@
|
||||
// Copyright (C) 2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include "openvino/core/type/element_type.hpp"
|
||||
|
||||
namespace cldnn {
|
||||
namespace memory_state {
|
||||
|
||||
class variable {
|
||||
public:
|
||||
explicit variable(const std::string& variable_id, ov::element::Type user_specified_type = ov::element::undefined)
|
||||
: m_variable_id {variable_id}
|
||||
, m_user_specified_type(user_specified_type) {}
|
||||
|
||||
const std::string& variable_id() const { return m_variable_id; }
|
||||
ov::element::Type get_user_specified_type() const { return m_user_specified_type; }
|
||||
|
||||
private:
|
||||
std::string m_variable_id;
|
||||
ov::element::Type m_user_specified_type;
|
||||
};
|
||||
|
||||
} // namespace memory_state
|
||||
} // namespace cldnn
|
||||
@@ -1327,7 +1327,7 @@ void network::allocate_primitive_instance(program_node const& node) {
|
||||
_data_outputs.push_back(inst);
|
||||
}
|
||||
if (auto state_prim = std::dynamic_pointer_cast<memory_state::variable>(inst)) {
|
||||
set_variables_state_info(state_prim->variable_id(), node.get_output_layout(0));
|
||||
set_variables_state_info(state_prim->variable_id(), node.get_output_layout(0), state_prim->get_user_specified_type());
|
||||
}
|
||||
if (node.is_constant())
|
||||
transfer_memory_to_device(inst, node);
|
||||
@@ -1393,8 +1393,8 @@ const ov::intel_gpu::VariablesInfoMap& network::get_variables_info() const {
|
||||
return _variables_state_info;
|
||||
}
|
||||
|
||||
void network::set_variables_state_info(const std::string& variable_id, const layout& variable_layout) {
|
||||
_variables_state_info.emplace(variable_id, ov::intel_gpu::VariableStateInfo{variable_id, variable_layout});
|
||||
/// Records a VariableStateInfo for `variable_id`, built from the id, its
/// layout and the user-specified element type, in _variables_state_info.
void network::set_variables_state_info(const std::string& variable_id, const layout& variable_layout, ov::element::Type user_specified_type) {
    // NOTE(review): emplace() is used rather than assignment; if VariablesInfoMap
    // is a standard associative container, an entry already stored under this
    // id is left as is - confirm that first-writer-wins is intended.
    _variables_state_info.emplace(
        variable_id,
        ov::intel_gpu::VariableStateInfo{variable_id, variable_layout, user_specified_type});
}
|
||||
|
||||
} // namespace cldnn
|
||||
|
||||
@@ -12,7 +12,7 @@ GPU_DEFINE_PRIMITIVE_TYPE_ID(read_value)
|
||||
|
||||
read_value_inst::typed_primitive_inst(network& network, const read_value_node& node) :
|
||||
parent(network, node, !node.can_be_optimized() && (node.get_output_layout().is_static() || node.get_output_layout().has_upper_bound())),
|
||||
memory_state::variable{node.get_primitive()->variable_id} {
|
||||
memory_state::variable{node.get_primitive()->variable_id, node.get_primitive()->user_specified_type} {
|
||||
}
|
||||
|
||||
layout read_value_inst::calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param) {
|
||||
|
||||
@@ -4,8 +4,10 @@
|
||||
|
||||
#include "intel_gpu/plugin/program_builder.hpp"
|
||||
#include "intel_gpu/plugin/common_utils.hpp"
|
||||
#include "openvino/core/type/element_type.hpp"
|
||||
#include "openvino/op/assign.hpp"
|
||||
#include "openvino/op/read_value.hpp"
|
||||
#include "transformations/rt_info/original_precision_attribute.hpp"
|
||||
#include "intel_gpu/primitives/assign.hpp"
|
||||
#include "intel_gpu/primitives/read_value.hpp"
|
||||
|
||||
@@ -24,10 +26,12 @@ void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr<ov::
|
||||
const auto variable_layout = cldnn::layout{ output_pshape, output_dtype, output_format };
|
||||
|
||||
auto inputs = p.GetInputInfo(op);
|
||||
auto user_specified_type = get_original_precision(op);
|
||||
const auto prim = T_PRIMITIVE{layer_type_name_ID(op),
|
||||
inputs,
|
||||
variable_id,
|
||||
variable_layout};
|
||||
variable_layout,
|
||||
user_specified_type};
|
||||
|
||||
p.add_primitive(*op, prim);
|
||||
}
|
||||
|
||||
@@ -292,10 +292,12 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
|
||||
|
||||
const bool keep_precision_sensitive_in_fp32_1 = true;
|
||||
const bool convert_input_output_precision = false;
|
||||
const bool store_original_precision_as_rt_attribute = true;
|
||||
manager.register_pass<ov::pass::ConvertPrecision>(fp_convert_precision_map,
|
||||
empty_fuse_map,
|
||||
keep_precision_sensitive_in_fp32_1,
|
||||
convert_input_output_precision);
|
||||
convert_input_output_precision,
|
||||
store_original_precision_as_rt_attribute);
|
||||
|
||||
manager.register_pass<ov::pass::CommonOptimizations>();
|
||||
|
||||
|
||||
@@ -22,12 +22,14 @@ VariableState::VariableState(const VariableStateInfo& info, RemoteContextImpl::P
|
||||
, m_layout(info.m_layout)
|
||||
, m_user_specified_type(info.m_user_specified_type)
|
||||
, m_context(context)
|
||||
, m_shape_predictor(shape_predictor) {
|
||||
, m_shape_predictor(shape_predictor)
|
||||
, m_initial_layout(info.m_layout) {
|
||||
update_device_buffer();
|
||||
}
|
||||
|
||||
// Returns the state to its initial condition: clears the "is set" flag and
// re-applies m_initial_layout via set_layout().
void VariableState::reset() {
    m_is_set = false;

    // Statement order is kept as in the original; set_layout() is opaque here.
    set_layout(m_initial_layout);
}
|
||||
|
||||
cldnn::memory::ptr VariableState::get_memory() const {
|
||||
|
||||
@@ -251,7 +251,10 @@ class KVCacheTests: public ::testing::Test {
|
||||
}
|
||||
}
|
||||
|
||||
void test_smoke_multipleIterations_stateful(bool is_caching_test, bool fuse_cache_reorder, bool build_state_initializer) {
|
||||
void test_smoke_multipleIterations_stateful(bool is_caching_test,
|
||||
bool fuse_cache_reorder,
|
||||
bool build_state_initializer,
|
||||
ov::element::Type model_element_type = ov::element::f16) {
|
||||
#if defined(ANDROID)
|
||||
GTEST_SKIP();
|
||||
#endif
|
||||
@@ -276,11 +279,10 @@ class KVCacheTests: public ::testing::Test {
|
||||
|
||||
const size_t batch = 1;
|
||||
const size_t n_heads = 32;
|
||||
const size_t n_features = 80;
|
||||
const size_t n_features = 10;
|
||||
const size_t context_size = 20;
|
||||
size_t cache_size = 0;
|
||||
|
||||
ov::element::Type element_type = ov::element::f16;
|
||||
ov::element::Type element_type = model_element_type;
|
||||
|
||||
const bool stateful = true;
|
||||
|
||||
@@ -368,53 +370,62 @@ class KVCacheTests: public ::testing::Test {
|
||||
if (fuse_cache_reorder) {
|
||||
infer_request.set_tensor(input2, beam_idx_data);
|
||||
}
|
||||
ov::Tensor ref_kv_cache;
|
||||
|
||||
{
|
||||
const ov::Shape new_token_size_initial = {batch, context_size, n_heads, n_features};
|
||||
const ov::Shape kv_cache_size_initial = {batch, n_heads, cache_size, n_features};
|
||||
const ov::Shape matmul_in_size_initial = {batch, n_heads, context_size, context_size};
|
||||
for (size_t num_repeats = 0; num_repeats < 2; num_repeats++) {
|
||||
ov::Tensor ref_kv_cache;
|
||||
size_t cache_size = 0;
|
||||
{
|
||||
const ov::Shape new_token_size_initial = {batch, context_size, n_heads, n_features};
|
||||
const ov::Shape kv_cache_size_initial = {batch, n_heads, cache_size, n_features};
|
||||
const ov::Shape matmul_in_size_initial = {batch, n_heads, context_size, context_size};
|
||||
|
||||
auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size_initial);
|
||||
auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_initial);
|
||||
auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size_initial);
|
||||
auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_initial);
|
||||
|
||||
new_token_input.set_shape(new_token_data.get_shape());
|
||||
matmul_input.set_shape(matmul_data.get_shape());
|
||||
new_token_input.set_shape(new_token_data.get_shape());
|
||||
matmul_input.set_shape(matmul_data.get_shape());
|
||||
|
||||
new_token_data.copy_to(new_token_input);
|
||||
matmul_data.copy_to(matmul_input);
|
||||
new_token_data.copy_to(new_token_input);
|
||||
matmul_data.copy_to(matmul_input);
|
||||
|
||||
ref_kv_cache = ov::Tensor(element_type, kv_cache_size_initial);
|
||||
ref_kv_cache = ov::Tensor(element_type, kv_cache_size_initial);
|
||||
|
||||
auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data);
|
||||
ref_kv_cache = ref_results[0];
|
||||
auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data);
|
||||
ref_kv_cache = ref_results[0];
|
||||
|
||||
infer_request.infer();
|
||||
infer_request.infer();
|
||||
|
||||
compare_tensors({ ref_results[1] }, {matmul_out});
|
||||
compare_tensors({ ref_results[1] }, {matmul_out});
|
||||
|
||||
cache_size += context_size;
|
||||
}
|
||||
cache_size += context_size;
|
||||
}
|
||||
|
||||
const size_t input_tokens = 1;
|
||||
const size_t niters = 10;
|
||||
const ov::Shape new_token_size = {batch, input_tokens, n_heads, n_features};
|
||||
size_t context_length = cache_size + input_tokens;
|
||||
for (size_t i = 0; i < niters; i++, context_length += input_tokens) {
|
||||
ov::Shape matmul_in_size_loop = {batch, n_heads, input_tokens, context_length};
|
||||
auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size);
|
||||
auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_loop);
|
||||
auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data);
|
||||
ref_kv_cache = ref_results[0];
|
||||
const size_t input_tokens = 1;
|
||||
const size_t niters = 10;
|
||||
const ov::Shape new_token_size = {batch, input_tokens, n_heads, n_features};
|
||||
size_t context_length = cache_size + input_tokens;
|
||||
for (size_t i = 0; i < niters; i++, context_length += input_tokens) {
|
||||
ov::Shape matmul_in_size_loop = {batch, n_heads, input_tokens, context_length};
|
||||
auto new_token_data = ov::test::utils::create_and_fill_tensor(element_type, new_token_size);
|
||||
auto matmul_data = ov::test::utils::create_and_fill_tensor(element_type, matmul_in_size_loop);
|
||||
auto ref_results = get_ref_results(ref_kv_cache, new_token_data, matmul_data);
|
||||
ref_kv_cache = ref_results[0];
|
||||
|
||||
new_token_input.set_shape(new_token_data.get_shape());
|
||||
matmul_input.set_shape(matmul_data.get_shape());
|
||||
new_token_data.copy_to(new_token_input);
|
||||
matmul_data.copy_to(matmul_input);
|
||||
new_token_input.set_shape(new_token_data.get_shape());
|
||||
matmul_input.set_shape(matmul_data.get_shape());
|
||||
new_token_data.copy_to(new_token_input);
|
||||
matmul_data.copy_to(matmul_input);
|
||||
|
||||
infer_request.infer();
|
||||
infer_request.infer();
|
||||
|
||||
compare_tensors({ ref_results[1] }, {matmul_out});
|
||||
compare_tensors({ ref_results[1] }, {matmul_out});
|
||||
}
|
||||
|
||||
auto state = infer_request.query_state()[0].get_state();
|
||||
ASSERT_EQ(state.get_element_type(), element_type);
|
||||
compare_tensors({ ref_kv_cache }, {state});
|
||||
|
||||
infer_request.reset_state();
|
||||
}
|
||||
|
||||
if (is_caching_test) {
|
||||
@@ -448,4 +459,9 @@ TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer)
|
||||
// Stateful multi-iteration scenario with all three switches enabled:
// caching test, fused cache reorder and state initializer.
TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer_cached) {
    const bool is_caching_test = true;
    const bool fuse_cache_reorder = true;
    const bool build_state_initializer = true;
    this->test_smoke_multipleIterations_stateful(is_caching_test, fuse_cache_reorder, build_state_initializer);
}
|
||||
|
||||
// Same scenario as the non-cached gather-with-initializer test, but the model
// element type is f32 instead of the helper's default.
TEST_F(KVCacheTests, smoke_multipleIterations_stateful_gather_with_initializer_f32) {
    const bool is_caching_test = false;
    const bool fuse_cache_reorder = true;
    const bool build_state_initializer = true;
    this->test_smoke_multipleIterations_stateful(is_caching_test,
                                                 fuse_cache_reorder,
                                                 build_state_initializer,
                                                 ov::element::f32);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Reference in New Issue
Block a user