[GPU] adding support for weights compression to gather (#21711)

* 1st version

* fixed style check error

* added unit tests

* removed unnecessary comments

* updated hash to include decompression type

* applied code reviews

* applied code reviews

* fixed unit tests
This commit is contained in:
Eddy Kim
2023-12-19 02:46:11 +09:00
committed by GitHub
parent d03dc4fa2a
commit 63e08f9965
17 changed files with 898 additions and 24 deletions

View File

@@ -0,0 +1,47 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/core/node.hpp"
#include "openvino/op/gather.hpp"
namespace ov {
namespace intel_gpu {
namespace op {
/// @brief GPU-plugin operation that gathers rows from compressed weights and
///        decompresses them on the fly using a scale input and an optional
///        zero-point input.
///
/// Inherits data/indices/axis semantics from ov::op::v8::Gather and adds
/// decompression inputs: input 3 is the scale, input 4 (optional) is the
/// zero point.
class GatherCompressed : public ov::op::v8::Gather {
public:
    OPENVINO_OP("GatherCompressed", "gpu_opset");

    GatherCompressed() = default;

    /// @brief Constructor with both decompression scale and zero point inputs.
    /// @param data                     Compressed weights input.
    /// @param indices                  Indices input.
    /// @param axis                     Gather axis input.
    /// @param decompression_scale      Per-group/per-row decompression scale input.
    /// @param decompression_zero_point Decompression zero point input.
    /// @param output_type              Desired output element type;
    ///                                 ov::element::undefined means "derive from data".
    GatherCompressed(const ov::Output<Node> &data,
                     const ov::Output<Node> &indices,
                     const ov::Output<Node> &axis,
                     const ov::Output<Node> &decompression_scale,
                     const ov::Output<Node> &decompression_zero_point,
                     const ov::element::Type output_type = ov::element::undefined);

    /// @brief Constructor with a decompression scale input only (no zero point).
    GatherCompressed(const ov::Output<Node> &data,
                     const ov::Output<Node> &indices,
                     const ov::Output<Node> &axis,
                     const ov::Output<Node> &decompression_scale,
                     const ov::element::Type output_type = ov::element::undefined);

    bool visit_attributes(ov::AttributeVisitor &visitor) override;

    void validate_and_infer_types() override;

    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

    /// @brief Requested output element type (may be ov::element::undefined).
    ov::element::Type get_output_type() const { return m_output_type; }

protected:
    // Element type the decompressed output should have.
    ov::element::Type m_output_type;
};
} // namespace op
} // namespace intel_gpu
} // namespace ov

View File

@@ -272,3 +272,4 @@ REGISTER_FACTORY(internal, MulticlassNmsIEInternal);
REGISTER_FACTORY(internal, FullyConnected);
REGISTER_FACTORY(internal, FullyConnectedCompressed);
REGISTER_FACTORY(internal, RMS);
REGISTER_FACTORY(internal, GatherCompressed);

View File

@@ -41,6 +41,42 @@ struct gather : public primitive_base<gather> {
, batch_dim(batch_dim)
, support_neg_ind(support_neg_ind) {}
/// @brief Constructs gather compressed primitive.
/// @param id This primitive id.
/// @param dict Input dictionary primitive id.
/// @param idx Input indexes primitive id.
/// @param axis Gathering axis.
/// @param decompression_scale Input decompression scale factors primitive id.
/// @param decompression_zero_point Input decompression zero point primitive id.
/// @param input_rank Input rank.
/// @param output_shape Output shape.
/// @param batch_dim Batch_dim
/// @param support_neg_ind Support negative indexes
gather(const primitive_id& id,
const input_info& dict,
const input_info& idx,
const int64_t axis,
const input_info& decompression_scale,
const input_info& decompression_zero_point,
const ov::element::Type decompressed_type,
const int64_t input_rank,
const ov::Shape& output_shape,
const int64_t batch_dim = 0,
const bool support_neg_ind = false,
const padding& output_padding = padding())
: primitive_base(id, {dict, idx}, {output_padding})
, axis(axis)
, input_rank(input_rank)
, output_shape(output_shape)
, batch_dim(batch_dim)
, support_neg_ind(support_neg_ind)
, compressed_weights(true)
, decompressed_type(decompressed_type)
, decompression_scale(decompression_scale)
, decompression_zero_point(decompression_zero_point) {
OPENVINO_ASSERT(decompression_scale.is_valid(), "[GPU] Compressed gather requires at least decompression scale input");
}
/// @brief Gathering axis
int64_t axis = 0;
/// @brief Gather input rank
@@ -52,11 +88,23 @@ struct gather : public primitive_base<gather> {
/// @brief Support negative indexes
bool support_neg_ind = false;
bool compressed_weights = false;
ov::element::Type decompressed_type;
input_info decompression_scale;
input_info decompression_zero_point;
optional_value<float> decompression_zero_point_scalar = optional_value<float>();
/// @brief Folds every attribute that affects kernel selection into the hash,
///        so two gather primitives with different compression settings are
///        never treated as identical.
size_t hash() const override {
    size_t seed = primitive::hash();
    seed = hash_combine(seed, axis);
    seed = hash_combine(seed, batch_dim);
    seed = hash_combine(seed, support_neg_ind);
    seed = hash_combine(seed, compressed_weights);
    // Decompressed type is part of the hash (type name, not the raw enum).
    seed = hash_combine(seed, decompressed_type.get_type_name());
    // Only the presence of the optional inputs matters, not their primitive ids.
    seed = hash_combine(seed, decompression_scale.is_valid());
    seed = hash_combine(seed, decompression_zero_point.is_valid());
    seed = hash_combine(seed, decompression_zero_point_scalar.has_value());
    seed = hash_combine(seed, decompression_zero_point_scalar.value_or(0.0f));
    return seed;
}
@@ -68,7 +116,11 @@ struct gather : public primitive_base<gather> {
return axis == rhs_casted.axis &&
batch_dim == rhs_casted.batch_dim &&
support_neg_ind == rhs_casted.support_neg_ind;
support_neg_ind == rhs_casted.support_neg_ind &&
compressed_weights == rhs_casted.compressed_weights &&
decompression_scale.is_valid() == rhs_casted.decompression_scale.is_valid() &&
decompression_zero_point.is_valid() == rhs_casted.decompression_zero_point.is_valid() &&
decompression_zero_point_scalar.value_or(0.0f) == rhs_casted.decompression_zero_point_scalar.value_or(0.0f);
}
void save(BinaryOutputBuffer& ob) const override {
@@ -78,6 +130,17 @@ struct gather : public primitive_base<gather> {
ob << output_shape;
ob << batch_dim;
ob << support_neg_ind;
ob << compressed_weights;
ob << decompressed_type.get_type_name();
ob << decompression_scale;
ob << decompression_zero_point;
if (decompression_zero_point_scalar.has_value()) {
ob << true;
ob << make_data(&decompression_zero_point_scalar.value(), sizeof(float));
} else {
ob << false;
}
}
void load(BinaryInputBuffer& ib) override {
@@ -87,6 +150,35 @@ struct gather : public primitive_base<gather> {
ib >> output_shape;
ib >> batch_dim;
ib >> support_neg_ind;
ib >> compressed_weights;
std::string decompressed_type_name;
ib >> decompressed_type_name;
decompressed_type = ov::element::Type(decompressed_type_name);
ib >> decompression_scale;
ib >> decompression_zero_point;
bool has_value;
ib >> has_value;
if (has_value) {
float decompression_zero_point_value = 0.f;
ib >> make_data(&decompression_zero_point_value, sizeof(float));
decompression_zero_point_scalar = decompression_zero_point_value;
} else {
decompression_zero_point_scalar = optional_value<float>();
}
}
protected:
// Reports the optional decompression inputs as extra dependencies of this
// primitive, in fixed order: scale first, then (if present) zero point.
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
    std::vector<std::reference_wrapper<const primitive_id>> ret;
    if (decompression_scale.is_valid())
        ret.push_back(decompression_scale.pid);
    if (decompression_zero_point.is_valid())
        ret.push_back(decompression_zero_point.pid);
    return ret;
}
};
} // namespace cldnn

View File

@@ -66,6 +66,10 @@ struct input_info {
}
};
/// @brief Returns true when this input refers to an actual primitive,
///        i.e. the primitive id is non-empty.
bool is_valid() const {
    // Idiomatic and O(1): equivalent to the previous pid.compare("") != 0.
    return !pid.empty();
}
void save(BinaryOutputBuffer& ob) const {
ob << pid;
ob << idx;

View File

@@ -56,6 +56,9 @@ layout gather_inst::calc_output_layout(gather_node const& node, kernel_impl_para
}
}
auto output_type = input_layout.data_type;
if (impl_param.typed_desc<gather>()->compressed_weights) {
output_type = impl_param.typed_desc<gather>()->decompressed_type;
}
if (impl_param.has_fused_primitives()) {
output_type = impl_param.get_fused_output_layout().data_type;
}
@@ -73,6 +76,9 @@ std::vector<layout> gather_inst::calc_output_layouts(gather_node const& /*node*/
auto input1_layout = impl_param.get_input_layout(1);
auto output_type = input0_layout.data_type;
if (impl_param.typed_desc<gather>()->compressed_weights) {
output_type = impl_param.typed_desc<gather>()->decompressed_type;
}
if (impl_param.has_fused_primitives()) {
output_type = impl_param.get_fused_output_layout().data_type;
}
@@ -111,6 +117,14 @@ std::string gather_inst::to_string(gather_node const& node) {
gather_info.add("axis", desc->axis);
gather_info.add("batch_dim", desc->batch_dim);
gather_info.add("output shape", cldnn::to_string(desc->output_shape));
gather_info.add("compressed weights", desc->compressed_weights ? "true" : "false");
if (desc->compressed_weights) {
gather_info.add("decompression scale id", desc->decompression_scale.pid);
gather_info.add("decompression zp id", desc->decompression_zero_point.pid);
if (desc->decompression_zero_point_scalar.has_value()) {
gather_info.add("decompression zp value", desc->decompression_zero_point_scalar.value());
}
}
node_info->add("gather info", gather_info);
node_info->dump(primitive_description);

View File

@@ -76,6 +76,20 @@ struct gather_impl : typed_primitive_impl_ocl<gather> {
}
}
protected:
// Collects kernel arguments and appends the optional decompression inputs.
// Dependency indices: 0 - dictionary, 1 - indices, 2 - decompression scale,
// 3 - decompression zero point (same order as gather::get_dependencies()).
kernel_arguments_data get_arguments(const typed_primitive_inst<gather>& instance) const override {
    kernel_arguments_data args = parent::get_arguments(instance);
    const auto& desc = instance.get_typed_desc<gather>();
    if (desc->decompression_scale.is_valid())
        args.inputs.push_back(instance.dep_memory_ptr(2));
    if (desc->decompression_zero_point.is_valid())
        args.inputs.push_back(instance.dep_memory_ptr(3));
    return args;
}
public:
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<gather>();
@@ -105,6 +119,22 @@ public:
params.outputs[0] = convert_data_tensor(output_layout);
params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1)));
bool commpressed = primitive->decompression_scale.is_valid();
bool with_zp = primitive->decompression_zero_point.is_valid();
if (commpressed) {
params.compressed = true;
params.decompression_scale = convert_data_tensor(impl_param.get_input_layout(2));
if (with_zp) {
params.has_decompression_zp = true;
params.decompression_zero_point = convert_data_tensor(impl_param.get_input_layout(3));
} else if (primitive->decompression_zero_point_scalar.has_value()) {
params.has_decompression_zp = true;
params.scalar_zp = true;
params.zp_value = primitive->decompression_zero_point_scalar.value();
}
}
return {params, optional_params};
}
@@ -151,6 +181,8 @@ attach_gather_impl::attach_gather_impl() {
data_types::f16,
data_types::i8,
data_types::u8,
data_types::i4,
data_types::u4,
data_types::i32
};
@@ -190,6 +222,8 @@ attach_gather_impl::attach_gather_impl() {
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),
std::make_tuple(data_types::i4, format::bfyx),
std::make_tuple(data_types::u4, format::bfyx),
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),

View File

@@ -3,6 +3,7 @@
//
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/int4_utils.cl"
#ifdef INDEX_DIM
inline uint FUNC(get_positive_index)(int in)
@@ -25,6 +26,12 @@ KERNEL(gather_ref)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* dictionary,
const __global INPUT1_TYPE* indices,
#if DECOMPRESSION_SCALE_TERM
const __global DECOMPRESSION_SCALE_TYPE* decompression_scale,
#endif
#if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR
const __global DECOMPRESSION_ZP_TYPE* decompression_zp,
#endif
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
@@ -54,13 +61,50 @@ KERNEL(gather_ref)(
const uint dictionary_idx = GET_DICTIONARY_INDEX(DICTIONARY_INDEX_ORDER);
const uint output_idx = GET_INDEX(OUTPUT,,ORDER);
#if COMPRESSED_WEIGHTS
OUTPUT_TYPE val = OUTPUT_VAL_ZERO;
#if GATHER_AXIS_SHAPE_INFO_INDEX
INPUT0_TYPE val = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < shape_info[GATHER_AXIS_SHAPE_INFO_INDEX]) ? dictionary[dictionary_idx] : 0;
#elif AXIS_DIM
INPUT0_TYPE val = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < AXIS_DIM) ? dictionary[dictionary_idx] : 0;
#if GATHER_AXIS_SHAPE_INFO_INDEX
bool need_decompress = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < shape_info[GATHER_AXIS_SHAPE_INFO_INDEX]) ? true : false;
#elif AXIS_DIM
bool need_decompress = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < AXIS_DIM) ? true : false;
#else
bool need_decompress = true;
#endif
if (need_decompress) {
#if DECOMPRESSION_ZP_TERM
#if DECOMPRESSION_ZP_SCALAR
OUTPUT_TYPE zp = DECOMPRESSION_ZP_VALUE;
#else
const uint zp_offset = dictionary_idx / DECOMPRESSION_ZP_GROUP_SIZE;
OUTPUT_TYPE zp = TO_OUTPUT_TYPE(decompression_zp[zp_offset]);
#endif
#else
OUTPUT_TYPE zp = OUTPUT_VAL_ZERO;
#endif
const uint decomp_offset = dictionary_idx / DECOMPRESSION_SCALE_GROUP_SIZE;
DECOMPRESSION_SCALE_TYPE scale = decompression_scale[decomp_offset];
#if COMPRESSED_WEIGHTS_INT8
OUTPUT_TYPE val_compressed = dictionary[dictionary_idx];
val = (val_compressed - zp) * scale;
#elif COMPRESSED_WEIGHTS_INT4
INPUT0_TYPE val_packed = dictionary[dictionary_idx / 2];
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) val_unpacked = UNPACK_INT4x2(OUTPUT_TYPE, *((INT4_PACKED_TYPE*)&val_packed));
OUTPUT_TYPE val_compressed = ((OUTPUT_TYPE*)(&val_unpacked))[dictionary_idx % 2];
val = (val_compressed - zp) * scale;
#endif
}
#else
INPUT0_TYPE val = dictionary[dictionary_idx];
#if GATHER_AXIS_SHAPE_INFO_INDEX
INPUT0_TYPE val = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < shape_info[GATHER_AXIS_SHAPE_INFO_INDEX]) ? dictionary[dictionary_idx] : 0;
#elif AXIS_DIM
INPUT0_TYPE val = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < AXIS_DIM) ? dictionary[dictionary_idx] : 0;
#else
INPUT0_TYPE val = dictionary[dictionary_idx];
#endif
#endif
#if HAS_FUSED_OPS

View File

@@ -40,6 +40,8 @@ ParamsKey GatherKernelRef::GetSupportedKey() const {
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT4);
k.EnableInputDataType(Datatype::INT4);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
@@ -254,6 +256,42 @@ JitConstants GatherKernelRef::GetJitConstants(const gather_params& params) const
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
}
if (params.compressed) {
jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS", 1)});
if (params.inputs[0].GetDType() == Datatype::INT8 || params.inputs[0].GetDType() == Datatype::UINT8) {
jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS_INT8", 1)});
} else if (params.inputs[0].GetDType() == Datatype::INT4 || params.inputs[0].GetDType() == Datatype::UINT4) {
jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS_INT4", 1)});
}
auto wt = params.inputs[0].GetDType();
if (wt == Datatype::UINT4) {
jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE", WeightsType::UINT4, 2));
} else if (wt == Datatype::INT4) {
jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE", WeightsType::INT4, 2));
}
const size_t scale_groups_num = params.decompression_scale.LogicalSize();
const size_t scale_group_size = params.inputs[0].LogicalSize() / scale_groups_num;
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_TERM", 1)});
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE", params.decompression_scale)});
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_GROUPS_NUM", scale_groups_num)});
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_GROUP_SIZE", scale_group_size)});
if (params.has_decompression_zp) {
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_TERM", 1)});
if (params.scalar_zp) {
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_VALUE", params.zp_value)});
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_SCALAR", 1)});
} else {
const size_t zp_groups_num = params.decompression_zero_point.LogicalSize();
const size_t zp_group_size = params.inputs[0].LogicalSize() / zp_groups_num;
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP", params.decompression_zero_point)});
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_GROUPS_NUM", zp_groups_num)});
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_GROUP_SIZE", zp_group_size)});
}
}
}
return jit;
}
@@ -321,6 +359,13 @@ KernelsData GatherKernelRef::GetKernelsData(const Params& params, const optional
GetUpdateDispatchDataFunc(kd);
int inputs_count = 2;
if (newParams.compressed) {
inputs_count++;
if (newParams.has_decompression_zp && !newParams.scalar_zp)
inputs_count++;
}
FillCLKernelData(kernel,
dispatchData,
params.engineInfo,
@@ -330,7 +375,7 @@ KernelsData GatherKernelRef::GetKernelsData(const Params& params, const optional
"",
false,
false,
2,
inputs_count,
GetFusedPrimitiveInputsCount(params),
1,
newParams.has_dynamic_tensors());

View File

@@ -16,6 +16,13 @@ struct gather_params : public base_params {
GatherAxis axis;
int64_t batch_dim;
bool support_neg_ind;
bool compressed = false;
bool has_decompression_zp = false;
bool scalar_zp = false;
float zp_value = 0.0f;
DataTensor decompression_scale;
DataTensor decompression_zero_point;
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -6,18 +6,29 @@
#include "intel_gpu/plugin/common_utils.hpp"
#include "transformations/utils/utils.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/gather.hpp"
#include "intel_gpu/op/gather_compressed.hpp"
#include "intel_gpu/primitives/gather.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/reshape.hpp"
#include "intel_gpu/primitives/crop.hpp"
namespace ov {
namespace op {
namespace internal {
using GatherCompressed = ov::intel_gpu::op::GatherCompressed;
} // namespace internal
} // namespace op
} // namespace ov
namespace ov {
namespace intel_gpu {
template <typename T>
void CreateGatherOpBase(ProgramBuilder& p, const std::shared_ptr<T>& op, const int64_t batch_dim = 0, bool support_neg_ind = false) {
void CreateGatherOpBase(ProgramBuilder& p, const std::shared_ptr<T>& op, const int64_t batch_dim = 0, bool support_neg_ind = false,
bool weights_compressed = false) {
auto inputs = p.GetInputInfo(op);
std::string layerName = layer_type_name_ID(op);
@@ -120,15 +131,47 @@ void CreateGatherOpBase(ProgramBuilder& p, const std::shared_ptr<T>& op, const i
auto cropPrim = cldnn::crop(layerName, reordered_inputs[0], outTensor, offsetTensor);
p.add_primitive(*op, cropPrim);
} else {
auto gatherPrim = cldnn::gather(layerName,
reordered_inputs[0],
reordered_inputs[1],
axis,
input_rank,
out_shape,
batch_dim,
support_neg_ind);
p.add_primitive(*op, gatherPrim);
if (!weights_compressed) {
auto gatherPrim = cldnn::gather(layerName,
reordered_inputs[0],
reordered_inputs[1],
axis,
input_rank,
out_shape,
batch_dim,
support_neg_ind);
p.add_primitive(*op, gatherPrim);
} else {
float zp_value = 0.0f;
bool has_scalar_zp = false;
if (op->get_input_size() == 5) {
std::shared_ptr<ov::op::v0::Constant> zp_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(op->get_input_node_shared_ptr(4));
if (zp_const && ov::shape_size(zp_const->get_output_shape(0)) == 1) {
has_scalar_zp = true;
zp_value = zp_const->cast_vector<float>()[0];
}
}
std::shared_ptr<ov::intel_gpu::op::GatherCompressed> op_compressed = std::dynamic_pointer_cast<ov::intel_gpu::op::GatherCompressed>(op);
auto gatherPrim = cldnn::gather(layerName,
reordered_inputs[0],
reordered_inputs[1],
axis,
reordered_inputs[3],
has_scalar_zp ? cldnn::input_info() : reordered_inputs[4],
op_compressed->get_output_type(),
input_rank,
out_shape,
batch_dim,
support_neg_ind);
if (has_scalar_zp) {
gatherPrim.decompression_zero_point_scalar = zp_value;
}
p.add_primitive(*op, gatherPrim);
}
}
// Add reorder and reshape for scalar indice
@@ -174,5 +217,12 @@ static void CreateGatherOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v8::
REGISTER_FACTORY_IMPL(v8, Gather);
// Creates a cldnn gather primitive for a GatherCompressed op.
// GatherCompressed has 4 inputs (data, indices, axis, scale) or 5 (+ zero point).
static void CreateGatherCompressedOp(ProgramBuilder& p, const std::shared_ptr<ov::op::internal::GatherCompressed>& op) {
    validate_inputs_count(op, {4, 5});
    // Last two flags: support_neg_ind = true, weights_compressed = true
    // (selects the compressed-weights path inside CreateGatherOpBase).
    CreateGatherOpBase<ov::op::internal::GatherCompressed>(p, op, op->get_batch_dims(), true, true);
}
REGISTER_FACTORY_IMPL(internal, GatherCompressed);
} // namespace intel_gpu
} // namespace ov

View File

@@ -0,0 +1,131 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "convert_gather_to_compressed.hpp"
#include <memory>
#include "intel_gpu/op/gather_compressed.hpp"
#include "openvino/opsets/opset10.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/pass/pattern/op/pattern.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/pass/pattern/op/or.hpp"
#include "transformations/utils/utils.hpp"
namespace ov {
namespace intel_gpu {
// Matches a weight-decompression subgraph that feeds the data input of a Gather:
//
//   Constant(u8/i8/u4/i4) -> Convert -> [Subtract(zp_const)] -> Multiply(scale_const)
//       -> [Reshape 3D->2D] -> [Convert] -> Gather
//
// and replaces it with a single GatherCompressed op, keeping the weights in
// their compressed type and passing the scale (and optional zero point) as
// extra inputs.
ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
    using namespace ov::pass::pattern;

    // Weights must be a low-precision constant with a single consumer and a
    // 2D or 3D (grouped) shape.
    auto compressed_constant = [](const ov::Output<ov::Node>& output) {
        return (output.get_element_type() == ov::element::u8 ||
                output.get_element_type() == ov::element::i8 ||
                output.get_element_type() == ov::element::u4 ||
                output.get_element_type() == ov::element::i4) &&
               output.get_target_inputs().size() == 1 &&
               (output.get_shape().size() == 2 ||
                output.get_shape().size() == 3);
    };

    // Only accept reshapes that squeeze grouped (3D) weights back to 2D.
    auto reshape_3d_to_2d = [](const ov::Output<ov::Node>& output) {
        auto in_ps = output.get_node()->get_input_partial_shape(0);
        auto out_ps = output.get_node()->get_output_partial_shape(0);
        return in_ps.rank().is_static() && out_ps.rank().is_static() && in_ps.size() == 3 && out_ps.size() == 2;
    };

    auto dicts_m = wrap_type<ov::op::v0::Constant>(compressed_constant);
    auto convert_m = wrap_type<ov::op::v0::Convert>({dicts_m});

    // Optional zero-point branch: Convert -> Subtract(zp_const).
    auto sub_const_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
    auto subtract_m = wrap_type<ov::op::v1::Subtract>({convert_m, sub_const_m});

    // Scale multiply, with or without the preceding subtract.
    auto mul_const_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
    auto mul_with_sub_m = wrap_type<ov::op::v1::Multiply>({subtract_m, mul_const_m});
    auto mul_no_sub_m = wrap_type<ov::op::v1::Multiply>({convert_m, mul_const_m});
    auto mul_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{mul_with_sub_m, mul_no_sub_m});

    auto reshape_const_m = wrap_type<ov::op::v0::Constant>();
    auto reshape_m = wrap_type<ov::op::v1::Reshape>({mul_m, reshape_const_m}, reshape_3d_to_2d);

    // An optional trailing Convert may follow either the reshape or the multiply.
    auto last_convert_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{reshape_m, mul_m});
    auto last_convert_m = wrap_type<opset10::Convert>({last_convert_input});

    auto dicts_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{reshape_m, last_convert_m, mul_m});
    auto gather_m = wrap_type<opset10::Gather>({dicts_input_m, any_input(), any_input()});

    ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
        const auto& pattern_map = m.get_pattern_value_map();
        OPENVINO_ASSERT(pattern_map.count(gather_m));
        OPENVINO_ASSERT(pattern_map.count(mul_const_m));
        OPENVINO_ASSERT(pattern_map.count(dicts_m));
        OPENVINO_ASSERT(pattern_map.count(convert_m));
        ov::Shape dicts_shape = pattern_map.at(dicts_m).get_node_shared_ptr()->get_shape();
        auto gather_node = std::dynamic_pointer_cast<opset10::Gather>(pattern_map.at(gather_m).get_node_shared_ptr());
        if (!gather_node || transformation_callback(gather_node)) {
            return false;
        }

        // Folds a grouped 3D constant to 2D ({d0, d1*d2}) so the plugin sees a
        // flat weights/scale/zp layout; 2D constants pass through unchanged.
        auto reshape_const_to_2d = [](std::shared_ptr<ov::Node> node) {
            auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
            OPENVINO_ASSERT(constant != nullptr);
            ov::Shape current_shape = constant->get_shape();
            if (current_shape.size() == 2)
                return constant;
            OPENVINO_ASSERT(current_shape.size() == 3);

            auto new_shape = ov::Shape{current_shape[0], current_shape[1] * current_shape[2]};
            // Shares the original constant's data; only the shape changes.
            return std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
        };

        std::shared_ptr<ov::Node> gather_input_a = reshape_const_to_2d(pattern_map.at(dicts_m).get_node_shared_ptr());
        const auto& gather_input_b = gather_node->get_input_node_shared_ptr(1);
        const auto& gather_input_c = gather_node->get_input_node_shared_ptr(2);
        const auto& scale = reshape_const_to_2d(pattern_map.at(mul_const_m).get_node_shared_ptr());

        std::shared_ptr<ov::Node> optional_zero_point = nullptr;
        const bool with_zero_point = pattern_map.count(subtract_m) > 0;
        if (with_zero_point) {
            optional_zero_point = reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr());
        }

        std::shared_ptr<ov::Node> gather_input_scale = scale;
        std::shared_ptr<ov::Node> gather_input_zp = optional_zero_point;
        std::vector<std::shared_ptr<ov::Node>> result_nodes = {};

        // Output element type is taken from the original Gather so downstream
        // consumers keep seeing the same type.
        std::shared_ptr<ov::Node> new_gather_node = nullptr;
        if (with_zero_point) {
            new_gather_node = std::make_shared<op::GatherCompressed>(gather_input_a,
                                                                     gather_input_b,
                                                                     gather_input_c,
                                                                     gather_input_scale,
                                                                     gather_input_zp,
                                                                     gather_node->get_output_element_type(0));
        } else {
            new_gather_node = std::make_shared<op::GatherCompressed>(gather_input_a,
                                                                     gather_input_b,
                                                                     gather_input_c,
                                                                     gather_input_scale,
                                                                     gather_node->get_output_element_type(0));
        }
        result_nodes.push_back(new_gather_node);

        new_gather_node->set_friendly_name(gather_node->get_friendly_name());
        ov::copy_runtime_info(m.get_matched_nodes(), result_nodes);
        ov::replace_node(gather_node, new_gather_node);
        return true;
    };

    auto m = std::make_shared<ov::pass::pattern::Matcher>(gather_m, "ConvertGatherToGatherCompressed");
    this->register_matcher(m, callback);
}
} // namespace intel_gpu
} // namespace ov

View File

@@ -0,0 +1,19 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/pass/graph_rewrite.hpp"
namespace ov {
namespace intel_gpu {
/// @brief Matcher pass that fuses a weights-decompression subgraph
///        (low-precision Constant -> Convert -> [Subtract] -> Multiply ->
///        [Reshape] -> [Convert]) feeding a Gather into a single
///        GatherCompressed operation.
class ConvertGatherToGatherCompressed: public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("ConvertGatherToGatherCompressed", "0");
    ConvertGatherToGatherCompressed();
};
} // namespace intel_gpu
} // namespace ov

View File

@@ -0,0 +1,76 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/op/gather_compressed.hpp"
#include "gather_shape_inference.hpp"
namespace ov {
namespace intel_gpu {
namespace op {
// Constructor with both decompression scale and zero point.
// The v8::Gather base is built on the first three inputs; the decompression
// inputs are then attached as arguments 3 (scale) and 4 (zero point).
GatherCompressed::GatherCompressed(const ov::Output<Node>& data,
                                   const ov::Output<Node>& indices,
                                   const ov::Output<Node>& axis,
                                   const ov::Output<Node>& decompression_scale,
                                   const ov::Output<Node>& decompression_zero_point,
                                   const ov::element::Type output_type)
    : ov::op::v8::Gather({data, indices, axis}), m_output_type(output_type) {
    set_argument(3, decompression_scale);
    set_argument(4, decompression_zero_point);
    validate_and_infer_types();
}
// Constructor with a decompression scale only (no zero point input).
// The scale is attached as argument 3 on top of the v8::Gather inputs.
GatherCompressed::GatherCompressed(const ov::Output<Node>& data,
                                   const ov::Output<Node>& indices,
                                   const ov::Output<Node>& axis,
                                   const ov::Output<Node>& decompression_scale,
                                   const ov::element::Type output_type)
    : ov::op::v8::Gather({data, indices, axis}), m_output_type(output_type) {
    set_argument(3, decompression_scale);
    validate_and_infer_types();
}
// Clones this op onto new inputs, picking the 4-input (no zero point) or
// 5-input (with zero point) constructor based on the argument count.
std::shared_ptr<ov::Node> GatherCompressed::clone_with_new_inputs(const ov::OutputVector& new_args) const {
    check_new_args_count(this, new_args);
    switch (new_args.size()) {
    case 4:
        // data, indices, axis, decompression scale
        return std::make_shared<GatherCompressed>(new_args.at(0),
                                                  new_args.at(1),
                                                  new_args.at(2),
                                                  new_args.at(3),
                                                  m_output_type);
    case 5:
        // data, indices, axis, decompression scale, decompression zero point
        return std::make_shared<GatherCompressed>(new_args.at(0),
                                                  new_args.at(1),
                                                  new_args.at(2),
                                                  new_args.at(3),
                                                  new_args.at(4),
                                                  m_output_type);
    default:
        OPENVINO_THROW("Unexpected inputs count for GatherCompressed op: ", new_args.size());
    }
}
// Validates the input count and infers the output shape/type.
// Shape inference only uses the three gather inputs (data, indices, axis);
// the decompression inputs do not affect the output shape.
void GatherCompressed::validate_and_infer_types() {
    const auto input_size = get_input_size();
    NODE_VALIDATION_CHECK(this,
                          input_size >= 3,
                          "Number of inputs is incorrect. Current value is: ",
                          input_size,
                          ", expected at least 3.");

    auto out_shapes = ov::op::shape_infer(this, std::vector<ov::PartialShape>{get_input_partial_shape(0),
                                                                              get_input_partial_shape(1), get_input_partial_shape(2)});
    // Fall back to the data input's element type when no explicit output type was requested.
    auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type;
    set_output_type(0, output_type, out_shapes[0]);
}
// Serializes/visits the op's attributes; the only attribute is the
// requested output element type.
bool GatherCompressed::visit_attributes(ov::AttributeVisitor &visitor) {
    visitor.on_attribute("output_type", m_output_type);
    return true;
}
} // namespace op
} // namespace intel_gpu
} // namespace ov

View File

@@ -30,6 +30,7 @@
#include "openvino/op/group_conv.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/util/sub_graph_base.hpp"
#include "openvino/op/gather.hpp"
#include "openvino/pass/manager.hpp"
#include "openvino/pass/constant_folding.hpp"
@@ -116,6 +117,7 @@
#include "plugin/transformations/convert_matmul_to_fc.hpp"
#include "plugin/transformations/move_fc_reshape_to_weights.hpp"
#include "plugin/transformations/convert_fc_to_compressed.hpp"
#include "plugin/transformations/convert_gather_to_compressed.hpp"
#include "plugin/transformations/rms_fusion.hpp"
#include "plugin/transformations/binary_conv_to_conv.hpp"
#include "plugin/transformations/move_convert_after_gather.hpp"
@@ -147,7 +149,7 @@ static bool disable_reduce_decomposition(const std::shared_ptr<const ov::Node> n
return false;
}
static bool is_non_decompression_multiply(const std::shared_ptr<const ov::Node> node) {
static bool is_non_supported_decompression_op(const std::shared_ptr<const ov::Node> node) {
auto get_single_consumer = [](const std::shared_ptr<const ov::Node> node) -> std::shared_ptr<ov::Node> {
const auto consumers = node->get_output_target_inputs(0);
if (consumers.size() != 1)
@@ -159,17 +161,17 @@ static bool is_non_decompression_multiply(const std::shared_ptr<const ov::Node>
if (!consumer)
return true;
if (ov::is_type<ov::opset1::MatMul>(consumer)) {
if (ov::is_type<ov::opset1::MatMul>(consumer) || ov::is_type<ov::op::v8::Gather>(consumer)) {
return false;
} else if (ov::is_type<ov::opset1::Reshape>(consumer)) {
consumer = get_single_consumer(consumer);
if (consumer != nullptr && ov::is_type<ov::opset1::MatMul>(consumer)) {
if (consumer != nullptr && (ov::is_type<ov::opset1::MatMul>(consumer) || ov::is_type<ov::op::v8::Gather>(consumer))) {
return false;
}
}
if (consumer != nullptr && ov::is_type<ov::opset1::Convert>(consumer)) {
consumer = get_single_consumer(consumer);
if (consumer != nullptr && ov::is_type<ov::opset1::MatMul>(consumer)) {
if (consumer != nullptr && (ov::is_type<ov::opset1::MatMul>(consumer) || ov::is_type<ov::op::v8::Gather>(consumer))) {
return false;
}
}
@@ -274,7 +276,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
ov::element::i4}, true);
// Ignore nodes that are not related to FullyConnected and allow ConstantFolding to be applied to them
initial_transformations_manager.get_pass_config()->set_callback<ov::pass::MarkDequantizationSubgraph>(is_non_decompression_multiply);
initial_transformations_manager.get_pass_config()->set_callback<ov::pass::MarkDequantizationSubgraph>(is_non_supported_decompression_op);
initial_transformations_manager.run_passes(func);
ov::pass::Manager manager;
@@ -697,6 +699,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>();
manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>();
manager.register_pass<ov::intel_gpu::ConvertGatherToGatherCompressed>();
manager.register_pass<ov::intel_gpu::RMSFusion>();
// This is supposed to be the last pass to ensure that we don't have name collisions until

View File

@@ -2161,3 +2161,169 @@ TEST(gather_single_axis, simple_Baxis) {
auto crop_prim = network.get_primitive("gather");
ASSERT_EQ(crop_prim->can_be_optimized(), false);
}
class gather_gpu_tests: public ::testing::Test {
public:
    // Checks gather over compressed u8 weights with per-row f32 scale and
    // zero-point inputs. Each gathered element is decompressed as
    // (weight - zp) * scale, e.g. row 0: (1 - 1) * 2 = 0, (5 - 1) * 2 = 8.
    void test_compressed_scale_zp(bool is_caching_test) {
        auto& engine = get_test_engine();

        auto input_mem = engine.allocate_memory({ {2, 3}, data_types::i32, format::bfyx });
        auto weights_mem = engine.allocate_memory({ {2, 5}, data_types::u8, format::bfyx });
        auto scale_mem = engine.allocate_memory({ {2, 1}, data_types::f32, format::bfyx });
        auto zp_mem = engine.allocate_memory({ {2, 1}, data_types::f32, format::bfyx });

        // Indices: one row of 3 per batch (batch_dim = 1).
        set_values(input_mem, { 0, 0, 4,
        4, 0, 0 });
        set_values<uint8_t>(weights_mem, { 1, 2, 3, 4, 5,
        6, 7, 8, 9, 10});
        set_values(scale_mem, { 2.0f, 4.0f });
        set_values(zp_mem, { 1.0f, 2.0f });

        topology topology(
            input_layout("input", input_mem->get_layout()),
            data("weights", weights_mem),
            data("scale", scale_mem),
            data("zp", zp_mem),
            // Compressed-gather constructor: axis = 1, decompressed type f32,
            // input rank 2, output shape {2, 3}, batch_dim = 1.
            gather("gather_prim", input_info("weights"), input_info("input"), 1,
            input_info("scale"), input_info("zp"), data_types::f32, 2, ov::Shape{2, 3}, 1)
        );

        auto config = get_test_default_config(engine);
        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
        network->set_input_data("input", input_mem);

        auto outputs = network->execute();
        ASSERT_EQ(outputs.size(), size_t(1));
        ASSERT_EQ(outputs.begin()->first, "gather_prim");

        auto output_mem = outputs.begin()->second.get_memory();
        cldnn::mem_lock<float> output_ptr (output_mem, get_test_stream());

        ov::PartialShape expected_shape{2, 3};
        ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());

        // Decompressed values: row 0 -> {0, 0, 8}, row 1 -> {32, 16, 16}.
        std::vector<float> expected_result = {0.0f, 0.0f, 8.0f, 32.0f, 16.0f, 16.0f};
        for (size_t i = 0; i < expected_result.size(); i++) {
            ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
        }
    }
void test_compressed_scale(bool is_caching_test) {
auto& engine = get_test_engine();
auto input_mem = engine.allocate_memory({ {2, 3}, data_types::i32, format::bfyx });
auto weights_mem = engine.allocate_memory({ {2, 5}, data_types::u8, format::bfyx });
auto scale_mem = engine.allocate_memory({ {2, 1}, data_types::f32, format::bfyx });
set_values(input_mem, { 0, 0, 4,
4, 0, 0 });
set_values<uint8_t>(weights_mem, { 1, 2, 3, 4, 5,
6, 7, 8, 9, 10});
set_values(scale_mem, { 2.0f, 4.0f });
topology topology(
input_layout("input", input_mem->get_layout()),
data("weights", weights_mem),
data("scale", scale_mem),
gather("gather_prim", input_info("weights"), input_info("input"), 1,
input_info("scale"), input_info(""), data_types::f32, 2, ov::Shape{2, 3}, 1)
);
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input", input_mem);
auto outputs = network->execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "gather_prim");
auto output_mem = outputs.begin()->second.get_memory();
cldnn::mem_lock<float> output_ptr (output_mem, get_test_stream());
ov::PartialShape expected_shape{2, 3};
ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());
std::vector<float> expected_result = {2.0f, 2.0f, 10.0f, 40.0f, 24.0f, 24.0f};
for (size_t i = 0; i < expected_result.size(); i++) {
ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
}
}
void test_compressed_scale_fp16(bool is_caching_test) {
auto& engine = get_test_engine();
auto input_mem = engine.allocate_memory({ {2, 3}, data_types::i32, format::bfyx });
auto weights_mem = engine.allocate_memory({ {2, 5}, data_types::u8, format::bfyx });
auto scale_mem = engine.allocate_memory({ {2, 1}, data_types::f16, format::bfyx });
set_values(input_mem, { 0, 0, 4,
4, 0, 0 });
set_values<uint8_t>(weights_mem, { 1, 2, 3, 4, 5,
6, 7, 8, 9, 10});
set_values<ov::float16>(scale_mem, { ov::float16(2.0f), ov::float16(4.0f) });
topology topology(
input_layout("input", input_mem->get_layout()),
data("weights", weights_mem),
data("scale", scale_mem),
gather("gather_prim", input_info("weights"), input_info("input"), 1,
input_info("scale"), input_info(""), data_types::f16, 2, ov::Shape{2, 3}, 1)
);
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input", input_mem);
auto outputs = network->execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "gather_prim");
auto output_mem = outputs.begin()->second.get_memory();
cldnn::mem_lock<ov::float16> output_ptr (output_mem, get_test_stream());
ov::PartialShape expected_shape{2, 3};
ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());
std::vector<ov::float16> expected_result = {ov::float16(2), ov::float16(2), ov::float16(10),
ov::float16(40), ov::float16(24), ov::float16(24)};
for (size_t i = 0; i < expected_result.size(); i++) {
ASSERT_FLOAT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
}
}
};
// Scale + zero-point decompression, fresh (non-cached) network build.
TEST_F(gather_gpu_tests, compressed_scale_zp) {
    test_compressed_scale_zp(false);
}
// Scale + zero-point decompression, exercising the model-cache path.
TEST_F(gather_gpu_tests, compressed_scale_zp_cached) {
    test_compressed_scale_zp(true);
}
// Scale-only decompression, fresh (non-cached) network build.
TEST_F(gather_gpu_tests, compressed_scale) {
    test_compressed_scale(false);
}
// Scale-only decompression, exercising the model-cache path.
TEST_F(gather_gpu_tests, compressed_scale_cached) {
    test_compressed_scale(true);
}
// fp16 scale-only decompression, fresh (non-cached) network build.
TEST_F(gather_gpu_tests, compressed_scale_fp16) {
    test_compressed_scale_fp16(false);
}
// fp16 scale-only decompression, exercising the model-cache path.
TEST_F(gather_gpu_tests, compressed_scale_fp16_cached) {
    test_compressed_scale_fp16(true);
}

View File

@@ -104,8 +104,8 @@ public:
const auto primitive_hash = primitve->hash();
const auto params_hash = prim_inst->get_impl_params()->hash();
ASSERT_EQ(primitive_hash, 93320679543770233UL);
ASSERT_EQ(params_hash, 1542578941420280552UL);
ASSERT_EQ(primitive_hash, 8439414674502129643UL);
ASSERT_EQ(params_hash, 9235751886952244871UL);
}
void test_gemm_basic(bool is_caching_test) {

View File

@@ -0,0 +1,141 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "common_test_utils/ov_test_utils.hpp"
#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
#include "openvino/op/transpose.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/result.hpp"
#include "openvino/op/subtract.hpp"
#include "intel_gpu/op/gather_compressed.hpp"
#include "plugin/transformations/convert_gather_to_compressed.hpp"
#include <memory>
using namespace testing;
using namespace ov::intel_gpu;
namespace ov {
namespace test {
namespace intel_gpu {
TEST_F(TransformationTestsF, ConvertGatherToCompressed1) {
    {
        // Pattern: u8 weights -> Convert(f32) -> Multiply(scale) feeding Gather's data input.
        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto to_f32 = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f32);
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        auto mul = std::make_shared<ov::op::v1::Multiply>(to_f32, scale);
        auto gather = std::make_shared<ov::op::v8::Gather>(mul, indices, axis);

        model = std::make_shared<ov::Model>(ov::NodeVector{ gather }, ov::ParameterVector{ indices });
        manager.register_pass<ConvertGatherToGatherCompressed>();
    }
    {
        // Expected: the decompression subgraph folded into GatherCompressed (scale-only form).
        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        auto gather_compressed = std::make_shared<ov::intel_gpu::op::GatherCompressed>(weights, indices, axis, scale, ov::element::f32);

        model_ref = std::make_shared<ov::Model>(ov::NodeVector{ gather_compressed }, ov::ParameterVector{ indices });
    }
}
TEST_F(TransformationTestsF, ConvertGatherToCompressed2) {
    {
        // Pattern: u8 weights -> Convert(f32) -> Subtract(zp) -> Multiply(scale) -> Gather.
        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto to_f32 = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f32);
        auto zp = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        auto shifted = std::make_shared<ov::op::v1::Subtract>(to_f32, zp);
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        auto mul = std::make_shared<ov::op::v1::Multiply>(shifted, scale);
        auto gather = std::make_shared<ov::op::v8::Gather>(mul, indices, axis);

        model = std::make_shared<ov::Model>(ov::NodeVector{ gather }, ov::ParameterVector{ indices });
        manager.register_pass<ConvertGatherToGatherCompressed>();
    }
    {
        // Expected: GatherCompressed carrying both scale and zero-point inputs.
        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        auto zp = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        auto gather_compressed = std::make_shared<ov::intel_gpu::op::GatherCompressed>(weights, indices, axis, scale, zp, ov::element::f32);

        model_ref = std::make_shared<ov::Model>(ov::NodeVector{ gather_compressed }, ov::ParameterVector{ indices });
    }
}
TEST_F(TransformationTestsF, ConvertGatherToCompressed3) {
    {
        // Grouped (3D) decompression followed by a Reshape back to 2D before Gather.
        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 4, 4 }, { 1 });
        auto to_f32 = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f32);
        auto zp = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4, 1 }, { 1 });
        auto shifted = std::make_shared<ov::op::v1::Subtract>(to_f32, zp);
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4, 1 }, { 1 });
        auto mul = std::make_shared<ov::op::v1::Multiply>(shifted, scale);
        auto target_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { -1, 16 });
        auto reshape = std::make_shared<ov::op::v1::Reshape>(mul, target_shape, false);
        auto gather = std::make_shared<ov::op::v8::Gather>(reshape, indices, axis);

        model = std::make_shared<ov::Model>(ov::NodeVector{ gather }, ov::ParameterVector{ indices });
        manager.register_pass<ConvertGatherToGatherCompressed>();
    }
    {
        // Expected: weights flattened to 2D and grouped scale/zp squeezed to 2D as well.
        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4 }, { 1 });
        auto zp = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4 }, { 1 });
        auto gather_compressed = std::make_shared<ov::intel_gpu::op::GatherCompressed>(weights, indices, axis, scale, zp, ov::element::f32);

        model_ref = std::make_shared<ov::Model>(ov::NodeVector{ gather_compressed }, ov::ParameterVector{ indices });
    }
}
TEST_F(TransformationTestsF, ConvertGatherToCompressed4) {
    {
        // u4 grouped weights with a scalar-like (1x1x1) zero point plus Reshape before Gather.
        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 32, 4, 4 }, { 1 });
        auto to_f32 = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f32);
        auto zp = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 1, 1, 1 }, { 1 });
        auto shifted = std::make_shared<ov::op::v1::Subtract>(to_f32, zp);
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4, 1 }, { 1 });
        auto mul = std::make_shared<ov::op::v1::Multiply>(shifted, scale);
        auto target_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { -1, 16 });
        auto reshape = std::make_shared<ov::op::v1::Reshape>(mul, target_shape, false);
        auto gather = std::make_shared<ov::op::v8::Gather>(reshape, indices, axis);

        model = std::make_shared<ov::Model>(ov::NodeVector{ gather }, ov::ParameterVector{ indices });
        manager.register_pass<ConvertGatherToGatherCompressed>();
    }
    {
        // Expected: 2D u4 weights with the broadcastable zero point reduced to rank 2.
        auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 32, 16 }, { 1 });
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4 }, { 1 });
        auto zp = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 1, 1 }, { 1 });
        auto gather_compressed = std::make_shared<ov::intel_gpu::op::GatherCompressed>(weights, indices, axis, scale, zp, ov::element::f32);

        model_ref = std::make_shared<ov::Model>(ov::NodeVector{ gather_compressed }, ov::ParameterVector{ indices });
    }
}
} // namespace intel_gpu
} // namespace test
} // namespace ov