[GPU] adding support for weights compression to gather (#21711)
* Initial version * Fixed style check error * Added unit tests * Removed unnecessary comments * Updated hash to include decompression type * Applied code review feedback * Fixed unit tests
This commit is contained in:
@@ -0,0 +1,47 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "openvino/core/node.hpp"
|
||||
#include "openvino/op/gather.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_gpu {
|
||||
namespace op {
|
||||
|
||||
/// @brief Gather operation extended with weight-decompression inputs.
///
/// In addition to the regular Gather inputs (data, indices, axis), this op
/// carries a decompression scale and an optional decompression zero point,
/// which are applied to the compressed dictionary values before gathering.
class GatherCompressed : public ov::op::v8::Gather {
public:
    OPENVINO_OP("GatherCompressed", "gpu_opset");

    GatherCompressed() = default;

    /// @brief Constructs the op with both decompression scale and zero point inputs (5 inputs total).
    /// @param output_type Desired output element type; ov::element::undefined means "same as data input".
    GatherCompressed(const ov::Output<Node> &data,
                     const ov::Output<Node> &indices,
                     const ov::Output<Node> &axis,
                     const ov::Output<Node> &decompression_scale,
                     const ov::Output<Node> &decompression_zero_point,
                     const ov::element::Type output_type = ov::element::undefined);

    /// @brief Constructs the op with a decompression scale input only (4 inputs total, no zero point).
    /// @param output_type Desired output element type; ov::element::undefined means "same as data input".
    GatherCompressed(const ov::Output<Node> &data,
                     const ov::Output<Node> &indices,
                     const ov::Output<Node> &axis,
                     const ov::Output<Node> &decompression_scale,
                     const ov::element::Type output_type = ov::element::undefined);

    bool visit_attributes(ov::AttributeVisitor &visitor) override;

    void validate_and_infer_types() override;

    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

    /// @brief Returns the requested output element type (undefined means "derive from input 0").
    ov::element::Type get_output_type() const { return m_output_type; }

protected:
    // Requested output element type; resolved in validate_and_infer_types().
    ov::element::Type m_output_type;
};
|
||||
|
||||
} // namespace op
|
||||
} // namespace intel_gpu
|
||||
} // namespace ov
|
||||
@@ -272,3 +272,4 @@ REGISTER_FACTORY(internal, MulticlassNmsIEInternal);
|
||||
REGISTER_FACTORY(internal, FullyConnected);
|
||||
REGISTER_FACTORY(internal, FullyConnectedCompressed);
|
||||
REGISTER_FACTORY(internal, RMS);
|
||||
REGISTER_FACTORY(internal, GatherCompressed);
|
||||
|
||||
@@ -41,6 +41,42 @@ struct gather : public primitive_base<gather> {
|
||||
, batch_dim(batch_dim)
|
||||
, support_neg_ind(support_neg_ind) {}
|
||||
|
||||
/// @brief Constructs gather compressed primitive.
/// @param id This primitive id.
/// @param dict Input dictionary primitive id.
/// @param idx Input indexes primitive id.
/// @param axis Gathering axis.
/// @param decompression_scale Input decompression scale factors primitive id.
/// @param decompression_zero_point Input decompression zero point primitive id (empty input_info when a scalar zero point is used instead).
/// @param decompressed_type Element type of the data after decompression.
/// @param input_rank Input rank.
/// @param output_shape Output shape.
/// @param batch_dim Batch_dim
/// @param support_neg_ind Support negative indexes
/// @param output_padding Output padding.
gather(const primitive_id& id,
       const input_info& dict,
       const input_info& idx,
       const int64_t axis,
       const input_info& decompression_scale,
       const input_info& decompression_zero_point,
       const ov::element::Type decompressed_type,
       const int64_t input_rank,
       const ov::Shape& output_shape,
       const int64_t batch_dim = 0,
       const bool support_neg_ind = false,
       const padding& output_padding = padding())
    : primitive_base(id, {dict, idx}, {output_padding})
    , axis(axis)
    , input_rank(input_rank)
    , output_shape(output_shape)
    , batch_dim(batch_dim)
    , support_neg_ind(support_neg_ind)
    , compressed_weights(true)
    , decompressed_type(decompressed_type)
    , decompression_scale(decompression_scale)
    , decompression_zero_point(decompression_zero_point) {
    // The scale input is mandatory for compressed weights; the zero point is optional.
    OPENVINO_ASSERT(decompression_scale.is_valid(), "[GPU] Compressed gather requires at least decompression scale input");
}
|
||||
|
||||
/// @brief Gathering axis
|
||||
int64_t axis = 0;
|
||||
/// @brief Gather input rank
|
||||
@@ -52,11 +88,23 @@ struct gather : public primitive_base<gather> {
|
||||
/// @brief Support negative indexes
|
||||
bool support_neg_ind = false;
|
||||
|
||||
bool compressed_weights = false;
|
||||
ov::element::Type decompressed_type;
|
||||
input_info decompression_scale;
|
||||
input_info decompression_zero_point;
|
||||
optional_value<float> decompression_zero_point_scalar = optional_value<float>();
|
||||
|
||||
/// @brief Folds all gather parameters into the primitive hash, including the
/// weight-compression fields (decompression type, presence of scale/zero-point
/// inputs, and the optional scalar zero-point value).
size_t hash() const override {
    size_t seed = primitive::hash();
    seed = hash_combine(seed, axis);
    seed = hash_combine(seed, batch_dim);
    seed = hash_combine(seed, support_neg_ind);
    seed = hash_combine(seed, compressed_weights);
    // Hash the decompression type by name (commit history: added so different
    // decompressed types do not collide).
    seed = hash_combine(seed, decompressed_type.get_type_name());
    // Only the presence of the scale/zp inputs is hashed here, not their ids.
    seed = hash_combine(seed, decompression_scale.is_valid());
    seed = hash_combine(seed, decompression_zero_point.is_valid());
    seed = hash_combine(seed, decompression_zero_point_scalar.has_value());
    seed = hash_combine(seed, decompression_zero_point_scalar.value_or(0.0f));
    return seed;
}
|
||||
|
||||
@@ -68,7 +116,11 @@ struct gather : public primitive_base<gather> {
|
||||
|
||||
return axis == rhs_casted.axis &&
|
||||
batch_dim == rhs_casted.batch_dim &&
|
||||
support_neg_ind == rhs_casted.support_neg_ind;
|
||||
support_neg_ind == rhs_casted.support_neg_ind &&
|
||||
compressed_weights == rhs_casted.compressed_weights &&
|
||||
decompression_scale.is_valid() == rhs_casted.decompression_scale.is_valid() &&
|
||||
decompression_zero_point.is_valid() == rhs_casted.decompression_zero_point.is_valid() &&
|
||||
decompression_zero_point_scalar.value_or(0.0f) == rhs_casted.decompression_zero_point_scalar.value_or(0.0f);
|
||||
}
|
||||
|
||||
void save(BinaryOutputBuffer& ob) const override {
|
||||
@@ -78,6 +130,17 @@ struct gather : public primitive_base<gather> {
|
||||
ob << output_shape;
|
||||
ob << batch_dim;
|
||||
ob << support_neg_ind;
|
||||
ob << compressed_weights;
|
||||
ob << decompressed_type.get_type_name();
|
||||
ob << decompression_scale;
|
||||
ob << decompression_zero_point;
|
||||
|
||||
if (decompression_zero_point_scalar.has_value()) {
|
||||
ob << true;
|
||||
ob << make_data(&decompression_zero_point_scalar.value(), sizeof(float));
|
||||
} else {
|
||||
ob << false;
|
||||
}
|
||||
}
|
||||
|
||||
void load(BinaryInputBuffer& ib) override {
|
||||
@@ -87,6 +150,35 @@ struct gather : public primitive_base<gather> {
|
||||
ib >> output_shape;
|
||||
ib >> batch_dim;
|
||||
ib >> support_neg_ind;
|
||||
ib >> compressed_weights;
|
||||
std::string decompressed_type_name;
|
||||
ib >> decompressed_type_name;
|
||||
decompressed_type = ov::element::Type(decompressed_type_name);
|
||||
ib >> decompression_scale;
|
||||
ib >> decompression_zero_point;
|
||||
|
||||
bool has_value;
|
||||
ib >> has_value;
|
||||
if (has_value) {
|
||||
float decompression_zero_point_value = 0.f;
|
||||
ib >> make_data(&decompression_zero_point_value, sizeof(float));
|
||||
decompression_zero_point_scalar = decompression_zero_point_value;
|
||||
} else {
|
||||
decompression_zero_point_scalar = optional_value<float>();
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
// Returns the optional decompression scale / zero point primitive ids (when
// set) as additional dependencies beyond the regular dict/idx inputs.
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
    std::vector<std::reference_wrapper<const primitive_id>> ret;

    if (decompression_scale.is_valid())
        ret.push_back(decompression_scale.pid);

    if (decompression_zero_point.is_valid())
        ret.push_back(decompression_zero_point.pid);

    return ret;
}
|
||||
};
|
||||
} // namespace cldnn
|
||||
|
||||
@@ -66,6 +66,10 @@ struct input_info {
|
||||
}
|
||||
};
|
||||
|
||||
bool is_valid() const {
|
||||
return pid.compare("") != 0;
|
||||
}
|
||||
|
||||
void save(BinaryOutputBuffer& ob) const {
|
||||
ob << pid;
|
||||
ob << idx;
|
||||
|
||||
@@ -56,6 +56,9 @@ layout gather_inst::calc_output_layout(gather_node const& node, kernel_impl_para
|
||||
}
|
||||
}
|
||||
auto output_type = input_layout.data_type;
|
||||
if (impl_param.typed_desc<gather>()->compressed_weights) {
|
||||
output_type = impl_param.typed_desc<gather>()->decompressed_type;
|
||||
}
|
||||
if (impl_param.has_fused_primitives()) {
|
||||
output_type = impl_param.get_fused_output_layout().data_type;
|
||||
}
|
||||
@@ -73,6 +76,9 @@ std::vector<layout> gather_inst::calc_output_layouts(gather_node const& /*node*/
|
||||
auto input1_layout = impl_param.get_input_layout(1);
|
||||
|
||||
auto output_type = input0_layout.data_type;
|
||||
if (impl_param.typed_desc<gather>()->compressed_weights) {
|
||||
output_type = impl_param.typed_desc<gather>()->decompressed_type;
|
||||
}
|
||||
if (impl_param.has_fused_primitives()) {
|
||||
output_type = impl_param.get_fused_output_layout().data_type;
|
||||
}
|
||||
@@ -111,6 +117,14 @@ std::string gather_inst::to_string(gather_node const& node) {
|
||||
gather_info.add("axis", desc->axis);
|
||||
gather_info.add("batch_dim", desc->batch_dim);
|
||||
gather_info.add("output shape", cldnn::to_string(desc->output_shape));
|
||||
gather_info.add("compressed weights", desc->compressed_weights ? "true" : "false");
|
||||
if (desc->compressed_weights) {
|
||||
gather_info.add("decompression scale id", desc->decompression_scale.pid);
|
||||
gather_info.add("decompression zp id", desc->decompression_zero_point.pid);
|
||||
if (desc->decompression_zero_point_scalar.has_value()) {
|
||||
gather_info.add("decompression zp value", desc->decompression_zero_point_scalar.value());
|
||||
}
|
||||
}
|
||||
|
||||
node_info->add("gather info", gather_info);
|
||||
node_info->dump(primitive_description);
|
||||
|
||||
@@ -76,6 +76,20 @@ struct gather_impl : typed_primitive_impl_ocl<gather> {
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
// Extends the base kernel argument list with the optional decompression scale
// and zero point memories. Assumes dependency order is
// [0] dictionary, [1] indices, [2] scale, [3] zero point — matching the order
// produced by gather::get_dependencies().
kernel_arguments_data get_arguments(const typed_primitive_inst<gather>& instance) const override {
    kernel_arguments_data args = parent::get_arguments(instance);
    const auto& desc = instance.get_typed_desc<gather>();

    if (desc->decompression_scale.is_valid())
        args.inputs.push_back(instance.dep_memory_ptr(2));

    if (desc->decompression_zero_point.is_valid())
        args.inputs.push_back(instance.dep_memory_ptr(3));

    return args;
}
|
||||
|
||||
public:
|
||||
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
|
||||
const auto& primitive = impl_param.typed_desc<gather>();
|
||||
@@ -105,6 +119,22 @@ public:
|
||||
|
||||
params.outputs[0] = convert_data_tensor(output_layout);
|
||||
params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1)));
|
||||
|
||||
bool commpressed = primitive->decompression_scale.is_valid();
|
||||
bool with_zp = primitive->decompression_zero_point.is_valid();
|
||||
if (commpressed) {
|
||||
params.compressed = true;
|
||||
params.decompression_scale = convert_data_tensor(impl_param.get_input_layout(2));
|
||||
if (with_zp) {
|
||||
params.has_decompression_zp = true;
|
||||
params.decompression_zero_point = convert_data_tensor(impl_param.get_input_layout(3));
|
||||
} else if (primitive->decompression_zero_point_scalar.has_value()) {
|
||||
params.has_decompression_zp = true;
|
||||
params.scalar_zp = true;
|
||||
params.zp_value = primitive->decompression_zero_point_scalar.value();
|
||||
}
|
||||
}
|
||||
|
||||
return {params, optional_params};
|
||||
}
|
||||
|
||||
@@ -151,6 +181,8 @@ attach_gather_impl::attach_gather_impl() {
|
||||
data_types::f16,
|
||||
data_types::i8,
|
||||
data_types::u8,
|
||||
data_types::i4,
|
||||
data_types::u4,
|
||||
data_types::i32
|
||||
};
|
||||
|
||||
@@ -190,6 +222,8 @@ attach_gather_impl::attach_gather_impl() {
|
||||
std::make_tuple(data_types::i32, format::bfyx),
|
||||
std::make_tuple(data_types::i8, format::bfyx),
|
||||
std::make_tuple(data_types::u8, format::bfyx),
|
||||
std::make_tuple(data_types::i4, format::bfyx),
|
||||
std::make_tuple(data_types::u4, format::bfyx),
|
||||
|
||||
std::make_tuple(data_types::f32, format::bfzyx),
|
||||
std::make_tuple(data_types::f16, format::bfzyx),
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/int4_utils.cl"
|
||||
|
||||
#ifdef INDEX_DIM
|
||||
inline uint FUNC(get_positive_index)(int in)
|
||||
@@ -25,6 +26,12 @@ KERNEL(gather_ref)(
|
||||
OPTIONAL_SHAPE_INFO_ARG
|
||||
const __global INPUT0_TYPE* dictionary,
|
||||
const __global INPUT1_TYPE* indices,
|
||||
#if DECOMPRESSION_SCALE_TERM
|
||||
const __global DECOMPRESSION_SCALE_TYPE* decompression_scale,
|
||||
#endif
|
||||
#if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR
|
||||
const __global DECOMPRESSION_ZP_TYPE* decompression_zp,
|
||||
#endif
|
||||
__global OUTPUT_TYPE* output
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
, FUSED_OPS_DECLS
|
||||
@@ -54,13 +61,50 @@ KERNEL(gather_ref)(
|
||||
const uint dictionary_idx = GET_DICTIONARY_INDEX(DICTIONARY_INDEX_ORDER);
|
||||
const uint output_idx = GET_INDEX(OUTPUT,,ORDER);
|
||||
|
||||
#if COMPRESSED_WEIGHTS
|
||||
OUTPUT_TYPE val = OUTPUT_VAL_ZERO;
|
||||
|
||||
#if GATHER_AXIS_SHAPE_INFO_INDEX
|
||||
INPUT0_TYPE val = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < shape_info[GATHER_AXIS_SHAPE_INFO_INDEX]) ? dictionary[dictionary_idx] : 0;
|
||||
#elif AXIS_DIM
|
||||
INPUT0_TYPE val = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < AXIS_DIM) ? dictionary[dictionary_idx] : 0;
|
||||
#if GATHER_AXIS_SHAPE_INFO_INDEX
|
||||
bool need_decompress = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < shape_info[GATHER_AXIS_SHAPE_INFO_INDEX]) ? true : false;
|
||||
#elif AXIS_DIM
|
||||
bool need_decompress = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < AXIS_DIM) ? true : false;
|
||||
#else
|
||||
bool need_decompress = true;
|
||||
#endif
|
||||
|
||||
if (need_decompress) {
|
||||
#if DECOMPRESSION_ZP_TERM
|
||||
#if DECOMPRESSION_ZP_SCALAR
|
||||
OUTPUT_TYPE zp = DECOMPRESSION_ZP_VALUE;
|
||||
#else
|
||||
const uint zp_offset = dictionary_idx / DECOMPRESSION_ZP_GROUP_SIZE;
|
||||
OUTPUT_TYPE zp = TO_OUTPUT_TYPE(decompression_zp[zp_offset]);
|
||||
#endif
|
||||
#else
|
||||
OUTPUT_TYPE zp = OUTPUT_VAL_ZERO;
|
||||
#endif
|
||||
const uint decomp_offset = dictionary_idx / DECOMPRESSION_SCALE_GROUP_SIZE;
|
||||
DECOMPRESSION_SCALE_TYPE scale = decompression_scale[decomp_offset];
|
||||
|
||||
#if COMPRESSED_WEIGHTS_INT8
|
||||
OUTPUT_TYPE val_compressed = dictionary[dictionary_idx];
|
||||
val = (val_compressed - zp) * scale;
|
||||
#elif COMPRESSED_WEIGHTS_INT4
|
||||
INPUT0_TYPE val_packed = dictionary[dictionary_idx / 2];
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) val_unpacked = UNPACK_INT4x2(OUTPUT_TYPE, *((INT4_PACKED_TYPE*)&val_packed));
|
||||
|
||||
OUTPUT_TYPE val_compressed = ((OUTPUT_TYPE*)(&val_unpacked))[dictionary_idx % 2];
|
||||
val = (val_compressed - zp) * scale;
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
INPUT0_TYPE val = dictionary[dictionary_idx];
|
||||
#if GATHER_AXIS_SHAPE_INFO_INDEX
|
||||
INPUT0_TYPE val = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < shape_info[GATHER_AXIS_SHAPE_INFO_INDEX]) ? dictionary[dictionary_idx] : 0;
|
||||
#elif AXIS_DIM
|
||||
INPUT0_TYPE val = (INPUT_AXIS_INDEX >= 0 && INPUT_AXIS_INDEX < AXIS_DIM) ? dictionary[dictionary_idx] : 0;
|
||||
#else
|
||||
INPUT0_TYPE val = dictionary[dictionary_idx];
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if HAS_FUSED_OPS
|
||||
|
||||
@@ -40,6 +40,8 @@ ParamsKey GatherKernelRef::GetSupportedKey() const {
|
||||
k.EnableInputDataType(Datatype::INT32);
|
||||
k.EnableInputDataType(Datatype::UINT8);
|
||||
k.EnableInputDataType(Datatype::INT8);
|
||||
k.EnableInputDataType(Datatype::UINT4);
|
||||
k.EnableInputDataType(Datatype::INT4);
|
||||
|
||||
k.EnableOutputDataType(Datatype::F16);
|
||||
k.EnableOutputDataType(Datatype::F32);
|
||||
@@ -254,6 +256,42 @@ JitConstants GatherKernelRef::GetJitConstants(const gather_params& params) const
|
||||
jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
|
||||
}
|
||||
|
||||
if (params.compressed) {
|
||||
jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS", 1)});
|
||||
if (params.inputs[0].GetDType() == Datatype::INT8 || params.inputs[0].GetDType() == Datatype::UINT8) {
|
||||
jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS_INT8", 1)});
|
||||
} else if (params.inputs[0].GetDType() == Datatype::INT4 || params.inputs[0].GetDType() == Datatype::UINT4) {
|
||||
jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS_INT4", 1)});
|
||||
}
|
||||
|
||||
auto wt = params.inputs[0].GetDType();
|
||||
if (wt == Datatype::UINT4) {
|
||||
jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE", WeightsType::UINT4, 2));
|
||||
} else if (wt == Datatype::INT4) {
|
||||
jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE", WeightsType::INT4, 2));
|
||||
}
|
||||
|
||||
const size_t scale_groups_num = params.decompression_scale.LogicalSize();
|
||||
const size_t scale_group_size = params.inputs[0].LogicalSize() / scale_groups_num;
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_TERM", 1)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE", params.decompression_scale)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_GROUPS_NUM", scale_groups_num)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_GROUP_SIZE", scale_group_size)});
|
||||
if (params.has_decompression_zp) {
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_TERM", 1)});
|
||||
if (params.scalar_zp) {
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_VALUE", params.zp_value)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_SCALAR", 1)});
|
||||
} else {
|
||||
const size_t zp_groups_num = params.decompression_zero_point.LogicalSize();
|
||||
const size_t zp_group_size = params.inputs[0].LogicalSize() / zp_groups_num;
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP", params.decompression_zero_point)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_GROUPS_NUM", zp_groups_num)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_GROUP_SIZE", zp_group_size)});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return jit;
|
||||
}
|
||||
|
||||
@@ -321,6 +359,13 @@ KernelsData GatherKernelRef::GetKernelsData(const Params& params, const optional
|
||||
|
||||
GetUpdateDispatchDataFunc(kd);
|
||||
|
||||
int inputs_count = 2;
|
||||
if (newParams.compressed) {
|
||||
inputs_count++;
|
||||
if (newParams.has_decompression_zp && !newParams.scalar_zp)
|
||||
inputs_count++;
|
||||
}
|
||||
|
||||
FillCLKernelData(kernel,
|
||||
dispatchData,
|
||||
params.engineInfo,
|
||||
@@ -330,7 +375,7 @@ KernelsData GatherKernelRef::GetKernelsData(const Params& params, const optional
|
||||
"",
|
||||
false,
|
||||
false,
|
||||
2,
|
||||
inputs_count,
|
||||
GetFusedPrimitiveInputsCount(params),
|
||||
1,
|
||||
newParams.has_dynamic_tensors());
|
||||
|
||||
@@ -16,6 +16,13 @@ struct gather_params : public base_params {
|
||||
GatherAxis axis;
|
||||
int64_t batch_dim;
|
||||
bool support_neg_ind;
|
||||
|
||||
bool compressed = false;
|
||||
bool has_decompression_zp = false;
|
||||
bool scalar_zp = false;
|
||||
float zp_value = 0.0f;
|
||||
DataTensor decompression_scale;
|
||||
DataTensor decompression_zero_point;
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -6,18 +6,29 @@
|
||||
#include "intel_gpu/plugin/common_utils.hpp"
|
||||
#include "transformations/utils/utils.hpp"
|
||||
|
||||
#include "openvino/op/constant.hpp"
|
||||
#include "openvino/op/gather.hpp"
|
||||
#include "intel_gpu/op/gather_compressed.hpp"
|
||||
|
||||
#include "intel_gpu/primitives/gather.hpp"
|
||||
#include "intel_gpu/primitives/reorder.hpp"
|
||||
#include "intel_gpu/primitives/reshape.hpp"
|
||||
#include "intel_gpu/primitives/crop.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace op {
|
||||
namespace internal {
|
||||
using GatherCompressed = ov::intel_gpu::op::GatherCompressed;
|
||||
} // namespace internal
|
||||
} // namespace op
|
||||
} // namespace ov
|
||||
|
||||
namespace ov {
|
||||
namespace intel_gpu {
|
||||
|
||||
template <typename T>
|
||||
void CreateGatherOpBase(ProgramBuilder& p, const std::shared_ptr<T>& op, const int64_t batch_dim = 0, bool support_neg_ind = false) {
|
||||
void CreateGatherOpBase(ProgramBuilder& p, const std::shared_ptr<T>& op, const int64_t batch_dim = 0, bool support_neg_ind = false,
|
||||
bool weights_compressed = false) {
|
||||
auto inputs = p.GetInputInfo(op);
|
||||
std::string layerName = layer_type_name_ID(op);
|
||||
|
||||
@@ -120,15 +131,47 @@ void CreateGatherOpBase(ProgramBuilder& p, const std::shared_ptr<T>& op, const i
|
||||
auto cropPrim = cldnn::crop(layerName, reordered_inputs[0], outTensor, offsetTensor);
|
||||
p.add_primitive(*op, cropPrim);
|
||||
} else {
|
||||
auto gatherPrim = cldnn::gather(layerName,
|
||||
reordered_inputs[0],
|
||||
reordered_inputs[1],
|
||||
axis,
|
||||
input_rank,
|
||||
out_shape,
|
||||
batch_dim,
|
||||
support_neg_ind);
|
||||
p.add_primitive(*op, gatherPrim);
|
||||
if (!weights_compressed) {
|
||||
auto gatherPrim = cldnn::gather(layerName,
|
||||
reordered_inputs[0],
|
||||
reordered_inputs[1],
|
||||
axis,
|
||||
input_rank,
|
||||
out_shape,
|
||||
batch_dim,
|
||||
support_neg_ind);
|
||||
p.add_primitive(*op, gatherPrim);
|
||||
} else {
|
||||
float zp_value = 0.0f;
|
||||
bool has_scalar_zp = false;
|
||||
if (op->get_input_size() == 5) {
|
||||
std::shared_ptr<ov::op::v0::Constant> zp_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(op->get_input_node_shared_ptr(4));
|
||||
if (zp_const && ov::shape_size(zp_const->get_output_shape(0)) == 1) {
|
||||
has_scalar_zp = true;
|
||||
zp_value = zp_const->cast_vector<float>()[0];
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::intel_gpu::op::GatherCompressed> op_compressed = std::dynamic_pointer_cast<ov::intel_gpu::op::GatherCompressed>(op);
|
||||
|
||||
auto gatherPrim = cldnn::gather(layerName,
|
||||
reordered_inputs[0],
|
||||
reordered_inputs[1],
|
||||
axis,
|
||||
reordered_inputs[3],
|
||||
has_scalar_zp ? cldnn::input_info() : reordered_inputs[4],
|
||||
op_compressed->get_output_type(),
|
||||
input_rank,
|
||||
out_shape,
|
||||
batch_dim,
|
||||
support_neg_ind);
|
||||
|
||||
if (has_scalar_zp) {
|
||||
gatherPrim.decompression_zero_point_scalar = zp_value;
|
||||
}
|
||||
|
||||
p.add_primitive(*op, gatherPrim);
|
||||
}
|
||||
}
|
||||
|
||||
// Add reorder and reshape for scalar indice
|
||||
@@ -174,5 +217,12 @@ static void CreateGatherOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v8::
|
||||
|
||||
REGISTER_FACTORY_IMPL(v8, Gather);
|
||||
|
||||
// Factory for the internal GatherCompressed op: delegates to the common gather
// builder with support_neg_ind = true and weights_compressed = true.
static void CreateGatherCompressedOp(ProgramBuilder& p, const std::shared_ptr<ov::op::internal::GatherCompressed>& op) {
    // 4 inputs: data, indices, axis, decompression scale; optional 5th input is the zero point.
    validate_inputs_count(op, {4, 5});
    // NOTE(review): with only 4 inputs, the compressed branch of CreateGatherOpBase
    // still reads reordered_inputs[4] (zero point) when no scalar zp was detected —
    // verify this cannot index out of range.
    CreateGatherOpBase<ov::op::internal::GatherCompressed>(p, op, op->get_batch_dims(), true, true);
}
|
||||
|
||||
REGISTER_FACTORY_IMPL(internal, GatherCompressed);
|
||||
|
||||
} // namespace intel_gpu
|
||||
} // namespace ov
|
||||
|
||||
@@ -0,0 +1,131 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "convert_gather_to_compressed.hpp"
|
||||
#include <memory>
|
||||
|
||||
#include "intel_gpu/op/gather_compressed.hpp"
|
||||
|
||||
#include "openvino/opsets/opset10.hpp"
|
||||
#include "openvino/op/constant.hpp"
|
||||
#include "openvino/op/subtract.hpp"
|
||||
#include "openvino/op/convert.hpp"
|
||||
#include "openvino/op/reshape.hpp"
|
||||
#include "openvino/core/rt_info.hpp"
|
||||
#include "openvino/pass/pattern/op/pattern.hpp"
|
||||
#include "openvino/pass/pattern/op/wrap_type.hpp"
|
||||
#include "openvino/pass/pattern/op/or.hpp"
|
||||
#include "transformations/utils/utils.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_gpu {
|
||||
|
||||
ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
    using namespace ov::pass::pattern;

    // Dictionary constant eligible for compression: low-precision integer type,
    // a single consumer, and a 2D or 3D shape.
    auto compressed_constant = [](const ov::Output<ov::Node>& output) {
        return (output.get_element_type() == ov::element::u8 ||
                output.get_element_type() == ov::element::i8 ||
                output.get_element_type() == ov::element::u4 ||
                output.get_element_type() == ov::element::i4) &&
               output.get_target_inputs().size() == 1 &&
               (output.get_shape().size() == 2 ||
                output.get_shape().size() == 3);
    };

    // Accept a Reshape only when it folds a 3D tensor into 2D.
    auto reshape_3d_to_2d = [](const ov::Output<ov::Node>& output) {
        auto in_ps = output.get_node()->get_input_partial_shape(0);
        auto out_ps = output.get_node()->get_output_partial_shape(0);
        return in_ps.rank().is_static() && out_ps.rank().is_static() && in_ps.size() == 3 && out_ps.size() == 2;
    };

    // Decompression subgraph pattern:
    //   Constant -> Convert -> [Subtract(zp)] -> Multiply(scale) [-> Reshape] [-> Convert] -> Gather(data input)
    auto dicts_m = wrap_type<ov::op::v0::Constant>(compressed_constant);
    auto convert_m = wrap_type<ov::op::v0::Convert>({dicts_m});

    auto sub_const_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
    auto subtract_m = wrap_type<ov::op::v1::Subtract>({convert_m, sub_const_m});

    auto mul_const_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
    auto mul_with_sub_m = wrap_type<ov::op::v1::Multiply>({subtract_m, mul_const_m});
    auto mul_no_sub_m = wrap_type<ov::op::v1::Multiply>({convert_m, mul_const_m});
    auto mul_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{mul_with_sub_m, mul_no_sub_m});

    auto reshape_const_m = wrap_type<ov::op::v0::Constant>();
    auto reshape_m = wrap_type<ov::op::v1::Reshape>({mul_m, reshape_const_m}, reshape_3d_to_2d);

    auto last_convert_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{reshape_m, mul_m});
    auto last_convert_m = wrap_type<opset10::Convert>({last_convert_input});

    auto dicts_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{reshape_m, last_convert_m, mul_m});
    auto gather_m = wrap_type<opset10::Gather>({dicts_input_m, any_input(), any_input()});

    ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
        const auto& pattern_map = m.get_pattern_value_map();
        OPENVINO_ASSERT(pattern_map.count(gather_m));
        OPENVINO_ASSERT(pattern_map.count(mul_const_m));
        OPENVINO_ASSERT(pattern_map.count(dicts_m));
        OPENVINO_ASSERT(pattern_map.count(convert_m));
        auto gather_node = std::dynamic_pointer_cast<opset10::Gather>(pattern_map.at(gather_m).get_node_shared_ptr());
        if (!gather_node || transformation_callback(gather_node)) {
            return false;
        }

        // Folds a 3D constant into 2D (first dim kept, last two merged);
        // 2D constants are returned unchanged.
        auto reshape_const_to_2d = [](std::shared_ptr<ov::Node> node) {
            auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
            OPENVINO_ASSERT(constant != nullptr);
            ov::Shape current_shape = constant->get_shape();
            if (current_shape.size() == 2)
                return constant;
            OPENVINO_ASSERT(current_shape.size() == 3);

            auto new_shape = ov::Shape{current_shape[0], current_shape[1] * current_shape[2]};

            return std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
        };

        std::shared_ptr<ov::Node> gather_input_a = reshape_const_to_2d(pattern_map.at(dicts_m).get_node_shared_ptr());
        const auto& gather_input_b = gather_node->get_input_node_shared_ptr(1);
        const auto& gather_input_c = gather_node->get_input_node_shared_ptr(2);
        const auto& scale = reshape_const_to_2d(pattern_map.at(mul_const_m).get_node_shared_ptr());
        std::shared_ptr<ov::Node> optional_zero_point = nullptr;

        const bool with_zero_point = pattern_map.count(subtract_m) > 0;
        if (with_zero_point) {
            optional_zero_point = reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr());
        }

        std::shared_ptr<ov::Node> gather_input_scale = scale;
        std::shared_ptr<ov::Node> gather_input_zp = optional_zero_point;
        std::vector<std::shared_ptr<ov::Node>> result_nodes = {};

        // Build the fused op; the output element type of the original Gather is
        // preserved as the decompressed output type.
        std::shared_ptr<ov::Node> new_gather_node = nullptr;
        if (with_zero_point) {
            new_gather_node = std::make_shared<op::GatherCompressed>(gather_input_a,
                                                                     gather_input_b,
                                                                     gather_input_c,
                                                                     gather_input_scale,
                                                                     gather_input_zp,
                                                                     gather_node->get_output_element_type(0));
        } else {
            new_gather_node = std::make_shared<op::GatherCompressed>(gather_input_a,
                                                                     gather_input_b,
                                                                     gather_input_c,
                                                                     gather_input_scale,
                                                                     gather_node->get_output_element_type(0));
        }

        result_nodes.push_back(new_gather_node);
        new_gather_node->set_friendly_name(gather_node->get_friendly_name());
        ov::copy_runtime_info(m.get_matched_nodes(), result_nodes);
        ov::replace_node(gather_node, new_gather_node);
        return true;
    };

    auto m = std::make_shared<ov::pass::pattern::Matcher>(gather_m, "ConvertGatherToGatherCompressed");
    this->register_matcher(m, callback);
}
|
||||
|
||||
} // namespace intel_gpu
|
||||
} // namespace ov
|
||||
@@ -0,0 +1,19 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "openvino/pass/graph_rewrite.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_gpu {
|
||||
|
||||
/// @brief Matcher pass that fuses a weight-decompression subgraph
/// (Constant -> Convert -> [Subtract] -> Multiply [-> Reshape] [-> Convert])
/// feeding a Gather's data input into a single GatherCompressed operation.
class ConvertGatherToGatherCompressed: public ov::pass::MatcherPass {
public:
    OPENVINO_RTTI("ConvertGatherToGatherCompressed", "0");
    ConvertGatherToGatherCompressed();
};
|
||||
|
||||
} // namespace intel_gpu
|
||||
} // namespace ov
|
||||
@@ -0,0 +1,76 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "intel_gpu/op/gather_compressed.hpp"
|
||||
#include "gather_shape_inference.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_gpu {
|
||||
namespace op {
|
||||
|
||||
// Constructs GatherCompressed with both decompression inputs.
// Inputs 0..2 are the regular Gather inputs; 3 = scale, 4 = zero point.
GatherCompressed::GatherCompressed(const ov::Output<Node>& data,
                                   const ov::Output<Node>& indices,
                                   const ov::Output<Node>& axis,
                                   const ov::Output<Node>& decompression_scale,
                                   const ov::Output<Node>& decompression_zero_point,
                                   const ov::element::Type output_type)
    : ov::op::v8::Gather({data, indices, axis}), m_output_type(output_type) {
    set_argument(3, decompression_scale);
    set_argument(4, decompression_zero_point);
    validate_and_infer_types();
}
|
||||
|
||||
// Constructs GatherCompressed with a scale input only (no zero point).
// Inputs 0..2 are the regular Gather inputs; 3 = scale.
GatherCompressed::GatherCompressed(const ov::Output<Node>& data,
                                   const ov::Output<Node>& indices,
                                   const ov::Output<Node>& axis,
                                   const ov::Output<Node>& decompression_scale,
                                   const ov::element::Type output_type)
    : ov::op::v8::Gather({data, indices, axis}), m_output_type(output_type) {
    set_argument(3, decompression_scale);
    validate_and_infer_types();
}
|
||||
|
||||
// Clones the op, dispatching to the matching constructor by input count:
// 4 inputs -> scale-only variant, 5 inputs -> scale + zero point variant.
std::shared_ptr<ov::Node> GatherCompressed::clone_with_new_inputs(const ov::OutputVector& new_args) const {
    check_new_args_count(this, new_args);

    switch (new_args.size()) {
    case 4:
        // data, indices, axis, decompression scale
        return std::make_shared<GatherCompressed>(new_args.at(0),
                                                  new_args.at(1),
                                                  new_args.at(2),
                                                  new_args.at(3),
                                                  m_output_type);
    case 5:
        // data, indices, axis, decompression scale, decompression zero point
        return std::make_shared<GatherCompressed>(new_args.at(0),
                                                  new_args.at(1),
                                                  new_args.at(2),
                                                  new_args.at(3),
                                                  new_args.at(4),
                                                  m_output_type);
    default:
        OPENVINO_THROW("Unexpected inputs count for GatherCompressed op: ", new_args.size());
    }
}
|
||||
|
||||
void GatherCompressed::validate_and_infer_types() {
|
||||
const auto input_size = get_input_size();
|
||||
NODE_VALIDATION_CHECK(this,
|
||||
input_size >= 3,
|
||||
"Number of inputs is incorrect. Current value is: ",
|
||||
input_size,
|
||||
", expected at least 3.");
|
||||
|
||||
auto out_shapes = ov::op::shape_infer(this, std::vector<ov::PartialShape>{get_input_partial_shape(0),
|
||||
get_input_partial_shape(1), get_input_partial_shape(2)});
|
||||
|
||||
auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type;
|
||||
set_output_type(0, output_type, out_shapes[0]);
|
||||
}
|
||||
|
||||
// Serializes the attribute this subclass adds on top of v8::Gather.
// NOTE(review): base-class attributes (e.g. Gather's batch_dims) are not
// visited here — harmless while the constructors never set them, but confirm
// if batch_dims support is ever added to this op.
bool GatherCompressed::visit_attributes(ov::AttributeVisitor &visitor) {
    visitor.on_attribute("output_type", m_output_type);
    return true;
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace intel_gpu
|
||||
} // namespace ov
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "openvino/op/group_conv.hpp"
|
||||
#include "openvino/op/multiply.hpp"
|
||||
#include "openvino/op/util/sub_graph_base.hpp"
|
||||
#include "openvino/op/gather.hpp"
|
||||
|
||||
#include "openvino/pass/manager.hpp"
|
||||
#include "openvino/pass/constant_folding.hpp"
|
||||
@@ -116,6 +117,7 @@
|
||||
#include "plugin/transformations/convert_matmul_to_fc.hpp"
|
||||
#include "plugin/transformations/move_fc_reshape_to_weights.hpp"
|
||||
#include "plugin/transformations/convert_fc_to_compressed.hpp"
|
||||
#include "plugin/transformations/convert_gather_to_compressed.hpp"
|
||||
#include "plugin/transformations/rms_fusion.hpp"
|
||||
#include "plugin/transformations/binary_conv_to_conv.hpp"
|
||||
#include "plugin/transformations/move_convert_after_gather.hpp"
|
||||
@@ -147,7 +149,7 @@ static bool disable_reduce_decomposition(const std::shared_ptr<const ov::Node> n
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool is_non_decompression_multiply(const std::shared_ptr<const ov::Node> node) {
|
||||
static bool is_non_supported_decompression_op(const std::shared_ptr<const ov::Node> node) {
|
||||
auto get_single_consumer = [](const std::shared_ptr<const ov::Node> node) -> std::shared_ptr<ov::Node> {
|
||||
const auto consumers = node->get_output_target_inputs(0);
|
||||
if (consumers.size() != 1)
|
||||
@@ -159,17 +161,17 @@ static bool is_non_decompression_multiply(const std::shared_ptr<const ov::Node>
|
||||
if (!consumer)
|
||||
return true;
|
||||
|
||||
if (ov::is_type<ov::opset1::MatMul>(consumer)) {
|
||||
if (ov::is_type<ov::opset1::MatMul>(consumer) || ov::is_type<ov::op::v8::Gather>(consumer)) {
|
||||
return false;
|
||||
} else if (ov::is_type<ov::opset1::Reshape>(consumer)) {
|
||||
consumer = get_single_consumer(consumer);
|
||||
if (consumer != nullptr && ov::is_type<ov::opset1::MatMul>(consumer)) {
|
||||
if (consumer != nullptr && (ov::is_type<ov::opset1::MatMul>(consumer) || ov::is_type<ov::op::v8::Gather>(consumer))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (consumer != nullptr && ov::is_type<ov::opset1::Convert>(consumer)) {
|
||||
consumer = get_single_consumer(consumer);
|
||||
if (consumer != nullptr && ov::is_type<ov::opset1::MatMul>(consumer)) {
|
||||
if (consumer != nullptr && (ov::is_type<ov::opset1::MatMul>(consumer) || ov::is_type<ov::op::v8::Gather>(consumer))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -274,7 +276,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
|
||||
ov::element::i4}, true);
|
||||
|
||||
// Ignore nodes that are not related to FullyConnected and allow ConstantFolding to be applied to them
|
||||
initial_transformations_manager.get_pass_config()->set_callback<ov::pass::MarkDequantizationSubgraph>(is_non_decompression_multiply);
|
||||
initial_transformations_manager.get_pass_config()->set_callback<ov::pass::MarkDequantizationSubgraph>(is_non_supported_decompression_op);
|
||||
initial_transformations_manager.run_passes(func);
|
||||
|
||||
ov::pass::Manager manager;
|
||||
@@ -697,6 +699,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
|
||||
manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>();
|
||||
manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
|
||||
manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>();
|
||||
manager.register_pass<ov::intel_gpu::ConvertGatherToGatherCompressed>();
|
||||
manager.register_pass<ov::intel_gpu::RMSFusion>();
|
||||
|
||||
// This is supposed to be the last pass to ensure that we don't have name collisions until
|
||||
|
||||
@@ -2161,3 +2161,169 @@ TEST(gather_single_axis, simple_Baxis) {
|
||||
auto crop_prim = network.get_primitive("gather");
|
||||
ASSERT_EQ(crop_prim->can_be_optimized(), false);
|
||||
}
|
||||
|
||||
// Functional tests for the gather primitive with compressed (u8) weights.
// Each test dequantizes per-row: with a zero point the expected values are
// (w - zp) * scale; without one they are w * scale. Gathering is done along
// axis 1 with batch_dims = 1, i.e. row i of the indices selects columns from
// row i of the dequantized weights.
class gather_gpu_tests: public ::testing::Test {
public:
    // u8 weights with per-row f32 scale AND zero point; f32 output.
    void test_compressed_scale_zp(bool is_caching_test) {
        auto& engine = get_test_engine();

        // indices: {2, 3}; weights: {2, 5}; scale/zp: one value per weight row.
        auto input_mem = engine.allocate_memory({ {2, 3}, data_types::i32, format::bfyx });
        auto weights_mem = engine.allocate_memory({ {2, 5}, data_types::u8, format::bfyx });
        auto scale_mem = engine.allocate_memory({ {2, 1}, data_types::f32, format::bfyx });
        auto zp_mem = engine.allocate_memory({ {2, 1}, data_types::f32, format::bfyx });

        set_values(input_mem, { 0, 0, 4,
                                4, 0, 0 });
        set_values<uint8_t>(weights_mem, { 1, 2, 3, 4, 5,
                                           6, 7, 8, 9, 10});
        set_values(scale_mem, { 2.0f, 4.0f });
        set_values(zp_mem, { 1.0f, 2.0f });

        // gather(data = compressed weights, indices = "input", axis = 1,
        //        scale, zp, output type, indices rank, output shape, batch_dims)
        topology topology(
            input_layout("input", input_mem->get_layout()),
            data("weights", weights_mem),
            data("scale", scale_mem),
            data("zp", zp_mem),
            gather("gather_prim", input_info("weights"), input_info("input"), 1,
                   input_info("scale"), input_info("zp"), data_types::f32, 2, ov::Shape{2, 3}, 1)
        );

        auto config = get_test_default_config(engine);
        // Compressed-weights gather relies on the dynamic (new) shape-infer path.
        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
        network->set_input_data("input", input_mem);

        auto outputs = network->execute();
        ASSERT_EQ(outputs.size(), size_t(1));
        ASSERT_EQ(outputs.begin()->first, "gather_prim");

        auto output_mem = outputs.begin()->second.get_memory();

        cldnn::mem_lock<float> output_ptr (output_mem, get_test_stream());

        ov::PartialShape expected_shape{2, 3};
        ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());

        // Row 0 dequantized: (w-1)*2 = {0,2,4,6,8}, indices {0,0,4} -> 0,0,8.
        // Row 1 dequantized: (w-2)*4 = {16,20,24,28,32}, indices {4,0,0} -> 32,16,16.
        std::vector<float> expected_result = {0.0f, 0.0f, 8.0f, 32.0f, 16.0f, 16.0f};

        for (size_t i = 0; i < expected_result.size(); i++) {
            ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
        }
    }

    // u8 weights with per-row f32 scale only (no zero point); f32 output.
    void test_compressed_scale(bool is_caching_test) {
        auto& engine = get_test_engine();

        auto input_mem = engine.allocate_memory({ {2, 3}, data_types::i32, format::bfyx });
        auto weights_mem = engine.allocate_memory({ {2, 5}, data_types::u8, format::bfyx });
        auto scale_mem = engine.allocate_memory({ {2, 1}, data_types::f32, format::bfyx });

        set_values(input_mem, { 0, 0, 4,
                                4, 0, 0 });
        set_values<uint8_t>(weights_mem, { 1, 2, 3, 4, 5,
                                           6, 7, 8, 9, 10});
        set_values(scale_mem, { 2.0f, 4.0f });

        // Empty input_info("") marks the zero-point input as absent.
        topology topology(
            input_layout("input", input_mem->get_layout()),
            data("weights", weights_mem),
            data("scale", scale_mem),
            gather("gather_prim", input_info("weights"), input_info("input"), 1,
                   input_info("scale"), input_info(""), data_types::f32, 2, ov::Shape{2, 3}, 1)
        );

        auto config = get_test_default_config(engine);
        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
        network->set_input_data("input", input_mem);

        auto outputs = network->execute();
        ASSERT_EQ(outputs.size(), size_t(1));
        ASSERT_EQ(outputs.begin()->first, "gather_prim");

        auto output_mem = outputs.begin()->second.get_memory();

        cldnn::mem_lock<float> output_ptr (output_mem, get_test_stream());

        ov::PartialShape expected_shape{2, 3};
        ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());

        // Row 0: w*2 = {2,4,6,8,10}, indices {0,0,4} -> 2,2,10.
        // Row 1: w*4 = {24,28,32,36,40}, indices {4,0,0} -> 40,24,24.
        std::vector<float> expected_result = {2.0f, 2.0f, 10.0f, 40.0f, 24.0f, 24.0f};

        for (size_t i = 0; i < expected_result.size(); i++) {
            ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
        }
    }

    // Same as test_compressed_scale but with an f16 scale and f16 output.
    void test_compressed_scale_fp16(bool is_caching_test) {
        auto& engine = get_test_engine();

        auto input_mem = engine.allocate_memory({ {2, 3}, data_types::i32, format::bfyx });
        auto weights_mem = engine.allocate_memory({ {2, 5}, data_types::u8, format::bfyx });
        auto scale_mem = engine.allocate_memory({ {2, 1}, data_types::f16, format::bfyx });

        set_values(input_mem, { 0, 0, 4,
                                4, 0, 0 });
        set_values<uint8_t>(weights_mem, { 1, 2, 3, 4, 5,
                                           6, 7, 8, 9, 10});
        set_values<ov::float16>(scale_mem, { ov::float16(2.0f), ov::float16(4.0f) });

        topology topology(
            input_layout("input", input_mem->get_layout()),
            data("weights", weights_mem),
            data("scale", scale_mem),
            gather("gather_prim", input_info("weights"), input_info("input"), 1,
                   input_info("scale"), input_info(""), data_types::f16, 2, ov::Shape{2, 3}, 1)
        );

        auto config = get_test_default_config(engine);
        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
        network->set_input_data("input", input_mem);

        auto outputs = network->execute();
        ASSERT_EQ(outputs.size(), size_t(1));
        ASSERT_EQ(outputs.begin()->first, "gather_prim");

        auto output_mem = outputs.begin()->second.get_memory();

        cldnn::mem_lock<ov::float16> output_ptr (output_mem, get_test_stream());

        ov::PartialShape expected_shape{2, 3};
        ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());

        // Same expected values as the f32 case, exactly representable in f16.
        std::vector<ov::float16> expected_result = {ov::float16(2), ov::float16(2), ov::float16(10),
                                                    ov::float16(40), ov::float16(24), ov::float16(24)};

        for (size_t i = 0; i < expected_result.size(); i++) {
            ASSERT_FLOAT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
        }
    }
};
|
||||
|
||||
// Each compressed-weights scenario is exercised twice: once with a freshly
// built network (false) and once through the model cache ("_cached", true).
TEST_F(gather_gpu_tests, compressed_scale_zp) {
    this->test_compressed_scale_zp(false);
}

TEST_F(gather_gpu_tests, compressed_scale_zp_cached) {
    this->test_compressed_scale_zp(true);
}

TEST_F(gather_gpu_tests, compressed_scale) {
    this->test_compressed_scale(false);
}

TEST_F(gather_gpu_tests, compressed_scale_cached) {
    this->test_compressed_scale(true);
}

TEST_F(gather_gpu_tests, compressed_scale_fp16) {
    this->test_compressed_scale_fp16(false);
}

TEST_F(gather_gpu_tests, compressed_scale_fp16_cached) {
    this->test_compressed_scale_fp16(true);
}
|
||||
|
||||
@@ -104,8 +104,8 @@ public:
|
||||
const auto primitive_hash = primitve->hash();
|
||||
const auto params_hash = prim_inst->get_impl_params()->hash();
|
||||
|
||||
ASSERT_EQ(primitive_hash, 93320679543770233UL);
|
||||
ASSERT_EQ(params_hash, 1542578941420280552UL);
|
||||
ASSERT_EQ(primitive_hash, 8439414674502129643UL);
|
||||
ASSERT_EQ(params_hash, 9235751886952244871UL);
|
||||
}
|
||||
|
||||
void test_gemm_basic(bool is_caching_test) {
|
||||
|
||||
@@ -0,0 +1,141 @@
|
||||
// Copyright (C) 2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "common_test_utils/ov_test_utils.hpp"
|
||||
|
||||
#include "openvino/core/model.hpp"
|
||||
#include "openvino/pass/manager.hpp"
|
||||
#include "openvino/op/transpose.hpp"
|
||||
#include "openvino/op/convert.hpp"
|
||||
#include "openvino/op/multiply.hpp"
|
||||
#include "openvino/op/parameter.hpp"
|
||||
#include "openvino/op/constant.hpp"
|
||||
#include "openvino/op/reshape.hpp"
|
||||
#include "openvino/op/result.hpp"
|
||||
#include "openvino/op/subtract.hpp"
|
||||
#include "intel_gpu/op/gather_compressed.hpp"
|
||||
|
||||
#include "plugin/transformations/convert_gather_to_compressed.hpp"
|
||||
|
||||
#include <memory>
|
||||
|
||||
using namespace testing;
|
||||
using namespace ov::intel_gpu;
|
||||
|
||||
namespace ov {
|
||||
namespace test {
|
||||
namespace intel_gpu {
|
||||
|
||||
// u8 weights -> Convert(f32) -> Multiply(per-row scale) -> Gather must be
// folded by ConvertGatherToGatherCompressed into a single GatherCompressed
// node taking (weights, indices, axis, scale).
TEST_F(TransformationTestsF, ConvertGatherToCompressed1) {
    {
        auto gather_indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });

        // Decompression subgraph: upcast the quantized weights, then rescale.
        auto upcast = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f32);
        auto decompressed = std::make_shared<ov::op::v1::Multiply>(upcast, scale);
        auto gather_node = std::make_shared<ov::op::v8::Gather>(decompressed, gather_indices, gather_axis);

        model = std::make_shared<ov::Model>(ov::NodeVector{ gather_node }, ov::ParameterVector{ gather_indices });
        manager.register_pass<ConvertGatherToGatherCompressed>();
    }
    {
        auto gather_indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });

        // Expected result: the fused op consumes the raw u8 weights directly.
        auto fused_gather =
            std::make_shared<ov::intel_gpu::op::GatherCompressed>(weights, gather_indices, gather_axis, scale, ov::element::f32);

        model_ref = std::make_shared<ov::Model>(ov::NodeVector{ fused_gather }, ov::ParameterVector{ gather_indices });
    }
}
|
||||
|
||||
// u8 weights -> Convert(f32) -> Subtract(zero point) -> Multiply(scale) ->
// Gather must fold into a single GatherCompressed carrying both the scale
// and the zero point (5-input form).
TEST_F(TransformationTestsF, ConvertGatherToCompressed2) {
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto convert = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f32);
        auto zp_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_const);
        auto scale_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
        auto gather = std::make_shared<ov::op::v8::Gather>(scale, input1, axis_const);

        model = std::make_shared<ov::Model>(ov::NodeVector{ gather }, ov::ParameterVector{ input1 });
        manager.register_pass<ConvertGatherToGatherCompressed>();
    }
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto scale_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        auto zp_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 1 }, { 1 });
        // Expected: fused op takes raw weights plus scale and zero point.
        auto gather_compressed = std::make_shared<ov::intel_gpu::op::GatherCompressed>(weights_const, input1, axis_const, scale_const, zp_const, ov::element::f32);

        model_ref = std::make_shared<ov::Model>(ov::NodeVector{ gather_compressed }, ov::ParameterVector{ input1 });
    }
}
|
||||
|
||||
// Grouped (3-D) decompression: weights {32,4,4} with per-group scale/zp
// {32,4,1}, followed by a Reshape to {-1,16} before the Gather. The pass is
// expected to absorb the Reshape, producing GatherCompressed over 2-D
// weights {32,16} with scale/zp flattened to {32,4}.
TEST_F(TransformationTestsF, ConvertGatherToCompressed3) {
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 4, 4 }, { 1 });
        auto convert = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f32);
        auto zp_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4, 1 }, { 1 });
        auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_const);
        auto scale_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4, 1 }, { 1 });
        auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
        auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { -1, 16 });
        auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false);
        auto gather = std::make_shared<ov::op::v8::Gather>(reshape, input1, axis_const);

        model = std::make_shared<ov::Model>(ov::NodeVector{ gather }, ov::ParameterVector{ input1 });
        manager.register_pass<ConvertGatherToGatherCompressed>();
    }
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
        auto scale_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4 }, { 1 });
        auto zp_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4 }, { 1 });
        auto gather_compressed = std::make_shared<ov::intel_gpu::op::GatherCompressed>(weights_const, input1, axis_const, scale_const, zp_const, ov::element::f32);

        model_ref = std::make_shared<ov::Model>(ov::NodeVector{ gather_compressed }, ov::ParameterVector{ input1 });
    }
}
|
||||
|
||||
// Same grouped pattern as test 3 but with u4 weights and a scalar-like
// (broadcast) zero point {1,1,1}. Expected: GatherCompressed over reshaped
// u4 weights {32,16}, scale flattened to {32,4}, zero point kept as a
// broadcastable {1,1} constant.
TEST_F(TransformationTestsF, ConvertGatherToCompressed4) {
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights_const = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 32, 4, 4 }, { 1 });
        auto convert = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f32);
        auto zp_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 1, 1, 1 }, { 1 });
        auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_const);
        auto scale_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4, 1 }, { 1 });
        auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
        auto reshape_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, { -1, 16 });
        auto reshape = std::make_shared<ov::op::v1::Reshape>(scale, reshape_const, false);
        auto gather = std::make_shared<ov::op::v8::Gather>(reshape, input1, axis_const);

        model = std::make_shared<ov::Model>(ov::NodeVector{ gather }, ov::ParameterVector{ input1 });
        manager.register_pass<ConvertGatherToGatherCompressed>();
    }
    {
        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{ -1, 16 });
        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 1 }, { 1 });
        auto weights_const = ov::op::v0::Constant::create(ov::element::u4, ov::Shape{ 32, 16 }, { 1 });
        auto scale_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 32, 4 }, { 1 });
        auto zp_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{ 1, 1 }, { 1 });
        auto gather_compressed = std::make_shared<ov::intel_gpu::op::GatherCompressed>(weights_const, input1, axis_const, scale_const, zp_const, ov::element::f32);

        model_ref = std::make_shared<ov::Model>(ov::NodeVector{ gather_compressed }, ov::ParameterVector{ input1 });
    }
}
|
||||
|
||||
} // namespace intel_gpu
|
||||
} // namespace test
|
||||
} // namespace ov
|
||||
Reference in New Issue
Block a user