[GPU] Grouped decompression scale/zp support (#20491)
This commit is contained in:
committed by
GitHub
parent
4574fb112c
commit
90ad4c618d
@@ -2,6 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "fully_connected_inst.h"
|
||||
#include "pooling_inst.h"
|
||||
#include "quantize_inst.h"
|
||||
#include "reorder_inst.h"
|
||||
@@ -847,6 +848,42 @@ bool prepare_quantization::optimize_quantize(program &p, quantize_node& quantize
|
||||
return true;
|
||||
}
|
||||
|
||||
// Inserts layout reorders for grouped weights-decompression parameters of a compressed fully connected node.
//
// The decompression scale input, and the zero-point input when the primitive has one, are inspected:
// the last dimension of the input's partial shape is taken as the number of groups. When that count
// is greater than 1, a reorder primitive targeting format::fbyx is inserted between the producing
// dependency and fc_node. Nodes whose primitive does not have compressed_weights set are left untouched.
//
// fc_node - fully connected node whose decompression inputs are examined
// p       - program that receives the inserted reorder nodes
static void optimize_weights_decompression_parameters(fully_connected_node& fc_node, program& p) {
    auto fc_prim = fc_node.get_primitive();
    if (!fc_prim->compressed_weights)
        return;

    // Tells whether the last dimension of the given input's shape is greater than 1.
    // NOTE(review): get_length() presumably expects a static dimension - confirm for dynamic shapes.
    auto has_multiple_groups = [&fc_node](size_t input_idx) {
        auto input_layout = fc_node.get_input_layout(input_idx);
        auto input_pshape = input_layout.get_partial_shape();
        return input_pshape[input_pshape.size() - 1].get_length() > 1;
    };

    // Places a reorder to format::fbyx in front of the given input and refreshes the layout of the new dependency.
    auto insert_fbyx_reorder = [&fc_node, &p](size_t input_idx) {
        auto& producer = fc_node.get_dependency(input_idx);
        auto fbyx_layout = producer.get_output_layout();
        fbyx_layout.format = format::fbyx;

        auto fbyx_reorder = std::make_shared<reorder>(producer.id() + "_reorder", producer.id(), fbyx_layout);
        p.add_intermediate(fbyx_reorder, fc_node, input_idx, true);
        // After insertion the dependency at input_idx is the new reorder node.
        fc_node.get_dependency(input_idx).recalc_output_layout(false);
    };

    // Input index of the decompression scale: 3 when the node has a bias term, 2 otherwise.
    const size_t scale_input_idx = fc_node.bias_term() ? 3 : 2;
    if (has_multiple_groups(scale_input_idx))
        insert_fbyx_reorder(scale_input_idx);

    if (fc_prim->decompression_zero_point.empty())
        return;

    // The zero point occupies the input slot right after the scale.
    const size_t zp_input_idx = scale_input_idx + 1;
    if (has_multiple_groups(zp_input_idx))
        insert_fbyx_reorder(zp_input_idx);
}
|
||||
|
||||
void prepare_quantization::run(program& p) {
|
||||
auto itr = p.get_processing_order().begin();
|
||||
while (itr != p.get_processing_order().end()) {
|
||||
@@ -859,6 +896,8 @@ void prepare_quantization::run(program& p) {
|
||||
remove_fake_reorders(p, node->as<reorder>());
|
||||
} else if (node->is_type<convolution>()) {
|
||||
prepare_asymmetric_quantization(p, node->as<convolution>());
|
||||
} else if (node->is_type<fully_connected>()) {
|
||||
optimize_weights_decompression_parameters(node->as<fully_connected>(), p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,20 +110,13 @@ public:
|
||||
bool has_scale = !primitive->decompression_scale.empty();
|
||||
|
||||
size_t offset = primitive->bias.empty() ? 2 : 3;
|
||||
const auto& weights_pshape = input1_layout.get_partial_shape();
|
||||
if (has_scale) {
|
||||
auto scale_layout = input_layouts[offset++];
|
||||
if (input1_pshape.size() != 2) {
|
||||
scale_layout.set_partial_shape(reshape_to_2d(scale_layout.get_partial_shape(), weights_pshape[0], primitive->weights_rank));
|
||||
}
|
||||
layouts.push_back(scale_layout);
|
||||
}
|
||||
|
||||
if (has_zp) {
|
||||
auto zp_layout = input_layouts[offset];
|
||||
if (input1_pshape.size() != 2) {
|
||||
zp_layout.set_partial_shape(reshape_to_2d(zp_layout.get_partial_shape(), weights_pshape[0], primitive->weights_rank));
|
||||
}
|
||||
layouts.push_back(zp_layout);
|
||||
}
|
||||
|
||||
|
||||
@@ -307,7 +307,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
|
||||
(fmt_prev == format::b_fs_yx_fsv4 &&
|
||||
prev_output_layout.feature() % 32 == 0 &&
|
||||
prev_output_layout.spatial(0) == 1 &&
|
||||
prev_output_layout.spatial(1) == 1)))
|
||||
prev_output_layout.spatial(1) == 1)) && is_input_reorder(prev, next))
|
||||
return true;
|
||||
|
||||
if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv16 && fmt_next == format::b_fs_yx_fsv4 && is_input_idx(0))
|
||||
|
||||
@@ -120,7 +120,7 @@ KERNEL(fc)(
|
||||
uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET;
|
||||
uint weights_offset = out_f * INPUT_ELEMENTS_COUNT;
|
||||
|
||||
#if COMPRESSED_WEIGHTS
|
||||
#if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1
|
||||
#if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % SIMD == 0
|
||||
ACCUMULATOR_VEC_TYPE d_scale = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_scale, out_f);
|
||||
#elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % SIMD != 0
|
||||
@@ -134,9 +134,11 @@ KERNEL(fc)(
|
||||
ACCUMULATOR_VEC_TYPE d_scale = decompression_scale[0];
|
||||
#endif
|
||||
|
||||
#if !DECOMPRESSION_ZP_TERM
|
||||
ACCUMULATOR_VEC_TYPE d_zp = 0;
|
||||
#elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % SIMD == 0
|
||||
ACCUMULATOR_TYPE* d_scales = (ACCUMULATOR_TYPE*)(&d_scale);
|
||||
#endif
|
||||
|
||||
#if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1
|
||||
#if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % SIMD == 0
|
||||
ACCUMULATOR_VEC_TYPE d_zp = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_zp, out_f);
|
||||
#elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % SIMD != 0
|
||||
ACCUMULATOR_VEC_TYPE d_zp = 0;
|
||||
@@ -148,9 +150,7 @@ KERNEL(fc)(
|
||||
#else
|
||||
ACCUMULATOR_VEC_TYPE d_zp = decompression_zp[0];
|
||||
#endif
|
||||
|
||||
ACCUMULATOR_TYPE* ds = (ACCUMULATOR_TYPE*)(&d_scale);
|
||||
ACCUMULATOR_TYPE* dzp = (ACCUMULATOR_TYPE*)(&d_zp);
|
||||
ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp);
|
||||
#endif
|
||||
|
||||
#if REALIGN_FP16_OFFSET
|
||||
@@ -193,7 +193,28 @@ KERNEL(fc)(
|
||||
ACCUMULATOR_TYPE* w = (ACCUMULATOR_TYPE*)(&wei);
|
||||
unroll_for(uint kii = 0; kii < TILE_K; ++kii) {
|
||||
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
w[kii * TILE_OFM + fi] = (w[kii * TILE_OFM + fi] - dzp[fi]) * ds[fi];
|
||||
const uint w_idx = kii * TILE_OFM + fi;
|
||||
const uint offset_ofm = out_f + fi*SIMD + sglid;
|
||||
#if DECOMPRESSION_SCALE_GROUPS_NUM > 1
|
||||
const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH +
|
||||
((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
|
||||
ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
|
||||
#else
|
||||
ACCUMULATOR_TYPE ds = d_scales[fi];
|
||||
#endif
|
||||
|
||||
#if DECOMPRESSION_ZP_TERM
|
||||
#if DECOMPRESSION_ZP_GROUPS_NUM > 1
|
||||
const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH +
|
||||
((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
|
||||
ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset];
|
||||
#else
|
||||
ACCUMULATOR_TYPE dzp = d_zps[fi];
|
||||
#endif
|
||||
#else
|
||||
ACCUMULATOR_TYPE dzp = ACCUMULATOR_VAL_ZERO;
|
||||
#endif
|
||||
w[w_idx] = (w[w_idx] - dzp) * ds;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -230,7 +251,28 @@ KERNEL(fc)(
|
||||
ACCUMULATOR_TYPE* w = (ACCUMULATOR_TYPE*)(&wei);
|
||||
unroll_for(uint kii = 0; kii < TILE_K; ++kii) {
|
||||
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
w[kii * TILE_OFM + fi] = (w[kii * TILE_OFM + fi] - dzp[fi]) * ds[fi];
|
||||
const uint w_idx = kii * TILE_OFM + fi;
|
||||
uint offset_ofm = out_f + fi*SIMD + get_sub_group_local_id();
|
||||
#if DECOMPRESSION_SCALE_GROUPS_NUM > 1
|
||||
const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH +
|
||||
((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
|
||||
ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
|
||||
#else
|
||||
ACCUMULATOR_TYPE ds = d_scales[fi];
|
||||
#endif
|
||||
|
||||
#if DECOMPRESSION_ZP_TERM
|
||||
#if DECOMPRESSION_ZP_GROUPS_NUM > 1
|
||||
const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH +
|
||||
((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
|
||||
ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset];
|
||||
#else
|
||||
ACCUMULATOR_TYPE dzp = d_zps[fi];
|
||||
#endif
|
||||
#else
|
||||
ACCUMULATOR_TYPE dzp = ACCUMULATOR_VAL_ZERO;
|
||||
#endif
|
||||
w[w_idx] = (w[w_idx] - dzp) * ds;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -36,18 +36,24 @@ KERNEL(fc)(
|
||||
for (uint x = 0; x < INPUT0_SIZE_X; ++x)
|
||||
{
|
||||
const uint input0_idx = INPUT0_GET_INDEX(b, ofm, y, x);
|
||||
const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, oym, y, 0, 0);
|
||||
#if COMPRESSED_WEIGHTS
|
||||
ACCUMULATOR_TYPE filter_compressed = TO_ACCUMULATOR_TYPE(weights[filter_idx]);
|
||||
#if DECOMPRESSION_ZP_TERM
|
||||
ACCUMULATOR_TYPE zp = TO_ACCUMULATOR_TYPE(decompression_zp[DECOMPRESSION_ZP_GET_INDEX_SAFE(0, oym, 0, 0)]);
|
||||
const uint zp_offset = DECOMPRESSION_ZP_GET_INDEX_SAFE(oym, y / DECOMPRESSION_ZP_GROUP_SIZE, 0, 0);
|
||||
ACCUMULATOR_TYPE zp = TO_ACCUMULATOR_TYPE(decompression_zp[zp_offset]);
|
||||
#else
|
||||
ACCUMULATOR_TYPE zp = ACCUMULATOR_VAL_ZERO;
|
||||
#endif
|
||||
DECOMPRESSION_SCALE_TYPE scale = decompression_scale[DECOMPRESSION_SCALE_GET_INDEX_SAFE(0, oym, 0, 0)];
|
||||
ACCUMULATOR_TYPE filter_val = (TO_ACCUMULATOR_TYPE(filter_compressed) - TO_ACCUMULATOR_TYPE(zp)) * scale;
|
||||
const uint decomp_offset = DECOMPRESSION_SCALE_GET_INDEX_SAFE(oym, y / DECOMPRESSION_SCALE_GROUP_SIZE, 0, 0);
|
||||
DECOMPRESSION_SCALE_TYPE scale = decompression_scale[decomp_offset];
|
||||
#endif
|
||||
|
||||
#if COMPRESSED_WEIGHTS_INT8
|
||||
const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, oym, y, 0, 0);
|
||||
ACCUMULATOR_TYPE filter_compressed = TO_ACCUMULATOR_TYPE(weights[filter_idx]);
|
||||
ACCUMULATOR_TYPE filter_val = (filter_compressed - zp) * scale;
|
||||
dotProd += (ACCUMULATOR_TYPE)(input[input0_idx]) * (ACCUMULATOR_TYPE)(filter_val);
|
||||
#else
|
||||
const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, oym, y, 0, 0);
|
||||
dotProd += (ACCUMULATOR_TYPE)(input[input0_idx]) * (ACCUMULATOR_TYPE)(weights[filter_idx]);
|
||||
#endif
|
||||
}
|
||||
@@ -67,19 +73,25 @@ KERNEL(fc)(
|
||||
for (uint x = 0; x < INPUT0_SIZE_X; ++x)
|
||||
{
|
||||
const uint input0_idx = INPUT0_GET_INDEX(b, ifm, y, x);
|
||||
const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, ifm, y, x);
|
||||
#if COMPRESSED_WEIGHTS
|
||||
FILTER_TYPE filter_compressed = weights[filter_idx];
|
||||
#if DECOMPRESSION_ZP_TERM
|
||||
ACCUMULATOR_TYPE zp = decompression_zp[DECOMPRESSION_ZP_GET_INDEX_SAFE(0, ofm, 0, 0)];
|
||||
const uint zp_offset = DECOMPRESSION_ZP_GET_INDEX_SAFE(ofm, ifm / DECOMPRESSION_ZP_GROUP_SIZE, 0, 0);
|
||||
ACCUMULATOR_TYPE zp = TO_ACCUMULATOR_TYPE(decompression_zp[zp_offset]);
|
||||
#else
|
||||
ACCUMULATOR_TYPE zp = ACCUMULATOR_VAL_ZERO;
|
||||
#endif
|
||||
const uint decomp_offset = DECOMPRESSION_SCALE_GET_INDEX_SAFE(ofm, ifm / DECOMPRESSION_SCALE_GROUP_SIZE, 0, 0);
|
||||
DECOMPRESSION_SCALE_TYPE scale = decompression_scale[decomp_offset];
|
||||
#endif
|
||||
|
||||
DECOMPRESSION_SCALE_TYPE scale = decompression_scale[DECOMPRESSION_SCALE_GET_INDEX_SAFE(0, ofm, 0, 0)];
|
||||
ACCUMULATOR_TYPE filter_val = (TO_ACCUMULATOR_TYPE(filter_compressed) - TO_ACCUMULATOR_TYPE(zp)) * scale;
|
||||
|
||||
#if COMPRESSED_WEIGHTS_INT8
|
||||
const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, ifm, y, x);
|
||||
FILTER_TYPE filter_compressed = weights[filter_idx];
|
||||
ACCUMULATOR_TYPE filter_val = (TO_ACCUMULATOR_TYPE(filter_compressed) - zp) * scale;
|
||||
dotProd += (ACCUMULATOR_TYPE)(input[input0_idx]) * (ACCUMULATOR_TYPE)(filter_val);
|
||||
#else
|
||||
const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, ifm, y, x);
|
||||
dotProd += (ACCUMULATOR_TYPE)(input[input0_idx]) * (ACCUMULATOR_TYPE)(weights[filter_idx]);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -24,11 +24,23 @@ JitConstants FullyConnectedKernelBase::GetJitConstants(const fully_connected_par
|
||||
|
||||
if (params.compressed) {
|
||||
jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS", 1)});
|
||||
if (params.weights.GetDType() == WeightsType::INT8 || params.weights.GetDType() == WeightsType::UINT8) {
|
||||
jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS_INT8", 1)});
|
||||
}
|
||||
|
||||
const size_t scale_groups_num = params.decompression_scale.Feature().v;
|
||||
const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v;
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_TERM", 1)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE", params.decompression_scale)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_GROUPS_NUM", scale_groups_num)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_GROUP_SIZE", scale_group_size)});
|
||||
if (params.has_decompression_zp) {
|
||||
const size_t zp_groups_num = params.decompression_zero_point.Feature().v;
|
||||
const size_t zp_group_size = params.weights.IFM().v / params.decompression_zero_point.Feature().v;
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_TERM", 1)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP", params.decompression_zero_point)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_GROUPS_NUM", zp_groups_num)});
|
||||
jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_GROUP_SIZE", zp_group_size)});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,16 +3,19 @@
|
||||
//
|
||||
|
||||
#include "convert_fc_to_compressed.hpp"
|
||||
#include <memory>
|
||||
|
||||
#include "intel_gpu/op/fully_connected.hpp"
|
||||
#include "intel_gpu/op/fully_connected_compressed.hpp"
|
||||
|
||||
#include "openvino/op/constant.hpp"
|
||||
#include "openvino/op/subtract.hpp"
|
||||
#include "openvino/op/matmul.hpp"
|
||||
#include "openvino/op/convert.hpp"
|
||||
#include "openvino/op/transpose.hpp"
|
||||
#include "openvino/op/reshape.hpp"
|
||||
#include "openvino/core/rt_info.hpp"
|
||||
#include "openvino/pass/pattern/op/pattern.hpp"
|
||||
#include "openvino/pass/pattern/op/wrap_type.hpp"
|
||||
#include "openvino/pass/pattern/op/or.hpp"
|
||||
#include "transformations/utils/utils.hpp"
|
||||
@@ -23,7 +26,19 @@ namespace intel_gpu {
|
||||
ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed() {
|
||||
using namespace ov::pass::pattern;
|
||||
|
||||
auto weights_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
|
||||
auto compressed_constant = [](const ov::Output<ov::Node>& output) {
|
||||
return (output.get_element_type() == ov::element::u8 ||
|
||||
output.get_element_type() == ov::element::i8) &&
|
||||
output.get_target_inputs().size() == 1;
|
||||
};
|
||||
|
||||
auto reshape_3d_to_2d = [](const ov::Output<ov::Node>& output) {
|
||||
auto in_ps = output.get_node()->get_input_partial_shape(0);
|
||||
auto out_ps = output.get_node()->get_output_partial_shape(0);
|
||||
return in_ps.rank().is_static() && out_ps.rank().is_static() && in_ps.size() == 3 && out_ps.size() == 2;
|
||||
};
|
||||
|
||||
auto weights_m = wrap_type<ov::op::v0::Constant>(compressed_constant);
|
||||
auto convert_m = wrap_type<ov::op::v0::Convert>({weights_m});
|
||||
|
||||
auto sub_const_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
|
||||
@@ -34,11 +49,15 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
|
||||
auto mul_no_sub_m = wrap_type<ov::op::v1::Multiply>({convert_m, mul_const_m});
|
||||
auto mul_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{mul_with_sub_m, mul_no_sub_m});
|
||||
|
||||
auto reshape_const_m = wrap_type<ov::op::v0::Constant>();
|
||||
auto reshape_m = wrap_type<ov::op::v1::Reshape>({mul_m, reshape_const_m}, reshape_3d_to_2d);
|
||||
|
||||
auto transpose_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{reshape_m, mul_m});
|
||||
auto transpose_const_m = wrap_type<ov::op::v0::Constant>();
|
||||
auto transpose_m = wrap_type<ov::op::v1::Transpose>({mul_m, transpose_const_m});
|
||||
auto weights_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{mul_m, transpose_m});
|
||||
auto transpose_m = wrap_type<ov::op::v1::Transpose>({transpose_input, transpose_const_m});
|
||||
|
||||
auto data_m = any_input();
|
||||
auto weights_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{reshape_m, transpose_m, mul_m});
|
||||
auto fully_connected_m = wrap_type<op::FullyConnected>({data_m, weights_input_m});
|
||||
|
||||
ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
|
||||
@@ -52,53 +71,73 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto& fc_input_a = fc->get_input_node_shared_ptr(0);
|
||||
const auto& scale = pattern_map.at(mul_const_m).get_node_shared_ptr();
|
||||
std::shared_ptr<ov::Node> optional_zero_point = nullptr;
|
||||
bool has_transpose = pattern_map.count(transpose_m);
|
||||
auto scale_shape = pattern_map.at(mul_const_m).get_shape();
|
||||
bool grouped = std::count_if(scale_shape.begin(), scale_shape.end(), [](size_t d) { return d > 1; }) > 1;
|
||||
|
||||
ov::NodeVector nodes_to_copy_info{pattern_map.at(fully_connected_m).get_node_shared_ptr(),
|
||||
pattern_map.at(convert_m).get_node_shared_ptr()};
|
||||
if (pattern_map.count(mul_no_sub_m)) {
|
||||
nodes_to_copy_info.push_back(pattern_map.at(mul_no_sub_m).get_node_shared_ptr());
|
||||
}
|
||||
if (pattern_map.count(mul_with_sub_m)) {
|
||||
nodes_to_copy_info.push_back(pattern_map.at(mul_with_sub_m).get_node_shared_ptr());
|
||||
}
|
||||
auto reshape_const_to_2d = [has_transpose, grouped](std::shared_ptr<ov::Node> node) {
|
||||
auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
|
||||
OPENVINO_ASSERT(constant != nullptr);
|
||||
ov::Shape current_shape = constant->get_shape();
|
||||
if (current_shape.size() == 2)
|
||||
return constant;
|
||||
OPENVINO_ASSERT(current_shape.size() == 3);
|
||||
|
||||
auto new_shape = (has_transpose || !grouped) ? ov::Shape{current_shape[0] * current_shape[1], current_shape[2]}
|
||||
: ov::Shape{current_shape[0], current_shape[1] * current_shape[2]};
|
||||
|
||||
return std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
|
||||
};
|
||||
|
||||
const auto& fc_input_a = fc->get_input_node_shared_ptr(0);
|
||||
const auto& scale = reshape_const_to_2d(pattern_map.at(mul_const_m).get_node_shared_ptr());
|
||||
std::shared_ptr<ov::Node> optional_zero_point = nullptr;
|
||||
|
||||
const bool with_zero_point = pattern_map.count(subtract_m) > 0;
|
||||
if (with_zero_point) {
|
||||
optional_zero_point = pattern_map.at(sub_const_m).get_node_shared_ptr();
|
||||
nodes_to_copy_info.push_back(subtract_m);
|
||||
optional_zero_point = reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr());
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> fc_input_b = pattern_map.at(weights_m).get_node_shared_ptr();
|
||||
if (pattern_map.count(transpose_m)) {
|
||||
std::shared_ptr<ov::Node> fc_input_b = reshape_const_to_2d(pattern_map.at(weights_m).get_node_shared_ptr());
|
||||
std::shared_ptr<ov::Node> fc_input_scale = scale;
|
||||
std::shared_ptr<ov::Node> fc_input_zp = optional_zero_point;
|
||||
if (has_transpose) {
|
||||
const auto& transpose = pattern_map.at(transpose_m).get_node_shared_ptr();
|
||||
const auto& transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr();
|
||||
std::shared_ptr<ov::Node> transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr();
|
||||
if (ov::shape_size(transpose_const->get_shape()) != fc_input_b->get_output_partial_shape(0).size()) {
|
||||
std::vector<int32_t> new_order(fc_input_b->get_output_partial_shape(0).size());
|
||||
std::iota(new_order.begin(), new_order.end(), 0);
|
||||
std::swap(new_order[new_order.size() - 1], new_order[new_order.size() - 2]);
|
||||
transpose_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{new_order.size()}, new_order);
|
||||
}
|
||||
|
||||
fc_input_b = transpose->clone_with_new_inputs({ fc_input_b->output(0), transpose_const });
|
||||
fc_input_scale = transpose->clone_with_new_inputs({ scale->output(0), transpose_const });
|
||||
if (with_zero_point)
|
||||
fc_input_zp = transpose->clone_with_new_inputs({ optional_zero_point->output(0), transpose_const });
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> new_fc = nullptr;
|
||||
if (with_zero_point) {
|
||||
new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a,
|
||||
fc_input_b,
|
||||
scale,
|
||||
optional_zero_point,
|
||||
fc_input_scale,
|
||||
fc_input_zp,
|
||||
fc->get_output_type());
|
||||
} else {
|
||||
new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a,
|
||||
fc_input_b,
|
||||
scale,
|
||||
fc_input_scale,
|
||||
fc->get_output_type());
|
||||
}
|
||||
|
||||
new_fc->set_friendly_name(fc->get_friendly_name());
|
||||
ov::copy_runtime_info(nodes_to_copy_info, new_fc);
|
||||
ov::copy_runtime_info(m.get_matched_nodes(), new_fc);
|
||||
ov::replace_node(fc, new_fc);
|
||||
return true;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ov::pass::pattern::Matcher>(fully_connected_m);
|
||||
auto m = std::make_shared<ov::pass::pattern::Matcher>(fully_connected_m, "ConvertFullyConnectedToFullyConnectedCompressed");
|
||||
this->register_matcher(m, callback);
|
||||
}
|
||||
|
||||
|
||||
@@ -160,7 +160,7 @@ ConvertMatMulToFullyConnected::ConvertMatMulToFullyConnected() {
|
||||
return true;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ov::pass::pattern::Matcher>(matmul_m);
|
||||
auto m = std::make_shared<ov::pass::pattern::Matcher>(matmul_m, "ConvertMatMulToFullyConnected");
|
||||
this->register_matcher(m, callback);
|
||||
}
|
||||
|
||||
|
||||
@@ -2,19 +2,21 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "ov_models/builders.hpp"
|
||||
#include "common_test_utils/ov_tensor_utils.hpp"
|
||||
#include "openvino/op/constant.hpp"
|
||||
#include "openvino/op/matmul.hpp"
|
||||
#include "shared_test_classes/base/layer_test_utils.hpp"
|
||||
#include "shared_test_classes/base/ov_subgraph.hpp"
|
||||
#include "transformations/rt_info/decompression.hpp"
|
||||
|
||||
using namespace ngraph;
|
||||
using namespace ov;
|
||||
using namespace ov::test;
|
||||
|
||||
namespace SubgraphTestsDefinitions {
|
||||
/*
|
||||
* Subtract_const(U8)
|
||||
* Subtract_const(U8/NF4/U4)
|
||||
* /
|
||||
* Weights(U8) Convert(F32)
|
||||
* Weights(U8/NF4/U4) Convert(F32)
|
||||
* | /
|
||||
* Convert(F32) Reshape(optional)
|
||||
* \ / Multiply_const(F32)
|
||||
@@ -29,7 +31,20 @@ namespace SubgraphTestsDefinitions {
|
||||
* |
|
||||
* Bias
|
||||
*/
|
||||
using MatmulWeightsDecompressionParams = std::tuple<std::vector<InputShape>, // input shapes
|
||||
|
||||
struct ShapeParams {
|
||||
ShapeParams() = default;
|
||||
ShapeParams(InputShape data_shape, ov::Shape weights_shape, int weights_group_size = -1)
|
||||
: data_shape(std::move(data_shape)),
|
||||
weights_shape(std::move(weights_shape)),
|
||||
weights_group_size(weights_group_size) {}
|
||||
|
||||
InputShape data_shape;
|
||||
ov::Shape weights_shape;
|
||||
// Decompression group size. If the value is equal to -1, ordinary decompression is used
|
||||
int weights_group_size;
|
||||
};
|
||||
using MatmulWeightsDecompressionParams = std::tuple<ShapeParams, // input shapes
|
||||
ov::test::ElementType, // weights precision
|
||||
ov::test::ElementType, // activations precision
|
||||
bool, // transpose on weights
|
||||
@@ -40,7 +55,7 @@ using MatmulWeightsDecompressionParams = std::tuple<std::vector<InputShape>, //
|
||||
class MatmulWeightsDecompression : public testing::WithParamInterface<MatmulWeightsDecompressionParams>, public SubgraphBaseTest {
|
||||
public:
|
||||
static std::string get_test_case_name(testing::TestParamInfo<MatmulWeightsDecompressionParams> obj) {
|
||||
std::vector<InputShape> inputShapes;
|
||||
ShapeParams shape_params;
|
||||
ov::test::ElementType weights_precision;
|
||||
ov::test::ElementType activations_precision;
|
||||
bool transpose;
|
||||
@@ -48,7 +63,7 @@ public:
|
||||
bool reshape_on_decompression;
|
||||
std::map<std::string, std::string> additional_config;
|
||||
|
||||
std::tie(inputShapes,
|
||||
std::tie(shape_params,
|
||||
weights_precision,
|
||||
activations_precision,
|
||||
transpose,
|
||||
@@ -57,20 +72,9 @@ public:
|
||||
additional_config) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
for (const auto& shape : inputShapes) {
|
||||
result << ov::test::utils::partialShape2str({shape.first}) << "_";
|
||||
}
|
||||
result << "TS=";
|
||||
for (const auto& shape : inputShapes) {
|
||||
result << "(";
|
||||
if (!shape.second.empty()) {
|
||||
auto itr = shape.second.begin();
|
||||
do {
|
||||
result << ov::test::utils::vec2str(*itr);
|
||||
} while (++itr != shape.second.end() && result << "_");
|
||||
}
|
||||
result << ")_";
|
||||
}
|
||||
result << "data_shape=" << shape_params.data_shape << "_";
|
||||
result << "weights_shape=" << shape_params.weights_shape << "_";
|
||||
result << "group_size=" << shape_params.weights_group_size << "_";
|
||||
result << "weights_precision=" << weights_precision << "_";
|
||||
result << "activations_precision=" << activations_precision << "_";
|
||||
result << "transpose_weights=" << transpose << "_";
|
||||
@@ -87,34 +91,87 @@ public:
|
||||
}
|
||||
|
||||
protected:
|
||||
std::shared_ptr<ov::Model> init_subgraph(std::vector<ov::PartialShape>& inputShapes,
|
||||
const ov::element::Type data_precision,
|
||||
const ov::element::Type weights_precision,
|
||||
const bool transpose_weights,
|
||||
const bool add_subtract,
|
||||
const bool reshape_on_decompression) {
|
||||
ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(data_precision, inputShapes[0])};
|
||||
// Builds the tested model: a single data Parameter multiplied (MatMul) by the output of the
// compressed-weights decompression subgraph.
//
// data_shape               - partial shape of the data Parameter
// weights_shape            - shape forwarded to init_compressed_weights_subgraph
// group_size               - decompression group size forwarded to init_compressed_weights_subgraph
// data_precision           - element type of the data Parameter, also forwarded to the weights subgraph
// weights_precision        - element type forwarded to the weights subgraph
// transpose_weights        - forwarded to the weights subgraph
// add_subtract             - forwarded to the weights subgraph
// reshape_on_decompression - forwarded to the weights subgraph
//
// Returns a model named "MatmulWeightsDecompression" whose only result is the MatMul output.
std::shared_ptr<ov::Model> init_subgraph(const ov::PartialShape& data_shape,
                                         const ov::Shape& weights_shape,
                                         const int group_size,
                                         const ov::element::Type data_precision,
                                         const ov::element::Type weights_precision,
                                         const bool transpose_weights,
                                         const bool add_subtract,
                                         const bool reshape_on_decompression) {
    const auto data_input = std::make_shared<ov::op::v0::Parameter>(data_precision, data_shape);

    const auto decompressed_weights = init_compressed_weights_subgraph(weights_shape,
                                                                       group_size,
                                                                       data_precision,
                                                                       weights_precision,
                                                                       transpose_weights,
                                                                       add_subtract,
                                                                       reshape_on_decompression);

    const auto matmul = std::make_shared<ov::op::v0::MatMul>(data_input, decompressed_weights);

    return std::make_shared<ov::Model>(NodeVector{matmul},
                                       ov::ParameterVector{data_input},
                                       "MatmulWeightsDecompression");
}
|
||||
|
||||
std::shared_ptr<ov::Node> init_compressed_weights_subgraph(const ov::Shape& weights_shape,
|
||||
const int group_size,
|
||||
const ov::element::Type data_precision,
|
||||
const ov::element::Type weights_precision,
|
||||
const bool transpose_weights,
|
||||
const bool add_subtract,
|
||||
const bool reshape_on_decompression_constant) {
|
||||
auto transpose_if_necessary = [&](const ov::Shape& shape) {
|
||||
if (!transpose_weights)
|
||||
return shape;
|
||||
auto transposed_shape = shape;
|
||||
std::swap(*transposed_shape.rbegin(), *(transposed_shape.rbegin() + 1));
|
||||
return transposed_shape;
|
||||
auto result_shape = shape;
|
||||
if (transpose_weights)
|
||||
std::swap(*result_shape.rbegin(), *(result_shape.rbegin() + 1));
|
||||
return result_shape;
|
||||
};
|
||||
|
||||
auto weights_shape = transpose_if_necessary(inputShapes[1].to_shape());
|
||||
auto weights = ngraph::builder::makeConstant<uint8_t>(weights_precision, weights_shape, {}, true);
|
||||
const bool group_decompression = group_size != -1;
|
||||
// Weights has shape [I, O], where
|
||||
// I - input channels
|
||||
// O - output channels
|
||||
// In case of group decompression, input channels dimension is split into 2: I -> [N, G], where
|
||||
// N - number of groups
|
||||
// G - group size
|
||||
auto transformed_weights_shape = transpose_if_necessary(weights_shape);
|
||||
if (group_decompression) {
|
||||
OPENVINO_ASSERT(weights_shape[0] % group_size == 0,
|
||||
"Weights output channels count (",
|
||||
weights_shape[0],
|
||||
") must be divisible by decompression group size (",
|
||||
group_size,
|
||||
").");
|
||||
auto in_channel_idx = transpose_weights ? transformed_weights_shape.size() - 1 : transformed_weights_shape.size() - 2;
|
||||
transformed_weights_shape[in_channel_idx] = weights_shape[0] / group_size;
|
||||
transformed_weights_shape.insert(transformed_weights_shape.begin() + in_channel_idx + 1, group_size);
|
||||
}
|
||||
auto weights_tensor = ov::test::utils::create_and_fill_tensor(weights_precision, transformed_weights_shape);
|
||||
auto weights = std::make_shared<ov::op::v0::Constant>(weights_tensor);
|
||||
weights->set_friendly_name("Compressed_weights");
|
||||
auto weights_convert = std::make_shared<ngraph::opset1::Convert>(weights, data_precision);
|
||||
|
||||
std::shared_ptr<ov::Node> mul_parent = weights_convert;
|
||||
auto output_channels = transpose_weights ? *(weights_shape.rbegin() + 1) : *weights_shape.rbegin();
|
||||
auto scaleshift_target_shape = transpose_if_necessary(ov::Shape{1, output_channels});
|
||||
auto scaleshift_const_shape = reshape_on_decompression ? ov::Shape{output_channels} : scaleshift_target_shape;
|
||||
auto output_channels = *weights_shape.rbegin();
|
||||
|
||||
// Decompression constants shape:
|
||||
// Ordinary decompression: [O, 1]
|
||||
// Group decompression: [O, N, 1]
|
||||
ov::Shape scaleshift_target_shape{output_channels};
|
||||
scaleshift_target_shape.insert(scaleshift_target_shape.begin(), group_decompression ? weights_shape[0] / group_size : 1);
|
||||
scaleshift_target_shape = transpose_if_necessary(scaleshift_target_shape);
|
||||
if (group_decompression) {
|
||||
auto in_channel_idx = transpose_weights ? scaleshift_target_shape.size() - 1 : scaleshift_target_shape.size() - 2;
|
||||
scaleshift_target_shape.insert(scaleshift_target_shape.begin() + in_channel_idx + 1, 1);
|
||||
}
|
||||
|
||||
auto scaleshift_const_shape = scaleshift_target_shape;
|
||||
if (reshape_on_decompression_constant)
|
||||
scaleshift_const_shape.erase(std::remove(scaleshift_const_shape.begin(), scaleshift_const_shape.end(), 1), scaleshift_const_shape.end());
|
||||
if (add_subtract) {
|
||||
auto shift_const = ngraph::builder::makeConstant<uint8_t>(weights_precision, scaleshift_const_shape, {}, true);
|
||||
auto shift_tensor = ov::test::utils::create_and_fill_tensor(weights_precision, scaleshift_const_shape);
|
||||
auto shift_const = std::make_shared<ov::op::v0::Constant>(shift_tensor);
|
||||
std::shared_ptr<ov::Node> shift_convert = std::make_shared<ngraph::opset1::Convert>(shift_const, data_precision);
|
||||
if (reshape_on_decompression) {
|
||||
if (reshape_on_decompression_constant) {
|
||||
auto shift_reshape_const = ov::opset10::Constant::create(ov::element::i32, {scaleshift_target_shape.size()}, scaleshift_target_shape);
|
||||
auto shift_reshape = std::make_shared<ov::opset10::Reshape>(shift_convert, shift_reshape_const, false);
|
||||
shift_convert = shift_reshape;
|
||||
@@ -122,32 +179,36 @@ protected:
|
||||
mul_parent = std::make_shared<ov::opset10::Subtract>(weights_convert, shift_convert);
|
||||
}
|
||||
|
||||
std::shared_ptr<ov::Node> scale_const = ngraph::builder::makeConstant<float>(data_precision, scaleshift_const_shape, {}, true);
|
||||
if (reshape_on_decompression) {
|
||||
auto scale_tensor = ov::test::utils::create_and_fill_tensor(data_precision, scaleshift_const_shape, 1, -0.5, 10000);
|
||||
std::shared_ptr<ov::Node> scale_const = std::make_shared<ov::op::v0::Constant>(scale_tensor);
|
||||
if (reshape_on_decompression_constant) {
|
||||
auto scale_reshape_const = ov::opset10::Constant::create(ov::element::i32, {scaleshift_target_shape.size()}, scaleshift_target_shape);
|
||||
auto scale_reshape = std::make_shared<ov::opset10::Reshape>(scale_const, scale_reshape_const, false);
|
||||
scale_const = scale_reshape;
|
||||
}
|
||||
auto multiply = std::make_shared<ov::opset10::Multiply>(mul_parent, scale_const);
|
||||
std::shared_ptr<ov::Node> last_node = std::make_shared<ov::opset10::Multiply>(mul_parent, scale_const);
|
||||
|
||||
std::shared_ptr<ov::Node> matmul_weights = multiply;
|
||||
if (group_decompression) {
|
||||
auto reshape_target_shape = transpose_weights ? std::vector<int>{-1, static_cast<int>(weights_shape[0])}
|
||||
: std::vector<int>{static_cast<int>(weights_shape[0]), -1};
|
||||
auto target_shape_node = ov::opset10::Constant::create(ov::element::i32, {reshape_target_shape.size()}, reshape_target_shape);
|
||||
last_node = std::make_shared<ov::opset10::Reshape>(last_node, target_shape_node, false);
|
||||
}
|
||||
if (transpose_weights) {
|
||||
const size_t rank = matmul_weights->get_output_partial_shape(0).size();
|
||||
const size_t rank = last_node->get_output_partial_shape(0).size();
|
||||
std::vector<int> order(rank);
|
||||
std::iota(order.begin(), order.end(), 0);
|
||||
std::swap(*order.rbegin(), *(order.rbegin() + 1));
|
||||
auto transpose_constant = ov::opset10::Constant::create(ov::element::i32, {rank}, order);
|
||||
auto transpose = std::make_shared<ov::opset10::Transpose>(matmul_weights, transpose_constant);
|
||||
matmul_weights = transpose;
|
||||
last_node = std::make_shared<ov::opset10::Transpose>(last_node, transpose_constant);
|
||||
}
|
||||
auto matMul = builder::makeMatMul(params[0], matmul_weights);
|
||||
return std::make_shared<ov::Model>(NodeVector{matMul}, params, "MatmulWeightsDecompression");
|
||||
return last_node;
|
||||
}
|
||||
|
||||
void SetUp() override {
|
||||
targetDevice = ov::test::utils::DEVICE_GPU;
|
||||
|
||||
std::vector<InputShape> inputShapes;
|
||||
ShapeParams shape_params;
|
||||
ov::test::ElementType weights_precision;
|
||||
ov::test::ElementType activations_precision;
|
||||
bool transpose_weights;
|
||||
@@ -155,7 +216,7 @@ protected:
|
||||
bool reshape_on_decompression;
|
||||
std::map<std::string, std::string> additional_config;
|
||||
|
||||
std::tie(inputShapes,
|
||||
std::tie(shape_params,
|
||||
weights_precision,
|
||||
activations_precision,
|
||||
transpose_weights,
|
||||
@@ -164,14 +225,47 @@ protected:
|
||||
additional_config) = GetParam();
|
||||
|
||||
configuration.insert(additional_config.begin(), additional_config.end());
|
||||
init_input_shapes(inputShapes);
|
||||
init_input_shapes({shape_params.data_shape, {{}, {{shape_params.weights_shape}}}});
|
||||
|
||||
inType = outType = activations_precision;
|
||||
|
||||
function = init_subgraph(inputDynamicShapes, activations_precision, weights_precision, transpose_weights, decompression_sub, reshape_on_decompression);
|
||||
function = init_subgraph(inputDynamicShapes[0],
|
||||
shape_params.weights_shape,
|
||||
shape_params.weights_group_size,
|
||||
activations_precision,
|
||||
weights_precision,
|
||||
transpose_weights,
|
||||
decompression_sub,
|
||||
reshape_on_decompression);
|
||||
|
||||
|
||||
if (activations_precision == ov::element::f16) {
|
||||
auto weights_size = ov::shape_size(shape_params.weights_shape);
|
||||
auto weights_input_channels = weights_size / (transpose_weights ? shape_params.weights_shape[0] : shape_params.weights_shape.back());
|
||||
// Absolute values range during accumulation may be quite big ( > 200) so fp16 representation & math error is larger than default threshold
|
||||
if (weights_input_channels > 2048) {
|
||||
abs_threshold = 4.0f;
|
||||
} else {
|
||||
abs_threshold = 1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void checkResults() {
|
||||
// Regenerates the test inputs: clears `inputs` and inserts one newly created
// tensor per input of `function`.
//
// target_input_static_shapes: static shape used for each input, indexed in the
// same order as function->inputs().
//
// NOTE(review): the trailing arguments (2, -1, 10000) are passed straight to
// create_and_fill_tensor; presumably they are the value range, start value and
// resolution of the generated data - confirm against the helper's declaration.
void generate_inputs(const std::vector<ngraph::Shape>& target_input_static_shapes) override {
|
||||
// Drop any tensors left from a previous call before refilling the map.
inputs.clear();
|
||||
const auto& model_inputs = function->inputs();
|
||||
for (size_t i = 0; i < model_inputs.size(); ++i) {
|
||||
const auto& model_input = model_inputs[i];
|
||||
// Tensor element type follows the model input; its shape comes from the i-th static shape.
ov::Tensor tensor = ov::test::utils::create_and_fill_tensor(model_input.get_element_type(),
|
||||
target_input_static_shapes[i],
|
||||
2,
|
||||
-1,
|
||||
10000);
|
||||
// The map is keyed by the input's node pointer.
inputs.insert({model_input.get_node_shared_ptr(), tensor});
|
||||
}
|
||||
}
|
||||
|
||||
void check_results() {
|
||||
const auto& test_param = GetParam();
|
||||
ov::test::ElementType weights_precision = std::get<1>(test_param);
|
||||
for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
|
||||
@@ -185,24 +279,20 @@ protected:
|
||||
TEST_P(MatmulWeightsDecompression, CompareWithRefs) {
|
||||
SKIP_IF_CURRENT_TEST_IS_DISABLED()
|
||||
run();
|
||||
checkResults();
|
||||
check_results();
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
const std::vector<ov::test::ElementType> activations_precisions = {ov::element::f32, ov::element::f16};
|
||||
const std::vector<ov::test::ElementType> weights_precisions = {ov::element::u8};
|
||||
const std::vector<std::vector<InputShape>> input_shapes_basic = {
|
||||
{{{-1, -1, -1}, {{1, 4, 16}, {10, 16, 16}}}, {{}, {{16, 32}}}},
|
||||
{{{}, {{10, 40, 496}}}, {{}, {{1, 496, 240}}}},
|
||||
{{{}, {{1, 4, 48}}}, {{}, {{48, 256}}}},
|
||||
{{{}, {{11, 339, 377}}}, {{}, {{377, 335}}}},
|
||||
{{{}, {{1, 4, 32}}}, {{}, {{32, 256}}}},
|
||||
{{{}, {{1, 4, 512}}}, {{}, {{512, 256}}}},
|
||||
{{{}, {{1, 16, 32}}}, {{}, {{32, 64}}}},
|
||||
{{{}, {{2, 4, 32}}}, {{}, {{32, 65}}}},
|
||||
{{{}, {{3, 12, 768}}}, {{}, {{768, 1024}}}},
|
||||
{{{}, {{11, 339, 577}}}, {{}, {{577, 335}}}},
|
||||
// ShapeParams sets named for the basic test suite.
// Each entry gives an activation input shape (bounds plus the static shapes to
// run), then a weights shape, then an optional third value.
// NOTE(review): the third value is presumably weights_group_size (SetUp reads
// shape_params.weights_group_size) - confirm against the ShapeParams declaration.
const std::vector<ShapeParams> input_shapes_basic = {
|
||||
{{{-1, -1, -1}, {{1, 4, 16}, {10, 16, 16}}}, {16, 32}},
|
||||
{{{}, {{1, 4, 16}}}, {16, 32}, 2ul},
|
||||
{{{}, {{1, 4, 16}}}, {1, 16, 32}},
|
||||
{{{}, {{10, 40, 496}}}, {1, 496, 240}},
|
||||
{{{}, {{1, 4, 48}}}, {48, 256}},
|
||||
{{{}, {{11, 339, 377}}}, {377, 335}}
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_basic,
|
||||
@@ -216,15 +306,16 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_basic,
|
||||
::testing::Values(std::map<std::string, std::string>())),
|
||||
MatmulWeightsDecompression::get_test_case_name);
|
||||
|
||||
const std::vector<std::vector<InputShape>> input_shapes_corner_cases_basic = {
|
||||
{{{-1, -1, -1}, {{1, 4, 16}}}, {{}, {{1, 16, 32}}}},
|
||||
{{{}, {{1, 4, 16}}}, {{}, {{1, 16, 32}}}},
|
||||
{{{-1, -1, -1}, {{1, 4, 16}}}, {{}, {{16, 32}}}},
|
||||
{{{-1, -1, -1, -1}, {{1, 1, 4, 16}}}, {{}, {{1, 1, 16, 32}}}},
|
||||
{{{}, {{1, 1, 4, 16}}}, {{}, {{1, 1, 16, 32}}}},
|
||||
// ShapeParams sets named for the basic corner-case suite.
// All entries use bounded (dynamic) activation shapes with a single static
// shape {1, 4, 16}; weights are given as 3D {1, 16, 32} or 2D {16, 32}.
// NOTE(review): the trailing 4 in the last entry is presumably
// weights_group_size - confirm against the ShapeParams declaration.
const std::vector<ShapeParams> input_shapes_corner_cases_basic = {
|
||||
{{{-1, -1, -1}, {{1, 4, 16}}}, {1, 16, 32}},
|
||||
{{{-1, -1, -1}, {{1, 4, 16}}}, {16, 32}},
|
||||
{{{-1, -1, 16}, {{1, 4, 16}}}, {16, 32}, 4},
|
||||
};
|
||||
const std::vector<std::vector<InputShape>> input_shapes_corner_cases_big = {
|
||||
{{{-1, -1, -1}, {{10, 40, 480}, {11, 40, 480}}}, {{}, {{1, 480, 256}}}},
|
||||
// ShapeParams sets with larger shapes; passed via ::testing::ValuesIn to the
// corner_cases_big test-suite instantiation.
// Covers 3D and 2D activation shapes against {1, 480, 256} and {4096, 4096} weights.
// NOTE(review): the trailing 128 is presumably weights_group_size - confirm
// against the ShapeParams declaration.
const std::vector<ShapeParams> input_shapes_corner_cases_big = {
|
||||
{{{-1, -1, -1}, {{10, 40, 480}, {11, 40, 480}}}, {1, 480, 256}},
|
||||
{{{-1, -1, -1}, {{1, 1, 4096}}}, {4096, 4096}, 128},
|
||||
{{{-1, -1, -1}, {{1, 1, 4096}}}, {4096, 4096}},
|
||||
{{{-1, 4096}, {{1, 4096}}}, {4096, 4096}, 128},
|
||||
};
|
||||
|
||||
const std::vector<bool> transpose_weights = {true, false};
|
||||
@@ -242,7 +333,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_corner_cases_basic,
|
||||
::testing::Values(std::map<std::string, std::string>{})),
|
||||
MatmulWeightsDecompression::get_test_case_name);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_corner_cases_big,
|
||||
INSTANTIATE_TEST_SUITE_P(MatMulCompressedWeights_corner_cases_big,
|
||||
MatmulWeightsDecompression,
|
||||
::testing::Combine(::testing::ValuesIn(input_shapes_corner_cases_big),
|
||||
::testing::ValuesIn(weights_precisions),
|
||||
|
||||
@@ -663,21 +663,22 @@ TEST(fully_connected_gpu, compressed_scale_zp_bias) {
|
||||
auto& engine = get_test_engine();
|
||||
|
||||
auto input_mem = engine.allocate_memory({ {1, 2, 4}, data_types::f32, format::bfyx });
|
||||
auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::f32, format::bfyx });
|
||||
auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::u8, format::bfyx });
|
||||
auto bias_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });
|
||||
auto scale_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });
|
||||
auto zp_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });
|
||||
auto scale_mem = engine.allocate_memory({ {8, 1}, data_types::f32, format::bfyx });
|
||||
auto zp_mem = engine.allocate_memory({ {8, 1}, data_types::f32, format::bfyx });
|
||||
|
||||
set_values(input_mem, { -0.5f, 2.0f, 0.5f, 1.0f,
|
||||
0.5f, -2.0f, -0.5f, -1.0f });
|
||||
set_values(weights_mem, { 1.5f, 1.0f, 0.5f, -1.0f,
|
||||
0.0f, 0.5f, 0.5f, -0.5f,
|
||||
-2.0f, -0.5f, 1.0f, 1.5f,
|
||||
-2.0f, -0.5f, 1.0f, 1.5f,
|
||||
2.0f, 0.5f, -1.0f, -1.5f,
|
||||
2.0f, 0.5f, -1.0f, -1.5f,
|
||||
-1.5f, -1.0f, -0.5f, 1.0f,
|
||||
0.0f, -0.5f, 0.5f, 0.5f });
|
||||
set_values<uint8_t>(weights_mem, { 1, 2, 3, 4,
|
||||
5, 6, 7, 8,
|
||||
9, 10, 11, 12,
|
||||
13, 14, 15, 0,
|
||||
15, 14, 13, 12,
|
||||
11, 10, 9, 8,
|
||||
7, 6, 5, 4,
|
||||
3, 2, 1, 0});
|
||||
|
||||
|
||||
set_values(bias_mem, { 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 2.0f });
|
||||
set_values(scale_mem, { 2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 2.0f });
|
||||
@@ -709,8 +710,7 @@ TEST(fully_connected_gpu, compressed_scale_zp_bias) {
|
||||
ov::PartialShape expected_shape{1, 2, 8};
|
||||
ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());
|
||||
|
||||
std::vector<float> expected_result = {-4.0f, -23.0f, 11.0f, 0.0f, -2.0f, -3.5f, -30.0f, -10.5f,
|
||||
6.0f, 19.0f, -5.0f, -8.0f, 12.0f, -8.5f, 44.0f, 14.5f};
|
||||
std::vector<float> expected_result = {13.f, 58.f, -51.f, -108.f, 18.5f, -18.f, 1.f, -4.f, -11.f, -62.f, 57.f, 100.f, -8.5f, 6.f, 13.f, 8.f, };
|
||||
|
||||
for (size_t i = 0; i < expected_result.size(); i++) {
|
||||
ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
|
||||
@@ -721,20 +721,20 @@ TEST(fully_connected_gpu, compressed_scale_bias) {
|
||||
auto& engine = get_test_engine();
|
||||
|
||||
auto input_mem = engine.allocate_memory({ {1, 2, 4}, data_types::f32, format::bfyx });
|
||||
auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::f32, format::bfyx });
|
||||
auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::u8, format::bfyx });
|
||||
auto bias_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });
|
||||
auto scale_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });
|
||||
|
||||
set_values(input_mem, { -0.5f, 2.0f, 0.5f, 1.0f,
|
||||
0.5f, -2.0f, -0.5f, -1.0f });
|
||||
set_values(weights_mem, { 1.5f, 1.0f, 0.5f, -1.0f,
|
||||
0.0f, 0.5f, 0.5f, -0.5f,
|
||||
-2.0f, -0.5f, 1.0f, 1.5f,
|
||||
-2.0f, -0.5f, 1.0f, 1.5f,
|
||||
2.0f, 0.5f, -1.0f, -1.5f,
|
||||
2.0f, 0.5f, -1.0f, -1.5f,
|
||||
-1.5f, -1.0f, -0.5f, 1.0f,
|
||||
0.0f, -0.5f, 0.5f, 0.5f });
|
||||
set_values<uint8_t>(weights_mem, { 1, 2, 3, 4,
|
||||
5, 6, 7, 8,
|
||||
9, 10, 11, 12,
|
||||
13, 14, 15, 0,
|
||||
15, 14, 13, 12,
|
||||
11, 10, 9, 8,
|
||||
7, 6, 5, 4,
|
||||
3, 2, 1, 0});
|
||||
|
||||
set_values(bias_mem, { 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f });
|
||||
set_values(scale_mem, { 2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 1.0f });
|
||||
@@ -764,8 +764,7 @@ TEST(fully_connected_gpu, compressed_scale_bias) {
|
||||
ov::PartialShape expected_shape{1, 2, 8};
|
||||
ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());
|
||||
|
||||
std::vector<float> expected_result = {2.0f, 1.0f, -1.0f, -12.0f, 4.0f, -5.0f, 6.0f, -8.25f,
|
||||
0.0f, -5.0f, 7.0f, 4.0f, 6.0f, -7.0f, 8.0f, -7.75f};
|
||||
std::vector<float> expected_result = {19.f, 40.f, 69.f, 54.f, 83.f, 48.f, 37.f, -2.f, -17.f, -44.f, -63.f, -62.f, -73.f, -60.f, -23.f, -14.f };
|
||||
|
||||
for (size_t i = 0; i < expected_result.size(); i++) {
|
||||
ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
|
||||
@@ -776,19 +775,19 @@ TEST(fully_connected_gpu, compressed_scale_fp16) {
|
||||
auto& engine = get_test_engine();
|
||||
|
||||
auto input_mem = engine.allocate_memory({ { 2, 4}, data_types::f16, format::bfyx });
|
||||
auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::f16, format::bfyx });
|
||||
auto scale_mem = engine.allocate_memory({ {1, 8}, data_types::f16, format::bfyx });
|
||||
auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::u8, format::bfyx });
|
||||
auto scale_mem = engine.allocate_memory({ {8, 1}, data_types::f16, format::bfyx });
|
||||
|
||||
set_values<ov::float16>(input_mem, { ov::float16(-0.5f), ov::float16(2.0f), ov::float16(0.5f), ov::float16(1.0f),
|
||||
ov::float16(0.5f), ov::float16(-2.0f), ov::float16(-0.5f), ov::float16(-1.0f) });
|
||||
set_values<ov::float16>(weights_mem, {ov::float16( 1.5f), ov::float16( 1.0f), ov::float16( 0.5f), ov::float16(-1.0f),
|
||||
ov::float16( 0.0f), ov::float16( 0.5f), ov::float16( 0.5f), ov::float16(-0.5f),
|
||||
ov::float16(-2.0f), ov::float16(-0.5f), ov::float16( 1.0f), ov::float16( 1.5f),
|
||||
ov::float16(-2.0f), ov::float16(-0.5f), ov::float16( 1.0f), ov::float16( 1.5f),
|
||||
ov::float16( 2.0f), ov::float16( 0.5f), ov::float16(-1.0f), ov::float16(-1.5f),
|
||||
ov::float16( 2.0f), ov::float16( 0.5f), ov::float16(-1.0f), ov::float16(-1.5f),
|
||||
ov::float16(-1.5f), ov::float16(-1.0f), ov::float16(-0.5f), ov::float16( 1.0f),
|
||||
ov::float16( 0.0f), ov::float16(-0.5f), ov::float16(0.5f), ov::float16( 0.5f) });
|
||||
set_values<uint8_t>(weights_mem, { 1, 2, 3, 4,
|
||||
5, 6, 7, 8,
|
||||
9, 10, 11, 12,
|
||||
13, 14, 15, 0,
|
||||
15, 14, 13, 12,
|
||||
11, 10, 9, 8,
|
||||
7, 6, 5, 4,
|
||||
3, 2, 1, 0});
|
||||
|
||||
set_values<ov::float16>(scale_mem, {ov::float16(2.0f), ov::float16(4.0f), ov::float16(-2.0f), ov::float16(-4.0f), ov::float16(0.5f), ov::float16(-0.5f), ov::float16(2.0f), ov::float16(2.0f)});
|
||||
|
||||
@@ -817,8 +816,8 @@ TEST(fully_connected_gpu, compressed_scale_fp16) {
|
||||
ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());
|
||||
|
||||
std::vector<ov::float16> expected_result = {
|
||||
ov::float16(1.0f), ov::float16( 3.0f), ov::float16(-4.0f), ov::float16(-8.0f), ov::float16(-1.0f), ov::float16( 1.0f), ov::float16(-1.0f), ov::float16(-0.5f),
|
||||
ov::float16(-1.0f), ov::float16(-3.0f), ov::float16( 4.0f), ov::float16( 8.0f), ov::float16( 1.0f), ov::float16(-1.0f), ov::float16( 1.0f), ov::float16( 0.5f)};
|
||||
ov::float16(18), ov::float16(84), ov::float16(-66), ov::float16(-116), ov::float16(19.5), ov::float16(-13.5), ov::float16(30), ov::float16(6),
|
||||
ov::float16(-18), ov::float16(-84), ov::float16(66), ov::float16(116), ov::float16(-19.5), ov::float16(13.5), ov::float16(-30), ov::float16(-6) };
|
||||
|
||||
for (size_t i = 0; i < expected_result.size(); i++) {
|
||||
ASSERT_FLOAT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
|
||||
|
||||
Reference in New Issue
Block a user