[GPU] Grouped decompression scale/zp support (#20491)

2023-10-18 15:56:35 +04:00
parent 4574fb112c
commit 90ad4c618d
10 changed files with 390 additions and 163 deletions
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 //

+#include "fully_connected_inst.h"
 #include "pooling_inst.h"
 #include "quantize_inst.h"
 #include "reorder_inst.h"
@@ -847,6 +848,42 @@ bool prepare_quantization::optimize_quantize(program &p, quantize_node& quantize
    return true;
 }

+// Inserts a reorder to format::fbyx in front of the decompression scale / zero-point inputs of a
+// compressed fully connected node whenever the last dimension of that input is greater than one.
+static void optimize_weights_decompression_parameters(fully_connected_node& fc_node, program& p) {
+    const auto fc_prim = fc_node.get_primitive();
+    if (!fc_prim->compressed_weights)
+        return;
+    // The last dimension of an input's partial shape is treated as its number of groups.
+    const auto has_multiple_groups = [&](size_t input_idx) {
+        const auto input_pshape = fc_node.get_input_layout(input_idx).get_partial_shape();
+        const auto groups_count = input_pshape[input_pshape.size() - 1].get_length();
+        return groups_count > 1;
+    };
+
+    // Adds a reorder primitive named "<dependency id>_reorder" between the dependency and the FC node,
+    // then recalculates the output layout of whatever node now occupies that dependency slot.
+    const auto insert_fbyx_reorder = [&](size_t input_idx) {
+        auto& producer = fc_node.get_dependency(input_idx);
+        auto fbyx_layout = producer.get_output_layout();
+        fbyx_layout.format = format::fbyx;
+        auto fbyx_reorder = std::make_shared<reorder>(producer.id() + "_reorder", producer.id(), fbyx_layout);
+        p.add_intermediate(fbyx_reorder, fc_node, input_idx, true);
+        fc_node.get_dependency(input_idx).recalc_output_layout(false);
+    };
+
+    // Scale is input 2 without a bias term and input 3 with one; the zero point, if any, follows it.
+    const size_t scale_idx = fc_node.bias_term() ? 3 : 2;
+    if (has_multiple_groups(scale_idx))
+        insert_fbyx_reorder(scale_idx);
+
+    if (fc_prim->decompression_zero_point.empty())
+        return;
+    const size_t zp_idx = scale_idx + 1;
+    if (has_multiple_groups(zp_idx))
+        insert_fbyx_reorder(zp_idx);
+}
+
 void prepare_quantization::run(program& p) {
    auto itr = p.get_processing_order().begin();
    while (itr != p.get_processing_order().end()) {
@@ -859,6 +896,8 @@ void prepare_quantization::run(program& p) {
            remove_fake_reorders(p, node->as<reorder>());
        } else if (node->is_type<convolution>()) {
            prepare_asymmetric_quantization(p, node->as<convolution>());
+        } else if (node->is_type<fully_connected>()) {
+            optimize_weights_decompression_parameters(node->as<fully_connected>(), p);
        }
    }
 }
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp
@@ -110,20 +110,13 @@ public:
            bool has_scale = !primitive->decompression_scale.empty();

            size_t offset = primitive->bias.empty() ? 2 : 3;
-            const auto& weights_pshape = input1_layout.get_partial_shape();
            if (has_scale) {
                auto scale_layout = input_layouts[offset++];
-                if (input1_pshape.size() != 2) {
-                    scale_layout.set_partial_shape(reshape_to_2d(scale_layout.get_partial_shape(), weights_pshape[0], primitive->weights_rank));
-                }
                layouts.push_back(scale_layout);
            }

            if (has_zp) {
                auto zp_layout = input_layouts[offset];
-                if (input1_pshape.size() != 2) {
-                    zp_layout.set_partial_shape(reshape_to_2d(zp_layout.get_partial_shape(), weights_pshape[0], primitive->weights_rank));
-                }
                layouts.push_back(zp_layout);
            }

--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -307,7 +307,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         (fmt_prev == format::b_fs_yx_fsv4 &&
          prev_output_layout.feature() % 32 == 0 &&
          prev_output_layout.spatial(0) == 1 &&
-          prev_output_layout.spatial(1) == 1)))
+          prev_output_layout.spatial(1) == 1)) && is_input_reorder(prev, next))
        return true;

    if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv16 && fmt_next == format::b_fs_yx_fsv4 && is_input_idx(0))
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -120,7 +120,7 @@ KERNEL(fc)(
    uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET;
    uint weights_offset = out_f * INPUT_ELEMENTS_COUNT;

-#if COMPRESSED_WEIGHTS
+#if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1
    #if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % SIMD == 0
        ACCUMULATOR_VEC_TYPE d_scale = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_scale, out_f);
    #elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % SIMD != 0
@@ -134,9 +134,11 @@ KERNEL(fc)(
        ACCUMULATOR_VEC_TYPE d_scale = decompression_scale[0];
    #endif

-    #if !DECOMPRESSION_ZP_TERM
-        ACCUMULATOR_VEC_TYPE d_zp = 0;
-    #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % SIMD == 0
+    ACCUMULATOR_TYPE* d_scales = (ACCUMULATOR_TYPE*)(&d_scale);
+#endif
+
+#if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1
+    #if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % SIMD == 0
        ACCUMULATOR_VEC_TYPE d_zp = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_zp, out_f);
    #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % SIMD != 0
        ACCUMULATOR_VEC_TYPE d_zp = 0;
@@ -148,9 +150,7 @@ KERNEL(fc)(
    #else
        ACCUMULATOR_VEC_TYPE d_zp = decompression_zp[0];
    #endif
-
-    ACCUMULATOR_TYPE* ds = (ACCUMULATOR_TYPE*)(&d_scale);
-    ACCUMULATOR_TYPE* dzp = (ACCUMULATOR_TYPE*)(&d_zp);
+    ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp);
 #endif

 #if REALIGN_FP16_OFFSET
@@ -193,7 +193,28 @@ KERNEL(fc)(
                ACCUMULATOR_TYPE* w = (ACCUMULATOR_TYPE*)(&wei);
                unroll_for(uint kii = 0; kii < TILE_K; ++kii) {
                    unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
-                        w[kii * TILE_OFM + fi] = (w[kii * TILE_OFM + fi] - dzp[fi]) * ds[fi];
+                        const uint w_idx = kii * TILE_OFM + fi;
+                        const uint offset_ofm = out_f + fi*SIMD + sglid;
+                        #if DECOMPRESSION_SCALE_GROUPS_NUM > 1
+                            const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH  +
+                                                     ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
+                            ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
+                        #else
+                            ACCUMULATOR_TYPE ds = d_scales[fi];
+                        #endif
+
+                        #if DECOMPRESSION_ZP_TERM
+                            #if DECOMPRESSION_ZP_GROUPS_NUM > 1
+                                const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH +
+                                                    ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
+                                ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset];
+                            #else
+                                ACCUMULATOR_TYPE dzp = d_zps[fi];
+                            #endif
+                        #else
+                            ACCUMULATOR_TYPE dzp = ACCUMULATOR_VAL_ZERO;
+                        #endif
+                        w[w_idx] = (w[w_idx] - dzp) * ds;
                    }
                }
            #endif
@@ -230,7 +251,28 @@ KERNEL(fc)(
                ACCUMULATOR_TYPE* w = (ACCUMULATOR_TYPE*)(&wei);
                unroll_for(uint kii = 0; kii < TILE_K; ++kii) {
                    unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
-                        w[kii * TILE_OFM + fi] = (w[kii * TILE_OFM + fi] - dzp[fi]) * ds[fi];
+                        const uint w_idx = kii * TILE_OFM + fi;
+                        uint offset_ofm = out_f + fi*SIMD + get_sub_group_local_id();
+                        #if DECOMPRESSION_SCALE_GROUPS_NUM > 1
+                            const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH +
+                                                     ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
+                            ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
+                        #else
+                            ACCUMULATOR_TYPE ds = d_scales[fi];
+                        #endif
+
+                        #if DECOMPRESSION_ZP_TERM
+                            #if DECOMPRESSION_ZP_GROUPS_NUM > 1
+                                const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH +
+                                                    ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
+                                ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset];
+                            #else
+                                ACCUMULATOR_TYPE dzp = d_zps[fi];
+                            #endif
+                        #else
+                            ACCUMULATOR_TYPE dzp = ACCUMULATOR_VAL_ZERO;
+                        #endif
+                        w[w_idx] = (w[w_idx] - dzp) * ds;
                    }
                }
            #endif
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bfyx_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bfyx_ref.cl
@@ -36,18 +36,24 @@ KERNEL(fc)(
        for (uint x = 0; x < INPUT0_SIZE_X; ++x)
        {
            const uint input0_idx = INPUT0_GET_INDEX(b, ofm, y, x);
-            const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, oym, y, 0, 0);
            #if COMPRESSED_WEIGHTS
-                ACCUMULATOR_TYPE filter_compressed = TO_ACCUMULATOR_TYPE(weights[filter_idx]);
                #if DECOMPRESSION_ZP_TERM
-                    ACCUMULATOR_TYPE zp = TO_ACCUMULATOR_TYPE(decompression_zp[DECOMPRESSION_ZP_GET_INDEX_SAFE(0, oym, 0, 0)]);
+                    const uint zp_offset = DECOMPRESSION_ZP_GET_INDEX_SAFE(oym, y / DECOMPRESSION_ZP_GROUP_SIZE, 0, 0);
+                    ACCUMULATOR_TYPE zp = TO_ACCUMULATOR_TYPE(decompression_zp[zp_offset]);
                #else
                    ACCUMULATOR_TYPE zp = ACCUMULATOR_VAL_ZERO;
                #endif
-                DECOMPRESSION_SCALE_TYPE scale = decompression_scale[DECOMPRESSION_SCALE_GET_INDEX_SAFE(0, oym, 0, 0)];
-                ACCUMULATOR_TYPE filter_val = (TO_ACCUMULATOR_TYPE(filter_compressed) - TO_ACCUMULATOR_TYPE(zp)) * scale;
+                const uint decomp_offset = DECOMPRESSION_SCALE_GET_INDEX_SAFE(oym, y / DECOMPRESSION_SCALE_GROUP_SIZE, 0, 0);
+                DECOMPRESSION_SCALE_TYPE scale = decompression_scale[decomp_offset];
+            #endif
+
+            #if COMPRESSED_WEIGHTS_INT8
+                const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, oym, y, 0, 0);
+                ACCUMULATOR_TYPE filter_compressed = TO_ACCUMULATOR_TYPE(weights[filter_idx]);
+                ACCUMULATOR_TYPE filter_val = (filter_compressed - zp) * scale;
                dotProd += (ACCUMULATOR_TYPE)(input[input0_idx]) * (ACCUMULATOR_TYPE)(filter_val);
            #else
+                const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, oym, y, 0, 0);
                dotProd += (ACCUMULATOR_TYPE)(input[input0_idx]) * (ACCUMULATOR_TYPE)(weights[filter_idx]);
            #endif
        }
@@ -67,19 +73,25 @@ KERNEL(fc)(
           for (uint x = 0; x < INPUT0_SIZE_X; ++x)
            {
                const uint input0_idx = INPUT0_GET_INDEX(b, ifm, y, x);
-                const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, ifm, y, x);
                #if COMPRESSED_WEIGHTS
-                    FILTER_TYPE filter_compressed = weights[filter_idx];
                    #if DECOMPRESSION_ZP_TERM
-                        ACCUMULATOR_TYPE zp = decompression_zp[DECOMPRESSION_ZP_GET_INDEX_SAFE(0, ofm, 0, 0)];
+                        const uint zp_offset = DECOMPRESSION_ZP_GET_INDEX_SAFE(ofm, ifm / DECOMPRESSION_ZP_GROUP_SIZE, 0, 0);
+                        ACCUMULATOR_TYPE zp = TO_ACCUMULATOR_TYPE(decompression_zp[zp_offset]);
                    #else
                        ACCUMULATOR_TYPE zp = ACCUMULATOR_VAL_ZERO;
                    #endif
+                    const uint decomp_offset = DECOMPRESSION_SCALE_GET_INDEX_SAFE(ofm, ifm / DECOMPRESSION_SCALE_GROUP_SIZE, 0, 0);
+                    DECOMPRESSION_SCALE_TYPE scale = decompression_scale[decomp_offset];
+                #endif

-                    DECOMPRESSION_SCALE_TYPE scale = decompression_scale[DECOMPRESSION_SCALE_GET_INDEX_SAFE(0, ofm, 0, 0)];
-                    ACCUMULATOR_TYPE filter_val = (TO_ACCUMULATOR_TYPE(filter_compressed) - TO_ACCUMULATOR_TYPE(zp)) * scale;
+
+                #if COMPRESSED_WEIGHTS_INT8
+                    const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, ifm, y, x);
+                    FILTER_TYPE filter_compressed = weights[filter_idx];
+                    ACCUMULATOR_TYPE filter_val = (TO_ACCUMULATOR_TYPE(filter_compressed) - zp) * scale;
                    dotProd += (ACCUMULATOR_TYPE)(input[input0_idx]) * (ACCUMULATOR_TYPE)(filter_val);
                #else
+                    const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, ifm, y, x);
                    dotProd += (ACCUMULATOR_TYPE)(input[input0_idx]) * (ACCUMULATOR_TYPE)(weights[filter_idx]);
                #endif
            }
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_base.cpp
@@ -24,11 +24,23 @@ JitConstants FullyConnectedKernelBase::GetJitConstants(const fully_connected_par

    if (params.compressed) {
        jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS", 1)});
+        if (params.weights.GetDType() == WeightsType::INT8 || params.weights.GetDType() == WeightsType::UINT8) {
+            jit.AddConstants({MakeJitConstant("COMPRESSED_WEIGHTS_INT8", 1)});
+        }
+
+        const size_t scale_groups_num = params.decompression_scale.Feature().v;
+        const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v;
        jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_TERM", 1)});
        jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE", params.decompression_scale)});
+        jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_GROUPS_NUM", scale_groups_num)});
+        jit.AddConstants({MakeJitConstant("DECOMPRESSION_SCALE_GROUP_SIZE", scale_group_size)});
        if (params.has_decompression_zp) {
+            const size_t zp_groups_num = params.decompression_zero_point.Feature().v;
+            const size_t zp_group_size = params.weights.IFM().v / params.decompression_zero_point.Feature().v;
            jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_TERM", 1)});
            jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP", params.decompression_zero_point)});
+            jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_GROUPS_NUM", zp_groups_num)});
+            jit.AddConstants({MakeJitConstant("DECOMPRESSION_ZP_GROUP_SIZE", zp_group_size)});
        }
    }

--- a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
@@ -3,16 +3,19 @@
 //

 #include "convert_fc_to_compressed.hpp"
+#include <memory>

 #include "intel_gpu/op/fully_connected.hpp"
 #include "intel_gpu/op/fully_connected_compressed.hpp"

+#include "openvino/op/constant.hpp"
 #include "openvino/op/subtract.hpp"
 #include "openvino/op/matmul.hpp"
 #include "openvino/op/convert.hpp"
 #include "openvino/op/transpose.hpp"
 #include "openvino/op/reshape.hpp"
 #include "openvino/core/rt_info.hpp"
+#include "openvino/pass/pattern/op/pattern.hpp"
 #include "openvino/pass/pattern/op/wrap_type.hpp"
 #include "openvino/pass/pattern/op/or.hpp"
 #include "transformations/utils/utils.hpp"
@@ -23,7 +26,19 @@ namespace intel_gpu {
 ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed() {
    using namespace ov::pass::pattern;

-    auto weights_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
+    // Matches outputs of u8 or i8 element type that feed exactly one target input.
+    auto compressed_constant = [](const ov::Output<ov::Node>& output) {
+        const auto elem_type = output.get_element_type();
+        return (elem_type == ov::element::u8 || elem_type == ov::element::i8) && output.get_target_inputs().size() == 1;
+    };
+
+    auto reshape_3d_to_2d = [](const ov::Output<ov::Node>& output) {  // input 0 of static rank 3 -> output 0 of static rank 2
+        const auto src_pshape = output.get_node()->get_input_partial_shape(0);
+        const auto dst_pshape = output.get_node()->get_output_partial_shape(0);
+        return src_pshape.rank().is_static() && dst_pshape.rank().is_static() && src_pshape.size() == 3 && dst_pshape.size() == 2;
+    };
+
+    auto weights_m = wrap_type<ov::op::v0::Constant>(compressed_constant);
    auto convert_m = wrap_type<ov::op::v0::Convert>({weights_m});

    auto sub_const_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
@@ -34,11 +49,15 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
    auto mul_no_sub_m = wrap_type<ov::op::v1::Multiply>({convert_m, mul_const_m});
    auto mul_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{mul_with_sub_m, mul_no_sub_m});

+    auto reshape_const_m = wrap_type<ov::op::v0::Constant>();
+    auto reshape_m = wrap_type<ov::op::v1::Reshape>({mul_m, reshape_const_m}, reshape_3d_to_2d);
+
+    auto transpose_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{reshape_m, mul_m});
    auto transpose_const_m = wrap_type<ov::op::v0::Constant>();
-    auto transpose_m = wrap_type<ov::op::v1::Transpose>({mul_m, transpose_const_m});
-    auto weights_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{mul_m, transpose_m});
+    auto transpose_m = wrap_type<ov::op::v1::Transpose>({transpose_input, transpose_const_m});

    auto data_m = any_input();
+    auto weights_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{reshape_m, transpose_m, mul_m});
    auto fully_connected_m = wrap_type<op::FullyConnected>({data_m, weights_input_m});

    ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
@@ -52,53 +71,73 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
            return false;
        }

-        const auto& fc_input_a = fc->get_input_node_shared_ptr(0);
-        const auto& scale = pattern_map.at(mul_const_m).get_node_shared_ptr();
-        std::shared_ptr<ov::Node> optional_zero_point = nullptr;
+        bool has_transpose = pattern_map.count(transpose_m);
+        auto scale_shape = pattern_map.at(mul_const_m).get_shape();
+        bool grouped = std::count_if(scale_shape.begin(), scale_shape.end(), [](size_t d) { return d > 1; }) > 1;

-        ov::NodeVector nodes_to_copy_info{pattern_map.at(fully_connected_m).get_node_shared_ptr(),
-                                          pattern_map.at(convert_m).get_node_shared_ptr()};
-        if (pattern_map.count(mul_no_sub_m)) {
-            nodes_to_copy_info.push_back(pattern_map.at(mul_no_sub_m).get_node_shared_ptr());
-        }
-        if (pattern_map.count(mul_with_sub_m)) {
-            nodes_to_copy_info.push_back(pattern_map.at(mul_with_sub_m).get_node_shared_ptr());
-        }
+        // Returns a 2D constant as is and re-creates a 3D one with a 2D shape; other ranks fail the assert.
+        auto reshape_const_to_2d = [has_transpose, grouped](std::shared_ptr<ov::Node> node) {
+            auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
+            OPENVINO_ASSERT(constant != nullptr);
+            const ov::Shape current_shape = constant->get_shape();
+            if (current_shape.size() == 2)
+                return constant;
+            OPENVINO_ASSERT(current_shape.size() == 3);
+            // Grouped constants without a transpose fold the two trailing dims; all others fold the two leading ones.
+            const auto shape_2d = (grouped && !has_transpose) ? ov::Shape{current_shape[0], current_shape[1] * current_shape[2]}
+                                                              : ov::Shape{current_shape[0] * current_shape[1], current_shape[2]};
+            return std::make_shared<ov::op::v0::Constant>(*constant, shape_2d);
+        };
+
+        const auto& fc_input_a = fc->get_input_node_shared_ptr(0);
+        const auto& scale = reshape_const_to_2d(pattern_map.at(mul_const_m).get_node_shared_ptr());
+        std::shared_ptr<ov::Node> optional_zero_point = nullptr;

        const bool with_zero_point = pattern_map.count(subtract_m) > 0;
        if (with_zero_point) {
-            optional_zero_point = pattern_map.at(sub_const_m).get_node_shared_ptr();
-            nodes_to_copy_info.push_back(subtract_m);
+            optional_zero_point = reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr());
        }

-        std::shared_ptr<ov::Node> fc_input_b = pattern_map.at(weights_m).get_node_shared_ptr();
-        if (pattern_map.count(transpose_m)) {
+        std::shared_ptr<ov::Node> fc_input_b = reshape_const_to_2d(pattern_map.at(weights_m).get_node_shared_ptr());
+        std::shared_ptr<ov::Node> fc_input_scale = scale;
+        std::shared_ptr<ov::Node> fc_input_zp = optional_zero_point;
+        if (has_transpose) {
            const auto& transpose = pattern_map.at(transpose_m).get_node_shared_ptr();
-            const auto& transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr();
+            std::shared_ptr<ov::Node> transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr();
+            if (ov::shape_size(transpose_const->get_shape()) != fc_input_b->get_output_partial_shape(0).size()) {
+                std::vector<int32_t> new_order(fc_input_b->get_output_partial_shape(0).size());
+                std::iota(new_order.begin(), new_order.end(), 0);
+                std::swap(new_order[new_order.size() - 1], new_order[new_order.size() - 2]);
+                transpose_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{new_order.size()}, new_order);
+            }
+
            fc_input_b = transpose->clone_with_new_inputs({ fc_input_b->output(0), transpose_const });
+            fc_input_scale = transpose->clone_with_new_inputs({ scale->output(0), transpose_const });
+            if (with_zero_point)
+                fc_input_zp = transpose->clone_with_new_inputs({ optional_zero_point->output(0), transpose_const });
        }

        std::shared_ptr<ov::Node> new_fc = nullptr;
        if (with_zero_point) {
            new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a,
                                                                    fc_input_b,
-                                                                    scale,
-                                                                    optional_zero_point,
+                                                                    fc_input_scale,
+                                                                    fc_input_zp,
                                                                    fc->get_output_type());
        } else {
            new_fc = std::make_shared<op::FullyConnectedCompressed>(fc_input_a,
                                                                    fc_input_b,
-                                                                    scale,
+                                                                    fc_input_scale,
                                                                    fc->get_output_type());
        }

        new_fc->set_friendly_name(fc->get_friendly_name());
-        ov::copy_runtime_info(nodes_to_copy_info, new_fc);
+        ov::copy_runtime_info(m.get_matched_nodes(), new_fc);
        ov::replace_node(fc, new_fc);
        return true;
    };

-    auto m = std::make_shared<ov::pass::pattern::Matcher>(fully_connected_m);
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(fully_connected_m, "ConvertFullyConnectedToFullyConnectedCompressed");
    this->register_matcher(m, callback);
 }

--- a/src/plugins/intel_gpu/src/plugin/transformations/convert_matmul_to_fc.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_matmul_to_fc.cpp
@@ -160,7 +160,7 @@ ConvertMatMulToFullyConnected::ConvertMatMulToFullyConnected() {
        return true;
    };

-    auto m = std::make_shared<ov::pass::pattern::Matcher>(matmul_m);
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(matmul_m, "ConvertMatMulToFullyConnected");
    this->register_matcher(m, callback);
 }

--- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp
+++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/matmul_weights_decompression.cpp
@@ -2,19 +2,21 @@
 // SPDX-License-Identifier: Apache-2.0
 //

-#include "ov_models/builders.hpp"
+#include "common_test_utils/ov_tensor_utils.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/matmul.hpp"
 #include "shared_test_classes/base/layer_test_utils.hpp"
 #include "shared_test_classes/base/ov_subgraph.hpp"
 #include "transformations/rt_info/decompression.hpp"

-using namespace ngraph;
+using namespace ov;
 using namespace ov::test;

 namespace SubgraphTestsDefinitions {
 /*
- *                        Subtract_const(U8)
+ *                        Subtract_const(U8/NF4/U4)
 *                           /
- *    Weights(U8)       Convert(F32)
+ *    Weights(U8/NF4/U4)       Convert(F32)
 *       |               /
 *    Convert(F32)   Reshape(optional)
 *            \        /       Multiply_const(F32)
@@ -29,7 +31,20 @@ namespace SubgraphTestsDefinitions {
 *               |
 *              Bias
 */
-using MatmulWeightsDecompressionParams = std::tuple<std::vector<InputShape>,  // input shapes
+
+// Shape configuration of one test case: data input shape, weights shape and decompression group size.
+struct ShapeParams {
+    ShapeParams() = default;
+    ShapeParams(InputShape data_shape, ov::Shape weights_shape, int weights_group_size = -1)
+        : data_shape(std::move(data_shape)),
+          weights_shape(std::move(weights_shape)),
+          weights_group_size(weights_group_size) {}
+    InputShape data_shape;
+    ov::Shape weights_shape;
+    // Decompression group size. If the value is equal to -1 (also the default), ordinary decompression is used
+    int weights_group_size = -1;
+};
+using MatmulWeightsDecompressionParams = std::tuple<ShapeParams,              // input shapes
                                                    ov::test::ElementType,    // weights precision
                                                    ov::test::ElementType,    // activations precision
                                                    bool,                     // transpose on weights
@@ -40,7 +55,7 @@ using MatmulWeightsDecompressionParams = std::tuple<std::vector<InputShape>,  //
 class MatmulWeightsDecompression : public testing::WithParamInterface<MatmulWeightsDecompressionParams>, public SubgraphBaseTest {
 public:
    static std::string get_test_case_name(testing::TestParamInfo<MatmulWeightsDecompressionParams> obj) {
-        std::vector<InputShape> inputShapes;
+        ShapeParams shape_params;
        ov::test::ElementType weights_precision;
        ov::test::ElementType activations_precision;
        bool transpose;
@@ -48,7 +63,7 @@ public:
        bool reshape_on_decompression;
        std::map<std::string, std::string> additional_config;

-        std::tie(inputShapes,
+        std::tie(shape_params,
                 weights_precision,
                 activations_precision,
                 transpose,
@@ -57,20 +72,9 @@ public:
                 additional_config) = obj.param;

        std::ostringstream result;
-        for (const auto& shape : inputShapes) {
-            result << ov::test::utils::partialShape2str({shape.first}) << "_";
-        }
-        result << "TS=";
-        for (const auto& shape : inputShapes) {
-            result << "(";
-            if (!shape.second.empty()) {
-                auto itr = shape.second.begin();
-                do {
-                    result << ov::test::utils::vec2str(*itr);
-                } while (++itr != shape.second.end() && result << "_");
-            }
-            result << ")_";
-        }
+        result << "data_shape=" << shape_params.data_shape << "_";
+        result << "weights_shape=" << shape_params.weights_shape << "_";
+        result << "group_size=" << shape_params.weights_group_size << "_";
        result << "weights_precision=" << weights_precision << "_";
        result << "activations_precision=" << activations_precision << "_";
        result << "transpose_weights=" << transpose << "_";
@@ -87,34 +91,87 @@ public:
    }

 protected:
-    std::shared_ptr<ov::Model> init_subgraph(std::vector<ov::PartialShape>& inputShapes,
-                                             const ov::element::Type data_precision,
-                                             const ov::element::Type weights_precision,
-                                             const bool transpose_weights,
-                                             const bool add_subtract,
-                                             const bool reshape_on_decompression) {
-        ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(data_precision, inputShapes[0])};
+    std::shared_ptr<ov::Model> init_subgraph(const ov::PartialShape& data_shape,
+                                              const ov::Shape& weights_shape,
+                                              const int group_size,
+                                              const ov::element::Type data_precision,
+                                              const ov::element::Type weights_precision,
+                                              const bool transpose_weights,
+                                              const bool add_subtract,
+                                              const bool reshape_on_decompression) {
+        ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(data_precision, data_shape)};
+        const auto weights_subgraph = init_compressed_weights_subgraph(weights_shape,
+                                                                       group_size,
+                                                                       data_precision,
+                                                                       weights_precision,
+                                                                       transpose_weights,
+                                                                       add_subtract,
+                                                                       reshape_on_decompression);
+
+        auto mat_mul = std::make_shared<ov::op::v0::MatMul>(params[0], weights_subgraph);
+        return std::make_shared<ov::Model>(NodeVector{mat_mul}, params, "MatmulWeightsDecompression");
+    }
+
+    std::shared_ptr<ov::Node> init_compressed_weights_subgraph(const ov::Shape& weights_shape,
+                                                               const int group_size,
+                                                               const ov::element::Type data_precision,
+                                                               const ov::element::Type weights_precision,
+                                                               const bool transpose_weights,
+                                                               const bool add_subtract,
+                                                               const bool reshape_on_decompression_constant) {
        auto transpose_if_necessary = [&](const ov::Shape& shape) {
-            if (!transpose_weights)
-                return shape;
-            auto transposed_shape = shape;
-            std::swap(*transposed_shape.rbegin(), *(transposed_shape.rbegin() + 1));
-            return transposed_shape;
+            auto result_shape = shape;
+            if (transpose_weights)
+                std::swap(*result_shape.rbegin(), *(result_shape.rbegin() + 1));
+            return result_shape;
        };

-        auto weights_shape = transpose_if_necessary(inputShapes[1].to_shape());
-        auto weights = ngraph::builder::makeConstant<uint8_t>(weights_precision, weights_shape, {}, true);
+        const bool group_decompression = group_size != -1;
+        // Weights have shape [I, O], where
+        // I - input channels
+        // O - output channels
+        // In case of group decompression, input channels dimension is split into 2: I -> [N, G], where
+        // N - number of groups
+        // G - group size
+        auto transformed_weights_shape = transpose_if_necessary(weights_shape);
+        if (group_decompression) {
+            OPENVINO_ASSERT(weights_shape[0] % group_size == 0,
+                            "Weights input channels count (",
+                            weights_shape[0],
+                            ") must be divisible by decompression group size (",
+                            group_size,
+                            ").");
+            auto in_channel_idx = transpose_weights ? transformed_weights_shape.size() - 1 : transformed_weights_shape.size() - 2;
+            transformed_weights_shape[in_channel_idx] = weights_shape[0] / group_size;
+            transformed_weights_shape.insert(transformed_weights_shape.begin() + in_channel_idx + 1, group_size);
+        }
+        auto weights_tensor = ov::test::utils::create_and_fill_tensor(weights_precision, transformed_weights_shape);
+        auto weights = std::make_shared<ov::op::v0::Constant>(weights_tensor);
        weights->set_friendly_name("Compressed_weights");
        auto weights_convert = std::make_shared<ngraph::opset1::Convert>(weights, data_precision);

        std::shared_ptr<ov::Node> mul_parent = weights_convert;
-        auto output_channels = transpose_weights ? *(weights_shape.rbegin() + 1) : *weights_shape.rbegin();
-        auto scaleshift_target_shape = transpose_if_necessary(ov::Shape{1, output_channels});
-        auto scaleshift_const_shape = reshape_on_decompression ? ov::Shape{output_channels} : scaleshift_target_shape;
+        auto output_channels = *weights_shape.rbegin();
+
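+        // Decompression constants shape (shown for transposed weights; the non-transposed layout is given in parentheses):
+        // Ordinary decompression: [O, 1] ([1, O])
+        // Group decompression: [O, N, 1] ([N, 1, O])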
+        ov::Shape scaleshift_target_shape{output_channels};
+        scaleshift_target_shape.insert(scaleshift_target_shape.begin(), group_decompression ? weights_shape[0] / group_size : 1);
+        scaleshift_target_shape = transpose_if_necessary(scaleshift_target_shape);
+        if (group_decompression) {
+            auto in_channel_idx = transpose_weights ? scaleshift_target_shape.size() - 1 : scaleshift_target_shape.size() - 2;
+            scaleshift_target_shape.insert(scaleshift_target_shape.begin() + in_channel_idx + 1, 1);
+        }
+
+        auto scaleshift_const_shape = scaleshift_target_shape;
+        if (reshape_on_decompression_constant)
+            scaleshift_const_shape.erase(std::remove(scaleshift_const_shape.begin(), scaleshift_const_shape.end(), 1), scaleshift_const_shape.end());
        if (add_subtract) {
-            auto shift_const = ngraph::builder::makeConstant<uint8_t>(weights_precision, scaleshift_const_shape, {}, true);
+            auto shift_tensor = ov::test::utils::create_and_fill_tensor(weights_precision, scaleshift_const_shape);
+            auto shift_const = std::make_shared<ov::op::v0::Constant>(shift_tensor);
            std::shared_ptr<ov::Node> shift_convert = std::make_shared<ngraph::opset1::Convert>(shift_const, data_precision);
-            if (reshape_on_decompression) {
+            if (reshape_on_decompression_constant) {
                auto shift_reshape_const = ov::opset10::Constant::create(ov::element::i32, {scaleshift_target_shape.size()}, scaleshift_target_shape);
                auto shift_reshape = std::make_shared<ov::opset10::Reshape>(shift_convert, shift_reshape_const, false);
                shift_convert = shift_reshape;
@@ -122,32 +179,36 @@ protected:
            mul_parent = std::make_shared<ov::opset10::Subtract>(weights_convert, shift_convert);
        }

-        std::shared_ptr<ov::Node> scale_const = ngraph::builder::makeConstant<float>(data_precision, scaleshift_const_shape, {}, true);
-        if (reshape_on_decompression) {
+        auto scale_tensor = ov::test::utils::create_and_fill_tensor(data_precision, scaleshift_const_shape, 1, -0.5, 10000);
+        std::shared_ptr<ov::Node> scale_const = std::make_shared<ov::op::v0::Constant>(scale_tensor);
+        if (reshape_on_decompression_constant) {
            auto scale_reshape_const = ov::opset10::Constant::create(ov::element::i32, {scaleshift_target_shape.size()}, scaleshift_target_shape);
            auto scale_reshape = std::make_shared<ov::opset10::Reshape>(scale_const, scale_reshape_const, false);
            scale_const = scale_reshape;
        }
-        auto multiply = std::make_shared<ov::opset10::Multiply>(mul_parent, scale_const);
+        std::shared_ptr<ov::Node> last_node = std::make_shared<ov::opset10::Multiply>(mul_parent, scale_const);

-        std::shared_ptr<ov::Node> matmul_weights = multiply;
+        if (group_decompression) {
+            auto reshape_target_shape = transpose_weights ? std::vector<int>{-1, static_cast<int>(weights_shape[0])}
+                                                          : std::vector<int>{static_cast<int>(weights_shape[0]), -1};
+            auto target_shape_node = ov::opset10::Constant::create(ov::element::i32, {reshape_target_shape.size()}, reshape_target_shape);
+            last_node = std::make_shared<ov::opset10::Reshape>(last_node, target_shape_node, false);
+        }
        if (transpose_weights) {
-            const size_t rank = matmul_weights->get_output_partial_shape(0).size();
+            const size_t rank = last_node->get_output_partial_shape(0).size();
            std::vector<int> order(rank);
            std::iota(order.begin(), order.end(), 0);
            std::swap(*order.rbegin(), *(order.rbegin() + 1));
            auto transpose_constant = ov::opset10::Constant::create(ov::element::i32, {rank}, order);
-            auto transpose = std::make_shared<ov::opset10::Transpose>(matmul_weights, transpose_constant);
-            matmul_weights = transpose;
+            last_node = std::make_shared<ov::opset10::Transpose>(last_node, transpose_constant);
        }
-        auto matMul = builder::makeMatMul(params[0], matmul_weights);
-        return std::make_shared<ov::Model>(NodeVector{matMul}, params, "MatmulWeightsDecompression");
+        return last_node;
    }

    void SetUp() override {
        targetDevice = ov::test::utils::DEVICE_GPU;

-        std::vector<InputShape> inputShapes;
+        ShapeParams shape_params;
        ov::test::ElementType weights_precision;
        ov::test::ElementType activations_precision;
        bool transpose_weights;
@@ -155,7 +216,7 @@ protected:
        bool reshape_on_decompression;
        std::map<std::string, std::string> additional_config;

-        std::tie(inputShapes,
+        std::tie(shape_params,
                 weights_precision,
                 activations_precision,
                 transpose_weights,
@@ -164,14 +225,47 @@ protected:
                 additional_config) = GetParam();

        configuration.insert(additional_config.begin(), additional_config.end());
-        init_input_shapes(inputShapes);
+        init_input_shapes({shape_params.data_shape, {{}, {{shape_params.weights_shape}}}});

        inType = outType = activations_precision;

-        function = init_subgraph(inputDynamicShapes, activations_precision, weights_precision, transpose_weights, decompression_sub, reshape_on_decompression);
+        function = init_subgraph(inputDynamicShapes[0],
+                                 shape_params.weights_shape,
+                                 shape_params.weights_group_size,
+                                 activations_precision,
+                                 weights_precision,
+                                 transpose_weights,
+                                 decompression_sub,
+                                 reshape_on_decompression);
+
+
+        if (activations_precision == ov::element::f16) {
+            auto weights_size = ov::shape_size(shape_params.weights_shape);
+            auto weights_input_channels = weights_size / (transpose_weights ? shape_params.weights_shape[0] : shape_params.weights_shape.back());
+            // The range of absolute values during accumulation may be quite large (> 200), so the fp16 representation and math error exceeds the default threshold
+            if (weights_input_channels > 2048) {
+                abs_threshold = 4.0f;
+            } else {
+                abs_threshold = 1.0f;
+            }
+        }
    }

-    void checkResults() {
+    void generate_inputs(const std::vector<ngraph::Shape>& target_input_static_shapes) override {
+          inputs.clear();
+          const auto& model_inputs = function->inputs();
+          for (size_t i = 0; i < model_inputs.size(); ++i) {
+                const auto& model_input = model_inputs[i];
+                ov::Tensor tensor = ov::test::utils::create_and_fill_tensor(model_input.get_element_type(),
+                                                                            target_input_static_shapes[i],
+                                                                            2,
+                                                                            -1,
+                                                                            10000);
+                inputs.insert({model_input.get_node_shared_ptr(), tensor});
+          }
+    }
+
+    void check_results() {
        const auto& test_param = GetParam();
        ov::test::ElementType weights_precision = std::get<1>(test_param);
        for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
@@ -185,24 +279,20 @@ protected:
 TEST_P(MatmulWeightsDecompression, CompareWithRefs) {
    SKIP_IF_CURRENT_TEST_IS_DISABLED()
    run();
-    checkResults();
+    check_results();
 }

 namespace {

 const std::vector<ov::test::ElementType> activations_precisions = {ov::element::f32, ov::element::f16};
 const std::vector<ov::test::ElementType> weights_precisions = {ov::element::u8};
-const std::vector<std::vector<InputShape>> input_shapes_basic = {
-    {{{-1, -1, -1}, {{1, 4, 16}, {10, 16, 16}}}, {{}, {{16, 32}}}},
-    {{{}, {{10, 40, 496}}}, {{}, {{1, 496, 240}}}},
-    {{{}, {{1, 4, 48}}}, {{}, {{48, 256}}}},
-    {{{}, {{11, 339, 377}}}, {{}, {{377, 335}}}},
-    {{{}, {{1, 4, 32}}}, {{}, {{32, 256}}}},
-    {{{}, {{1, 4, 512}}}, {{}, {{512, 256}}}},
-    {{{}, {{1, 16, 32}}}, {{}, {{32, 64}}}},
-    {{{}, {{2, 4, 32}}}, {{}, {{32, 65}}}},
-    {{{}, {{3, 12, 768}}}, {{}, {{768, 1024}}}},
-    {{{}, {{11, 339, 577}}}, {{}, {{577, 335}}}},
+const std::vector<ShapeParams> input_shapes_basic = {
+    {{{-1, -1, -1}, {{1, 4, 16}, {10, 16, 16}}}, {16, 32}},
+    {{{}, {{1, 4, 16}}}, {16, 32}, 2ul},
+    {{{}, {{1, 4, 16}}}, {1, 16, 32}},
+    {{{}, {{10, 40, 496}}}, {1, 496, 240}},
+    {{{}, {{1, 4, 48}}}, {48, 256}},
+    {{{}, {{11, 339, 377}}}, {377, 335}}
 };

 INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_basic,
@@ -216,15 +306,16 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_basic,
                                            ::testing::Values(std::map<std::string, std::string>())),
                         MatmulWeightsDecompression::get_test_case_name);

-const std::vector<std::vector<InputShape>> input_shapes_corner_cases_basic = {
-    {{{-1, -1, -1}, {{1, 4, 16}}}, {{}, {{1, 16, 32}}}},
-    {{{}, {{1, 4, 16}}}, {{}, {{1, 16, 32}}}},
-    {{{-1, -1, -1}, {{1, 4, 16}}}, {{}, {{16, 32}}}},
-    {{{-1, -1, -1, -1}, {{1, 1, 4, 16}}}, {{}, {{1, 1, 16, 32}}}},
-    {{{}, {{1, 1, 4, 16}}}, {{}, {{1, 1, 16, 32}}}},
+const std::vector<ShapeParams> input_shapes_corner_cases_basic = {
+    {{{-1, -1, -1}, {{1, 4, 16}}}, {1, 16, 32}},
+    {{{-1, -1, -1}, {{1, 4, 16}}}, {16, 32}},
+    {{{-1, -1, 16}, {{1, 4, 16}}}, {16, 32}, 4},
 };
-const std::vector<std::vector<InputShape>> input_shapes_corner_cases_big = {
-    {{{-1, -1, -1}, {{10, 40, 480}, {11, 40, 480}}}, {{}, {{1, 480, 256}}}},
+const std::vector<ShapeParams> input_shapes_corner_cases_big = {
+    {{{-1, -1, -1}, {{10, 40, 480}, {11, 40, 480}}}, {1, 480, 256}},
+    {{{-1, -1, -1}, {{1, 1, 4096}}}, {4096, 4096}, 128},
+    {{{-1, -1, -1}, {{1, 1, 4096}}}, {4096, 4096}},
+    {{{-1, 4096}, {{1, 4096}}}, {4096, 4096}, 128},
 };

 const std::vector<bool> transpose_weights = {true, false};
@@ -242,7 +333,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_corner_cases_basic,
                                            ::testing::Values(std::map<std::string, std::string>{})),
                         MatmulWeightsDecompression::get_test_case_name);

-INSTANTIATE_TEST_SUITE_P(smoke_MatMulCompressedWeights_corner_cases_big,
+INSTANTIATE_TEST_SUITE_P(MatMulCompressedWeights_corner_cases_big,
                         MatmulWeightsDecompression,
                         ::testing::Combine(::testing::ValuesIn(input_shapes_corner_cases_big),
                                            ::testing::ValuesIn(weights_precisions),
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -663,21 +663,22 @@ TEST(fully_connected_gpu, compressed_scale_zp_bias) {
    auto& engine = get_test_engine();

    auto input_mem = engine.allocate_memory({ {1, 2, 4}, data_types::f32, format::bfyx });
-    auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::f32, format::bfyx });
+    auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::u8, format::bfyx });
    auto bias_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });
-    auto scale_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });
-    auto zp_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });
+    auto scale_mem = engine.allocate_memory({ {8, 1}, data_types::f32, format::bfyx });
+    auto zp_mem = engine.allocate_memory({ {8, 1}, data_types::f32, format::bfyx });

    set_values(input_mem, { -0.5f, 2.0f, 0.5f, 1.0f,
                            0.5f, -2.0f, -0.5f, -1.0f });
-    set_values(weights_mem, { 1.5f,  1.0f,  0.5f, -1.0f,
-                              0.0f,  0.5f,  0.5f, -0.5f,
-                             -2.0f, -0.5f,  1.0f,  1.5f,
-                             -2.0f, -0.5f,  1.0f,  1.5f,
-                              2.0f,  0.5f, -1.0f, -1.5f,
-                              2.0f,  0.5f, -1.0f, -1.5f,
-                             -1.5f, -1.0f, -0.5f,  1.0f,
-                              0.0f, -0.5f, 0.5f, 0.5f });
+    set_values<uint8_t>(weights_mem, { 1, 2, 3, 4,
+                                       5, 6, 7, 8,
+                                       9, 10, 11, 12,
+                                       13, 14, 15, 0,
+                                       15, 14, 13, 12,
+                                       11, 10, 9, 8,
+                                       7, 6, 5, 4,
+                                       3, 2, 1, 0});
+

    set_values(bias_mem, { 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 2.0f  });
    set_values(scale_mem, { 2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 2.0f  });
@@ -709,8 +710,7 @@ TEST(fully_connected_gpu, compressed_scale_zp_bias) {
    ov::PartialShape expected_shape{1, 2, 8};
    ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());

-    std::vector<float> expected_result = {-4.0f, -23.0f, 11.0f, 0.0f, -2.0f, -3.5f, -30.0f, -10.5f,
-                                          6.0f, 19.0f, -5.0f, -8.0f, 12.0f, -8.5f, 44.0f, 14.5f};
+    std::vector<float> expected_result = {13.f, 58.f, -51.f, -108.f, 18.5f, -18.f, 1.f, -4.f, -11.f, -62.f, 57.f, 100.f, -8.5f, 6.f, 13.f, 8.f, };

    for (size_t i = 0; i < expected_result.size(); i++) {
        ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
@@ -721,20 +721,20 @@ TEST(fully_connected_gpu, compressed_scale_bias) {
    auto& engine = get_test_engine();

    auto input_mem = engine.allocate_memory({ {1, 2, 4}, data_types::f32, format::bfyx });
-    auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::f32, format::bfyx });
+    auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::u8, format::bfyx });
    auto bias_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });
    auto scale_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx });

    set_values(input_mem, { -0.5f, 2.0f, 0.5f, 1.0f,
                            0.5f, -2.0f, -0.5f, -1.0f });
-    set_values(weights_mem, { 1.5f,  1.0f,  0.5f, -1.0f,
-                              0.0f,  0.5f,  0.5f, -0.5f,
-                             -2.0f, -0.5f,  1.0f,  1.5f,
-                             -2.0f, -0.5f,  1.0f,  1.5f,
-                              2.0f,  0.5f, -1.0f, -1.5f,
-                              2.0f,  0.5f, -1.0f, -1.5f,
-                             -1.5f, -1.0f, -0.5f,  1.0f,
-                              0.0f, -0.5f, 0.5f, 0.5f });
+    set_values<uint8_t>(weights_mem, { 1, 2, 3, 4,
+                                       5, 6, 7, 8,
+                                       9, 10, 11, 12,
+                                       13, 14, 15, 0,
+                                       15, 14, 13, 12,
+                                       11, 10, 9, 8,
+                                       7, 6, 5, 4,
+                                       3, 2, 1, 0});

    set_values(bias_mem, { 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f });
    set_values(scale_mem, { 2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 1.0f });
@@ -764,8 +764,7 @@ TEST(fully_connected_gpu, compressed_scale_bias) {
    ov::PartialShape expected_shape{1, 2, 8};
    ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());

-    std::vector<float> expected_result = {2.0f, 1.0f, -1.0f, -12.0f, 4.0f, -5.0f, 6.0f, -8.25f,
-                                          0.0f, -5.0f, 7.0f, 4.0f, 6.0f, -7.0f, 8.0f, -7.75f};
+    std::vector<float> expected_result = {19.f, 40.f, 69.f, 54.f, 83.f, 48.f, 37.f, -2.f, -17.f, -44.f, -63.f, -62.f, -73.f, -60.f, -23.f, -14.f };

    for (size_t i = 0; i < expected_result.size(); i++) {
        ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
@@ -776,19 +775,19 @@ TEST(fully_connected_gpu, compressed_scale_fp16) {
    auto& engine = get_test_engine();

    auto input_mem = engine.allocate_memory({ { 2, 4}, data_types::f16, format::bfyx });
-    auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::f16, format::bfyx });
-    auto scale_mem = engine.allocate_memory({ {1, 8}, data_types::f16, format::bfyx });
+    auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::u8, format::bfyx });
+    auto scale_mem = engine.allocate_memory({ {8, 1}, data_types::f16, format::bfyx });

    set_values<ov::float16>(input_mem, { ov::float16(-0.5f), ov::float16(2.0f),  ov::float16(0.5f),  ov::float16(1.0f),
                                     ov::float16(0.5f),  ov::float16(-2.0f), ov::float16(-0.5f), ov::float16(-1.0f) });
-    set_values<ov::float16>(weights_mem, {ov::float16( 1.5f), ov::float16( 1.0f), ov::float16( 0.5f), ov::float16(-1.0f),
-                                      ov::float16( 0.0f), ov::float16( 0.5f), ov::float16( 0.5f), ov::float16(-0.5f),
-                                      ov::float16(-2.0f), ov::float16(-0.5f), ov::float16( 1.0f), ov::float16( 1.5f),
-                                      ov::float16(-2.0f), ov::float16(-0.5f), ov::float16( 1.0f), ov::float16( 1.5f),
-                                      ov::float16( 2.0f), ov::float16( 0.5f), ov::float16(-1.0f), ov::float16(-1.5f),
-                                      ov::float16( 2.0f), ov::float16( 0.5f), ov::float16(-1.0f), ov::float16(-1.5f),
-                                      ov::float16(-1.5f), ov::float16(-1.0f), ov::float16(-0.5f), ov::float16( 1.0f),
-                                      ov::float16( 0.0f), ov::float16(-0.5f), ov::float16(0.5f),  ov::float16( 0.5f) });
+    set_values<uint8_t>(weights_mem, { 1, 2, 3, 4,
+                                       5, 6, 7, 8,
+                                       9, 10, 11, 12,
+                                       13, 14, 15, 0,
+                                       15, 14, 13, 12,
+                                       11, 10, 9, 8,
+                                       7, 6, 5, 4,
+                                       3, 2, 1, 0});

    set_values<ov::float16>(scale_mem, {ov::float16(2.0f), ov::float16(4.0f), ov::float16(-2.0f), ov::float16(-4.0f), ov::float16(0.5f), ov::float16(-0.5f), ov::float16(2.0f), ov::float16(2.0f)});

@@ -817,8 +816,8 @@ TEST(fully_connected_gpu, compressed_scale_fp16) {
    ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());

   std::vector<ov::float16> expected_result = {
-        ov::float16(1.0f), ov::float16( 3.0f), ov::float16(-4.0f), ov::float16(-8.0f), ov::float16(-1.0f), ov::float16( 1.0f), ov::float16(-1.0f), ov::float16(-0.5f),
-        ov::float16(-1.0f), ov::float16(-3.0f), ov::float16( 4.0f), ov::float16( 8.0f), ov::float16( 1.0f), ov::float16(-1.0f), ov::float16( 1.0f), ov::float16( 0.5f)};
+       ov::float16(18), ov::float16(84), ov::float16(-66), ov::float16(-116), ov::float16(19.5), ov::float16(-13.5), ov::float16(30), ov::float16(6),
+       ov::float16(-18), ov::float16(-84), ov::float16(66), ov::float16(116), ov::float16(-19.5), ov::float16(13.5), ov::float16(-30), ov::float16(-6) };

    for (size_t i = 0; i < expected_result.size(); i++) {
        ASSERT_FLOAT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;