[GPU] fp16-int8 mixed precision (#9483)

* Use fp16-int8 mixed precision, instead of fp32-int8 mixed precision for onednn
* Allow quantization fusion into bsv32_fsv16 conv
* For conv, do not select bsv16_fsv16. Select bsv32_fsv16 for mixed-layout
* depthwise conv is supported even though it is not fp16
* Allow resample kernel to work as cross-layout
* test case for cross-layout of resample_opt kernel
* Select onednn-friendly format from cldnn conv
* Optimization for fp16 mixed precision
* Choose mixed layout in case of mixed precision from reorder_inputs
* Support for mixed precision from depth_to_space
* Do not convert first conv format
* Use onednn for FC output of fp16
* Choose bsv8_fsv4 from quantization even when conv kernel size is not 7
* Select cldnn for first conv when input feature depth is 1
* For first conv, use onednn only when kernel size is 7x7
* Use short variable name and added is_i8_u8 helper function

Co-authored-by: Kim,SungEun <sungeun.kim@intel.com>
2022-01-11 17:56:36 +09:00
parent 2c6078e96c
commit ef390902ec
8 changed files with 150 additions and 53 deletions
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp
@@ -116,6 +116,10 @@ struct data_type_traits {
        return (static_cast<uint32_t>(data_type) & float_type_mask) != 0;
    }

+    static bool is_i8_u8(data_types data_type) {
+        return data_type == data_types::i8 || data_type == data_types::u8;
+    }
+
    static size_t align_of(data_types data_type) {
        switch (data_type) {
            case data_types::bin:
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp
@@ -844,13 +844,17 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
                                 input_data.as<binary_convolution>().get_primitive()->dilation.spatial[0] == 1 &&
                                 input_data.as<binary_convolution>().get_primitive()->dilation.spatial[1] == 1;

+            auto expected_format = _lo.get_preferred_format(input_data);
+
            should_fuse |= input_data.is_type<convolution>() && conv_supports_fusings(input_data.as<convolution>()) &&
                           quantize_node.get_scale_shift_opt() &&
                           ((out_layout.data_type == data_types::f32 || out_layout.data_type == data_types::f16)  ||
                            input_data.get_output_layout().format == format::b_fs_yx_fsv16 ||
+                            input_data.get_output_layout().format == format::bs_fs_yx_bsv32_fsv16 ||
                            (_lo.should_select_b_fs_yx_fsv16_layout(input_data.as<convolution>(), input_data.get_dependency(1).get_output_layout()) &&
                             !is_grouped_conv(input_data.as<convolution>())) ||
                           // Avoid fusing to b_fs_yx_fsv16 (and similar) kernels
+                           expected_format == cldnn::format::bs_fs_yx_bsv32_fsv16 /* Allow quantization fusing for onednn */ ||
                           ((input_data.get_dependency(0).get_output_layout().data_type == data_types::u8 ||
                           input_data.get_dependency(0).get_output_layout().data_type == data_types::i8) &&
                           (out_layout.data_type == data_types::u8 || out_layout.data_type == data_types::i8)));
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
@@ -562,16 +562,54 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
        }
    };

-    const auto reorder_weights_convolution = [&p, &lo, &rf](typed_program_node<convolution>& conv_node) {
-        auto& weights = conv_node.weights();
-        auto weights_layout = weights.get_output_layout();
-        if (!format::is_simple_data_format(weights_layout.format) && !weights.is_type<data>() && !weights.is_constant()) {
-            auto dims = weights_layout.format.dimension();
-            auto preferred_format = dims <= 4 ? format::bfyx : dims == 5 ? format::bfzyx : format::bfwzyx;
-            auto reorder = rf.get_reorder(weights.id(), weights_layout,
-                layout{ weights_layout.data_type, preferred_format, weights_layout.size });
-            if (reorder.first) {
-                p.add_intermediate(reorder.first, conv_node, 1, !reorder.second);
+    const auto reorder_convolution = [&p, &lo, &rf](typed_program_node<convolution>& conv_node) {
+        {
+            // reorder weights convolution
+            auto& weights = conv_node.weights();
+            auto weights_layout = weights.get_output_layout();
+            if (!format::is_simple_data_format(weights_layout.format) && !weights.is_type<data>() && !weights.is_constant()) {
+                auto dims = weights_layout.format.dimension();
+                auto preferred_format = dims <= 4 ? format::bfyx : dims == 5 ? format::bfzyx : format::bfwzyx;
+                auto reorder = rf.get_reorder(weights.id(), weights_layout,
+                    layout{ weights_layout.data_type, preferred_format, weights_layout.size });
+                if (reorder.first) {
+                    p.add_intermediate(reorder.first, conv_node, 1, !reorder.second);
+                }
+            }
+        }
+
+        std::vector<format> wrong_format = {format::b_fs_yx_fsv16, format::bs_fs_yx_bsv32_fsv16};
+        std::vector<format> correct_format = {format::b_fs_yx_fsv32, format::bs_fs_yx_bsv32_fsv32};
+        for (int i = 0; i < wrong_format.size(); i++) {
+            // reorder for onednn mixed-precision conv
+            // If the layouts are like below, change input layout to fsv32.
+            // From:
+            //   (bsv32_fsv16.u8) --> conv --> (bsv32_fsv16.fp16)
+            // To:
+            //   (bsv32_fsv16.u8) --> reorder --> (bsv32_fsv32.u8) --> conv --> (bsv32_fsv16.fp16)
+            //
+            // Do not apply such change for b=1 first conv
+
+            auto prev_node = conv_node.get_dependencies().front();
+            auto old_layout = prev_node->get_output_layout();
+            auto conv_layout = conv_node.get_output_layout();
+            if (lo.get_optimization_attributes().use_onednn_impls
+                    && conv_layout.format == wrong_format[i]
+                    && data_type_traits::is_i8_u8(old_layout.data_type)
+                    && (old_layout.format == wrong_format[i])
+                    && !(old_layout.size.batch[0] == 1 && old_layout.size.feature[0] <= 4)) {
+                auto new_layout = old_layout;
+                new_layout.format = correct_format[i];
+                auto new_input = rf.get_reorder(prev_node->id(),
+                                                old_layout,
+                                                new_layout);
+
+                if (new_input.first) {
+                    p.add_intermediate(new_input.first, conv_node, 0, !new_input.second);
+                }
+
+                // Prevent layout propagation as we are using mixed precision for conv
+                conv_node.get_dependencies().front()->set_output_layout(new_layout, false);
            }
        }
    };
@@ -598,7 +636,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
            reorder_input_detection_output,
            reorder_input_binary_convolution,
            reorder_input_and_weights_deconvolution,
-            reorder_weights_convolution,
+            reorder_convolution,
            reorder_input_fully_connected);
   }

--- a/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp
@@ -67,6 +67,14 @@ attach_depth_to_space_impl::attach_depth_to_space_impl() {
        std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
        std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
        std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
+        std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16),
+        std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16),
+        std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv16),
+        std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv16),
+        std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
+        std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
+        std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
+        std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
    });
 }

--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -9,6 +9,7 @@

 #include "data_inst.h"
 #include "reorder_inst.h"
+#include "resample_inst.h"
 #include "reshape_inst.h"
 #include "generic_layer.hpp"
 #include <sstream>
@@ -208,6 +209,10 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
        }
    }

+    // Ref kernels are the main for depth_to_space and region_yolo. It can do anything.
+    if (next.is_type<depth_to_space>() || next.is_type<region_yolo>())
+        return true;
+
    if (next.is_type<reorder>()) {
        // Avoid fusing current reorder to fuse next reorder
        if (next.get_users().size() == 1 && next.get_users().front()->is_type<convolution>() && use_onednn_impls) {
@@ -221,6 +226,14 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
        return true;
    }

+    // resample_opt kernel can work cross-layout between fsv16 and fsv32
+    if (next.is_type<resample>() &&
+        (fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_yx_fsv32
+            || fmt_prev == format::bs_fs_yx_bsv32_fsv16 || fmt_prev == format::bs_fs_yx_bsv32_fsv32) &&
+        (fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_yx_fsv32
+            || fmt_next == format::bs_fs_yx_bsv32_fsv16 || fmt_next == format::bs_fs_yx_bsv32_fsv32))
+        return true;
+
    if (next.is_type<pooling>() &&
        (((prev_simple && next_simple) && (prev_dt == next_dt)) ||
        ((fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::bfyx) && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))
@@ -319,21 +332,29 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,

        // Remove Reorder to support mixed format convolutions of bsv32fsv16 or bsv32fsv32 output
        if (next.is_type<convolution>() && (prev.is_type<eltwise>() || prev.is_type<quantize>()) &&
-            (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) &&
+            (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2 || fmt_prev == format::bs_fs_yx_bsv8_fsv4) &&
            ((fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
            (fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4))))
            return true;

        // Remove Reorder for Convolution: b_fs_yx_fsv32 (i8/u8) -> b_fs_yx_fsv16 (fp32/fp16)
-        if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv32 && fmt_next == format::b_fs_yx_fsv16 &&
-            !data_type_traits::is_floating_point(prev_dt) && data_type_traits::is_floating_point(next_dt)) {
-            auto& node = prev.get_users().front();
-            // Avoid to fuse padding reorder to previous onednn convolution
-            if (prev.get_preferred_impl_type() == impl_types::onednn &&
-                (node->get_output_layout().data_padding != prev.get_output_layout().data_padding))
-                return false;
-            else
-                return true;
+        //                                 b_fs_yx_fsv16 (fp32/fp16) -> b_fs_yx_fsv32 (i8/u8)
+        if (next.is_type<convolution>()) {
+            const bool fsv32_to_fsv16 = (((fmt_prev == format::b_fs_yx_fsv32 && fmt_next == format::b_fs_yx_fsv16) ||
+                                          (fmt_prev == format::bs_fs_yx_bsv32_fsv32 && fmt_next == format::bs_fs_yx_bsv32_fsv16)) &&
+                                          data_type_traits::is_i8_u8(prev_dt) && data_type_traits::is_floating_point(next_dt));
+            const bool fsv16_to_fsv32 = (((fmt_prev == format::b_fs_yx_fsv16 && fmt_next == format::b_fs_yx_fsv32) ||
+                                          (fmt_prev == format::bs_fs_yx_bsv32_fsv16 && fmt_next == format::bs_fs_yx_bsv32_fsv32)) &&
+                                          data_type_traits::is_floating_point(prev_dt) && data_type_traits::is_i8_u8(next_dt));
+            if (fsv32_to_fsv16 || fsv16_to_fsv32) {
+                auto& node = prev.get_users().front();
+                // Avoid to fuse padding reorder to previous onednn convolution
+                if (prev.get_preferred_impl_type() == impl_types::onednn &&
+                    (node->get_output_layout().data_padding != prev.get_output_layout().data_padding))
+                    return false;
+                else
+                    return true;
+            }
        }

        if (next.is_type<quantize>())
@@ -367,10 +388,12 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
 }

 bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node* next, format fmt_prev, format fmt_next) {
-    if (next == nullptr) {
-        // Ref kernels are the main for depth_to_space and region_yolo. It can do anything
-        return prev.is_type<depth_to_space>() || prev.is_type<region_yolo>();
-    }
+    // Ref kernels are the main for depth_to_space and region_yolo. It can do anything. Should not see next.
+    if (prev.is_type<depth_to_space>() || prev.is_type<region_yolo>())
+        return true;
+
+    if (next == nullptr)
+        return false;

    auto dt_prev = prev.get_output_layout().data_type;
    auto dt_next = next->get_output_layout().data_type;
@@ -379,6 +402,14 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
    if (prev.is_type<reorder>())
        return true;

+    // resample_opt kernel can work cross-layout between fsv16 and fsv32
+    if (prev.is_type<resample>() &&
+        (fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_yx_fsv32
+            || fmt_prev == format::bs_fs_yx_bsv32_fsv16 || fmt_prev == format::bs_fs_yx_bsv32_fsv32) &&
+        (fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_yx_fsv32
+            || fmt_next == format::bs_fs_yx_bsv32_fsv16 || fmt_next == format::bs_fs_yx_bsv32_fsv32))
+        return true;
+
    if (prev.is_type<binary_convolution>() && fmt_next == format::b_fs_yx_fsv16)
        return true;

@@ -900,7 +931,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
        }
    }

-    if (use_onednn_impls) {
+    if (use_onednn_impls && onednn_valid_post_ops) {
        std::function<bool(const program_node&)> has_any_convolutions_below;
        has_any_convolutions_below = [&](const program_node& node) -> bool {
            for (auto& usr : node.get_users()) {
@@ -913,11 +944,10 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,

        /* ***************************** OneDNN impls format selection part ****************************** */
        bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0);
-        // TODO: uncomment this code when corresponding fsv32 optimizations inside clDNN will be implemented
-        // bool i8_u8_output = output_layout.data_type == data_types::u8 || output_layout.data_type == data_types::i8;
+        bool i8_u8_output = data_type_traits::is_i8_u8(output_layout.data_type);
        // bool is_first_conv = input_layout.size.feature[0] < 4;

-        if (i8_u8_input) {
+        if (i8_u8_output) {
            if ((non_grouped || valid_grouped || valid_int8_dw) && onednn_valid_post_ops && is_2d) {
                if (input_layout.size.batch[0] >= 16) {
                    expected_format = cldnn::format::bs_fs_yx_bsv32_fsv32;
@@ -937,23 +967,19 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
                expected_format = imad_case(node);
            }
            expected_tensor = current_layout.size;
-        } else if (input_layout.data_type == data_types::f16 && is_2d) {
+        } else if ((output_layout.data_type == data_types::f16 || output_layout.data_type == data_types::f32) && is_2d) {
            expected_tensor = current_layout.size;

            if (input_layout.size.batch[0] >= 16 && onednn_valid_post_ops) {
-                if (output_layout.data_type == input_layout.data_type) {
-                    if (non_grouped || valid_grouped || is_dw) {
-                        expected_format = cldnn::format::bs_fs_yx_bsv32_fsv16;
-                    } else {
-                        expected_format = cldnn::format::b_fs_yx_fsv16;
-                    }
+                if (non_grouped || valid_grouped || is_dw) {
+                    expected_format = cldnn::format::bs_fs_yx_bsv32_fsv16;
                } else {
-                    expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
+                    expected_format = cldnn::format::b_fs_yx_fsv16;
                }
            } else {
                expected_format = cldnn::format::b_fs_yx_fsv16;
            }
-        } else if (input_layout.data_type == data_types::f16 &&
+        } else if (output_layout.data_type == data_types::f16 &&
                convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_layout, weights_layout, prim) &&
                (output_layout.data_type == input_layout.data_type ||
                !data_type_traits::is_floating_point(input_layout.data_type)) && is_2d) {
@@ -973,7 +999,11 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
        //}
    } else {
        /* *************************** Native impls format selection part ************************** */
-        if (i8_u8_input) {
+        if (use_onednn_impls && i8_u8_input) {
+            // It is here because of post operation condition for onednn.
+            // Use fsv32 for onednn friendliness.
+            expected_format = cldnn::format::b_fs_yx_fsv32;
+        } else if (i8_u8_input) {
            if ((_optimization_attributes.b_fs_yx_fsv16_network &&
                convolution_b_fs_yx_fsv16_opt(input_layout, output_layout, weights_layout, prim))) {
                expected_format = cldnn::format::b_fs_yx_fsv16;
@@ -1155,10 +1185,11 @@ bool layout_optimizer::are_data_types_suitable_for_onednn(program_node& node) {
        auto wei_dt = is_conv ? node.as<convolution>().weights().get_output_layout().data_type :
                                node.as<deconvolution>().weights().get_output_layout().data_type;

-        if ((in_dt == data_types::f16 && wei_dt == data_types::f16) && (out_dt == data_types::f16 || out_dt == data_types::f32 || out_dt == data_types::i8))
+        if ((in_dt == data_types::f16 && wei_dt == data_types::f16) &&
+            (out_dt == data_types::f16 || out_dt == data_types::f32 || out_dt == data_types::i8 || out_dt == data_types::u8))
            return true;
        if ((in_dt == data_types::i8 || in_dt == data_types::u8) && wei_dt == data_types::i8 &&
-            (out_dt == data_types::f32 || out_dt == data_types::i32 || out_dt == data_types::i8 || out_dt == data_types::u8))
+            (out_dt == data_types::f32 || out_dt == data_types::i32 || out_dt == data_types::f16 || out_dt == data_types::i8 || out_dt == data_types::u8))
            return true;
    } else if (node.is_type<fully_connected>()) {
        auto& fc_node = node.as<fully_connected>();
@@ -1170,7 +1201,7 @@ bool layout_optimizer::are_data_types_suitable_for_onednn(program_node& node) {
        if (in_dt == data_types::f32 && wei_dt == data_types::f32)
            return true;
        if ((in_dt == data_types::i8 || in_dt == data_types::u8) && (wei_dt == data_types::i8) &&
-            (out_dt == data_types::i8 || out_dt == data_types::u8 || out_dt == data_types::i32 || out_dt == data_types::f32))
+            (out_dt == data_types::i8 || out_dt == data_types::u8 || out_dt == data_types::i32 || out_dt == data_types::f16 || out_dt == data_types::f32))
            return true;
    }

@@ -1259,6 +1290,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format

        auto input_fmt = input_layout.format;
        auto output_fmt = output_layout.format;
+        auto input_dt = input_layout.data_type;
+        auto output_dt = output_layout.data_type;

        preferred_impl = impl_types::onednn;

@@ -1273,14 +1306,22 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
        }

        // Native impl works faster for this type of reorder
-        if (input_layout.format == format::bfyx && output_layout.format == format::bfyx) {
+        if (input_fmt == format::bfyx && output_fmt == format::bfyx) {
            preferred_impl = impl_types::ocl;
        }

        // onednn reorder doesn't support different number of dimensions in input and output layouts
-        if (input_layout.format.dimension() != output_layout.format.dimension()) {
+        if (input_fmt.dimension() != output_fmt.dimension()) {
            preferred_impl = impl_types::ocl;
        }
+
+        // For mixed precision case, onednn is slower than cldnn
+        if (input_fmt == format::b_fs_yx_fsv16 && data_type_traits::is_i8_u8(input_dt))
+            preferred_impl = impl_types::ocl;
+        if (output_fmt == format::b_fs_yx_fsv16 && data_type_traits::is_i8_u8(output_dt))
+            preferred_impl = impl_types::ocl;
+        if (output_fmt == format::bfyx && output_dt == data_types::f32)
+            preferred_impl = impl_types::ocl;
    } else if (node.is_type<pooling>() || node.is_type<convolution>() || node.is_type<deconvolution>()) {
        if (!_optimization_attributes.use_onednn_impls)
            return impl_types::ocl;
@@ -1330,13 +1371,11 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
            auto& conv = node.as<convolution>();
            auto input_layout = conv.input().get_output_layout();
            auto output_layout = conv.get_output_layout();
-            bool fp16_input = input_layout.data_type == data_types::f16;
            bool has_groups = conv.get_primitive()->groups > 1;
            bool is_depthwise = conv.get_primitive()->groups == input_layout.size.feature[0];
            bool first_conv = input_layout.size.feature[0] <= 4;
-            bool enable_onednn_dw_fp16_conv = fp16_input && is_depthwise;
-            if (((has_groups && !enable_onednn_dw_fp16_conv) || first_conv) &&
-                (output_layout.format == format::b_fs_yx_fsv16 || output_layout.format == format::bs_fs_yx_bsv32_fsv16) &&
+            if (((has_groups && !is_depthwise) || first_conv) &&
+                (output_layout.format == format::b_fs_yx_fsv16) &&
                !needs_onednn_bfyx_to_blocked(format::bfyx, output_layout.format, input_layout, conv))
                impl_candidate = impl_types::ocl;
            if (conv.get_output_layout().format == format::b_fs_yx_fsv32 && first_conv)
@@ -1522,7 +1561,7 @@ format layout_optimizer::get_preferred_format(program_node& node) {
                if (node.get_users().size() == 1 && node.get_users().front()->is_type<convolution>()) {
                    auto& conv = node.get_users().front()->as<convolution>();
                    auto ws = conv.get_dependency(1).get_output_layout().size;
-                    if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1)
+                    if (ws.spatial[0] != 7 || conv.get_primitive()->groups > 1 || layout.size.feature[0] == 1)
                        expected = format::bfyx;
                    else
                        expected = format::bs_fs_yx_bsv8_fsv4;
--- a/src/plugins/intel_gpu/src/plugin/program.cpp
+++ b/src/plugins/intel_gpu/src/plugin/program.cpp
@@ -203,7 +203,7 @@ std::shared_ptr<cldnn::program> Program::BuildProgram(const std::vector<std::sha
        try {
            program = cldnn::program::build_program(*m_engine, *m_topology, options);
        } catch (std::exception& e) {
-            IE_THROW() << "cldnn program build failed!" << e.what();
+            IE_THROW() << "cldnn program build failed! " << e.what();
        }
        CleanupBuild();

--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -337,7 +337,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {

        // Conversion to FP32 might be needed for quantized models that face any fp16 related issues (e.g. overflow) for non-quantized layers
        // With this key users can work-around such issues
-        if (!config.enable_fp16_for_quantized_models || use_onednn) {
+        if (!config.enable_fp16_for_quantized_models) {
            ngraph::pass::Manager manager;
            manager.register_pass<ngraph::pass::ConvertPrecision>(precisions_array {{ ngraph::element::f16, ngraph::element::f32 }});
            manager.run_passes(func);
--- a/src/plugins/intel_gpu/tests/test_cases/resample_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/test_cases/resample_gpu_test.cpp
@@ -2168,10 +2168,13 @@ struct resample_opt_random_test : testing::TestWithParam<resample_opt_random_tes
        prim_opt.pads_begin = params.pads_begin;
        prim_opt.pads_end = params.pads_end;
        topo_opt.add(prim_opt);
-        topo_opt.add(reorder("res_to_bfyx", "resample_opt", format::bfyx, params.input_type));
+        topo_opt.add(reorder("to_output_type", "resample_opt", params.out_format, params.input_type));
+        topo_opt.add(reorder("res_to_bfyx", "to_output_type", format::bfyx, params.input_type));

        auto build_opts_opt = build_options();
-        build_opts_opt.set_option(build_option::outputs({"resample_opt", "res_to_bfyx"}));
+        build_opts_opt.set_option(build_option::outputs({"to_output_type", "res_to_bfyx"}));
+        // optimize_data is turned on to test cross-layout
+        build_opts_opt.set_option(build_option::optimize_data(true));

        network net_opt(engine, topo_opt, build_opts_opt);

@@ -2227,5 +2230,6 @@ INSTANTIATE_TEST_SUITE_P(resample_opt_smoke_linear_onnx,
                                { data_types::f16, {1, 128, 13, 13},  {1, 128, 26, 26},  1, resample_type::linear_onnx, 1, format::b_fs_yx_fsv32, format::b_fs_yx_fsv32, {}, {}},
                                { data_types::f16, {1, 128, 13, 13},  {1, 128, 26, 26},  1, resample_type::linear_onnx, 1, format::bs_fs_yx_bsv32_fsv16, format::bs_fs_yx_bsv32_fsv16, {}, {}},
                                { data_types::f16, {1, 128, 13, 13},  {1, 128, 26, 26},  1, resample_type::linear_onnx, 1, format::bs_fs_yx_bsv32_fsv32, format::bs_fs_yx_bsv32_fsv32, {}, {}},
+                                { data_types::f16, {1, 128, 13, 13},  {1, 128, 26, 26},  1, resample_type::linear_onnx, 1, format::b_fs_yx_fsv16, format::b_fs_yx_fsv32, {}, {}},
                            }
                        ));