diff --git a/samples/cpp/benchmark_app/inputs_filling.cpp b/samples/cpp/benchmark_app/inputs_filling.cpp
index 4c53dd41ece..fd244b595f2 100644
--- a/samples/cpp/benchmark_app/inputs_filling.cpp
+++ b/samples/cpp/benchmark_app/inputs_filling.cpp
@@ -225,6 +225,13 @@ ov::Tensor get_image_tensor(const std::vector<std::string>& files,
                                              inputInfo.second,
                                              inputInfo.first,
                                              filenames_used);
+    } else if (type == ov::element::f64) {
+        return create_tensor_from_image<double>(files,
+                                                inputId,
+                                                batchSize,
+                                                inputInfo.second,
+                                                inputInfo.first,
+                                                filenames_used);
     } else if (type == ov::element::i32) {
         return create_tensor_from_image<int32_t>(files,
                                                  inputId,
@@ -257,6 +264,8 @@ ov::Tensor get_im_info_tensor(const std::pair<size_t, size_t>& image_size,
     auto type = inputInfo.second.type;
     if (type == ov::element::f32) {
         return create_tensor_im_info<float>(image_size, batchSize, inputInfo.second, inputInfo.first);
+    } else if (type == ov::element::f64) {
+        return create_tensor_im_info<double>(image_size, batchSize, inputInfo.second, inputInfo.first);
     } else if (type == ov::element::f16) {
         return create_tensor_im_info<ov::float16>(image_size, batchSize, inputInfo.second, inputInfo.first);
     } else if (type == ov::element::i32) {
@@ -281,6 +290,13 @@ ov::Tensor get_binary_tensor(const std::vector<std::string>& files,
                                              inputInfo.second,
                                              inputInfo.first,
                                              filenames_used);
+    } else if (type == ov::element::f64) {
+        return create_tensor_from_binary<double>(files,
+                                                 inputId,
+                                                 batchSize,
+                                                 inputInfo.second,
+                                                 inputInfo.first,
+                                                 filenames_used);
     } else if (type == ov::element::f16) {
         return create_tensor_from_binary<ov::float16>(files,
                                                       inputId,
@@ -318,6 +334,8 @@ ov::Tensor get_random_tensor(const std::pair<std::string, benchmark_app::InputInfo>& inputInfo,
     auto type = inputInfo.second.type;
     if (type == ov::element::f32) {
         return create_tensor_random<float, float>(inputInfo.second);
+    } else if (type == ov::element::f64) {
+        return create_tensor_random<double, double>(inputInfo.second);
     } else if (type == ov::element::f16) {
         return create_tensor_random<ov::float16, float>(inputInfo.second);
     } else if (type == ov::element::i32) {
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
index 191e22abdc6..076a91c462a 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
@@ -41,7 +41,7 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
 #ifdef ENABLE_ONEDNN_FOR_GPU
     size_t onednn_impls_counter = 0;
     size_t all_impls_counter = 0;
-    const float onednn_min_threshold = 0.1f;
+    const float onednn_min_threshold = 0.09f;
     bool should_update_fmt_map = false;
     // Calculate onednn kernels number and all kernels number inside the network
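The benchmark_app hunks above add an f64 branch to every tensor-creation dispatch, so double-precision inputs no longer fall through to the unsupported-precision path. A minimal Python sketch of the same dispatch idea (`random_tensor_for` and its `dtype_map` are illustrative names, not benchmark_app's actual table):

```python
import numpy as np
import openvino.runtime as ov

def random_tensor_for(element_type, shape):
    # Mirrors create_tensor_random<double, double>: pick the host dtype
    # from the element type, then fill with uniform random values.
    dtype_map = {ov.Type.f32: np.float32, ov.Type.f64: np.float64,
                 ov.Type.f16: np.float16, ov.Type.i32: np.int32}
    data = np.random.default_rng(0).uniform(0, 255, shape)
    return ov.Tensor(data.astype(dtype_map[element_type]))

t = random_tensor_for(ov.Type.f64, [1, 3, 224, 224])
assert t.element_type == ov.Type.f64
```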
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index 7db2cea9cf5..75d4209348a 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1635,7 +1635,7 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_format) {
             return impl_types::onednn;
         }
     // TODO: uncomment this code when onednn gemm implementations will have real perf improvements vs cldnn
-    } else if (node.is_type<gemm>()/* || node.is_type<fully_connected>()*/) {
+    } else if (node.is_type<gemm>() || node.is_type<fully_connected>()) {
         if (!_optimization_attributes.use_onednn_impls)
             return impl_types::ocl;

@@ -1666,13 +1666,12 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_format) {
             }
         }

-        impl_candidate = impl_types::ocl;
         auto gemm_prim = node.as<gemm>().get_primitive();
         auto in0_l = node.get_dependency(0).get_output_layout();
         auto in1_l = node.get_dependency(1).get_output_layout();
         auto out_l = node.get_output_layout();
         auto has_input2 = gemm_prim->dependencies().size() == 3;
-        size_t in2_batched_size;
+        size_t in2_batched_size = 0;
         if (has_input2) {
             auto in2_l = node.get_dependency(2).get_output_layout();
             in2_batched_size = in2_l.count() / (in2_l.spatial(0) * in2_l.spatial(1));
@@ -1693,9 +1692,14 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_format) {
                                         !valid_extra_input_batch ||
                                         !valid_scale_factor;

-        // Gemm with k < 64 is calculated via ref kernel in onednn so cldnn way is more preferable for such cases
-        if (size_k < 64 || unsupported_onednn_gemm)
+        bool is_u8_i8 = data_type_traits::is_i8_u8(in0_l.data_type) && data_type_traits::is_i8_u8(in1_l.data_type);
+        bool use_ops_cldnn_kernel = is_u8_i8 || (in0_l.spatial(0) % 16 == 0 && in0_l.spatial(1) % 16 == 0 &&
+                                                 in1_l.spatial(0) % 16 == 0 && in1_l.spatial(1) % 16 == 0);
+
+        // Gemm with k < 64 may be faster in cldnn unless ref impl is used
+        if ((size_k < 64 && use_ops_cldnn_kernel) || unsupported_onednn_gemm) {
             impl_candidate = impl_types::ocl;
+        }
     }

     preferred_impl = impl_candidate;
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 8c70883014e..24c151320d6 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -162,6 +162,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass<ngraph::pass::InitNodeInfo>();

         precisions_array convert_precision_list {
+            {ngraph::element::f64, ngraph::element::f32},
             {ngraph::element::i64, ngraph::element::i32},
             {ngraph::element::u64, ngraph::element::i32},
             {ngraph::element::u16, ngraph::element::i32},
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/main.py b/tools/benchmark_tool/openvino/tools/benchmark/main.py
index b6df4b44f5d..8b8bf50e51e 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/main.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/main.py
@@ -408,7 +408,10 @@ def run(args):
                     input_tensor = request.get_input_tensor(port)
                     if not static_mode:
                         input_tensor.shape = data_tensor.shape
-                    input_tensor.data[:] = data_tensor.data
+                    if not len(input_tensor.shape):
+                        input_tensor.data.flat[:] = data_tensor.data
+                    else:
+                        input_tensor.data[:] = data_tensor.data

     if statistics:
         statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/utils/inputs_filling.py b/tools/benchmark_tool/openvino/tools/benchmark/utils/inputs_filling.py
index 8d0d2bcae07..d7694200dfe 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/utils/inputs_filling.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/utils/inputs_filling.py
@@ -269,7 +269,8 @@ def fill_tensors_with_random(layer):
         if shape:
             input_tensors.append(Tensor(rs.uniform(rand_min, rand_max, list(shape)).astype(dtype)))
         else:
-            input_tensors.append(Tensor(rs.uniform(rand_min, rand_max)))
+            scalar = rs.uniform(rand_min, rand_max)
+            input_tensors.append(Tensor(np.ndarray([], dtype, np.array(scalar).astype(dtype))))
     return input_tensors
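The two Python fixes above handle the same corner case: 0-d (scalar) inputs. `Tensor(rs.uniform(...))` received a plain Python float, which carries no dtype, and `data[:]` slicing is invalid on a 0-d numpy array. A short standalone sketch of both failure modes and the fixed pattern (assuming only numpy and the OpenVINO `Tensor` class):

```python
import numpy as np
from openvino.runtime import Tensor

dtype = np.float32
scalar = np.random.RandomState(0).uniform(0, 255)

# Fixed path: build a 0-d array with an explicit dtype, as in inputs_filling.py.
t = Tensor(np.ndarray([], dtype, np.array(scalar).astype(dtype)))
assert len(t.shape) == 0

# Why main.py switches to .flat for scalars:
dst = np.zeros((), dtype)
# dst[:] = scalar        # IndexError: a 0-d array cannot be sliced
dst.flat[:] = scalar     # the flat iterator still exposes the single element
```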
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py b/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py
index dbf528f8ac4..e4cc70ffb19 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py
@@ -537,18 +537,18 @@ def get_inputs_info(shape_string, data_shape_string, layout_string, batch_size,
         elif inputs[i].node.layout != Layout():
             info.layout = inputs[i].node.layout
         else:
-            image_colors_dim = Dimension(3)
+            image_colors_dim_max = 4
             shape = info.partial_shape
             num_dims = len(shape)
             if num_dims == 4:
-                if(shape[1]) == image_colors_dim:
+                if shape[1].get_max_length() <= image_colors_dim_max and shape[3].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("NCHW")
-                elif(shape[3] == image_colors_dim):
+                elif shape[3].get_max_length() <= image_colors_dim_max and shape[1].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("NHWC")
             elif num_dims == 3:
-                if(shape[0]) == image_colors_dim:
+                if shape[0].get_max_length() <= image_colors_dim_max and shape[2].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("CHW")
-                elif(shape[2] == image_colors_dim):
+                elif shape[2].get_max_length() <= image_colors_dim_max and shape[0].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("HWC")

     # Update shape with batch if needed
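The old heuristic only recognized exactly 3 channels; the new one treats any dimension whose upper bound is at most 4 as a candidate channels dimension and requires the competing dimension to be clearly larger before committing to a layout. A standalone sketch of the 4-D rule (`guess_layout_4d` is an illustrative re-implementation, not the benchmark code itself):

```python
def guess_layout_4d(dim_upper_bounds):
    """Apply the relaxed channel heuristic to [N, ?, ?, ?] upper bounds."""
    _, d1, _, d3 = dim_upper_bounds
    if d1 <= 4 and d3 > 4:
        return "NCHW"    # small dim in position 1, large dim in position 3
    if d3 <= 4 and d1 > 4:
        return "NHWC"
    return None          # ambiguous: both (or neither) could be channels

assert guess_layout_4d([1, 3, 224, 224]) == "NCHW"
assert guess_layout_4d([1, 224, 224, 2]) == "NHWC"   # 2-channel inputs now match
assert guess_layout_4d([1, 4, 4, 4]) is None         # left undetected
```

Leaving ambiguous shapes without a guessed layout is deliberate: a wrong NCHW/NHWC guess would silently resize images along the wrong axes.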