diff --git a/samples/cpp/benchmark_app/inputs_filling.cpp b/samples/cpp/benchmark_app/inputs_filling.cpp
index 4c53dd41ece..fd244b595f2 100644
--- a/samples/cpp/benchmark_app/inputs_filling.cpp
+++ b/samples/cpp/benchmark_app/inputs_filling.cpp
@@ -225,6 +225,13 @@ ov::Tensor get_image_tensor(const std::vector<std::string>& files,
                                              inputInfo.second,
                                              inputInfo.first,
                                              filenames_used);
+    } else if (type == ov::element::f64) {
+        return create_tensor_from_image<double>(files,
+                                                inputId,
+                                                batchSize,
+                                                inputInfo.second,
+                                                inputInfo.first,
+                                                filenames_used);
     } else if (type == ov::element::i32) {
         return create_tensor_from_image<int32_t>(files,
                                                  inputId,
@@ -257,6 +264,8 @@ ov::Tensor get_im_info_tensor(const std::pair<size_t, size_t>& image_size,
     auto type = inputInfo.second.type;
     if (type == ov::element::f32) {
         return create_tensor_im_info<float>(image_size, batchSize, inputInfo.second, inputInfo.first);
+    } else if (type == ov::element::f64) {
+        return create_tensor_im_info<double>(image_size, batchSize, inputInfo.second, inputInfo.first);
     } else if (type == ov::element::f16) {
         return create_tensor_im_info<ov::float16>(image_size, batchSize, inputInfo.second, inputInfo.first);
     } else if (type == ov::element::i32) {
@@ -281,6 +290,13 @@ ov::Tensor get_binary_tensor(const std::vector<std::string>& files,
                                              inputInfo.second,
                                              inputInfo.first,
                                              filenames_used);
+    } else if (type == ov::element::f64) {
+        return create_tensor_from_binary<double>(files,
+                                                 inputId,
+                                                 batchSize,
+                                                 inputInfo.second,
+                                                 inputInfo.first,
+                                                 filenames_used);
     } else if (type == ov::element::f16) {
         return create_tensor_from_binary<ov::float16>(files,
                                                       inputId,
@@ -318,6 +334,8 @@ ov::Tensor get_random_tensor(const std::pair<std::string, benchmark_app::InputInfo>& inputInfo,
     auto type = inputInfo.second.type;
     if (type == ov::element::f32) {
         return create_tensor_random<float, float>(inputInfo.second);
+    } else if (type == ov::element::f64) {
+        return create_tensor_random<double, double>(inputInfo.second);
     } else if (type == ov::element::f16) {
         return create_tensor_random<ov::float16, float>(inputInfo.second);
     } else if (type == ov::element::i32) {
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
index 191e22abdc6..076a91c462a 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
@@ -41,7 +41,7 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
 #ifdef ENABLE_ONEDNN_FOR_GPU
     size_t onednn_impls_counter = 0;
     size_t all_impls_counter = 0;
-    const float onednn_min_threshold = 0.1f;
+    const float onednn_min_threshold = 0.09f;
     bool should_update_fmt_map = false;
     // Calculate onednn kernels number and all kernels number inside the network
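The benchmark_app hunks above add an f64 branch to every tensor-creation dispatch, so double-precision inputs no longer fall through to the unsupported-precision path. A minimal Python sketch of the same dispatch idea (`random_tensor_for` and its `dtype_map` are illustrative names, not benchmark_app's actual table):

```python
import numpy as np
import openvino.runtime as ov

def random_tensor_for(element_type, shape):
    # Mirrors create_tensor_random<double, double>: pick the host dtype
    # from the element type, then fill with uniform random values.
    dtype_map = {ov.Type.f32: np.float32, ov.Type.f64: np.float64,
                 ov.Type.f16: np.float16, ov.Type.i32: np.int32}
    data = np.random.default_rng(0).uniform(0, 255, shape)
    return ov.Tensor(data.astype(dtype_map[element_type]))

t = random_tensor_for(ov.Type.f64, [1, 3, 224, 224])
assert t.element_type == ov.Type.f64
```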
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index 7db2cea9cf5..75d4209348a 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1635,7 +1635,7 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_format) {
             return impl_types::onednn;
         }
     // TODO: uncomment this code when onednn gemm implementations will have real perf improvements vs cldnn
-    } else if (node.is_type<gemm>()/* || node.is_type<fully_connected>()*/) {
+    } else if (node.is_type<gemm>() || node.is_type<fully_connected>()) {
         if (!_optimization_attributes.use_onednn_impls)
             return impl_types::ocl;

@@ -1666,13 +1666,12 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_format) {
             }
         }

-        impl_candidate = impl_types::ocl;
         auto gemm_prim = node.as<gemm>().get_primitive();
         auto in0_l = node.get_dependency(0).get_output_layout();
         auto in1_l = node.get_dependency(1).get_output_layout();
         auto out_l = node.get_output_layout();
         auto has_input2 = gemm_prim->dependencies().size() == 3;
-        size_t in2_batched_size;
+        size_t in2_batched_size = 0;
         if (has_input2) {
             auto in2_l = node.get_dependency(2).get_output_layout();
             in2_batched_size = in2_l.count() / (in2_l.spatial(0) * in2_l.spatial(1));
@@ -1693,9 +1692,14 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_format) {
                                         !valid_extra_input_batch ||
                                         !valid_scale_factor;

-        // Gemm with k < 64 is calculated via ref kernel in onednn so cldnn way is more preferable for such cases
-        if (size_k < 64 || unsupported_onednn_gemm)
+        bool is_u8_i8 = data_type_traits::is_i8_u8(in0_l.data_type) && data_type_traits::is_i8_u8(in1_l.data_type);
+        bool use_ops_cldnn_kernel = is_u8_i8 || (in0_l.spatial(0) % 16 == 0 && in0_l.spatial(1) % 16 == 0 &&
+                                                 in1_l.spatial(0) % 16 == 0 && in1_l.spatial(1) % 16 == 0);
+
+        // Gemm with k < 64 may be faster in cldnn unless ref impl is used
+        if ((size_k < 64 && use_ops_cldnn_kernel) || unsupported_onednn_gemm) {
             impl_candidate = impl_types::ocl;
+        }
     }

     preferred_impl = impl_candidate;
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 8c70883014e..24c151320d6 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -162,6 +162,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass<ngraph::pass::InitNodeInfo>();

         precisions_array convert_precision_list {
+            {ngraph::element::f64, ngraph::element::f32},
             {ngraph::element::i64, ngraph::element::i32},
             {ngraph::element::u64, ngraph::element::i32},
             {ngraph::element::u16, ngraph::element::i32},
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/main.py b/tools/benchmark_tool/openvino/tools/benchmark/main.py
index b6df4b44f5d..8b8bf50e51e 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/main.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/main.py
@@ -408,7 +408,10 @@ def run(args):
                     input_tensor = request.get_input_tensor(port)
                     if not static_mode:
                         input_tensor.shape = data_tensor.shape
-                    input_tensor.data[:] = data_tensor.data
+                    if not len(input_tensor.shape):
+                        input_tensor.data.flat[:] = data_tensor.data
+                    else:
+                        input_tensor.data[:] = data_tensor.data

     if statistics:
         statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/utils/inputs_filling.py b/tools/benchmark_tool/openvino/tools/benchmark/utils/inputs_filling.py
index 8d0d2bcae07..d7694200dfe 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/utils/inputs_filling.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/utils/inputs_filling.py
@@ -269,7 +269,8 @@ def fill_tensors_with_random(layer):
         if shape:
             input_tensors.append(Tensor(rs.uniform(rand_min, rand_max, list(shape)).astype(dtype)))
         else:
-            input_tensors.append(Tensor(rs.uniform(rand_min, rand_max)))
+            scalar = rs.uniform(rand_min, rand_max)
+            input_tensors.append(Tensor(np.ndarray([], dtype, np.array(scalar).astype(dtype))))
     return input_tensors
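The two Python fixes above handle the same corner case: 0-d (scalar) inputs. `Tensor(rs.uniform(...))` received a plain Python float, which carries no dtype, and `data[:]` slicing is invalid on a 0-d numpy array. A short standalone sketch of both failure modes and the fixed pattern (assuming only numpy and the OpenVINO `Tensor` class):

```python
import numpy as np
from openvino.runtime import Tensor

dtype = np.float32
scalar = np.random.RandomState(0).uniform(0, 255)

# Fixed path: build a 0-d array with an explicit dtype, as in inputs_filling.py.
t = Tensor(np.ndarray([], dtype, np.array(scalar).astype(dtype)))
assert len(t.shape) == 0

# Why main.py switches to .flat for scalars:
dst = np.zeros((), dtype)
# dst[:] = scalar        # IndexError: a 0-d array cannot be sliced
dst.flat[:] = scalar     # the flat iterator still exposes the single element
```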
diff --git a/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py b/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py
index dbf528f8ac4..e4cc70ffb19 100644
--- a/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py
+++ b/tools/benchmark_tool/openvino/tools/benchmark/utils/utils.py
@@ -537,18 +537,18 @@ def get_inputs_info(shape_string, data_shape_string, layout_string, batch_size,
         elif inputs[i].node.layout != Layout():
             info.layout = inputs[i].node.layout
         else:
-            image_colors_dim = Dimension(3)
+            image_colors_dim_max = 4
             shape = info.partial_shape
             num_dims = len(shape)
             if num_dims == 4:
-                if(shape[1]) == image_colors_dim:
+                if shape[1].get_max_length() <= image_colors_dim_max and shape[3].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("NCHW")
-                elif(shape[3] == image_colors_dim):
+                elif shape[3].get_max_length() <= image_colors_dim_max and shape[1].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("NHWC")
             elif num_dims == 3:
-                if(shape[0]) == image_colors_dim:
+                if shape[0].get_max_length() <= image_colors_dim_max and shape[2].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("CHW")
-                elif(shape[2] == image_colors_dim):
+                elif shape[2].get_max_length() <= image_colors_dim_max and shape[0].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("HWC")

     # Update shape with batch if needed
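The old heuristic only recognized exactly 3 channels; the new one treats any dimension whose upper bound is at most 4 as a candidate channels dimension and requires the competing dimension to be clearly larger before committing to a layout. A standalone sketch of the 4-D rule (`guess_layout_4d` is an illustrative re-implementation, not the benchmark code itself):

```python
def guess_layout_4d(dim_upper_bounds):
    """Apply the relaxed channel heuristic to [N, ?, ?, ?] upper bounds."""
    _, d1, _, d3 = dim_upper_bounds
    if d1 <= 4 and d3 > 4:
        return "NCHW"    # small dim in position 1, large dim in position 3
    if d3 <= 4 and d1 > 4:
        return "NHWC"
    return None          # ambiguous: both (or neither) could be channels

assert guess_layout_4d([1, 3, 224, 224]) == "NCHW"
assert guess_layout_4d([1, 224, 224, 2]) == "NHWC"   # 2-channel inputs now match
assert guess_layout_4d([1, 4, 4, 4]) is None         # left undetected
```

Leaving ambiguous shapes without a guessed layout is deliberate: a wrong NCHW/NHWC guess would silently resize images along the wrong axes.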