[GPU] Use oneDNN gemm on DG2 in some cases (#12878)
* [GPU] Enable oneDNN gemm with some additional restrictions
* f64 support in C++ benchmark_app
* Fixed Python benchmark_app
This commit is contained in: parent d8b6f5485b, commit 734dcc93e9
@@ -225,6 +225,13 @@ ov::Tensor get_image_tensor(const std::vector<std::string>& files,
                                                 inputInfo.second,
                                                 inputInfo.first,
                                                 filenames_used);
+    } else if (type == ov::element::f64) {
+        return create_tensor_from_image<double>(files,
+                                                inputId,
+                                                batchSize,
+                                                inputInfo.second,
+                                                inputInfo.first,
+                                                filenames_used);
     } else if (type == ov::element::i32) {
         return create_tensor_from_image<int32_t>(files,
                                                  inputId,
@@ -257,6 +264,8 @@ ov::Tensor get_im_info_tensor(const std::pair<size_t, size_t>& image_size,
     auto type = inputInfo.second.type;
     if (type == ov::element::f32) {
         return create_tensor_im_info<float>(image_size, batchSize, inputInfo.second, inputInfo.first);
+    } else if (type == ov::element::f64) {
+        return create_tensor_im_info<double>(image_size, batchSize, inputInfo.second, inputInfo.first);
     } else if (type == ov::element::f16) {
         return create_tensor_im_info<short>(image_size, batchSize, inputInfo.second, inputInfo.first);
     } else if (type == ov::element::i32) {
@@ -281,6 +290,13 @@ ov::Tensor get_binary_tensor(const std::vector<std::string>& files,
                                                  inputInfo.second,
                                                  inputInfo.first,
                                                  filenames_used);
+    } else if (type == ov::element::f64) {
+        return create_tensor_from_binary<double>(files,
+                                                 inputId,
+                                                 batchSize,
+                                                 inputInfo.second,
+                                                 inputInfo.first,
+                                                 filenames_used);
     } else if (type == ov::element::f16) {
         return create_tensor_from_binary<short>(files,
                                                 inputId,
@@ -318,6 +334,8 @@ ov::Tensor get_random_tensor(const std::pair<std::string, benchmark_app::InputIn
     auto type = inputInfo.second.type;
     if (type == ov::element::f32) {
         return create_tensor_random<float, float>(inputInfo.second);
+    } else if (type == ov::element::f64) {
+        return create_tensor_random<double, double>(inputInfo.second);
     } else if (type == ov::element::f16) {
         return create_tensor_random<short, short>(inputInfo.second);
     } else if (type == ov::element::i32) {
@@ -41,7 +41,7 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_o
 #ifdef ENABLE_ONEDNN_FOR_GPU
     size_t onednn_impls_counter = 0;
     size_t all_impls_counter = 0;
-    const float onednn_min_threshold = 0.1f;
+    const float onednn_min_threshold = 0.09f;
     bool should_update_fmt_map = false;

     // Calculate onednn kernels number and all kernels number inside the network
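For context, the check that consumes these counters lives outside this hunk; the sketch below only illustrates the kind of ratio gate the lowered threshold suggests, with the exact condition and the meaning of should_update_fmt_map assumed rather than taken from the source:

    // Illustrative sketch only: the real check is outside this hunk.
    // Assumed idea: if too small a share of the network's impls ended up on oneDNN,
    // revert to the cldnn-friendly format map (threshold lowered from 10% to 9%).
    float onednn_ratio = all_impls_counter == 0
                             ? 0.0f
                             : static_cast<float>(onednn_impls_counter) / all_impls_counter;
    if (onednn_ratio < onednn_min_threshold)
        should_update_fmt_map = true;  // assumed meaning of this flag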
@@ -1635,7 +1635,7 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
             return impl_types::onednn;
         }
     // TODO: uncomment this code when onednn gemm implementations will have real perf improvements vs cldnn
-    } else if (node.is_type<fully_connected>()/* || node.is_type<gemm>()*/) {
+    } else if (node.is_type<fully_connected>() || node.is_type<gemm>()) {
         if (!_optimization_attributes.use_onednn_impls)
             return impl_types::ocl;

@@ -1666,13 +1666,12 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
             }
         }

-        impl_candidate = impl_types::ocl;
         auto gemm_prim = node.as<gemm>().get_primitive();
         auto in0_l = node.get_dependency(0).get_output_layout();
         auto in1_l = node.get_dependency(1).get_output_layout();
         auto out_l = node.get_output_layout();
         auto has_input2 = gemm_prim->dependencies().size() == 3;
-        size_t in2_batched_size;
+        size_t in2_batched_size = 0;
         if (has_input2) {
             auto in2_l = node.get_dependency(2).get_output_layout();
             in2_batched_size = in2_l.count() / (in2_l.spatial(0) * in2_l.spatial(1));
@@ -1693,9 +1692,14 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
                                          !valid_extra_input_batch ||
                                          !valid_scale_factor;

-        // Gemm with k < 64 is calculated via ref kernel in onednn so cldnn way is more preferable for such cases
-        if (size_k < 64 || unsupported_onednn_gemm)
+        bool is_u8_i8 = data_type_traits::is_i8_u8(in0_l.data_type) && data_type_traits::is_i8_u8(in1_l.data_type);
+        bool use_ops_cldnn_kernel = is_u8_i8 || (in0_l.spatial(0) % 16 == 0 && in0_l.spatial(1) % 16 == 0 &&
+                                                 in1_l.spatial(0) % 16 == 0 && in1_l.spatial(1) % 16 == 0);
+
+        // Gemm with k < 64 may be faster in cldnn unless ref impl is used
+        if ((size_k < 64 && use_ops_cldnn_kernel) || unsupported_onednn_gemm) {
             impl_candidate = impl_types::ocl;
+        }
     }

     preferred_impl = impl_candidate;
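Condensed and annotated, the new gemm fallback condition from the hunk above reads as follows; size_k and unsupported_onednn_gemm are computed earlier in the same function, and the comments are an interpretation of the variable names rather than text from the source:

    // Int8 gemms, or gemms whose input spatials are 16-aligned, can presumably use
    // cldnn's optimized ("ops") kernel instead of its reference kernel.
    bool is_u8_i8 = data_type_traits::is_i8_u8(in0_l.data_type) &&
                    data_type_traits::is_i8_u8(in1_l.data_type);
    bool use_ops_cldnn_kernel = is_u8_i8 || (in0_l.spatial(0) % 16 == 0 && in0_l.spatial(1) % 16 == 0 &&
                                             in1_l.spatial(0) % 16 == 0 && in1_l.spatial(1) % 16 == 0);
    // Small-K gemms (k < 64) stay on cldnn only when that optimized kernel applies;
    // otherwise oneDNN is kept unless the gemm shape is unsupported by oneDNN at all.
    if ((size_k < 64 && use_ops_cldnn_kernel) || unsupported_onednn_gemm)
        impl_candidate = impl_types::ocl;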
@@ -162,6 +162,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass<ngraph::pass::ConvertGather0D>();

         precisions_array convert_precision_list {
+                {ngraph::element::f64, ngraph::element::f32},
                 {ngraph::element::i64, ngraph::element::i32},
                 {ngraph::element::u64, ngraph::element::i32},
                 {ngraph::element::u16, ngraph::element::i32},
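The new {f64, f32} pair is presumably what lets f64 models be compiled for GPU by down-converting them to f32. The registration that consumes convert_precision_list is outside this hunk, so the line below is only an assumed sketch of that downstream use:

    // Assumption: the list above is handed to a precision-conversion pass such as
    // ConvertPrecision, which rewrites f64 parameters, constants and ops to f32.
    manager.register_pass<ngraph::pass::ConvertPrecision>(convert_precision_list);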
@@ -408,7 +408,10 @@ def run(args):
                 input_tensor = request.get_input_tensor(port)
                 if not static_mode:
                     input_tensor.shape = data_tensor.shape
-                input_tensor.data[:] = data_tensor.data
+                if not len(input_tensor.shape):
+                    input_tensor.data.flat[:] = data_tensor.data
+                else:
+                    input_tensor.data[:] = data_tensor.data

     if statistics:
         statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
@@ -269,7 +269,8 @@ def fill_tensors_with_random(layer):
         if shape:
             input_tensors.append(Tensor(rs.uniform(rand_min, rand_max, list(shape)).astype(dtype)))
         else:
-            input_tensors.append(Tensor(rs.uniform(rand_min, rand_max)))
+            scalar = rs.uniform(rand_min, rand_max)
+            input_tensors.append(Tensor(np.ndarray([], dtype, np.array(scalar).astype(dtype))))
     return input_tensors


@@ -537,18 +537,18 @@ def get_inputs_info(shape_string, data_shape_string, layout_string, batch_size,
         elif inputs[i].node.layout != Layout():
             info.layout = inputs[i].node.layout
         else:
-            image_colors_dim = Dimension(3)
+            image_colors_dim_max = 4
             shape = info.partial_shape
             num_dims = len(shape)
             if num_dims == 4:
-                if(shape[1]) == image_colors_dim:
+                if shape[1].get_max_length() <= image_colors_dim_max and shape[3].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("NCHW")
-                elif(shape[3] == image_colors_dim):
+                elif shape[3].get_max_length() <= image_colors_dim_max and shape[1].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("NHWC")
             elif num_dims == 3:
-                if(shape[0]) == image_colors_dim:
+                if shape[0].get_max_length() <= image_colors_dim_max and shape[2].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("CHW")
-                elif(shape[2] == image_colors_dim):
+                elif shape[2].get_max_length() <= image_colors_dim_max and shape[0].get_max_length() > image_colors_dim_max:
                     info.layout = Layout("HWC")

         # Update shape with batch if needed