[GPU] Support cross-data type from detection output (#13125)
* The detection_output kernel supports cross-type (fp16/fp32), so graph optimization now also supports cross-type detection_output. Signed-off-by: hyunback <hyunback.kim@intel.com>
This commit is contained in:
parent
f2c0e0b4d7
commit
079021c673
@ -332,7 +332,7 @@ void remove_redundant_reorders::run(program& p) {
|
||||
|
||||
bool same_data_type = input.get_output_layout().data_type == output_layout.data_type;
|
||||
bool allowed_dt_conversion_fuse = (input.is_type<one_hot>() || input.is_type<permute>() ||
|
||||
input.is_type<depth_to_space>() || input.is_type<region_yolo>());
|
||||
input.is_type<depth_to_space>() || input.is_type<region_yolo>() || input.is_type<detection_output>());
|
||||
if (!same_data_type && !allowed_dt_conversion_fuse)
|
||||
continue;
|
||||
|
||||
|
@ -581,16 +581,18 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
|
||||
}
|
||||
|
||||
const auto reorder_input_detection_output = [&p, &rf](typed_program_node<detection_output>& detection_output_node) {
|
||||
auto detection_output_prim = detection_output_node.get_primitive();
|
||||
if (detection_output_node.get_preferred_impl_type() == impl_types::cpu) {
|
||||
auto detection_output_prim = detection_output_node.get_primitive();
|
||||
|
||||
for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++) {
|
||||
auto& input = detection_output_node.get_dependency(i);
|
||||
auto new_input = rf.get_reorder(input.id(),
|
||||
input.get_output_layout(),
|
||||
layout{ data_types::f32, format::bfyx, input.get_output_layout().get_tensor() });
|
||||
for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++) {
|
||||
auto& input = detection_output_node.get_dependency(i);
|
||||
auto new_input = rf.get_reorder(input.id(),
|
||||
input.get_output_layout(),
|
||||
layout{ data_types::f32, format::bfyx, input.get_output_layout().get_tensor() });
|
||||
|
||||
if (new_input.first) {
|
||||
p.add_intermediate(new_input.first, detection_output_node, i, !new_input.second);
|
||||
if (new_input.first) {
|
||||
p.add_intermediate(new_input.first, detection_output_node, i, !new_input.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -365,8 +365,8 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
|
||||
if (prev.is_dynamic() || (next && next->is_dynamic()))
|
||||
return false;
|
||||
|
||||
// Ref kernels are the main for depth_to_space and region_yolo. It can do anything. Should not see next.
|
||||
if (prev.is_type<depth_to_space>() || prev.is_type<region_yolo>())
|
||||
// Ref kernels are the main for depth_to_space, region_yolo and detection_output. It can do anything. Should not see next.
|
||||
if (prev.is_type<depth_to_space>() || prev.is_type<region_yolo>() || prev.is_type<detection_output>())
|
||||
return true;
|
||||
|
||||
if (next == nullptr)
|
||||
|
@ -17,6 +17,7 @@ ParamsKey DetectionOutputKernelRef::GetSupportedKey() const {
|
||||
k.EnableOutputDataType(Datatype::F32);
|
||||
k.EnableInputLayout(DataLayout::bfyx);
|
||||
k.EnableOutputLayout(DataLayout::bfyx);
|
||||
k.EnableDifferentTypes();
|
||||
k.EnableTensorOffset();
|
||||
k.EnableTensorPitches();
|
||||
k.EnableBatching();
|
||||
|
Loading…
Reference in New Issue
Block a user