diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
index a478cf290fd..1eb61449675 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
@@ -332,7 +332,7 @@ void remove_redundant_reorders::run(program& p) {
 
             bool same_data_type = input.get_output_layout().data_type == output_layout.data_type;
             bool allowed_dt_conversion_fuse = (input.is_type<one_hot>() || input.is_type<permute>() ||
-                                               input.is_type<depth_to_space>() || input.is_type<region_yolo>());
+                                               input.is_type<depth_to_space>() || input.is_type<region_yolo>() || input.is_type<detection_output>());
             if (!same_data_type && !allowed_dt_conversion_fuse)
                 continue;
 
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
index 5128dc6aa47..5d16e800a97 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
@@ -581,7 +581,10 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
     }
 
     const auto reorder_input_detection_output = [&p, &rf](typed_program_node<detection_output>& detection_output_node) {
-        auto detection_output_prim = detection_output_node.get_primitive();
+        // Inputs are converted to f32/bfyx only when the CPU implementation is preferred;
+        // for any other preferred implementation the inputs are left untouched.
+        if (detection_output_node.get_preferred_impl_type() != impl_types::cpu)
+            return;
 
         for (size_t i = 0; i < detection_output_node.get_dependencies().size(); i++) {
             auto& input = detection_output_node.get_dependency(i);
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index d739c409156..eea226dff0c 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -365,8 +365,8 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
     if (prev.is_dynamic() || (next && next->is_dynamic()))
         return false;
 
-    // Ref kernels are the main for depth_to_space and region_yolo. It can do anything. Should not see next.
-    if (prev.is_type<depth_to_space>() || prev.is_type<region_yolo>())
+    // Ref kernels are the main implementation for depth_to_space, region_yolo and detection_output. They can do anything, so there is no need to look at next.
+    if (prev.is_type<depth_to_space>() || prev.is_type<region_yolo>() || prev.is_type<detection_output>())
         return true;
 
     if (next == nullptr)
diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
index e2b57ade263..18f55477d8b 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
@@ -17,6 +17,7 @@ ParamsKey DetectionOutputKernelRef::GetSupportedKey() const {
     k.EnableOutputDataType(Datatype::F32);
     k.EnableInputLayout(DataLayout::bfyx);
     k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableDifferentTypes();
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableBatching();