diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index a03cb307f9a..d6365b69138 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -125,6 +125,7 @@ public: int disable_runtime_buffer_fusing; // Disable runtime buffer fusing int disable_memory_reuse; // Disable memmory reuse among layers int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes + int disable_runtime_skip_reorder; // Disable runtime skip reorder std::set<int64_t> dump_iteration; // Dump n-th execution of network. std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames static const debug_configuration *get_instance(); diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index 2d9e66b7ea6..70c3d1daa08 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -210,6 +210,7 @@ public: void set_shape_change() { _shape_changed = true; } void build_deps(); + void do_runtime_skip_reorder(); void do_runtime_in_place_concat(); void configure_shape_of_dependencies(); diff --git a/src/plugins/intel_gpu/src/graph/include/reorder_inst.h b/src/plugins/intel_gpu/src/graph/include/reorder_inst.h index f04cb7e7f5b..b1637032ffb 100644 --- a/src/plugins/intel_gpu/src/graph/include/reorder_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/reorder_inst.h @@ -86,7 +86,13 @@ public: bool has_mean() const { return !get_typed_desc<reorder>()->mean.empty(); } void update_output_memory() override; - bool requires_reinterpret() const { return _req_reinterpr; } + bool requires_reinterpret() const { + auto req_reinterpr = _req_reinterpr; + if (input_memory().get_layout() != 
_impl_params->get_output_layout()) { + req_reinterpr = true; + } + return req_reinterpr; + } void save(cldnn::BinaryOutputBuffer& ob) const override; void load(cldnn::BinaryInputBuffer& ib) override; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index e3e62beca9f..2d440831fdc 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -15,6 +15,7 @@ #include "permute_inst.h" #include "resample_inst.h" #include "reshape_inst.h" +#include "reorder_inst.h" #include "eltwise_inst.h" #include "deconvolution_inst.h" #include "shape_of_inst.h" @@ -644,13 +645,54 @@ bool primitive_inst::update_impl() { return true; } +void primitive_inst::do_runtime_skip_reorder() { + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->disable_runtime_skip_reorder) { + return; + } + if (can_be_optimized()) + return; + + if (_impl_params->fused_desc.size() > 0) + return; + + // set successive reorder can_be_optimized if layouts are same + for (auto u : get_user_insts()) { + if (u->get_node().is_type<reorder>()) { + if (is_input() && u->is_output()) + continue; + // TODO: Skipped reorder + in_place concat is not supported yet. To support later. 
+ if (u->get_users().size() == 1 && u->get_users().front()->is_type<concatenation>() && u->get_users().front()->can_be_optimized()) + continue; + auto out_port_idx = u->get_node().get_dependency_with_port(0).second; + // If current node's output_node is not dynamic, the memory is already allocated at build time + auto alloc_type = allocation_type::unknown; + if (!get_node().is_dynamic_output_layout(out_port_idx) && static_cast<int32_t>(_outputs.size()) > out_port_idx) { + alloc_type = _outputs[out_port_idx]->get_allocation_type(); + } + if (alloc_type == allocation_type::usm_device && u->is_output()) + continue; + GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] update shape for user " << u->id() << std::endl; + u->update_shape(); + u->update_shape_done_by_other = true; + if (u->_impl_params->get_input_layout() == u->_impl_params->get_output_layout()) { + u->set_can_be_optimized(true); + GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] set user " << u->id() << " as can_be_optimized" << std::endl; + } else { + GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] user " << u->id() << " cannot be optimized" << std::endl; + } + } + } +} + void primitive_inst::do_runtime_in_place_concat() { GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) { return; } - if (update_shape_done_by_other) + if (update_shape_done_by_other) { return; + } if (get_users().size() != 1) return; auto concat_inst = _network.get_primitive(get_users().front()->id()); @@ -720,6 +762,11 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) { do_runtime_in_place_concat(); OPENVINO_ASSERT(_node != nullptr, "[GPU] Invalid primitive_inst object for dynamic shapes case: program_node can't be null"); update_shape(); + + // Check successor reorder if layouts are same + // Need to set can_be_optimized for user reorder at predecessor because + // if the user is can_be_optimized and output node then current nodes' output should be allocated to host. 
+ do_runtime_skip_reorder(); if (_impl_params->output_layouts[0].count() == 0) { GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping becuase output data is empty " << std::endl; auto ev = get_network().get_stream().create_user_event(true); @@ -792,6 +839,8 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) { if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) { dependencies.reserve(dependencies.size() + _exec_deps.size()); for (auto& input : _exec_deps) { + if (input->is_input() && queue_type != QueueTypes::out_of_order) + continue; auto id = input->id(); try { // if the requested event does not exists it means that it has not been executed, so the processing_order is diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index ca678a34382..b7572722895 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -189,8 +189,9 @@ debug_configuration::debug_configuration() , disable_async_compilation(0) , disable_dynamic_impl(0) , disable_runtime_buffer_fusing(0) , disable_memory_reuse(0) - , disable_build_time_weight_reorder_for_dynamic_nodes(0) { + , disable_build_time_weight_reorder_for_dynamic_nodes(0) + , disable_runtime_skip_reorder(0) { #ifdef GPU_DEBUG_CONFIG get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); @@ -226,6 +226,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing); get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse); get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes); + get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder); std::string dump_iteration_str; 
get_gpu_debug_env_var("DumpIteration", dump_iteration_str); std::string mem_preallocation_params_str; diff --git a/src/plugins/intel_gpu/tests/unit/dynamic_execution/skip_redundant_reorder_at_runtime.cpp b/src/plugins/intel_gpu/tests/unit/dynamic_execution/skip_redundant_reorder_at_runtime.cpp new file mode 100644 index 00000000000..5ee76f2c2b2 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/dynamic_execution/skip_redundant_reorder_at_runtime.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include <intel_gpu/primitives/input_layout.hpp> +#include <intel_gpu/primitives/fully_connected.hpp> +#include <intel_gpu/primitives/data.hpp> +#include <intel_gpu/primitives/reorder.hpp> + +#include "program_wrapper.h" + +#include <cmath> +#include <numeric> + +using namespace cldnn; +using namespace ::tests; + +namespace skip_reorder_tests { +TEST(remove_redundant_reorder, skip_reorder_at_runtime) { + auto& engine = get_test_engine(); + + auto weight_mem = engine.allocate_memory({{2, 32}, data_types::f32, format::bfyx}); + std::vector<float> weight_data(weight_mem->get_layout().count()); + std::iota(weight_data.begin(), weight_data.end(), 1.0f); + set_values(weight_mem, weight_data); + + auto input_l = layout{ov::PartialShape::dynamic(2), data_types::f32, format::bfyx}; + topology topology(input_layout("input", input_l), + data("weight", weight_mem), + fully_connected("fc", input_info("input"), {"weight"}, "", data_types::f32), + reorder("reorder", input_info("fc"), format::bfyx, data_types::f32)); /*output padding*/ + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + network network(engine, topology, config); + auto reorder_inst = network.get_primitive("reorder"); + ASSERT_EQ(reorder_inst->can_be_optimized(), false); + + auto input_mem = engine.allocate_memory({{10, 32}, data_types::f32, format::bfyx}); + std::vector<float> input_data(input_mem->get_layout().count()); + std::iota(input_data.begin(), input_data.end(), 0.5f); + set_values(input_mem, input_data); + + 
+ network.set_input_data("input", input_mem); + network.execute(); + ASSERT_EQ(reorder_inst->can_be_optimized(), true); + ASSERT_EQ(network.get_output_memory("reorder")->buffer_ptr(), network.get_primitive("fc")->output_memory_ptr()->buffer_ptr()); +} +} // namespace skip_reorder_tests