[GPU] Prevent memory reset at runtime allocation for dynamic shape, fix wrong padding handling (#16351)

* Prevent memory reset at runtime allocation for dynamic shape

* Set default alloc to reset mem

* Additional fixes:
- If any convolution/deconvolution user requires padded input, enqueue a buffer reset when the buffer is reused.
- Removed clFinish from gpu_buffer::fill; the fill should be waited on only when needed, and otherwise synchronization is done via the returned event.
- Removed the buffer reset from on_execute of count_nonzero, which is no longer needed.

* Remove unused API

* Fix tensor offset to account for the padding

* Added unit test

* Applied review comments
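The failure mode motivating the reset-on-reuse logic is easy to reproduce in isolation. A minimal standalone sketch (hypothetical sizes, not code from this patch): when a pooled buffer is reused for a smaller padded tensor, only the data region is rewritten, so the pad border keeps stale values unless it is explicitly cleared.

#include <cassert>
#include <vector>

int main() {
    // First inference: a larger tensor filled the whole allocation with 5s.
    std::vector<float> reused(36, 5.0f);

    // Second inference reuses the allocation as a 4x4 buffer
    // (2x2 data + 1-element pad border); only the 2x2 data is written.
    const int pitch = 4;
    for (int y = 1; y <= 2; ++y)
        for (int x = 1; x <= 2; ++x)
            reused[y * pitch + x] = 11.0f;

    // The border still holds 5s from the previous run; a convolution reading
    // its input's padded border would sum that garbage. Hence the fix:
    // enqueue a fill(0) on reuse whenever a user needs padded input.
    assert(reused[0] == 5.0f);   // stale value in the pad area
    assert(reused[5] == 11.0f);  // data value at (y=1, x=1)
    return 0;
}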
Taylor Yeonbok Lee 2023-03-24 13:10:33 -07:00 committed by GitHub
parent 1ef94ec069
commit 6a25143045
15 changed files with 231 additions and 58 deletions


@@ -214,13 +214,6 @@ public:
bool is_primary_stream() const { return _is_primary_stream; }
bool is_dynamic() const { return _is_dynamic; }
/// Create memory object with specified @p layout and allocation @p type for primitive with @p id
/// Underlying memory handle can be reused with other primitives from memory pool based on @p dependencies
memory_ptr get_memory_from_pool(const layout& layout,
primitive_id id,
std::set<primitive_id> dependencies,
allocation_type type,
bool reusable = true);
memory_pool& get_memory_pool() {
return *_memory_pool;
}


@@ -106,13 +106,15 @@ public:
uint32_t network_id,
const std::set<primitive_id>& restrictions,
allocation_type type,
bool reusable = true); // get from pool or create memory allocation
bool reusable = true,
bool reset = true); // get from pool or create memory allocation
memory_ptr get_memory(const layout& layout, allocation_type type, bool reset = true);
memory_ptr get_from_non_padded_pool(const layout& layout,
const primitive_id& id,
uint32_t network_id,
const std::set<primitive_id>&,
allocation_type type);
allocation_type type,
bool reset = true);
memory_ptr get_from_padded_pool(const layout& layout,
const primitive_id& id,
uint32_t network_id,


@@ -42,6 +42,23 @@ public:
static std::string to_string(binary_convolution_node const& node);
typed_primitive_inst(network& network, binary_convolution_node const& node);
bool need_reset_input_memory() const override {
auto input_layout = _deps[0].first->_impl_params->get_output_layout(0);
if (input_layout.data_padding) {
return true;
}
return false;
}
bool need_reset_output_memory() const override {
bool res = parent::need_reset_output_memory();
auto output_layout = _impl_params->get_output_layout(0);
if (output_layout.data_padding) {
return true;
}
return res;
}
memory::ptr weights_memory() const { return dep_memory_ptr(1); }
};


@@ -127,6 +127,23 @@ public:
static layout calc_output_layout(convolution_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(convolution_node const& node);
bool need_reset_input_memory() const override {
auto input_layout = _deps[0].first->_impl_params->get_output_layout(0);
if (input_layout.data_padding) {
return true;
}
return false;
}
bool need_reset_output_memory() const override {
bool res = parent::need_reset_output_memory();
auto output_layout = _impl_params->get_output_layout(0);
if (output_layout.data_padding) {
return true;
}
return res;
}
public:
typed_primitive_inst(network& network, convolution_node const& node);


@@ -60,6 +60,23 @@ public:
static layout calc_output_layout(deconvolution_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(deconvolution_node const& node);
bool need_reset_input_memory() const override {
auto input_layout = _deps[0].first->_impl_params->get_output_layout(0);
if (input_layout.data_padding) {
return true;
}
return false;
}
bool need_reset_output_memory() const override {
bool res = parent::need_reset_output_memory();
auto output_layout = _impl_params->get_output_layout(0);
if (output_layout.data_padding) {
return true;
}
return res;
}
typed_primitive_inst(network& network, deconvolution_node const& node);
memory::ptr weights_memory() const {


@@ -40,9 +40,6 @@ public:
static std::string to_string(count_nonzero_node const& node);
typed_primitive_inst(network& network, count_nonzero_node const& node);
private:
void on_execute() override;
};
using count_nonzero_inst = typed_primitive_inst<count_nonzero>;


@@ -192,7 +192,7 @@ public:
void allocate_internal_buffers();
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0);
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0, bool reset_mem = true);
std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }
@@ -284,7 +284,7 @@ protected:
size_t max_output_layout_size = 0;
std::vector<size_t> max_intermediates_memory_sizes;
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr);
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true);
memory::ptr allocate_internal_buffer(size_t idx);
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);
@@ -298,7 +298,7 @@ protected:
virtual event::ptr update_weights();
// if primitive_inst doesn't replace impl with a new impl (static impl with opt kernel or dynamic impl), return false
bool update_impl();
void realloc_if_needed();
event::ptr realloc_if_needed();
cldnn::network::ptr get_unfused_subgraph();
@@ -332,6 +332,21 @@ protected:
return { layout(in_layout.get<ShapeType>(), output_type, in_layout.format) };
}
virtual bool need_reset_input_memory() const {
return false;
}
virtual bool need_reset_output_memory() const {
std::vector<primitive_id> users;
for (auto u : _node->get_users())
users.push_back(u->id());
for (auto u : _network.get_primitives(users)) {
if (u->need_reset_input_memory())
return true;
}
return false;
}
// This could be implemented via single map std::unordered_map<instrumentation::perf_counter_key, std::tuple<int64_t, size_t>>
// but the overhead on using perf_counter_key as map key is too big, thus we use hash as map key
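The need_reset_output_memory default above is a one-step check over the node's users. A runnable sketch of the same policy with simplified stand-in types (not the real primitive_inst hierarchy):

#include <vector>

struct inst_sketch {
    std::vector<const inst_sketch*> users;
    bool reads_padded_input = false;  // conv/deconv with a padded input layout

    bool need_reset_input_memory() const { return reads_padded_input; }
    bool need_reset_output_memory() const {
        // Output must be cleared on reuse iff some user reads its pad area.
        for (const auto* u : users)
            if (u->need_reset_input_memory())
                return true;
        return false;
    }
};

int main() {
    inst_sketch reorder, conv;
    conv.reads_padded_input = true;  // e.g. convolution with pad_before > 0
    reorder.users.push_back(&conv);
    return reorder.need_reset_output_memory() ? 0 : 1;  // exits 0: reset needed
}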


@@ -1322,16 +1322,6 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
}
}
memory::ptr network::get_memory_from_pool(const layout& layout,
primitive_id id,
std::set<primitive_id> dependencies,
allocation_type type,
bool reusable) {
if (_config.get_property(ov::intel_gpu::enable_memory_pool))
return _memory_pool->get_memory(layout, id, get_id(), dependencies, type, reusable);
return _memory_pool->get_memory(layout, type);
}
network::VariableState& network::get_variable_memory(const std::string &variable_id) {
auto it = _variables_states.find(variable_id);
if (it == _variables_states.end()) {


@@ -48,10 +48,6 @@ std::string count_nonzero_inst::to_string(count_nonzero_node const& node) {
count_nonzero_inst::typed_primitive_inst(network& network, count_nonzero_node const& node) : parent(network, node) {}
void count_nonzero_inst::on_execute() {
output_memory().fill(_network.get_stream(), 0);
}
// -----------------------------------------------
// gather_nonzero
// -----------------------------------------------


@@ -242,11 +242,11 @@ void primitive_inst::update_shape() {
}
}
void primitive_inst::realloc_if_needed() {
event::ptr primitive_inst::realloc_if_needed() {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
event::ptr ev = nullptr;
// Update param if fake_alignment is available
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
auto actual_layout = updated_params.get_output_layout();
@@ -254,28 +254,31 @@ void primitive_inst::realloc_if_needed() {
// input_layout node is supposed to always use external memory in dynamic case
if (_node->is_type<input_layout>())
return;
return ev;
bool can_reuse_buffer = _outputs[0] && actual_layout.count() <= max_output_layout_size;
if (can_reuse_buffer) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
if (need_reset_output_memory()) {
ev = _outputs[0]->fill(_network.get_stream());
}
} else {
GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
<< " Current buffer_size=" << max_output_layout_size
<< " Requested buffer_size=" << actual_layout.count() << std::endl;
_outputs = allocate_outputs(&updated_params);
_outputs = allocate_outputs(&updated_params, need_reset_output_memory());
// TODO : need to handle multiple outputs
max_output_layout_size = updated_params.output_layouts[0].count();
}
// intermediate memory allocation is required for primitives consisting of multiple kernels in dynamic case
{
if (_impl == nullptr)
return;
return ev;
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
if (ibuf_layouts.empty())
return;
return ev;
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
if (i < _intermediates_memory.size() && ibuf_layouts[i].bytes_count() <= max_intermediates_memory_sizes[i]) {
@@ -293,6 +296,7 @@ void primitive_inst::realloc_if_needed() {
}
}
}
return ev;
}
bool primitive_inst::update_impl() {
@@ -431,7 +435,9 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
auto ev = update_weights();
if (ev)
dependencies.push_back(ev);
realloc_if_needed();
auto ev_reset = realloc_if_needed();
if (ev_reset)
dependencies.push_back(ev_reset);
}
}
}
@@ -763,15 +769,15 @@ static bool user_requesting_mem_reuse_false(const program_node& node) {
}
memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params,
uint32_t net_id, bool is_internal, size_t idx) {
uint32_t net_id, bool is_internal, size_t idx, bool reset) {
auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
allocation_type type, bool reusable) {
allocation_type type, bool reusable, bool reset = true) {
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
// Use layout with max tensor for dynamic shape with upper bound
auto static_layout = cldnn::layout(layout.data_type, layout.format, layout.get_tensor(), layout.data_padding);
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable);
return pool.get_memory(static_layout, type);
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable, reset);
return pool.get_memory(static_layout, type, reset);
};
@@ -817,7 +823,8 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
false);
false,
reset);
} else if (is_internal && _node.is_output() && _node.is_type<generic_layer>() &&
_engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
@@ -829,23 +836,24 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
return _engine.allocate_memory(layout, alloc_type, false);
} else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
return _engine.allocate_memory(layout, alloc_type);
return _engine.allocate_memory(layout, alloc_type, reset);
} else {
return get_memory_from_pool(_engine,
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
true);
true,
reset);
}
}
std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params) {
std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params, bool reset_mem) {
std::vector<memory::ptr> outputs;
for (size_t i = 0; i < get_node().get_outputs_count() ; ++i) {
outputs.push_back(allocate_output(get_network().get_engine(), _network.get_memory_pool(),
*_node, (updated_params != nullptr) ? *updated_params : *_impl_params,
get_network_id(), _network.is_internal(), i));
get_network_id(), _network.is_internal(), i, reset_mem));
}
return outputs;
}


@@ -245,7 +245,6 @@ public:
JitDefinitions GetDefinitions(const Tensor::TensorBaseT<DType, Layout>& t) const {
JitDefinitions definitions{
{_name + "_OFFSET", toCodeString(t.GetFirstElementOffset())},
{_name + "_VIEW_OFFSET", toCodeString(t.GetViewOffset())},
{_name + "_LENGTH", toCodeString(t.LogicalSize())},
{_name + "_DIMS", toCodeString(t.GetDims().size())},
@@ -258,6 +257,7 @@ public:
definitions.insert(definitions.end(), type_defs.begin(), type_defs.end());
if (!t.is_dynamic()) {
definitions.push_back({_name + "_OFFSET", toCodeString(t.GetFirstElementOffset())});
definitions.push_back({_name + "_SIZE", toCodeString(t.GetDims().size())});
definitions.push_back(
{_name + "_SIZES_DATA",
@@ -265,13 +265,34 @@ public:
definitions.push_back(
{_name + "_PITCHES",
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 1, [](const Tensor::Dim& d) { return d.pitch; })});
} else {
// calculate tensor offset
std::vector<std::string> padded_pitches = {
toVectorMulString({_name + "_X_PITCH", _name + "_PAD_BEFORE_SIZE_X"}),
toVectorMulString({_name + "_Y_PITCH", _name + "_PAD_BEFORE_SIZE_Y"}),
toVectorMulString({_name + "_Z_PITCH", _name + "_PAD_BEFORE_SIZE_Z"}),
toVectorMulString({_name + "_W_PITCH", _name + "_PAD_BEFORE_SIZE_W"}),
toVectorMulString({_name + "_FEATURE_PITCH", _name + "_PAD_BEFORE_FEATURE_NUM"}),
toVectorMulString({_name + "_BATCH_PITCH", _name + "_PAD_BEFORE_BATCH_NUM"})};
std::string offset_str = "(";
for (size_t i = 0; i < padded_pitches.size(); ++i) {
offset_str += padded_pitches[i];
if (i < padded_pitches.size() - 1)
offset_str += " + ";
}
offset_str += ")";
definitions.push_back({_name + "_OFFSET", offset_str});
}
definitions.push_back(
{_name + "_PAD_BEFORE",
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.before; })});
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) {
return d.pad.before;
})});
definitions.push_back(
{_name + "_PAD_AFTER",
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.after; })});
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) {
return d.pad.after;
})});
return definitions;
}
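For a dynamic padded tensor, the else-branch above emits the first-element offset as an expression over the runtime pitch and pad macros instead of a precomputed constant. The generated JIT looks roughly like this (illustrative define for a tensor named INPUT0, reusing kernel_selector's existing per-axis macro names; the exact spelling is an assumption):

// Illustrative generated define (dynamic-shape branch), assuming the usual
// per-axis pitch/pad_before macros are emitted elsewhere for the same tensor:
#define INPUT0_OFFSET ((INPUT0_X_PITCH * INPUT0_PAD_BEFORE_SIZE_X) + \
                       (INPUT0_Y_PITCH * INPUT0_PAD_BEFORE_SIZE_Y) + \
                       (INPUT0_Z_PITCH * INPUT0_PAD_BEFORE_SIZE_Z) + \
                       (INPUT0_W_PITCH * INPUT0_PAD_BEFORE_SIZE_W) + \
                       (INPUT0_FEATURE_PITCH * INPUT0_PAD_BEFORE_FEATURE_NUM) + \
                       (INPUT0_BATCH_PITCH * INPUT0_PAD_BEFORE_BATCH_NUM))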


@@ -120,7 +120,8 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
const primitive_id& id,
uint32_t network_id,
const std::set<primitive_id>& restrictions,
allocation_type type) {
allocation_type type,
bool reset) {
auto it = _non_padded_pool.lower_bound(layout.bytes_count());
while (it != _non_padded_pool.end()) {
if (it->second._network_id == network_id &&
@@ -139,7 +140,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
}
GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
// didn't find anything for you? create new resource
auto mem = alloc_memory(layout, type);
auto mem = alloc_memory(layout, type, reset);
{
_non_padded_pool.emplace(layout.bytes_count(),
memory_record({{id, network_id}}, mem, network_id, type));
@@ -221,21 +222,22 @@ memory::ptr memory_pool::get_memory(const layout& layout,
uint32_t network_id,
const std::set<primitive_id>& restrictions,
allocation_type type,
bool reusable_across_network) {
bool reusable_across_network,
bool reset) {
if (reusable_across_network) {
// reusable within the same network
if (!layout.format.is_image() && layout.data_padding == padding{{0, 0, 0, 0}, 0}) {
// non-padded buffers
return get_from_non_padded_pool(layout, id, network_id, restrictions, type);
return get_from_non_padded_pool(layout, id, network_id, restrictions, type, reset);
} else if (!layout.format.is_image()) {
// padded buffers
return get_from_padded_pool(layout, id, network_id, restrictions, type);
} else {
// images (reuse not yet implemented)
return alloc_memory(layout, type);
return alloc_memory(layout, type, reset);
}
} else {
return alloc_memory(layout, type);
return alloc_memory(layout, type, reset);
}
}
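What the reset flag threaded through get_memory ultimately controls can be shown with simplified stand-ins for the allocator (hypothetical types, not the real cldnn classes):

#include <cstddef>
#include <memory>

std::unique_ptr<float[]> alloc_memory(std::size_t count, bool reset) {
    if (reset)
        return std::make_unique<float[]>(count);       // zero-initialized
    return std::unique_ptr<float[]>(new float[count]); // left uninitialized
}

std::unique_ptr<float[]> get_memory(std::size_t count, bool reusable, bool reset) {
    // Pool-miss path only; on a pool hit the cached buffer would be returned
    // and any required reset enqueued by the caller (see realloc_if_needed).
    (void)reusable;
    return alloc_memory(count, reset);
}

int main() {
    auto zeroed = get_memory(16, /*reusable=*/true, /*reset=*/true);
    auto raw    = get_memory(16, /*reusable=*/true, /*reset=*/false);
    return zeroed[0] == 0.0f ? 0 : 1;
}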


@@ -151,7 +151,7 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
}
if (reset || res->is_memory_reset_needed(layout)) {
res->fill(get_service_stream());
get_service_stream().wait_for_events({res->fill(get_service_stream())});
}
return res;


@@ -73,9 +73,6 @@ event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) {
cl::Event& ev_ocl = downcast<ocl_event>(ev.get())->get();
cl_stream.get_cl_queue().enqueueFillBuffer<unsigned char>(_buffer, pattern, 0, size(), nullptr, &ev_ocl);
// TODO: do we need sync here?
cl_stream.finish();
return ev;
}
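With the implicit finish removed, synchronization moves to the caller, which waits on the returned event only where a real dependency exists. A standalone sketch of the same pattern using the plain OpenCL C++ wrapper (generic OpenCL, not cldnn code):

#include <CL/cl2.hpp>  // header name varies by SDK; newer ones use <CL/opencl.hpp>

int main() {
    cl::Context ctx(CL_DEVICE_TYPE_GPU);
    cl::CommandQueue queue(ctx);
    cl::Buffer buf(ctx, CL_MEM_READ_WRITE, 1024);

    cl::Event ev;
    queue.enqueueFillBuffer<unsigned char>(buf, 0, 0, 1024, nullptr, &ev);

    // ... enqueue unrelated work; no queue-wide finish() after the fill ...

    ev.wait();  // block only at the point that actually depends on the fill
    return 0;
}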


@@ -6,6 +6,7 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/softmax.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/data.hpp>
#include "softmax_inst.h"
@@ -19,6 +20,106 @@ using namespace cldnn;
using namespace ::tests;
namespace memory_realloc_tests {
TEST(memory_reuse_realloc_reset_test, basic_conv_with_padding) {
auto& engine = get_test_engine();
layout weight_layout = layout{ov::PartialShape{1, 3, 3, 3}, data_types::f16, format::bfyx};
auto weights = engine.allocate_memory(weight_layout);
set_values<FLOAT16>(weights, {
1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f,
//
2.0f, 2.0f, 2.0f,
2.0f, 2.0f, 2.0f,
2.0f, 2.0f, 2.0f,
//
3.0f, 3.0f, 3.0f,
3.0f, 3.0f, 3.0f,
3.0f, 3.0f, 3.0f,
});
layout input_layout_1 = layout{ov::PartialShape{1, 3, 5, 5}, data_types::f32, format::bfyx};
auto input_mem_1 = engine.allocate_memory(input_layout_1);
set_values(input_mem_1, {
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
//
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
//
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
});
std::vector<float> ref_output_1 = {6, 18, 36, 54, 72, 54, 30, 12, 36, 72, 108, 144, 108,
60, 18, 54, 108, 162, 216, 162, 90, 18, 54, 108, 162, 216,
162, 90, 18, 54, 108, 162, 216, 162, 90, 12, 36, 72, 108,
144, 108, 60, 6, 18, 36, 54, 72, 54, 30};
layout input_layout_2 = layout{ov::PartialShape{1, 3, 2, 2}, data_types::f32, format::bfyx};
auto input_mem_2 = engine.allocate_memory(input_layout_2);
set_values(input_mem_2, {11.0f, 11.0f, 11.0f, 11.0f,
11.0f, 11.0f, 11.0f, 11.0f,
11.0f, 11.0f, 11.0f, 11.0f});
std::vector<float> ref_output_2 = { 66, 132, 132, 66, 132, 264, 264, 132, 132, 264, 264, 132, 66, 132, 132, 66};
std::vector<float> values_to_subtract = {};
auto input_l = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
topology topology(input_layout("input", input_l),
data("weights", weights),
reorder("reorder", input_info("input"), format::bfyx, data_types::f16,
values_to_subtract, reorder_mean_mode::subtract, padding{{0, 0, 2, 2}, 0}),
convolution("conv",
input_info("reorder"),
{"weights"},
{}, /*bias*/
{1, 1}, /*stride*/
{2, 2}, /*pad*/
{1, 1}, /*dilation*/
{2, 2}, /*pad_above*/
{2, 2}, /*pad_below*/
padding{{0, 0, 0, 0}, 0}),
reorder("output", input_info("conv"), format::bfyx, data_types::f32)); /*output padding*/
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
network.set_input_data("input", input_mem_1);
auto outputs_1 = network.execute();
network.set_input_data("input", input_mem_2);
auto outputs_2 = network.execute();
auto output_mem_2 = outputs_2.begin()->second.get_memory();
cldnn::mem_lock<float> output_mem_2_ptr(output_mem_2, get_test_stream());
for (size_t i = 0; i < output_mem_2->get_layout().get_buffer_size().count(); ++i) {
ASSERT_EQ(output_mem_2_ptr[i], ref_output_2[i]);
}
// check padding of second run of reorder
// 0, 0, 0, 0, 0, 0,
// 0, 0, 0, 0, 0, 0,
// 0, 0, 11, 11, 0, 0,
// 0, 0, 11, 11, 0, 0,
// 0, 0,"0","0","0","0", // !! check pad_after
// 0, 0,"0","0","0","0", // !! check pad_after
auto reorder_mem = network.get_primitive("reorder")->output_memory_ptr();
cldnn::mem_lock<FLOAT16, mem_lock_type::read> reorder_mem_ptr(reorder_mem, get_test_stream());
for (size_t i = 26; i < 29; ++i) {
ASSERT_EQ((float)reorder_mem_ptr[i], 0.f);
}
for (size_t i = 32; i < 35; ++i) {
ASSERT_EQ((float)reorder_mem_ptr[i], 0.f);
}
}
TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
static const int32_t