[GPU] Fix for conv/deconv weights calculated in runtime (#8952)

2021-12-15 12:17:13 +03:00 · 2021-12-15 12:17:13 +03:00 · b492b59136
commit b492b59136
parent 2f07b98251
14 changed files with 474 additions and 544 deletions
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp
@ -196,15 +196,16 @@ KernelsData ConvolutionKernelBase::GetCommonKernelsData(const Params& params,
        return {};
    }

+    auto preferredWeightsLayout = GetPreferredWeightsLayout(newParams);
    bool succeed = UpdateWeightsParams(newParams,
                                       options,
-                                       GetPreferredWeightsLayout(newParams),
+                                       preferredWeightsLayout,
                                       kd.weightsReorderParams,
                                       GetSupportedKey(),
                                       newParams.groups,
                                       newParams.transposed);

-    bool bSupportedWeightsLayout = newParams.weights.GetLayout() == GetPreferredWeightsLayout(newParams);
+    bool bSupportedWeightsLayout = newParams.weights.GetLayout() == preferredWeightsLayout;
    const bool bWeightsOK = bSupportedWeightsLayout || options.allowStaticInputReordering;

    if (!succeed || !bWeightsOK) {
--- a/inference-engine/thirdparty/clDNN/src/binary_convolution.cpp
+++ b/inference-engine/thirdparty/clDNN/src/binary_convolution.cpp
@ -125,7 +125,7 @@ binary_convolution_inst::typed_primitive_inst(network& network, binary_convoluti
                              "Only one-dimensional batch size are supported");
        CLDNN_ERROR_LESS_THAN(node.id(),
                              "Weights feature maps number",
-                              (input_inst.size.feature[0] + pad.feature[0]) / split,
+                              input_inst.size.feature[0],
                              "input feature maps number",
                              filter_inst.size.feature[0],
                              "Weights/ifm mismatch");
--- a/inference-engine/thirdparty/clDNN/src/convolution.cpp
+++ b/inference-engine/thirdparty/clDNN/src/convolution.cpp
@ -97,7 +97,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
        input_layout.format == format::image_2d_weights_winograd_6x3_s1_xfbyb)
        CLDNN_ERROR_MESSAGE(
            node.id(),
-            "Input for convolution should not be in windograd weights format - it is reserved for weights only");
+            "Input for convolution should not be in winograd weights format - it is reserved for weights only");

    if (input_layout.format == format::winograd_2x3_s1_data) {
        CLDNN_ERROR_NOT_EQUAL(node.id(),
@ -369,10 +369,19 @@ convolution_inst::typed_primitive_inst(network& network, convolution_node const&
                              "Only one-dimensional batch size are supported");
        CLDNN_ERROR_LESS_THAN(node.id(),
                              "Weights feature maps number",
-                              (input_inst.size.feature[0] + pad.feature[0]) / split,
+                              input_inst.size.feature[0],
                              "input feature maps number",
                              weights_ifm,
                              "Weights/ifm mismatch");
+
+        if (!argument.grouped_weights_shape && !format::is_grouped(filter_inst.format)) {
+            CLDNN_ERROR_NOT_EQUAL(node.id(),
+                                  "Weights feature maps number",
+                                  input_inst.size.feature[0],
+                                  "input feature maps number",
+                                  weights_ifm,
+                                  "Weights/ifm mismatch");
+        }
    }
 }
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/deconvolution.cpp
+++ b/inference-engine/thirdparty/clDNN/src/deconvolution.cpp
@ -82,11 +82,11 @@ layout deconvolution_inst::calc_output_layout(deconvolution_node const& node) {
    int32_t off_factor = -2;
    size_t spatial_dims = cldnn::format::traits(input_layout.format).spatial_num;
    CLDNN_ERROR_GREATER_THAN(node.id(),
-                                   "number of spatial dimensions",
-                                   spatial_dims,
-                                   "expected number of dimensions",
-                                   3,
-                                   "As for now, deconvolutions with more than 3 dimensions are not supported");
+                             "number of spatial dimensions",
+                             spatial_dims,
+                             "expected number of dimensions",
+                             3,
+                             "As for now, deconvolutions with more than 3 dimensions are not supported");

    int32_t x = off_factor * pad.spatial[0] + (input_layout.size.spatial[0] - 1) * strd.spatial[0] + filter_size.spatial[0];
    int32_t y = 1;
@ -208,6 +208,7 @@ deconvolution_inst::typed_primitive_inst(network& network, deconvolution_node co
                                  1,
                                  "Spatial[0] of bias should be 1. Bias isn't 1D vector.");
        }
+
        CLDNN_ERROR_NOT_EQUAL(node.id(),
                              "deconvolution padding filling value",
                              node.get_output_layout().data_padding.filling_value(),
@ -240,10 +241,19 @@ deconvolution_inst::typed_primitive_inst(network& network, deconvolution_node co
                              "Only one-dimensional features are supported");
        CLDNN_ERROR_LESS_THAN(node.id(),
                              "Weights feature maps number",
-                              (input_inst.size.feature[0] + pad.feature[0]) / split,
+                              input_inst.size.feature[0],
                              "input feature maps number",
                              weights_ifm,
-                              "Weights/ifm mimsmatch");
+                              "Weights/ifm mismatch");
+
+        if (!argument.grouped_weights_shape && !format::is_grouped(filter_inst.format)) {
+            CLDNN_ERROR_NOT_EQUAL(node.id(),
+                                  "Weights feature maps number",
+                                  input_inst.size.feature[0],
+                                  "input feature maps number",
+                                  weights_ifm,
+                                  "Weights/ifm mismatch");
+        }
    }
 }
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp
@ -536,7 +536,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
        }
    };

-    const auto reorder_input_deconvolution = [&p, &lo, &rf](typed_program_node<deconvolution>& deconv_node) {
+    const auto reorder_input_and_weights_deconvolution = [&p, &lo, &rf](typed_program_node<deconvolution>& deconv_node) {
        auto& input = deconv_node.input();
        auto input_layout = input.get_output_layout();
        auto new_format = lo.get_preferred_format(deconv_node);
@ -547,14 +547,41 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                p.add_intermediate(reorder.first, deconv_node, 0, !reorder.second);
            }
        }
+
+        auto& weights = deconv_node.weights();
+        auto weights_layout = weights.get_output_layout();
+        if (!format::is_simple_data_format(weights_layout.format) && !weights.is_type<data>() && !weights.is_constant()) {
+            auto dims = weights_layout.format.dimension();
+            auto preferred_format = dims <= 4 ? format::bfyx : dims == 5 ? format::bfzyx : format::bfwzyx;
+            auto reorder = rf.get_reorder(weights.id(), weights_layout,
+                layout{ weights_layout.data_type, preferred_format, weights_layout.size });
+            if (reorder.first) {
+                p.add_intermediate(reorder.first, deconv_node, 1, !reorder.second);
+            }
+        }
+    };
+
+    // Inserts a reorder in front of the weights input (index 1) of a convolution when the
+    // weights are neither a `data` primitive nor constant and their layout is not a simple
+    // data format. The target format is the plain layout matching the weights rank.
+    // `lo` is intentionally not captured: the body never uses the layout optimizer.
+    const auto reorder_weights_convolution = [&p, &rf](typed_program_node<convolution>& conv_node) {
+        auto& weights = conv_node.weights();
+        auto weights_layout = weights.get_output_layout();
+        if (!format::is_simple_data_format(weights_layout.format) && !weights.is_type<data>() && !weights.is_constant()) {
+            auto dims = weights_layout.format.dimension();
+            // Up to 4 dims -> bfyx, exactly 5 -> bfzyx, anything else -> bfwzyx.
+            auto preferred_format = dims <= 4 ? format::bfyx : dims == 5 ? format::bfzyx : format::bfwzyx;
+            auto reorder = rf.get_reorder(weights.id(), weights_layout,
+                layout{ weights_layout.data_type, preferred_format, weights_layout.size });
+            // Only splice a node in when the factory actually returned one.
+            // NOTE(review): `reorder.second` presumably marks a reused/cached reorder - confirm against reorder_factory.
+            if (reorder.first) {
+                p.add_intermediate(reorder.first, conv_node, 1, !reorder.second);
+            }
+        }
    };

    for (auto& prim : p.get_processing_order()) {
-        program_helpers::do_for_types<detection_output, binary_convolution, deconvolution>(
+        program_helpers::do_for_types<detection_output, binary_convolution, deconvolution, convolution>(
            *prim,
            reorder_input_detection_output,
            reorder_input_binary_convolution,
-            reorder_input_deconvolution);
+            reorder_input_and_weights_deconvolution,
+            reorder_weights_convolution);
    }

    for (auto n : p.get_processing_order()) {
--- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
+++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
@ -1320,17 +1320,27 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
            impl_candidate = impl_types::ocl;
        }

+        size_t eltw_dep = 0;
        for (auto& fo : node.get_fused_primitives()) {
            if (fo.node->is_type<eltwise>()) {
                auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
                auto out_layout = node.get_output_layout();
                auto in_dt = in_layout.data_type;
                auto out_dt = out_layout.data_type;
-                if ((out_layout.count() == in_layout.count()) &&
-                    (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
-                    fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
-                    impl_candidate = impl_types::ocl;
-                    break;
+                if (fo.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(in_layout)) {
+                    if ((out_layout.count() == in_layout.count()) &&
+                        (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
+                        impl_candidate = impl_types::ocl;
+                        break;
+                    }
+                    if (in_layout.size == out_layout.size && in_layout.format == out_layout.format && in_layout.data_padding == out_layout.data_padding &&
+                        data_type_traits::size_of(in_dt) == data_type_traits::size_of(out_dt)) {
+                        if (eltw_dep > 0) {
+                            impl_candidate = impl_types::ocl;
+                            break;
+                        }
+                        eltw_dep = fo.dep_start_idx;
+                    }
                }
            } else if (fo.node->is_type<activation>()) {
                // Some activations aren't implemented in oneDNN
--- a/inference-engine/thirdparty/clDNN/src/network.cpp
+++ b/inference-engine/thirdparty/clDNN/src/network.cpp
@ -514,15 +514,17 @@ void network::allocate_primitives() {
                        can_reuse_eltwise_mem = true;
                    }

-                    if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
-                        auto& eltw_inst = _primitives.at(eltw_in.id());
-                        auto& prim_inst = _primitives.at(node->id());
-                        auto eltw_mem_type = eltw_inst->output_memory().get_allocation_type();
-                        auto prim_mem_type = prim_inst->output_memory().get_allocation_type();
+                    if (!can_reuse_eltwise_mem) {
+                        if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
+                            auto& eltw_inst = _primitives.at(eltw_in.id());
+                            auto& prim_inst = _primitives.at(node->id());
+                            auto eltw_mem_type = eltw_inst->output_memory().get_allocation_type();
+                            auto prim_mem_type = prim_inst->output_memory().get_allocation_type();

-                        // Keep lockable memory type for `prim_inst` output if needed
-                        if (eltw_mem_type != prim_mem_type && eltw_mem_type != allocation_type::cl_mem && eltw_mem_type != allocation_type::usm_host)
-                            can_reuse_eltwise_mem = false;
+                            // Keep lockable memory type for `prim_inst` output if needed
+                            if (eltw_mem_type != prim_mem_type && eltw_mem_type != allocation_type::cl_mem && eltw_mem_type != allocation_type::usm_host)
+                                can_reuse_eltwise_mem = false;
+                        }
                    }

                    if (fused_op.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(eltw_in_layout) && !can_reuse_eltwise_mem) {
--- a/inference-engine/thirdparty/clDNN/src/program.cpp
+++ b/inference-engine/thirdparty/clDNN/src/program.cpp
@ -741,10 +741,10 @@ program_node& program::get_or_create(std::shared_ptr<primitive> prim) {
 }

 void program::add_intermediate(program_node& node,
-                                    program_node& next,
-                                    size_t prev_idx,
-                                    bool connect_int_node_with_old_dep,
-                                    bool move_usrs_of_prev_to_node) {
+                               program_node& next,
+                               size_t prev_idx,
+                               bool connect_int_node_with_old_dep,
+                               bool move_usrs_of_prev_to_node) {
    if (connect_int_node_with_old_dep && !node.dependencies.empty())
        throw std::invalid_argument(
            "Node which is about to be added in between two other nodes should not have any existing dependencies");
@ -1112,8 +1112,8 @@ void program::remove_nodes(std::vector<program_node*>& to_remove) {
 // TODO: break this function into number of smaller ones + add per-primitive fields (possibly use
 // primitive_inst::to_string?)
 void program::dump_program(const char* stage,
-                                bool with_full_info,
-                                std::function<bool(program_node const&)> const& filter) const {
+                           bool with_full_info,
+                           std::function<bool(program_node const&)> const& filter) const {
    std::string path = get_dir_path(options);
    if (path.empty() || !with_full_info) {
        return;
@ -1230,7 +1230,7 @@ void program::save_pass_info(std::string pass_name) {
 }

 void program::add_optimized_primitive_info(primitive_id optimized_primitive_id,
-                                                std::vector<primitive_id> replaced_with_ids) {
+                                           std::vector<primitive_id> replaced_with_ids) {
    for (auto& e : optimized) {
        auto it = std::find_if(e.second.begin(), e.second.end(), [&optimized_primitive_id](const primitive_id& id) {
           return optimized_primitive_id == id;
--- a/inference-engine/thirdparty/clDNN/src/program_node.cpp
+++ b/inference-engine/thirdparty/clDNN/src/program_node.cpp
@ -428,7 +428,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
        // Ignore optimized operations for "previous" operation in our operation pair
        while (type_is_any_optimized(prev_type) && cur_post_op_idx < post_ops_size - 1) {
            prev_post_op_idx++;
-            cur_post_op_idx++;
+            if (prev_post_op_idx == cur_post_op_idx)
+                cur_post_op_idx++;
            prev_type = cur_post_ops[prev_post_op_idx].op_type;
            cur_type = cur_post_ops[cur_post_op_idx].op_type;
        }
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
@ -681,7 +681,7 @@ TEST_P(conv_fp32_reorder_fsv16_to_bfyx_conv, basic) {
        reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32),
        convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
        reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32),
-        convolution("conv_output", "reorder_bfyx", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation),
+        convolution("conv_output", "reorder_bfyx", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
        activation("activation", "conv_output", activation_func::abs),
        reorder("reorder_output", "activation", p.default_format, data_types::f32)
    );
@ -10059,7 +10059,7 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, have_subtract_per_feature)
        data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
        convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
        reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32, values_to_subtract),
-        convolution("conv_output", "reorder_fsv32", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation),
+        convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
        activation("activation", "conv_output", activation_func::abs)
    );

@ -10088,7 +10088,7 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation, have_fused_activat
        convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
        reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32),
        activation("activation_quantize", "reorder_fsv32", activation_func::relu),
-        convolution("conv_output", "activation_quantize", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation),
+        convolution("conv_output", "activation_quantize", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
        activation("activation", "conv_output", activation_func::abs)
    );

@ -10116,7 +10116,7 @@ TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding, have_data_padding) {
        data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
        convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation),
        reorder("reorder_fsv32", "conv_prim", layout(data_types::f32, format::fs_b_yx_fsv32, dw_tensor, padding{ {0, 0, 1, 1}, 0 })),
-        convolution("conv_output", "reorder_fsv32", { "weights_dw" }, 1, dw_stride, p.pad, p.dilation),
+        convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation),
        activation("activation", "conv_output", activation_func::abs),
        activation("activation2", "conv_prim", activation_func::abs),
        eltwise("add_bias", { "activation", "activation2" }, eltwise_mode::sum)
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp
@ -43,7 +43,7 @@ TEST(memory_tests, DISABLED_network_creation_loop)
 {
    engine eng;

-    memory in = memory::allocate(eng, layout{ data_types::f32, format::bfyx,{ 1, 1, 1000, 1000 } });
+    memory in = memory::allocate(eng, layout{ data_types::f32, format::bfyx, { 1, 1, 1000, 1000 } });

    topology tpl{
        input_layout("in", in->get_layout()),
@ -66,7 +66,7 @@ TEST(memory_pool, basic_non_padded_relu_pipe) {
    auto x_size = 1;
    auto y_size = 1;

-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
+    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });

    topology topology;
    topology.add(input_layout("input", input->get_layout()));
@ -86,7 +86,7 @@ TEST(memory_pool, basic_non_padded_relu_pipe) {
    network.set_input_data("input", input);
    auto outputs = network.execute();

-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 64);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)64);
 }

 TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
@ -99,13 +99,13 @@ TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
    auto x_size = 4;
    auto y_size = 4;

-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
+    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });

    topology topology;
    topology.add(input_layout("input", input->get_layout()));
    topology.add(activation("relu", "input", activation_func::relu));
    topology.add(activation("relu1", "relu", activation_func::relu));
-    topology.add(pooling("pool1", "relu1",pooling_mode::max, { 1,1,3,3 }, { 1,1,2,2 }));
+    topology.add(pooling("pool1", "relu1", pooling_mode::max, { 1, 1, 3, 3 }, { 1, 1, 2, 2 }));
    topology.add(activation("relu2", "pool1", activation_func::relu));
    topology.add(activation("relu3", "relu2", activation_func::relu));
    topology.add(activation("relu4", "relu3", activation_func::relu));
@ -133,7 +133,7 @@ TEST(memory_pool, multi_outputs_network) {
    auto x_size = 4;
    auto y_size = 4;

-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
+    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });

    topology topology;
    topology.add(input_layout("input", input->get_layout()));
@ -153,7 +153,7 @@ TEST(memory_pool, multi_outputs_network) {
    network.set_input_data("input", input);
    auto outputs = network.execute();

-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)1536);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 1536);
 }

 TEST(memory_pool, oooq) {
@ -171,14 +171,14 @@ TEST(memory_pool, oooq) {
    auto x_size = 4;
    auto y_size = 4;

-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
+    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });

    topology topology;
    topology.add(input_layout("input", input->get_layout()));
    topology.add(activation("relu1", "input", activation_func::relu));
    topology.add(activation("relu2", "input", activation_func::relu));
    topology.add(activation("relu3", "input", activation_func::relu));
-    topology.add(concatenation("concat1", { "relu1", "relu2"},concatenation::along_f));
+    topology.add(concatenation("concat1", { "relu1", "relu2" },concatenation::along_f));
    topology.add(activation("relu4", "concat1", activation_func::relu));
    topology.add(activation("relu5", "relu3", activation_func::relu));
    topology.add(concatenation("concat2", { "relu4", "relu5" }, concatenation::along_f));
@ -209,7 +209,7 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
    auto inp_x_size = 4;
    auto inp_y_size = 4;

-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
+    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });

    set_values(input,
    {   1.0f, 2.5f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 6.1f, 4.7f, 1.0f, 1.0f, 8.2f, 1.0f, 2.0f, 1.0f,
@ -227,7 +227,7 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
    topology.add(activation("relu4", "concat1", activation_func::relu));
    topology.add(activation("relu5", "relu3", activation_func::relu));
    topology.add(concatenation("concat2", { "relu4", "relu5" }, concatenation::along_f));
-    topology.add(activation("relu6", "concat2", activation_func::linear, {1.0f, 0.5f}));
+    topology.add(activation("relu6", "concat2", activation_func::linear, { 1.0f, 0.5f }));

    build_options bo;
    bo.set_option(build_option::optimize_data(true));
@ -286,8 +286,8 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice_weights) {
    auto inp_x_size = 4;
    auto inp_y_size = 4;

-    auto input= engine->allocate_memory({ data_types::f32, format::bfyx,{ tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
-    auto weights = engine->allocate_memory({ data_types::f32,format::bfyx,{ 1, 1, 3, 2 } });
+    auto input= engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
+    auto weights = engine->allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 3, 2 } });

    std::vector<float> dummy_input_data_1 = {
       /*f0 xy*/ 0.8f, 0.65f, 0.1f, 1.0f, 1.0f, 0.5f, 0.11f, 0.33f, 0.66f, 0.11f, 0.22f, 0.33f, 0.99f, 0.8f, 0.7f, 0.5f,
@ -373,10 +373,10 @@ TEST(memory_pool, shared_mem_pool_diff_batches) {
    layout lay_batch_8 = { dt, fmt, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_8)) }};
    auto input_1 = engine->allocate_memory(lay_batch_1);
    auto input_8 = engine->allocate_memory(lay_batch_8);
-    auto weights = engine->allocate_memory({ dt, fmt, { 1, 1, 3, 2 } });
+    auto weights = engine->allocate_memory({ dt, fmt, { 1, 3, 3, 2 } });

-    std::vector<float> dummy_input_data_1 = generate_random_1d<float>(batch_1*feature_num*inp_x_size*inp_y_size, 0, 1);
-    std::vector<float> dummy_input_data_8 = generate_random_1d<float>(batch_8*feature_num*inp_x_size*inp_y_size, 0, 1);
+    std::vector<float> dummy_input_data_1 = generate_random_1d<float>(batch_1 * feature_num * inp_x_size * inp_y_size, 0, 1);
+    std::vector<float> dummy_input_data_8 = generate_random_1d<float>(batch_8 * feature_num * inp_x_size * inp_y_size, 0, 1);

    set_values(input_1, dummy_input_data_1);
    set_values(input_8, dummy_input_data_8);
@ -396,14 +396,14 @@ TEST(memory_pool, shared_mem_pool_diff_batches) {
    auto outputs = network_first.execute();

    auto dev_info = engine->get_device_info();
-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)3928);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 4744);

    topo.change_input_layout("input", input_1->get_layout());//change input layout to batch=1

    network network_second(*engine, topo, bo);
    network_second.set_input_data("input", input_1);
    auto outputs_second = network_second.execute();
-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)4328);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 5912);
 }

 TEST(memory_pool, shared_dep_two_output) {
@ -459,20 +459,20 @@ TEST(memory_pool, non_opt_intermidate_opt_after) {

    auto input_memory1 = engine.allocate_memory(input_layout1);
    auto input_memory2 = engine.allocate_memory(input_layout2);
-    auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1,1,1,1 }));
+    auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
    auto data_memory = cldnn::data("scale_mem", scale_memory);

    set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f });
    set_values(input_memory2, { 5.0f, 6.0f, 7.0f, 8.0f });
-    set_values(scale_memory, { 1.0f});
+    set_values(scale_memory, { 1.0f });

    auto reshape_tensor = cldnn::tensor(8, 1, 1, 1);
    auto input = cldnn::input_layout("input1", input_layout1);
    auto input2 = cldnn::input_layout("input2", input_layout2);
    auto concat = cldnn::concatenation("concat", { "input1", "input2" }, cldnn::concatenation::along_b);
    auto reshape = cldnn::reshape("reshape", "concat", reshape_tensor);
-    auto crop1 = cldnn::crop("crop1", "reshape", { 1,1,1,1 }, { 0, 0, 0, 0 });
-    auto crop2 = cldnn::crop("crop2", "reshape", { 1,1,1,1 }, { 1, 0, 0, 0 });
+    auto crop1 = cldnn::crop("crop1", "reshape", { 1, 1, 1, 1 }, { 0, 0, 0, 0 });
+    auto crop2 = cldnn::crop("crop2", "reshape", { 1, 1, 1, 1 }, { 1, 0, 0, 0 });
    auto eltwise1 = cldnn::scale("elt1", "crop1", "scale_mem");
    auto eltwise2 = cldnn::scale("elt2", "crop2", "scale_mem");

@ -508,7 +508,7 @@ TEST(memory_pool, add_mem_dep_test) {
    auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 2, 2, 2 });

    auto input_memory1 = engine.allocate_memory(input_layout1);
-    auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1,1,1,1 }));
+    auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
    auto data_memory = cldnn::data("scale_mem", scale_memory);

    set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f,
@ -518,8 +518,8 @@ TEST(memory_pool, add_mem_dep_test) {
    auto input = cldnn::input_layout("input1", input_layout1);
    auto actv1 = cldnn::activation("input_activ1", "input1", activation_func::abs);
    auto actv2 = cldnn::activation("input_activ2", "input1", activation_func::abs);
-    auto crop1 = cldnn::crop("crop1", "input_activ1", { 1,1,2,2 }, { 0, 0, 0, 0 });
-    auto crop2 = cldnn::crop("crop2", "input_activ2", { 1,1,2,2 }, { 0, 1, 0, 0 });
+    auto crop1 = cldnn::crop("crop1", "input_activ1", { 1, 1, 2, 2 }, { 0, 0, 0, 0 });
+    auto crop2 = cldnn::crop("crop2", "input_activ2", { 1, 1, 2, 2 }, { 0, 1, 0, 0 });
    auto eltwise1 = cldnn::scale("elt1", "crop1", "scale_mem");
    auto eltwise2 = cldnn::scale("elt2", "crop2", "scale_mem");
    auto actv3 = cldnn::activation("out3", "elt1", activation_func::abs);
--- a/src/plugins/intel_gpu/src/plugin/ops/convolution.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/convolution.cpp
@ -137,11 +137,12 @@ static void CreateConvolutionBackpropDataOp(Program& p, const std::shared_ptr<ng

    auto weightsName = inputs[1];
    auto weights_node = op->get_input_node_shared_ptr(1);
-    // WA: For the cases like Const(weights)->Sub(zp)->Deconv.
+    bool hasConstantWeights = IsNodeOnConstPath(weights_node);
+    // WA: For the cases like Const(weights)->Sub(zp)->Deconv. And also for the cases with real runtime weights.
    // Dimensions order of weights blob is IOYX, but
    // the selected format is OIYX by default. So we need to swap (and transpose) I and O dimensions to match the format
    // For Constant node on input transpose is not needed, because the data is transposed on const node creation
-    if (IsNodeOnConstPath(weights_node) && std::dynamic_pointer_cast<ngraph::op::v0::Constant>(weights_node) == nullptr) {
+    if ((hasConstantWeights && std::dynamic_pointer_cast<ngraph::op::v0::Constant>(weights_node) == nullptr) || !hasConstantWeights) {
        std::string permuteName = layerName + "_cldnn_weights_permute";
        auto weights_rank = op->get_input_shape(1).size();
        std::vector<uint16_t> permute_order(weights_rank);
@ -195,11 +196,12 @@ static void CreateGroupConvolutionBackpropDataOp(Program& p, const std::shared_p

    auto weightsName = inputs[1];
    auto weights_node = op->get_input_node_shared_ptr(1);
-    // WA: For the cases like Const(weights)->Sub(zp)->Deconv.
+    bool hasConstWeights = IsNodeOnConstPath(weights_node);
+    // WA: For the cases like Const(weights)->Sub(zp)->Deconv. And also for the cases with real runtime weights.
    // Dimensions order of weights blob is IOYX, but
    // the selected format is OIYX by default. So we need to swap I and O dimensions to match the format.
    // For Constant node on input transpose is not needed, because the data is transposed on const node creation
-    if (IsNodeOnConstPath(weights_node) && std::dynamic_pointer_cast<ngraph::op::v0::Constant>(weights_node) == nullptr) {
+    if ((hasConstWeights && std::dynamic_pointer_cast<ngraph::op::v0::Constant>(weights_node) == nullptr) || !hasConstWeights) {
        std::string permuteName = layerName + "_cldnn_weights_permute";
        auto weights_rank = op->get_input_shape(1).size();
        std::vector<uint16_t> permute_order(weights_rank);
--- a/src/plugins/intel_gpu/src/plugin/program.cpp
+++ b/src/plugins/intel_gpu/src/plugin/program.cpp
@ -346,7 +346,7 @@ bool IsNodeOnConstPath(const std::shared_ptr<ngraph::Node>& node) {
    std::function<bool(const std::shared_ptr<ngraph::Node>&)> is_const_node = [&nodes_processed, &is_const_node](const std::shared_ptr<ngraph::Node>& node) {
        if (nodes_processed.count(node)) return true;
        nodes_processed.insert(node);
-        // If input is constant, then drop if from the processing list
+        // If input is constant, then drop it from the processing list
        if (std::dynamic_pointer_cast<ngraph::op::v0::Constant>(node) != nullptr)
            return true;
        // If the node doesn't have any parents and it's not a constant, then we deal with dynamic path