From f5e199c494ea2544ca89beb43298b411e792452b Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 5 Apr 2023 16:20:51 +0400 Subject: [PATCH] [GPU] Don't reorder weights when can reinterpret (#16714) * [GPU] Don't reorder weights when can reinterpret * [GPU] Test fixes --- .../graph_optimizer/prepare_buffer_fusing.cpp | 7 +- src/plugins/intel_gpu/src/graph/network.cpp | 20 +- .../intel_gpu/src/graph/primitive_inst.cpp | 173 +++++++++--------- .../transformations/einsum_decomposition.cpp | 4 + src/plugins/intel_gpu/src/runtime/layout.cpp | 172 ++++++++--------- .../tests/module_tests/layout_test.cpp | 30 ++- .../passes/prepare_buffer_fusing_test.cpp | 2 +- .../tests/test_cases/reshape_gpu_test.cpp | 50 +++++ 8 files changed, 282 insertions(+), 176 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 8bbf2aa3b73..da711cb3ab3 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -324,17 +324,14 @@ void prepare_buffer_fusing::run(program& p) { If crop is before concat there can be padding mismtach, since concat changes padding. */ auto can_optimize = [](const program_node* node) { - bool is_dynamic = node->get_output_layout().is_dynamic(); + bool is_dynamic = node->is_dynamic(); bool is_planar = format::is_default_format(node->get_output_layout().format); bool no_pad = !node->get_output_layout().data_padding && !node->get_input_layouts().empty() && !node->get_input_layouts()[0].data_padding; - // The condition below check only output layout as cases like - // (dyn_shape) -> reshape -> (static_shape) -> some_static_primitive - // may have invalid set_arguments call as output memory of reshape won't be available until reshape primitive is executed if (node->is_type() && is_dynamic && is_planar && no_pad && !node->is_output() && !node->has_fused_primitives()) { return true; } - if (node->is_dynamic() || node->is_output() || node->has_fused_primitives()) { + if (is_dynamic || node->is_output() || node->has_fused_primitives()) { return false; } return true; diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 893103494ac..31c1943b3f6 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -642,8 +642,22 @@ void network::set_arguments() { return; for (auto const& prim : _exec_order) { - if (!prim->is_dynamic()) - prim->set_arguments(); + if (!prim->is_dynamic()) { + bool can_set_args = true; + for (auto& dep : prim->dependencies()) { + // Skip set args for nodes with dynamic & optimized_out dependency + // This is needed to handle dynamic -> static cases like + // (dynamic) -> reshape -> (static) -> some_op + // In that case some_op is static and we may want to set arguments once, + // but dynamic optimized out reshape means that output buffer of reshape is unavailable + // and attempt to set args will fail. 
+ if (dep.first->can_be_optimized() && dep.first->is_dynamic()) + can_set_args = false; + } + + if (can_set_args) + prim->set_arguments(); + } } _reset_arguments = false; } @@ -1308,7 +1322,7 @@ void network::allocate_primitive_instance(program_node const& node) { return true; } if (dep.first->can_be_optimized()) { - if (is_mutable_input(*dep.first)) { + if (is_mutable_input(*dep.first) || dep.first->is_dynamic()) { return true; } } diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index f5f41bf0748..85e82982725 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -395,11 +395,13 @@ bool primitive_inst::update_impl() { impl->set_kernels(kernels); cache.add(updated_params, impl->clone()); }); - _impl = _dynamic_impl->clone(); - auto new_impl_params = _impl->canonicalize_shapes(*_impl_params); - _impl->update_dispatch_data(new_impl_params); + if (!can_be_optimized()) { + _impl = _dynamic_impl->clone(); + auto new_impl_params = _impl->canonicalize_shapes(*_impl_params); + _impl->update_dispatch_data(new_impl_params); - update_shape_info(new_impl_params); + update_shape_info(new_impl_params); + } } else { _impl = _node->type()->choose_impl(*_node, updated_params); auto& kernels_cache = get_network().get_program()->get_kernels_cache(); @@ -715,90 +717,97 @@ event::ptr primitive_inst::update_weights() { if (!weightable_node) return nullptr; + auto& engine = _network.get_engine(); auto& weights_params = _impl->_weights_reorder_params; - bool requires_reorder = weights_params.engine != kernel_selector::GenericKernelParams::Engine::NONE; - const auto weights_idx = _node->get_primitive()->input.size(); - const auto original_weights_memory = dep_memory_ptr(weights_idx); - auto expected_layout = requires_reorder ? 
from_weights_tensor(weights_params.dest) - : original_weights_memory->get_layout(); + auto weights_idx = _node->get_primitive()->input.size(); + auto original_weights_memory = dep_memory_ptr(weights_idx); + auto original_layout = original_weights_memory->get_layout(); - // Set original patrial shape, because it may be lost during kernel_selector::weights_tensor -> layout conversion - expected_layout.set_partial_shape(original_weights_memory->get_layout().get_partial_shape()); - - if (requires_reorder && !_reordered_weights_cache.has(expected_layout)) { - GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(false); - auto original_layout = original_weights_memory->get_layout(); - auto& engine = _network.get_engine(); - - auto get_kernel_key = [&]() -> size_t { - auto seed = _node->get_primitive()->hash(); - seed = hash_combine(seed, expected_layout.hash()); - seed = hash_combine(seed, original_layout.hash()); - return seed; - }; - - cldnn::kernel::ptr kernel = nullptr; - auto kernel_key = get_kernel_key(); - auto& cache = get_network().get_in_mem_kernels_cache(); - if (cache.has(kernel_key)) { - GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string() - << " to " << expected_layout.to_short_string() << std::endl; - GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true); - kernel = cache.get(kernel_key); - } else { - GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string() - << " to " << expected_layout.to_short_string() << std::endl; - auto& kernels_cache = get_network().get_program()->get_kernels_cache(); - auto kernels = kernels_cache.compile(*_impl_params, {weights_params.clKernel->code.kernelString}); - OPENVINO_ASSERT(kernels.size() == 1, "The output of kernel compile has issue"); - kernel = (kernels.begin()->second)[0]; - cache.add(kernel_key, kernel); - } - - auto& stream = get_network().get_stream(); - - bool can_reuse = false; - memory::ptr weights_memory = nullptr; - if (_reordered_weights_cache.is_full()) { - weights_memory = _reordered_weights_cache.get_lru_element().second; - can_reuse = weights_memory->size() <= expected_layout.bytes_count() && weights_memory != original_weights_memory; - } - - if (can_reuse) { - GPU_DEBUG_TRACE_DETAIL << id() << ": reuse weights memory" << std::endl; - weights_memory = engine.reinterpret_buffer(*weights_memory, expected_layout); - } else { - GPU_DEBUG_TRACE_DETAIL << id() << ": allocate weights memory" << std::endl; - auto alloc_type = engine.get_preferred_memory_allocation_type(); - weights_memory = engine.allocate_memory(expected_layout, alloc_type); - } - - _reordered_weights_cache.add(expected_layout, weights_memory); - _impl_params->weights_layout = optional_layout(expected_layout); - GPU_DEBUG_TRACE_DETAIL << id() << ": update weights cache: " << expected_layout.to_short_string() << " cache_size=" - << _reordered_weights_cache.size() << "/" << _reordered_weights_cache.capacity() << std::endl; - - kernel_arguments_data args; - args.inputs.push_back(original_weights_memory); - args.outputs.push_back(weights_memory); - stream.set_arguments(*kernel, weights_params.clKernel->params, args); - auto ev = stream.enqueue_kernel(*kernel, weights_params.clKernel->params, args, {}, true); - - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) { - stream.wait_for_events({ev}); - } - - return ev; - } else { + if (weights_params.engine == kernel_selector::GenericKernelParams::Engine::NONE) { // If kernel doesn't says that it doesn't require weights 
reorder, but weights were reordered previously, then
- incorrect memory buffer may be assigned, so push front original memory in LRU cache
- if (weights_params.engine == kernel_selector::GenericKernelParams::Engine::NONE) {
- _reordered_weights_cache.add(expected_layout, original_weights_memory);
- _impl_params->weights_layout = optional_layout(expected_layout);
+ // incorrect memory buffer may be assigned, so reset cached weights for such case
+ _reordered_weights_cache.add(original_weights_memory->get_layout(), original_weights_memory);
+ } else {
+ auto expected_layout = from_weights_tensor(weights_params.dest);
+ // Set original partial shape, because it may be lost during kernel_selector::weights_tensor -> layout conversion
+ expected_layout.set_partial_shape(original_weights_memory->get_layout().get_partial_shape());
+ _impl_params->weights_layout = optional_layout(expected_layout);
+
+ if (_reordered_weights_cache.has(expected_layout)) {
+ GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
+ GPU_DEBUG_TRACE_DETAIL << id() << ": reuse weights for " << expected_layout.to_short_string() << std::endl;
+ return nullptr;
+ } else if (original_layout.compatible(expected_layout)) {
+ GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
+ GPU_DEBUG_TRACE_DETAIL << id() << ": reinterpret original weights memory from " << original_layout.to_short_string()
+ << " to " << expected_layout.to_short_string() << std::endl;
+ _reordered_weights_cache.add(expected_layout, engine.reinterpret_buffer(*original_weights_memory, expected_layout));
+ return nullptr;
+ } else {
+ GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(false);
+ auto get_kernel_key = [&]() -> size_t {
+ auto seed = _node->get_primitive()->hash();
+ seed = hash_combine(seed, expected_layout.hash());
+ seed = hash_combine(seed, original_layout.hash());
+ return seed;
+ };
+
+ cldnn::kernel::ptr kernel = nullptr;
+ auto kernel_key = get_kernel_key();
+ auto& cache = get_network().get_in_mem_kernels_cache();
+ if (cache.has(kernel_key)) {
+ GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
+ << " to " << expected_layout.to_short_string() << std::endl;
+ GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
+ kernel = cache.get(kernel_key);
+ } else {
+ GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
+ << " to " << expected_layout.to_short_string() << std::endl;
+ auto& kernels_cache = get_network().get_program()->get_kernels_cache();
+ auto kernels = kernels_cache.compile(*_impl_params, {weights_params.clKernel->code.kernelString});
+ OPENVINO_ASSERT(kernels.size() == 1, "The output of kernel compile has issue");
+ kernel = (kernels.begin()->second)[0];
+ cache.add(kernel_key, kernel);
+ }
+
+ auto& stream = get_network().get_stream();
+
+ bool can_reuse = false;
+ memory::ptr weights_memory = nullptr;
+ if (_reordered_weights_cache.is_full()) {
+ weights_memory = _reordered_weights_cache.get_lru_element().second;
+ can_reuse = weights_memory->size() <= expected_layout.bytes_count() && weights_memory != original_weights_memory;
+ }
+
+ if (can_reuse) {
+ GPU_DEBUG_TRACE_DETAIL << id() << ": reuse weights memory for new layout " << expected_layout.to_short_string() << std::endl;
+ weights_memory = engine.reinterpret_buffer(*weights_memory, expected_layout);
+ } else {
+ GPU_DEBUG_TRACE_DETAIL << id() << ": allocate weights memory" << std::endl;
+ auto alloc_type = engine.get_preferred_memory_allocation_type();
+ weights_memory = engine.allocate_memory(expected_layout, 
alloc_type); + } + + _reordered_weights_cache.add(expected_layout, weights_memory); + GPU_DEBUG_TRACE_DETAIL << id() << ": update weights cache: " << expected_layout.to_short_string() << " cache_size=" + << _reordered_weights_cache.size() << "/" << _reordered_weights_cache.capacity() << std::endl; + + kernel_arguments_data args; + args.inputs.push_back(original_weights_memory); + args.outputs.push_back(weights_memory); + stream.set_arguments(*kernel, weights_params.clKernel->params, args); + auto ev = stream.enqueue_kernel(*kernel, weights_params.clKernel->params, args, {}, true); + + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) { + stream.wait_for_events({ev}); + } + + return ev; } } + GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true); return nullptr; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/einsum_decomposition.cpp b/src/plugins/intel_gpu/src/plugin/transformations/einsum_decomposition.cpp index 6b04b45f50d..b7c5fe4b5e3 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/einsum_decomposition.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/einsum_decomposition.cpp @@ -893,6 +893,10 @@ EinsumDecomposition::EinsumDecomposition() { return false; } + if (einsum_node->is_dynamic()) { + return false; + } + auto equation = einsum_node->get_equation(); std::vector input_subscripts; std::string output_subscript; diff --git a/src/plugins/intel_gpu/src/runtime/layout.cpp b/src/plugins/intel_gpu/src/runtime/layout.cpp index 29076654686..6cceb61a311 100644 --- a/src/plugins/intel_gpu/src/runtime/layout.cpp +++ b/src/plugins/intel_gpu/src/runtime/layout.cpp @@ -11,86 +11,6 @@ namespace cldnn { static inline bool check_redundant_1d_along_feature(layout const& l1, layout const& l2); namespace { -// pair.first tells whether l1 and l2 are absolutely identical -// pair.second tells whether l1 and l2 can be reinterpreted to each other without need of reordering -// note: layouts can only be considered identical if data size described by both layouts match (so no data are genereted -// nor dropped) note: if layouts describe two buffers with different size, consider them not to be identical even if -// smaller buffer can be considered to hold subsequence of larger buffer, -// this behavior is required to force buffer allocation for smaller buffer which, currently, should always be -// performed -std::pair are_layouts_identical(layout const& l1, layout const& l2) { - const auto& l1_pad = l1.data_padding; - const auto& l2_pad = l2.data_padding; - - if (l1.is_dynamic() || l2.is_dynamic()) - return {false, false}; - - auto l1_size = l1.get_tensor(); - auto l2_size = l2.get_tensor(); - if (l1 == l2) - return {true, true}; - if (check_redundant_1d_along_feature(l1, l2)) - return {false, true}; - if (l1.data_type != l2.data_type) - return {false, false}; - // Reorders between bfyx, bfzyx, bfwzyx can pe reinterpeted as reshape when - // there is no padding and both hold same number of elements. 
- if (format::is_default_format(l1.format) && format::is_default_format(l2.format) &&
- !l1_pad && !l2_pad && l1.get_linear_size() == l2.get_linear_size())
- return {false, true};
- if (l1_size != l2_size)
- return {false, false};
- if (l1.get_linear_size() != l2.get_linear_size())
- return {false, false};
-
- auto check_format = [&l1, &l2](cldnn::format format) {
- return (l1.format == format && l2.format != format) ||
- (l2.format == format && l1.format != format);
- };
-
- if (check_format(format::b_fs_yx_fsv2) ||
- check_format(format::b_fs_yx_fsv4) ||
- check_format(format::fs_b_yx_fsv32) ||
- check_format(format::b_fs_yx_fsv16) ||
- check_format(format::b_fs_yx_fsv32) ||
- check_format(format::b_fs_zyx_fsv2) ||
- check_format(format::b_fs_zyx_fsv4) ||
- check_format(format::b_fs_zyx_fsv32) ||
- check_format(format::b_fs_zyx_fsv16) ||
- check_format(format::bs_fs_yx_bsv4_fsv4) ||
- check_format(format::bs_fs_yx_bsv8_fsv4) ||
- check_format(format::bs_fs_zyx_bsv8_fsv4) ||
- check_format(format::bs_fs_yx_bsv8_fsv2) ||
- check_format(format::bs_fs_zyx_bsv8_fsv2) ||
- check_format(format::bs_fs_yx_bsv4_fsv2) ||
- check_format(format::bs_fs_yx_bsv32_fsv16) ||
- check_format(format::bs_fs_yx_bsv32_fsv32) ||
- check_format(format::bs_fs_yx_bsv16_fsv16) ||
- check_format(format::bs_fs_yx_bsv16_fsv32) ||
- check_format(format::bs_fs_zyx_bsv16_fsv32) ||
- check_format(format::bs_fs_zyx_bsv16_fsv16) ||
- check_format(format::bs_fs_zyx_bsv32_fsv16) ||
- check_format(format::bs_fs_zyx_bsv32_fsv32))
- return {false, false};
-
- auto l1_pitch = l1.get_pitches();
- auto l2_pitch = l2.get_pitches();
-
- // ignore pitches which will never be used (for dims with size == 1)
- for (size_t i = 0; i < tensor_dim_max; ++i)
- if (l1_size.raw[i] == 1)
- l1_pitch.raw[i] = 0;
- for (size_t i = 0; i < tensor_dim_max; ++i)
- if (l2_size.raw[i] == 1)
- l2_pitch.raw[i] = 0;
-
- auto l1_offset = l1.get_linear_offset();
- auto l2_offset = l2.get_linear_offset();
- if (l1_pitch == l2_pitch && l1_offset == l2_offset)
- return {false, true};
-
- return {false, false};
-}
 std::vector convert_dimensions(const std::vector& sizes, std::string in_order, std::string out_order) {
 std::vector new_sizes(out_order.size(), {-1});
@@ -497,12 +417,100 @@ layout layout::with_padding(padding const& padd) const {
 return ret;
 }
+// tells whether l1 and l2 can be reinterpreted to each other without the need for reordering
+// note: layouts can only be considered compatible if data size described by both layouts match (so no data are generated
+// nor dropped) note: if layouts describe two buffers with different size, consider them not to be compatible even if
+// smaller buffer can be considered to hold subsequence of larger buffer,
+// this behavior is required to force buffer allocation for smaller buffer which, currently, should always be
+// performed
 bool layout::compatible(const layout& other) const {
+ auto& l1 = *this;
+ auto& l2 = other;
+ const auto& l1_pad = l1.data_padding;
+ const auto& l2_pad = l2.data_padding;
+
+ if (l1.is_dynamic() || l2.is_dynamic())
+ return false;
+
+ auto l1_size = l1.get_tensor();
+ auto l2_size = l2.get_tensor();
+ if (l1 == l2)
+ return true;
+ if (check_redundant_1d_along_feature(l1, l2))
+ return true;
+ if (l1.data_type != l2.data_type)
+ return false;
+ // Reorders between bfyx, bfzyx, bfwzyx can be reinterpreted as reshape when
+ // there is no padding and both hold same number of elements. 
+ if (format::is_default_format(l1.format) && format::is_default_format(l2.format) && + !l1_pad && !l2_pad && l1.get_linear_size() == l2.get_linear_size()) + return true; + if (l1_size != l2_size) + return false; + if (l1.get_linear_size() != l2.get_linear_size()) + return false; + + auto check_format = [&l1, &l2](cldnn::format format) { + return (l1.format == format && l2.format != format) || + (l2.format == format && l1.format != format); + }; + + const auto& blocks1 = format::block_sizes(l1.format); + const auto& blocks2 = format::block_sizes(l2.format); + + // TODO: Relax restrictions below + if (blocks1 != blocks2 || + (!blocks1.empty() && format::traits(l1.format)._order != format::traits(l2.format)._order)) + return false; + + if (check_format(format::b_fs_yx_fsv2) || + check_format(format::b_fs_yx_fsv4) || + check_format(format::fs_b_yx_fsv32) || + check_format(format::b_fs_yx_fsv16) || + check_format(format::b_fs_yx_fsv32) || + check_format(format::b_fs_zyx_fsv2) || + check_format(format::b_fs_zyx_fsv4) || + check_format(format::b_fs_zyx_fsv32) || + check_format(format::b_fs_zyx_fsv16) || + check_format(format::bs_fs_yx_bsv4_fsv4) || + check_format(format::bs_fs_yx_bsv8_fsv4) || + check_format(format::bs_fs_zyx_bsv8_fsv4) || + check_format(format::bs_fs_yx_bsv8_fsv2) || + check_format(format::bs_fs_zyx_bsv8_fsv2) || + check_format(format::bs_fs_yx_bsv4_fsv2) || + check_format(format::bs_fs_yx_bsv32_fsv16) || + check_format(format::bs_fs_yx_bsv32_fsv32) || + check_format(format::bs_fs_yx_bsv16_fsv16) || + check_format(format::bs_fs_yx_bsv16_fsv32) || + check_format(format::bs_fs_zyx_bsv16_fsv32) || + check_format(format::bs_fs_zyx_bsv16_fsv16) || + check_format(format::bs_fs_zyx_bsv32_fsv16) || + check_format(format::bs_fs_zyx_bsv32_fsv32)) + return false; + + auto l1_pitch = l1.get_pitches(); + auto l2_pitch = l2.get_pitches(); + + // ignore pitches which will never be used (for dims with size == 1) + for (size_t i = 0; i < tensor_dim_max; ++i) + if (l1_size.raw[i] == 1) + l1_pitch.raw[i] = 0; + for (size_t i = 0; i < tensor_dim_max; ++i) + if (l2_size.raw[i] == 1) + l2_pitch.raw[i] = 0; + + auto l1_offset = l1.get_linear_offset(); + auto l2_offset = l2.get_linear_offset(); + if (l1_pitch == l2_pitch && l1_offset == l2_offset) + return true; + + return false; } bool layout::identical(const layout& other) const { - return are_layouts_identical(*this, other).first; + if (is_dynamic() || other.is_dynamic()) + return false; + return *this == other; } ov::PartialShape layout::transform(const ov::PartialShape& pshape, cldnn::format old_fmt, cldnn::format new_fmt) { diff --git a/src/plugins/intel_gpu/tests/module_tests/layout_test.cpp b/src/plugins/intel_gpu/tests/module_tests/layout_test.cpp index 4e110514f11..4cd52dbdb28 100644 --- a/src/plugins/intel_gpu/tests/module_tests/layout_test.cpp +++ b/src/plugins/intel_gpu/tests/module_tests/layout_test.cpp @@ -196,8 +196,8 @@ class layout_cmp_test : public testing::TestWithParam { TEST_P(layout_cmp_test, basic) { auto p = GetParam(); - EXPECT_EQ(p.l1.identical(p.l2), p.is_identical); - EXPECT_EQ(p.l1.compatible(p.l2), p.is_compatible); + EXPECT_EQ(p.l1.identical(p.l2), p.is_identical) << p.l1.to_short_string() << " -> " << p.l2.to_short_string(); + EXPECT_EQ(p.l1.compatible(p.l2), p.is_compatible) << p.l1.to_short_string() << " -> " << p.l2.to_short_string(); } INSTANTIATE_TEST_SUITE_P(smoke, layout_cmp_test, @@ -209,11 +209,35 @@ INSTANTIATE_TEST_SUITE_P(smoke, layout_cmp_test, {layout{ov::PartialShape{1, 2, 3, 4}, data_types::f32, 
format::bfyx}, layout{ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx}, false, false}, {layout{ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx}, - layout{ov::PartialShape{1, 2, 3, 4, 1}, data_types::f16, format::bfzyx}, false, true}, + layout{ov::PartialShape{1, 2, 1, 3, 4}, data_types::f16, format::bfzyx}, false, true}, {layout{ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx}, layout{ov::PartialShape{1, 2, 3, 4, 1, 1}, data_types::f16, format::bfwzyx}, false, true}, + {layout{ov::PartialShape{1, 2, 3, 4, 1, 1}, data_types::f16, format::bfwzyx}, + layout{ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx}, false, true}, + {layout{ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx}, + layout{ov::PartialShape{1, 2, 1, 1, 3, 4}, data_types::f16, format::bfwzyx}, false, true}, {layout{ov::PartialShape{1, 32, 4, 4}, data_types::f32, format::b_fs_yx_fsv32, padding({0, 0, 1, 1}, 0)}, layout{ov::PartialShape{1, 32, 4, 4}, data_types::f32, format::b_fs_yx_fsv32, padding({0, 0, 0, 0}, 0)}, false, false}, {layout{ov::PartialShape{1, 32, 4, 4}, data_types::f32, format::b_fs_yx_fsv32, padding({0, 0, 1, 1}, 0)}, layout{ov::PartialShape{1, 32, 4, 4}, data_types::f32, format::b_fs_yx_fsv32, padding({0, 0, 1, 1}, 0)}, true, true}, + {layout{ov::PartialShape{10, 20}, data_types::f16, format::bfyx}, + layout{ov::PartialShape{10, 20}, data_types::f16, format::os_iyx_osv16}, false, false}, + {layout{ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx}, + layout{ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::oiyx}, false, true}, + {layout{ov::PartialShape{128, 10}, data_types::f16, format::bfyx}, + layout{ov::PartialShape{128, 10}, data_types::f16, format::os_iyx_osv32}, false, false}, + {layout{ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::bfyx}, + layout{ov::PartialShape{1, 2, 3, 4}, data_types::f16, format::yxfb}, false, false}, + {layout{ov::PartialShape{1, 2, 1, 1}, data_types::f16, format::bfyx}, + layout{ov::PartialShape{1, 2, 1, 1}, data_types::f16, format::b_fs_yx_fsv16}, false, false}, + {layout{ov::PartialShape{1, 2, 1, 1, 1}, data_types::f16, format::b_fs_zyx_fsv16}, + layout{ov::PartialShape{1, 2, 1, 1}, data_types::f16, format::b_fs_yx_fsv16}, false, false}, + {layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::os_is_zyx_isv16_osv16}, + layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::is_os_zyx_isv16_osv16}, false, false}, + {layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::g_os_yx_is_osv8_isv2}, + layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::g_os_y_is_x_osv8_isv2}, false, false}, + {layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::goiyx}, + layout{ov::PartialShape{4, 2, 3, 4, 5}, data_types::f16, format::gioyx}, false, false}, + {layout{ov::PartialShape{9, 17, 3, 2, 5}, data_types::f16, format::is_os_zyx_isa8_osv8_isv2}, + layout{ov::PartialShape{9, 17, 3, 2, 5}, data_types::f16, format::os_is_zyx_isa8_osv8_isv2}, false, false}, })); diff --git a/src/plugins/intel_gpu/tests/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/passes/prepare_buffer_fusing_test.cpp index 0a5358d19fd..420b49cd8ee 100644 --- a/src/plugins/intel_gpu/tests/passes/prepare_buffer_fusing_test.cpp +++ b/src/plugins/intel_gpu/tests/passes/prepare_buffer_fusing_test.cpp @@ -85,7 +85,7 @@ TEST(prepare_buffer_fusing, static_node_after_optimized_out_dyn_reshape) { program_wrapper::apply_opt_pass(*prog); program_wrapper::apply_opt_pass(*prog); 
ASSERT_NO_THROW(prog->get_node("reshape")); - ASSERT_FALSE(prog->get_node("reshape").can_be_optimized()); + ASSERT_TRUE(prog->get_node("reshape").can_be_optimized()); program_wrapper::apply_opt_pass(*prog); ASSERT_TRUE(has_node_with_type(*prog)); diff --git a/src/plugins/intel_gpu/tests/test_cases/reshape_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/reshape_gpu_test.cpp index acb659a11a3..e3cdc480c9f 100644 --- a/src/plugins/intel_gpu/tests/test_cases/reshape_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/reshape_gpu_test.cpp @@ -11,6 +11,8 @@ #include #include +#include "reshape_inst.h" + using namespace cldnn; using namespace ::tests; using namespace testing; @@ -942,6 +944,54 @@ TEST(reshape_gpu_f32, basic_runtime_dynamic_shape_with_const_optimized_out) { } } +TEST(reshape_gpu_f32, basic_dynamic_shape_to_static_optimized_out) { + auto& engine = get_test_engine(); + + auto input = engine.allocate_memory(layout{ov::PartialShape{2, 10}, data_types::f32, format::bfyx}); + topology topology; + topology.add(input_layout("input", layout{ov::PartialShape::dynamic(2), data_types::f32, format::bfyx})); + topology.add(reshape("reshape", input_info("input"), false, {2, 10}, {2, 10})); + topology.add(reduce("reduce", input_info("reshape"), reduce_mode::max, {1}, true)); + + // clang-format off + std::vector input_data = { + 0.0, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, + 0.0, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, + }; + // clang-format on + + set_values(input, input_data); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + network network(engine, topology, config); + network.set_input_data("input", input); + auto outputs = network.execute(); + + ASSERT_TRUE(network.get_primitive("reshape")->can_be_optimized()); + + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "reduce"); + + auto output = outputs.at("reduce").get_memory(); + + ASSERT_EQ(output->get_layout().data_type, input->get_layout().data_type); + ASSERT_EQ(output->get_layout().format, format::bfyx); + ASSERT_TRUE(output->get_layout().is_static()); + ov::PartialShape expected_shape = {2, 1}; + ASSERT_EQ(output->get_layout().get_partial_shape(), expected_shape); + + cldnn::mem_lock output_ptr(output, get_test_stream()); + std::vector expected_res = {9.f, 9.f}; + ASSERT_EQ(output_ptr.size(), expected_res.size()); + + + for (size_t i = 0; i < expected_res.size(); i++) { + ASSERT_EQ(expected_res[i], output_ptr[i]); + } +} + #ifdef RUN_ALL_MODEL_CACHING_TESTS TEST(reshape_gpu_f32, basic_2dim_in_place_cached) { generic_reshape_test(
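
As a reading aid for the main idea of this patch (reuse the original weights buffer via a zero-copy reinterpret when the expected layout describes the same bytes, and fall back to a reorder kernel plus a new allocation only otherwise), here is a minimal standalone sketch. SimpleLayout, Buffer, reinterpret_buffer and reorder_weights are simplified, hypothetical stand-ins for the real cldnn::layout / cldnn::memory / engine API, not code from this repository.

// Sketch only: illustrates the reinterpret-vs-reorder decision added to update_weights().
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct SimpleLayout {
    std::vector<size_t> dims;   // logical shape
    size_t elem_size = 4;       // bytes per element (e.g. f32)

    size_t bytes_count() const {
        size_t n = elem_size;
        for (auto d : dims) n *= d;
        return n;
    }
    // "compatible" here is reduced to: same total byte footprint, so the data can be
    // viewed through the new layout without moving it (a simplification of layout::compatible()).
    bool compatible(const SimpleLayout& other) const {
        return bytes_count() == other.bytes_count();
    }
};

struct Buffer {
    SimpleLayout layout;
    std::shared_ptr<std::vector<char>> data;  // shared storage so reinterpret is zero-copy
};

Buffer reinterpret_buffer(const Buffer& src, const SimpleLayout& new_layout) {
    return Buffer{new_layout, src.data};      // same bytes, new view
}

Buffer reorder_weights(const Buffer& src, const SimpleLayout& new_layout) {
    // Placeholder for the expensive path: new allocation + reorder kernel launch.
    return Buffer{new_layout, std::make_shared<std::vector<char>>(new_layout.bytes_count())};
}

Buffer update_weights(const Buffer& original, const SimpleLayout& expected) {
    if (original.layout.compatible(expected)) {
        std::cout << "reinterpret (zero-copy)\n";
        return reinterpret_buffer(original, expected);
    }
    std::cout << "reorder (kernel launch + new allocation)\n";
    return reorder_weights(original, expected);
}

int main() {
    Buffer w{{{2, 10}, 4}, std::make_shared<std::vector<char>>(2 * 10 * 4)};
    update_weights(w, SimpleLayout{{2, 10, 1, 1}, 4});  // same byte footprint -> reinterpret
    update_weights(w, SimpleLayout{{10, 20}, 4});       // different size -> reorder
}

The real update_weights() additionally consults the _reordered_weights_cache LRU of already-reordered buffers before taking either path, as the diff above shows.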