From c70f0ca45d04f9a3b7a0a9ce4999afc2b43ecc06 Mon Sep 17 00:00:00 2001
From: Wilson Seok
Date: Wed, 25 Oct 2023 16:09:11 +0900
Subject: [PATCH] [GPU] skip excessive mem alloc request in build (#20399)

* skip excessive mem alloc request in build

* update mem check function

* fix os behavior

* update mem size check location

* only dynamic shape case takes check_allocatable

* update check condition
---
 .../include/intel_gpu/runtime/engine.hpp      |   2 +
 .../intel_gpu/src/graph/primitive_inst.cpp    |   7 +-
 .../intel_gpu/src/runtime/ocl/ocl_engine.cpp  |  34 +--
 .../intel_gpu/src/runtime/ocl/ocl_engine.hpp  |   2 +-
 .../gpu_dyn_huge_input_range.cpp              | 235 ++++++++++++++++++
 5 files changed, 263 insertions(+), 17 deletions(-)
 create mode 100644 src/plugins/intel_gpu/tests/functional/dynamic_tests/gpu_dyn_huge_input_range.cpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
index ee8d10bb580..6b9195097b3 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -79,6 +79,8 @@ public:
     /// Checks whether two memory objects represents the same physical memory
     virtual bool is_the_same_buffer(const memory& mem1, const memory& mem2) = 0;
 
+    virtual bool check_allocatable(const layout& layout, allocation_type type) = 0;
+
     /// Returns basic allocation type which will be used as a fallback when allocation type is not specified or device doesn't support some features.
     virtual allocation_type get_default_allocation_type() const = 0;
 
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 92f9f60743b..a81d0bd10ad 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -991,7 +991,6 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
     , _outputs({memory::ptr()})
     , _reordered_weights_cache(network.get_weights_cache_capacity())
     , _output_changed(false)
-    , _mem_allocated(allocate_memory)
     , _is_dynamic(node.is_dynamic() || node.generates_dynamic_output())
     , _type(node.type())
     , _id(node.id())
@@ -1006,6 +1005,12 @@
     , _can_share_buffer(node.can_share_buffer())
     , _is_constant(node.is_constant())
    , _needs_completion_event(is_any_user_cpu(node.get_users()) || node.is_output()) {
+    // When a dynamic-shape node's upper-bound memory size exceeds the system's maximum allocatable memory size, do not allocate its output at build time.
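+    // Such a node defers its output allocation until execution, when the actual shape is known;
+    // usm_host is used as the reference allocation type for this build-time size check.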
+    auto output_layout = node.get_output_layout();
+    if (allocate_memory && node.is_dynamic() && (!network.get_engine().check_allocatable(output_layout, allocation_type::usm_host))) {
+        allocate_memory = false;
+    }
+    _mem_allocated = allocate_memory;
     if (allocate_memory) {
         // In case when output is mutable_data primitive, and other users dependencies are only used for
         // synchronization, the output memory of such primitive will be fused with mutable_data
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
index 325bb3aa581..9e4bbd9aa6b 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -125,29 +125,33 @@ allocation_type ocl_engine::detect_usm_allocation_type(const void* memory) const
                 : allocation_type::unknown;
 }
 
-bool ocl_engine::check_allocatable(const layout& layout, allocation_type type) const {
+bool ocl_engine::check_allocatable(const layout& layout, allocation_type type) {
     OPENVINO_ASSERT(supports_allocation(type) || type == allocation_type::cl_mem, "[GPU] Unsupported allocation type: ", type);
 
-    auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host);
-#ifdef __unix__
-    // Prevent from being killed by Ooo Killer of Linux
-    OPENVINO_ASSERT(layout.bytes_count() + used_mem <= get_max_memory_size(),
-                    "[GPU] Exceeded max size of memory allocation: ",
-                    "Required ", layout.bytes_count(), " bytes, already occupied : ", used_mem, " bytes, ",
-                    "but available memory size is ", get_max_memory_size(), " bytes");
-#else
-    if (layout.bytes_count() + used_mem > get_max_memory_size()) {
-        GPU_DEBUG_COUT << "[Warning] [GPU] Exceeded max size of memory allocation: " << "Required " << layout.bytes_count() << " bytes, already occupied : "
-                       << used_mem << " bytes, but available memory size is " << get_max_memory_size() << " bytes" << std::endl;
-        GPU_DEBUG_COUT << "Please note that performance might drop due to memory swap." << std::endl;
+    auto alloc_mem_size = layout.bytes_count();
+    auto max_mem_size = get_device_info().max_alloc_mem_size;
+    if (alloc_mem_size > max_mem_size) {
+        auto used_mem = get_used_device_memory(allocation_type::usm_device) + get_used_device_memory(allocation_type::usm_host);
+        GPU_DEBUG_LOG << "[GPU] Mem size info: Required " << alloc_mem_size << " bytes, already occupied: "
+                      << used_mem << " bytes, available memory size is " << get_max_memory_size() << " bytes, but max allocatable memory size is "
+                      << max_mem_size << " bytes." << std::endl;
+        return false;
     }
-#endif
+
     return true;
 }
 
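+// check_allocatable() only reports whether a single buffer of the given layout fits the device's
+// max allocation size; the policy for an oversized request is left to the caller (primitive_inst
+// defers the build-time allocation, while allocate_memory() below either errors out or warns).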
 memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) {
     OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout");
 
-    check_allocatable(layout, type);
+    bool allocatable = check_allocatable(layout, type);
+    if (!allocatable) {
+#ifdef __unix__
+        OPENVINO_ASSERT(allocatable, "[GPU] Exceeded max size of memory allocation, check the debug message for size info");
+#else
+        GPU_DEBUG_COUT << "[Warning][GPU] Performance might drop due to memory swap because the requested allocation exceeds the max allocatable memory size."
+                       << std::endl;
+#endif
+    }
+
     try {
         memory::ptr res = nullptr;
         if (layout.format.is_image_2d()) {
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
index 6d414139651..ee76fcca82a 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp
@@ -28,7 +28,7 @@ public:
     memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override;
     memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override;
     bool is_the_same_buffer(const memory& mem1, const memory& mem2) override;
-    bool check_allocatable(const layout& layout, allocation_type type) const;
+    bool check_allocatable(const layout& layout, allocation_type type) override;
 
     void* get_user_context() const override;
 
diff --git a/src/plugins/intel_gpu/tests/functional/dynamic_tests/gpu_dyn_huge_input_range.cpp b/src/plugins/intel_gpu/tests/functional/dynamic_tests/gpu_dyn_huge_input_range.cpp
new file mode 100644
index 00000000000..62eb867df97
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/functional/dynamic_tests/gpu_dyn_huge_input_range.cpp
@@ -0,0 +1,235 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "shared_test_classes/single_layer/strided_slice.hpp"
+#include "shared_test_classes/single_layer/shape_of.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "ov_models/builders.hpp"
+#include "common_test_utils/test_constants.hpp"
+#include "common_test_utils/ov_tensor_utils.hpp"
+
+using namespace InferenceEngine;
+using namespace ov::test;
+
+namespace GPULayerTestsDefinitions {
+
+struct StridedSliceParams {
+    std::vector<int64_t> begin;
+    std::vector<int64_t> end;
+    std::vector<int64_t> stride;
+    std::vector<int64_t> beginMask;
+    std::vector<int64_t> endMask;
+    std::vector<int64_t> newAxisMask;
+    std::vector<int64_t> shrinkAxisMask;
+    std::vector<int64_t> ellipsisAxisMask;
+};
+
+typedef std::tuple<
+        InputShape,                                   // Input shapes
+        StridedSliceParams,
+        ElementType,                                  // Element type
+        std::vector<ngraph::helpers::InputLayerType>, // begin/end/stride input type
+        std::map<std::string, std::string>            // Additional network configuration
+> StridedSliceLayerParamSet;
+
+class DynamicShapeHugeRangeGPUTest : public testing::WithParamInterface<StridedSliceLayerParamSet>,
+                                     virtual public SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<StridedSliceLayerParamSet>& obj) {
+        InputShape shapes;
+        StridedSliceParams params;
+        ElementType elementType;
+        std::vector<ngraph::helpers::InputLayerType> restInputType;
+        TargetDevice targetDevice;
+        std::map<std::string, std::string> additionalConfig;
+        std::tie(shapes, params, elementType, restInputType, additionalConfig) = obj.param;
+
+        std::ostringstream results;
+        results << "IS=" << ov::test::utils::partialShape2str({shapes.first}) << "_";
+        results << "TS=";
+        for (const auto& item : shapes.second) {
+            results << ov::test::utils::vec2str(item) << "_";
+        }
+        results << "netPRC=" << elementType << "_";
+        results << "begin=" << ov::test::utils::vec2str(params.begin) << "_";
+        results << "end=" << ov::test::utils::vec2str(params.end) << "_";
+        results << "stride=" << ov::test::utils::vec2str(params.stride) << "_";
+        results << "begin_m=" << ov::test::utils::vec2str(params.beginMask) << "_";
+        results << "end_m=" << ov::test::utils::vec2str(params.endMask) << "_";
+        results << "new_axis_m=" << (params.newAxisMask.empty() ? "def" : ov::test::utils::vec2str(params.newAxisMask)) << "_";
+        results << "shrink_m=" << (params.shrinkAxisMask.empty() ? "def" : ov::test::utils::vec2str(params.shrinkAxisMask)) << "_";
+        results << "ellipsis_m=" << (params.ellipsisAxisMask.empty() ? "def" : ov::test::utils::vec2str(params.ellipsisAxisMask)) << "_";
+        results << "beginType=" << restInputType[0] << "_";
+        results << "endType=" << restInputType[1] << "_";
+        results << "strideType=" << restInputType[2] << "_";
+        results << "config=(";
+        for (const auto& configEntry : additionalConfig) {
+            results << configEntry.first << ", " << configEntry.second << ":";
+        }
+        results << ")";
+
+        return results.str();
+    }
+
+    void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override {
+        inputs.clear();
+        const auto& funcInputs = function->inputs();
+        ov::Tensor tensor;
+
+        // input0: data
+        int32_t idx = 0;
+        tensor = ov::test::utils::create_and_fill_tensor(funcInputs[idx].get_element_type(), targetInputStaticShapes[idx]);
+        inputs.insert({funcInputs[idx].get_node_shared_ptr(), tensor});
+
+        // input1: begin
+        if (restInputType[0] == ngraph::helpers::InputLayerType::PARAMETER) {
+            idx += 1;
+            tensor = ov::Tensor(funcInputs[idx].get_element_type(), targetInputStaticShapes[idx]);
+            auto *dataPtr = tensor.data<int64_t>();
+            for (size_t i = 0; i < begin.size(); i++) {
+                dataPtr[i] = static_cast<int64_t>(begin[i]);
+            }
+            inputs.insert({funcInputs[idx].get_node_shared_ptr(), tensor});
+        }
+
+        // input2: end
+        if (restInputType[1] == ngraph::helpers::InputLayerType::PARAMETER) {
+            idx += 1;
+            tensor = ov::Tensor(funcInputs[idx].get_element_type(), targetInputStaticShapes[idx]);
+            auto *dataPtr = tensor.data<int64_t>();
+            for (size_t i = 0; i < end.size(); i++) {
+                dataPtr[i] = static_cast<int64_t>(end[i]);
+            }
+            inputs.insert({funcInputs[idx].get_node_shared_ptr(), tensor});
+        }
+
+        // input3: stride
+        if (restInputType[2] == ngraph::helpers::InputLayerType::PARAMETER) {
+            idx += 1;
+            tensor = ov::Tensor(funcInputs[idx].get_element_type(), targetInputStaticShapes[idx]);
+            auto *dataPtr = tensor.data<int64_t>();
+            for (size_t i = 0; i < stride.size(); i++) {
+                dataPtr[i] = static_cast<int64_t>(stride[i]);
+            }
+            inputs.insert({funcInputs[idx].get_node_shared_ptr(), tensor});
+        }
+
+        inferRequestNum++;
+    }
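+
+    // Note: besides the data input, only begin/end/stride inputs declared as PARAMETER receive
+    // runtime tensors here; CONSTANT variants are baked into the graph as Constants in SetUp().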
"def" : ov::test::utils::vec2str(params.shrinkAxisMask)) << "_"; + results << "ellipsis_m=" << (params.ellipsisAxisMask.empty() ? "def" : ov::test::utils::vec2str(params.ellipsisAxisMask)) << "_"; + results << "beginType=" << restInputType[0] << "_"; + results << "endType=" << restInputType[1] << "_"; + results << "strideType=" << restInputType[2] << "_"; + results << "config=("; + for (const auto& configEntry : additionalConfig) { + results << configEntry.first << ", " << configEntry.second << ":"; + } + results << ")"; + + return results.str(); + } + + void generate_inputs(const std::vector& targetInputStaticShapes) override { + inputs.clear(); + const auto& funcInputs = function->inputs(); + ov::Tensor tensor; + + // input0: data + int32_t idx = 0; + tensor = ov::test::utils::create_and_fill_tensor(funcInputs[idx].get_element_type(), targetInputStaticShapes[idx]); + inputs.insert({funcInputs[idx].get_node_shared_ptr(), tensor}); + + // input1: begin + if (restInputType[0] == ngraph::helpers::InputLayerType::PARAMETER) { + idx += 1; + tensor = ov::Tensor(funcInputs[idx].get_element_type(), targetInputStaticShapes[idx]); + auto *dataPtr = tensor.data(); + for (size_t i = 0; i < begin.size(); i++) { + dataPtr[i] = static_cast(begin[i]); + } + inputs.insert({funcInputs[idx].get_node_shared_ptr(), tensor}); + } + + // input2: end + if (restInputType[1] == ngraph::helpers::InputLayerType::PARAMETER) { + idx += 1; + tensor = ov::Tensor(funcInputs[idx].get_element_type(), targetInputStaticShapes[idx]); + auto *dataPtr = tensor.data(); + for (size_t i = 0; i < end.size(); i++) { + dataPtr[i] = static_cast(end[i]); + } + inputs.insert({funcInputs[idx].get_node_shared_ptr(), tensor}); + } + + // input3: stride + if (restInputType[2] == ngraph::helpers::InputLayerType::PARAMETER) { + idx += 1; + tensor = ov::Tensor(funcInputs[idx].get_element_type(), targetInputStaticShapes[idx]); + auto *dataPtr = tensor.data(); + for (size_t i = 0; i < stride.size(); i++) { + dataPtr[i] = static_cast(stride[i]); + } + inputs.insert({funcInputs[idx].get_node_shared_ptr(), tensor}); + } + + inferRequestNum++; + } + +protected: + std::vector begin; + std::vector end; + std::vector stride; + std::vector restInputType; + size_t inferRequestNum = 0; + + void SetUp() override { + InputShape shapes; + StridedSliceParams ssParams; + std::map additionalConfig; + std::tie(shapes, ssParams, inType, restInputType, additionalConfig) = this->GetParam(); + + begin = ssParams.begin; + end = ssParams.end; + stride = ssParams.stride; + + targetDevice = ov::test::utils::DEVICE_GPU; + + std::vector inputShapes; + inputShapes.push_back(shapes); + if (restInputType[0] == ngraph::helpers::InputLayerType::PARAMETER) + inputShapes.push_back(InputShape({static_cast(begin.size())}, std::vector(shapes.second.size(), {begin.size()}))); + if (restInputType[1] == ngraph::helpers::InputLayerType::PARAMETER) + inputShapes.push_back(InputShape({static_cast(end.size())}, std::vector(shapes.second.size(), {end.size()}))); + if (restInputType[2] == ngraph::helpers::InputLayerType::PARAMETER) + inputShapes.push_back(InputShape({static_cast(stride.size())}, std::vector(shapes.second.size(), {stride.size()}))); + + init_input_shapes(inputShapes); + + ov::ParameterVector params{std::make_shared(inType, inputDynamicShapes.front())}; + + std::shared_ptr beginInput, endInput, strideInput; + if (restInputType[0] == ngraph::helpers::InputLayerType::PARAMETER) { + auto beginNode = std::make_shared(ngraph::element::Type_t::i64, ov::Shape{begin.size()}); + 
+
+const std::vector<StridedSliceParams> paramsPlain2D_excessive_upper_boundary = {
+    StridedSliceParams{ { 0, 1 }, { 0, 2147483647 }, { 1, 1 }, { 1, 0 }, { 1, 0 }, { }, { }, { } },
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_Dynamic_2D_excessive_upper_boundary, DynamicShapeHugeRangeGPUTest,
+                         ::testing::Combine(
+                             ::testing::ValuesIn(inputShapesDynamic2D_excessive_upper_boundary),
+                             ::testing::ValuesIn(paramsPlain2D_excessive_upper_boundary),
+                             ::testing::ValuesIn(inputPrecisions),
+                             ::testing::Values(restInputTypes[0]),
+                             ::testing::Values(emptyAdditionalConfig)),
+                         DynamicShapeHugeRangeGPUTest::getTestCaseName);
+}  // namespace
+}  // namespace GPULayerTestsDefinitions