diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/arg_max_min.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/arg_max_min.hpp index 4d438d39391..f0b348463ef 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/arg_max_min.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/arg_max_min.hpp @@ -43,6 +43,25 @@ struct arg_max_min : public primitive_base { sort(sort), values_first(values_first) {} + /// @brief Constructs arg_max_min for top_k parameter + arg_max_min(const primitive_id& id, + const input_info& input, + const input_info& topk_id, + ov::op::TopKMode mode, + uint32_t top_k, + int64_t axis, + ov::op::TopKSortType sort = ov::op::TopKSortType::SORT_VALUES, + bool values_first = false, + const padding& output_padding = padding(), + data_types output_data_type = data_types::f32, + const size_t num_outputs = 1) + : primitive_base(id, {input, topk_id}, {output_padding}, {optional_data_type{output_data_type}}, num_outputs), + mode(mode), + top_k(top_k), + axis(axis), + sort(sort), + values_first(values_first) {} + /// @brief Type of output - max or min. ov::op::TopKMode mode; /// @brief Number of indices to output. diff --git a/src/plugins/intel_gpu/src/graph/arg_max_min.cpp b/src/plugins/intel_gpu/src/graph/arg_max_min.cpp index bae8df2fdc8..ca79e4dee67 100644 --- a/src/plugins/intel_gpu/src/graph/arg_max_min.cpp +++ b/src/plugins/intel_gpu/src/graph/arg_max_min.cpp @@ -75,7 +75,8 @@ std::vector arg_max_min_inst::calc_output_layouts(arg_max_min_node const auto input_layout = impl_param.get_input_layout(); ov::op::v1::TopK op; - op.set_axis(input_layout.get().rank(), desc->axis); + auto input_rank = input_layout.get().rank(); + op.set_axis(input_rank, desc->axis); op.set_mode(desc->mode); op.set_sort_type(desc->sort); @@ -85,13 +86,24 @@ std::vector arg_max_min_inst::calc_output_layouts(arg_max_min_node const ShapeType{} }; - int64_t top_k = desc->top_k; + auto& constant_mem = impl_param.memory_deps; + if (desc->top_k > 0) { + std::map const_data; + auto topk = desc->top_k; + auto top_k_tensor = std::make_shared(ov::element::u32, ov::Shape{1}, static_cast(&topk)); + const_data = { {1, top_k_tensor} }; - auto top_k_tensor = std::make_shared(ov::element::i64, ov::Shape{1}, static_cast(&top_k)); - std::map> const_data = { - {1, top_k_tensor} - }; - ov::op::v1::shape_infer(&op, input_shapes, output_shapes, const_data); + ov::op::v1::shape_infer(&op, input_shapes, output_shapes, const_data); + } else if (constant_mem.count(1)) { + std::map const_data; + auto target_shape_mem = constant_mem.at(1); + cldnn::mem_lock target_shape_lock(target_shape_mem, impl_param.prog->get_stream()); + const_data.emplace(1, make_host_tensor(target_shape_mem->get_layout(), target_shape_lock.data())); + + ov::op::v1::shape_infer(&op, input_shapes, output_shapes, const_data); + } else { + output_shapes[0] = output_shapes[1] = ShapeType::dynamic(input_layout.get().size()); + } for (size_t i = 0; i < desc->num_outputs; ++i) { auto dt = desc->output_data_types[i].value_or(input_layout.data_type); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp index b9922e59646..37376772be1 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp @@ -52,29 +52,41 @@ protected: kernel_arguments_data args = parent::get_arguments(instance); if (instance.node->has_second_output()) { - args.inputs.erase(args.inputs.begin() + 1); // erase constant input in case of TOP_K + if (args.inputs.size() > 1) { + args.inputs.erase(args.inputs.begin() + 1); // erase constant input in case of TOP_K + } } return args; } public: - static std::unique_ptr create(const arg_max_min_node& arg, const kernel_impl_params& impl_param) { + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { const auto& primitive = impl_param.typed_desc(); const auto& axis = primitive->axis; const auto& top_k = primitive->top_k; const auto& mode = primitive->mode; const auto& sort_type = primitive->sort; const auto& values_first = primitive->values_first; - const auto& outputs_num = arg.get_output_nums(); // second output passed as input for TOP_K layer + const auto& outputs_num = (primitive->input_size() == 3 ? 2 : primitive->output_size()); auto argm_params = get_default_params(impl_param); auto argm_optional_params = get_default_optional_params(impl_param.get_program()); argm_params.outputs_num = outputs_num; - argm_params.topK = top_k; - argm_params.argMaxMinAxis = GetArgMaxMinAxis(axis, arg.get_output_layout().get_rank()); + argm_params.argMaxMinAxis = GetArgMaxMinAxis(axis, impl_param.get_output_layout().get_rank()); + + auto& constant_mem = impl_param.memory_deps; + if (constant_mem.count(1)) { + // The topK could be got by reading impl_param.memory_deps.at(1). + // However, here we utilize output_layout and axis information to minimize mem_lock. + auto output_layout = impl_param.get_output_layout(0); + auto out_dims = output_layout.get_dims(); + argm_params.topK = out_dims[axis]; + } else { + argm_params.topK = top_k; + } if (mode == ov::op::TopKMode::MAX) argm_params.argMaxMinOut = kernel_selector::argm_output::MAX; @@ -86,9 +98,9 @@ public: else argm_params.argMaxMinSortType = kernel_selector::argm_sort::INDEX; - if (arg.has_second_output()) { // for backward compatibility + if (outputs_num == 2) { // for backward compatibility argm_params.has_second_output = true; - if (arg.use_multiple_outputs()) { + if (primitive->input_size() != 3) { argm_params.use_multiple_outputs = true; argm_params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); } else { @@ -98,10 +110,7 @@ public: argm_params.values_first = values_first; - auto& kernel_selector = kernel_selector::arg_max_min_kernel_selector::Instance(); - auto best_kernel = kernel_selector.get_best_kernel(argm_params, argm_optional_params); - - return make_unique(best_kernel); + return {argm_params, argm_optional_params}; } }; @@ -119,7 +128,10 @@ attach_arg_max_min_impl::attach_arg_max_min_impl() { format::bfzyx}; - implementation_map::add(impl_types::ocl, arg_max_min_impl::create, types, formats); + implementation_map::add(impl_types::ocl, + typed_primitive_impl_ocl::create, + types, + formats); } } // namespace detail } // namespace ocl diff --git a/src/plugins/intel_gpu/src/plugin/ops/topk.cpp b/src/plugins/intel_gpu/src/plugin/ops/topk.cpp index 7128895f51e..13870b5f52a 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/topk.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/topk.cpp @@ -41,16 +41,19 @@ static void CreateTopKOp(Program& p, const std::shared_ptr } return output_data_types; }; + + auto topk_constant = std::dynamic_pointer_cast(op->input_value(1).get_node_shared_ptr()); auto argmaxPrim = cldnn::arg_max_min(layerName, - inputs, - mode, - top_k, - chosen_axis, - stype, - true, - cldnn::padding({0, 0, 0, 0}, 0), - cldnn::element_type_to_data_type(op->get_output_element_type(0)), - num_outputs); + inputs[0], + inputs[1], + mode, + (topk_constant ? top_k : 0), + chosen_axis, + stype, + true, + cldnn::padding({0, 0, 0, 0}, 0), + cldnn::element_type_to_data_type(op->get_output_element_type(0)), + num_outputs); argmaxPrim.output_paddings = get_output_paddings(); argmaxPrim.output_data_types = get_output_data_types(); p.add_primitive(*op, argmaxPrim); diff --git a/src/plugins/intel_gpu/tests/shape_infer/arg_max_min_si_test.cpp b/src/plugins/intel_gpu/tests/shape_infer/arg_max_min_si_test.cpp index b285a4063ff..778b7b8186c 100644 --- a/src/plugins/intel_gpu/tests/shape_infer/arg_max_min_si_test.cpp +++ b/src/plugins/intel_gpu/tests/shape_infer/arg_max_min_si_test.cpp @@ -129,4 +129,4 @@ INSTANTIATE_TEST_SUITE_P(smoke, arg_max_min_test, }, })); -} // shape_infer_tests +} // namespace shape_infer_tests diff --git a/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/top_k.cpp b/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/top_k.cpp index c7c6b7c245b..54c75bd99d1 100644 --- a/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/top_k.cpp +++ b/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/top_k.cpp @@ -19,16 +19,16 @@ using namespace ov::test; namespace GPULayerTestsDefinitions { typedef std::tuple< - int64_t, // keepK - int64_t, // axis - ngraph::opset4::TopK::Mode, // mode - ngraph::opset4::TopK::SortType, // sort - ElementType, // Net precision - ElementType, // Input precision - ElementType, // Output precision - InputShape, // inputShape - TargetDevice, // Device name - std::map // Additional network configuration + int64_t, // keepK + int64_t, // axis + ngraph::opset4::TopK::Mode, // mode + ngraph::opset4::TopK::SortType, // sort + ElementType, // Net precision + ElementType, // Input precision + ElementType, // Output precision + InputShape, // inputShape + TargetDevice, // Device name + ngraph::helpers::InputLayerType // Input type > TopKLayerTestParamsSet; class TopKLayerGPUTest : public testing::WithParamInterface, @@ -43,8 +43,8 @@ public: ElementType netPrecision, inPrc, outPrc; InputShape inputShape; TargetDevice targetDevice; - std::map additionalConfig; - std::tie(keepK, axis, mode, sort, netPrecision, inPrc, outPrc, inputShape, targetDevice, additionalConfig) = basicParamsSet; + ngraph::helpers::InputLayerType inputType; + std::tie(keepK, axis, mode, sort, netPrecision, inPrc, outPrc, inputShape, targetDevice, inputType) = basicParamsSet; std::ostringstream result; result << "k=" << keepK << "_"; @@ -58,11 +58,8 @@ public: for (const auto& shape : inputShape.second) { result << CommonTestUtils::vec2str(shape) << "_"; } - result << "config=("; - for (const auto& configEntry : additionalConfig) { - result << configEntry.first << ", " << configEntry.second << ":"; - } result << ")_"; + result << "inputType=" << inputType; result << "TargetDevice=" << targetDevice; return result.str(); @@ -77,16 +74,29 @@ protected: ngraph::opset4::TopK::SortType sort; ElementType inPrc, outPrc; InputShape inputShape; - std::map additionalConfig; - std::tie(keepK, axis, mode, sort, netPrecision, inPrc, outPrc, inputShape, targetDevice, additionalConfig) = basicParamsSet; + std::tie(keepK, axis, mode, sort, netPrecision, inPrc, outPrc, inputShape, targetDevice, inputType) = basicParamsSet; - init_input_shapes({inputShape}); + if (inputType == ngraph::helpers::InputLayerType::CONSTANT) { + init_input_shapes({inputShape}); + } else { + inputDynamicShapes = {inputShape.first, {}}; + for (size_t i = 0; i < inputShape.second.size(); ++i) { + targetStaticShapes.push_back({inputShape.second[i], {}}); + } + } auto params = ngraph::builder::makeDynamicParams(netPrecision, {inputDynamicShapes[0]}); std::shared_ptr topk; - auto k = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{}, &keepK); - topk = std::dynamic_pointer_cast(std::make_shared(params[0], k, axis, mode, sort)); + if (inputType == ngraph::helpers::InputLayerType::CONSTANT) { + auto k = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{}, &keepK); + topk = std::dynamic_pointer_cast(std::make_shared(params[0], k, axis, mode, sort)); + } else { + auto k = std::make_shared(ngraph::element::Type_t::i64, inputDynamicShapes[1]); + params.push_back(k); + topk = std::dynamic_pointer_cast( + std::make_shared(params[0], k, axis, mode, sort)); + } ngraph::ResultVector results; for (size_t i = 0; i < topk->get_output_size(); i++) { @@ -104,60 +114,41 @@ protected: tensor = ov::test::utils::create_and_fill_tensor(funcInputs[0].get_element_type(), shape); size_t size = tensor.get_size(); - if (netPrecision == ElementType::f32 || netPrecision == ElementType::i32) { + if (netPrecision == ElementType::f32) { std::vector data(size); - // For int32, deliberately set big numbers which are not accurately representable in fp32 - int start = netPrecision == ElementType::i32 ? pow(2, 30) + 1 : - static_cast(size / 2); + int start = - static_cast(size / 2); std::iota(data.begin(), data.end(), start); std::mt19937 gen(0); std::shuffle(data.begin(), data.end(), gen); - if (netPrecision == ElementType::f32) { - auto *rawBlobDataPtr = static_cast(tensor.data()); - for (size_t i = 0; i < size; ++i) { - rawBlobDataPtr[i] = static_cast(data[i]); - } - } else { - auto *rawBlobDataPtr = static_cast(tensor.data()); - for (size_t i = 0; i < size; ++i) { - rawBlobDataPtr[i] = static_cast(data[i]); - } - } - } else if (netPrecision == ElementType::bf16) { - size_t O = 1, A = 1, I = 1; - A = shape[axis]; - for (size_t i = 0; i < axis; i++) - O *= shape[i]; - for (size_t i = axis + 1; i < shape.size(); i++) - I *= shape[i]; - if (O * A * I != size) - FAIL() << "Incorrect blob shape " << shape; - - auto *rawBlobDataPtr = static_cast(tensor.data()); - for (size_t o = 0; o < O; o++) { - for (size_t i = 0; i < I; i++) { - std::vector data(A); - int start = - static_cast(A / 2); - std::iota(data.begin(), data.end(), start); - const size_t seed = (o + 1) * (i + 1); - std::mt19937 gen(seed); - std::shuffle(data.begin(), data.end(), gen); - for (size_t a = 0; a < A; a++) { - rawBlobDataPtr[o * A * I + a * I + i] = static_cast(data[a]); - } - } + auto *rawBlobDataPtr = static_cast(tensor.data()); + for (size_t i = 0; i < size; ++i) { + rawBlobDataPtr[i] = static_cast(data[i]); } } else { FAIL() << "generate_inputs for " << netPrecision << " precision isn't supported"; } inputs.insert({funcInputs[0].get_node_shared_ptr(), tensor}); + + if (inputType == ngraph::helpers::InputLayerType::PARAMETER) { + const auto& kPrecision = funcInputs[1].get_element_type(); + const auto& kShape = targetInputStaticShapes[1]; + + const size_t startFrom = 1; + const size_t range = targetInputStaticShapes[0][axis]; + const size_t seed = inferRequestNum++; + const auto kTensor = ov::test::utils::create_and_fill_tensor(kPrecision, kShape, range, startFrom, 1, seed); + + inputs.insert({funcInputs[1].get_node_shared_ptr(), kTensor}); + } } private: int64_t axis; + size_t inferRequestNum = 0; ElementType netPrecision; - bool staticShape; + ngraph::helpers::InputLayerType inputType; }; TEST_P(TopKLayerGPUTest, CompareWithRefs) { @@ -168,14 +159,12 @@ TEST_P(TopKLayerGPUTest, CompareWithRefs) { namespace { -std::map emptyAdditionalConfig; - const std::vector netPrecisions = { ElementType::f32, }; -const std::vector axes = {0, 1, 2, 3}; -const std::vector k = {1, 5, 7, 18, 21}; +const std::vector axes = {0, 3}; +const std::vector k = {3, 5, 7}; const std::vector modes = { ngraph::opset4::TopK::Mode::MIN, @@ -189,12 +178,12 @@ const std::vector sortTypes = { std::vector inputShapesDynamic = { { - {{21, {20, 25}, 21, {20, 25}}, {{21, 21, 21, 21}, {21, 22, 21, 23}}}, - {ov::PartialShape::dynamic(4), {{21, 21, 21, 21}, {21, 22, 21, 23}}} + {ov::PartialShape::dynamic(4), {{7, 7, 7, 7}, {7, 8, 7, 9}}}, + {{-1, -1, -1, -1}, {{8, 9, 10, 11}, {11, 7, 8, 9}}} } }; -INSTANTIATE_TEST_CASE_P(smoke_TopK_dynamic, TopKLayerGPUTest, +INSTANTIATE_TEST_CASE_P(smoke_TopK_constant_dynamic, TopKLayerGPUTest, ::testing::Combine( ::testing::ValuesIn(k), ::testing::ValuesIn(axes), @@ -205,7 +194,21 @@ INSTANTIATE_TEST_CASE_P(smoke_TopK_dynamic, TopKLayerGPUTest, ::testing::Values(ElementType::undefined), ::testing::ValuesIn(inputShapesDynamic), ::testing::Values(CommonTestUtils::DEVICE_GPU), - ::testing::Values(emptyAdditionalConfig)), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT)), + TopKLayerGPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_TopK_parameter_dynamic, TopKLayerGPUTest, + ::testing::Combine( + ::testing::Values(1), + ::testing::ValuesIn(axes), + ::testing::ValuesIn(modes), + ::testing::ValuesIn(sortTypes), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(ElementType::undefined), + ::testing::Values(ElementType::undefined), + ::testing::ValuesIn(inputShapesDynamic), + ::testing::Values(CommonTestUtils::DEVICE_GPU), + ::testing::Values(ngraph::helpers::InputLayerType::PARAMETER)), TopKLayerGPUTest::getTestCaseName); } // namespace