From 99cc3624b72b723aa7334f735ac8b325a9c1e668 Mon Sep 17 00:00:00 2001 From: Paul Youngsoo Ahn Date: Thu, 24 Aug 2023 09:57:18 +0900 Subject: [PATCH] [GPU] Fix accuracy issue (#19351) - [scatter_update] Use input index for input buffer instead of output index - [concat cpu impl] Keep input layout and mem_ptr in sync when creating input host tensors - Add unit tests for scatter_update and concat cpu impl --- .../intel_gpu/src/graph/impls/cpu/concat.cpp | 11 ++- .../cl_kernels/scatter_update_ref.cl | 6 +- .../test_cases/concatenation_gpu_test.cpp | 7 ++ .../test_cases/scatter_update_gpu_test.cpp | 98 +++++++++++++++++++ 4 files changed, 116 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp index 9254ef1bc2c..f96e79b4d12 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp @@ -69,17 +69,18 @@ struct concatenation_impl : public typed_primitive_impl { std::vector input_mem_ptrs; for (size_t i = 0; i < instance.dependencies().size(); i++) { auto& dep = instance.dependencies().at(i); - if (dep.first->get_output_layout().count() > 0) - input_mem_ptrs.push_back(instance.dep_memory_ptr(i)); + if (dep.first->get_output_layout().count() > 0) { + auto mem_ptr = instance.dep_memory_ptr(i); + input_host_tensors.push_back(make_tensor(params->input_layouts[i], mem_ptr->lock(stream, mem_lock_type::read))); + // push mem_ptr to input_mem_ptr to unlock after processing + input_mem_ptrs.push_back(mem_ptr); + } } auto output_mem_ptr = instance.output_memory_ptr(); cldnn::mem_lock output_lock(output_mem_ptr, stream); - for (size_t i = 0; i < input_mem_ptrs.size(); i++) - input_host_tensors.push_back(make_tensor(params->input_layouts[i], input_mem_ptrs[i]->lock(stream, mem_lock_type::read))); - output_host_tensors.push_back(make_tensor(params->output_layouts[0], output_lock.data())); if (!op) { diff --git
a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/scatter_update_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/scatter_update_ref.cl index 718e65d11e8..57eb51b149d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/scatter_update_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/scatter_update_ref.cl @@ -12,6 +12,7 @@ #define AXIS_X (OUTPUT_DIMS - 1) #define GET_OUTPUT_INDEX(idx_order) OUTPUT_GET_INDEX(idx_order) +#define GET_INPUT_INDEX(idx_order) INPUT0_GET_INDEX(idx_order) #if OUTPUT_DIMS == 4 #define ORDER b,f,y,x @@ -121,8 +122,10 @@ KERNEL(scatter_update_ref)(OPTIONAL_SHAPE_INFO_ARG #endif const uint output_idx = GET_OUTPUT_INDEX(ORDER); + const uint dict_idx = GET_INPUT_INDEX(ORDER); - INPUT0_TYPE val = dictionary[output_idx]; + // Use input index instead of output index because output padding is not empty. + INPUT0_TYPE val = dictionary[dict_idx]; #if HAS_FUSED_OPS FUSED_OPS_FIRST_KERNEL; output[output_idx] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT_FIRST_KERNEL); @@ -233,6 +236,7 @@ KERNEL(scatter_update_ref)(OPTIONAL_SHAPE_INFO_ARG } #undef GET_OUTPUT_INDEX +#undef GET_INPUT_INDEX #undef ORDER #undef AXIS_B #undef AXIS_F diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp index 59a3103706d..93db6bc8d82 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp @@ -192,6 +192,13 @@ void start_concat_test_dynamic(impl_types impl_type) { {{1, 5, 3, 4}, data_types::f32, format::bfyx}, {{1, 3, 3, 4}, data_types::f32, format::bfyx}, {{1, 2, 3, 4}, data_types::f32, format::bfyx}); + + if (impl_type == impl_types::cpu) { + run_on_shapes({{1, 2, 3, 4}, data_types::f32, format::bfyx}, + {{1, 0, 3, 4}, data_types::f32, format::bfyx}, + {{1, 3, 3, 4}, data_types::f32, format::bfyx}, + {{1, 8, 3, 4}, data_types::f32, 
format::bfyx}); + } } TEST(concat_gpu, dynamic_4d_f) { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/scatter_update_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/scatter_update_gpu_test.cpp index 08e57abb114..4c6c90ddbb6 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/scatter_update_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/scatter_update_gpu_test.cpp @@ -1774,3 +1774,101 @@ TEST(scatter_update_gpu_fp16, d21214_bfzyx_axisX_bfwzyx_cached) { TEST(scatter_update_gpu_fp16, d2411_axisB_cached) { test_d2411_axisB(true); }
+
+TEST(scatter_update_gpu_fp32, output_padding) {
+    // Regression test (fp32): the dictionary must be read with the input index when the output is padded.
+    //  Dictionary : 2x2x1x4
+    //  Indexes    : 3x1x1x1
+    //  Updates    : 2x2x1x3
+    //  Axis       : 3
+    //  Output     : 2x2x1x4
+    //
+    //  Indexes:
+    //  2.f, 0.f, 3.f
+    //
+    //  Updates:
+    //  20.f, 30.f, 40.f,
+    //  50.f, 60.f, 70.f,
+    //
+    //  80.f, 90.f, 100.f,
+    //  110.f, 120.f, 130.f
+    //
+    //  Dictionary:
+    //  0.f, 1.f, 2.f, 3.f,
+    //  4.f, 5.f, 6.f, 7.f,
+    //
+    //  8.f, 9.f, 10.f, 11.f,
+    //  12.f, 13.f, 14.f, 15.f
+    //
+    //  Output:
+    //  30.f, 1.f, 20.f, 40.f,
+    //  60.f, 5.f, 50.f, 70.f,
+    //
+    //  90.f, 9.f, 80.f, 100.f,
+    //  120.f, 13.f, 110.f, 130.f
+
+    auto& engine = get_test_engine();
+
+    for (const auto target_format : formats2D) {
+        auto input1 = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{2, 2, 4, 1}}); // Dictionary
+        auto input2 = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{3, 1, 1, 1}}); // Indexes
+        auto input3 = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{2, 2, 3, 1}}); // Updates
+        const auto axis = 3;
+
+        set_values(input1, {
+            0.f, 1.f, 2.f, 3.f,
+            4.f, 5.f, 6.f, 7.f,
+            8.f, 9.f, 10.f, 11.f,
+            12.f, 13.f, 14.f, 15.f
+        });
+
+        set_values(input2, {
+            2.f, 0.f, 3.f
+        });
+
+        set_values(input3, {
+            20.f, 30.f, 40.f,
+            50.f, 60.f, 70.f,
+            80.f, 90.f, 100.f,
+            110.f, 120.f, 130.f
+        });
+
+        padding output_padding = padding({1,1}, {1,1});  // non-empty on purpose: exercises the padded-output path
+
+        topology topology;
+        topology.add(input_layout("InputDictionary", input1->get_layout()));
+        topology.add(input_layout("InputText", input2->get_layout()));
+        topology.add(input_layout("InputUpdates", input3->get_layout()));
+        topology.add(reorder("DictionaryReordered", input_info("InputDictionary"), target_format, data_types::f32));
+        topology.add(reorder("TextReordered", input_info("InputText"), target_format, data_types::f32));
+        topology.add(reorder("UpdatesReordered", input_info("InputUpdates"), target_format, data_types::f32));
+        topology.add(
+            scatter_update("scatter_update", input_info("DictionaryReordered"), input_info("TextReordered"), input_info("UpdatesReordered"), axis, output_padding)
+        );
+        topology.add(reorder("out", input_info("scatter_update"), plain_2d_format, data_types::f32));
+
+        network network(engine, topology, get_test_default_config(engine));
+
+        network.set_input_data("InputDictionary", input1);
+        network.set_input_data("InputText", input2);
+        network.set_input_data("InputUpdates", input3);
+
+        auto outputs = network.execute();
+
+        auto output = outputs.at("out").get_memory();
+        cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+        const std::vector<float> expected_results = {
+            30.f, 1.f, 20.f, 40.f,
+            60.f, 5.f, 50.f, 70.f,
+            90.f, 9.f, 80.f, 100.f,
+            120.f, 13.f, 110.f, 130.f
+        };
+
+        // "out" is reordered back to plain_2d_format, so it can be compared element by element.
+        for (size_t i = 0; i < expected_results.size(); ++i) {
+            ASSERT_EQ(expected_results[i], output_ptr[i])
+                << "i=" << i << ", target_format=" << target_format;
+        }
+    }
+}