[GPU] Fix malfunction in crop static kernel in dynamic shape scenario (#16586)

* Fix malfunction in crop static kernel in dynamic shape execution

* Add unittest

* Fix lint error
Taylor Yeonbok Lee 2023-03-28 21:19:24 -07:00 committed by GitHub
parent 6c766a81b5
commit daf562832f
5 changed files with 29 additions and 14 deletions

@@ -46,6 +46,7 @@ static constexpr Property<bool, PropertyMutability::RW> optimize_data{"GPU_OPTIM
 static constexpr Property<bool, PropertyMutability::RW> allow_static_input_reorder{"GPU_ALLOW_STATIC_INPUT_REORDER"};
 static constexpr Property<bool, PropertyMutability::RW> partial_build_program{"GPU_PARTIAL_BUILD"};
 static constexpr Property<bool, PropertyMutability::RW> allow_new_shape_infer{"GPU_ALLOW_NEW_SHAPE_INFER"};
+static constexpr Property<bool, PropertyMutability::RW> use_only_static_kernels_for_dynamic_shape{"GPU_USE_ONLY_STATIC_KERNELS_FOR_DYNAMIC_SHAPE"};
 static constexpr Property<std::string, PropertyMutability::RW> dump_graphs{"GPU_DUMP_GRAPHS"};
 static constexpr Property<std::vector<std::string>, PropertyMutability::RW> custom_outputs{"GPU_CUSTOM_OUTPUTS"};
 static constexpr Property<ImplForcingMap, PropertyMutability::RW> force_implementations{"GPU_FORCE_IMPLEMENTATIONS"};
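The added property is an internal debug knob: when it is enabled, the plugin compiles a static (per-shape) kernel for dynamic nodes instead of a shape-agnostic one. A minimal usage sketch, mirroring how the unit test below enables it (assumes a test engine is already at hand):

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    // Force per-shape static kernels even for dynamic-shape graphs:
    config.set_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape(true));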

@@ -34,9 +34,10 @@ void compile_graph::run(program& p) {
     std::exception_ptr exception;
     for (size_t idx = 0; idx < proc_order.size(); idx++) {
         auto& node = *(std::next(proc_order.begin(), idx));
+        bool use_shape_agnostic_impl = !p.get_config().get_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape);
         bool can_select_impl = !node->is_type<data>() &&
                                !(node->is_type<mutable_data>() && node->get_dependencies().empty()) &&
-                               (!node->is_dynamic() || node->type()->does_dynamic_implementation_exist(*node));
+                               (!node->is_dynamic() || (use_shape_agnostic_impl && node->type()->does_dynamic_implementation_exist(*node)));
         // TODO: Remove this WA once we have shape agnostic reshape kernel
         if (node->is_type<reshape>() && node->is_dynamic() && !node->can_be_optimized())
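In effect, a dynamic node now gets its implementation selected at graph-compile time only when shape-agnostic kernels are allowed and a dynamic implementation exists; otherwise selection is deferred so a static kernel can be built at runtime for each concrete shape. A condensed sketch of the predicate (a hypothetical helper; the names follow the hunk above):

    // True when compile_graph may pick an implementation for a non-data node up front.
    static bool selects_impl_at_build(bool is_dynamic, bool allow_shape_agnostic, bool has_dynamic_impl) {
        return !is_dynamic || (allow_shape_agnostic && has_dynamic_impl);
    }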

@@ -25,12 +25,11 @@ struct crop_impl : typed_primitive_impl_ocl<crop> {
 public:
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
         const auto& primitive = impl_param.typed_desc<crop>();
         auto params = get_default_params<kernel_selector::eltwise_params>(impl_param, is_shape_agnostic);
         auto optional_params = get_default_optional_params<kernel_selector::eltwise_optional_params>(impl_param.get_program());
         params.operations.push_back({{kernel_selector::eltwise_params::InputType::Buffer(0)}, kernel_selector::eltwise_mode::ASSIGN});
-        if (impl_param.get_program().get_node(primitive->id).is_dynamic()) {
+        if (impl_param.is_dynamic() || is_shape_agnostic) {
             // WA to always match the compiled dynamic kernel with its dispatch data.
             // Without enforcing this option we may generate a kernel for the "broadcast" scenario due to unmatched tensor dimensions,
             // but at runtime the dispatch data is generated for the non-broadcast case, as the shapes are actually the same.
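This condition is the heart of the fix: the old code asked the program node whether it is dynamic, so a static (per-shape) compilation of a dynamic node still took the dynamic branch and built kernel parameters that could not match the runtime dispatch data. Keying off impl_param.is_dynamic() and the is_shape_agnostic flag ties the decision to the compilation actually being performed. Illustrative call sites (hypothetical, only to show how the flag separates the two paths):

    // One shape-agnostic kernel, reused across shapes:
    auto agnostic_params = crop_impl::get_kernel_params(impl_param, /*is_shape_agnostic=*/true);
    // A static kernel specialized for the current concrete shape:
    auto static_params = crop_impl::get_kernel_params(impl_param, /*is_shape_agnostic=*/false);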

@@ -68,7 +68,8 @@ void ExecutionConfig::set_default() {
         std::make_tuple(ov::intel_gpu::dump_graphs, ""),
         std::make_tuple(ov::intel_gpu::force_implementations, ImplForcingMap{}),
         std::make_tuple(ov::intel_gpu::partial_build_program, false),
-        std::make_tuple(ov::intel_gpu::allow_new_shape_infer, false));
+        std::make_tuple(ov::intel_gpu::allow_new_shape_infer, false),
+        std::make_tuple(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape, false));
 }

 void ExecutionConfig::register_property_impl(const std::pair<std::string, ov::Any>& property, PropertyVisibility visibility, BaseValidator::Ptr validator) {
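The new entry defaults to false, so shape-agnostic kernels remain the default path for dynamic shapes unless the property is explicitly overridden. A one-line sketch of reading it back (get_property is the same accessor used in compile_graph above):

    bool only_static = config.get_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape); // false by default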

@@ -1291,22 +1291,35 @@ TEST(crop_gpu, dynamic_i32_in2x3x2x2_crop_offsets) {
     set_values(input, input_vec);
     ExecutionConfig config = get_test_default_config(engine);
     config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
-    network network(engine, topology, config);
-    network.set_input_data("input", input);
-    auto outputs = network.execute();
-    auto output = outputs.at("crop").get_memory();
-    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
+    network network1(engine, topology, config); // run with shape agnostic kernel
+    network1.set_input_data("input", input);
+    auto outputs1 = network1.execute();
+    auto output1 = outputs1.at("crop").get_memory();
+    cldnn::mem_lock<int32_t> output1_ptr(output1, get_test_stream());
     for (int b = 0; b < crop_batch_num; ++b) { //B
         for (int f = 0; f < crop_feature_num; ++f) { //F
             for (int y = 0; y < crop_y_size; ++y) { //Y
                 for (int x = 0; x < crop_x_size; ++x) { //X
                     int linear_id = (b + batch_offset) * (feature_num * y_size * x_size) + (f + feature_offset) * (y_size * x_size) + (y + y_offset) * x_size + (x + x_offset);
                     int output_linear_id = b * (crop_feature_num * crop_y_size * crop_x_size) + f * (crop_y_size * crop_x_size) + y * crop_x_size + x;
-                    ASSERT_EQ(output_ptr[output_linear_id], input_vec[linear_id]);
+                    ASSERT_EQ(output1_ptr[output_linear_id], input_vec[linear_id]);
                 }
             }
         }
     }
+    config.set_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape(true));
+    network network2(engine, topology, config); // run with static kernel
+    network2.set_input_data("input", input);
+    auto outputs2 = network2.execute();
+    auto output2 = outputs2.at("crop").get_memory();
+    cldnn::mem_lock<int32_t> output2_ptr(output2, get_test_stream());
+    for (int b = 0; b < crop_batch_num; ++b) { //B
+        for (int f = 0; f < crop_feature_num; ++f) { //F
+            for (int y = 0; y < crop_y_size; ++y) { //Y
+                for (int x = 0; x < crop_x_size; ++x) { //X
+                    int linear_id = (b + batch_offset) * (feature_num * y_size * x_size) + (f + feature_offset) * (y_size * x_size) + (y + y_offset) * x_size + (x + x_offset);
+                    int output_linear_id = b * (crop_feature_num * crop_y_size * crop_x_size) + f * (crop_y_size * crop_x_size) + y * crop_x_size + x;
+                    ASSERT_EQ(output2_ptr[output_linear_id], input_vec[linear_id]);
+                }
+            }
+        }
+    }
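The test runs the same topology twice, once per kernel flavor, and checks both outputs against the same reference. The index arithmetic is the usual dense bfyx linearization; a hypothetical helper equivalent to the inline math (the test itself keeps it inline):

    // Linear offset of (b, f, y, x) in a dense bfyx buffer.
    inline int bfyx_index(int b, int f, int y, int x, int f_num, int y_size, int x_size) {
        return b * (f_num * y_size * x_size) + f * (y_size * x_size) + y * x_size + x;
    }
    // linear_id        == bfyx_index(b + batch_offset, f + feature_offset, y + y_offset, x + x_offset, feature_num, y_size, x_size)
    // output_linear_id == bfyx_index(b, f, y, x, crop_feature_num, crop_y_size, crop_x_size)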