[GPU] Fix accuracy issue for assign and fully_connected_mmad (#17430)

* Add reorder with usr's output data type for assign

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Fix incorrect input index for handling leftovers

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Add TCs for ov_gpu_unit_tests

Signed-off-by: Andrew Park <andrew.park@intel.com>

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park
2023-05-10 12:31:18 +09:00
committed by GitHub
parent 3b90165581
commit 00eacd2a96
4 changed files with 71 additions and 2 deletions

View File

@@ -8,6 +8,7 @@
#include "program_node.h"
#include "mutable_data_inst.h"
#include "convert_color_inst.h"
#include "assign_inst.h"
#include "tensor_type.h"
#include <memory>
#include <vector>
@@ -57,6 +58,21 @@ void add_required_reorders::run(program& p) {
if (usr->is_type<data>())
continue;
// If usr is assign and input and output data types are different
// add reorder with usr's output data type between dep and usr
if (usr->is_type<assign>()) {
auto& dep = usr->get_dependency(0);
auto dep_layout = dep.get_output_layout();
auto out_layout = usr->get_output_layout();
bool required_reorder = out_layout.data_type != dep_layout.data_type;
if (required_reorder) {
auto new_reorder = std::make_shared<reorder>(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, out_layout.data_type);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, dep);
new_reorder_node.recalc_output_layout(false);
}
}
if (optimize_data) {
auto fused_ops = usr->get_fused_primitives();
auto out_layout = usr->get_output_layout();

View File

@@ -204,9 +204,9 @@ KERNEL(fully_connected_gpu_MMAD)(
#endif // SPATIAL_MAJOR
#if !SPLIT_SPATIAL
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * FEATURE_PITCH;
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_INPUT_FBLOCK_PITCH;
#else
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * FEATURE_PITCH +
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * MMAD_INPUT_FBLOCK_PITCH +
zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
#endif // !SPLIT_SPATIAL
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;

View File

@@ -1588,6 +1588,20 @@ INSTANTIATE_TEST_SUITE_P(
fully_connected_i8_i8_test::PrintToStringParamName
);
// Shapes targeting the MMAD fully-connected kernel. The feature count 43 is
// deliberately not a multiple of the feature block size, so the leftover
// (tail) handling path is exercised as well — presumably the path fixed by
// this commit's MMAD_INPUT_FBLOCK_PITCH change (verify against the kernel).
INSTANTIATE_TEST_SUITE_P(
mmad,
fully_connected_i8_i8_test,
testing::Combine(
testing::Values(1),
testing::Values(16, 43, 64),
testing::Values(1),
testing::Values(1),
testing::Values(16, 32, 64),
testing::Values(format::bfyx, format::b_fs_yx_fsv32)
),
fully_connected_i8_i8_test::PrintToStringParamName
);
INSTANTIATE_TEST_SUITE_P(
basic,
fully_connected_i8_u8_test,

View File

@@ -143,6 +143,41 @@ TEST(variable_test_common, exception_on_wrong_layout) {
test_exception_on_wrong_layout<float>(false);
}
template <typename T>
void test_different_output_data_type(bool is_caching_test) {
auto& engine = get_test_engine();
const layout in_layout{data_types::f32, format::bfyx, tensor{1}};
const auto input_data = engine.allocate_memory(in_layout);
std::vector<float> inputs = { 70.0f };
set_values(input_data, inputs);
const layout variable_layout{data_types::f16, format::bfyx, tensor{1}};
topology topology;
topology.add(input_layout("input", input_data->get_layout()));
topology.add(assign("assign", { input_info("input") }, "v0", variable_layout));
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->assign_variables_memories({ { "v0", std::make_shared<network::VariableState>(engine.allocate_memory(variable_layout)) } });
network->set_input_data("input", input_data);
const auto outputs = network->execute();
const auto output = outputs.at("assign").get_memory();
const cldnn::mem_lock<T> output_ptr(output, get_test_stream());
for (size_t i = 0; i < output_ptr.size(); ++i) {
ASSERT_EQ(half_to_float(output_ptr[i]), inputs[i]);
}
}
// Non-cached run: assign with f32 input and f16 variable layout.
// int16_t is the raw storage type used to read back the f16 buffer.
TEST(variable_test_common, different_output_data_type) {
test_different_output_data_type<int16_t>(false);
}
template <typename T>
void test_variables_are_preserved_across_inferences(bool is_caching_test) {
auto& engine = get_test_engine();
@@ -217,6 +252,10 @@ TEST_P(variable_test_f32, variable_f32_cached) {
// Same as exception_on_wrong_layout but with model caching enabled.
TEST(variable_test_common, exception_on_wrong_layout_cached) {
test_exception_on_wrong_layout<float>(true);
}
// Cached variant: same f32 -> f16 assign scenario with model caching enabled.
TEST(variable_test_common, different_output_data_type_cached) {
test_different_output_data_type<int16_t>(true);
}
#endif
TEST(variable_test_common, variables_are_preserved_across_inferences_cached) {
test_variables_are_preserved_across_inferences<int>(true);