[GPU] Fix accuracy issue for assign and fully_connected_mmad (#17430)

* Add reorder with usr's output data type for assign

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Fix incorrect input index for handling leftovers

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Add TCs for ov_gpu_unit_tests

Signed-off-by: Andrew Park <andrew.park@intel.com>

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park
2023-05-10 12:31:18 +09:00
committed by GitHub
parent 3b90165581
commit 00eacd2a96
4 changed files with 71 additions and 2 deletions

View File

@@ -8,6 +8,7 @@
#include "program_node.h"
#include "mutable_data_inst.h"
#include "convert_color_inst.h"
#include "assign_inst.h"
#include "tensor_type.h"
#include <memory>
#include <vector>
@@ -57,6 +58,21 @@ void add_required_reorders::run(program& p) {
if (usr->is_type<data>())
continue;
// If usr is assign and input and output data types are different
// add reorder with usr's output data type between dep and usr
if (usr->is_type<assign>()) {
auto& dep = usr->get_dependency(0);
auto dep_layout = dep.get_output_layout();
auto out_layout = usr->get_output_layout();
bool required_reorder = out_layout.data_type != dep_layout.data_type;
if (required_reorder) {
auto new_reorder = std::make_shared<reorder>(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, out_layout.data_type);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, dep);
new_reorder_node.recalc_output_layout(false);
}
}
if (optimize_data) {
auto fused_ops = usr->get_fused_primitives();
auto out_layout = usr->get_output_layout();

View File

@@ -204,9 +204,9 @@ KERNEL(fully_connected_gpu_MMAD)(
#endif // SPATIAL_MAJOR
#if !SPLIT_SPATIAL
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * FEATURE_PITCH;
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_INPUT_FBLOCK_PITCH;
#else
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * FEATURE_PITCH +
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * MMAD_INPUT_FBLOCK_PITCH +
zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
#endif // !SPLIT_SPATIAL
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;

View File

@@ -1588,6 +1588,20 @@ INSTANTIATE_TEST_SUITE_P(
fully_connected_i8_i8_test::PrintToStringParamName
);
// Shapes targeting the MMAD fully-connected kernel. The feature count 43 is
// deliberately not a multiple of the feature block size, so the leftover
// (tail) handling path is exercised as well — presumably the path fixed by
// this commit's MMAD_INPUT_FBLOCK_PITCH change (verify against the kernel).
INSTANTIATE_TEST_SUITE_P(
mmad,
fully_connected_i8_i8_test,
testing::Combine(
testing::Values(1),
testing::Values(16, 43, 64),
testing::Values(1),
testing::Values(1),
testing::Values(16, 32, 64),
testing::Values(format::bfyx, format::b_fs_yx_fsv32)
),
fully_connected_i8_i8_test::PrintToStringParamName
);
INSTANTIATE_TEST_SUITE_P(
basic,
fully_connected_i8_u8_test,

View File

@@ -143,6 +143,41 @@ TEST(variable_test_common, exception_on_wrong_layout) {
test_exception_on_wrong_layout<float>(false);
}
template <typename T>
void test_different_output_data_type(bool is_caching_test) {
auto& engine = get_test_engine();
const layout in_layout{data_types::f32, format::bfyx, tensor{1}};
const auto input_data = engine.allocate_memory(in_layout);
std::vector<float> inputs = { 70.0f };
set_values(input_data, inputs);
const layout variable_layout{data_types::f16, format::bfyx, tensor{1}};
topology topology;
topology.add(input_layout("input", input_data->get_layout()));
topology.add(assign("assign", { input_info("input") }, "v0", variable_layout));
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->assign_variables_memories({ { "v0", std::make_shared<network::VariableState>(engine.allocate_memory(variable_layout)) } });
network->set_input_data("input", input_data);
const auto outputs = network->execute();
const auto output = outputs.at("assign").get_memory();
const cldnn::mem_lock<T> output_ptr(output, get_test_stream());
for (size_t i = 0; i < output_ptr.size(); ++i) {
ASSERT_EQ(half_to_float(output_ptr[i]), inputs[i]);
}
}
// Non-cached run: assign with f32 input and f16 variable layout.
// int16_t is the raw storage type used to read back the f16 buffer.
TEST(variable_test_common, different_output_data_type) {
test_different_output_data_type<int16_t>(false);
}
template <typename T>
void test_variables_are_preserved_across_inferences(bool is_caching_test) {
auto& engine = get_test_engine();
@@ -217,6 +252,10 @@ TEST_P(variable_test_f32, variable_f32_cached) {
// Same as exception_on_wrong_layout but with model caching enabled.
TEST(variable_test_common, exception_on_wrong_layout_cached) {
test_exception_on_wrong_layout<float>(true);
}
// Cached variant: same f32 -> f16 assign scenario with model caching enabled.
TEST(variable_test_common, different_output_data_type_cached) {
test_different_output_data_type<int16_t>(true);
}
#endif
TEST(variable_test_common, variables_are_preserved_across_inferences_cached) {
test_variables_are_preserved_across_inferences<int>(true);