[GPU] Fix accuracy issue for assign and fully_connected_mmad (#17430)
* Add reorder with usr's output data type for assign
* Fix incorrect input index for handling leftovers
* Add TCs for ov_gpu_unit_tests

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
committed by
GitHub
parent
3b90165581
commit
00eacd2a96
@@ -8,6 +8,7 @@
|
||||
#include "program_node.h"
|
||||
#include "mutable_data_inst.h"
|
||||
#include "convert_color_inst.h"
|
||||
#include "assign_inst.h"
|
||||
#include "tensor_type.h"
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
@@ -57,6 +58,21 @@ void add_required_reorders::run(program& p) {
|
||||
if (usr->is_type<data>())
|
||||
continue;
|
||||
|
||||
// If usr is assign and input and output data types are different
|
||||
// add reorder with usr's output data type between dep and usr
|
||||
if (usr->is_type<assign>()) {
|
||||
auto& dep = usr->get_dependency(0);
|
||||
auto dep_layout = dep.get_output_layout();
|
||||
auto out_layout = usr->get_output_layout();
|
||||
bool required_reorder = out_layout.data_type != dep_layout.data_type;
|
||||
if (required_reorder) {
|
||||
auto new_reorder = std::make_shared<reorder>(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, out_layout.data_type);
|
||||
auto& new_reorder_node = p.get_or_create(new_reorder);
|
||||
p.add_intermediate(new_reorder_node, *usr, dep);
|
||||
new_reorder_node.recalc_output_layout(false);
|
||||
}
|
||||
}
|
||||
|
||||
if (optimize_data) {
|
||||
auto fused_ops = usr->get_fused_primitives();
|
||||
auto out_layout = usr->get_output_layout();
|
||||
|
||||
@@ -204,9 +204,9 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
#endif // SPATIAL_MAJOR
|
||||
|
||||
#if !SPLIT_SPATIAL
|
||||
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * FEATURE_PITCH;
|
||||
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_INPUT_FBLOCK_PITCH;
|
||||
#else
|
||||
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * FEATURE_PITCH +
|
||||
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * MMAD_INPUT_FBLOCK_PITCH +
|
||||
zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
|
||||
#endif // !SPLIT_SPATIAL
|
||||
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;
|
||||
|
||||
@@ -1588,6 +1588,20 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
fully_connected_i8_i8_test::PrintToStringParamName
|
||||
);
|
||||
|
||||
// i8 -> i8 fully-connected cases targeting the MMAD kernel path:
// batch = 1, input features {16, 43, 64}, output features {16, 32, 64},
// for both plain bfyx and blocked b_fs_yx_fsv32 input formats.
// NOTE(review): 43 is not a multiple of the 32-wide feature block, so it
// presumably exercises the leftover-features path fixed in this change —
// confirm against the kernel's FEATURE_BLOCKS_COUNT handling.
INSTANTIATE_TEST_SUITE_P(
    mmad,
    fully_connected_i8_i8_test,
    testing::Combine(
        testing::Values(1),
        testing::Values(16, 43, 64),
        testing::Values(1),
        testing::Values(1),
        testing::Values(16, 32, 64),
        testing::Values(format::bfyx, format::b_fs_yx_fsv32)
    ),
    fully_connected_i8_i8_test::PrintToStringParamName
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
basic,
|
||||
fully_connected_i8_u8_test,
|
||||
|
||||
@@ -143,6 +143,41 @@ TEST(variable_test_common, exception_on_wrong_layout) {
|
||||
test_exception_on_wrong_layout<float>(false);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void test_different_output_data_type(bool is_caching_test) {
|
||||
auto& engine = get_test_engine();
|
||||
|
||||
const layout in_layout{data_types::f32, format::bfyx, tensor{1}};
|
||||
const auto input_data = engine.allocate_memory(in_layout);
|
||||
std::vector<float> inputs = { 70.0f };
|
||||
set_values(input_data, inputs);
|
||||
|
||||
const layout variable_layout{data_types::f16, format::bfyx, tensor{1}};
|
||||
|
||||
topology topology;
|
||||
topology.add(input_layout("input", input_data->get_layout()));
|
||||
topology.add(assign("assign", { input_info("input") }, "v0", variable_layout));
|
||||
|
||||
ExecutionConfig config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::optimize_data(true));
|
||||
cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
|
||||
|
||||
network->assign_variables_memories({ { "v0", std::make_shared<network::VariableState>(engine.allocate_memory(variable_layout)) } });
|
||||
network->set_input_data("input", input_data);
|
||||
|
||||
const auto outputs = network->execute();
|
||||
const auto output = outputs.at("assign").get_memory();
|
||||
const cldnn::mem_lock<T> output_ptr(output, get_test_stream());
|
||||
|
||||
for (size_t i = 0; i < output_ptr.size(); ++i) {
|
||||
ASSERT_EQ(half_to_float(output_ptr[i]), inputs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Non-cached run of the mixed-datatype assign test (f32 input, f16 variable);
// int16_t is the raw storage type used to read the f16 output back.
TEST(variable_test_common, different_output_data_type) {
|
||||
test_different_output_data_type<int16_t>(false);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void test_variables_are_preserved_across_inferences(bool is_caching_test) {
|
||||
auto& engine = get_test_engine();
|
||||
@@ -217,6 +252,10 @@ TEST_P(variable_test_f32, variable_f32_cached) {
|
||||
// Cached (is_caching_test = true) variant of exception_on_wrong_layout.
TEST(variable_test_common, exception_on_wrong_layout_cached) {
|
||||
test_exception_on_wrong_layout<float>(true);
|
||||
}
|
||||
|
||||
// Cached (is_caching_test = true) variant of different_output_data_type.
TEST(variable_test_common, different_output_data_type_cached) {
|
||||
test_different_output_data_type<int16_t>(true);
|
||||
}
|
||||
#endif
|
||||
TEST(variable_test_common, variables_are_preserved_across_inferences_cached) {
|
||||
test_variables_are_preserved_across_inferences<int>(true);
|
||||
|
||||
Reference in New Issue
Block a user