[GPU] Resolve accuracy issue from clamp fused prims (#19409)
+ Added condition for when the clamp activation for fp16 overflow is added to fused-ops
+ Added test cases

Signed-off-by: Min, Byungil <byungil.min@intel.com>
commit bcedb0bb9b
parent cddcec8ba8
@@ -32,8 +32,12 @@ void clamp_fp16_output::run(program& p) {
         local_desc.input_layout = node->get_output_layout();
         local_desc.f_param = act_node.get_fuse_params();
         local_desc.outer_dep_start_idx = -1; // No external dep
-        local_desc.total_num_deps = 0;
+        local_desc.total_num_deps = 1;
         local_desc.output_layout = node->get_output_layout();
+        if (node->get_fused_primitives().size() > 0) {
+            local_desc.fused_deps.emplace(node->get_fused_primitives().back().desc->id, 0);
+        }
+
         node->add_fused_primitive(local_desc);
     }
 }
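The clamp_fp16_output pass fuses a clamp activation onto the fp16 gemm output that feeds softmax so intermediate values stay inside the half-precision range; the change above additionally makes that clamp descriptor depend on any primitive already fused into the gemm (total_num_deps = 1 plus a fused_deps entry) instead of treating it as dependency-free. Below is a minimal standalone sketch of the overflow motivation only; the constants and printout are illustrative and nothing here uses the cldnn API.

```cpp
#include <algorithm>
#include <cstdio>

// Standalone illustration (not the cldnn pass): values above the largest finite
// fp16 number (65504) become +inf when narrowed to half precision, and a
// following softmax would then propagate inf/NaN. Clamping the gemm output to
// the fp16 range keeps the result finite.
int main() {
    const float fp16_max = 65504.0f;     // largest finite half-precision value
    float gemm_out = 70000.0f;           // hypothetical accumulator result
    float clamped = std::min(std::max(gemm_out, -fp16_max), fp16_max);
    std::printf("raw %.1f -> clamped %.1f\n", gemm_out, clamped);
    return 0;
}
```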
@@ -0,0 +1,149 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_utils.h"
#include "random_generator.hpp"

#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/graph/program.hpp"
#include "data_inst.h"
#include "gemm_inst.h"
#include "softmax_inst.h"
#include "pass_manager.h"
#include "program_wrapper.h"

#include <memory>
#include <vector>

using namespace cldnn;
using namespace ::tests;

TEST(clamp_fp16_output_test, test_gemm_softmax_simple) {
    auto& engine = get_test_engine();
    ov::Shape in1_shape = { 1, 1, 3, 4 };
    ov::Shape in2_shape = { 1, 4 };
    auto in1_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::f32, format::bfyx};
    auto in2_layout = layout{ov::PartialShape::dynamic(in2_shape.size()), data_types::f32, format::bfyx};
    auto input1 = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f32, format::bfyx});
    auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx});

    std::vector<float> input1_data = {
        1.f, -2.f, 3.f, -4.f,
        5.f, 6.f, 1.f, 2.f,
        3.f, 3.f, 2.f, -1.f,
    };

    std::vector<float> input2_data = {
        2.f, 5.f, -4.f, -7.f
    };
    set_values(input1, input1_data);
    set_values(input2, input2_data);

    std::vector<float> out_data = {
        0.f, 0.8803f, 0.1192f
    };

    topology topology;
    topology.add(input_layout("input1", in1_layout),
                 input_layout("input2", in2_layout),
                 reorder("input1_fp16", input_info("input1"), format::any, data_types::f16),
                 reorder("input2_fp16", input_info("input2"), format::any, data_types::f16),
                 gemm("gemm", { input_info("input1_fp16"), input_info("input2_fp16") }, data_types::f16, false, true, 1.0f, 0.0f, 4, 2),
                 softmax("softmax", input_info("gemm"), 2),
                 reorder("reorder", input_info("softmax"), format::any, data_types::f32)
    );

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    network network(engine, topology, config);
    network.set_input_data("input1", input1);
    network.set_input_data("input2", input2);

    auto inst = network.get_primitive("reorder");
    auto impl = inst->get_impl();
    ASSERT_TRUE(impl != nullptr);
    ASSERT_TRUE(impl->is_dynamic());

    auto outputs = network.execute();

    auto output = outputs.at("reorder").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    ASSERT_EQ(output_ptr.size(), (uint32_t)3);
    for (uint32_t i = 0; i < out_data.size(); ++i) {
        ASSERT_NEAR(output_ptr[i], out_data[i], 1e-4);
    }
}
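For reference, the expected values above can be cross-checked with a small standalone fp32 computation (a sketch, independent of the cldnn test harness): the gemm logits come out to 8, 22 and 20, and softmax over them gives roughly 0.0000, 0.8808 and 0.1192; the small deviation from the fp16 reference values in out_data is consistent with half-precision rounding in the fused kernels.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Standalone fp32 sanity check for test_gemm_softmax_simple:
// gemm(A, B^T) followed by softmax over the 3-element axis.
int main() {
    const float a[3][4] = {{1, -2, 3, -4}, {5, 6, 1, 2}, {3, 3, 2, -1}};
    const float b[4] = {2, 5, -4, -7};        // second input, used transposed

    float logits[3] = {0.f, 0.f, 0.f};
    for (int i = 0; i < 3; ++i)
        for (int k = 0; k < 4; ++k)
            logits[i] += a[i][k] * b[k];      // -> 8, 22, 20

    float mx = std::max({logits[0], logits[1], logits[2]});
    float sum = 0.f, probs[3];
    for (int i = 0; i < 3; ++i) sum += probs[i] = std::exp(logits[i] - mx);
    for (int i = 0; i < 3; ++i) std::printf("%.4f\n", probs[i] / sum);  // ~0.0000, 0.8808, 0.1192
    return 0;
}
```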
TEST(clamp_fp16_output_test, test_gemm_softmax_mult_fused) {
    auto& engine = get_test_engine();
    ov::Shape in1_shape = { 1, 1, 3, 4 };
    ov::Shape in2_shape = { 1, 4 };
    auto in1_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::f32, format::bfyx};
    auto in2_layout = layout{ov::PartialShape::dynamic(in2_shape.size()), data_types::f32, format::bfyx};
    auto input1 = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f32, format::bfyx});
    auto input2 = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f32, format::bfyx});
    auto input_elt = engine.allocate_memory({ov::PartialShape{1, 1, 3, 1}, data_types::f32, format::bfyx});

    std::vector<float> input1_data = {
        1.f, -2.f, 3.f, -4.f,
        5.f, 6.f, 1.f, 2.f,
        3.f, 3.f, 2.f, -1.f,
    };

    std::vector<float> input2_data = {
        2.f, 5.f, -4.f, -7.f
    };

    std::vector<float> elt_data = {
        10.f, -5.5f, -0.05f
    };

    set_values(input1, input1_data);
    set_values(input2, input2_data);
    set_values(input_elt, elt_data);

    std::vector<float> out_data = {
        0.1209f, 0.0269f, 0.8520f
    };

    topology topology;
    topology.add(input_layout("input1", in1_layout),
                 input_layout("input2", in2_layout),
                 data("elt_input", input_elt),
                 reorder("input1_fp16", input_info("input1"), format::any, data_types::f16),
                 reorder("input2_fp16", input_info("input2"), format::any, data_types::f16),
                 reorder("elt_input_fp16", input_info("elt_input"), format::any, data_types::f16),
                 gemm("gemm", { input_info("input1_fp16"), input_info("input2_fp16") }, data_types::f16, false, true, 1.0f, 0.0f, 4, 2),
                 eltwise("eltwise", input_info("gemm"), input_info("elt_input_fp16"), eltwise_mode::sum),
                 softmax("softmax", input_info("eltwise"), 2),
                 reorder("reorder", input_info("softmax"), format::any, data_types::f32)
    );

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    network network(engine, topology, config);
    network.set_input_data("input1", input1);
    network.set_input_data("input2", input2);

    auto inst = network.get_primitive("reorder");
    auto impl = inst->get_impl();
    ASSERT_TRUE(impl != nullptr);
    ASSERT_TRUE(impl->is_dynamic());

    auto outputs = network.execute();

    auto output = outputs.at("reorder").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    auto prog = network.get_program();
    ASSERT_TRUE(!has_node(*prog, "eltwise"));

    ASSERT_EQ(output_ptr.size(), (uint32_t)3);
    for (uint32_t i = 0; i < out_data.size(); ++i) {
        ASSERT_NEAR(output_ptr[i], out_data[i], 1e-4);
    }
}
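The second test inserts an eltwise add between gemm and softmax; ASSERT_TRUE(!has_node(*prog, "eltwise")) verifies that the eltwise node has been fused away, so the gemm already carries a fused primitive when the clamp descriptor is appended, which is the situation the dependency fix above targets. A matching standalone fp32 sanity check (same assumptions as the earlier sketch); the slight offset from out_data again comes from fp16 execution:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Standalone fp32 sanity check for test_gemm_softmax_mult_fused:
// the eltwise operand is added to the gemm logits before softmax.
int main() {
    float logits[3] = {8.f + 10.f, 22.f - 5.5f, 20.f - 0.05f};   // gemm result + elt_data
    float mx = std::max({logits[0], logits[1], logits[2]});
    float sum = 0.f, probs[3];
    for (int i = 0; i < 3; ++i) sum += probs[i] = std::exp(logits[i] - mx);
    for (int i = 0; i < 3; ++i) std::printf("%.4f\n", probs[i] / sum);   // ~0.1212, 0.0270, 0.8518
    return 0;
}
```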