[GPU] Apply m_pythondiv for fusing of eltwise div (#17590)

This commit is contained in:
Kelvin Choi 2023-06-03 09:29:02 +09:00 committed by GitHub
parent acb4b1d37b
commit ec0daa5b10
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 142 additions and 21 deletions

View File

@ -195,7 +195,8 @@ void CreateUnaryEltwiseOp(Program& p, const std::shared_ptr<ngraph::Node>& node,
void CreateElementwiseOp(Program& p,
const std::shared_ptr<ngraph::Node>& node,
cldnn::eltwise_mode mode,
std::vector<float> coefficients = {});
std::vector<float> coefficients = {},
bool pythondiv = true);
bool IsNodeOnConstPath(const std::shared_ptr<ngraph::Node>& node);

View File

@ -86,7 +86,8 @@ struct eltwise : public primitive_base<eltwise> {
mode(mode),
coefficients(std::vector<float>(0)),
stride(std::vector<tensor>(0)),
broadcast_spec(spec.m_type, spec.m_axis) { }
broadcast_spec(spec.m_type, spec.m_axis),
m_pythondiv(true) { }
/// @brief Constructs eltwise primitive.
/// @param id This primitive id.
@ -106,7 +107,8 @@ struct eltwise : public primitive_base<eltwise> {
mode(mode),
coefficients(std::vector<float>(0)),
stride(stride),
broadcast_spec(spec.m_type, spec.m_axis) { }
broadcast_spec(spec.m_type, spec.m_axis),
m_pythondiv(true) { }
/// @brief Constructs eltwise primitive.
/// @param id This primitive id.
@ -124,7 +126,8 @@ struct eltwise : public primitive_base<eltwise> {
mode(mode),
coefficients(std::vector<float>(0)),
stride(std::vector<tensor>(0)),
broadcast_spec(spec.m_type, spec.m_axis) { }
broadcast_spec(spec.m_type, spec.m_axis),
m_pythondiv(true) { }
/// @brief Constructs eltwise primitive.
/// @param id This primitive id.
@ -140,7 +143,8 @@ struct eltwise : public primitive_base<eltwise> {
mode(mode),
coefficients(std::vector<float>(0)),
stride(std::vector<tensor>(0)),
broadcast_spec(spec.m_type, spec.m_axis) { }
broadcast_spec(spec.m_type, spec.m_axis),
m_pythondiv(true) { }
/// @brief Constructs eltwise primitive.
/// @param id This primitive id.
@ -149,18 +153,21 @@ struct eltwise : public primitive_base<eltwise> {
/// @param coefficients Blob-wise coefficient.
/// @param data_type Expected output data type.
/// @param spec Auto broadcast rule specification.
/// @param m_pythondiv Specifies whether floor division should be calculated. Supported only for integer data types.
eltwise(const primitive_id& id,
const std::vector<input_info>& inputs,
eltwise_mode mode,
std::vector<float> coeffs,
data_types data_type,
const ov::op::AutoBroadcastSpec& spec = ov::op::AutoBroadcastSpec(ov::op::AutoBroadcastType::NUMPY),
bool m_pythondiv = true,
const padding& output_padding = padding())
: primitive_base(id, inputs, {output_padding}, {optional_data_type{data_type}}),
mode(mode),
coefficients(std::move(coeffs)),
stride(std::vector<tensor>(0)),
broadcast_spec(spec.m_type, spec.m_axis) {
broadcast_spec(spec.m_type, spec.m_axis),
m_pythondiv(m_pythondiv) {
if (mode == eltwise_mode::sum && !coefficients.empty() && coefficients.size() != inputs.size()) {
throw std::invalid_argument("Invalid eltwise sum coefficients count (should be equal to 0 or input.size)");
}
@ -174,6 +181,8 @@ struct eltwise : public primitive_base<eltwise> {
std::vector<tensor> stride;
/// @brief Define auto broadcast rule specification.
ov::op::AutoBroadcastSpec broadcast_spec;
/// @brief When true, integer DIV uses python-style floor division (quotient rounded toward negative infinity).
bool m_pythondiv;
size_t hash() const override {
size_t seed = primitive::hash();
@ -182,6 +191,7 @@ struct eltwise : public primitive_base<eltwise> {
for (auto& s : stride) {
seed = cldnn::hash_combine(seed, s.hash());
}
seed = cldnn::hash_combine(seed, m_pythondiv);
return seed;
}
@ -194,7 +204,8 @@ struct eltwise : public primitive_base<eltwise> {
return mode == rhs_casted.mode &&
coefficients == rhs_casted.coefficients &&
broadcast_spec == rhs_casted.broadcast_spec &&
stride == rhs_casted.stride;
stride == rhs_casted.stride &&
m_pythondiv == rhs_casted.m_pythondiv;
}
void save(BinaryOutputBuffer& ob) const override {
@ -202,7 +213,8 @@ struct eltwise : public primitive_base<eltwise> {
ob << make_data(&mode, sizeof(eltwise_mode));
ob << coefficients;
ob << stride;
ob << make_data(&broadcast_spec, sizeof(ov::op::AutoBroadcastSpec));;
ob << make_data(&broadcast_spec, sizeof(ov::op::AutoBroadcastSpec));
ob << m_pythondiv;
}
void load(BinaryInputBuffer& ib) override {
@ -210,7 +222,8 @@ struct eltwise : public primitive_base<eltwise> {
ib >> make_data(&mode, sizeof(eltwise_mode));
ib >> coefficients;
ib >> stride;
ib >> make_data(&broadcast_spec, sizeof(ov::op::AutoBroadcastSpec));;
ib >> make_data(&broadcast_spec, sizeof(ov::op::AutoBroadcastSpec));
ib >> m_pythondiv;
}
};
} // namespace cldnn

View File

@ -1123,7 +1123,7 @@ std::shared_ptr<kernel_selector::fuse_params> convert_fuse_params(std::shared_pt
} else if (p->type() == eltwise::type_id()) {
auto casted = std::dynamic_pointer_cast<EltwiseFuseParams>(p);
kernel_selector::eltwise_mode mode = convert_to_eltwise_mode(casted->_desc->mode);
return std::make_shared<kernel_selector::eltwise_fuse_params>(mode);
return std::make_shared<kernel_selector::eltwise_fuse_params>(mode, casted->_desc->m_pythondiv);
} else if (p->type() == quantize::type_id()) {
auto casted = std::dynamic_pointer_cast<QuantizeFuseParams>(p);
return std::make_shared<kernel_selector::quantize_fuse_params>(casted->_scale_shift_opt,

View File

@ -1707,6 +1707,7 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
auto vec_size = conf.vec_size;
std::string shuffle_var = conf.shuffle_var_name;
bool is_shuffled = false;
bool floor_integer_div = false;
auto& dep_data = desc.dep_data;
int first_fused_ops_idx = -1;
@ -1738,14 +1739,40 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
in_vars_converted.push_back(in_name);
}
if (desc.GetType() == KernelType::ELTWISE) {
auto p = desc.GetOpParams<eltwise_fuse_params>();
if (!p)
IE_THROW() << "[clDNN] Eltwise fuse params can't be nullptr";
if (p->mode == kernel_selector::EltwiseMode::DIV) {
if (p->m_pythondiv)
floor_integer_div = true;
}
}
auto get_acc_t = [&]() -> Datatype {
std::vector<Datatype> input_types = {desc.output_tensor.GetDType()};
for (auto& dep : dep_data) {
input_types.push_back(dep.data_type);
}
std::vector<Datatype> types_prioritized = { Datatype::F32, Datatype::F16 };
std::vector<Datatype> types_prioritized = { };
if (floor_integer_div) {
if (std::all_of(input_types.begin(), input_types.end(),
[=](const Datatype& t) -> bool { return (t != Datatype::F32 && t != Datatype::F16); })) {
types_prioritized = { Datatype::INT64, Datatype::INT32, Datatype::UINT32, Datatype::INT16, Datatype::UINT16, Datatype::INT8, Datatype::UINT8 };
for (auto& type : types_prioritized) {
if (std::any_of(input_types.begin(), input_types.end(),
[=](const Datatype& t) -> bool { return (t == type); })) {
return type;
}
}
}
}
floor_integer_div = false;
types_prioritized.clear();
types_prioritized = { Datatype::F32, Datatype::F16 };
for (auto& type : types_prioritized) {
if (std::any_of(input_types.begin(), input_types.end(), [=](const Datatype& t) -> bool { return t == type; })) {
return type;
@ -1776,8 +1803,6 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
switch (desc.GetType()) {
case KernelType::ELTWISE: {
auto p = desc.GetOpParams<eltwise_fuse_params>();
if (!p)
throw std::runtime_error("[clDNN] Eltwise fuse params can't be nullptr");
std::string op = "";
switch (p->mode) {
case kernel_selector::EltwiseMode::ADD:
@ -1797,7 +1822,13 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
}
auto tmp_var = out_var + "_tmp";
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = " + input_vars[0] + op + input_vars[1] + ";";
auto acc_t_type = GetType(get_acc_t(), vec_size);
op_decls += "\\\n\t" + acc_t_type + " " + tmp_var + " = " + input_vars[0] + op + input_vars[1] + ";";
if (floor_integer_div) {
auto tmp_var_rem = tmp_var + "_rem";
op_decls += "\\\n\t" + acc_t_type + " " + tmp_var_rem + " = " + input_vars[0] + " % " + input_vars[1] + ";";
op_decls += "\\\n\t" + tmp_var + " -= " + "((" + tmp_var_rem + " != 0 && (" + input_vars[0] + " < 0) != (" + input_vars[1] + " < 0)) ? 1 : 0);";
}
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(tmp_var, vec_size) + ";";
break;
}

View File

@ -90,8 +90,11 @@ struct eltwise_optional_params : optional_params {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Fuse-time parameters carried along when an eltwise primitive is fused into another kernel.
struct eltwise_fuse_params : fuse_params {
EltwiseMode mode; // Arithmetic mode of the fused eltwise (e.g. ADD, SUB, MUL, DIV).
bool m_pythondiv; // For integer DIV: when true, use floor (python-style) division instead of C truncation.
eltwise_fuse_params(EltwiseMode mode) : fuse_params(KernelType::ELTWISE), mode(mode) {} // NOTE(review): leaves m_pythondiv uninitialized — presumably superseded by the two-argument overload below; confirm callers.
eltwise_fuse_params(EltwiseMode mode, bool m_pythondiv) : fuse_params(KernelType::ELTWISE)
, mode(mode)
, m_pythondiv(m_pythondiv) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -36,7 +36,8 @@ namespace intel_gpu {
void CreateElementwiseOp(Program& p,
const std::shared_ptr<ngraph::Node>& op,
cldnn::eltwise_mode mode,
std::vector<float> coefficients) {
std::vector<float> coefficients,
bool pythondiv) {
auto inputs = p.GetInputInfo(op);
std::string layerName = layer_type_name_ID(op);
@ -84,7 +85,8 @@ void CreateElementwiseOp(Program& p,
mode,
std::move(coefficients),
out_dt,
op->get_autob());
op->get_autob(),
pythondiv);
p.add_primitive(*op, eltwisePrim);
}
@ -110,7 +112,7 @@ static void CreateSubtractOp(Program& p, const std::shared_ptr<ngraph::op::v1::S
}
static void CreateDivideOp(Program& p, const std::shared_ptr<ngraph::op::v1::Divide>& op) {
CreateElementwiseOp(p, op, cldnn::eltwise_mode::div);
CreateElementwiseOp(p, op, cldnn::eltwise_mode::div, {}, op->is_pythondiv());
}
static void CreateSquaredDifferenceOp(Program& p, const std::shared_ptr<ngraph::op::v0::SquaredDifference>& op) {

View File

@ -7,6 +7,7 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/eltwise.hpp>
#include <intel_gpu/primitives/gather.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/data.hpp>
@ -2344,6 +2345,76 @@ TEST(eltwise_gpu_int, basic_in4x4x4x4) {
}
}
// Verifies python-style (floor) integer division for an eltwise DIV placed after a gather.
// With optimize_data(true) the DIV is expected to be fused into the gather kernel
// (presumably — the fusing itself is driven by the optimizer; the test checks results only),
// and negative quotients with a nonzero remainder must be rounded toward negative
// infinity, matching the reference computation in the loop at the bottom.
TEST(eltwise_gpu_int, div_gather_fusing) {
auto& engine = get_test_engine();
auto input1 = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 3, 2, 1, 2 } }); // Dictionary
auto input2 = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 2, 2, 1, 1 } }); // Indexes
auto input3 = engine.allocate_memory({ data_types::i32, format::bfyx, { 2, 2, 2, 2 } }); // 2nd input of eltwise
// Dictionary values: mixed signs so that floor vs. truncating division differ.
set_values(input1, {
5, 6, 7, 8,
-5, -6, -7, -8,
9, 10, 11, 12
});
// Gather indices into the dictionary's first axis.
set_values(input2, {
0, 1,
2, 1
});
// Divisors: both positive and negative, so every sign combination is exercised.
std::vector<int32_t> input_3_vec = {
2, 2, 2, 2,
2, 2, 2, 2,
-2, -2, -2, -2,
-2, -2, -2, -2
};
set_values(input3, input_3_vec);
topology topology;
topology.add(input_layout("InputDictionary", input1->get_layout()));
topology.add(input_layout("InputText", input2->get_layout()));
topology.add(input_layout("Input3", input3->get_layout()));
topology.add(gather("gather", input_info("InputDictionary"), input_info("InputText"), 0, ov::Shape{2, 2, 2, 2}));
topology.add(reorder("gather_reorder", input_info("gather"), { data_types::i32, format::bfyx, { 2, 2, 2, 2 } }));
// pythondiv = true: request floor semantics for the integer division.
topology.add(eltwise("eltwise",
{ input_info("gather_reorder"), input_info("Input3") },
eltwise_mode::div,
std::vector<float>(0),
data_types::i32,
DEFAULT_BROADCAST_SPEC,
true));
topology.add(reorder("eltwise_reorder", input_info("eltwise"), { data_types::i32, format::bfyx, { 2, 2, 2, 2 } }));
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true)); // enables the fusing path under test
network network(engine, topology, config);
network.set_input_data("InputDictionary", input1);
network.set_input_data("InputText", input2);
network.set_input_data("Input3", input3);
auto outputs = network.execute();
auto output = outputs.at("eltwise_reorder").get_memory();
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
// Dictionary rows selected by indices {0, 1, 2, 1} along axis 0.
std::vector<int32_t> gather_expected_results = {
5, 6, 7, 8,
-5, -6, -7, -8,
9, 10, 11, 12,
-5, -6, -7, -8
};
for (size_t i = 0; i < 16; ++i) {
// Reference floor division built from C++ truncating '/' and '%':
// when the remainder is nonzero and operand signs differ, truncation
// rounded toward zero, so subtract 1 to round toward negative infinity.
auto expected = gather_expected_results[i] / input_3_vec[i];
auto rem = gather_expected_results[i] % input_3_vec[i];
if (rem != 0 && (gather_expected_results[i] < 0) != (input_3_vec[i] < 0))
expected -= 1;
ASSERT_EQ(expected, output_ptr[i]);
}
}
TEST(eltwise_gpu_f32_int, basic_in4x4x4x4) {
// Same params as in eltwise_gpu_f32, sub_basic_in4x4x4x4 but using int types for first input.
//

View File

@ -24,7 +24,7 @@ using namespace tests;
class check_hash_value: public ::testing::Test {
public:
void test_eltwise_basic (bool is_caching_test) {
void test_eltwise_basic(bool is_caching_test) {
auto& engine = get_test_engine();
auto input1 = engine.allocate_memory({ { 2, 2, 2, 2 }, data_types::f32, format::bfyx });
@ -43,8 +43,8 @@ public:
const auto primitive_hash = primitve->hash();
const auto params_hash = prim_inst->get_impl_params()->hash();
ASSERT_EQ(primitive_hash, 11385140218618178073UL);
ASSERT_EQ(params_hash, 15305755526697935028UL);
ASSERT_EQ(primitive_hash, 4145865612957978777UL);
ASSERT_EQ(params_hash, 10122138955874758498UL);
}
void test_fc_basic(bool is_caching_test) {