[GPU] Apply m_pythondiv for fusing of eltwise div (#17590)
parent acb4b1d37b
commit ec0daa5b10
@@ -195,7 +195,8 @@ void CreateUnaryEltwiseOp(Program& p, const std::shared_ptr<ngraph::Node>& node,
 void CreateElementwiseOp(Program& p,
                          const std::shared_ptr<ngraph::Node>& node,
                          cldnn::eltwise_mode mode,
-                         std::vector<float> coefficients = {});
+                         std::vector<float> coefficients = {},
+                         bool pythondiv = true);
 
 bool IsNodeOnConstPath(const std::shared_ptr<ngraph::Node>& node);
 
@@ -86,7 +86,8 @@ struct eltwise : public primitive_base<eltwise> {
           mode(mode),
           coefficients(std::vector<float>(0)),
           stride(std::vector<tensor>(0)),
-          broadcast_spec(spec.m_type, spec.m_axis) { }
+          broadcast_spec(spec.m_type, spec.m_axis),
+          m_pythondiv(true) { }
 
     /// @brief Constructs eltwise primitive.
     /// @param id This primitive id.
@@ -106,7 +107,8 @@ struct eltwise : public primitive_base<eltwise> {
           mode(mode),
           coefficients(std::vector<float>(0)),
           stride(stride),
-          broadcast_spec(spec.m_type, spec.m_axis) { }
+          broadcast_spec(spec.m_type, spec.m_axis),
+          m_pythondiv(true) { }
 
     /// @brief Constructs eltwise primitive.
     /// @param id This primitive id.
@@ -124,7 +126,8 @@ struct eltwise : public primitive_base<eltwise> {
           mode(mode),
           coefficients(std::vector<float>(0)),
           stride(std::vector<tensor>(0)),
-          broadcast_spec(spec.m_type, spec.m_axis) { }
+          broadcast_spec(spec.m_type, spec.m_axis),
+          m_pythondiv(true) { }
 
     /// @brief Constructs eltwise primitive.
     /// @param id This primitive id.
@@ -140,7 +143,8 @@ struct eltwise : public primitive_base<eltwise> {
           mode(mode),
           coefficients(std::vector<float>(0)),
           stride(std::vector<tensor>(0)),
-          broadcast_spec(spec.m_type, spec.m_axis) { }
+          broadcast_spec(spec.m_type, spec.m_axis),
+          m_pythondiv(true) { }
 
     /// @brief Constructs eltwise primitive.
     /// @param id This primitive id.
@@ -149,18 +153,21 @@ struct eltwise : public primitive_base<eltwise> {
     /// @param coefficients Blob-wise coefficient.
     /// @param data_type Expected output data type.
    /// @param spec Auto broadcast rule specificiation.
+    /// @param m_pythondiv Specifies if floor division should be calculate. Supported only for integer data types.
     eltwise(const primitive_id& id,
             const std::vector<input_info>& inputs,
             eltwise_mode mode,
             std::vector<float> coeffs,
             data_types data_type,
             const ov::op::AutoBroadcastSpec& spec = ov::op::AutoBroadcastSpec(ov::op::AutoBroadcastType::NUMPY),
+            bool m_pythondiv = true,
             const padding& output_padding = padding())
         : primitive_base(id, inputs, {output_padding}, {optional_data_type{data_type}}),
           mode(mode),
           coefficients(std::move(coeffs)),
           stride(std::vector<tensor>(0)),
-          broadcast_spec(spec.m_type, spec.m_axis) {
+          broadcast_spec(spec.m_type, spec.m_axis),
+          m_pythondiv(m_pythondiv) {
         if (mode == eltwise_mode::sum && !coefficients.empty() && coefficients.size() != inputs.size()) {
             throw std::invalid_argument("Invalid eltwise sum coefficients count (should be equal to 0 or input.size)");
         }
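For orientation, the widened constructor above is how a caller opts into floor-division semantics for a divide that is later fused; a minimal construction sketch against that signature (primitive and input ids are illustrative, not taken from the patch):

    // Sketch: request an integer eltwise div with Python-style (floor) division
    // via the new trailing m_pythondiv argument of the constructor shown above.
    cldnn::eltwise div_prim("div",
                            { cldnn::input_info("lhs"), cldnn::input_info("rhs") },
                            cldnn::eltwise_mode::div,
                            std::vector<float>(0),   // no per-input sum coefficients
                            cldnn::data_types::i32,  // integer output, so floor semantics apply
                            ov::op::AutoBroadcastSpec(ov::op::AutoBroadcastType::NUMPY),
                            true);                   // m_pythondiv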
@@ -174,6 +181,8 @@ struct eltwise : public primitive_base<eltwise> {
     std::vector<tensor> stride;
     /// @brief Define auto broadcast rule specification.
     ov::op::AutoBroadcastSpec broadcast_spec;
+    /// @brief Define m_pythondiv.
+    bool m_pythondiv;
 
     size_t hash() const override {
         size_t seed = primitive::hash();
@@ -182,6 +191,7 @@ struct eltwise : public primitive_base<eltwise> {
         for (auto& s : stride) {
             seed = cldnn::hash_combine(seed, s.hash());
         }
+        seed = cldnn::hash_combine(seed, m_pythondiv);
         return seed;
     }
 
@@ -194,7 +204,8 @@ struct eltwise : public primitive_base<eltwise> {
         return mode == rhs_casted.mode &&
                coefficients == rhs_casted.coefficients &&
                broadcast_spec == rhs_casted.broadcast_spec &&
-               stride == rhs_casted.stride;
+               stride == rhs_casted.stride &&
+               m_pythondiv == rhs_casted.m_pythondiv;
     }
 
     void save(BinaryOutputBuffer& ob) const override {
@@ -202,7 +213,8 @@ struct eltwise : public primitive_base<eltwise> {
         ob << make_data(&mode, sizeof(eltwise_mode));
         ob << coefficients;
         ob << stride;
-        ob << make_data(&broadcast_spec, sizeof(ov::op::AutoBroadcastSpec));;
+        ob << make_data(&broadcast_spec, sizeof(ov::op::AutoBroadcastSpec));
+        ob << m_pythondiv;
     }
 
     void load(BinaryInputBuffer& ib) override {
@@ -210,7 +222,8 @@ struct eltwise : public primitive_base<eltwise> {
         ib >> make_data(&mode, sizeof(eltwise_mode));
         ib >> coefficients;
         ib >> stride;
-        ib >> make_data(&broadcast_spec, sizeof(ov::op::AutoBroadcastSpec));;
+        ib >> make_data(&broadcast_spec, sizeof(ov::op::AutoBroadcastSpec));
+        ib >> m_pythondiv;
     }
 };
 } // namespace cldnn
@@ -1123,7 +1123,7 @@ std::shared_ptr<kernel_selector::fuse_params> convert_fuse_params(std::shared_pt
     } else if (p->type() == eltwise::type_id()) {
         auto casted = std::dynamic_pointer_cast<EltwiseFuseParams>(p);
         kernel_selector::eltwise_mode mode = convert_to_eltwise_mode(casted->_desc->mode);
-        return std::make_shared<kernel_selector::eltwise_fuse_params>(mode);
+        return std::make_shared<kernel_selector::eltwise_fuse_params>(mode, casted->_desc->m_pythondiv);
     } else if (p->type() == quantize::type_id()) {
         auto casted = std::dynamic_pointer_cast<QuantizeFuseParams>(p);
         return std::make_shared<kernel_selector::quantize_fuse_params>(casted->_scale_shift_opt,
@@ -1707,6 +1707,7 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
     auto vec_size = conf.vec_size;
     std::string shuffle_var = conf.shuffle_var_name;
     bool is_shuffled = false;
+    bool floor_integer_div = false;
 
     auto& dep_data = desc.dep_data;
     int first_fused_ops_idx = -1;
@@ -1738,14 +1739,40 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
         in_vars_converted.push_back(in_name);
     }
 
+    if (desc.GetType() == KernelType::ELTWISE) {
+        auto p = desc.GetOpParams<eltwise_fuse_params>();
+        if (!p)
+            IE_THROW() << "[clDNN] Eltwise fuse params can't be nullptr";
+
+        if (p->mode == kernel_selector::EltwiseMode::DIV) {
+            if (p->m_pythondiv)
+                floor_integer_div = true;
+        }
+    }
+
     auto get_acc_t = [&]() -> Datatype {
         std::vector<Datatype> input_types = {desc.output_tensor.GetDType()};
         for (auto& dep : dep_data) {
             input_types.push_back(dep.data_type);
         }
 
-        std::vector<Datatype> types_prioritized = { Datatype::F32, Datatype::F16 };
+        std::vector<Datatype> types_prioritized = { };
+        if (floor_integer_div) {
+            if (std::all_of(input_types.begin(), input_types.end(),
+                [=](const Datatype& t) -> bool { return (t != Datatype::F32 && t != Datatype::F16); })) {
+                types_prioritized = { Datatype::INT64, Datatype::INT32, Datatype::UINT32, Datatype::INT16, Datatype::UINT16, Datatype::INT8, Datatype::UINT8 };
+                for (auto& type : types_prioritized) {
+                    if (std::any_of(input_types.begin(), input_types.end(),
+                        [=](const Datatype& t) -> bool { return (t == type); })) {
+                        return type;
+                    }
+                }
+            }
+        }
+
+        floor_integer_div = false;
+        types_prioritized.clear();
+        types_prioritized = { Datatype::F32, Datatype::F16 };
         for (auto& type : types_prioritized) {
             if (std::any_of(input_types.begin(), input_types.end(), [=](const Datatype& t) -> bool { return t == type; })) {
                 return type;
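Restated outside the generator, the accumulator-type rule above keeps floor division only when neither the fused output nor any input is F32/F16, and then accumulates in the widest integer type that is present; otherwise it falls back to F32/F16 and clears the floor flag. A compact standalone sketch of that rule, with data types reduced to a plain enum for illustration (names below are not from the patch, and the real code's further fallbacks are omitted):

    #include <algorithm>
    #include <initializer_list>
    #include <vector>

    enum class DT { F32, F16, INT64, INT32, UINT32, INT16, UINT16, INT8, UINT8 };

    // Illustrative restatement of get_acc_t(): prefer an integer accumulator
    // when floor division is requested and no floating-point type is involved.
    DT pick_acc_type(const std::vector<DT>& input_types, bool& floor_integer_div) {
        auto has = [&](DT t) {
            return std::any_of(input_types.begin(), input_types.end(),
                               [=](DT x) { return x == t; });
        };
        bool all_integral = std::none_of(input_types.begin(), input_types.end(),
                                         [](DT t) { return t == DT::F32 || t == DT::F16; });
        if (floor_integer_div && all_integral) {
            for (DT t : { DT::INT64, DT::INT32, DT::UINT32, DT::INT16, DT::UINT16, DT::INT8, DT::UINT8 })
                if (has(t))
                    return t;
        }
        floor_integer_div = false;                // float fallback: drop the floor adjustment
        return has(DT::F32) ? DT::F32 : DT::F16;  // further fallbacks omitted in this sketch
    }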
@@ -1776,8 +1803,6 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
     switch (desc.GetType()) {
         case KernelType::ELTWISE: {
             auto p = desc.GetOpParams<eltwise_fuse_params>();
-            if (!p)
-                throw std::runtime_error("[clDNN] Eltwise fuse params can't be nullptr");
             std::string op = "";
             switch (p->mode) {
                 case kernel_selector::EltwiseMode::ADD:
@@ -1797,7 +1822,13 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
             }
 
             auto tmp_var = out_var + "_tmp";
-            op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = " + input_vars[0] + op + input_vars[1] + ";";
+            auto acc_t_type = GetType(get_acc_t(), vec_size);
+            op_decls += "\\\n\t" + acc_t_type + " " + tmp_var + " = " + input_vars[0] + op + input_vars[1] + ";";
+            if (floor_integer_div) {
+                auto tmp_var_rem = tmp_var + "_rem";
+                op_decls += "\\\n\t" + acc_t_type + " " + tmp_var_rem + " = " + input_vars[0] + " % " + input_vars[1] + ";";
+                op_decls += "\\\n\t" + tmp_var + " -= " + "((" + tmp_var_rem + " != 0 && (" + input_vars[0] + " < 0) != (" + input_vars[1] + " < 0)) ? 1 : 0);";
+            }
             op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(tmp_var, vec_size) + ";";
             break;
         }
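For reference, the remainder-based adjustment emitted above turns OpenCL's truncating integer division into Python-style floor division; the same arithmetic as a standalone C++ sketch (function name is illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Floor division built from C/C++'s truncate-toward-zero '/' and '%':
    // subtract 1 from the truncated quotient when the remainder is non-zero
    // and the operands have opposite signs.
    int32_t python_floor_div(int32_t a, int32_t b) {
        int32_t quot = a / b;   // e.g. -7 / 2 == -3 (truncated)
        int32_t rem  = a % b;   // e.g. -7 % 2 == -1
        if (rem != 0 && ((a < 0) != (b < 0)))
            quot -= 1;          // -7 / 2 becomes -4, matching Python's -7 // 2
        return quot;
    }

    int main() {
        assert(python_floor_div(-7,  2) == -4);
        assert(python_floor_div( 7,  2) ==  3);
        assert(python_floor_div( 7, -2) == -4);
        assert(python_floor_div(-8,  2) == -4);
        return 0;
    }

This is the same correction the new test below applies when computing its expected values.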
@@ -90,8 +90,11 @@ struct eltwise_optional_params : optional_params {
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 struct eltwise_fuse_params : fuse_params {
     EltwiseMode mode;
+    bool m_pythondiv;
 
-    eltwise_fuse_params(EltwiseMode mode) : fuse_params(KernelType::ELTWISE), mode(mode) {}
+    eltwise_fuse_params(EltwiseMode mode, bool m_pythondiv) : fuse_params(KernelType::ELTWISE)
+    , mode(mode)
+    , m_pythondiv(m_pythondiv) {}
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -36,7 +36,8 @@ namespace intel_gpu {
 void CreateElementwiseOp(Program& p,
                          const std::shared_ptr<ngraph::Node>& op,
                          cldnn::eltwise_mode mode,
-                         std::vector<float> coefficients) {
+                         std::vector<float> coefficients,
+                         bool pythondiv) {
     auto inputs = p.GetInputInfo(op);
     std::string layerName = layer_type_name_ID(op);
 
@@ -84,7 +85,8 @@ void CreateElementwiseOp(Program& p,
                                 mode,
                                 std::move(coefficients),
                                 out_dt,
-                                op->get_autob());
+                                op->get_autob(),
+                                pythondiv);
 
     p.add_primitive(*op, eltwisePrim);
 }
@@ -110,7 +112,7 @@ static void CreateSubtractOp(Program& p, const std::shared_ptr<ngraph::op::v1::S
 }
 
 static void CreateDivideOp(Program& p, const std::shared_ptr<ngraph::op::v1::Divide>& op) {
-    CreateElementwiseOp(p, op, cldnn::eltwise_mode::div);
+    CreateElementwiseOp(p, op, cldnn::eltwise_mode::div, {}, op->is_pythondiv());
 }
 
 static void CreateSquaredDifferenceOp(Program& p, const std::shared_ptr<ngraph::op::v0::SquaredDifference>& op) {
@@ -7,6 +7,7 @@
 
 #include <intel_gpu/primitives/input_layout.hpp>
 #include <intel_gpu/primitives/eltwise.hpp>
+#include <intel_gpu/primitives/gather.hpp>
 #include <intel_gpu/primitives/reorder.hpp>
 #include <intel_gpu/primitives/data.hpp>
 
@@ -2344,6 +2345,76 @@ TEST(eltwise_gpu_int, basic_in4x4x4x4) {
     }
 }
 
+TEST(eltwise_gpu_int, div_gather_fusing) {
+    auto& engine = get_test_engine();
+
+    auto input1 = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 3, 2, 1, 2 } }); // Dictionary
+    auto input2 = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 2, 2, 1, 1 } }); // Indexes
+    auto input3 = engine.allocate_memory({ data_types::i32, format::bfyx, { 2, 2, 2, 2 } }); // 2nd input of eltwise
+
+    set_values(input1, {
+        5, 6, 7, 8,
+        -5, -6, -7, -8,
+        9, 10, 11, 12
+    });
+
+    set_values(input2, {
+        0, 1,
+        2, 1
+    });
+
+    std::vector<int32_t> input_3_vec = {
+        2, 2, 2, 2,
+        2, 2, 2, 2,
+        -2, -2, -2, -2,
+        -2, -2, -2, -2
+    };
+    set_values(input3, input_3_vec);
+
+    topology topology;
+    topology.add(input_layout("InputDictionary", input1->get_layout()));
+    topology.add(input_layout("InputText", input2->get_layout()));
+    topology.add(input_layout("Input3", input3->get_layout()));
+    topology.add(gather("gather", input_info("InputDictionary"), input_info("InputText"), 0, ov::Shape{2, 2, 2, 2}));
+    topology.add(reorder("gather_reorder", input_info("gather"), { data_types::i32, format::bfyx, { 2, 2, 2, 2 } }));
+    topology.add(eltwise("eltwise",
+                         { input_info("gather_reorder"), input_info("Input3") },
+                         eltwise_mode::div,
+                         std::vector<float>(0),
+                         data_types::i32,
+                         DEFAULT_BROADCAST_SPEC,
+                         true));
+    topology.add(reorder("eltwise_reorder", input_info("eltwise"), { data_types::i32, format::bfyx, { 2, 2, 2, 2 } }));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    network.set_input_data("InputDictionary", input1);
+    network.set_input_data("InputText", input2);
+    network.set_input_data("Input3", input3);
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("eltwise_reorder").get_memory();
+    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
+
+    std::vector<int32_t> gather_expected_results = {
+        5, 6, 7, 8,
+        -5, -6, -7, -8,
+        9, 10, 11, 12,
+        -5, -6, -7, -8
+    };
+
+    for (size_t i = 0; i < 16; ++i) {
+        auto expected = gather_expected_results[i] / input_3_vec[i];
+        auto rem = gather_expected_results[i] % input_3_vec[i];
+        if (rem != 0 && (gather_expected_results[i] < 0) != (input_3_vec[i] < 0))
+            expected -= 1;
+        ASSERT_EQ(expected, output_ptr[i]);
+    }
+}
+
 TEST(eltwise_gpu_f32_int, basic_in4x4x4x4) {
     // Same params as in eltwise_gpu_f32, sub_basic_in4x4x4x4 but using int types for first input.
     //
@@ -24,7 +24,7 @@ using namespace tests;
 
 class check_hash_value: public ::testing::Test {
 public:
-    void test_eltwise_basic (bool is_caching_test) {
+    void test_eltwise_basic(bool is_caching_test) {
         auto& engine = get_test_engine();
 
         auto input1 = engine.allocate_memory({ { 2, 2, 2, 2 }, data_types::f32, format::bfyx });
@@ -43,8 +43,8 @@ public:
         const auto primitive_hash = primitve->hash();
         const auto params_hash = prim_inst->get_impl_params()->hash();
 
-        ASSERT_EQ(primitive_hash, 11385140218618178073UL);
-        ASSERT_EQ(params_hash, 15305755526697935028UL);
+        ASSERT_EQ(primitive_hash, 4145865612957978777UL);
+        ASSERT_EQ(params_hash, 10122138955874758498UL);
     }
 
     void test_fc_basic(bool is_caching_test) {