[GPU] Fixed safe index func for per-channel case (#10136)

Co-authored-by: Ilya Znamenskiy <ilya.znamenskiy@intel.com>
This commit is contained in:
Vladimir Paramuzov
2022-02-07 09:59:52 +03:00
committed by GitHub
parent a2ca1d4499
commit 12746efbe5
3 changed files with 73 additions and 4 deletions

View File

@@ -454,15 +454,18 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
// We support broadcast only if corresponding dimension is equal to 1.
// Otherwise, dimensions should be equal and using "f" should be safe.
if (_tensor.PitchesDifferFromLogicalDims() && _tensor.SimpleLayout()) {
std::string f_pitch = toCodeString(_tensor.Feature().pitch);
definitions.push_back({ safe_index_func_name, "(" + offset + " + (f) * " + f_pitch + ")" });
auto f_pitch = toCodeString(_tensor.Feature().pitch);
auto f_size = toCodeString(_tensor.Feature().v);
definitions.push_back({ safe_index_func_name, "(" + offset + " + ((f) % " + f_size + ") * " + f_pitch + ")" });
definitions.push_back({ index_func_name, "(" + offset + " + (f) * " + f_pitch + ")" });
} else if (_tensor.PitchesDifferFromLogicalDims()) {
// TODO This should be solved differently, by setting the macro arguments to zero
definitions.push_back({ safe_index_func_name, safe_index_func_val });
definitions.push_back({ index_func_name, index_func_val });
} else {
definitions.push_back({ safe_index_func_name, "(" + toCodeString(_tensor.Feature().pad.before) + " + (f))" });
auto f_pad = toCodeString(_tensor.Feature().pad.before);
auto f_size = toCodeString(_tensor.Feature().v);
definitions.push_back({ safe_index_func_name, "((" + f_pad + " + (f)) % " + f_size + ")" });
definitions.push_back({ index_func_name, "(" + toCodeString(_tensor.Feature().pad.before) + " + (f))" });
}
} else {

View File

@@ -28,6 +28,8 @@
#include <transformations/rt_info/fused_names_attribute.hpp>
#include "openvino/pass/serialize.hpp"
#include "intel_gpu/runtime/device_query.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include <performance_heuristics.hpp>
@@ -103,7 +105,8 @@ InferenceEngine::CNNNetwork Plugin::CloneAndTransformNetwork(const InferenceEngi
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) {
clonedNetwork.serialize(debug_config->dump_graphs + "/" + network.getName() + "_" + "transformed_func.xml");
auto path_base = debug_config->dump_graphs + "/" + network.getName() + "_" + "transformed_func";
ov::pass::Serialize(path_base + ".xml", path_base + ".bin").run_on_model(clonedNetwork.getFunction());
}
return clonedNetwork;
}

View File

@@ -2578,6 +2578,69 @@ TEST(eltwise_gpu_f16, fs_b_yx_fsv32_broadcast)
}
}
// Multiplies a 2x72x10x10 f16 tensor by a 1x72x1x1 f16 tensor twice:
//   * once with both operands kept in bfyx (reference network);
//   * once with the first operand reordered to fs_b_yx_fsv32 before the eltwise
//     and the result reordered back to bfyx.
// Both outputs are required to match element by element.
TEST(eltwise_gpu_f16, fs_b_yx_fsv32_broadcast_bfyx)
{
    auto& engine = get_test_engine();
    if (!engine.get_device_info().supports_fp16) {
        std::cout << "[ SKIPPED ] float16 combinations are skipped (cl_khr_fp16 is not supported)." << std::endl;
        return;
    }

    // Problem dimensions; the second operand only varies along the feature axis.
    size_t batch = 2;
    size_t features = 72;
    size_t height = 10, width = 10;

    tensor full_shape(batch, features, width, height);
    tensor per_feature_shape(1, features, 1, 1);

    // Random input data, generated in the same order as the buffers are filled below.
    VVVVF<FLOAT16> full_values = generate_random_4d<FLOAT16>(batch, features, height, width, 1, 3);
    VVVVF<FLOAT16> per_feature_values = generate_random_4d<FLOAT16>(1, features, 1, 1, 1, 3);

    VF<FLOAT16> full_flat = flatten_4d<FLOAT16>(format::bfyx, full_values);
    VF<FLOAT16> per_feature_flat = flatten_4d<FLOAT16>(format::bfyx, per_feature_values);

    auto full_mem = engine.allocate_memory({ data_types::f16, format::bfyx, full_shape });
    auto per_feature_mem = engine.allocate_memory({ data_types::f16, format::bfyx, per_feature_shape });

    set_values(full_mem, full_flat);
    set_values(per_feature_mem, per_feature_flat);

    // Both topologies declare their inputs with the layouts of the allocated buffers.
    const auto full_layout = full_mem->get_layout();
    const auto per_feature_layout = per_feature_mem->get_layout();

    // Reference: eltwise product computed directly on the bfyx inputs.
    topology plain_topology;
    plain_topology.add(input_layout("input1", full_layout));
    plain_topology.add(input_layout("input2", per_feature_layout));
    plain_topology.add(eltwise("eltwise", "input1", "input2", eltwise_mode::prod));

    network plain_network(engine, plain_topology);
    plain_network.set_input_data("input1", full_mem);
    plain_network.set_input_data("input2", per_feature_mem);

    auto plain_results = plain_network.execute();
    auto expected_mem = plain_results.at("eltwise").get_memory();
    cldnn::mem_lock<FLOAT16> expected(expected_mem, get_test_stream());

    // Network under test: first operand goes through fs_b_yx_fsv32, second stays in bfyx,
    // and the product is converted back to bfyx so it can be compared with the reference.
    topology blocked_topology;
    blocked_topology.add(input_layout("input1", full_layout));
    blocked_topology.add(input_layout("input2", per_feature_layout));
    blocked_topology.add(reorder("reorder1", "input1", layout(data_types::f16, format::fs_b_yx_fsv32, full_shape)));
    blocked_topology.add(eltwise("eltwise", "reorder1", "input2", eltwise_mode::prod));
    blocked_topology.add(reorder("reorder_bfyx", "eltwise", layout(data_types::f16, format::bfyx, full_shape)));

    network blocked_network(engine, blocked_topology);
    blocked_network.set_input_data("input1", full_mem);
    blocked_network.set_input_data("input2", per_feature_mem);

    auto blocked_results = blocked_network.execute();
    auto actual_mem = blocked_results.at("reorder_bfyx").get_memory();
    cldnn::mem_lock<FLOAT16> actual(actual_mem, get_test_stream());

    // Same element count first, then exact equality of every value.
    ASSERT_EQ(expected.size(), actual.size());

    for (size_t idx = 0; idx < expected.size(); ++idx) {
        ASSERT_EQ(float(expected[idx]), float(actual[idx]));
    }
}
TEST(eltwise_gpu_f32, broadcast_test_in4x4x2x2x2) {
// Input : 2x2x2x2x1
// Input2 : 2x2x1x1x2