[GPU] Fixed friendly name of out transpose, improve Pad performance (#8546)

* Fixed friendly names in post-processing nodes
* [GPU] Added fsv16 support for pad operation

Parent: 5352c2b370
Commit: 3b34f09a9b
@@ -6,6 +6,7 @@
 #include "ngraph/ops.hpp"
 #include "ngraph_ops/nms_ie_internal.hpp"
 #include "cldnn_itt.h"
+#include "cldnn/runtime/debug_configuration.hpp"
 
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
@@ -231,6 +232,12 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::s
     OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Program::CreateSingleLayerPrimitive");
     InitProfileInfo(op->get_friendly_name(), op->get_type_name());
 
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->verbose >= 2) {
+        GPU_DEBUG_COUT << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
+                       << "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
+    }
+
     bool is_created = false;
     const ngraph::NodeTypeInfo* op_type_info = &op->get_type_info();
     while (op_type_info != nullptr) {
@@ -251,8 +258,8 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::s
 
     if (!is_created) {
         IE_THROW() << "Operation: " << op->get_friendly_name()
-               << " of type " << op->get_type_name()
-               << "(op::v" << op->get_type_info().version << ") is not supported";
+                   << " of type " << op->get_type_name()
+                   << "(op::v" << op->get_type_info().version << ") is not supported";
     }
 }
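For context on the logging added above: GPU_DEBUG_GET_INSTANCE / GPU_DEBUG_IF / GPU_DEBUG_COUT come from the debug_configuration header included in this hunk and gate extra tracing behind a verbosity level read once at startup. A minimal standalone sketch of that pattern (not the plugin's actual implementation; the OV_GPU_Verbose environment-variable name is an assumption):

#include <cstdlib>
#include <iostream>
#include <string>

struct debug_configuration_sketch {
    int verbose = 0;  // 0 = silent, higher values enable more tracing

    static const debug_configuration_sketch* get_instance() {
        static const debug_configuration_sketch cfg = [] {
            debug_configuration_sketch c;
            if (const char* v = std::getenv("OV_GPU_Verbose"))  // env var name assumed
                c.verbose = std::stoi(v);
            return c;
        }();
        return &cfg;
    }
};

int main() {
    const auto* debug_config = debug_configuration_sketch::get_instance();
    // Mirrors GPU_DEBUG_IF(debug_config->verbose >= 2) { GPU_DEBUG_COUT << ...; }
    if (debug_config->verbose >= 2)
        std::cout << "Process op::v1::Pad operation (friendly_name=pad)" << std::endl;
    return 0;
}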
@@ -23,12 +23,16 @@ ParamsKey BorderKernelRef::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::byxf);
     k.EnableInputLayout(DataLayout::bfzyx);
     k.EnableInputLayout(DataLayout::bfwzyx);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
 
     k.EnableOutputLayout(DataLayout::bfyx);
     k.EnableOutputLayout(DataLayout::yxfb);
     k.EnableOutputLayout(DataLayout::byxf);
     k.EnableOutputLayout(DataLayout::bfzyx);
     k.EnableOutputLayout(DataLayout::bfwzyx);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
 
     k.EnableTensorOffset();
     k.EnableTensorPitches();
@@ -5,10 +5,35 @@
 #include "include/batch_headers/data_types.cl"
 #include "include/batch_headers/fetch_data.cl"
 
+inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
+{
+#if INPUT0_DIMS < 5
+    return INPUT0_GET_INDEX(b, f, y, x);
+#elif INPUT0_DIMS == 5
+    return INPUT0_GET_INDEX(b, f, z, y, x);
+#elif INPUT0_DIMS == 6
+    return INPUT0_GET_INDEX(b, f, w, z, y, x);
+#else
+#error [clDNN border_gpu_ref.cl]: input format - not supported
+#endif
+}
+
+inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
+{
+#if OUTPUT_DIMS < 5
+    return OUTPUT_GET_INDEX(b, f, y, x);
+#elif OUTPUT_DIMS == 5
+    return OUTPUT_GET_INDEX(b, f, z, y, x);
+#elif OUTPUT_DIMS == 6
+    return OUTPUT_GET_INDEX(b, f, w, z, y, x);
+#else
+#error [clDNN border_gpu_ref.cl]: output format - not supported
+#endif
+}
+
 KERNEL(border_gpu_ref)(
-    const __global UNIT_TYPE* input,
-    __global UNIT_TYPE* output)
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output)
 {
     // [CONSTEXPR]
     // Border sizes (left-top set and right-bottom set):
@@ -72,7 +97,7 @@ KERNEL(border_gpu_ref)(
     const uint out_w = out_yw / OUTPUT_SIZE_Y;
 
 #ifdef BORDER_TYPE_CONSTANT
-    UNIT_TYPE in_val = TO_UNIT_TYPE(BORDER_VALUE);
+    INPUT0_TYPE in_val = TO_INPUT0_TYPE(BORDER_VALUE);
 
     if (out_x >= blt_sx & out_x < in_lx &
         out_y >= blt_sy & out_y < in_ly &
@@ -88,7 +113,7 @@ KERNEL(border_gpu_ref)(
         const uint in_f = out_f - blt_sf;
         const uint in_b = out_b - blt_sb;
 
-        const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
+        const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
         in_val = input[in_pos];
     }
 #elif defined BORDER_TYPE_EDGE
@@ -99,8 +124,8 @@ KERNEL(border_gpu_ref)(
     const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? 0 : in_sf - 1);
     const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? 0 : in_sb - 1);
 
-    const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
-    UNIT_TYPE in_val = input[in_pos];
+    const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
+    INPUT0_TYPE in_val = input[in_pos];
 #elif defined BORDER_TYPE_MIRROR
     const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - 1 - out_x : in_sx + in_lx - 1 - out_x);
     const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? blt_sy - 1 - out_y : in_sy + in_ly - 1 - out_y);
@@ -109,8 +134,8 @@ KERNEL(border_gpu_ref)(
     const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - 1 - out_f : in_sf + in_lf - 1 - out_f);
     const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - 1 - out_b : in_sb + in_lb - 1 - out_b);
 
-    const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
-    UNIT_TYPE in_val = input[in_pos];
+    const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
+    INPUT0_TYPE in_val = input[in_pos];
 #elif defined BORDER_TYPE_MIRROR_101
     const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - out_x : in_sx + in_lx - 2 - out_x);
     const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? blt_sy - out_y : in_sy + in_ly - 2 - out_y);
@@ -119,12 +144,12 @@ KERNEL(border_gpu_ref)(
     const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - out_f : in_sf + in_lf - 2 - out_f);
     const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - out_b : in_sb + in_lb - 2 - out_b);
 
-    const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
-    UNIT_TYPE in_val = input[in_pos];
+    const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
+    INPUT0_TYPE in_val = input[in_pos];
 #else
 #error Unsupported border type.
 #endif
 
-    const uint out_pos = GET_DATA_INDEX_6D(OUTPUT, out_b, out_f, out_w, out_z, out_y, out_x);
+    const uint out_pos = FUNC_CALL(get_output_index)(out_b, out_f, out_w, out_z, out_y, out_x);
     output[out_pos] = in_val;
 }
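The main functional change in this kernel is replacing the generic GET_DATA_INDEX_6D linearization with the layout-aware INPUT0_GET_INDEX / OUTPUT_GET_INDEX macros, which is what makes blocked layouts such as b_fs_yx_fsv16 work: features are stored in groups of 16 as the innermost dimension, so a plain b-f-y-x offset would address the wrong element. A simplified host-side illustration of the difference (padding ignored; the helper below is illustrative, not the JIT-generated macro):

#include <cstddef>
#include <iostream>

// Plain bfyx linearization: x is innermost.
static std::size_t bfyx_offset(std::size_t b, std::size_t f, std::size_t y, std::size_t x,
                               std::size_t F, std::size_t Y, std::size_t X) {
    return ((b * F + f) * Y + y) * X + x;
}

// b_fs_yx_fsv16: features are split into blocks of 16 and the block index goes
// between b and y, while f % 16 becomes the innermost dimension.
static std::size_t b_fs_yx_fsv16_offset(std::size_t b, std::size_t f, std::size_t y, std::size_t x,
                                        std::size_t F, std::size_t Y, std::size_t X) {
    const std::size_t fsv = 16;
    const std::size_t f_blocks = (F + fsv - 1) / fsv;  // feature dim rounded up to blocks of 16
    return (((b * f_blocks + f / fsv) * Y + y) * X + x) * fsv + f % fsv;
}

int main() {
    // The same logical element (b=0, f=17, y=1, x=2) lands at different offsets.
    std::cout << bfyx_offset(0, 17, 1, 2, 32, 3, 4) << "\n";           // 210
    std::cout << b_fs_yx_fsv16_offset(0, 17, 1, 2, 32, 3, 4) << "\n";  // 289
    return 0;
}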
inference-engine/thirdparty/clDNN/src/border.cpp (vendored, 11 lines changed)
@@ -87,23 +87,12 @@ std::string border_inst::to_string(border_node const& node) {
 border_inst::typed_primitive_inst(network& network, border_node const& node) : parent(network, node) {
     auto input_layout = node.input().get_output_layout();
 
-    const auto input_format = input_layout.format;
     const auto& input_sizes = input_layout.size;
 
     auto lt_sizes = argument.left_top_sizes.sub(tensor(0));
     auto rb_sizes = argument.right_bottom_sizes.sub(tensor(0));
     auto b_type = argument.type;
 
-    CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
-                                  "Input format",
-                                  input_format.value,
-                                  "supported border primitive input formats",
-                                  format::bfyx,
-                                  format::yxfb,
-                                  format::byxf,
-                                  format::bfzyx,
-                                  format::bfwzyx);
-
     tensor null_tensor = tensor(0);
 
     // Check if sizes of border are in proper range.
@@ -72,22 +72,36 @@ attach_border_impl::attach_border_impl() {
         std::make_tuple(data_types::f16, format::yxfb),
         std::make_tuple(data_types::i8, format::yxfb),
         std::make_tuple(data_types::u8, format::yxfb),
 
         std::make_tuple(data_types::f32, format::bfyx),
         std::make_tuple(data_types::f16, format::bfyx),
         std::make_tuple(data_types::i8, format::bfyx),
         std::make_tuple(data_types::u8, format::bfyx),
 
         std::make_tuple(data_types::f32, format::byxf),
         std::make_tuple(data_types::f16, format::byxf),
         std::make_tuple(data_types::i8, format::byxf),
         std::make_tuple(data_types::u8, format::byxf),
 
         std::make_tuple(data_types::f32, format::bfzyx),
         std::make_tuple(data_types::f16, format::bfzyx),
         std::make_tuple(data_types::i8, format::bfzyx),
         std::make_tuple(data_types::u8, format::bfzyx),
 
         std::make_tuple(data_types::f32, format::bfwzyx),
         std::make_tuple(data_types::f16, format::bfwzyx),
         std::make_tuple(data_types::i8, format::bfwzyx),
         std::make_tuple(data_types::u8, format::bfwzyx),
 
+        std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
+
+        std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
     });
 }
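The tuples above register the (data type, format) combinations the OCL border implementation accepts, now including the fsv16 variants. Conceptually the selection is a keyed lookup; a hypothetical condensed model of that idea (not the actual cldnn implementation_map API):

#include <set>
#include <string>
#include <utility>

enum class data_types { f32, f16, i8, u8 };

struct impl_registry_sketch {
    // One entry per supported (data type, format) pair.
    std::set<std::pair<data_types, std::string>> keys;

    void add(data_types dt, std::string fmt) { keys.emplace(dt, std::move(fmt)); }

    bool supported(data_types dt, const std::string& fmt) const {
        return keys.count({dt, fmt}) != 0;
    }
};

int main() {
    impl_registry_sketch border_impl;
    for (auto dt : {data_types::f32, data_types::f16, data_types::i8, data_types::u8}) {
        border_impl.add(dt, "b_fs_yx_fsv16");   // newly registered pairs in this commit
        border_impl.add(dt, "b_fs_zyx_fsv16");
    }
    // A border primitive on an fsv16 tensor can now be dispatched to this implementation.
    return border_impl.supported(data_types::f16, "b_fs_yx_fsv16") ? 0 : 1;
}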
@@ -47,6 +47,7 @@
 #include "lstm_gemm_inst.h"
 #include "mutable_data_inst.h"
 #include "pooling_inst.h"
+#include "border_inst.h"
 #include "primitive_inst.h"
 #include "prior_box_inst.h"
 #include "proposal_inst.h"
@@ -1295,6 +1296,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
         prim.type() != cldnn::input_layout::type_id() &&
         prim.type() != cldnn::softmax::type_id() &&
         prim.type() != cldnn::prior_box::type_id() &&
+        prim.type() != cldnn::border::type_id() &&
         prim.type() != cldnn::resample::type_id() &&
         prim.type() != cldnn::crop::type_id() &&
         prim.type() != cldnn::scale::type_id() &&
@@ -106,6 +106,84 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_constant) {
     }
 }
 
+TEST(border_gpu, basic_fsv16_0x0x1x2_0x0x3x4_border_constant) {
+    //  Input (XY) : 4x3
+    //  Output (XY): 10x7
+
+    constexpr auto in_size_b = 1;
+    constexpr auto in_size_f = 1;
+    constexpr auto in_size_y = 3;
+    constexpr auto in_size_x = 4;
+
+    constexpr auto blt_size_b = 0;
+    constexpr auto blt_size_f = 0;
+    constexpr auto blt_size_y = 1;
+    constexpr auto blt_size_x = 2;
+
+    constexpr auto brb_size_b = 0;
+    constexpr auto brb_size_f = 0;
+    constexpr auto brb_size_y = 3;
+    constexpr auto brb_size_x = 4;
+
+    constexpr auto out_size_b = in_size_b + blt_size_b + brb_size_b;
+    constexpr auto out_size_f = in_size_f + blt_size_f + brb_size_f;
+    constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
+    constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
+
+    auto& engine = get_test_engine();
+    auto input = engine.allocate_memory({data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
+
+    topology topology;
+    topology.add(
+        input_layout("input", input->get_layout())
+    );
+    topology.add(
+        reorder("border_input", "input", cldnn::format::b_fs_yx_fsv16, cldnn::data_types::f32),
+        border("border", "border_input",
+               {blt_size_b, blt_size_f, blt_size_x, blt_size_y},
+               {brb_size_b, brb_size_f, brb_size_x, brb_size_y},
+               border_type::constant, 0.0f),
+        reorder("output", "border", cldnn::format::yxfb, cldnn::data_types::f32)
+    );
+
+    std::vector<float> input_data = {
+        1, -2, 3, -4,
+        5, 6, 7, 8,
+        -10, 12, 13, -13,
+    };
+    std::vector<float> out_data = {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 1, -2, 3, -4, 0, 0, 0, 0,
+        0, 0, 5, 6, 7, 8, 0, 0, 0, 0,
+        0, 0, -10, 12, 13, -13, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    set_values(input, input_data);
+
+    cldnn::network network(engine, topology);
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+
+    auto output = outputs.at("output").get_memory();
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+    ASSERT_EQ(out_data.size(), static_cast<std::size_t>(out_size_b * out_size_f * out_size_y * out_size_x));
+
+    for (auto b = 0; b < out_size_b; ++b) {             // B
+        for (auto f = 0; f < out_size_f; ++f) {         // F
+            for (auto y = 0; y < out_size_y; ++y) {     // Y
+                for (auto x = 0; x < out_size_x; ++x) { // X
+                    auto output_off = ((y * out_size_x + x) * out_size_f + f) * out_size_b + b; // YXFB
+
+                    EXPECT_EQ(output_ptr[output_off], out_data[output_off]);
+                }
+            }
+        }
+    }
+}
+
 TEST(border_gpu, basic_bfzyx_0x0x1x01_0x0x0x0x3_border_constant) {
 
     constexpr auto in_size_b = 1;
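The verification loop in the new test reads the yxfb-reordered output with a hand-computed linear offset; restated on its own for clarity (y outermost, b innermost, matching output_off above):

#include <cassert>
#include <cstddef>

// Linear offset for a yxfb-ordered buffer of shape Y x X x F x B.
static std::size_t yxfb_offset(std::size_t b, std::size_t f, std::size_t y, std::size_t x,
                               std::size_t B, std::size_t F, std::size_t X) {
    return ((y * X + x) * F + f) * B + b;
}

int main() {
    // Matches output_off in the test for out_size_b = 1, out_size_f = 1, out_size_x = 10:
    // row 1, column 2 is the 13th element (index 12) of the flattened expected output.
    assert(yxfb_offset(0, 0, 1, 2, 1, 1, 10) == 12);
    return 0;
}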
@@ -542,10 +542,12 @@ std::shared_ptr<Function> PrePostProcessor::build(const std::shared_ptr<Function
         }
         // Apply post-processing
         node = result->get_input_source_output(0);
+        bool post_processing_applied = false;
         if (output->m_postprocess) {
             for (const auto& action : output->m_postprocess->actions()) {
                 auto action_result = action({node}, context);
                 node = std::get<0>(action_result);
+                post_processing_applied = true;
             }
         }
         // Implicit: Convert element type + layout to user's tensor implicitly
@@ -561,10 +563,18 @@ std::shared_ptr<Function> PrePostProcessor::build(const std::shared_ptr<Function
         for (const auto& action : implicit_steps.actions()) {
             auto action_result = action({node}, context);
             node = std::get<0>(action_result);
+            post_processing_applied = true;
         }
         node.get_node_shared_ptr()->set_friendly_name(
             result->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name());
 
+        // Reset friendly name of the input node to avoid a name collision
+        // when there is at least one new node inserted by the post-processing steps.
+        // If no new nodes are inserted by post-processing, we need to preserve the friendly name of the input,
+        // as it is required for the old API to work correctly.
+        if (post_processing_applied)
+            result->get_input_source_output(0).get_node_shared_ptr()->set_friendly_name("");
+
         // Create result
         auto new_result = std::make_shared<ov::op::v0::Result>(node);
         new_result->set_friendly_name(result->get_friendly_name());
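The friendly-name handling above boils down to: the node that now feeds the Result inherits the original producer's friendly name, and the producer's own name is cleared only when post-processing actually inserted new nodes. A condensed, hypothetical restatement with a simplified stand-in type (not the real ov::Node / preprocess API):

#include <cassert>
#include <string>

struct Node { std::string friendly_name; };

void fix_output_names(Node& original_producer, Node& last_postproc_node,
                      bool post_processing_applied) {
    // The node now feeding the Result takes over the original producer's name,
    // so lookups by friendly name keep resolving to the "same" output.
    last_postproc_node.friendly_name = original_producer.friendly_name;
    // Clear the producer's name only if post-processing really inserted nodes;
    // otherwise producer and last node are the same node and the name must stay
    // for the old API to keep working.
    if (post_processing_applied)
        original_producer.friendly_name.clear();
}

int main() {
    Node producer{"conv_out"}, inserted{"convert"};
    fix_output_names(producer, inserted, /*post_processing_applied=*/true);
    assert(inserted.friendly_name == "conv_out" && producer.friendly_name.empty());
    return 0;
}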