[Core] Conv reference optimization. (#17303)
This commit is contained in:
committed by
GitHub
parent
db58355fad
commit
a6b043b1ca
@@ -78,7 +78,7 @@ namespace v4 {
|
||||
/// \ingroup ov_ops_cpp_api
|
||||
class OPENVINO_API Proposal : public op::v0::Proposal {
|
||||
public:
|
||||
OPENVINO_OP("Proposal", "opset4", op::Op);
|
||||
OPENVINO_OP("Proposal", "opset4", op::v0::Proposal);
|
||||
Proposal() = default;
|
||||
/// \brief Constructs a Proposal operation
|
||||
///
|
||||
|
||||
@@ -4,31 +4,21 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <cfenv>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <future>
|
||||
|
||||
#include "ngraph/axis_vector.hpp"
|
||||
#include "ngraph/coordinate_transform.hpp"
|
||||
#include "ngraph/runtime/reference/concat.hpp"
|
||||
#include "ngraph/runtime/reference/helpers.hpp"
|
||||
#include "ngraph/runtime/reference/reverse.hpp"
|
||||
#include "ngraph/runtime/reference/split.hpp"
|
||||
#include "ngraph/util.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
namespace runtime {
|
||||
namespace reference {
|
||||
namespace {
|
||||
|
||||
constexpr size_t in_batch_axis = 0;
|
||||
constexpr size_t in_channel_axis = 1;
|
||||
constexpr size_t filter_out_ch_axis = 0;
|
||||
constexpr size_t filter_in_ch_axis = 1;
|
||||
constexpr size_t out_batch_axis = 0;
|
||||
constexpr size_t out_channel_axis = 1;
|
||||
constexpr size_t spatial_axis = 2;
|
||||
|
||||
struct ConvolutionParams {
|
||||
std::vector<int64_t> strides;
|
||||
@@ -54,65 +44,164 @@ constexpr inline bool in_range(Int val, std::pair<Int, Int> range) noexcept {
|
||||
return val >= range.first && val < range.second;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void convolve_2D_channels(const ConvolutionParams& p,
|
||||
const T* batch,
|
||||
const Shape& batch_shape,
|
||||
const T* filter,
|
||||
const Shape& filter_shape,
|
||||
T* out) {
|
||||
const int dilation_y = static_cast<int>(p.dilation[0]);
|
||||
const int dilation_x = static_cast<int>(p.dilation[1]);
|
||||
|
||||
const int pad_begin_y = static_cast<int>(p.pads_begin[0]);
|
||||
const int pad_begin_x = static_cast<int>(p.pads_begin[1]);
|
||||
|
||||
const int stride_y = static_cast<int>(p.strides[0]);
|
||||
const int stride_x = static_cast<int>(p.strides[1]);
|
||||
|
||||
const int input_size_y = static_cast<int>(batch_shape[1]);
|
||||
const int input_size_x = static_cast<int>(batch_shape[2]);
|
||||
|
||||
const int input_size_yx = input_size_y * input_size_x;
|
||||
|
||||
const size_t f_channels = filter_shape[0];
|
||||
const int filter_size_y = static_cast<int>(filter_shape[1]);
|
||||
const int filter_size_x = static_cast<int>(filter_shape[2]);
|
||||
|
||||
const int dilated_filter_size_y = static_cast<int>(filter_size_y + (filter_size_y - 1) * (dilation_y - 1));
|
||||
const int dilated_filter_size_x = static_cast<int>(filter_size_x + (filter_size_x - 1) * (dilation_x - 1));
|
||||
|
||||
const int i_y_lim = static_cast<int>(p.pads_end[0] + input_size_y - dilated_filter_size_y + p.output_padding[0]);
|
||||
const int i_x_lim = static_cast<int>(p.pads_end[1] + input_size_x - dilated_filter_size_x + p.output_padding[1]);
|
||||
|
||||
const int f_y_increment = dilation_y * input_size_x;
|
||||
const int f_x_increment = dilation_x;
|
||||
|
||||
const int f_y_block = filter_size_y * f_y_increment;
|
||||
const int f_x_block = filter_size_x * f_x_increment;
|
||||
|
||||
for (int i_y = -pad_begin_y; i_y <= i_y_lim; i_y += stride_y) {
|
||||
const int i_y_m = i_y * input_size_x;
|
||||
const int f_y_up_lim = f_y_block + i_y_m;
|
||||
|
||||
for (int i_x = -pad_begin_x; i_x <= i_x_lim; i_x += stride_x) {
|
||||
const int f_x_up_lim = f_x_block + i_x;
|
||||
auto input_channel = batch;
|
||||
auto filter_channel = filter;
|
||||
T sum = 0;
|
||||
size_t filter_channels_count = f_channels;
|
||||
|
||||
while (filter_channels_count--) {
|
||||
for (int f_y_i = i_y_m; f_y_i < f_y_up_lim; f_y_i += f_y_increment) {
|
||||
if (f_y_i < 0 || f_y_i >= input_size_yx) {
|
||||
filter_channel += filter_size_x;
|
||||
continue;
|
||||
}
|
||||
const int x_up_bound = input_size_x + f_y_i;
|
||||
for (int f_x_i = f_y_i + i_x; f_x_i < f_x_up_lim + f_y_i;
|
||||
f_x_i += f_x_increment, filter_channel++) {
|
||||
if (f_x_i < f_y_i || f_x_i >= x_up_bound) {
|
||||
continue;
|
||||
}
|
||||
|
||||
sum += input_channel[f_x_i] * filter_channel[0];
|
||||
}
|
||||
}
|
||||
input_channel += input_size_yx;
|
||||
}
|
||||
*out = sum;
|
||||
++out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void convolve_3D_channels(const ConvolutionParams& p,
|
||||
const T* batch,
|
||||
const Shape& batch_shape,
|
||||
const T* filter,
|
||||
const Shape& filter_shape,
|
||||
T*& out) {
|
||||
T* out) {
|
||||
const int dilation_z = static_cast<int>(p.dilation[0]);
|
||||
const int dilation_y = static_cast<int>(p.dilation[1]);
|
||||
const int dilation_x = static_cast<int>(p.dilation[2]);
|
||||
|
||||
const int pad_begin_z = static_cast<int>(p.pads_begin[0]);
|
||||
const int pad_begin_y = static_cast<int>(p.pads_begin[1]);
|
||||
const int pad_begin_x = static_cast<int>(p.pads_begin[2]);
|
||||
|
||||
const int stride_z = static_cast<int>(p.strides[0]);
|
||||
const int stride_y = static_cast<int>(p.strides[1]);
|
||||
const int stride_x = static_cast<int>(p.strides[2]);
|
||||
|
||||
const int input_size_z = static_cast<int>(batch_shape[1]);
|
||||
const int input_size_y = static_cast<int>(batch_shape[2]);
|
||||
const int input_size_x = static_cast<int>(batch_shape[3]);
|
||||
|
||||
const int input_size_yx = input_size_y * input_size_x;
|
||||
const int input_size_zyx = input_size_z * input_size_yx;
|
||||
|
||||
const size_t f_channels = filter_shape[0];
|
||||
const int filter_size_z = static_cast<int>(filter_shape[1]);
|
||||
const int filter_size_y = static_cast<int>(filter_shape[2]);
|
||||
const int filter_size_x = static_cast<int>(filter_shape[3]);
|
||||
const int dilated_filter_size_z = static_cast<int>(filter_size_z + (filter_size_z - 1) * (p.dilation[0] - 1));
|
||||
const int dilated_filter_size_y = static_cast<int>(filter_size_y + (filter_size_y - 1) * (p.dilation[1] - 1));
|
||||
const int dilated_filter_size_x = static_cast<int>(filter_size_x + (filter_size_x - 1) * (p.dilation[2] - 1));
|
||||
const int filter_size_yx = filter_size_y * filter_size_x;
|
||||
|
||||
const Shape input_channel_shape(++batch_shape.begin(), batch_shape.end());
|
||||
const size_t input_channel_size = shape_size(input_channel_shape);
|
||||
const Shape filter_channel_shape(++filter_shape.begin(), filter_shape.end());
|
||||
const size_t filter_channel_size = shape_size(filter_channel_shape);
|
||||
const int dilated_filter_size_z = static_cast<int>(filter_size_z + (filter_size_z - 1) * (dilation_z - 1));
|
||||
const int dilated_filter_size_y = static_cast<int>(filter_size_y + (filter_size_y - 1) * (dilation_y - 1));
|
||||
const int dilated_filter_size_x = static_cast<int>(filter_size_x + (filter_size_x - 1) * (dilation_x - 1));
|
||||
|
||||
for (int i_z = static_cast<int>(-p.pads_begin[0]);
|
||||
i_z <= static_cast<int>(p.pads_end[0] + input_size_z - dilated_filter_size_z + p.output_padding[0]);
|
||||
i_z += static_cast<int>(p.strides[0])) {
|
||||
for (int i_y = static_cast<int>(-p.pads_begin[1]);
|
||||
i_y <= static_cast<int>(p.pads_end[1] + input_size_y - dilated_filter_size_y + p.output_padding[1]);
|
||||
i_y += static_cast<int>(p.strides[1])) {
|
||||
for (int i_x = static_cast<int>(-p.pads_begin[2]);
|
||||
i_x <= static_cast<int>(p.pads_end[2] + input_size_x - dilated_filter_size_x + p.output_padding[2]);
|
||||
i_x += static_cast<int>(p.strides[2])) {
|
||||
const int i_z_lim = static_cast<int>(p.pads_end[0] + input_size_z - dilated_filter_size_z + p.output_padding[0]);
|
||||
const int i_y_lim = static_cast<int>(p.pads_end[1] + input_size_y - dilated_filter_size_y + p.output_padding[1]);
|
||||
const int i_x_lim = static_cast<int>(p.pads_end[2] + input_size_x - dilated_filter_size_x + p.output_padding[2]);
|
||||
|
||||
const int f_z_increment = dilation_z * input_size_yx;
|
||||
const int f_y_increment = dilation_y * input_size_x;
|
||||
const int f_x_increment = dilation_x;
|
||||
|
||||
const int f_z_block = filter_size_z * f_z_increment;
|
||||
const int f_y_block = filter_size_y * f_y_increment;
|
||||
const int f_x_block = filter_size_x * f_x_increment;
|
||||
|
||||
for (int i_z = -pad_begin_z; i_z <= i_z_lim; i_z += stride_z) {
|
||||
const int s_z_shift = i_z * input_size_yx;
|
||||
const int f_z_up_bound = f_z_block + s_z_shift;
|
||||
|
||||
for (int i_y = -pad_begin_y; i_y <= i_y_lim; i_y += stride_y) {
|
||||
const int i_y_m = i_y * input_size_x;
|
||||
|
||||
for (int i_x = -pad_begin_x; i_x <= i_x_lim; i_x += stride_x) {
|
||||
auto input_channel = batch;
|
||||
auto filter_channel = filter;
|
||||
T sum = 0;
|
||||
size_t filter_channels_count = filter_shape[0];
|
||||
size_t filter_channels_count = f_channels;
|
||||
|
||||
while (filter_channels_count--) {
|
||||
for (int f_z = 0; f_z < filter_size_z; ++f_z) {
|
||||
for (int f_y = 0; f_y < filter_size_y; ++f_y) {
|
||||
for (int f_x = 0; f_x < filter_size_x; ++f_x) {
|
||||
int rel_i_z = i_z + (f_z * static_cast<int>(p.dilation[0]));
|
||||
int rel_i_y = i_y + (f_y * static_cast<int>(p.dilation[1]));
|
||||
int rel_i_x = i_x + (f_x * static_cast<int>(p.dilation[2]));
|
||||
|
||||
bool padding =
|
||||
!(in_range(rel_i_x, {0, input_size_x}) && in_range(rel_i_y, {0, input_size_y}) &&
|
||||
in_range(rel_i_z, {0, input_size_z}));
|
||||
if (padding)
|
||||
for (int f_z_i = s_z_shift; f_z_i < f_z_up_bound; f_z_i += f_z_increment) {
|
||||
if (f_z_i < 0 || f_z_i >= input_size_zyx) {
|
||||
filter_channel += filter_size_yx;
|
||||
continue;
|
||||
}
|
||||
const int y_up_bound = f_z_i + input_size_yx;
|
||||
const int y_shift = f_z_i + i_y_m;
|
||||
for (int f_y_i = y_shift; f_y_i < f_y_block + y_shift; f_y_i += f_y_increment) {
|
||||
if (f_y_i < f_z_i || f_y_i >= y_up_bound) {
|
||||
filter_channel += filter_size_x;
|
||||
continue;
|
||||
}
|
||||
const int x_up_bound = input_size_x + f_y_i;
|
||||
for (int f_x_i = f_y_i + i_x; f_x_i < f_x_block + f_y_i + i_x;
|
||||
f_x_i += f_x_increment, filter_channel++) {
|
||||
if (f_x_i < f_y_i || f_x_i >= x_up_bound) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int f_buf_idx = (f_z * filter_size_y * filter_size_x) + (f_y * filter_size_x) + f_x;
|
||||
int i_buf_idx =
|
||||
(rel_i_z * input_size_y * input_size_x) + (rel_i_y * input_size_x) + rel_i_x;
|
||||
sum += static_cast<T>(input_channel[i_buf_idx]) *
|
||||
static_cast<T>(filter_channel[f_buf_idx]);
|
||||
sum += input_channel[f_x_i] * filter_channel[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
input_channel += input_channel_size;
|
||||
filter_channel += filter_channel_size;
|
||||
input_channel += input_size_zyx;
|
||||
}
|
||||
*out = sum;
|
||||
++out;
|
||||
@@ -121,6 +210,20 @@ void convolve_3D_channels(const ConvolutionParams& p,
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Pads convolution attributes and shapes so that they describe a
///        convolution with two spatial dimensions.
///
/// When the input has fewer than two spatial dimensions (rank minus 2), the
/// missing entries are inserted in front of the existing spatial entries:
/// 1 for dilations, strides and shape dimensions, 0 for paddings.
/// Inputs that already have two or more spatial dimensions are left untouched.
///
/// \param p            Convolution attributes, extended in place.
/// \param in_shape     Input shape, extended in place.
/// \param filter_shape Filter shape, extended in place.
inline void extend_to_2D(ConvolutionParams& p, Shape& in_shape, Shape& filter_shape) {
    const int spatial_rank = static_cast<int>(in_shape.size() - 2);
    if (spatial_rank >= 2) {
        return;
    }

    const int dims_to_add = 2 - spatial_rank;

    // Shapes get unit-sized dimensions.
    in_shape.insert(std::prev(in_shape.end(), spatial_rank), dims_to_add, 1);
    filter_shape.insert(std::prev(filter_shape.end(), spatial_rank), dims_to_add, 1);

    // Neutral attribute values: unit stride/dilation, zero padding.
    p.strides.insert(std::prev(p.strides.end(), spatial_rank), dims_to_add, 1);
    p.dilation.insert(std::prev(p.dilation.end(), spatial_rank), dims_to_add, 1);
    p.pads_begin.insert(std::prev(p.pads_begin.end(), spatial_rank), dims_to_add, 0);
    p.pads_end.insert(std::prev(p.pads_end.end(), spatial_rank), dims_to_add, 0);
    p.output_padding.insert(std::prev(p.output_padding.end(), spatial_rank), dims_to_add, 0);
}
|
||||
|
||||
inline void extend_to_3D(ConvolutionParams& p, Shape& in_shape, Shape& filter_shape) {
|
||||
int spatial_rank = static_cast<int>(in_shape.size() - 2);
|
||||
if (spatial_rank < 3) {
|
||||
@@ -209,9 +312,7 @@ void convolution(const T* in,
|
||||
const Strides& strides,
|
||||
const Strides& dilations,
|
||||
const CoordinateDiff& pads_begin,
|
||||
const CoordinateDiff& pads_end)
|
||||
|
||||
{
|
||||
const CoordinateDiff& pads_end) {
|
||||
validate_convolution_parameters(in_shape, f_shape, out_shape, strides, dilations, pads_begin, pads_end);
|
||||
|
||||
// here we are converting all param types to int's to avoid arithmetic issues
|
||||
@@ -222,26 +323,83 @@ void convolution(const T* in,
|
||||
// convolution implementation to convolve also in 1D & 2D case
|
||||
Shape input_shape{in_shape};
|
||||
Shape filters_shape{f_shape};
|
||||
if (in_shape.size() < 5) {
|
||||
extend_to_3D(params, input_shape, filters_shape);
|
||||
if (in_shape.size() < 4) {
|
||||
extend_to_2D(params, input_shape, filters_shape);
|
||||
}
|
||||
|
||||
const size_t batches_count = input_shape[in_batch_axis];
|
||||
const Shape batch_shape(++input_shape.begin(), input_shape.end());
|
||||
const size_t batch_size = shape_size(batch_shape);
|
||||
const size_t out_spatial_size =
|
||||
std::accumulate(out_shape.begin() + 2, out_shape.end(), size_t(1), std::multiplies<size_t>());
|
||||
|
||||
const size_t filters_count = filters_shape[filter_out_ch_axis];
|
||||
const Shape filter_shape(++filters_shape.begin(), filters_shape.end());
|
||||
const size_t filter_size = shape_size(filter_shape);
|
||||
|
||||
auto batch = in;
|
||||
for (size_t batch_idx = 0; batch_idx < batches_count; ++batch_idx) {
|
||||
auto filter = f;
|
||||
for (size_t f_idx = 0; f_idx < filters_count; ++f_idx) {
|
||||
convolve_3D_channels(params, batch, batch_shape, filter, filter_shape, out);
|
||||
filter += filter_size;
|
||||
const size_t work_amount = batches_count * filters_count;
|
||||
|
||||
void (*conv_channels)(const ConvolutionParams&, const T*, const Shape&, const T*, const Shape&, T*);
|
||||
if (in_shape.size() == 5) {
|
||||
conv_channels = &convolve_3D_channels;
|
||||
} else {
|
||||
conv_channels = &convolve_2D_channels;
|
||||
}
|
||||
|
||||
auto ncores = std::thread::hardware_concurrency() / 2;
|
||||
if (ncores == 0) {
|
||||
ncores = 1;
|
||||
}
|
||||
std::vector<std::future<void>> futures(ncores);
|
||||
|
||||
auto split_work = [](const size_t& n, const size_t& nthr, const size_t ithr, size_t& n_start, size_t& n_end) {
|
||||
if (nthr <= 1 || n == 0) {
|
||||
n_start = 0;
|
||||
n_end = n;
|
||||
} else {
|
||||
auto n1 = (n + nthr - 1) / nthr;
|
||||
auto n2 = n1 - 1;
|
||||
auto T1 = n - n2 * nthr;
|
||||
n_end = ithr < T1 ? n1 : n2;
|
||||
n_start = ithr <= T1 ? ithr * n1 : T1 * n1 + (ithr - T1) * n2;
|
||||
}
|
||||
batch += batch_size;
|
||||
|
||||
n_end += n_start;
|
||||
};
|
||||
|
||||
auto ker_callback = [&](int nthr, int ithr) {
|
||||
size_t start = 0, end = 0;
|
||||
split_work(work_amount, nthr, ithr, start, end);
|
||||
if (start >= end) {
|
||||
return;
|
||||
}
|
||||
size_t batch_idx = start / filters_count;
|
||||
size_t c_idx = start % filters_count;
|
||||
|
||||
auto in_data = in + batch_size * batch_idx;
|
||||
auto filter = f + filter_size * c_idx;
|
||||
auto out_data = out + out_spatial_size * filters_count * batch_idx + out_spatial_size * c_idx;
|
||||
|
||||
for (; batch_idx < batches_count; ++batch_idx) {
|
||||
for (; c_idx < filters_count && start < end; c_idx++, start++) {
|
||||
conv_channels(params, in_data, batch_shape, filter, filter_shape, out_data);
|
||||
filter += filter_size;
|
||||
out_data += out_spatial_size;
|
||||
}
|
||||
if (start >= end) {
|
||||
break;
|
||||
}
|
||||
filter = f;
|
||||
c_idx = 0;
|
||||
in_data += batch_size;
|
||||
}
|
||||
};
|
||||
|
||||
for (size_t ithr = 0; ithr < ncores; ithr++) {
|
||||
futures[ithr] = std::async(ker_callback, ncores, ithr);
|
||||
}
|
||||
for (size_t ithr = 0; ithr < ncores; ithr++) {
|
||||
futures[ithr].get();
|
||||
}
|
||||
}
|
||||
} // namespace reference
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
|
||||
#include "ngraph/axis_vector.hpp"
|
||||
#include "ngraph/runtime/reference/convolution.hpp"
|
||||
#include "ngraph/runtime/reference/reverse.hpp"
|
||||
#include "ngraph/util.hpp"
|
||||
|
||||
namespace ngraph {
|
||||
@@ -171,8 +172,8 @@ void convolution_backprop_impl(const T* in,
|
||||
// convolution implementation to convolve also in 1D & 2D case
|
||||
Shape input_shape{in_shape};
|
||||
Shape filters_shape{f_shape};
|
||||
if (in_shape.size() < 5) {
|
||||
extend_to_3D(params, input_shape, filters_shape);
|
||||
if (in_shape.size() < 4) {
|
||||
extend_to_2D(params, input_shape, filters_shape);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < input_shape.size() - 2; ++i) {
|
||||
@@ -186,24 +187,26 @@ void convolution_backprop_impl(const T* in,
|
||||
}
|
||||
|
||||
// convert output shape to 3D, contains only dimensions
|
||||
Shape out_shape_3d{out_shape.begin() + 2, out_shape.end()};
|
||||
Shape out_shape_2d{out_shape.begin() + 2, out_shape.end()};
|
||||
const size_t out_spatial_size =
|
||||
std::accumulate(out_shape.begin() + 2, out_shape.end(), size_t(1), std::multiplies<size_t>());
|
||||
|
||||
int out_shape_rank = static_cast<int>(out_shape.size()) - 2;
|
||||
if (out_shape_rank < 3) {
|
||||
int missing_dims = 3 - out_shape_rank;
|
||||
out_shape_3d.insert(std::prev(out_shape_3d.end(), out_shape_rank), missing_dims, 1);
|
||||
if (out_shape_rank < 2) {
|
||||
int missing_dims = 2 - out_shape_rank;
|
||||
out_shape_2d.insert(std::prev(out_shape_2d.end(), out_shape_rank), missing_dims, 1);
|
||||
}
|
||||
|
||||
// modify params.pads_end when output_shape was provided in ctor in order to
|
||||
// calculate expected number of output elements
|
||||
for (size_t i = 0; i < out_shape_3d.size(); i++) {
|
||||
if (out_shape_3d[i] > 1) {
|
||||
for (size_t i = 0; i < out_shape_2d.size(); i++) {
|
||||
if (out_shape_2d[i] > 1) {
|
||||
// expected_dim = (in - 1)* strides + filter - 2*padding + out_padding
|
||||
// strides is already applied (through 0's extension in input)
|
||||
// padding = pads_begin + pads_end, formula below is using
|
||||
// params.pad_begin/params.pads_end:
|
||||
const size_t expected_dim =
|
||||
out_shape_3d[i] - ((input_shape[i + 2] - 1) - filters_shape[i + 2] + params.pads_begin[i] +
|
||||
out_shape_2d[i] - ((input_shape[i + 2] - 1) - filters_shape[i + 2] + params.pads_begin[i] +
|
||||
params.pads_end[i] + 2 + params.output_padding[i]);
|
||||
params.pads_end[i] += expected_dim;
|
||||
}
|
||||
@@ -217,15 +220,69 @@ void convolution_backprop_impl(const T* in,
|
||||
Shape batch_shape(++input_shape.begin(), input_shape.end());
|
||||
const size_t batch_size = shape_size(batch_shape);
|
||||
|
||||
auto batch = in;
|
||||
const size_t work_amount = batches_count * filters_count;
|
||||
|
||||
for (size_t batch_idx = 0; batch_idx < batches_count; ++batch_idx) {
|
||||
auto filter = f;
|
||||
for (size_t f_idx = 0; f_idx < filters_count; ++f_idx) {
|
||||
convolve_3D_channels(params, batch, batch_shape, filter, filter_shape, out);
|
||||
filter += filter_size;
|
||||
void (*conv_channels)(const ConvolutionParams&, const T*, const Shape&, const T*, const Shape&, T*);
|
||||
if (in_shape.size() == 5) {
|
||||
conv_channels = &convolve_3D_channels;
|
||||
} else {
|
||||
conv_channels = &convolve_2D_channels;
|
||||
}
|
||||
|
||||
auto ncores = std::thread::hardware_concurrency() / 2;
|
||||
if (ncores == 0) {
|
||||
ncores = 1;
|
||||
}
|
||||
std::vector<std::future<void>> futures(ncores);
|
||||
|
||||
auto split_work = [](const size_t& n, const size_t& nthr, const size_t ithr, size_t& n_start, size_t& n_end) {
|
||||
if (nthr <= 1 || n == 0) {
|
||||
n_start = 0;
|
||||
n_end = n;
|
||||
} else {
|
||||
auto n1 = (n + nthr - 1) / nthr;
|
||||
auto n2 = n1 - 1;
|
||||
auto T1 = n - n2 * nthr;
|
||||
n_end = ithr < T1 ? n1 : n2;
|
||||
n_start = ithr <= T1 ? ithr * n1 : T1 * n1 + (ithr - T1) * n2;
|
||||
}
|
||||
batch += batch_size;
|
||||
|
||||
n_end += n_start;
|
||||
};
|
||||
|
||||
auto ker_callback = [&](int nthr, int ithr) {
|
||||
size_t start = 0, end = 0;
|
||||
split_work(work_amount, nthr, ithr, start, end);
|
||||
if (start >= end) {
|
||||
return;
|
||||
}
|
||||
size_t batch_idx = start / filters_count;
|
||||
size_t c_idx = start % filters_count;
|
||||
|
||||
auto in_data = in + batch_size * batch_idx;
|
||||
auto filter = f + filter_size * c_idx;
|
||||
auto out_data = out + out_spatial_size * filters_count * batch_idx + out_spatial_size * c_idx;
|
||||
|
||||
for (; batch_idx < batches_count; ++batch_idx) {
|
||||
for (; c_idx < filters_count && start < end; c_idx++, start++) {
|
||||
conv_channels(params, in_data, batch_shape, filter, filter_shape, out_data);
|
||||
filter += filter_size;
|
||||
out_data += out_spatial_size;
|
||||
}
|
||||
if (start >= end) {
|
||||
break;
|
||||
}
|
||||
filter = f;
|
||||
c_idx = 0;
|
||||
in_data += batch_size;
|
||||
}
|
||||
};
|
||||
|
||||
for (size_t ithr = 0; ithr < ncores; ithr++) {
|
||||
futures[ithr] = std::async(ker_callback, ncores, ithr);
|
||||
}
|
||||
for (size_t ithr = 0; ithr < ncores; ithr++) {
|
||||
futures[ithr].get();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -305,7 +362,7 @@ void convolution_backprop_in(const T* delta_in,
|
||||
if (stride_dim >= 2) {
|
||||
extend_with_zeros(stride, in_shape, delta_in, conv_input_shape, extended_input);
|
||||
std::fill(conv_stride.begin(), conv_stride.end(), 1);
|
||||
conv_input_data = &extended_input[0];
|
||||
conv_input_data = extended_input.data();
|
||||
}
|
||||
|
||||
const size_t dilation_dim =
|
||||
@@ -317,7 +374,7 @@ void convolution_backprop_in(const T* delta_in,
|
||||
conv_filter_shape,
|
||||
extended_filter);
|
||||
std::fill(conv_filter_dilation.begin(), conv_filter_dilation.end(), 1);
|
||||
conv_filter_data = &extended_filter[0];
|
||||
conv_filter_data = extended_filter.data();
|
||||
}
|
||||
|
||||
convolution_backprop_impl(conv_input_data,
|
||||
|
||||
@@ -114,47 +114,56 @@ void convolve_2D_channels(const ConvolutionParams& p,
|
||||
const T* mask,
|
||||
const Shape& mask_shape,
|
||||
T* out,
|
||||
size_t group_idx,
|
||||
int64_t groups,
|
||||
int64_t deformable_groups,
|
||||
bool bilinear_interpolation_pad) {
|
||||
const size_t group_idx,
|
||||
const int64_t groups,
|
||||
const int64_t deformable_groups,
|
||||
const bool bilinear_interpolation_pad) {
|
||||
const int input_size_y = static_cast<int>(batch_shape[1]);
|
||||
const int input_size_x = static_cast<int>(batch_shape[2]);
|
||||
const int filter_size_y = static_cast<int>(filter_shape[1]);
|
||||
const int filter_size_x = static_cast<int>(filter_shape[2]);
|
||||
const int dilated_filter_size_y = filter_size_y + (filter_size_y - 1) * (static_cast<int>(p.dilation[0]) - 1);
|
||||
const int dilated_filter_size_x = filter_size_x + (filter_size_x - 1) * (static_cast<int>(p.dilation[1]) - 1);
|
||||
const int dilation_y = static_cast<int>(p.dilation[0]);
|
||||
const int dilation_x = static_cast<int>(p.dilation[1]);
|
||||
const int dilated_filter_size_y = filter_size_y + (filter_size_y - 1) * (dilation_y - 1);
|
||||
const int dilated_filter_size_x = filter_size_x + (filter_size_x - 1) * (dilation_x - 1);
|
||||
|
||||
const int i_y_lim = static_cast<int>(p.pads_end[0] + input_size_y - dilated_filter_size_y);
|
||||
const int i_x_lim = static_cast<int>(p.pads_end[1] + input_size_x - dilated_filter_size_x);
|
||||
|
||||
const int input_channel_size = static_cast<int>(shape_size(shape_reduce(batch_shape)));
|
||||
const int filter_channel_size = static_cast<int>(shape_size(shape_reduce(filter_shape)));
|
||||
const int offsets_size = static_cast<int>(shape_size(offset_shape));
|
||||
const int offsets_spatial_size = static_cast<int>(shape_size(shape_reduce(offset_shape)));
|
||||
const int filter_channels_count = static_cast<int>(filter_shape[0]);
|
||||
const int mask_size = static_cast<int>(shape_size(mask_shape));
|
||||
const int mask_spatial_size = static_cast<int>(shape_size(shape_reduce(mask_shape)));
|
||||
|
||||
const int group_idx_m = filter_channels_count * static_cast<int>(group_idx);
|
||||
const int group_idx_d = filter_channels_count * groups / deformable_groups;
|
||||
|
||||
const int f_shift_inc = 2 * offsets_spatial_size;
|
||||
|
||||
int out_idx = 0;
|
||||
for (int i_y = static_cast<int>(-p.pads_begin[0]);
|
||||
i_y <= static_cast<int>(p.pads_end[0] + input_size_y - dilated_filter_size_y);
|
||||
i_y += static_cast<int>(p.strides[0])) {
|
||||
for (int i_x = static_cast<int>(-p.pads_begin[1]);
|
||||
i_x <= static_cast<int>(p.pads_end[1] + input_size_x - dilated_filter_size_x);
|
||||
i_x += static_cast<int>(p.strides[1])) {
|
||||
for (int i_y = static_cast<int>(-p.pads_begin[0]); i_y <= i_y_lim; i_y += static_cast<int>(p.strides[0])) {
|
||||
for (int i_x = static_cast<int>(-p.pads_begin[1]); i_x <= i_x_lim; i_x += static_cast<int>(p.strides[1])) {
|
||||
auto input_channel = batch;
|
||||
auto filter_channel = filter;
|
||||
T sum = 0;
|
||||
for (int fc = 0; fc < filter_channels_count; fc++) {
|
||||
auto deformable_group_idx =
|
||||
(filter_channels_count * group_idx + fc) / (filter_channels_count * groups / deformable_groups);
|
||||
for (int f_y = 0; f_y < filter_size_y; ++f_y) {
|
||||
for (int f_x = 0; f_x < filter_size_x; ++f_x) {
|
||||
int f_buf_idx = (f_y * filter_size_x) + f_x;
|
||||
T y_offset = offsets[deformable_group_idx * offsets_size +
|
||||
f_buf_idx * 2 * offsets_spatial_size + out_idx];
|
||||
T x_offset = offsets[deformable_group_idx * offsets_size +
|
||||
(f_buf_idx * 2 + 1) * offsets_spatial_size + out_idx];
|
||||
T rel_i_y = static_cast<T>(i_y + (f_y * p.dilation[0]) + y_offset);
|
||||
T rel_i_x = static_cast<T>(i_x + (f_x * p.dilation[1]) + x_offset);
|
||||
const int deformable_group_idx = (group_idx_m + fc) / group_idx_d;
|
||||
int f_y_shift = deformable_group_idx * offsets_size + out_idx;
|
||||
int f_x_shift = f_y_shift + offsets_spatial_size;
|
||||
int f_mask_shift = deformable_group_idx * mask_size + out_idx;
|
||||
int i_y_dil = i_y;
|
||||
for (int f_y = 0; f_y < filter_size_y; ++f_y, i_y_dil += dilation_y) {
|
||||
int i_x_dil = i_x;
|
||||
for (int f_x = 0; f_x < filter_size_x; ++f_x,
|
||||
f_y_shift += f_shift_inc,
|
||||
f_x_shift += f_shift_inc,
|
||||
f_mask_shift += mask_spatial_size,
|
||||
i_x_dil += dilation_x,
|
||||
filter_channel++) {
|
||||
T rel_i_y = static_cast<T>(i_y_dil + offsets[f_y_shift]);
|
||||
T rel_i_x = static_cast<T>(i_x_dil + offsets[f_x_shift]);
|
||||
|
||||
bool padding;
|
||||
if (bilinear_interpolation_pad) {
|
||||
@@ -168,19 +177,17 @@ void convolve_2D_channels(const ConvolutionParams& p,
|
||||
if (padding)
|
||||
continue;
|
||||
|
||||
T mask_scalar =
|
||||
mask[deformable_group_idx * mask_size + f_buf_idx * mask_spatial_size + out_idx];
|
||||
T mask_scalar = mask[f_mask_shift];
|
||||
sum += static_cast<T>(bilinear_interpolation(input_channel,
|
||||
static_cast<float>(rel_i_x),
|
||||
static_cast<float>(rel_i_y),
|
||||
input_size_x,
|
||||
input_size_y,
|
||||
bilinear_interpolation_pad)) *
|
||||
filter_channel[f_buf_idx] * mask_scalar;
|
||||
filter_channel[0] * mask_scalar;
|
||||
}
|
||||
}
|
||||
input_channel += input_channel_size;
|
||||
filter_channel += filter_channel_size;
|
||||
}
|
||||
out[out_idx++] = sum;
|
||||
}
|
||||
@@ -206,9 +213,7 @@ void deformable_convolution(const T* in,
|
||||
const CoordinateDiff& pads_end,
|
||||
const int64_t groups,
|
||||
const int64_t deformable_groups,
|
||||
const bool bilinear_interpolation_pad)
|
||||
|
||||
{
|
||||
const bool bilinear_interpolation_pad) {
|
||||
using namespace def_conv_impl;
|
||||
|
||||
validate_deformable_convolution_params(in_shape,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/runtime/reference/convolution.hpp"
|
||||
#include "ngraph/util.hpp"
|
||||
#include "ngraph/runtime/reference/helpers.hpp"
|
||||
|
||||
namespace {
|
||||
constexpr size_t filter_group_axis = 0;
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ngraph/runtime/reference/convolution_backprop_data.hpp"
|
||||
#include "ngraph/runtime/reference/group_convolution.hpp"
|
||||
#include "ngraph/util.hpp"
|
||||
|
||||
|
||||
@@ -99,12 +99,6 @@ protected:
|
||||
ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), WRB, hiddenSize/*, activations, {}, {}, clip, linearBeforeReset*/);
|
||||
|
||||
function = makeNgraphFunction(netPrecision, params, augruCellOp, "AUGRUCell");
|
||||
auto filePrefix = CommonTestUtils::generateTestFilePrefix();
|
||||
std::string xmlFileName = filePrefix + "_AUGRUCell.xml";
|
||||
std::string binFileName = filePrefix + "_AUGRUCell.bin";
|
||||
ov::pass::Serialize serializer(xmlFileName, binFileName);
|
||||
serializer.run_on_model(function);
|
||||
std::cout << "Saved subgraph IR." << std::endl;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -16,8 +16,6 @@
|
||||
#include "ie_core.hpp"
|
||||
#include "ie_precision.hpp"
|
||||
|
||||
#include "ngraph/opsets/opset1.hpp"
|
||||
|
||||
#include "functional_test_utils/blob_utils.hpp"
|
||||
#include "shared_test_classes/base/layer_test_utils.hpp"
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
|
||||
Reference in New Issue
Block a user