faster implementation of gather_nd operator for reference implementations (#2897)

* faster implementation of gather_nd operator for reference implementations

* remove old impl and time measurements

* exclude test for gather_nd for IE_CPU (output mismatch)

* apply review comments and rename variables for clarity

* rename variables according to PR comments

* try to apply all PR suggestions

* fix indices calculations

Co-authored-by: Patryk Elszkowski <patryk.elszkowki@intel.com>
This commit is contained in:
Patryk Elszkowski
2020-12-01 10:48:58 +01:00
committed by GitHub
parent ce1ec9eab0
commit 8e64eb21cb
4 changed files with 204 additions and 179 deletions

View File

@@ -51,13 +51,10 @@ namespace ngraph
const Shape& out_shape,
size_t axis)
{
using namespace std;
// prepare shape of params_prime (remove first "axis" dimensions)
Shape params_prime_shape(params_shape);
params_prime_shape.erase(params_prime_shape.begin(),
params_prime_shape.begin() + axis);
const Shape params_prime_shape(params_shape.begin() + axis, params_shape.end());
// prepare shape of indices_prime
size_t indices_ndim = static_cast<size_t>(indices_shape.size());
const size_t indices_ndim = indices_shape.size();
Shape indices_prime_shape;
// prepare shape of out_prime (same as params_prime except for first dim)
Shape out_prime_shape(params_prime_shape);
@@ -73,8 +70,8 @@ namespace ngraph
indices_prime_shape.emplace_back(1);
// Create a CoordinateTransform for "out" that visits the outer "axis" dimensions
size_t out_ndim = static_cast<size_t>(out_shape.size());
Coordinate out_outer_start_corner(out_ndim, 0);
const size_t out_ndim = out_shape.size();
const Coordinate out_outer_start_corner(out_ndim, 0);
Coordinate out_outer_end_corner(out_shape);
for (size_t i = axis; i < out_ndim; i++)
{
@@ -90,44 +87,43 @@ namespace ngraph
out_outer_axis_order);
// Create a CoordinateTransform for "params" that visits the outer "axis" dimensions
size_t params_ndim = static_cast<size_t>(params_shape.size());
Coordinate params_outer_start_corner(params_ndim, 0);
const size_t params_ndim = params_shape.size();
const Coordinate params_outer_start_corner(params_ndim, 0);
Coordinate params_outer_end_corner(params_shape);
for (size_t i = axis; i < params_ndim; i++)
{
params_outer_end_corner[i] = 1;
}
Strides params_outer_strides(params_ndim, 1);
const Strides params_outer_strides(params_ndim, 1);
AxisVector params_outer_axis_order(params_ndim);
std::iota(params_outer_axis_order.begin(), params_outer_axis_order.end(), 0);
CoordinateTransform params_outer_transform(params_shape,
params_outer_start_corner,
params_outer_end_corner,
params_outer_strides,
params_outer_axis_order);
const CoordinateTransform params_outer_transform(params_shape,
params_outer_start_corner,
params_outer_end_corner,
params_outer_strides,
params_outer_axis_order);
// Create a CoordinateTransform for "indices" that visits only the first element
// along inner most axis
Coordinate indices_outer_start_corner(indices_ndim, 0);
const Coordinate indices_outer_start_corner(indices_ndim, 0);
Coordinate indices_outer_end_corner(indices_shape);
if (indices_ndim > 0)
{
indices_outer_end_corner[indices_ndim - 1] = 1;
}
Strides indices_outer_strides(indices_ndim, 1);
const Strides indices_outer_strides(indices_ndim, 1);
AxisVector indices_outer_axis_order(indices_ndim);
std::iota(indices_outer_axis_order.begin(), indices_outer_axis_order.end(), 0);
CoordinateTransform indices_outer_transform(indices_shape,
indices_outer_start_corner,
indices_outer_end_corner,
indices_outer_strides,
indices_outer_axis_order);
const CoordinateTransform indices_outer_transform(indices_shape,
indices_outer_start_corner,
indices_outer_end_corner,
indices_outer_strides,
indices_outer_axis_order);
// Create an inner CoordinateTransform for "out"
size_t out_inner_ndim = out_ndim - axis;
Shape out_inner_shape(out_shape);
out_inner_shape.erase(out_inner_shape.begin(), out_inner_shape.begin() + axis);
Coordinate out_inner_start_corner(out_inner_ndim, 0);
const size_t out_inner_ndim = out_ndim - axis;
const Shape out_inner_shape(out_shape.begin() + axis, out_shape.end());
const Coordinate out_inner_start_corner(out_inner_ndim, 0);
Coordinate out_inner_end_corner(out_inner_shape);
if (indices_ndim > 0)
{
@@ -137,14 +133,14 @@ namespace ngraph
{
out_inner_end_corner[i] = 1;
}
Strides out_inner_strides(out_inner_ndim, 1);
const Strides out_inner_strides(out_inner_ndim, 1);
AxisVector out_inner_axis_order(out_inner_ndim);
std::iota(out_inner_axis_order.begin(), out_inner_axis_order.end(), 0);
CoordinateTransform out_inner_transform(out_inner_shape,
out_inner_start_corner,
out_inner_end_corner,
out_inner_strides,
out_inner_axis_order);
const CoordinateTransform out_inner_transform(out_inner_shape,
out_inner_start_corner,
out_inner_end_corner,
out_inner_strides,
out_inner_axis_order);
auto out_outer_coord_iter = out_outer_transform.begin();
for (const Coordinate& params_outer_coord : params_outer_transform)
@@ -169,11 +165,11 @@ namespace ngraph
params_prime_shape,
indices_prime_shape,
out_prime_shape);
out_inner_coord_iter++;
++out_inner_coord_iter;
}
out_outer_coord_iter++;
++out_outer_coord_iter;
}
}
}
}
}
} // namespace reference
} // namespace runtime
} // namespace ngraph

View File

@@ -16,6 +16,8 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <numeric>
#include "ngraph/coordinate_transform.hpp"
@@ -26,171 +28,169 @@ namespace ngraph
{
namespace reference
{
// foreach leaf_vector_index in indices.shape[:-1]
// vector = indices[leaf_vector_index]
// out[leaf_vector_index:] = params[vector]
template <typename T, typename U>
void gather_nd_batch(const T* params,
const U* indices,
T* out,
const Shape& params_shape,
const Shape& indices_shape,
const Shape& out_shape)
namespace
{
using namespace std;
// Create a CoordinateTransform for "indices" that visits only the first element
// along inner most axis
size_t indices_ndim = static_cast<size_t>(indices_shape.size());
Coordinate indices_outer_start_corner(indices_ndim, 0);
Coordinate indices_outer_end_corner(indices_shape);
size_t slice_rank = indices_shape[indices_ndim - 1];
indices_outer_end_corner[indices_ndim - 1] = 1;
Strides indices_strides(indices_ndim, 1);
AxisVector indices_axis_order(indices_ndim);
std::iota(indices_axis_order.begin(), indices_axis_order.end(), 0);
CoordinateTransform indices_outer_transform(indices_shape,
indices_outer_start_corner,
indices_outer_end_corner,
indices_strides,
indices_axis_order);
template <bool check>
using Required = typename std::enable_if<check, bool>::type;
// Create a matching CoordinateTransform for "out" that visits the same outer
// coordinates
size_t out_ndim = static_cast<size_t>(out_shape.size());
Coordinate out_start_corner(out_ndim, 0);
Coordinate out_end_corner(out_shape);
for (size_t i = indices_ndim - 1; i < out_ndim; i++)
template <typename It>
struct IsRandomAccessIt
{
out_end_corner[i] = 1;
}
Strides out_strides(out_ndim, 1);
AxisVector out_axis_order(out_ndim);
std::iota(out_axis_order.begin(), out_axis_order.end(), 0);
CoordinateTransform out_transform(
out_shape, out_start_corner, out_end_corner, out_strides, out_axis_order);
size_t params_ndim = static_cast<size_t>(params_shape.size());
Strides params_strides(params_ndim, 1);
AxisVector params_axis_order(params_ndim);
std::iota(params_axis_order.begin(), params_axis_order.end(), 0);
static constexpr bool value =
std::is_same<typename It::iterator_category,
std::random_access_iterator_tag>::value;
};
// Gather slices from "params" and copy to "out"
auto out_coord_iter = out_transform.begin();
for (const Coordinate& indices_coord : indices_outer_transform)
template <typename Iterator, Required<IsRandomAccessIt<Iterator>::value> = true>
class Span
{
Coordinate params_start_corner(params_ndim, 0);
Coordinate params_end_corner(params_shape);
auto indices_index = indices_outer_transform.index(indices_coord);
for (size_t i = 0; i < slice_rank; i++)
public:
Span(Iterator begin, Iterator end)
: m_begin{begin}
, m_end{end}
{
U index = indices[indices_index];
// take care of negative indices
index = index >= 0 ? index : index + params_shape[i];
params_start_corner[i] = index;
params_end_corner[i] = index + 1;
indices_index++;
}
CoordinateTransform params_transform(params_shape,
params_start_corner,
params_end_corner,
params_strides,
params_axis_order);
if (out_coord_iter == out_transform.end())
break;
auto out_index = out_transform.index(*out_coord_iter);
for (const Coordinate& params_coord : params_transform)
{
out[out_index] = params[params_transform.index(params_coord)];
out_index++;
}
out_coord_iter++;
}
}
Iterator begin() const { return m_begin; }
Iterator end() const { return m_end; };
typename Iterator::value_type operator[](size_t idx) const
{
return *next(m_begin, idx);
}
typename Iterator::difference_type size() const
{
return std::distance(m_begin, m_end);
}
private:
Iterator m_begin;
Iterator m_end;
};
/// Convenience factory for Span: wraps the iterator pair [begin, end) so that
/// the iterator type is deduced from the arguments instead of being spelled
/// out at the call site.
template <typename Iterator>
Span<Iterator> span(Iterator begin, Iterator end)
{
    return Span<Iterator>{begin, end};
}
/// Builds a table of cumulative products over the range [beg, end).
///
/// The result has distance(beg, end) + 1 entries. The last entry is
/// last_slice_size; the table is then filled from back to front while the
/// range is walked from front to back, each entry being the entry after it
/// multiplied by the next element of the range.
///
/// \param beg             first element of the range
/// \param end             one past the last element of the range
/// \param last_slice_size value stored in the final entry and seed of the products
/// \return the table of products described above
template <typename Iterator>
std::vector<size_t> get_indices_offsets(const Iterator beg,
                                        const Iterator end,
                                        size_t last_slice_size)
{
    const auto dims_count = std::distance(beg, end);
    // Every entry starts as last_slice_size; only the final one keeps that value.
    std::vector<size_t> offsets(dims_count + 1, last_slice_size);
    auto dim = beg;
    for (auto pos = dims_count; pos > 0; --pos, ++dim)
    {
        offsets[pos - 1] = *dim * offsets[pos];
    }
    return offsets;
}
} // namespace
///
/// The implementation finds the maximum length of a *slice* of the input *params*
/// which can be copied to *out* index by index.
/// +-------+--------------+-------+
/// | batch | indices[:-1] | slice |
/// | shape | shape | shape |
/// +-------+--------------+-------+
///
template <typename T, typename U>
void gather_nd(const T* params,
const U* indices,
T* out,
void gather_nd(const T* const params,
const U* const indices,
T* const out,
const Shape& params_shape,
const Shape& indices_shape,
const Shape& out_shape,
int batch_dims = 0)
const int batch_dims = 0)
{
using namespace std;
if (batch_dims == 0)
using std::begin;
using std::end;
using std::next;
using std::prev;
const auto rbegin = [](const Shape& s) { // generic since C++14
return s.rbegin();
};
const Shape batch_shape(begin(params_shape), next(begin(params_shape), batch_dims));
const auto batch_size = shape_size(batch_shape);
if (batch_dims && batch_size != out_shape.front())
{
gather_nd_batch(params, indices, out, params_shape, indices_shape, out_shape);
return;
throw std::domain_error{
"out_shape should have on first dim multiplication of batch number of first"
"dimensions of shape "};
}
size_t indices_ndim = static_cast<size_t>(indices_shape.size());
Coordinate indices_outer_start_corner(indices_ndim, 0);
Coordinate indices_outer_end_corner(indices_shape);
for (size_t i = batch_dims; i < indices_ndim; i++)
if (!std::equal(begin(params_shape),
next(begin(params_shape), batch_dims),
begin(indices_shape)))
{
indices_outer_end_corner[i] = 1;
throw std::domain_error{
"dimensions in params and indices have to be equal on batch dimensions"};
}
Strides indices_strides(indices_ndim, 1);
AxisVector indices_axis_order(indices_ndim);
std::iota(indices_axis_order.begin(), indices_axis_order.end(), 0);
CoordinateTransform indices_outer_transform(indices_shape,
indices_outer_start_corner,
indices_outer_end_corner,
indices_strides,
indices_axis_order);
size_t params_ndim = static_cast<size_t>(params_shape.size());
Coordinate params_outer_start_corner(params_ndim, 0);
Coordinate params_outer_end_corner(params_shape);
for (size_t i = batch_dims; i < params_ndim; i++)
const auto first_slice_index_in_params = batch_dims + indices_shape.back();
if (!(first_slice_index_in_params <= params_shape.size()))
{
params_outer_end_corner[i] = 1;
throw std::domain_error{
"params_shape should have enough rank to be index by indices"};
}
Strides params_strides(params_ndim, 1);
AxisVector params_axis_order(params_ndim);
std::iota(params_axis_order.begin(), params_axis_order.end(), 0);
CoordinateTransform params_outer_transform(params_shape,
params_outer_start_corner,
params_outer_end_corner,
params_strides,
params_axis_order);
size_t out_ndim = static_cast<size_t>(out_shape.size());
Coordinate out_start_corner(out_ndim, 0);
Coordinate out_end_corner(out_shape);
for (size_t i = 1; i < out_ndim; i++)
const auto slice_shape =
span(next(begin(params_shape), first_slice_index_in_params), end(params_shape));
const auto slice_size = shape_size(slice_shape);
const auto dims_begin = next(rbegin(params_shape), slice_shape.size());
const auto dims_end = next(dims_begin, indices_shape.back() - 1);
const auto indices_offsets = get_indices_offsets(dims_begin, dims_end, slice_size);
const auto batch_offset = indices_offsets.front() * params_shape[batch_dims];
const auto k_1_indices =
span(next(begin(indices_shape), batch_dims), prev(end(indices_shape)));
const auto k_1_params =
span(next(begin(params_shape), batch_dims), prev(end(params_shape)));
const auto number_of_slices_to_copy_in_one_batch = shape_size(k_1_indices);
const auto coordinates_size = indices_shape.back();
for (size_t batch = 0; batch != batch_size; ++batch)
{
out_end_corner[i] = 1;
}
Strides out_strides(out_ndim, 1);
AxisVector out_axis_order(out_ndim);
std::iota(out_axis_order.begin(), out_axis_order.end(), 0);
CoordinateTransform out_transform(
out_shape, out_start_corner, out_end_corner, out_strides, out_axis_order);
const auto input_batch_offset = batch * batch_offset;
const auto output_batch_offset =
batch * number_of_slices_to_copy_in_one_batch * slice_size;
const auto coordinates_batch_offset =
batch * number_of_slices_to_copy_in_one_batch * coordinates_size;
for (size_t slice = 0; slice != number_of_slices_to_copy_in_one_batch; ++slice)
{
const auto slice_coordinates =
next(indices, coordinates_batch_offset + slice * coordinates_size);
Shape indices_shape_batch(indices_shape.begin() + batch_dims, indices_shape.end());
Shape params_shape_batch(params_shape.begin() + batch_dims, params_shape.end());
Shape output_shape_batch(out_shape.begin() + 1, out_shape.end());
auto out_coord_iter = out_transform.begin();
auto params_coord_iter = params_outer_transform.begin();
for (const Coordinate& indices_coord : indices_outer_transform)
{
if (params_coord_iter == params_outer_transform.end() ||
out_coord_iter == out_transform.end())
break;
auto indices_index = indices_outer_transform.index(indices_coord);
auto params_index = params_outer_transform.index(*params_coord_iter);
auto output_index = out_transform.index(*out_coord_iter);
gather_nd_batch(params + params_index,
indices + indices_index,
out + output_index,
params_shape_batch,
indices_shape_batch,
output_shape_batch);
out_coord_iter++;
params_coord_iter++;
size_t input_slice_offset = input_batch_offset;
for (size_t c = 0; c != coordinates_size; ++c)
{
const auto i_c = slice_coordinates[c];
const auto index = i_c < 0 ? k_1_params[c] + i_c : i_c;
input_slice_offset += index * indices_offsets[c];
}
const auto output_slice_offset = output_batch_offset + slice * slice_size;
std::copy(next(params, input_slice_offset),
next(params, input_slice_offset + slice_size),
next(out, output_slice_offset));
}
}
}
}
}
}
} // namespace reference
} // namespace runtime
} // namespace ngraph

View File

@@ -372,6 +372,32 @@ NGRAPH_TEST(${BACKEND_NAME}, gather_nd_batch_1d_from_3d)
MIN_FLOAT_TOLERANCE_BITS));
}
// GatherND with negative coordinates: data, indices and result are all 2x2x2.
// Each pair on the innermost indices axis picks one 2-element row of the data,
// and the expected output below treats a negative coordinate as counting from
// the end of its dimension (e.g. the pair (0, -1) yields the row {1.2, 1.3}).
NGRAPH_TEST(${BACKEND_NAME}, gather_nd_batch_1d_from_3d_negative)
{
    Shape data_shape{2, 2, 2};
    Shape index_shape{2, 2, 2};
    Shape result_shape{2, 2, 2};

    // Build the single-op function under test
    auto data_param = make_shared<op::Parameter>(element::f32, data_shape);
    auto index_param = make_shared<op::Parameter>(element::i32, index_shape);
    auto gather = make_shared<op::v5::GatherND>(data_param, index_param);
    auto func = make_shared<Function>(gather, ParameterVector{data_param, index_param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Allocate and fill the input tensors, allocate the output tensor
    auto data_tensor = backend->create_tensor(element::f32, data_shape);
    copy_data(data_tensor, vector<float>{1.0f, 1.1f, 1.2f, 1.3f, 2.0f, 2.1f, 2.2f, 2.3f});
    auto index_tensor = backend->create_tensor(element::i32, index_shape);
    copy_data(index_tensor, vector<int32_t>{0, -1, -1, 0, 0, 0, 1, 1});
    auto result_tensor = backend->create_tensor(element::f32, result_shape);

    auto executable = backend->compile(func);
    executable->call_with_validate({result_tensor}, {data_tensor, index_tensor});

    // Rows selected by (0,-1), (-1,0), (0,0), (1,1) in that order
    const vector<float> expected{1.2f, 1.3f, 2.0f, 2.1f, 1.0f, 1.1f, 2.2f, 2.3f};
    EXPECT_TRUE(test::all_close_f(
        expected, read_vector<float>(result_tensor), MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, gather_nd_batch_2d_from_3d)
{
Shape params_shape{2, 2, 2};

View File

@@ -1144,6 +1144,9 @@ IE_CPU.nonmaxsuppression_two_classes
# Bug in CPU plugin for ROIPooling when pooled size is 1x1 and method is bilinear
IE_CPU.roi_pooling_1x1_bilinear
# output mismatch
IE_CPU.gather_nd_batch_1d_from_3d_negative
#-------------------------------------------------------------------------------
#
# Inference Engine GPU plugin excludes