faster implementation of gather_nd operator for reference implementations (#2897)
* faster implementation of gather_nd operator for reference implementations * remove old impl and time measurements * exclude test for gather_nd for IE_CPU (output mismatch) * apply review comments and rename variables for clarity * rename variables according to PR comments * try to apply all PR suggestions * fix indices calculations Co-authored-by: Patryk Elszkowski <patryk.elszkowki@intel.com>
This commit is contained in:
committed by
GitHub
parent
ce1ec9eab0
commit
8e64eb21cb
@@ -51,13 +51,10 @@ namespace ngraph
|
||||
const Shape& out_shape,
|
||||
size_t axis)
|
||||
{
|
||||
using namespace std;
|
||||
// prepare shape of params_prime (remove first "axis" dimensions)
|
||||
Shape params_prime_shape(params_shape);
|
||||
params_prime_shape.erase(params_prime_shape.begin(),
|
||||
params_prime_shape.begin() + axis);
|
||||
const Shape params_prime_shape(params_shape.begin() + axis, params_shape.end());
|
||||
// prepare shape of indices_prime
|
||||
size_t indices_ndim = static_cast<size_t>(indices_shape.size());
|
||||
const size_t indices_ndim = indices_shape.size();
|
||||
Shape indices_prime_shape;
|
||||
// prepare shape of out_prime (same as params_prime except for first dim)
|
||||
Shape out_prime_shape(params_prime_shape);
|
||||
@@ -73,8 +70,8 @@ namespace ngraph
|
||||
indices_prime_shape.emplace_back(1);
|
||||
|
||||
// Create a CoordinateTransform for "out" that visits the outer "axis" dimensions
|
||||
size_t out_ndim = static_cast<size_t>(out_shape.size());
|
||||
Coordinate out_outer_start_corner(out_ndim, 0);
|
||||
const size_t out_ndim = out_shape.size();
|
||||
const Coordinate out_outer_start_corner(out_ndim, 0);
|
||||
Coordinate out_outer_end_corner(out_shape);
|
||||
for (size_t i = axis; i < out_ndim; i++)
|
||||
{
|
||||
@@ -90,44 +87,43 @@ namespace ngraph
|
||||
out_outer_axis_order);
|
||||
|
||||
// Create a CoordinateTransform for "params" that visits the outer "axis" dimensions
|
||||
size_t params_ndim = static_cast<size_t>(params_shape.size());
|
||||
Coordinate params_outer_start_corner(params_ndim, 0);
|
||||
const size_t params_ndim = params_shape.size();
|
||||
const Coordinate params_outer_start_corner(params_ndim, 0);
|
||||
Coordinate params_outer_end_corner(params_shape);
|
||||
for (size_t i = axis; i < params_ndim; i++)
|
||||
{
|
||||
params_outer_end_corner[i] = 1;
|
||||
}
|
||||
Strides params_outer_strides(params_ndim, 1);
|
||||
const Strides params_outer_strides(params_ndim, 1);
|
||||
AxisVector params_outer_axis_order(params_ndim);
|
||||
std::iota(params_outer_axis_order.begin(), params_outer_axis_order.end(), 0);
|
||||
CoordinateTransform params_outer_transform(params_shape,
|
||||
params_outer_start_corner,
|
||||
params_outer_end_corner,
|
||||
params_outer_strides,
|
||||
params_outer_axis_order);
|
||||
const CoordinateTransform params_outer_transform(params_shape,
|
||||
params_outer_start_corner,
|
||||
params_outer_end_corner,
|
||||
params_outer_strides,
|
||||
params_outer_axis_order);
|
||||
|
||||
// Create a CoordinateTransform for "indices" that visits only the first element
|
||||
// along inner most axis
|
||||
Coordinate indices_outer_start_corner(indices_ndim, 0);
|
||||
const Coordinate indices_outer_start_corner(indices_ndim, 0);
|
||||
Coordinate indices_outer_end_corner(indices_shape);
|
||||
if (indices_ndim > 0)
|
||||
{
|
||||
indices_outer_end_corner[indices_ndim - 1] = 1;
|
||||
}
|
||||
Strides indices_outer_strides(indices_ndim, 1);
|
||||
const Strides indices_outer_strides(indices_ndim, 1);
|
||||
AxisVector indices_outer_axis_order(indices_ndim);
|
||||
std::iota(indices_outer_axis_order.begin(), indices_outer_axis_order.end(), 0);
|
||||
CoordinateTransform indices_outer_transform(indices_shape,
|
||||
indices_outer_start_corner,
|
||||
indices_outer_end_corner,
|
||||
indices_outer_strides,
|
||||
indices_outer_axis_order);
|
||||
const CoordinateTransform indices_outer_transform(indices_shape,
|
||||
indices_outer_start_corner,
|
||||
indices_outer_end_corner,
|
||||
indices_outer_strides,
|
||||
indices_outer_axis_order);
|
||||
|
||||
// Create an inner CoordinateTransform for "out"
|
||||
size_t out_inner_ndim = out_ndim - axis;
|
||||
Shape out_inner_shape(out_shape);
|
||||
out_inner_shape.erase(out_inner_shape.begin(), out_inner_shape.begin() + axis);
|
||||
Coordinate out_inner_start_corner(out_inner_ndim, 0);
|
||||
const size_t out_inner_ndim = out_ndim - axis;
|
||||
const Shape out_inner_shape(out_shape.begin() + axis, out_shape.end());
|
||||
const Coordinate out_inner_start_corner(out_inner_ndim, 0);
|
||||
Coordinate out_inner_end_corner(out_inner_shape);
|
||||
if (indices_ndim > 0)
|
||||
{
|
||||
@@ -137,14 +133,14 @@ namespace ngraph
|
||||
{
|
||||
out_inner_end_corner[i] = 1;
|
||||
}
|
||||
Strides out_inner_strides(out_inner_ndim, 1);
|
||||
const Strides out_inner_strides(out_inner_ndim, 1);
|
||||
AxisVector out_inner_axis_order(out_inner_ndim);
|
||||
std::iota(out_inner_axis_order.begin(), out_inner_axis_order.end(), 0);
|
||||
CoordinateTransform out_inner_transform(out_inner_shape,
|
||||
out_inner_start_corner,
|
||||
out_inner_end_corner,
|
||||
out_inner_strides,
|
||||
out_inner_axis_order);
|
||||
const CoordinateTransform out_inner_transform(out_inner_shape,
|
||||
out_inner_start_corner,
|
||||
out_inner_end_corner,
|
||||
out_inner_strides,
|
||||
out_inner_axis_order);
|
||||
|
||||
auto out_outer_coord_iter = out_outer_transform.begin();
|
||||
for (const Coordinate& params_outer_coord : params_outer_transform)
|
||||
@@ -169,11 +165,11 @@ namespace ngraph
|
||||
params_prime_shape,
|
||||
indices_prime_shape,
|
||||
out_prime_shape);
|
||||
out_inner_coord_iter++;
|
||||
++out_inner_coord_iter;
|
||||
}
|
||||
out_outer_coord_iter++;
|
||||
++out_outer_coord_iter;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace reference
|
||||
} // namespace runtime
|
||||
} // namespace ngraph
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <numeric>
|
||||
|
||||
#include "ngraph/coordinate_transform.hpp"
|
||||
@@ -26,171 +28,169 @@ namespace ngraph
|
||||
{
|
||||
namespace reference
|
||||
{
|
||||
// foreach leaf_vector_index in indices.shape[:-1]
|
||||
// vector = indices[leaf_vector_index]
|
||||
// out[leaf_vector_index:] = params[vector]
|
||||
template <typename T, typename U>
|
||||
void gather_nd_batch(const T* params,
|
||||
const U* indices,
|
||||
T* out,
|
||||
const Shape& params_shape,
|
||||
const Shape& indices_shape,
|
||||
const Shape& out_shape)
|
||||
namespace
|
||||
{
|
||||
using namespace std;
|
||||
// Create a CoordinateTransform for "indices" that visits only the first element
|
||||
// along inner most axis
|
||||
size_t indices_ndim = static_cast<size_t>(indices_shape.size());
|
||||
Coordinate indices_outer_start_corner(indices_ndim, 0);
|
||||
Coordinate indices_outer_end_corner(indices_shape);
|
||||
size_t slice_rank = indices_shape[indices_ndim - 1];
|
||||
indices_outer_end_corner[indices_ndim - 1] = 1;
|
||||
Strides indices_strides(indices_ndim, 1);
|
||||
AxisVector indices_axis_order(indices_ndim);
|
||||
std::iota(indices_axis_order.begin(), indices_axis_order.end(), 0);
|
||||
CoordinateTransform indices_outer_transform(indices_shape,
|
||||
indices_outer_start_corner,
|
||||
indices_outer_end_corner,
|
||||
indices_strides,
|
||||
indices_axis_order);
|
||||
template <bool check>
|
||||
using Required = typename std::enable_if<check, bool>::type;
|
||||
|
||||
// Create a matching CoordinateTransform for "out" that visits the same outer
|
||||
// coordinates
|
||||
size_t out_ndim = static_cast<size_t>(out_shape.size());
|
||||
Coordinate out_start_corner(out_ndim, 0);
|
||||
Coordinate out_end_corner(out_shape);
|
||||
for (size_t i = indices_ndim - 1; i < out_ndim; i++)
|
||||
template <typename It>
|
||||
struct IsRandomAccessIt
|
||||
{
|
||||
out_end_corner[i] = 1;
|
||||
}
|
||||
Strides out_strides(out_ndim, 1);
|
||||
AxisVector out_axis_order(out_ndim);
|
||||
std::iota(out_axis_order.begin(), out_axis_order.end(), 0);
|
||||
CoordinateTransform out_transform(
|
||||
out_shape, out_start_corner, out_end_corner, out_strides, out_axis_order);
|
||||
size_t params_ndim = static_cast<size_t>(params_shape.size());
|
||||
Strides params_strides(params_ndim, 1);
|
||||
AxisVector params_axis_order(params_ndim);
|
||||
std::iota(params_axis_order.begin(), params_axis_order.end(), 0);
|
||||
static constexpr bool value =
|
||||
std::is_same<typename It::iterator_category,
|
||||
std::random_access_iterator_tag>::value;
|
||||
};
|
||||
|
||||
// Gather slices from "params" and copy to "out"
|
||||
auto out_coord_iter = out_transform.begin();
|
||||
for (const Coordinate& indices_coord : indices_outer_transform)
|
||||
template <typename Iterator, Required<IsRandomAccessIt<Iterator>::value> = true>
|
||||
class Span
|
||||
{
|
||||
Coordinate params_start_corner(params_ndim, 0);
|
||||
Coordinate params_end_corner(params_shape);
|
||||
auto indices_index = indices_outer_transform.index(indices_coord);
|
||||
for (size_t i = 0; i < slice_rank; i++)
|
||||
public:
|
||||
Span(Iterator begin, Iterator end)
|
||||
: m_begin{begin}
|
||||
, m_end{end}
|
||||
{
|
||||
U index = indices[indices_index];
|
||||
// take care of negative indices
|
||||
index = index >= 0 ? index : index + params_shape[i];
|
||||
params_start_corner[i] = index;
|
||||
params_end_corner[i] = index + 1;
|
||||
indices_index++;
|
||||
}
|
||||
CoordinateTransform params_transform(params_shape,
|
||||
params_start_corner,
|
||||
params_end_corner,
|
||||
params_strides,
|
||||
params_axis_order);
|
||||
if (out_coord_iter == out_transform.end())
|
||||
break;
|
||||
auto out_index = out_transform.index(*out_coord_iter);
|
||||
for (const Coordinate& params_coord : params_transform)
|
||||
{
|
||||
out[out_index] = params[params_transform.index(params_coord)];
|
||||
out_index++;
|
||||
}
|
||||
out_coord_iter++;
|
||||
}
|
||||
}
|
||||
|
||||
Iterator begin() const { return m_begin; }
|
||||
Iterator end() const { return m_end; };
|
||||
typename Iterator::value_type operator[](size_t idx) const
|
||||
{
|
||||
return *next(m_begin, idx);
|
||||
}
|
||||
|
||||
typename Iterator::difference_type size() const
|
||||
{
|
||||
return std::distance(m_begin, m_end);
|
||||
}
|
||||
|
||||
private:
|
||||
Iterator m_begin;
|
||||
Iterator m_end;
|
||||
};
|
||||
|
||||
template <typename Iterator>
|
||||
Span<Iterator> span(Iterator begin, Iterator end)
|
||||
{
|
||||
return Span<Iterator>{begin, end};
|
||||
};
|
||||
|
||||
template <typename Iterator>
|
||||
std::vector<size_t> get_indices_offsets(const Iterator beg,
|
||||
const Iterator end,
|
||||
size_t last_slice_size)
|
||||
{
|
||||
auto next_e = beg;
|
||||
auto i = std::distance(beg, end);
|
||||
std::vector<size_t> offsets(i + 1, last_slice_size);
|
||||
while (i-- > 0)
|
||||
{
|
||||
offsets[i] = *next_e * offsets[i + 1];
|
||||
++next_e;
|
||||
}
|
||||
|
||||
return offsets;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
///
|
||||
/// The implementation finds the maximum-length *slice* of input *params* which might be
|
||||
/// copied to *out* index by index.
|
||||
/// +-------+--------------+-------+
|
||||
/// | batch | indices[:-1] | slice |
|
||||
/// | shape | shape | shape |
|
||||
/// +-------+--------------+-------+
|
||||
///
|
||||
template <typename T, typename U>
|
||||
void gather_nd(const T* params,
|
||||
const U* indices,
|
||||
T* out,
|
||||
void gather_nd(const T* const params,
|
||||
const U* const indices,
|
||||
T* const out,
|
||||
const Shape& params_shape,
|
||||
const Shape& indices_shape,
|
||||
const Shape& out_shape,
|
||||
int batch_dims = 0)
|
||||
const int batch_dims = 0)
|
||||
{
|
||||
using namespace std;
|
||||
if (batch_dims == 0)
|
||||
using std::begin;
|
||||
using std::end;
|
||||
using std::next;
|
||||
using std::prev;
|
||||
const auto rbegin = [](const Shape& s) { // generic since C++14
|
||||
return s.rbegin();
|
||||
};
|
||||
|
||||
const Shape batch_shape(begin(params_shape), next(begin(params_shape), batch_dims));
|
||||
const auto batch_size = shape_size(batch_shape);
|
||||
|
||||
if (batch_dims && batch_size != out_shape.front())
|
||||
{
|
||||
gather_nd_batch(params, indices, out, params_shape, indices_shape, out_shape);
|
||||
return;
|
||||
throw std::domain_error{
|
||||
"out_shape should have on first dim multiplication of batch number of first"
|
||||
"dimensions of shape "};
|
||||
}
|
||||
|
||||
size_t indices_ndim = static_cast<size_t>(indices_shape.size());
|
||||
Coordinate indices_outer_start_corner(indices_ndim, 0);
|
||||
Coordinate indices_outer_end_corner(indices_shape);
|
||||
for (size_t i = batch_dims; i < indices_ndim; i++)
|
||||
if (!std::equal(begin(params_shape),
|
||||
next(begin(params_shape), batch_dims),
|
||||
begin(indices_shape)))
|
||||
{
|
||||
indices_outer_end_corner[i] = 1;
|
||||
throw std::domain_error{
|
||||
"dimensions in params and indices have to be equal on batch dimensions"};
|
||||
}
|
||||
Strides indices_strides(indices_ndim, 1);
|
||||
AxisVector indices_axis_order(indices_ndim);
|
||||
std::iota(indices_axis_order.begin(), indices_axis_order.end(), 0);
|
||||
CoordinateTransform indices_outer_transform(indices_shape,
|
||||
indices_outer_start_corner,
|
||||
indices_outer_end_corner,
|
||||
indices_strides,
|
||||
indices_axis_order);
|
||||
|
||||
size_t params_ndim = static_cast<size_t>(params_shape.size());
|
||||
Coordinate params_outer_start_corner(params_ndim, 0);
|
||||
Coordinate params_outer_end_corner(params_shape);
|
||||
for (size_t i = batch_dims; i < params_ndim; i++)
|
||||
const auto first_slice_index_in_params = batch_dims + indices_shape.back();
|
||||
|
||||
if (!(first_slice_index_in_params <= params_shape.size()))
|
||||
{
|
||||
params_outer_end_corner[i] = 1;
|
||||
throw std::domain_error{
|
||||
"params_shape should have enough rank to be index by indices"};
|
||||
}
|
||||
Strides params_strides(params_ndim, 1);
|
||||
AxisVector params_axis_order(params_ndim);
|
||||
std::iota(params_axis_order.begin(), params_axis_order.end(), 0);
|
||||
CoordinateTransform params_outer_transform(params_shape,
|
||||
params_outer_start_corner,
|
||||
params_outer_end_corner,
|
||||
params_strides,
|
||||
params_axis_order);
|
||||
|
||||
size_t out_ndim = static_cast<size_t>(out_shape.size());
|
||||
Coordinate out_start_corner(out_ndim, 0);
|
||||
Coordinate out_end_corner(out_shape);
|
||||
for (size_t i = 1; i < out_ndim; i++)
|
||||
const auto slice_shape =
|
||||
span(next(begin(params_shape), first_slice_index_in_params), end(params_shape));
|
||||
const auto slice_size = shape_size(slice_shape);
|
||||
|
||||
const auto dims_begin = next(rbegin(params_shape), slice_shape.size());
|
||||
const auto dims_end = next(dims_begin, indices_shape.back() - 1);
|
||||
|
||||
const auto indices_offsets = get_indices_offsets(dims_begin, dims_end, slice_size);
|
||||
|
||||
const auto batch_offset = indices_offsets.front() * params_shape[batch_dims];
|
||||
|
||||
const auto k_1_indices =
|
||||
span(next(begin(indices_shape), batch_dims), prev(end(indices_shape)));
|
||||
|
||||
const auto k_1_params =
|
||||
span(next(begin(params_shape), batch_dims), prev(end(params_shape)));
|
||||
|
||||
const auto number_of_slices_to_copy_in_one_batch = shape_size(k_1_indices);
|
||||
|
||||
const auto coordinates_size = indices_shape.back();
|
||||
|
||||
for (size_t batch = 0; batch != batch_size; ++batch)
|
||||
{
|
||||
out_end_corner[i] = 1;
|
||||
}
|
||||
Strides out_strides(out_ndim, 1);
|
||||
AxisVector out_axis_order(out_ndim);
|
||||
std::iota(out_axis_order.begin(), out_axis_order.end(), 0);
|
||||
CoordinateTransform out_transform(
|
||||
out_shape, out_start_corner, out_end_corner, out_strides, out_axis_order);
|
||||
const auto input_batch_offset = batch * batch_offset;
|
||||
const auto output_batch_offset =
|
||||
batch * number_of_slices_to_copy_in_one_batch * slice_size;
|
||||
const auto coordinates_batch_offset =
|
||||
batch * number_of_slices_to_copy_in_one_batch * coordinates_size;
|
||||
for (size_t slice = 0; slice != number_of_slices_to_copy_in_one_batch; ++slice)
|
||||
{
|
||||
const auto slice_coordinates =
|
||||
next(indices, coordinates_batch_offset + slice * coordinates_size);
|
||||
|
||||
Shape indices_shape_batch(indices_shape.begin() + batch_dims, indices_shape.end());
|
||||
Shape params_shape_batch(params_shape.begin() + batch_dims, params_shape.end());
|
||||
Shape output_shape_batch(out_shape.begin() + 1, out_shape.end());
|
||||
auto out_coord_iter = out_transform.begin();
|
||||
auto params_coord_iter = params_outer_transform.begin();
|
||||
for (const Coordinate& indices_coord : indices_outer_transform)
|
||||
{
|
||||
if (params_coord_iter == params_outer_transform.end() ||
|
||||
out_coord_iter == out_transform.end())
|
||||
break;
|
||||
auto indices_index = indices_outer_transform.index(indices_coord);
|
||||
auto params_index = params_outer_transform.index(*params_coord_iter);
|
||||
auto output_index = out_transform.index(*out_coord_iter);
|
||||
gather_nd_batch(params + params_index,
|
||||
indices + indices_index,
|
||||
out + output_index,
|
||||
params_shape_batch,
|
||||
indices_shape_batch,
|
||||
output_shape_batch);
|
||||
|
||||
out_coord_iter++;
|
||||
params_coord_iter++;
|
||||
size_t input_slice_offset = input_batch_offset;
|
||||
for (size_t c = 0; c != coordinates_size; ++c)
|
||||
{
|
||||
const auto i_c = slice_coordinates[c];
|
||||
const auto index = i_c < 0 ? k_1_params[c] + i_c : i_c;
|
||||
input_slice_offset += index * indices_offsets[c];
|
||||
}
|
||||
const auto output_slice_offset = output_batch_offset + slice * slice_size;
|
||||
std::copy(next(params, input_slice_offset),
|
||||
next(params, input_slice_offset + slice_size),
|
||||
next(out, output_slice_offset));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference
|
||||
} // namespace runtime
|
||||
} // namespace ngraph
|
||||
|
||||
@@ -372,6 +372,32 @@ NGRAPH_TEST(${BACKEND_NAME}, gather_nd_batch_1d_from_3d)
|
||||
MIN_FLOAT_TOLERANCE_BITS));
|
||||
}
|
||||
|
||||
NGRAPH_TEST(${BACKEND_NAME}, gather_nd_batch_1d_from_3d_negative)
|
||||
{
|
||||
Shape params_shape{2, 2, 2};
|
||||
Shape indices_shape{2, 2, 2};
|
||||
Shape out_shape{2, 2, 2};
|
||||
auto P = make_shared<op::Parameter>(element::f32, params_shape);
|
||||
auto I = make_shared<op::Parameter>(element::i32, indices_shape);
|
||||
auto G = make_shared<op::v5::GatherND>(P, I);
|
||||
auto f = make_shared<Function>(G, ParameterVector{P, I});
|
||||
|
||||
auto backend = runtime::Backend::create("${BACKEND_NAME}");
|
||||
|
||||
// Create some tensors for input/output
|
||||
auto p = backend->create_tensor(element::f32, params_shape);
|
||||
copy_data(p, vector<float>{1.0f, 1.1f, 1.2f, 1.3f, 2.0f, 2.1f, 2.2f, 2.3f});
|
||||
auto i = backend->create_tensor(element::i32, indices_shape);
|
||||
copy_data(i, vector<int32_t>{0, -1, -1, 0, 0, 0, 1, 1});
|
||||
auto result = backend->create_tensor(element::f32, out_shape);
|
||||
|
||||
auto c = backend->compile(f);
|
||||
c->call_with_validate({result}, {p, i});
|
||||
EXPECT_TRUE(test::all_close_f((vector<float>{1.2f, 1.3f, 2.0f, 2.1f, 1.0f, 1.1f, 2.2f, 2.3f}),
|
||||
read_vector<float>(result),
|
||||
MIN_FLOAT_TOLERANCE_BITS));
|
||||
}
|
||||
|
||||
NGRAPH_TEST(${BACKEND_NAME}, gather_nd_batch_2d_from_3d)
|
||||
{
|
||||
Shape params_shape{2, 2, 2};
|
||||
|
||||
@@ -1144,6 +1144,9 @@ IE_CPU.nonmaxsuppression_two_classes
|
||||
# Bug in CPU plugin for ROIPooling when pooled size is 1x1 and method is bilinear
|
||||
IE_CPU.roi_pooling_1x1_bilinear
|
||||
|
||||
# output mismatch
|
||||
IE_CPU.gather_nd_batch_1d_from_3d_negative
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
#
|
||||
# Inference Engine GPU plugin excludes
|
||||
|
||||
Reference in New Issue
Block a user