Optimisations for binary operations broadcast. Phase 2. (#1295)

This commit is contained in:
Vladislav Volkov 2020-07-13 06:11:02 +03:00 committed by GitHub
parent 9dedb39cfc
commit bce6ca07df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 137 additions and 140 deletions

View File

@ -14,6 +14,7 @@
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <numeric>
@ -430,18 +431,9 @@ CoordinateIterator::CoordinateIterator(const Shape& target_shape, bool is_end)
{
// The case where we have a zero-length axis is a bit special, in that
// the iterator always starts out of bounds.
m_empty = false;
bool const empty = std::find(target_shape.begin(), target_shape.end(), 0) != target_shape.end();
for (auto s : target_shape)
{
if (s == 0)
{
m_empty = true;
break;
}
}
m_oob = is_end || m_empty;
m_oob = is_end || empty;
}
void CoordinateIterator::operator++()
@ -449,12 +441,14 @@ void CoordinateIterator::operator++()
advance(m_target_shape.size() - 1);
}
void CoordinateIterator::advance(size_t axis) noexcept
size_t CoordinateIterator::advance(size_t axis) noexcept
{
m_oob |= m_target_shape.empty();
if (m_oob)
return;
return m_target_shape.size();
bool carry_out = false;
// Increment the target coordinate.
do
@ -464,17 +458,20 @@ void CoordinateIterator::advance(size_t axis) noexcept
if (m_coordinate[axis] < m_target_shape[axis])
{
// No carry-out, so we are done.
return;
return axis;
}
else
{
m_coordinate[axis] = 0;
carry_out = true;
}
} while (axis-- > 0);
// If we are still here there was carry-out from the most significant axis. We are now out of
// bounds.
m_oob = true;
return m_target_shape.size();
}
CoordinateIterator CoordinateIterator::operator++(int)

View File

@ -60,17 +60,16 @@ namespace ngraph
/// \brief Increments iterator using specified axis of the shape n times.
/// \param axis index used for iteration
void advance(size_t axis) noexcept;
size_t advance(size_t axis) noexcept;
/// \brief Useful function to build the last iterator.
/// Returns a singleton that points to the last iterator.
static const CoordinateIterator& end();
private:
Shape m_target_shape;
const Shape& m_target_shape;
Coordinate m_coordinate;
bool m_oob;
bool m_empty;
};
/// \brief Class which allows to calculate item index with given coordinates in tensor

View File

@ -18,6 +18,7 @@
#include <cstddef>
#include <utility>
#include "ngraph/coordinate_transform.hpp"
#include "ngraph/op/util/attr_types.hpp"
#include "ngraph/shape_util.hpp"
@ -30,50 +31,69 @@ namespace ngraph
{
namespace internal
{
/// \brief Writes the row-major strides of `shape`, right-aligned, into `strides`.
///
/// The last `shape.size()` entries receive the row-major strides of the
/// corresponding axes (the innermost axis gets 1). Every leading entry that has
/// no matching axis is set to the product of all dimensions of `shape`.
///
/// \param shape   shape whose strides are computed
/// \param strides output buffer holding `size` entries
/// \param size    number of entries in `strides`; expected to be at least
///                `shape.size()`. Axes that do not fit are skipped rather than
///                written in front of the buffer.
inline void
    row_major_strides(const Shape& shape, size_t* strides, size_t size) noexcept
{
    // Index-based cursor: unlike a decremented pointer it never has to point
    // before the first element of `strides`.
    size_t slot = size; // one past the next entry to write, walking backwards
    size_t extent = 1;  // product of the dimensions visited so far

    for (auto dim = shape.rbegin(); dim != shape.rend() && slot > 0; ++dim)
    {
        strides[--slot] = extent;
        extent *= *dim;
    }

    // Leading entries without a matching axis all get the accumulated product.
    std::fill(strides, strides + slot, extent);
}
/// \brief Reads an element of `arr` as if `padding` extra entries were
///        prepended to it.
///
/// \param arr           indexable container
/// \param padding       number of virtual leading entries
/// \param idx           index into the padded sequence
/// \param default_value value returned for any index inside the padding
/// \return `default_value` when `idx < padding`, otherwise
///         `arr[idx - padding]` converted to `T`
template <typename C, typename T>
inline T value_with_padding_or(const C& arr,
                               size_t padding,
                               size_t idx,
                               T&& default_value)
{
    // Indices below `padding` fall into the virtual prefix.
    if (idx < padding)
    {
        return std::forward<T>(default_value);
    }
    return arr[idx - padding];
}
template <int A0, int A1, typename T, typename U, typename Functor>
inline void numpy_autobroadcast_binop(const T* arg0,
const T* arg1,
U* out,
const Shape& arg0_shape,
const Shape& arg1_shape,
const Shape& shape0,
const Shape& shape1,
const size_t* strides0,
const size_t* strides1,
const size_t padding0,
const size_t padding1,
const Shape& output_shape,
const size_t stride,
const size_t axis,
const size_t stride,
Functor elementwise_functor)
{
CoordinateTransformBasic arg0_transform(arg0_shape);
CoordinateTransformBasic arg1_transform(arg1_shape);
for (CoordinateIterator it(output_shape), ite = CoordinateIterator::end();
it != ite;
it.advance(axis))
for (CoordinateIterator it(output_shape), ite = CoordinateIterator::end();;)
{
const Coordinate& output_coord = *it;
size_t const idx0 = arg0_transform.index(output_coord);
size_t const idx1 = arg1_transform.index(output_coord);
for (size_t i = 0; i < stride; ++i)
*out++ = elementwise_functor(arg0[idx0 + i * A0], arg1[idx1 + i * A1]);
*out++ = elementwise_functor(arg0[i * A0], arg1[i * A1]);
arg0 += A0 ? stride : 1;
arg1 += A1 ? stride : 1;
auto p = it.advance(axis);
if (it == ite)
break;
if (value_with_padding_or(shape0, padding0, p, 1) == 1)
arg0 -= strides0[p];
if (value_with_padding_or(shape1, padding1, p, 1) == 1)
arg1 -= strides1[p];
}
}
inline void calculate_fixed_idx_and_stride(size_t& arg0_p, // Fixed idx
size_t arg1_p,
size_t& stride,
size_t arg0_shape_padding,
size_t arg1_shape_padding,
const Shape& arg0_shape,
const Shape& arg1_shape)
inline size_t calculate_fixed_axis(size_t axis, const size_t* strides)
{
while ((arg0_p < arg0_shape_padding ||
arg0_shape[arg0_p - arg0_shape_padding] == 1) &&
(arg0_p >= arg1_shape_padding &&
arg1_shape[arg0_p - arg1_shape_padding] != 1) &&
--arg0_p > arg1_p)
;
stride = arg0_p < arg1_shape_padding
? shape_size(arg1_shape)
: row_major_stride(arg1_shape, arg0_p - arg1_shape_padding);
while (axis > 0 && strides[axis - 1] == 1)
--axis;
return axis;
}
}
@ -134,34 +154,37 @@ namespace ngraph
// ------------
// [ 3, 2, 6]
{
using namespace internal;
size_t const shape_rank =
std::max(arg0_shape.size(), arg1_shape.size()) + 1;
size_t const arg0_shape_padding = shape_rank - arg0_shape.size();
size_t const arg1_shape_padding = shape_rank - arg1_shape.size();
// TODO: Use compiler-specific alloca() or variable-length array
std::vector<size_t> tmp(shape_rank * 2);
size_t* strides0 = tmp.data();
size_t* strides1 = tmp.data() + shape_rank;
row_major_strides(arg0_shape, strides0, shape_rank);
row_major_strides(arg1_shape, strides1, shape_rank);
size_t const padding0 = shape_rank - arg0_shape.size();
size_t const padding1 = shape_rank - arg1_shape.size();
Shape output_shape(shape_rank, 0);
size_t arg0_p = 0, arg1_p = 0;
size_t axis = 0;
for (size_t i = 0; i < shape_rank; i++)
{
Shape::value_type arg0_dim =
i < arg0_shape_padding ? 1 : arg0_shape[i - arg0_shape_padding];
Shape::value_type arg1_dim =
i < arg1_shape_padding ? 1 : arg1_shape[i - arg1_shape_padding];
auto const dim0 = value_with_padding_or(arg0_shape, padding0, i, 1);
auto const dim1 = value_with_padding_or(arg1_shape, padding1, i, 1);
output_shape[i] = arg0_dim == 1 ? arg1_dim : arg0_dim;
output_shape[i] = std::max(dim0, dim1);
if (arg0_dim != arg1_dim)
{
if (arg0_dim == 1)
arg0_p = std::max(arg0_p, i);
if (arg1_dim == 1)
arg1_p = std::max(arg1_p, i);
}
if (dim0 != dim1)
axis = std::max(axis, i);
}
#if 0
// Universal function without optimisations
CoordinateTransformBasic arg0_transform(arg0_shape);
@ -179,86 +202,64 @@ namespace ngraph
*dst++ = elementwise_functor(arg0[idx0], arg1[idx1]);
}
#else
using internal::numpy_autobroadcast_binop;
using internal::calculate_fixed_idx_and_stride;
if (arg0_p < arg1_p)
if (axis == 0)
{
size_t stride =
row_major_stride(arg0_shape, arg1_p - arg0_shape_padding);
if (stride > 1)
numpy_autobroadcast_binop<1, 1>(arg0,
arg1,
out,
arg0_shape,
arg1_shape,
output_shape,
stride,
arg1_p,
elementwise_functor);
else
{
calculate_fixed_idx_and_stride(arg1_p,
arg0_p,
stride,
arg1_shape_padding,
arg0_shape_padding,
arg1_shape,
arg0_shape);
numpy_autobroadcast_binop<1, 0>(arg0,
arg1,
out,
arg0_shape,
arg1_shape,
output_shape,
stride,
arg1_p,
elementwise_functor);
}
}
else if (arg0_p > arg1_p)
{
size_t stride =
row_major_stride(arg1_shape, arg0_p - arg1_shape_padding);
if (stride > 1)
numpy_autobroadcast_binop<1, 1>(arg0,
arg1,
out,
arg0_shape,
arg1_shape,
output_shape,
stride,
arg0_p,
elementwise_functor);
else
{
calculate_fixed_idx_and_stride(arg0_p,
arg1_p,
stride,
arg0_shape_padding,
arg1_shape_padding,
arg0_shape,
arg1_shape);
numpy_autobroadcast_binop<0, 1>(arg0,
arg1,
out,
arg0_shape,
arg1_shape,
output_shape,
stride,
arg0_p,
elementwise_functor);
}
}
else
{
for (size_t i = 0, end = shape_size(output_shape); i < end; ++i)
for (size_t i = 0, end = strides0[0]; i < end; ++i)
out[i] = elementwise_functor(arg0[i], arg1[i]);
}
else if (strides0[axis] == 1 &&
value_with_padding_or(arg0_shape, padding0, axis, 1) == 1)
{
axis = calculate_fixed_axis(axis, strides0);
numpy_autobroadcast_binop<0, 1>(arg0,
arg1,
out,
arg0_shape,
arg1_shape,
strides0,
strides1,
padding0,
padding1,
output_shape,
axis,
strides1[axis],
elementwise_functor);
}
else if (strides1[axis] == 1 &&
value_with_padding_or(arg1_shape, padding1, axis, 1) == 1)
{
axis = calculate_fixed_axis(axis, strides1);
numpy_autobroadcast_binop<1, 0>(arg0,
arg1,
out,
arg0_shape,
arg1_shape,
strides0,
strides1,
padding0,
padding1,
output_shape,
axis,
strides0[axis],
elementwise_functor);
}
else
numpy_autobroadcast_binop<1, 1>(arg0,
arg1,
out,
arg0_shape,
arg1_shape,
strides0,
strides1,
padding0,
padding1,
output_shape,
axis,
strides0[axis],
elementwise_functor);
#endif
}
break;