Optimisations for binary operations broadcast. Phase 2. (#1295)
This commit is contained in:
parent
9dedb39cfc
commit
bce6ca07df
@ -14,6 +14,7 @@
|
||||
// limitations under the License.
|
||||
//*****************************************************************************
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
@ -430,18 +431,9 @@ CoordinateIterator::CoordinateIterator(const Shape& target_shape, bool is_end)
|
||||
{
|
||||
// The case where we have a zero-length axis is a bit special, in that
|
||||
// the iterator always starts out of bounds.
|
||||
m_empty = false;
|
||||
bool const empty = std::find(target_shape.begin(), target_shape.end(), 0) != target_shape.end();
|
||||
|
||||
for (auto s : target_shape)
|
||||
{
|
||||
if (s == 0)
|
||||
{
|
||||
m_empty = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
m_oob = is_end || m_empty;
|
||||
m_oob = is_end || empty;
|
||||
}
|
||||
|
||||
void CoordinateIterator::operator++()
|
||||
@ -449,12 +441,14 @@ void CoordinateIterator::operator++()
|
||||
advance(m_target_shape.size() - 1);
|
||||
}
|
||||
|
||||
void CoordinateIterator::advance(size_t axis) noexcept
|
||||
size_t CoordinateIterator::advance(size_t axis) noexcept
|
||||
{
|
||||
m_oob |= m_target_shape.empty();
|
||||
|
||||
if (m_oob)
|
||||
return;
|
||||
return m_target_shape.size();
|
||||
|
||||
bool carry_out = false;
|
||||
|
||||
// Increment the target coordinate.
|
||||
do
|
||||
@ -464,17 +458,20 @@ void CoordinateIterator::advance(size_t axis) noexcept
|
||||
if (m_coordinate[axis] < m_target_shape[axis])
|
||||
{
|
||||
// No carry-out, so we are done.
|
||||
return;
|
||||
return axis;
|
||||
}
|
||||
else
|
||||
{
|
||||
m_coordinate[axis] = 0;
|
||||
carry_out = true;
|
||||
}
|
||||
} while (axis-- > 0);
|
||||
|
||||
// If we are still here there was carry-out from the most significant axis. We are now out of
|
||||
// bounds.
|
||||
m_oob = true;
|
||||
|
||||
return m_target_shape.size();
|
||||
}
|
||||
|
||||
CoordinateIterator CoordinateIterator::operator++(int)
|
||||
|
@ -60,17 +60,16 @@ namespace ngraph
|
||||
|
||||
/// \brief Increments the iterator along the specified axis of the shape, carrying into lower-indexed axes on overflow.
|
||||
/// \param axis index used for iteration
|
||||
void advance(size_t axis) noexcept;
|
||||
size_t advance(size_t axis) noexcept;
|
||||
|
||||
/// \brief Useful function to build the last iterator.
|
||||
/// Returns a singleton that points to the last iterator.
|
||||
static const CoordinateIterator& end();
|
||||
|
||||
private:
|
||||
Shape m_target_shape;
|
||||
const Shape& m_target_shape;
|
||||
Coordinate m_coordinate;
|
||||
bool m_oob;
|
||||
bool m_empty;
|
||||
};
|
||||
|
||||
/// \brief Class which allows calculating an item's index from given coordinates in a tensor
|
||||
|
@ -18,6 +18,7 @@
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#include <type_traits>
#include <utility>
|
||||
#include "ngraph/coordinate_transform.hpp"
|
||||
#include "ngraph/op/util/attr_types.hpp"
|
||||
#include "ngraph/shape_util.hpp"
|
||||
@ -30,50 +31,69 @@ namespace ngraph
|
||||
{
|
||||
namespace internal
|
||||
{
|
||||
inline void
|
||||
row_major_strides(const Shape& shape, size_t* strides, size_t size) noexcept
|
||||
{
|
||||
size_t* st = strides + size - 1;
|
||||
size_t s = 1;
|
||||
for (auto d = shape.rbegin(); d != shape.rend(); d++)
|
||||
{
|
||||
*st-- = s;
|
||||
s *= *d;
|
||||
}
|
||||
std::fill(strides, st + 1, s);
|
||||
}
|
||||
|
||||
/// \brief Reads an element of \p arr as if \p padding extra entries had been
///        put in front of it.
///
/// The result has the element type of \p arr, not the type of
/// \p default_value. Returning the default's type would narrow the element
/// whenever the default is a plain literal such as `1` (an `int`) while the
/// container stores a wider type such as `size_t`.
///
/// \param arr           Indexable container holding the real values.
/// \param padding       Number of virtual leading entries.
/// \param idx           Index into the padded sequence; for idx >= padding the
///                      element arr[idx - padding] is read without a bounds
///                      check.
/// \param default_value Value reported for the virtual leading entries; it is
///                      converted to the element type of \p arr.
/// \return \p default_value for idx < padding, otherwise arr[idx - padding].
template <typename C, typename T>
inline auto value_with_padding_or(const C& arr, size_t padding, size_t idx, T&& default_value) ->
    typename std::decay<decltype(arr[idx])>::type
{
    using element_type = typename std::decay<decltype(arr[idx])>::type;

    if (idx < padding)
    {
        return static_cast<element_type>(std::forward<T>(default_value));
    }
    return arr[idx - padding];
}
|
||||
|
||||
template <int A0, int A1, typename T, typename U, typename Functor>
|
||||
inline void numpy_autobroadcast_binop(const T* arg0,
|
||||
const T* arg1,
|
||||
U* out,
|
||||
const Shape& arg0_shape,
|
||||
const Shape& arg1_shape,
|
||||
const Shape& shape0,
|
||||
const Shape& shape1,
|
||||
const size_t* strides0,
|
||||
const size_t* strides1,
|
||||
const size_t padding0,
|
||||
const size_t padding1,
|
||||
const Shape& output_shape,
|
||||
const size_t stride,
|
||||
const size_t axis,
|
||||
const size_t stride,
|
||||
Functor elementwise_functor)
|
||||
{
|
||||
CoordinateTransformBasic arg0_transform(arg0_shape);
|
||||
CoordinateTransformBasic arg1_transform(arg1_shape);
|
||||
|
||||
for (CoordinateIterator it(output_shape), ite = CoordinateIterator::end();
|
||||
it != ite;
|
||||
it.advance(axis))
|
||||
for (CoordinateIterator it(output_shape), ite = CoordinateIterator::end();;)
|
||||
{
|
||||
const Coordinate& output_coord = *it;
|
||||
size_t const idx0 = arg0_transform.index(output_coord);
|
||||
size_t const idx1 = arg1_transform.index(output_coord);
|
||||
for (size_t i = 0; i < stride; ++i)
|
||||
*out++ = elementwise_functor(arg0[idx0 + i * A0], arg1[idx1 + i * A1]);
|
||||
*out++ = elementwise_functor(arg0[i * A0], arg1[i * A1]);
|
||||
|
||||
arg0 += A0 ? stride : 1;
|
||||
arg1 += A1 ? stride : 1;
|
||||
|
||||
auto p = it.advance(axis);
|
||||
|
||||
if (it == ite)
|
||||
break;
|
||||
|
||||
if (value_with_padding_or(shape0, padding0, p, 1) == 1)
|
||||
arg0 -= strides0[p];
|
||||
|
||||
if (value_with_padding_or(shape1, padding1, p, 1) == 1)
|
||||
arg1 -= strides1[p];
|
||||
}
|
||||
}
|
||||
|
||||
inline void calculate_fixed_idx_and_stride(size_t& arg0_p, // Fixed idx
|
||||
size_t arg1_p,
|
||||
size_t& stride,
|
||||
size_t arg0_shape_padding,
|
||||
size_t arg1_shape_padding,
|
||||
const Shape& arg0_shape,
|
||||
const Shape& arg1_shape)
|
||||
inline size_t calculate_fixed_axis(size_t axis, const size_t* strides)
|
||||
{
|
||||
while ((arg0_p < arg0_shape_padding ||
|
||||
arg0_shape[arg0_p - arg0_shape_padding] == 1) &&
|
||||
(arg0_p >= arg1_shape_padding &&
|
||||
arg1_shape[arg0_p - arg1_shape_padding] != 1) &&
|
||||
--arg0_p > arg1_p)
|
||||
;
|
||||
|
||||
stride = arg0_p < arg1_shape_padding
|
||||
? shape_size(arg1_shape)
|
||||
: row_major_stride(arg1_shape, arg0_p - arg1_shape_padding);
|
||||
while (axis > 0 && strides[axis - 1] == 1)
|
||||
--axis;
|
||||
return axis;
|
||||
}
|
||||
}
|
||||
|
||||
@ -134,34 +154,37 @@ namespace ngraph
|
||||
// ------------
|
||||
// [ 3, 2, 6]
|
||||
{
|
||||
using namespace internal;
|
||||
|
||||
size_t const shape_rank =
|
||||
std::max(arg0_shape.size(), arg1_shape.size()) + 1;
|
||||
size_t const arg0_shape_padding = shape_rank - arg0_shape.size();
|
||||
size_t const arg1_shape_padding = shape_rank - arg1_shape.size();
|
||||
|
||||
// TODO: Use compiler-specific alloca() or variable-length array
|
||||
std::vector<size_t> tmp(shape_rank * 2);
|
||||
|
||||
size_t* strides0 = tmp.data();
|
||||
size_t* strides1 = tmp.data() + shape_rank;
|
||||
|
||||
row_major_strides(arg0_shape, strides0, shape_rank);
|
||||
row_major_strides(arg1_shape, strides1, shape_rank);
|
||||
|
||||
size_t const padding0 = shape_rank - arg0_shape.size();
|
||||
size_t const padding1 = shape_rank - arg1_shape.size();
|
||||
|
||||
Shape output_shape(shape_rank, 0);
|
||||
|
||||
size_t arg0_p = 0, arg1_p = 0;
|
||||
size_t axis = 0;
|
||||
|
||||
for (size_t i = 0; i < shape_rank; i++)
|
||||
{
|
||||
Shape::value_type arg0_dim =
|
||||
i < arg0_shape_padding ? 1 : arg0_shape[i - arg0_shape_padding];
|
||||
Shape::value_type arg1_dim =
|
||||
i < arg1_shape_padding ? 1 : arg1_shape[i - arg1_shape_padding];
|
||||
auto const dim0 = value_with_padding_or(arg0_shape, padding0, i, 1);
|
||||
auto const dim1 = value_with_padding_or(arg1_shape, padding1, i, 1);
|
||||
|
||||
output_shape[i] = arg0_dim == 1 ? arg1_dim : arg0_dim;
|
||||
output_shape[i] = std::max(dim0, dim1);
|
||||
|
||||
if (arg0_dim != arg1_dim)
|
||||
{
|
||||
if (arg0_dim == 1)
|
||||
arg0_p = std::max(arg0_p, i);
|
||||
|
||||
if (arg1_dim == 1)
|
||||
arg1_p = std::max(arg1_p, i);
|
||||
}
|
||||
if (dim0 != dim1)
|
||||
axis = std::max(axis, i);
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Universal function without optimisations
|
||||
CoordinateTransformBasic arg0_transform(arg0_shape);
|
||||
@ -179,86 +202,64 @@ namespace ngraph
|
||||
*dst++ = elementwise_functor(arg0[idx0], arg1[idx1]);
|
||||
}
|
||||
#else
|
||||
using internal::numpy_autobroadcast_binop;
|
||||
using internal::calculate_fixed_idx_and_stride;
|
||||
|
||||
if (arg0_p < arg1_p)
|
||||
if (axis == 0)
|
||||
{
|
||||
size_t stride =
|
||||
row_major_stride(arg0_shape, arg1_p - arg0_shape_padding);
|
||||
|
||||
if (stride > 1)
|
||||
numpy_autobroadcast_binop<1, 1>(arg0,
|
||||
arg1,
|
||||
out,
|
||||
arg0_shape,
|
||||
arg1_shape,
|
||||
output_shape,
|
||||
stride,
|
||||
arg1_p,
|
||||
elementwise_functor);
|
||||
else
|
||||
{
|
||||
calculate_fixed_idx_and_stride(arg1_p,
|
||||
arg0_p,
|
||||
stride,
|
||||
arg1_shape_padding,
|
||||
arg0_shape_padding,
|
||||
arg1_shape,
|
||||
arg0_shape);
|
||||
|
||||
numpy_autobroadcast_binop<1, 0>(arg0,
|
||||
arg1,
|
||||
out,
|
||||
arg0_shape,
|
||||
arg1_shape,
|
||||
output_shape,
|
||||
stride,
|
||||
arg1_p,
|
||||
elementwise_functor);
|
||||
}
|
||||
}
|
||||
else if (arg0_p > arg1_p)
|
||||
{
|
||||
size_t stride =
|
||||
row_major_stride(arg1_shape, arg0_p - arg1_shape_padding);
|
||||
|
||||
if (stride > 1)
|
||||
numpy_autobroadcast_binop<1, 1>(arg0,
|
||||
arg1,
|
||||
out,
|
||||
arg0_shape,
|
||||
arg1_shape,
|
||||
output_shape,
|
||||
stride,
|
||||
arg0_p,
|
||||
elementwise_functor);
|
||||
else
|
||||
{
|
||||
calculate_fixed_idx_and_stride(arg0_p,
|
||||
arg1_p,
|
||||
stride,
|
||||
arg0_shape_padding,
|
||||
arg1_shape_padding,
|
||||
arg0_shape,
|
||||
arg1_shape);
|
||||
|
||||
numpy_autobroadcast_binop<0, 1>(arg0,
|
||||
arg1,
|
||||
out,
|
||||
arg0_shape,
|
||||
arg1_shape,
|
||||
output_shape,
|
||||
stride,
|
||||
arg0_p,
|
||||
elementwise_functor);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (size_t i = 0, end = shape_size(output_shape); i < end; ++i)
|
||||
for (size_t i = 0, end = strides0[0]; i < end; ++i)
|
||||
out[i] = elementwise_functor(arg0[i], arg1[i]);
|
||||
}
|
||||
else if (strides0[axis] == 1 &&
|
||||
value_with_padding_or(arg0_shape, padding0, axis, 1) == 1)
|
||||
{
|
||||
axis = calculate_fixed_axis(axis, strides0);
|
||||
|
||||
numpy_autobroadcast_binop<0, 1>(arg0,
|
||||
arg1,
|
||||
out,
|
||||
arg0_shape,
|
||||
arg1_shape,
|
||||
strides0,
|
||||
strides1,
|
||||
padding0,
|
||||
padding1,
|
||||
output_shape,
|
||||
axis,
|
||||
strides1[axis],
|
||||
elementwise_functor);
|
||||
}
|
||||
else if (strides1[axis] == 1 &&
|
||||
value_with_padding_or(arg1_shape, padding1, axis, 1) == 1)
|
||||
{
|
||||
axis = calculate_fixed_axis(axis, strides1);
|
||||
|
||||
numpy_autobroadcast_binop<1, 0>(arg0,
|
||||
arg1,
|
||||
out,
|
||||
arg0_shape,
|
||||
arg1_shape,
|
||||
strides0,
|
||||
strides1,
|
||||
padding0,
|
||||
padding1,
|
||||
output_shape,
|
||||
axis,
|
||||
strides0[axis],
|
||||
elementwise_functor);
|
||||
}
|
||||
else
|
||||
numpy_autobroadcast_binop<1, 1>(arg0,
|
||||
arg1,
|
||||
out,
|
||||
arg0_shape,
|
||||
arg1_shape,
|
||||
strides0,
|
||||
strides1,
|
||||
padding0,
|
||||
padding1,
|
||||
output_shape,
|
||||
axis,
|
||||
strides0[axis],
|
||||
elementwise_functor);
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
|
Loading…
Reference in New Issue
Block a user