Optimisations for binary operations broadcast. Phase 2. (#1295)

2020-07-13 06:11:02 +03:00 · 2020-07-13 06:11:02 +03:00 · bce6ca07df
commit bce6ca07df
parent 9dedb39cfc
3 changed files with 137 additions and 140 deletions
--- a/ngraph/src/ngraph/coordinate_transform.cpp
+++ b/ngraph/src/ngraph/coordinate_transform.cpp
@ -14,6 +14,7 @@
 // limitations under the License.
 //*****************************************************************************

+#include <algorithm>
 #include <cstdio>
 #include <iostream>
 #include <numeric>
@ -430,18 +431,9 @@ CoordinateIterator::CoordinateIterator(const Shape& target_shape, bool is_end)
 {
    // The case where we have a zero-length axis is a bit special, in that
    // the iterator always starts out of bounds.
-    m_empty = false;
+    bool const empty = std::find(target_shape.begin(), target_shape.end(), 0) != target_shape.end();

-    for (auto s : target_shape)
-    {
-        if (s == 0)
-        {
-            m_empty = true;
-            break;
-        }
-    }
-
-    m_oob = is_end || m_empty;
+    m_oob = is_end || empty;
 }

 void CoordinateIterator::operator++()
@ -449,12 +441,14 @@ void CoordinateIterator::operator++()
    advance(m_target_shape.size() - 1);
 }

-void CoordinateIterator::advance(size_t axis) noexcept
+size_t CoordinateIterator::advance(size_t axis) noexcept
 {
    m_oob |= m_target_shape.empty();

    if (m_oob)
-        return;
+        return m_target_shape.size();
+
+    bool carry_out = false;

    // Increment the target coordinate.
    do
@ -464,17 +458,20 @@ void CoordinateIterator::advance(size_t axis) noexcept
        if (m_coordinate[axis] < m_target_shape[axis])
        {
            // No carry-out, so we are done.
-            return;
+            return axis;
        }
        else
        {
            m_coordinate[axis] = 0;
+            carry_out = true;
        }
    } while (axis-- > 0);

    // If we are still here there was carry-out from the most significant axis. We are now out of
    // bounds.
    m_oob = true;
+
+    return m_target_shape.size();
 }

 CoordinateIterator CoordinateIterator::operator++(int)
--- a/ngraph/src/ngraph/coordinate_transform.hpp
+++ b/ngraph/src/ngraph/coordinate_transform.hpp
@ -60,17 +60,16 @@ namespace ngraph

         /// \brief Increments iterator using specified axis of the shape n times.
         /// \param axis index used for iteration
-        void advance(size_t axis) noexcept;
+        /// \return Index of the axis on which the increment finished without carry-out, or
+        ///         the rank of the target shape when the iterator already was, or has just
+        ///         become, out of bounds.
+        size_t advance(size_t axis) noexcept;

        /// \brief Useful function to build the last iterator.
        ///        Returns a singleton that points to the last iterator.
        static const CoordinateIterator& end();

    private:
-        Shape m_target_shape;
+        // NOTE(review): the shape is now held by reference instead of being copied, so the
+        // referenced Shape has to outlive this iterator (presumably it is the constructor
+        // argument - confirm against the initializer list). A reference member also means
+        // the implicitly-declared copy assignment operator is deleted.
+        const Shape& m_target_shape;
        Coordinate m_coordinate;
        bool m_oob;
-        bool m_empty;
    };

    /// \brief Class which allows to calculate item index with given coordinates in tensor
--- a/ngraph/src/ngraph/runtime/reference/autobroadcast_binop.hpp
+++ b/ngraph/src/ngraph/runtime/reference/autobroadcast_binop.hpp
@ -18,6 +18,7 @@

 #include <cstddef>

+#include <utility>
 #include "ngraph/coordinate_transform.hpp"
 #include "ngraph/op/util/attr_types.hpp"
 #include "ngraph/shape_util.hpp"
@ -30,50 +31,69 @@ namespace ngraph
        {
            namespace internal
            {
+                /// \brief Computes the row-major strides of \p shape, right-aligned in a buffer
+                ///        of \p size entries.
+                ///
+                /// The last shape.size() entries receive the row-major strides (each one is the
+                /// product of all dimensions to its right). Every leading entry that has no
+                /// matching dimension is set to the product of all dimensions of \p shape.
+                /// \param shape   Shape to compute the strides for.
+                /// \param strides Output buffer with room for \p size entries.
+                /// \param size    Number of entries in \p strides. Dimensions that do not fit
+                ///                (shape.size() > size) are ignored rather than written in front
+                ///                of the buffer.
+                inline void
+                    row_major_strides(const Shape& shape, size_t* strides, size_t size) noexcept
+                {
+                    // Start one past the end and pre-decrement, so that no pointer before
+                    // `strides` is ever formed, even when size == 0 or shape.size() >= size.
+                    size_t* st = strides + size;
+                    size_t s = 1;
+                    for (auto d = shape.rbegin(); d != shape.rend() && st != strides; ++d)
+                    {
+                        *--st = s;
+                        s *= *d;
+                    }
+                    // Remaining leading (padding) entries all get the total element count.
+                    std::fill(strides, st, s);
+                }
+
+                /// \brief Reads element \p idx of a container that is logically left-padded with
+                ///        \p padding entries.
+                /// \param arr           Container holding the real (non-padding) elements.
+                /// \param padding       Number of logical entries placed in front of \p arr.
+                /// \param idx           Index into the padded sequence.
+                /// \param default_value Value reported for the padding entries (idx < padding).
+                /// \return \p default_value converted to the container's element type when
+                ///         \p idx lies in the padding, otherwise arr[idx - padding].
+                ///
+                /// The result uses the container's value_type rather than the type of
+                /// \p default_value, so passing a literal such as `1` does not narrow the
+                /// stored elements to `int`.
+                template <typename C, typename T>
+                inline typename C::value_type value_with_padding_or(const C& arr,
+                                                                    size_t padding,
+                                                                    size_t idx,
+                                                                    T&& default_value)
+                {
+                    using value_type = typename C::value_type;
+                    return idx < padding ? static_cast<value_type>(std::forward<T>(default_value))
+                                         : arr[idx - padding];
+                }
+
                template <int A0, int A1, typename T, typename U, typename Functor>
                inline void numpy_autobroadcast_binop(const T* arg0,
                                                      const T* arg1,
                                                      U* out,
-                                                      const Shape& arg0_shape,
-                                                      const Shape& arg1_shape,
+                                                      const Shape& shape0,
+                                                      const Shape& shape1,
+                                                      const size_t* strides0,
+                                                      const size_t* strides1,
+                                                      const size_t padding0,
+                                                      const size_t padding1,
                                                      const Shape& output_shape,
-                                                      const size_t stride,
                                                      const size_t axis,
+                                                      const size_t stride,
                                                      Functor elementwise_functor)
                {
-                    CoordinateTransformBasic arg0_transform(arg0_shape);
-                    CoordinateTransformBasic arg1_transform(arg1_shape);
-
-                    for (CoordinateIterator it(output_shape), ite = CoordinateIterator::end();
-                         it != ite;
-                         it.advance(axis))
+                    for (CoordinateIterator it(output_shape), ite = CoordinateIterator::end();;)
                    {
-                        const Coordinate& output_coord = *it;
-                        size_t const idx0 = arg0_transform.index(output_coord);
-                        size_t const idx1 = arg1_transform.index(output_coord);
                        for (size_t i = 0; i < stride; ++i)
-                            *out++ = elementwise_functor(arg0[idx0 + i * A0], arg1[idx1 + i * A1]);
+                            *out++ = elementwise_functor(arg0[i * A0], arg1[i * A1]);
+
+                        arg0 += A0 ? stride : 1;
+                        arg1 += A1 ? stride : 1;
+
+                        auto p = it.advance(axis);
+
+                        if (it == ite)
+                            break;
+
+                        if (value_with_padding_or(shape0, padding0, p, 1) == 1)
+                            arg0 -= strides0[p];
+
+                        if (value_with_padding_or(shape1, padding1, p, 1) == 1)
+                            arg1 -= strides1[p];
                    }
                }

-                inline void calculate_fixed_idx_and_stride(size_t& arg0_p, // Fixed idx
-                                                           size_t arg1_p,
-                                                           size_t& stride,
-                                                           size_t arg0_shape_padding,
-                                                           size_t arg1_shape_padding,
-                                                           const Shape& arg0_shape,
-                                                           const Shape& arg1_shape)
+                /// \brief Moves \p axis towards index 0 for as long as the stride stored just
+                ///        before it equals 1.
+                /// \param axis    Axis index to start from.
+                /// \param strides Stride array; entries below \p axis are inspected.
+                /// \return The lowest axis reached: either 0, or the first axis whose preceding
+                ///         stride differs from 1.
+                inline size_t calculate_fixed_axis(size_t axis, const size_t* strides)
                 {
-                    while ((arg0_p < arg0_shape_padding ||
-                            arg0_shape[arg0_p - arg0_shape_padding] == 1) &&
-                           (arg0_p >= arg1_shape_padding &&
-                            arg1_shape[arg0_p - arg1_shape_padding] != 1) &&
-                           --arg0_p > arg1_p)
-                        ;
-
-                    stride = arg0_p < arg1_shape_padding
-                                 ? shape_size(arg1_shape)
-                                 : row_major_stride(arg1_shape, arg0_p - arg1_shape_padding);
+                    for (; axis > 0; --axis)
+                    {
+                        // Stop at the first preceding stride that is not 1.
+                        if (strides[axis - 1] != 1)
+                            break;
+                    }
+                    return axis;
                 }
            }

@ -134,34 +154,37 @@ namespace ngraph
                    //                 ------------
                    //                 [ 3, 2, 6]
                    {
+                        using namespace internal;
+
                        size_t const shape_rank =
                            std::max(arg0_shape.size(), arg1_shape.size()) + 1;
-                        size_t const arg0_shape_padding = shape_rank - arg0_shape.size();
-                        size_t const arg1_shape_padding = shape_rank - arg1_shape.size();
+
+                        // TODO: Use compiler-specific alloca() or variable-length array
+                        std::vector<size_t> tmp(shape_rank * 2);
+
+                        size_t* strides0 = tmp.data();
+                        size_t* strides1 = tmp.data() + shape_rank;
+
+                        row_major_strides(arg0_shape, strides0, shape_rank);
+                        row_major_strides(arg1_shape, strides1, shape_rank);
+
+                        size_t const padding0 = shape_rank - arg0_shape.size();
+                        size_t const padding1 = shape_rank - arg1_shape.size();

                        Shape output_shape(shape_rank, 0);

-                        size_t arg0_p = 0, arg1_p = 0;
+                        size_t axis = 0;

                        for (size_t i = 0; i < shape_rank; i++)
                        {
-                            Shape::value_type arg0_dim =
-                                i < arg0_shape_padding ? 1 : arg0_shape[i - arg0_shape_padding];
-                            Shape::value_type arg1_dim =
-                                i < arg1_shape_padding ? 1 : arg1_shape[i - arg1_shape_padding];
+                            auto const dim0 = value_with_padding_or(arg0_shape, padding0, i, 1);
+                            auto const dim1 = value_with_padding_or(arg1_shape, padding1, i, 1);

-                            output_shape[i] = arg0_dim == 1 ? arg1_dim : arg0_dim;
+                            output_shape[i] = std::max(dim0, dim1);

-                            if (arg0_dim != arg1_dim)
-                            {
-                                if (arg0_dim == 1)
-                                    arg0_p = std::max(arg0_p, i);
-
-                                if (arg1_dim == 1)
-                                    arg1_p = std::max(arg1_p, i);
-                            }
+                            if (dim0 != dim1)
+                                axis = std::max(axis, i);
                        }
-
 #if 0
                        // Universal function without optimisations
                        CoordinateTransformBasic arg0_transform(arg0_shape);
@ -179,86 +202,64 @@ namespace ngraph
                            *dst++ = elementwise_functor(arg0[idx0], arg1[idx1]);
                        }
 #else
-                        using internal::numpy_autobroadcast_binop;
-                        using internal::calculate_fixed_idx_and_stride;

-                        if (arg0_p < arg1_p)
+                        if (axis == 0)
                        {
-                            size_t stride =
-                                row_major_stride(arg0_shape, arg1_p - arg0_shape_padding);
-
-                            if (stride > 1)
-                                numpy_autobroadcast_binop<1, 1>(arg0,
-                                                                arg1,
-                                                                out,
-                                                                arg0_shape,
-                                                                arg1_shape,
-                                                                output_shape,
-                                                                stride,
-                                                                arg1_p,
-                                                                elementwise_functor);
-                            else
-                            {
-                                calculate_fixed_idx_and_stride(arg1_p,
-                                                               arg0_p,
-                                                               stride,
-                                                               arg1_shape_padding,
-                                                               arg0_shape_padding,
-                                                               arg1_shape,
-                                                               arg0_shape);
-
-                                numpy_autobroadcast_binop<1, 0>(arg0,
-                                                                arg1,
-                                                                out,
-                                                                arg0_shape,
-                                                                arg1_shape,
-                                                                output_shape,
-                                                                stride,
-                                                                arg1_p,
-                                                                elementwise_functor);
-                            }
-                        }
-                        else if (arg0_p > arg1_p)
-                        {
-                            size_t stride =
-                                row_major_stride(arg1_shape, arg0_p - arg1_shape_padding);
-
-                            if (stride > 1)
-                                numpy_autobroadcast_binop<1, 1>(arg0,
-                                                                arg1,
-                                                                out,
-                                                                arg0_shape,
-                                                                arg1_shape,
-                                                                output_shape,
-                                                                stride,
-                                                                arg0_p,
-                                                                elementwise_functor);
-                            else
-                            {
-                                calculate_fixed_idx_and_stride(arg0_p,
-                                                               arg1_p,
-                                                               stride,
-                                                               arg0_shape_padding,
-                                                               arg1_shape_padding,
-                                                               arg0_shape,
-                                                               arg1_shape);
-
-                                numpy_autobroadcast_binop<0, 1>(arg0,
-                                                                arg1,
-                                                                out,
-                                                                arg0_shape,
-                                                                arg1_shape,
-                                                                output_shape,
-                                                                stride,
-                                                                arg0_p,
-                                                                elementwise_functor);
-                            }
-                        }
-                        else
-                        {
-                            for (size_t i = 0, end = shape_size(output_shape); i < end; ++i)
+                            for (size_t i = 0, end = strides0[0]; i < end; ++i)
                                out[i] = elementwise_functor(arg0[i], arg1[i]);
                        }
+                        else if (strides0[axis] == 1 &&
+                                 value_with_padding_or(arg0_shape, padding0, axis, 1) == 1)
+                        {
+                            axis = calculate_fixed_axis(axis, strides0);
+
+                            numpy_autobroadcast_binop<0, 1>(arg0,
+                                                            arg1,
+                                                            out,
+                                                            arg0_shape,
+                                                            arg1_shape,
+                                                            strides0,
+                                                            strides1,
+                                                            padding0,
+                                                            padding1,
+                                                            output_shape,
+                                                            axis,
+                                                            strides1[axis],
+                                                            elementwise_functor);
+                        }
+                        else if (strides1[axis] == 1 &&
+                                 value_with_padding_or(arg1_shape, padding1, axis, 1) == 1)
+                        {
+                            axis = calculate_fixed_axis(axis, strides1);
+
+                            numpy_autobroadcast_binop<1, 0>(arg0,
+                                                            arg1,
+                                                            out,
+                                                            arg0_shape,
+                                                            arg1_shape,
+                                                            strides0,
+                                                            strides1,
+                                                            padding0,
+                                                            padding1,
+                                                            output_shape,
+                                                            axis,
+                                                            strides0[axis],
+                                                            elementwise_functor);
+                        }
+                        else
+                            numpy_autobroadcast_binop<1, 1>(arg0,
+                                                            arg1,
+                                                            out,
+                                                            arg0_shape,
+                                                            arg1_shape,
+                                                            strides0,
+                                                            strides1,
+                                                            padding0,
+                                                            padding1,
+                                                            output_shape,
+                                                            axis,
+                                                            strides0[axis],
+                                                            elementwise_functor);
 #endif
                    }
                    break;