Optimize transpose reference implementation (#18137)

* Optimize transpose reference implementation

  Signed-off-by: Mateusz Tabaka <mateusz.tabaka@intel.com>

* different approach for computing input offset

* use std::vector for rev_order

---------

Signed-off-by: Mateusz Tabaka <mateusz.tabaka@intel.com>
2023-07-03 19:27:43 +02:00 · 2023-07-03 19:27:43 +02:00 · 64cecf2c7c
commit 64cecf2c7c
parent 43a278f343
3 changed files with 233 additions and 97 deletions
--- a/src/core/reference/CMakeLists.txt
+++ b/src/core/reference/CMakeLists.txt
@ -22,6 +22,8 @@ ie_faster_build(${TARGET_NAME}
    UNITY
    PCH PRIVATE "src/precomp.hpp")

+set_ie_threading_interface_for(${TARGET_NAME})
+
 target_compile_definitions(${TARGET_NAME} PRIVATE XBYAK_NO_OP_NAMES XBYAK64)

 if(NOT BUILD_SHARED_LIBS)
--- a/src/core/reference/src/runtime/opt_kernel/reshape.cpp
+++ b/src/core/reference/src/runtime/opt_kernel/reshape.cpp
@ -9,6 +9,7 @@

 #include "ngraph/check.hpp"
 #include "ngraph/runtime/reference/reshape.hpp"
+#include "openvino/core/parallel.hpp"

 using namespace ngraph;

@ -41,58 +42,107 @@ void reshape_in1(const char* in,
    }
 }

+// Returns the element count up to which the reshape kernels below stay on their serial path;
+// larger inputs are handed to ov::parallel_forNd. The limit scales with the number of threads
+// reported by parallel_get_num_threads().
+static size_t get_threshold() {
+    // TODO: find a better way, not hardcoded value
+    const int elements_per_thread = 1 << 9;
+    return elements_per_thread * parallel_get_num_threads();
+}
+
+// Copies one element of `elem_size` bytes from `in` to `out`.
+//
+// The common element widths (1, 2, 4 and 8 bytes) are dispatched to std::memcpy calls with a
+// compile-time constant size, which compilers typically lower to a single load/store pair.
+// std::memcpy is used instead of a typed access through reinterpret_cast because the buffers
+// are plain `char*`: they are not guaranteed to be aligned for the wider integer types, and
+// they may hold objects of another type (e.g. float), which a typed integer access would alias.
+//
+// @param out        destination, at least `elem_size` writable bytes
+// @param in         source, at least `elem_size` readable bytes; must not overlap `out`
+// @param elem_size  number of bytes to copy
+static inline void copy_element(char* out, const char* in, size_t elem_size) {
+    switch (elem_size) {
+    case 1:
+        *out = *in;
+        break;
+    case 2:
+        std::memcpy(out, in, 2);
+        break;
+    case 4:
+        std::memcpy(out, in, 4);
+        break;
+    case 8:
+        std::memcpy(out, in, 8);
+        break;
+    default:
+        // Uncommon widths: fall back to a runtime-sized copy.
+        std::memcpy(out, in, elem_size);
+        break;
+    }
+}
+
+// Transposes a 2D buffer of `elem_size`-byte elements: output element (i, j) is copied from
+// input element index j * out_shape[0] + i, i.e. (j, i) of a row-major
+// [out_shape[1], out_shape[0]] input.
+// NOTE(review): the new code never reads `in_axis_order` and uses `in_shape` only for the
+// element count, so it always performs the (1, 0) permutation — presumably callers only
+// dispatch here when the two axes are actually swapped; confirm.
 void reshape_in2(const char* in,
                 char* out,
                 const Shape& in_shape,
                 const AxisVector& in_axis_order,
                 const Shape& out_shape,
                 size_t elem_size) {
-    size_t size[2];
-    size_t in_index[2];
-    size_t* map_index[2];
-    for (size_t i = 0; i < 2; i++) {
-        size[i] = in_shape[in_axis_order[i]];
-        map_index[in_axis_order[i]] = &in_index[i];
-    }
-    for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
-        for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
-            // clang-format off
-                memcpy(out,
-                       in + (*map_index[0] * in_shape[1] +
-                             *map_index[1]) * elem_size,
-                       elem_size);
+    size_t num_elements = shape_size(in_shape);
+    if (num_elements <= get_threshold()) {
+        // Serial path: the output is written sequentially; for output row i the input
+        // element index starts at i and advances by out_shape[0] for every output column.
         for (size_t i = 0; i < out_shape[0]; i++) {
+            size_t off = i;
+            for (size_t j = 0; j < out_shape[1]; j++) {
+                copy_element(out, in + off * elem_size, elem_size);
                 out += elem_size;
-            // clang-format on
+                off += out_shape[0];
+            }
        }
+    } else {
+        // Parallel path: every (i, j) pair derives its input and output element offsets from
+        // the indices alone, so each invocation writes a distinct output element.
+        ov::parallel_for2d(out_shape[0], out_shape[1], [in, out, &out_shape, elem_size](size_t i, size_t j) {
+            size_t in_off = j * out_shape[0] + i;
+            size_t out_off = i * out_shape[1] + j;
+            copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
+        });
    }
 }

+// Computes, for every output axis, the byte step through the input buffer that corresponds to
+// incrementing that output index by one.
+//
+// `order[d]` names the input axis that feeds output axis `d`; the returned `strides[d]` is the
+// row-major stride of that input axis for a tensor of shape `in_shape`, already multiplied by
+// `elem_size`. `rank` must be at least 1 (the code indexes `rank - 1`) and `order` is expected
+// to be a permutation of [0, rank).
+static std::vector<size_t> get_strides(size_t rank, size_t elem_size, const AxisVector& order, const Shape& in_shape) {
+    // Invert the permutation: for each input axis, remember which output axis reads from it.
+    std::vector<size_t> out_axis_of(rank);
+    for (size_t out_axis = 0; out_axis < rank; ++out_axis) {
+        out_axis_of[order[out_axis]] = out_axis;
+    }
+
+    // Walk the input axes from innermost to outermost, growing the row-major stride on the way.
+    std::vector<size_t> strides(rank);
+    size_t in_stride = elem_size;
+    strides[out_axis_of[rank - 1]] = in_stride;
+    for (size_t in_axis = rank - 1; in_axis > 0; --in_axis) {
+        in_stride *= in_shape[in_axis];
+        strides[out_axis_of[in_axis - 1]] = in_stride;
+    }
+
+    return strides;
+}
+
+// Permutes the axes of a 3D buffer of `elem_size`-byte elements. Output element (i, j, k) is
+// copied from the input element whose coordinate along input axis in_axis_order[d] equals the
+// d-th output index; both buffers are addressed in row-major order.
 void reshape_in3(const char* in,
                 char* out,
                 const Shape& in_shape,
                 const AxisVector& in_axis_order,
                 const Shape& out_shape,
                 size_t elem_size) {
-    size_t size[3];
-    size_t in_index[3];
-    size_t* map_index[3];
-    for (size_t i = 0; i < 3; i++) {
-        size[i] = in_shape[in_axis_order[i]];
-        map_index[in_axis_order[i]] = &in_index[i];
-    }
-    for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
-        for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
-            for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
-                // clang-format off
-                    memcpy(out,
-                           in + (*map_index[0] * in_shape[1] * in_shape[2] +
-                                 *map_index[1] * in_shape[2] +
-                                 *map_index[2]) * elem_size,
-                           elem_size);
+    size_t num_elements = shape_size(in_shape);
+    if (num_elements <= get_threshold()) {
+        // Serial path: strides[d] is the input byte step for output axis d (get_strides already
+        // folds in elem_size), so input offsets are accumulated per loop level instead of being
+        // recomputed for every element. The output is written sequentially.
+        const auto strides = get_strides(3, elem_size, in_axis_order, in_shape);
+
+        size_t off_0 = 0;
+        for (size_t i = 0; i < out_shape[0]; i++) {
+            size_t off_1 = off_0;
+            for (size_t j = 0; j < out_shape[1]; j++) {
+                size_t in_off = off_1;
+                for (size_t k = 0; k < out_shape[2]; k++) {
+                    copy_element(out, in + in_off, elem_size);
                     out += elem_size;
-                // clang-format on
+                    in_off += strides[2];
+                }
+                off_1 += strides[1];
            }
+            off_0 += strides[0];
        }
+    } else {
+        // Parallel path: each (i, j, k) scatters its indices into input coordinates through
+        // in_axis_order and linearizes them over in_shape; offsets here are in elements and
+        // are scaled by elem_size at the copy.
+        // NOTE(review): in_axis_order is captured by value, which copies the vector into the
+        // closure; a by-reference capture looks sufficient if parallel_for3d completes before
+        // this function returns — confirm.
+        ov::parallel_for3d(out_shape[0],
+                           out_shape[1],
+                           out_shape[2],
+                           [in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i, size_t j, size_t k) {
+                               size_t in_indexes[3];
+                               in_indexes[in_axis_order[0]] = i;
+                               in_indexes[in_axis_order[1]] = j;
+                               in_indexes[in_axis_order[2]] = k;
+                               size_t in_off =
+                                   (in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2];
+                               size_t out_off = (i * out_shape[1] + j) * out_shape[2] + k;
+                               copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
+                           });
    }
 }

@ -102,29 +152,46 @@ void reshape_in4(const char* in,
                 const AxisVector& in_axis_order,
                 const Shape& out_shape,
                 size_t elem_size) {
-    size_t size[4];
-    size_t in_index[4];
-    size_t* map_index[4];
-    for (size_t i = 0; i < 4; i++) {
-        size[i] = in_shape[in_axis_order[i]];
-        map_index[in_axis_order[i]] = &in_index[i];
-    }
-    for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
-        for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
-            for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
-                for (in_index[3] = 0; in_index[3] < size[3]; ++in_index[3]) {
-                    // clang-format off
-                        memcpy(out,
-                               in + (*map_index[0] * in_shape[1] * in_shape[2] * in_shape[3] +
-                                     *map_index[1] * in_shape[2] * in_shape[3] +
-                                     *map_index[2] * in_shape[3] +
-                                     *map_index[3]) * elem_size,
-                               elem_size);
+    size_t num_elements = shape_size(in_shape);
+    if (num_elements <= get_threshold()) {
+        const auto strides = get_strides(4, elem_size, in_axis_order, in_shape);
+
+        size_t off_0 = 0;
+        for (size_t i = 0; i < out_shape[0]; i++) {
+            size_t off_1 = off_0;
+            for (size_t j = 0; j < out_shape[1]; j++) {
+                size_t off_2 = off_1;
+                for (size_t k = 0; k < out_shape[2]; k++) {
+                    size_t in_off = off_2;
+                    for (size_t l = 0; l < out_shape[3]; l++) {
+                        copy_element(out, in + in_off, elem_size);
                        out += elem_size;
-                    // clang-format on
+                        in_off += strides[3];
+                    }
+                    off_2 += strides[2];
                }
+                off_1 += strides[1];
            }
+            off_0 += strides[0];
        }
+    } else {
+        ov::parallel_for4d(
+            out_shape[0],
+            out_shape[1],
+            out_shape[2],
+            out_shape[3],
+            [in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i, size_t j, size_t k, size_t l) {
+                size_t in_indexes[4];
+                in_indexes[in_axis_order[0]] = i;
+                in_indexes[in_axis_order[1]] = j;
+                in_indexes[in_axis_order[2]] = k;
+                in_indexes[in_axis_order[3]] = l;
+                size_t in_off =
+                    ((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3] +
+                    in_indexes[3];
+                size_t out_off = ((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3] + l;
+                copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
+            });
    }
 }

@ -134,32 +201,58 @@ void reshape_in5(const char* in,
                 const AxisVector& in_axis_order,
                 const Shape& out_shape,
                 size_t elem_size) {
-    size_t size[5];
-    size_t in_index[5];
-    size_t* map_index[5];
-    for (size_t i = 0; i < 5; i++) {
-        size[i] = in_shape[in_axis_order[i]];
-        map_index[in_axis_order[i]] = &in_index[i];
-    }
-    for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
-        for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
-            for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
-                for (in_index[3] = 0; in_index[3] < size[3]; ++in_index[3]) {
-                    for (in_index[4] = 0; in_index[4] < size[4]; ++in_index[4]) {
-                        // clang-format off
-                            memcpy(out,
-                                   in + (*map_index[0] * in_shape[1] * in_shape[2] * in_shape[3] * in_shape[4] +
-                                         *map_index[1] * in_shape[2] * in_shape[3] * in_shape[4] +
-                                         *map_index[2] * in_shape[3] * in_shape[4] +
-                                         *map_index[3] * in_shape[4] +
-                                         *map_index[4]) * elem_size,
-                                   elem_size);
+    size_t num_elements = shape_size(in_shape);
+    if (num_elements <= get_threshold()) {
+        const auto strides = get_strides(5, elem_size, in_axis_order, in_shape);
+
+        size_t off_0 = 0;
+        for (size_t i = 0; i < out_shape[0]; i++) {
+            size_t off_1 = off_0;
+            for (size_t j = 0; j < out_shape[1]; j++) {
+                size_t off_2 = off_1;
+                for (size_t k = 0; k < out_shape[2]; k++) {
+                    size_t off_3 = off_2;
+                    for (size_t l = 0; l < out_shape[3]; l++) {
+                        size_t in_off = off_3;
+                        for (size_t m = 0; m < out_shape[4]; m++) {
+                            copy_element(out, in + in_off, elem_size);
                            out += elem_size;
-                        // clang-format on
+                            in_off += strides[4];
+                        }
+                        off_3 += strides[3];
                    }
+                    off_2 += strides[2];
                }
+                off_1 += strides[1];
            }
+            off_0 += strides[0];
        }
+    } else {
+        ov::parallel_for5d(
+            out_shape[0],
+            out_shape[1],
+            out_shape[2],
+            out_shape[3],
+            out_shape[4],
+            [in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i,
+                                                                       size_t j,
+                                                                       size_t k,
+                                                                       size_t l,
+                                                                       size_t m) {
+                size_t in_indexes[5];
+                in_indexes[in_axis_order[0]] = i;
+                in_indexes[in_axis_order[1]] = j;
+                in_indexes[in_axis_order[2]] = k;
+                in_indexes[in_axis_order[3]] = l;
+                in_indexes[in_axis_order[4]] = m;
+                size_t in_off =
+                    (((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3] +
+                     in_indexes[3]) *
+                        in_shape[4] +
+                    in_indexes[4];
+                size_t out_off = (((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3] + l) * out_shape[4] + m;
+                copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
+            });
    }
 }

@ -169,43 +262,79 @@ void reshape_in6(const char* in,
                 const AxisVector& in_axis_order,
                 const Shape& out_shape,
                 size_t elem_size) {
-    size_t size[6];
-    size_t in_index[6];
-    size_t* map_index[6];
-    for (size_t i = 0; i < 6; i++) {
-        size[i] = in_shape[in_axis_order[i]];
-        map_index[in_axis_order[i]] = &in_index[i];
-    }
-    for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
-        for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
-            for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
-                for (in_index[3] = 0; in_index[3] < size[3]; ++in_index[3]) {
-                    for (in_index[4] = 0; in_index[4] < size[4]; ++in_index[4]) {
-                        for (in_index[5] = 0; in_index[5] < size[5]; ++in_index[5]) {
-                            // clang-format off
-                                memcpy(out,
-                                       in + (*map_index[0] * in_shape[1] * in_shape[2] * in_shape[3] * in_shape[4] * in_shape[5] +
-                                             *map_index[1] * in_shape[2] * in_shape[3] * in_shape[4] * in_shape[5] +
-                                             *map_index[2] * in_shape[3] * in_shape[4] * in_shape[5] +
-                                             *map_index[3] * in_shape[4] * in_shape[5] +
-                                             *map_index[4] * in_shape[5] +
-                                             *map_index[5]) * elem_size,
-                                       elem_size);
+    size_t num_elements = shape_size(in_shape);
+    if (num_elements <= get_threshold()) {
+        const auto strides = get_strides(6, elem_size, in_axis_order, in_shape);
+
+        size_t off_0 = 0;
+        for (size_t i = 0; i < out_shape[0]; i++) {
+            size_t off_1 = off_0;
+            for (size_t j = 0; j < out_shape[1]; j++) {
+                size_t off_2 = off_1;
+                for (size_t k = 0; k < out_shape[2]; k++) {
+                    size_t off_3 = off_2;
+                    for (size_t l = 0; l < out_shape[3]; l++) {
+                        size_t off_4 = off_3;
+                        for (size_t m = 0; m < out_shape[4]; m++) {
+                            size_t in_off = off_4;
+                            for (size_t n = 0; n < out_shape[5]; n++) {
+                                copy_element(out, in + in_off, elem_size);
                                out += elem_size;
-                            // clang-format on
+                                in_off += strides[5];
+                            }
+                            off_4 += strides[4];
                        }
+                        off_3 += strides[3];
                    }
+                    off_2 += strides[2];
                }
+                off_1 += strides[1];
            }
+            off_0 += strides[0];
        }
+    } else {
+        ov::parallel_for6d(
+            out_shape[0],
+            out_shape[1],
+            out_shape[2],
+            out_shape[3],
+            out_shape[4],
+            out_shape[5],
+            [in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i,
+                                                                       size_t j,
+                                                                       size_t k,
+                                                                       size_t l,
+                                                                       size_t m,
+                                                                       size_t n) {
+                size_t in_indexes[6];
+                in_indexes[in_axis_order[0]] = i;
+                in_indexes[in_axis_order[1]] = j;
+                in_indexes[in_axis_order[2]] = k;
+                in_indexes[in_axis_order[3]] = l;
+                in_indexes[in_axis_order[4]] = m;
+                in_indexes[in_axis_order[5]] = n;
+                size_t in_off =
+                    ((((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3] +
+                      in_indexes[3]) *
+                         in_shape[4] +
+                     in_indexes[4]) *
+                        in_shape[5] +
+                    in_indexes[5];
+                size_t out_off = ((((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3] + l) * out_shape[4] + m) *
+                                     out_shape[5] +
+                                 n;
+                copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
+            });
    }
 }
+
 // Returns true when `axis_order` is strictly increasing, i.e. it is already sorted and holds
 // no repeated axis; an empty or single-element order also yields true.
 bool no_axis_reordering(const AxisVector& axis_order) {
    // A neighbouring pair that fails to increase means the order is unsorted or has a duplicate.
    for (size_t pos = 1; pos < axis_order.size(); ++pos) {
        if (axis_order[pos - 1] >= axis_order[pos]) {
            return false;
        }
    }
    return true;
 }
+
 }  // namespace
 void runtime::opt_kernel::reshape(const char* in,
                                  char* out,
--- a/src/core/tests/reshape_opt_kernel.cpp
+++ b/src/core/tests/reshape_opt_kernel.cpp
@ -45,11 +45,16 @@ TEST_P(ReshapeOptKernel, reshape_opt_kernel) {
    const AxisVector axis_order = get_axis_order(p.order, p.input.get_shape().size());
    std::vector<ElementValue> output_buff(p.input.get_vector().size());

+    const auto& in_shape = p.input.get_shape();
+    Shape out_shape(in_shape.size());
+    for (size_t i = 0; i < out_shape.size(); i++)
+        out_shape[i] = in_shape[axis_order[i]];
+
    runtime::opt_kernel::reshape((const char*)p.input.data(),
                                 (char*)output_buff.data(),
-                                 p.input.get_shape(),
+                                 in_shape,
                                 axis_order,
-                                 p.output.get_shape(),
+                                 out_shape,
                                 sizeof(ElementValue));
    EXPECT_EQ(p.output.get_vector(), output_buff);
 }