Optimize transpose reference implementation (#18137)

* Optimize transpose reference implementation

Signed-off-by: Mateusz Tabaka <mateusz.tabaka@intel.com>

* different approach for computing input offset

* use std::vector for rev_order

---------

Signed-off-by: Mateusz Tabaka <mateusz.tabaka@intel.com>
This commit is contained in:
Mateusz Tabaka 2023-07-03 19:27:43 +02:00 committed by GitHub
parent 43a278f343
commit 64cecf2c7c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 233 additions and 97 deletions

View File

@ -22,6 +22,8 @@ ie_faster_build(${TARGET_NAME}
UNITY
PCH PRIVATE "src/precomp.hpp")
set_ie_threading_interface_for(${TARGET_NAME})
target_compile_definitions(${TARGET_NAME} PRIVATE XBYAK_NO_OP_NAMES XBYAK64)
if(NOT BUILD_SHARED_LIBS)

View File

@ -9,6 +9,7 @@
#include "ngraph/check.hpp"
#include "ngraph/runtime/reference/reshape.hpp"
#include "openvino/core/parallel.hpp"
using namespace ngraph;
@ -41,58 +42,107 @@ void reshape_in1(const char* in,
}
}
// Element-count limit that scales with the number of threads reported by
// parallel_get_num_threads(): 512 elements per thread.
static size_t get_threshold() {
    // TODO: replace this hard-coded per-thread budget with a better heuristic
    const auto elements_per_thread = 1 << 9;
    return elements_per_thread * parallel_get_num_threads();
}
// Copies a single element of `elem_size` bytes from `in` to `out`.
//
// The common sizes (1, 2, 4 and 8 bytes) are dispatched to memcpy calls whose length is a
// compile-time constant, which optimizing compilers typically lower to one load and one store.
// Unlike dereferencing a reinterpret_cast'ed integer pointer, this places no alignment
// requirement on `in`/`out` and does not access the element through an unrelated type.
// Every other size falls back to a memcpy of `elem_size` bytes.
static inline void copy_element(char* out, const char* in, size_t elem_size) {
    switch (elem_size) {
    case sizeof(int8_t):
        *out = *in;
        break;
    case sizeof(int16_t):
        std::memcpy(out, in, sizeof(int16_t));
        break;
    case sizeof(int32_t):
        std::memcpy(out, in, sizeof(int32_t));
        break;
    case sizeof(int64_t):
        std::memcpy(out, in, sizeof(int64_t));
        break;
    default:
        std::memcpy(out, in, elem_size);
        break;
    }
}
// Copies a rank-2 tensor of `elem_size`-byte elements from `in` to `out` with its axes rearranged.
// NOTE(review): this span comes from a rendered diff without +/- markers. The index-mapping loop
// nest and the threshold-based code that follows it appear to be the removed and the added
// implementation shown back to back, which is why the braces do not balance here — confirm
// against the repository before relying on this text.
void reshape_in2(const char* in,
char* out,
const Shape& in_shape,
const AxisVector& in_axis_order,
const Shape& out_shape,
size_t elem_size) {
// Index-mapping variant: map_index[a] points at the loop counter that supplies the coordinate
// for input axis a, and size[i] is the extent of the i-th loop.
size_t size[2];
size_t in_index[2];
size_t* map_index[2];
for (size_t i = 0; i < 2; i++) {
size[i] = in_shape[in_axis_order[i]];
map_index[in_axis_order[i]] = &in_index[i];
}
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
// clang-format off
memcpy(out,
in + (*map_index[0] * in_shape[1] +
*map_index[1]) * elem_size,
elem_size);
// NOTE(review): the loop nest above is not closed in this view; presumably everything from
// here on is its replacement — confirm.
// Threshold variant: inputs with at most get_threshold() elements use the sequential loops,
// larger ones go through ov::parallel_for2d.
size_t num_elements = shape_size(in_shape);
if (num_elements <= get_threshold()) {
for (size_t i = 0; i < out_shape[0]; i++) {
// off is an element offset into the input: it starts at i and advances by out_shape[0] per
// output column, while `out` advances contiguously.
size_t off = i;
for (size_t j = 0; j < out_shape[1]; j++) {
copy_element(out, in + off * elem_size, elem_size);
out += elem_size;
// clang-format on
off += out_shape[0];
}
}
} else {
// Each (i, j) invocation reads input element j * out_shape[0] + i and writes output element
// i * out_shape[1] + j.
// NOTE(review): neither branch of this variant reads in_axis_order; it presumably relies on
// the order being {1, 0} — confirm with the caller.
ov::parallel_for2d(out_shape[0], out_shape[1], [in, out, &out_shape, elem_size](size_t i, size_t j) {
size_t in_off = j * out_shape[0] + i;
size_t out_off = i * out_shape[1] + j;
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
});
}
}
// Builds the byte strides used to walk the input while iterating in output order.
//
// `order[d]` is the input axis that output axis d reads from. The returned vector is indexed by
// output axis: entry d is the byte distance between consecutive input elements along input axis
// order[d], i.e. elem_size times the product of the input extents that follow that axis.
// `rank` must be at least 1.
static std::vector<size_t> get_strides(size_t rank, size_t elem_size, const AxisVector& order, const Shape& in_shape) {
    // Inverse permutation: for every input axis, the output axis that consumes it.
    std::vector<size_t> out_axis_of(rank);
    for (size_t out_axis = 0; out_axis < rank; ++out_axis) {
        out_axis_of[order[out_axis]] = out_axis;
    }

    std::vector<size_t> byte_strides(rank);
    // Start at the innermost input axis, whose stride is a single element, and grow the stride
    // by one input extent per step while moving outwards.
    size_t in_axis = rank - 1;
    size_t stride = elem_size;
    byte_strides[out_axis_of[in_axis]] = stride;
    while (in_axis > 0) {
        stride *= in_shape[in_axis];
        --in_axis;
        byte_strides[out_axis_of[in_axis]] = stride;
    }
    return byte_strides;
}
// Copies a rank-3 tensor of `elem_size`-byte elements from `in` to `out` with its axes permuted
// according to `in_axis_order`.
// NOTE(review): this span comes from a rendered diff without +/- markers. The index-mapping loop
// nest and the threshold-based code that follows it appear to be the removed and the added
// implementation shown back to back, which is why the braces do not balance here — confirm
// against the repository before relying on this text.
void reshape_in3(const char* in,
char* out,
const Shape& in_shape,
const AxisVector& in_axis_order,
const Shape& out_shape,
size_t elem_size) {
// Index-mapping variant: map_index[a] points at the loop counter that supplies the coordinate
// for input axis a, and size[i] is the extent of the i-th loop.
size_t size[3];
size_t in_index[3];
size_t* map_index[3];
for (size_t i = 0; i < 3; i++) {
size[i] = in_shape[in_axis_order[i]];
map_index[in_axis_order[i]] = &in_index[i];
}
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
// clang-format off
memcpy(out,
in + (*map_index[0] * in_shape[1] * in_shape[2] +
*map_index[1] * in_shape[2] +
*map_index[2]) * elem_size,
elem_size);
// NOTE(review): the loop nest above is not closed in this view; presumably everything from
// here on is its replacement — confirm.
// Threshold variant: inputs with at most get_threshold() elements use the sequential loops,
// larger ones go through ov::parallel_for3d.
size_t num_elements = shape_size(in_shape);
if (num_elements <= get_threshold()) {
// strides[d] is a byte stride (get_strides seeds it with elem_size), so the offsets below are
// added to `in` without further scaling.
const auto strides = get_strides(3, elem_size, in_axis_order, in_shape);
size_t off_0 = 0;
for (size_t i = 0; i < out_shape[0]; i++) {
size_t off_1 = off_0;
for (size_t j = 0; j < out_shape[1]; j++) {
size_t in_off = off_1;
for (size_t k = 0; k < out_shape[2]; k++) {
copy_element(out, in + in_off, elem_size);
out += elem_size;
// clang-format on
in_off += strides[2];
}
off_1 += strides[1];
}
off_0 += strides[0];
}
} else {
// Each (i, j, k) invocation scatters the output coordinates into in_indexes through
// in_axis_order, then forms row-major element offsets for input and output; both offsets are
// scaled by elem_size at the copy.
// NOTE(review): in_axis_order is captured by value, so the closure holds its own copy.
ov::parallel_for3d(out_shape[0],
out_shape[1],
out_shape[2],
[in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i, size_t j, size_t k) {
size_t in_indexes[3];
in_indexes[in_axis_order[0]] = i;
in_indexes[in_axis_order[1]] = j;
in_indexes[in_axis_order[2]] = k;
size_t in_off =
(in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2];
size_t out_off = (i * out_shape[1] + j) * out_shape[2] + k;
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
});
}
}
@ -102,29 +152,46 @@ void reshape_in4(const char* in,
const AxisVector& in_axis_order,
const Shape& out_shape,
size_t elem_size) {
size_t size[4];
size_t in_index[4];
size_t* map_index[4];
for (size_t i = 0; i < 4; i++) {
size[i] = in_shape[in_axis_order[i]];
map_index[in_axis_order[i]] = &in_index[i];
}
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
for (in_index[3] = 0; in_index[3] < size[3]; ++in_index[3]) {
// clang-format off
memcpy(out,
in + (*map_index[0] * in_shape[1] * in_shape[2] * in_shape[3] +
*map_index[1] * in_shape[2] * in_shape[3] +
*map_index[2] * in_shape[3] +
*map_index[3]) * elem_size,
elem_size);
size_t num_elements = shape_size(in_shape);
if (num_elements <= get_threshold()) {
const auto strides = get_strides(4, elem_size, in_axis_order, in_shape);
size_t off_0 = 0;
for (size_t i = 0; i < out_shape[0]; i++) {
size_t off_1 = off_0;
for (size_t j = 0; j < out_shape[1]; j++) {
size_t off_2 = off_1;
for (size_t k = 0; k < out_shape[2]; k++) {
size_t in_off = off_2;
for (size_t l = 0; l < out_shape[3]; l++) {
copy_element(out, in + in_off, elem_size);
out += elem_size;
// clang-format on
in_off += strides[3];
}
off_2 += strides[2];
}
off_1 += strides[1];
}
off_0 += strides[0];
}
} else {
ov::parallel_for4d(
out_shape[0],
out_shape[1],
out_shape[2],
out_shape[3],
[in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i, size_t j, size_t k, size_t l) {
size_t in_indexes[4];
in_indexes[in_axis_order[0]] = i;
in_indexes[in_axis_order[1]] = j;
in_indexes[in_axis_order[2]] = k;
in_indexes[in_axis_order[3]] = l;
size_t in_off =
((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3] +
in_indexes[3];
size_t out_off = ((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3] + l;
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
});
}
}
@ -134,32 +201,58 @@ void reshape_in5(const char* in,
const AxisVector& in_axis_order,
const Shape& out_shape,
size_t elem_size) {
size_t size[5];
size_t in_index[5];
size_t* map_index[5];
for (size_t i = 0; i < 5; i++) {
size[i] = in_shape[in_axis_order[i]];
map_index[in_axis_order[i]] = &in_index[i];
}
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
for (in_index[3] = 0; in_index[3] < size[3]; ++in_index[3]) {
for (in_index[4] = 0; in_index[4] < size[4]; ++in_index[4]) {
// clang-format off
memcpy(out,
in + (*map_index[0] * in_shape[1] * in_shape[2] * in_shape[3] * in_shape[4] +
*map_index[1] * in_shape[2] * in_shape[3] * in_shape[4] +
*map_index[2] * in_shape[3] * in_shape[4] +
*map_index[3] * in_shape[4] +
*map_index[4]) * elem_size,
elem_size);
size_t num_elements = shape_size(in_shape);
if (num_elements <= get_threshold()) {
const auto strides = get_strides(5, elem_size, in_axis_order, in_shape);
size_t off_0 = 0;
for (size_t i = 0; i < out_shape[0]; i++) {
size_t off_1 = off_0;
for (size_t j = 0; j < out_shape[1]; j++) {
size_t off_2 = off_1;
for (size_t k = 0; k < out_shape[2]; k++) {
size_t off_3 = off_2;
for (size_t l = 0; l < out_shape[3]; l++) {
size_t in_off = off_3;
for (size_t m = 0; m < out_shape[4]; m++) {
copy_element(out, in + in_off, elem_size);
out += elem_size;
// clang-format on
in_off += strides[4];
}
off_3 += strides[3];
}
off_2 += strides[2];
}
off_1 += strides[1];
}
off_0 += strides[0];
}
} else {
ov::parallel_for5d(
out_shape[0],
out_shape[1],
out_shape[2],
out_shape[3],
out_shape[4],
[in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i,
size_t j,
size_t k,
size_t l,
size_t m) {
size_t in_indexes[5];
in_indexes[in_axis_order[0]] = i;
in_indexes[in_axis_order[1]] = j;
in_indexes[in_axis_order[2]] = k;
in_indexes[in_axis_order[3]] = l;
in_indexes[in_axis_order[4]] = m;
size_t in_off =
(((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3] +
in_indexes[3]) *
in_shape[4] +
in_indexes[4];
size_t out_off = (((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3] + l) * out_shape[4] + m;
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
});
}
}
@ -169,43 +262,79 @@ void reshape_in6(const char* in,
const AxisVector& in_axis_order,
const Shape& out_shape,
size_t elem_size) {
size_t size[6];
size_t in_index[6];
size_t* map_index[6];
for (size_t i = 0; i < 6; i++) {
size[i] = in_shape[in_axis_order[i]];
map_index[in_axis_order[i]] = &in_index[i];
}
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
for (in_index[3] = 0; in_index[3] < size[3]; ++in_index[3]) {
for (in_index[4] = 0; in_index[4] < size[4]; ++in_index[4]) {
for (in_index[5] = 0; in_index[5] < size[5]; ++in_index[5]) {
// clang-format off
memcpy(out,
in + (*map_index[0] * in_shape[1] * in_shape[2] * in_shape[3] * in_shape[4] * in_shape[5] +
*map_index[1] * in_shape[2] * in_shape[3] * in_shape[4] * in_shape[5] +
*map_index[2] * in_shape[3] * in_shape[4] * in_shape[5] +
*map_index[3] * in_shape[4] * in_shape[5] +
*map_index[4] * in_shape[5] +
*map_index[5]) * elem_size,
elem_size);
size_t num_elements = shape_size(in_shape);
if (num_elements <= get_threshold()) {
const auto strides = get_strides(6, elem_size, in_axis_order, in_shape);
size_t off_0 = 0;
for (size_t i = 0; i < out_shape[0]; i++) {
size_t off_1 = off_0;
for (size_t j = 0; j < out_shape[1]; j++) {
size_t off_2 = off_1;
for (size_t k = 0; k < out_shape[2]; k++) {
size_t off_3 = off_2;
for (size_t l = 0; l < out_shape[3]; l++) {
size_t off_4 = off_3;
for (size_t m = 0; m < out_shape[4]; m++) {
size_t in_off = off_4;
for (size_t n = 0; n < out_shape[5]; n++) {
copy_element(out, in + in_off, elem_size);
out += elem_size;
// clang-format on
in_off += strides[5];
}
off_4 += strides[4];
}
off_3 += strides[3];
}
off_2 += strides[2];
}
off_1 += strides[1];
}
off_0 += strides[0];
}
} else {
ov::parallel_for6d(
out_shape[0],
out_shape[1],
out_shape[2],
out_shape[3],
out_shape[4],
out_shape[5],
[in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i,
size_t j,
size_t k,
size_t l,
size_t m,
size_t n) {
size_t in_indexes[6];
in_indexes[in_axis_order[0]] = i;
in_indexes[in_axis_order[1]] = j;
in_indexes[in_axis_order[2]] = k;
in_indexes[in_axis_order[3]] = l;
in_indexes[in_axis_order[4]] = m;
in_indexes[in_axis_order[5]] = n;
size_t in_off =
((((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3] +
in_indexes[3]) *
in_shape[4] +
in_indexes[4]) *
in_shape[5] +
in_indexes[5];
size_t out_off = ((((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3] + l) * out_shape[4] + m) *
out_shape[5] +
n;
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
});
}
}
bool no_axis_reordering(const AxisVector& axis_order) {
auto tmp = axis_order;
std::sort(begin(tmp), end(tmp));
tmp.erase(std::unique(begin(tmp), end(tmp)), end(tmp));
return tmp == axis_order;
}
} // namespace
void runtime::opt_kernel::reshape(const char* in,
char* out,

View File

@ -45,11 +45,16 @@ TEST_P(ReshapeOptKernel, reshape_opt_kernel) {
const AxisVector axis_order = get_axis_order(p.order, p.input.get_shape().size());
std::vector<ElementValue> output_buff(p.input.get_vector().size());
const auto& in_shape = p.input.get_shape();
Shape out_shape(in_shape.size());
for (size_t i = 0; i < out_shape.size(); i++)
out_shape[i] = in_shape[axis_order[i]];
runtime::opt_kernel::reshape((const char*)p.input.data(),
(char*)output_buff.data(),
p.input.get_shape(),
in_shape,
axis_order,
p.output.get_shape(),
out_shape,
sizeof(ElementValue));
EXPECT_EQ(p.output.get_vector(), output_buff);
}