Optimize transpose reference implementation (#18137)
* Optimize transpose reference implementation Signed-off-by: Mateusz Tabaka <mateusz.tabaka@intel.com> * different approach for computing input offset * use std::vector for rev_order --------- Signed-off-by: Mateusz Tabaka <mateusz.tabaka@intel.com>
This commit is contained in:
parent
43a278f343
commit
64cecf2c7c
@ -22,6 +22,8 @@ ie_faster_build(${TARGET_NAME}
|
||||
UNITY
|
||||
PCH PRIVATE "src/precomp.hpp")
|
||||
|
||||
set_ie_threading_interface_for(${TARGET_NAME})
|
||||
|
||||
target_compile_definitions(${TARGET_NAME} PRIVATE XBYAK_NO_OP_NAMES XBYAK64)
|
||||
|
||||
if(NOT BUILD_SHARED_LIBS)
|
||||
|
@ -9,6 +9,7 @@
|
||||
|
||||
#include "ngraph/check.hpp"
|
||||
#include "ngraph/runtime/reference/reshape.hpp"
|
||||
#include "openvino/core/parallel.hpp"
|
||||
|
||||
using namespace ngraph;
|
||||
|
||||
@ -41,58 +42,107 @@ void reshape_in1(const char* in,
|
||||
}
|
||||
}
|
||||
|
||||
static size_t get_threshold() {
|
||||
// TODO: find a better way, not hardcoded value
|
||||
return (1 << 9) * parallel_get_num_threads();
|
||||
}
|
||||
|
||||
static inline void copy_element(char* out, const char* in, size_t elem_size) {
|
||||
#define CASE(type) \
|
||||
case sizeof(type): \
|
||||
*reinterpret_cast<type*>(out) = *reinterpret_cast<const type*>(in); \
|
||||
break;
|
||||
|
||||
switch (elem_size) {
|
||||
CASE(int32_t)
|
||||
CASE(int64_t)
|
||||
CASE(int16_t)
|
||||
CASE(int8_t)
|
||||
default:
|
||||
std::memcpy(out, in, elem_size);
|
||||
break;
|
||||
}
|
||||
#undef CASE
|
||||
}
|
||||
|
||||
void reshape_in2(const char* in,
|
||||
char* out,
|
||||
const Shape& in_shape,
|
||||
const AxisVector& in_axis_order,
|
||||
const Shape& out_shape,
|
||||
size_t elem_size) {
|
||||
size_t size[2];
|
||||
size_t in_index[2];
|
||||
size_t* map_index[2];
|
||||
for (size_t i = 0; i < 2; i++) {
|
||||
size[i] = in_shape[in_axis_order[i]];
|
||||
map_index[in_axis_order[i]] = &in_index[i];
|
||||
}
|
||||
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
|
||||
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
|
||||
// clang-format off
|
||||
memcpy(out,
|
||||
in + (*map_index[0] * in_shape[1] +
|
||||
*map_index[1]) * elem_size,
|
||||
elem_size);
|
||||
size_t num_elements = shape_size(in_shape);
|
||||
if (num_elements <= get_threshold()) {
|
||||
for (size_t i = 0; i < out_shape[0]; i++) {
|
||||
size_t off = i;
|
||||
for (size_t j = 0; j < out_shape[1]; j++) {
|
||||
copy_element(out, in + off * elem_size, elem_size);
|
||||
out += elem_size;
|
||||
// clang-format on
|
||||
off += out_shape[0];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ov::parallel_for2d(out_shape[0], out_shape[1], [in, out, &out_shape, elem_size](size_t i, size_t j) {
|
||||
size_t in_off = j * out_shape[0] + i;
|
||||
size_t out_off = i * out_shape[1] + j;
|
||||
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static std::vector<size_t> get_strides(size_t rank, size_t elem_size, const AxisVector& order, const Shape& in_shape) {
|
||||
std::vector<size_t> rev_order(rank);
|
||||
for (size_t i = 0; i < rank; i++) {
|
||||
rev_order[order[i]] = i;
|
||||
}
|
||||
|
||||
std::vector<size_t> strides(rank);
|
||||
strides[rev_order[rank - 1]] = elem_size;
|
||||
for (size_t i = rank - 1; i > 0; i--) {
|
||||
strides[rev_order[i - 1]] = strides[rev_order[i]] * in_shape[i];
|
||||
}
|
||||
|
||||
return strides;
|
||||
}
|
||||
|
||||
void reshape_in3(const char* in,
|
||||
char* out,
|
||||
const Shape& in_shape,
|
||||
const AxisVector& in_axis_order,
|
||||
const Shape& out_shape,
|
||||
size_t elem_size) {
|
||||
size_t size[3];
|
||||
size_t in_index[3];
|
||||
size_t* map_index[3];
|
||||
for (size_t i = 0; i < 3; i++) {
|
||||
size[i] = in_shape[in_axis_order[i]];
|
||||
map_index[in_axis_order[i]] = &in_index[i];
|
||||
}
|
||||
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
|
||||
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
|
||||
for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
|
||||
// clang-format off
|
||||
memcpy(out,
|
||||
in + (*map_index[0] * in_shape[1] * in_shape[2] +
|
||||
*map_index[1] * in_shape[2] +
|
||||
*map_index[2]) * elem_size,
|
||||
elem_size);
|
||||
size_t num_elements = shape_size(in_shape);
|
||||
if (num_elements <= get_threshold()) {
|
||||
const auto strides = get_strides(3, elem_size, in_axis_order, in_shape);
|
||||
|
||||
size_t off_0 = 0;
|
||||
for (size_t i = 0; i < out_shape[0]; i++) {
|
||||
size_t off_1 = off_0;
|
||||
for (size_t j = 0; j < out_shape[1]; j++) {
|
||||
size_t in_off = off_1;
|
||||
for (size_t k = 0; k < out_shape[2]; k++) {
|
||||
copy_element(out, in + in_off, elem_size);
|
||||
out += elem_size;
|
||||
// clang-format on
|
||||
in_off += strides[2];
|
||||
}
|
||||
off_1 += strides[1];
|
||||
}
|
||||
off_0 += strides[0];
|
||||
}
|
||||
} else {
|
||||
ov::parallel_for3d(out_shape[0],
|
||||
out_shape[1],
|
||||
out_shape[2],
|
||||
[in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i, size_t j, size_t k) {
|
||||
size_t in_indexes[3];
|
||||
in_indexes[in_axis_order[0]] = i;
|
||||
in_indexes[in_axis_order[1]] = j;
|
||||
in_indexes[in_axis_order[2]] = k;
|
||||
size_t in_off =
|
||||
(in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2];
|
||||
size_t out_off = (i * out_shape[1] + j) * out_shape[2] + k;
|
||||
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@ -102,29 +152,46 @@ void reshape_in4(const char* in,
|
||||
const AxisVector& in_axis_order,
|
||||
const Shape& out_shape,
|
||||
size_t elem_size) {
|
||||
size_t size[4];
|
||||
size_t in_index[4];
|
||||
size_t* map_index[4];
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
size[i] = in_shape[in_axis_order[i]];
|
||||
map_index[in_axis_order[i]] = &in_index[i];
|
||||
}
|
||||
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
|
||||
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
|
||||
for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
|
||||
for (in_index[3] = 0; in_index[3] < size[3]; ++in_index[3]) {
|
||||
// clang-format off
|
||||
memcpy(out,
|
||||
in + (*map_index[0] * in_shape[1] * in_shape[2] * in_shape[3] +
|
||||
*map_index[1] * in_shape[2] * in_shape[3] +
|
||||
*map_index[2] * in_shape[3] +
|
||||
*map_index[3]) * elem_size,
|
||||
elem_size);
|
||||
size_t num_elements = shape_size(in_shape);
|
||||
if (num_elements <= get_threshold()) {
|
||||
const auto strides = get_strides(4, elem_size, in_axis_order, in_shape);
|
||||
|
||||
size_t off_0 = 0;
|
||||
for (size_t i = 0; i < out_shape[0]; i++) {
|
||||
size_t off_1 = off_0;
|
||||
for (size_t j = 0; j < out_shape[1]; j++) {
|
||||
size_t off_2 = off_1;
|
||||
for (size_t k = 0; k < out_shape[2]; k++) {
|
||||
size_t in_off = off_2;
|
||||
for (size_t l = 0; l < out_shape[3]; l++) {
|
||||
copy_element(out, in + in_off, elem_size);
|
||||
out += elem_size;
|
||||
// clang-format on
|
||||
in_off += strides[3];
|
||||
}
|
||||
off_2 += strides[2];
|
||||
}
|
||||
off_1 += strides[1];
|
||||
}
|
||||
off_0 += strides[0];
|
||||
}
|
||||
} else {
|
||||
ov::parallel_for4d(
|
||||
out_shape[0],
|
||||
out_shape[1],
|
||||
out_shape[2],
|
||||
out_shape[3],
|
||||
[in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i, size_t j, size_t k, size_t l) {
|
||||
size_t in_indexes[4];
|
||||
in_indexes[in_axis_order[0]] = i;
|
||||
in_indexes[in_axis_order[1]] = j;
|
||||
in_indexes[in_axis_order[2]] = k;
|
||||
in_indexes[in_axis_order[3]] = l;
|
||||
size_t in_off =
|
||||
((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3] +
|
||||
in_indexes[3];
|
||||
size_t out_off = ((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3] + l;
|
||||
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@ -134,32 +201,58 @@ void reshape_in5(const char* in,
|
||||
const AxisVector& in_axis_order,
|
||||
const Shape& out_shape,
|
||||
size_t elem_size) {
|
||||
size_t size[5];
|
||||
size_t in_index[5];
|
||||
size_t* map_index[5];
|
||||
for (size_t i = 0; i < 5; i++) {
|
||||
size[i] = in_shape[in_axis_order[i]];
|
||||
map_index[in_axis_order[i]] = &in_index[i];
|
||||
}
|
||||
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
|
||||
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
|
||||
for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
|
||||
for (in_index[3] = 0; in_index[3] < size[3]; ++in_index[3]) {
|
||||
for (in_index[4] = 0; in_index[4] < size[4]; ++in_index[4]) {
|
||||
// clang-format off
|
||||
memcpy(out,
|
||||
in + (*map_index[0] * in_shape[1] * in_shape[2] * in_shape[3] * in_shape[4] +
|
||||
*map_index[1] * in_shape[2] * in_shape[3] * in_shape[4] +
|
||||
*map_index[2] * in_shape[3] * in_shape[4] +
|
||||
*map_index[3] * in_shape[4] +
|
||||
*map_index[4]) * elem_size,
|
||||
elem_size);
|
||||
size_t num_elements = shape_size(in_shape);
|
||||
if (num_elements <= get_threshold()) {
|
||||
const auto strides = get_strides(5, elem_size, in_axis_order, in_shape);
|
||||
|
||||
size_t off_0 = 0;
|
||||
for (size_t i = 0; i < out_shape[0]; i++) {
|
||||
size_t off_1 = off_0;
|
||||
for (size_t j = 0; j < out_shape[1]; j++) {
|
||||
size_t off_2 = off_1;
|
||||
for (size_t k = 0; k < out_shape[2]; k++) {
|
||||
size_t off_3 = off_2;
|
||||
for (size_t l = 0; l < out_shape[3]; l++) {
|
||||
size_t in_off = off_3;
|
||||
for (size_t m = 0; m < out_shape[4]; m++) {
|
||||
copy_element(out, in + in_off, elem_size);
|
||||
out += elem_size;
|
||||
// clang-format on
|
||||
in_off += strides[4];
|
||||
}
|
||||
off_3 += strides[3];
|
||||
}
|
||||
off_2 += strides[2];
|
||||
}
|
||||
off_1 += strides[1];
|
||||
}
|
||||
off_0 += strides[0];
|
||||
}
|
||||
} else {
|
||||
ov::parallel_for5d(
|
||||
out_shape[0],
|
||||
out_shape[1],
|
||||
out_shape[2],
|
||||
out_shape[3],
|
||||
out_shape[4],
|
||||
[in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i,
|
||||
size_t j,
|
||||
size_t k,
|
||||
size_t l,
|
||||
size_t m) {
|
||||
size_t in_indexes[5];
|
||||
in_indexes[in_axis_order[0]] = i;
|
||||
in_indexes[in_axis_order[1]] = j;
|
||||
in_indexes[in_axis_order[2]] = k;
|
||||
in_indexes[in_axis_order[3]] = l;
|
||||
in_indexes[in_axis_order[4]] = m;
|
||||
size_t in_off =
|
||||
(((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3] +
|
||||
in_indexes[3]) *
|
||||
in_shape[4] +
|
||||
in_indexes[4];
|
||||
size_t out_off = (((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3] + l) * out_shape[4] + m;
|
||||
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@ -169,43 +262,79 @@ void reshape_in6(const char* in,
|
||||
const AxisVector& in_axis_order,
|
||||
const Shape& out_shape,
|
||||
size_t elem_size) {
|
||||
size_t size[6];
|
||||
size_t in_index[6];
|
||||
size_t* map_index[6];
|
||||
for (size_t i = 0; i < 6; i++) {
|
||||
size[i] = in_shape[in_axis_order[i]];
|
||||
map_index[in_axis_order[i]] = &in_index[i];
|
||||
}
|
||||
for (in_index[0] = 0; in_index[0] < size[0]; ++in_index[0]) {
|
||||
for (in_index[1] = 0; in_index[1] < size[1]; ++in_index[1]) {
|
||||
for (in_index[2] = 0; in_index[2] < size[2]; ++in_index[2]) {
|
||||
for (in_index[3] = 0; in_index[3] < size[3]; ++in_index[3]) {
|
||||
for (in_index[4] = 0; in_index[4] < size[4]; ++in_index[4]) {
|
||||
for (in_index[5] = 0; in_index[5] < size[5]; ++in_index[5]) {
|
||||
// clang-format off
|
||||
memcpy(out,
|
||||
in + (*map_index[0] * in_shape[1] * in_shape[2] * in_shape[3] * in_shape[4] * in_shape[5] +
|
||||
*map_index[1] * in_shape[2] * in_shape[3] * in_shape[4] * in_shape[5] +
|
||||
*map_index[2] * in_shape[3] * in_shape[4] * in_shape[5] +
|
||||
*map_index[3] * in_shape[4] * in_shape[5] +
|
||||
*map_index[4] * in_shape[5] +
|
||||
*map_index[5]) * elem_size,
|
||||
elem_size);
|
||||
size_t num_elements = shape_size(in_shape);
|
||||
if (num_elements <= get_threshold()) {
|
||||
const auto strides = get_strides(6, elem_size, in_axis_order, in_shape);
|
||||
|
||||
size_t off_0 = 0;
|
||||
for (size_t i = 0; i < out_shape[0]; i++) {
|
||||
size_t off_1 = off_0;
|
||||
for (size_t j = 0; j < out_shape[1]; j++) {
|
||||
size_t off_2 = off_1;
|
||||
for (size_t k = 0; k < out_shape[2]; k++) {
|
||||
size_t off_3 = off_2;
|
||||
for (size_t l = 0; l < out_shape[3]; l++) {
|
||||
size_t off_4 = off_3;
|
||||
for (size_t m = 0; m < out_shape[4]; m++) {
|
||||
size_t in_off = off_4;
|
||||
for (size_t n = 0; n < out_shape[5]; n++) {
|
||||
copy_element(out, in + in_off, elem_size);
|
||||
out += elem_size;
|
||||
// clang-format on
|
||||
in_off += strides[5];
|
||||
}
|
||||
off_4 += strides[4];
|
||||
}
|
||||
off_3 += strides[3];
|
||||
}
|
||||
off_2 += strides[2];
|
||||
}
|
||||
off_1 += strides[1];
|
||||
}
|
||||
off_0 += strides[0];
|
||||
}
|
||||
} else {
|
||||
ov::parallel_for6d(
|
||||
out_shape[0],
|
||||
out_shape[1],
|
||||
out_shape[2],
|
||||
out_shape[3],
|
||||
out_shape[4],
|
||||
out_shape[5],
|
||||
[in, out, in_axis_order, &in_shape, &out_shape, elem_size](size_t i,
|
||||
size_t j,
|
||||
size_t k,
|
||||
size_t l,
|
||||
size_t m,
|
||||
size_t n) {
|
||||
size_t in_indexes[6];
|
||||
in_indexes[in_axis_order[0]] = i;
|
||||
in_indexes[in_axis_order[1]] = j;
|
||||
in_indexes[in_axis_order[2]] = k;
|
||||
in_indexes[in_axis_order[3]] = l;
|
||||
in_indexes[in_axis_order[4]] = m;
|
||||
in_indexes[in_axis_order[5]] = n;
|
||||
size_t in_off =
|
||||
((((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3] +
|
||||
in_indexes[3]) *
|
||||
in_shape[4] +
|
||||
in_indexes[4]) *
|
||||
in_shape[5] +
|
||||
in_indexes[5];
|
||||
size_t out_off = ((((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3] + l) * out_shape[4] + m) *
|
||||
out_shape[5] +
|
||||
n;
|
||||
copy_element(out + out_off * elem_size, in + in_off * elem_size, elem_size);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
bool no_axis_reordering(const AxisVector& axis_order) {
|
||||
auto tmp = axis_order;
|
||||
std::sort(begin(tmp), end(tmp));
|
||||
tmp.erase(std::unique(begin(tmp), end(tmp)), end(tmp));
|
||||
return tmp == axis_order;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
void runtime::opt_kernel::reshape(const char* in,
|
||||
char* out,
|
||||
|
@ -45,11 +45,16 @@ TEST_P(ReshapeOptKernel, reshape_opt_kernel) {
|
||||
const AxisVector axis_order = get_axis_order(p.order, p.input.get_shape().size());
|
||||
std::vector<ElementValue> output_buff(p.input.get_vector().size());
|
||||
|
||||
const auto& in_shape = p.input.get_shape();
|
||||
Shape out_shape(in_shape.size());
|
||||
for (size_t i = 0; i < out_shape.size(); i++)
|
||||
out_shape[i] = in_shape[axis_order[i]];
|
||||
|
||||
runtime::opt_kernel::reshape((const char*)p.input.data(),
|
||||
(char*)output_buff.data(),
|
||||
p.input.get_shape(),
|
||||
in_shape,
|
||||
axis_order,
|
||||
p.output.get_shape(),
|
||||
out_shape,
|
||||
sizeof(ElementValue));
|
||||
EXPECT_EQ(p.output.get_vector(), output_buff);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user