From 068229c8151338f448c29f00ccbc82ce5b42e006 Mon Sep 17 00:00:00 2001 From: Anna Khakimova Date: Mon, 19 Apr 2021 21:11:58 +0300 Subject: [PATCH] Improve performance of the Resize 3c/3p and Resize 8UC1 (#4945) * scratch buffer * Refactoring horizontal path * * Refactoring horizontal pass. Step2 * * Refactoring horizontal pass. Step 3 * * Refactoring vertical pass. Step2 * Refactoring horizontal pass. Step4 * * Clean * Applied comments. * * Applied comments. Part 2 --- .../ie_preprocess_gapi_kernels_neon.cpp | 232 ++++++++---------- .../ie_preprocess_gapi_kernels.cpp | 6 +- .../ie_preprocess_gapi_kernels_simd_impl.hpp | 145 +++++------ .../thirdparty/ocv/opencv_hal_neon.hpp | 18 ++ 4 files changed, 196 insertions(+), 205 deletions(-) diff --git a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp index 493ed365e45..779db927c32 100644 --- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp +++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp @@ -228,20 +228,90 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(std::array, chan } } +CV_ALWAYS_INLINE void vertical_4LPI(const uint8_t* src0[], const uint8_t* src1[], + uchar tmp[], const short beta[], const int length) { + constexpr int nlanes = static_cast(v_uint8::nlanes); + constexpr int half_nlanes = nlanes / 2; + GAPI_Assert(length >= half_nlanes); + + v_int16 b0 = vx_setall_s16(beta[0]); + v_int16 b1 = vx_setall_s16(beta[1]); + v_int16 b2 = vx_setall_s16(beta[2]); + v_int16 b3 = vx_setall_s16(beta[3]); + + v_int16 lo1, hi1, lo2, hi2; + v_int32 res1_s32, res2_s32; + int w = 0; + for (;;) { + for (; w <= length - half_nlanes; w += half_nlanes) { + v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w])); + v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w])); + v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w])); + v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w])); + + v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w])); + v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w])); + v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w])); + v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w])); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_interleave(r0, r1, lo1, hi1); + v_interleave(r2, r3, lo2, hi2); + + v_int32 lo1_s32 = v_reinterpret_as_s32(lo1); + v_int32 hi1_s32 = v_reinterpret_as_s32(hi1); + v_int32 lo2_s32 = v_reinterpret_as_s32(lo2); + v_int32 hi2_s32 = v_reinterpret_as_s32(hi2); + + v_interleave(lo1_s32, lo2_s32, res1_s32, res2_s32); + + v_int16 res1 = v_reinterpret_as_s16(res1_s32); + v_int16 res2 = v_reinterpret_as_s16(res2_s32); + + v_pack_u_store(&tmp[4 * w + 0], res1); + v_pack_u_store(&tmp[4 * w + half_nlanes], res2); + + v_interleave(hi1_s32, hi2_s32, res1_s32, res2_s32); + + v_int16 res3 = v_reinterpret_as_s16(res1_s32); + v_int16 res4 = v_reinterpret_as_s16(res2_s32); + + v_pack_u_store(&tmp[4 * w + 2*half_nlanes], res3); + v_pack_u_store(&tmp[4 * w + 3*half_nlanes], res4); + } + + if (w < length) { + w = length - 
half_nlanes; + continue; + } + break; + } +} + template CV_ALWAYS_INLINE void horizontal_4LPI(std::array, chanNum>& dst, - const uchar* tmp, const short mapsx[], - const short clone[], const int length) { + const uchar* tmp, const short mapsx[], const uchar _mask_horizontal[], + const short clone[], + const int length) { constexpr int nlanes = static_cast(v_uint8::nlanes); - const int half_nlanes = nlanes / 2; + constexpr int half_nlanes = nlanes / 2; GAPI_Assert(length >= half_nlanes); const int shift = static_cast(half_nlanes / 4); - uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15 }; v_uint8 hmask = vx_load(_mask_horizontal); v_uint8 val_0, val_1, val_2, val_3; + int x = 0; for (;;) { for (; x <= length - half_nlanes && x >= 0; x += half_nlanes) { @@ -315,71 +385,19 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array static_assert(v_uint8::nlanes == 16, "The wide of NEON vector is 128 bits, so one vector contains 16 uchars"); constexpr int nlanes = static_cast(v_uint8::nlanes); - constexpr int half_nlanes = nlanes / 2; bool xRatioEq = inSz.width == outSz.width; bool yRatioEq = inSz.height == outSz.height; if (!xRatioEq && !yRatioEq) { + uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15 }; if (4 == lpi) { // vertical pass - int inLength = inSz.width * chanNum; - GAPI_Assert(inLength >= half_nlanes); - - v_int16 b0 = vx_setall_s16(beta[0]); - v_int16 b1 = vx_setall_s16(beta[1]); - v_int16 b2 = vx_setall_s16(beta[2]); - v_int16 b3 = vx_setall_s16(beta[3]); - - uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15 }; - v_uint8 vmask = vx_load(_mask_vertical); - - int w = 0; - for (;;) { - for (; w <= inLength - half_nlanes && w >= 0; w += half_nlanes) { - v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w])); - v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w])); - v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w])); - v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w])); - - v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w])); - v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w])); - v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w])); - v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w])); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_pack_u(r0, r1); - v_uint8 q1 = v_pack_u(r2, r3); - - v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1)); - v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1); - - v_uint8 q4 = v_shuffle(q2, vmask); - v_uint8 q5 = v_shuffle(q3, vmask); - - vx_store(&tmp[4 * w + 0], q4); - vx_store(&tmp[4 * w + 2 * half_nlanes], q5); - } - - if (w < inLength) { - w = inLength - half_nlanes; - continue; - } - break; - } + vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum); // horizontal pass - horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width); + horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width); } else { // if any lpi int inLength = inSz.width * chanNum; @@ -397,6 +415,8 @@ CV_ALWAYS_INLINE void 
calcRowLinear_8UC_Impl_(std::array } } else if (!xRatioEq) { GAPI_DbgAssert(yRatioEq); + uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15 }; if (4 == lpi) { int inLength = inSz.width * chanNum; @@ -422,7 +442,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array } // horizontal pass - horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width); + horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width); } else { // any LPI for (int l = 0; l < lpi; ++l) { const uchar* src = src0[l]; @@ -469,9 +489,8 @@ void calcRowLinear_8U(C3, std::array, 3>& dst, const Size& inSz, const Size& outSz, const int lpi) { - constexpr int chanNum = 3; - calcRowLinear_8UC_Impl_(dst, src0, src1, alpha, clone, mapsx, - beta, tmp, inSz, outSz, lpi); + calcRowLinear_8UC_Impl_<3>(dst, src0, src1, alpha, clone, mapsx, + beta, tmp, inSz, outSz, lpi); } // Resize (bi-linear, 8UC4) @@ -486,20 +505,18 @@ void calcRowLinear_8U(C4, std::array, 4>& dst, const Size& inSz, const Size& outSz, const int lpi) { - constexpr int chanNum = 4; - calcRowLinear_8UC_Impl_(dst, src0, src1, alpha, clone, mapsx, - beta, tmp, inSz, outSz, lpi); + calcRowLinear_8UC_Impl_<4>(dst, src0, src1, alpha, clone, mapsx, + beta, tmp, inSz, outSz, lpi); } CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[], const uchar* tmp, const short mapsx[], + const uchar _mask_horizontal[], const short clone[], const int length) { constexpr int nlanes = static_cast(v_uint8::nlanes); - const int half_nlanes = nlanes / 2; + constexpr int half_nlanes = nlanes / 2; GAPI_Assert(length >= half_nlanes); - uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, - 1, 5, 9, 13, 3, 7, 11, 15 }; v_uint8 hmask = vx_load(_mask_horizontal); int x = 0; for (;;) { @@ -557,12 +574,11 @@ CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[], } } -CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[], +CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst, const uchar* src, const short mapsx[], - const short alpha[], const int length, - const int line) { + const short alpha[], const int length) { constexpr int nlanes = static_cast(v_uint8::nlanes); - const int half_nlanes = nlanes / 2; + constexpr int half_nlanes = nlanes / 2; GAPI_Assert(length >= half_nlanes); v_int16 t0, t1; int x = 0; @@ -573,7 +589,7 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[], v_deinterleave_expand(t, t0, t1); v_int16 d = v_mulhrs(t0 - t1, a0) + t1; - v_pack_u_store(&dst[line][x], d); + v_pack_u_store(&dst[x], d); } if (x < length) { @@ -608,79 +624,34 @@ void calcRowLinear_8UC1(uint8_t* dst[], if (!xRatioEq && !yRatioEq) { GAPI_Assert(inSz.width >= half_nlanes); + uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15 }; if (4 == lpi) { // vertical pass - v_int16 b0 = vx_setall_s16(beta[0]); - v_int16 b1 = vx_setall_s16(beta[1]); - v_int16 b2 = vx_setall_s16(beta[2]); - v_int16 b3 = vx_setall_s16(beta[3]); - - uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15 }; - v_uint8 vmask = vx_load(_mask_vertical); - - int w = 0; - for (;;) { - for (; w <= inSz.width - half_nlanes; w += half_nlanes) { - v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w])); - v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w])); - v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w])); - v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w])); - - v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w])); - v_int16 val1_1 = 
v_reinterpret_as_s16(vx_load_expand(&src1[1][w])); - v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w])); - v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w])); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_pack_u(r0, r1); - v_uint8 q1 = v_pack_u(r2, r3); - - v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1)); - v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1); - - v_uint8 q4 = v_shuffle(q2, vmask); - v_uint8 q5 = v_shuffle(q3, vmask); - - vx_store(&tmp[4 * w + 0], q4); - vx_store(&tmp[4 * w + 2 * half_nlanes], q5); - } - - if (w < inSz.width) { - w = inSz.width - half_nlanes; - continue; - } - break; - } + vertical_4LPI(src0, src1, tmp, beta, inSz.width); // horizontal pass - horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width); + horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width); } else { // if any lpi for (int l = 0; l < lpi; ++l) { short beta0 = beta[l]; const uchar* s0 = src0[l]; const uchar* s1 = src1[l]; + uchar* _dst = dst[l]; // vertical pass vertical_anyLPI(s0, s1, tmp, inSz.width, beta0); // horizontal pass - horizontal_anyLPI(dst, tmp, mapsx, alpha, outSz.width, l); + horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width); } } // if lpi == 4 } else if (!xRatioEq) { GAPI_DbgAssert(yRatioEq); GAPI_Assert(inSz.width >= nlanes); + uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15 }; if (4 == lpi) { // vertical pass @@ -702,14 +673,15 @@ void calcRowLinear_8UC1(uint8_t* dst[], } // horizontal pass - horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width); + horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width); } else { // any LPI GAPI_Assert(outSz.width >= half_nlanes); for (int l = 0; l < lpi; ++l) { const uchar* src = src0[l]; + uchar* _dst = dst[l]; // horizontal pass - horizontal_anyLPI(dst, src, mapsx, alpha, outSz.width, l); + horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width); } } diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index 0e49f4116ec..2b4c53f4d9a 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -895,7 +895,7 @@ struct linearScratchDesc { tmp = reinterpret_cast (mapsy + outH*2); } - static int bufSize(int inW, int inH, int outW, int outH, int lpi) { + static int bufSize(int inW, int /*inH*/, int outW, int outH, int lpi) { auto size = outW * sizeof(alpha_t) + outW * sizeof(alpha_t) * 4 + // alpha clones // previous alpha is redundant? 
outW * sizeof(index_t) + @@ -910,7 +910,7 @@ struct linearScratchDesc { template static void initScratchLinear(const cv::GMatDesc& in, const Size& outSz, - cv::gapi::fluid::Buffer& scratch, + cv::gapi::fluid::Buffer& scratch, int lpi) { using alpha_type = typename Mapper::alpha_type; static const auto unity = Mapper::unity; @@ -1171,7 +1171,7 @@ static void calcRowLinear(const cv::gapi::fluid::View & in, template static void calcRowLinearC(const cv::gapi::fluid::View & in, std::array, numChan>& out, - cv::gapi::fluid::Buffer& scratch) { + cv::gapi::fluid::Buffer& scratch) { using alpha_type = typename Mapper::alpha_type; auto inSz = in.meta().size; diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp index 3a68bd4a980..a59111b86b6 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp @@ -18,12 +18,12 @@ namespace gapi { namespace kernels { -inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[], - uint8_t out[], int length) { +CV_ALWAYS_INLINE void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[], + uint8_t out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -46,12 +46,12 @@ inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[], } } -inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[], - const uint8_t in2[], uint8_t out[], int length) { +CV_ALWAYS_INLINE void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[], + const uint8_t in2[], uint8_t out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -75,12 +75,13 @@ inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[], } } -inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[], - const uint8_t in3[], uint8_t out[], int length) { +CV_ALWAYS_INLINE void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], + const uint8_t in2[], const uint8_t in3[], + uint8_t out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -106,12 +107,12 @@ inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const u } } -inline void mergeRow_32FC2_Impl(const float in0[], const float in1[], - float out[], int length) { +CV_ALWAYS_INLINE void mergeRow_32FC2_Impl(const float in0[], const float in1[], + float out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -133,12 +134,12 @@ inline void mergeRow_32FC2_Impl(const float in0[], const float in1[], } } -inline void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[], - float out[], int length) { +CV_ALWAYS_INLINE void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[], + float out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -162,13 +163,13 @@ inline 
void mergeRow_32FC3_Impl(const float in0[], const float in1[], const floa } } -inline void mergeRow_32FC4_Impl(const float in0[], const float in1[], - const float in2[], const float in3[], - float out[], int length) { +CV_ALWAYS_INLINE void mergeRow_32FC4_Impl(const float in0[], const float in1[], + const float in2[], const float in3[], + float out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -196,12 +197,12 @@ inline void mergeRow_32FC4_Impl(const float in0[], const float in1[], //------------------------------------------------------------------------------ -inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[], - uint8_t out1[], int length) { +CV_ALWAYS_INLINE void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[], + uint8_t out1[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -223,12 +224,12 @@ inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[], } } -inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[], - uint8_t out1[], uint8_t out2[], int length) { +CV_ALWAYS_INLINE void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[], + uint8_t out1[], uint8_t out2[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -252,12 +253,12 @@ inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[], } } -inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[], - uint8_t out2[], uint8_t out3[], int length) { +CV_ALWAYS_INLINE void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[], + uint8_t out2[], uint8_t out3[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -283,12 +284,12 @@ inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[ } } -inline void splitRow_32FC2_Impl(const float in[], float out0[], +CV_ALWAYS_INLINE void splitRow_32FC2_Impl(const float in[], float out0[], float out1[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -311,12 +312,12 @@ inline void splitRow_32FC2_Impl(const float in[], float out0[], } } -inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[], - float out2[], int length) { +CV_ALWAYS_INLINE void splitRow_32FC3_Impl(const float in[], float out0[], float out1[], + float out2[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -340,12 +341,12 @@ inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[], } } -inline void splitRow_32FC4_Impl(const float in[], float out0[], float out1[], - float out2[], float out3[], int length) { +CV_ALWAYS_INLINE void splitRow_32FC4_Impl(const float in[], float out0[], float out1[], + float out2[], float out3[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -380,7 +381,7 @@ static 
const int ITUR_BT_601_CVG = -852492; static const int ITUR_BT_601_CVR = 1673527; static const int ITUR_BT_601_SHIFT = 20; -static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) { +CV_ALWAYS_INLINE void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) { int uu, vv; uu = static_cast(u) - 128; vv = static_cast(v) - 128; @@ -390,9 +391,9 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; } -static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, - v_int32 (&ruv)[4], v_int32 (&guv)[4], - v_int32 (&buv)[4]) { +CV_ALWAYS_INLINE void uvToRGBuv(const v_uint8& u, const v_uint8& v, + v_int32 (&ruv)[4], v_int32 (&guv)[4], + v_int32 (&buv)[4]) { v_uint8 v128 = vx_setall_u8(128); v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128)); v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128)); @@ -417,8 +418,8 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, } } -static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, const int buv, - uchar& r, uchar& g, uchar& b) { +CV_ALWAYS_INLINE void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, + const int buv, uchar& r, uchar& g, uchar& b) { int yy = static_cast(vy); int y = std::max(0, yy - 16) * ITUR_BT_601_CY; r = saturate_cast((y + ruv) >> ITUR_BT_601_SHIFT); @@ -426,11 +427,11 @@ static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, con b = saturate_cast((y + buv) >> ITUR_BT_601_SHIFT); } -static inline void yRGBuvToRGB(const v_uint8& vy, - const v_int32 (&ruv)[4], - const v_int32 (&guv)[4], - const v_int32 (&buv)[4], - v_uint8& rr, v_uint8& gg, v_uint8& bb) { +CV_ALWAYS_INLINE void yRGBuvToRGB(const v_uint8& vy, + const v_int32 (&ruv)[4], + const v_int32 (&guv)[4], + const v_int32 (&buv)[4], + v_uint8& rr, v_uint8& gg, v_uint8& bb) { v_uint8 v16 = vx_setall_u8(16); v_uint8 posY = vy - v16; v_uint16 yy0, yy1; @@ -463,15 +464,14 @@ static inline void yRGBuvToRGB(const v_uint8& vy, bb = v_pack_u(b0, b1); } -inline void calculate_nv12_to_rgb_impl(const uchar **srcY, - const uchar *srcUV, - uchar **dstRGBx, - int width) { +CV_ALWAYS_INLINE void calculate_nv12_to_rgb_impl(const uchar **srcY, + const uchar *srcUV, + uchar **dstRGBx, + int width) { int i = 0; #if MANUAL_SIMD - - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; for ( ; i <= width - 2*nlanes; i += 2*nlanes) { v_uint8 u, v; @@ -535,14 +535,13 @@ inline void calculate_nv12_to_rgb_impl(const uchar **srcY, } } -inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU, - const uchar *srcV, uchar **dstRGBx, - int width) { +CV_ALWAYS_INLINE void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU, + const uchar *srcV, uchar **dstRGBx, + int width) { int i = 0; #if MANUAL_SIMD - - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; for ( ; i <= width - 2*nlanes; i += 2*nlanes) { v_uint8 u = vx_load(srcU + i/2); @@ -610,8 +609,8 @@ inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU, // vertical pass template -static inline void downy(const T *src[], int inWidth, const MapperUnit& ymap, A yalpha, - W vbuf[]) { +CV_ALWAYS_INLINE void downy(const T *src[], int inWidth, const MapperUnit& ymap, + A yalpha, W vbuf[]) { int y_1st = ymap.index0; int ylast = ymap.index1 - 1; @@ -619,7 +618,7 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit& ym 
GAPI_DbgAssert(y_1st < ylast); #if MANUAL_SIMD - const int nlanes = v_uint16::nlanes; + constexpr int nlanes = v_uint16::nlanes; #endif // 1st and last rows @@ -667,8 +666,8 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit& ym // horizontal pass template -static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], - const A xalpha[], const W vbuf[]) { +CV_ALWAYS_INLINE void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], + const A xalpha[], const W vbuf[]) { // TO DO: try lambda here #define HSUM(xmaxdf) \ for (int x = 0; x < outWidth; x++) { \ @@ -704,9 +703,11 @@ static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], } template -static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Size& outSz, - A yalpha, const MapperUnit& ymap, int xmaxdf, const I xindex[], const A xalpha[], - W vbuf[]) { +CV_ALWAYS_INLINE void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, + const Size& outSz, A yalpha, + const MapperUnit& ymap, int xmaxdf, + const I xindex[], const A xalpha[], + W vbuf[]) { bool xRatioEq1 = inSz.width == outSz.width; bool yRatioEq1 = inSz.height == outSz.height; @@ -738,18 +739,18 @@ static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Si #if MANUAL_SIMD template -void copyRow_impl(const T in[], T out[], int l) { +CV_ALWAYS_INLINE void copyRow_impl(const T in[], T out[], int l) { VecT r; r = vx_load(&in[l]); vx_store(&out[l], r); } #endif -inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) { +CV_ALWAYS_INLINE void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; for (; l <= length - nlanes; l += nlanes) { copyRow_impl(in, out, l); @@ -766,11 +767,11 @@ inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) { } } -inline void copyRow_32F_impl(const float in[], float out[], int length) { +CV_ALWAYS_INLINE void copyRow_32F_impl(const float in[], float out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; for (; l <= length - nlanes; l += nlanes) { copyRow_impl(in, out, l); @@ -801,7 +802,7 @@ CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[], bool yRatioEq1 = inSz.height == outSz.height; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; #endif if (!xRatioEq1 && !yRatioEq1) { diff --git a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp index 9c005626572..83d561d2115 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp @@ -2606,6 +2606,24 @@ CV_ALWAYS_INLINE v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8 return v_uint8x16(v); } +CV_ALWAYS_INLINE void v_interleave(const v_int16x8& a, const v_int16x8& b, + v_int16x8& v1, v_int16x8& v2) +{ + int16x8x2_t p = vzipq_s16(a.val, b.val); + v1.val = p.val[0]; + v2.val = p.val[1]; + return; +} + +CV_ALWAYS_INLINE void v_interleave(const v_int32x4& a, const v_int32x4& b, + v_int32x4& v1, v_int32x4& v2) +{ + int32x4x2_t p = vzipq_s32(a.val, b.val); + v1.val = p.val[0]; + v2.val = p.val[1]; + return; +} + template CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a) {
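
Reviewer note on the vertical pass: vertical_4LPI interpolates each output row between its two source rows with Q15 fixed-point weights (beta), handling four lines per iteration and storing the four per-line results interleaved in tmp. The sketch below is a minimal scalar model of that computation, not the shipped implementation; it assumes v_mulhrs follows the usual Q15 rounding-multiply semantics, i.e. (a * b + 2^14) >> 15, and that the final unsigned saturation comes from v_pack_u_store.

#include <algorithm>
#include <cstdint>

// Scalar model of one Q15 blend: dst = src1 + mulhrs(src0 - src1, beta).
static inline uint8_t blend_q15(uint8_t s0, uint8_t s1, short beta) {
    int t = ((int(s0) - int(s1)) * int(beta) + (1 << 14)) >> 15;  // v_mulhrs
    int r = int(s1) + t;                                          // v_add_wrap
    return (uint8_t)std::min(255, std::max(0, r));                // v_pack_u_store
}

// Scalar model of vertical_4LPI: the four interpolated lines are stored
// interleaved per column, i.e. tmp[4 * w + l] is line l at column w.
static void vertical_4LPI_ref(const uint8_t* src0[4], const uint8_t* src1[4],
                              uint8_t tmp[], const short beta[4], int length) {
    for (int w = 0; w < length; ++w)
        for (int l = 0; l < 4; ++l)
            tmp[4 * w + l] = blend_q15(src0[l][w], src1[l][w], beta[l]);
}

The interleaved layout is what lets horizontal_4LPI fetch all four lines of a given source column with one contiguous load; producing it via v_interleave is what replaces the old v_blend/v_shift/v_shuffle sequence removed by this patch.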
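
The horizontal passes perform the mirror-image blend along x: each output pixel mixes two source pixels selected through mapsx, weighted by the Q15 alpha coefficients. A scalar sketch under the same Q15 assumption follows; it additionally assumes that the deinterleaved pair t0/t1 in horizontal_anyLPI corresponds to the pixels at mapsx[x] and mapsx[x] + 1, which is how the visible arithmetic reads but is not fully shown in the hunk.

#include <algorithm>
#include <cstdint>

// Scalar model of the single-plane horizontal pass (any LPI):
// dst[x] = src[sx + 1] + mulhrs(src[sx] - src[sx + 1], alpha[x]), sx = mapsx[x].
static void horizontal_anyLPI_ref(uint8_t* dst, const uint8_t* src,
                                  const short mapsx[], const short alpha[],
                                  int length) {
    for (int x = 0; x < length; ++x) {
        const int sx = mapsx[x];
        const int t0 = src[sx];
        const int t1 = src[sx + 1];
        const int d  = t1 + (((t0 - t1) * int(alpha[x]) + (1 << 14)) >> 15);
        dst[x] = (uint8_t)std::min(255, std::max(0, d));
    }
}

Passing a single row pointer (dst[l] at the call site) instead of the whole dst[] array plus a line index is exactly the signature change this patch makes to horizontal_anyLPI.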
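
The two v_interleave overloads added to opencv_hal_neon.hpp are thin wrappers over NEON's vzipq. The stand-alone snippet below illustrates the lane order they produce; it uses raw NEON intrinsics rather than the universal-intrinsic wrapper types, since those are internal to the HAL header.

#include <arm_neon.h>
#include <cstdio>

int main() {
    const int16_t a_data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    const int16_t b_data[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    const int16x8_t a = vld1q_s16(a_data);
    const int16x8_t b = vld1q_s16(b_data);

    // vzipq_s16 is what v_interleave(v_int16x8, ...) forwards to:
    //   lo = {a0, b0, a1, b1, a2, b2, a3, b3}
    //   hi = {a4, b4, a5, b5, a6, b6, a7, b7}
    const int16x8x2_t zipped = vzipq_s16(a, b);

    int16_t lo[8], hi[8];
    vst1q_s16(lo, zipped.val[0]);
    vst1q_s16(hi, zipped.val[1]);
    for (int i = 0; i < 8; ++i)
        printf("lo[%d]=%d  hi[%d]=%d\n", i, lo[i], i, hi[i]);
    return 0;
}

vertical_4LPI chains the 16-bit and 32-bit variants of this zip to turn four row vectors into the tmp[4 * w + l] layout described above.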