Pre-processing: SIMD optimization of the Resize 8UC1 (#4804)

* * Pre-processing: SIMD optimization of the Resize 8UC1 * * Refactored and added new universal intrinsics. * fix for ARM32 issue * * Refactoring.
2021-03-23 17:29:23 +03:00 · 2021-03-23 17:29:23 +03:00 · 87000ed1ca
commit 87000ed1ca
parent 5caa706334
4 changed files with 391 additions and 11 deletions
--- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
+++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
@ -299,6 +299,8 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
                                              const Size&    inSz,
                                              const Size&    outSz,
                                              const int      lpi) {
+    static_assert(v_uint8::nlanes == 16,
+                  "The wide of NEON vector is 128 bits, so one vector contains 16 uchars");
    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
    constexpr int half_nlanes = nlanes / 2;

@ -475,6 +477,251 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
    calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
                                     beta, tmp, inSz, outSz, lpi);
 }
+
+CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
+                                      const uchar* tmp, const short mapsx[],
+                                      const short clone[], const int length) {
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
+    const int half_nlanes = nlanes / 2;
+    GAPI_Assert(length >= half_nlanes);
+
+    uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
+                                      1, 5, 9, 13, 3, 7, 11, 15 };
+    v_uint8 hmask = vx_load(_mask_horizontal);
+    int x = 0;
+    for (;;) {
+        for (; x <= length - half_nlanes; x += half_nlanes) {
+            v_int16 a10 = vx_load(&clone[4 * x]);
+            v_int16 a32 = vx_load(&clone[4 * (x + 2)]);
+            v_int16 a54 = vx_load(&clone[4 * (x + 4)]);
+            v_int16 a76 = vx_load(&clone[4 * (x + 6)]);
+
+            v_uint8 val_0 = v_gather_lines(tmp, &mapsx[x]);
+            v_uint8 val_1 = v_gather_lines(tmp, &mapsx[x + 2]);
+            v_uint8 val_2 = v_gather_lines(tmp, &mapsx[x + 4]);
+            v_uint8 val_3 = v_gather_lines(tmp, &mapsx[x + 6]);
+
+            v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
+            v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
+            v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
+            v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
+
+            v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
+            v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
+            v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
+            v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
+
+            v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
+            v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
+            v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
+            v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
+
+            v_int16 r0 = v_add_wrap(val1_0, t0);
+            v_int16 r1 = v_add_wrap(val1_1, t1);
+            v_int16 r2 = v_add_wrap(val1_2, t2);
+            v_int16 r3 = v_add_wrap(val1_3, t3);
+
+            v_uint8 q0 = v_pack_u(r0, r1);
+            v_uint8 q1 = v_pack_u(r2, r3);
+
+            v_uint8 q2 = v_shuffle(q0, hmask);
+            v_uint8 q3 = v_shuffle(q1, hmask);
+
+            v_uint8 q4 = v_blend<0xCC /*0b11001100*/>(q2, v_shift_left<4>(q3));
+            v_uint8 q5 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q2), q3);
+
+            v_store_low(&dst[0][x],  q4);
+            v_store_high(&dst[1][x], q4);
+            v_store_low(&dst[2][x],  q5);
+            v_store_high(&dst[3][x], q5);
+        }
+
+        if (x < length) {
+            x = length - half_nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[],
+                                        const uchar* src, const short mapsx[],
+                                        const short alpha[], const int length,
+                                        const int line) {
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
+    const int half_nlanes = nlanes / 2;
+    GAPI_Assert(length >= half_nlanes);
+    v_int16 t0, t1;
+    int x = 0;
+    for (;;) {
+        for (; x <= length - half_nlanes; x += half_nlanes) {
+            v_int16 a0 = vx_load(&alpha[x]);
+            v_uint8 t = v_gather_pairs(src, &mapsx[x]);
+
+            v_deinterleave_expand(t, t0, t1);
+            v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
+            v_pack_u_store(&dst[line][x], d);
+        }
+
+        if (x < length) {
+            x = length - half_nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+// 8UC1 Resize (bi-linear)
+void calcRowLinear_8UC1(uint8_t* dst[],
+                        const uint8_t* src0[],
+                        const uint8_t* src1[],
+                        const short    alpha[],
+                        const short    clone[],  // 4 clones of alpha
+                        const short    mapsx[],
+                        const short    beta[],
+                            uint8_t    tmp[],
+                        const Size&    inSz,
+                        const Size&    outSz,
+                        const int      lpi) {
+    static_assert(v_uint8::nlanes == 16,
+                  "The wide of NEON vector is 128 bits, so one vector contains 16 uchars");
+
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
+    constexpr int half_nlanes = nlanes / 2;
+
+    bool xRatioEq = inSz.width == outSz.width;
+    bool yRatioEq = inSz.height == outSz.height;
+
+    if (!xRatioEq && !yRatioEq) {
+        GAPI_Assert(inSz.width >= half_nlanes);
+
+        if (4 == lpi) {
+            // vertical pass
+            v_int16 b0 = vx_setall_s16(beta[0]);
+            v_int16 b1 = vx_setall_s16(beta[1]);
+            v_int16 b2 = vx_setall_s16(beta[2]);
+            v_int16 b3 = vx_setall_s16(beta[3]);
+
+            uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
+                                            2, 10, 6, 14, 3, 11, 7, 15 };
+            v_uint8 vmask = vx_load(_mask_vertical);
+
+            int w = 0;
+            for (;;) {
+                for (; w <= inSz.width - half_nlanes; w += half_nlanes) {
+                    v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
+                    v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
+                    v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
+                    v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
+
+                    v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
+                    v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
+                    v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
+                    v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
+
+                    v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+                    v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+                    v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+                    v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+                    v_int16 r0 = v_add_wrap(val1_0, t0);
+                    v_int16 r1 = v_add_wrap(val1_1, t1);
+                    v_int16 r2 = v_add_wrap(val1_2, t2);
+                    v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                    v_uint8 q0 = v_pack_u(r0, r1);
+                    v_uint8 q1 = v_pack_u(r2, r3);
+
+                    v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1));
+                    v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1);
+
+                    v_uint8 q4 = v_shuffle(q2, vmask);
+                    v_uint8 q5 = v_shuffle(q3, vmask);
+
+                    vx_store(&tmp[4 * w + 0], q4);
+                    vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
+                }
+
+                if (w < inSz.width) {
+                    w = inSz.width - half_nlanes;
+                    continue;
+                }
+                break;
+            }
+
+            // horizontal pass
+             horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
+        } else {  // if any lpi
+            for (int l = 0; l < lpi; ++l) {
+                short beta0 = beta[l];
+                const uchar* s0 = src0[l];
+                const uchar* s1 = src1[l];
+
+                // vertical pass
+                vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);
+
+                // horizontal pass
+                horizontal_anyLPI(dst, tmp, mapsx, alpha, outSz.width, l);
+            }
+        }  // if lpi == 4
+
+    } else if (!xRatioEq) {
+        GAPI_DbgAssert(yRatioEq);
+        GAPI_Assert(inSz.width >= nlanes);
+
+        if (4 == lpi) {
+            // vertical pass
+            int w = 0;
+            for (;;) {
+                for (; w <= inSz.width - nlanes; w += nlanes) {
+                    v_uint8 s0 = vx_load(&src0[0][w]);
+                    v_uint8 s1 = vx_load(&src0[1][w]);
+                    v_uint8 s2 = vx_load(&src0[2][w]);
+                    v_uint8 s3 = vx_load(&src0[3][w]);
+                    v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
+                }
+
+                if (w < inSz.width) {
+                    w = inSz.width - nlanes;
+                    continue;
+                }
+                break;
+            }
+
+            // horizontal pass
+            horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
+        } else {  // any LPI
+            GAPI_Assert(outSz.width >= half_nlanes);
+            for (int l = 0; l < lpi; ++l) {
+                const uchar* src = src0[l];
+
+                // horizontal pass
+                horizontal_anyLPI(dst, src, mapsx, alpha, outSz.width, l);
+            }
+        }
+
+    } else if (!yRatioEq) {
+        GAPI_DbgAssert(xRatioEq);
+        int length = inSz.width;  // == outSz.width
+
+        for (int l = 0; l < lpi; ++l) {
+            short beta0 = beta[l];
+            const uchar* s0 = src0[l];
+            const uchar* s1 = src1[l];
+
+            // vertical pass
+            vertical_anyLPI(s0, s1, dst[l], length, beta0);
+        }
+
+    } else {
+        GAPI_DbgAssert(xRatioEq && yRatioEq);
+        int length = inSz.width;  // == outSz.width
+
+        for (int l = 0; l < lpi; ++l) {
+            memcpy(dst[l], src0[l], length);
+        }
+    }
+}
 }  // namespace neon
 }  // namespace kernels
 }  // namespace gapi
--- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp
+++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp
@ -29,17 +29,17 @@ void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Si
                     const float xalpha[], float vbuf[]);

 // Resize (bi-linear, 8U)
-void calcRowLinear_8U(uint8_t *dst[],
-                      const uint8_t *src0[],
-                      const uint8_t *src1[],
-                        const short  alpha[],
-                        const short  clone[],
-                        const short  mapsx[],
-                        const short  beta[],
-                            uint8_t  tmp[],
-                        const Size&  inSz,
-                        const Size&  outSz,
-                                int  lpi);
+void calcRowLinear_8UC1(uint8_t *dst[],
+                        const uint8_t *src0[],
+                        const uint8_t *src1[],
+                          const short  alpha[],
+                          const short  clone[],
+                          const short  mapsx[],
+                          const short  beta[],
+                              uint8_t  tmp[],
+                          const Size&  inSz,
+                          const Size&  outSz,
+                                  int  lpi);

 // Resize (bi-linear, 8UC3)
 void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
--- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
+++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
@ -1105,6 +1105,23 @@ static void calcRowLinear(const cv::gapi::fluid::View  & in,
    }
    #endif  // HAVE_SSE

+#ifdef HAVE_NEON
+    if (std::is_same<T, uint8_t>::value) {
+        if (inSz.width >= 16 && outSz.width >= 8) {
+            neon::calcRowLinear_8UC1(reinterpret_cast<uint8_t**>(dst),
+                                     reinterpret_cast<const uint8_t**>(src0),
+                                     reinterpret_cast<const uint8_t**>(src1),
+                                     reinterpret_cast<const short*>(alpha),
+                                     reinterpret_cast<const short*>(clone),
+                                     reinterpret_cast<const short*>(mapsx),
+                                     reinterpret_cast<const short*>(beta),
+                                     reinterpret_cast<uint8_t*>(tmp),
+                                     inSz, outSz, lpi);
+            return;
+        }
+    }
+#endif
+
    for (int l = 0; l < lpi; l++) {
        constexpr static const auto unity = Mapper::unity;

--- a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
+++ b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
@ -2401,6 +2401,31 @@ CV_ALWAYS_INLINE void v_gather_channel(v_int16x8& vec, const uchar src[], const
 }
 }  // namespace

+CV_ALWAYS_INLINE v_uint8x16 v_gather_pairs(const uchar src[], const short* mapsx)
+{
+    int16x8_t result = {};
+    result = vsetq_lane_s16(*reinterpret_cast<const ushort*>(&src[*(mapsx + 0)]), result, 0);
+    result = vsetq_lane_s16(*reinterpret_cast<const ushort*>(&src[*(mapsx + 1)]), result, 1);
+    result = vsetq_lane_s16(*reinterpret_cast<const ushort*>(&src[*(mapsx + 2)]), result, 2);
+    result = vsetq_lane_s16(*reinterpret_cast<const ushort*>(&src[*(mapsx + 3)]), result, 3);
+    result = vsetq_lane_s16(*reinterpret_cast<const ushort*>(&src[*(mapsx + 4)]), result, 4);
+    result = vsetq_lane_s16(*reinterpret_cast<const ushort*>(&src[*(mapsx + 5)]), result, 5);
+    result = vsetq_lane_s16(*reinterpret_cast<const ushort*>(&src[*(mapsx + 6)]), result, 6);
+    result = vsetq_lane_s16(*reinterpret_cast<const ushort*>(&src[*(mapsx + 7)]), result, 7);
+    return v_uint8x16(vreinterpretq_u8_s16(result));
+}
+
+CV_ALWAYS_INLINE v_uint8x16 v_gather_lines(const uchar src[], const short* mapsx)
+{
+    int32x4_t result = {};
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&src[4 * (*(mapsx + 0))]), result, 0);
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&src[4 * (*(mapsx + 1))]), result, 1);
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&src[4 * (*(mapsx + 0) + 1)]), result, 2);
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&src[4 * (*(mapsx + 1) + 1)]), result, 3);
+
+    return v_uint8x16(vreinterpretq_u8_s32(result));
+}
+
 template<int imm>
 CV_ALWAYS_INLINE v_uint8x16 v_blend(const v_uint8x16& a, const v_uint8x16& b)
 {
@ -2448,6 +2473,91 @@ CV_ALWAYS_INLINE v_uint8x16 v_shuffle(const v_uint8x16& a, const v_uint8x16& mas
 #endif
 }

+CV_ALWAYS_INLINE void v_deinterleave(const v_uint8x16& i0, const v_uint8x16& i1,
+                                     const v_uint8x16& i2, const v_uint8x16& i3,
+                                     v_uint8x16& res0, v_uint8x16& res1,
+                                     v_uint8x16& res2, v_uint8x16& res3)
+{
+    uint8x16x2_t p1 = vzipq_u8(i0.val, i2.val);
+    uint8x16x2_t p2 = vzipq_u8(i1.val, i3.val);
+
+    uint8x16_t v0 = p1.val[0];
+    uint8x16_t v1 = p1.val[1];
+    uint8x16_t v2 = p2.val[0];
+    uint8x16_t v3 = p2.val[1];
+
+    uint8x16x2_t p3 = vzipq_u8(v0, v2);
+    uint8x16x2_t p4 = vzipq_u8(v1, v3);
+
+    uint8x16_t u0 = p3.val[0];
+    uint8x16_t u2 = p3.val[1];
+    uint8x16_t u1 = p4.val[0];
+    uint8x16_t u3 = p4.val[1];
+
+    uint8x16x2_t p5 = vzipq_u8(u0, u1);
+    uint8x16x2_t p6 = vzipq_u8(u2, u3);
+
+    v0 = p5.val[0];
+    v2 = p5.val[1];
+    v1 = p6.val[0];
+    v3 = p6.val[1];
+
+    uint8x16x2_t p7 = vzipq_u8(v0, v1);
+    uint8x16x2_t p8 = vzipq_u8(v2, v3);
+
+    res0.val = p7.val[0];
+    res1.val = p7.val[1];
+    res2.val = p8.val[0];
+    res3.val = p8.val[1];
+}
+
+CV_ALWAYS_INLINE void v_deinterleave_expand(const v_uint8x16& src,
+                                            v_int16x8& even, v_int16x8& odd)
+{
+    constexpr int nlanes = static_cast<int>(v_uint8x16::nlanes);
+    uchar mask_e[nlanes] = { 0, -1, 2, -1, 4, -1, 6, -1,
+                            8, -1, 10, -1, 12, -1, 14, -1 };
+
+    uchar mask_o[nlanes] = { 1, -1, 3, -1, 5, -1, 7, -1,
+                            9, -1, 11, -1, 13, -1, 15, -1 };
+
+    uint8x16_t mask_even = vld1q_u8(mask_e);
+    uint8x16_t mask_odd = vld1q_u8(mask_o);
+
+    v_uint8x16 res1 = v_shuffle(src, v_uint8x16(mask_even));
+    v_uint8x16 res2 = v_shuffle(src, v_uint8x16(mask_odd));
+    even.val = vreinterpretq_s16_u8(res1.val);
+    odd.val = vreinterpretq_s16_u8(res2.val);
+}
+
+CV_ALWAYS_INLINE v_int16x8 v_interleave_low(const v_int16x8& a, const v_int16x8& b)
+{
+    int16x8x2_t p = vzipq_s16(a.val, b.val);
+    int16x8_t v = p.val[0];
+    return v_int16x8(v);
+}
+
+CV_ALWAYS_INLINE v_int16x8 v_interleave_high(const v_int16x8& a, const v_int16x8& b)
+{
+    int16x8x2_t p = vzipq_s16(a.val, b.val);
+    int16x8_t v = p.val[1];
+    return v_int16x8(v);
+}
+
+CV_ALWAYS_INLINE v_uint8x16 v_interleave_low(const v_uint8x16& a, const v_uint8x16& b)
+{
+    uint8x16x2_t p = vzipq_u8(a.val, b.val);
+    uint8x16_t v = p.val[0];
+    return v_uint8x16(v);
+}
+
+CV_ALWAYS_INLINE v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8x16& b)
+{
+    uint8x16x2_t p = vzipq_u8(a.val, b.val);
+    uint8x16_t v = p.val[1];
+    return v_uint8x16(v);
+}
+
 template<int shift>
 CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a)
 {
@ -2470,6 +2580,12 @@ CV_ALWAYS_INLINE v_uint8x16 v_shift_left(const v_uint8x16& a)
    return v_slli_si128<16 - shift>(a);
 }

+template<int indx>
+CV_ALWAYS_INLINE v_uint8x16 v_insert(v_uint8x16& a, int64_t b)
+{
+    return v_uint8x16(vreinterpretq_u8_s64(vsetq_lane_s64(b, vreinterpretq_s64_u8(a.val), indx)));
+}
+
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

 //! @endcond