SIMD optimization of 8U3C/4C Resize for ARM via universal intrinsics. (#4331)

* SIMD opt of 8U3C/4C Resize for ARM

* Fix issue.

* Applied comments.

* Applied comments. Step 2.
Anna Khakimova 2021-03-05 19:58:46 +03:00 committed by GitHub
parent 3dd5bfcfdd
commit 3656e1c564
5 changed files with 545 additions and 23 deletions


@@ -4,6 +4,7 @@
#include "ie_preprocess_gapi_kernels.hpp"
#include "ie_preprocess_gapi_kernels_impl.hpp"
#include "ie_preprocess_gapi_kernels_neon.hpp"
#include <arm_neon.h>
@@ -126,6 +127,351 @@ void copyRow_32F(const float in[], float out[], int length) {
copyRow_32F_impl(in, out, length);
}
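// Splits an interleaved chanNum-channel row into separate planes: each channel is
// gathered with v_gather_channel<chanNum> and stored to dst[c][line]. The tail is
// handled by re-running the last full vector (x = width - nlanes), so stores may overlap.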
template<int chanNum>
CV_ALWAYS_INLINE void channels2planes_store(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* src, const int width,
const int nlanes, const int line) {
v_uint8 chan;
int x = 0;
for (;;) {
for (; x <= width - nlanes && x >= 0; x += nlanes) {
for (int c = 0; c < chanNum; ++c) {
v_gather_channel<chanNum>(chan, &src[chanNum * x], c);
vx_store(&dst[c][line][x], chan);
}
}
if (x < width) {
x = width - nlanes;
continue;
}
break;
}
}
CV_ALWAYS_INLINE void vertical_anyLPI(const uchar* src0, const uchar* src1,
uchar* tmp, const int inLength,
const int nlanes, const short beta) {
int w = 0;
const int half_nlanes = nlanes/2;
for (;;) {
for (; w <= inLength - nlanes; w += nlanes) {
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[w]));
v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[w]));
v_int16 s2 = v_reinterpret_as_s16(vx_load_expand(&src0[w + half_nlanes]));
v_int16 s3 = v_reinterpret_as_s16(vx_load_expand(&src1[w + half_nlanes]));
v_int16 res1 = v_mulhrs(s0 - s1, beta) + s1;
v_int16 res2 = v_mulhrs(s2 - s3, beta) + s3;
vx_store(tmp + w, v_pack_u(res1, res2));
}
if (w < inLength) {
w = inLength - nlanes;
continue;
}
break;
}
}
template<int chanNum>
CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* src, const short mapsx[],
const short alpha[], const int nlanes,
const int width, const int line) {
const int half_nlanes = nlanes/2;
v_int16 t0, t1;//, t2, t3;
int x = 0;
for (;;) {
for (; x <= width - half_nlanes && x >= 0; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]);
for (int c = 0; c < chanNum; ++c) {
v_gather_channel<chanNum>(t0, src, &mapsx[x], c, 0);
v_gather_channel<chanNum>(t1, src, &mapsx[x], c, 1);
//v_gather_channel<chanNum>(t2, src, &mapsx[x + half_nlanes], c, 0);
//v_gather_channel<chanNum>(t3, src, &mapsx[x + half_nlanes], c, 1);
v_int16 res1 = v_mulhrs(t0 - t1, a0) + t1;
//v_int16 res2 = v_mulhrs(t2 - t3, a0) + t3;
//vx_store(&dst[c][line][x], v_pack_u(res1, res2));
v_pack_u_store(&dst[c][line][x], res1);
}
}
if (x < width) {
//x = width - nlanes;
x = width - half_nlanes;
continue;
}
break;
}
}
template<int chanNum>
CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* tmp, const short mapsx[], const short clone[],
const int width, const int nlanes) {
v_uint8 val_0, val_1, val_2, val_3;
const int half_nlanes = nlanes / 2;
const int shift = static_cast<int>(half_nlanes / 4);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15 };
v_uint8 hmask = vx_load(_mask_horizontal);
int x = 0;
for (;;) {
for (; x <= width - half_nlanes && x >= 0; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 2)]);
v_int16 a54 = vx_load(&clone[4 * (x + 4)]);
v_int16 a76 = vx_load(&clone[4 * (x + 6)]);
for (int c = 0; c < chanNum; ++c) {
v_gather_channel(val_0, tmp, &mapsx[x], chanNum, c, 0);
v_gather_channel(val_1, tmp, &mapsx[x], chanNum, c, shift);
v_gather_channel(val_2, tmp, &mapsx[x], chanNum, c, shift * 2);
v_gather_channel(val_3, tmp, &mapsx[x], chanNum, c, shift * 3);
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_pack_u(r0, r1);
v_uint8 q1 = v_pack_u(r2, r3);
v_uint8 q2 = v_shuffle(q0, hmask);
v_uint8 q3 = v_shuffle(q1, hmask);
v_uint8 q4 = v_blend<0xCC /*0b11001100*/>(q2, v_slli_si128(q3, 4));
v_uint8 q5 = v_blend<0xCC /*0b11001100*/>(v_srli_si128(q2, 4), q3);
v_store_low(&dst[c][0][x], q4);
v_store_high(&dst[c][1][x], q4);
v_store_low(&dst[c][2][x], q5);
v_store_high(&dst[c][3][x], q5);
}
}
if (x < width) {
x = width - half_nlanes;
continue;
}
break;
}
}
template<int chanNum>
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int nlanes = static_cast<int>(v_int8::nlanes);
constexpr int half_nlanes = nlanes / 2;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
// vertical pass
int inLength = inSz.width * chanNum;
GAPI_Assert(inLength >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15 };
v_uint8 vmask = vx_load(_mask_vertical);
int w = 0;
for (;;) {
for (; w <= inLength - half_nlanes && w >= 0; w += half_nlanes) {
v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_pack_u(r0, r1);
v_uint8 q1 = v_pack_u(r2, r3);
v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_slli_si128(q1, 4));
v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_srli_si128(q0, 4), q1);
v_uint8 q4 = v_shuffle(q2, vmask);
v_uint8 q5 = v_shuffle(q3, vmask);
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
if (w < inLength) {
w = inLength - half_nlanes;
continue;
}
break;
}
// horizontal pass
GAPI_Assert(outSz.width >= half_nlanes);
horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width, nlanes);
} else { // if any lpi
int inLength = inSz.width * chanNum;
GAPI_Assert(inLength >= half_nlanes);
GAPI_Assert(outSz.width >= half_nlanes);
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
const uchar* s0 = src0[l];
const uchar* s1 = src1[l];
// vertical pass
vertical_anyLPI(s0, s1, tmp, inLength, nlanes, beta0);
// horizontal pass
horizontal_anyLPI<chanNum>(dst, tmp, mapsx, alpha, nlanes, outSz.width, l);
}
}
} else if (!xRatioEq) {
GAPI_Assert(yRatioEq);
if (4 == lpi) {
int inLength = inSz.width * chanNum;
// vertical pass
GAPI_DbgAssert(inLength >= nlanes);
v_uint8 s0, s1, s2, s3;
int w = 0;
for (;;) {
for (; w <= inLength - nlanes; w += nlanes) {
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[lpi * w], s0, s1, s2, s3);
}
if (w < inLength) {
w = inLength - nlanes;
continue;
}
break;
}
// horizontal pass
GAPI_Assert(outSz.width >= half_nlanes);
horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width, nlanes);
} else { // any LPI
GAPI_Assert(outSz.width >= half_nlanes);
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
// horizontal pass
horizontal_anyLPI<chanNum>(dst, src, mapsx, alpha, nlanes, outSz.width, l);
}
}
} else if (!yRatioEq) {
GAPI_Assert(xRatioEq);
int inLength = inSz.width * chanNum; // inSz.width == outSz.width in this branch
GAPI_Assert(inLength >= half_nlanes);
GAPI_Assert(outSz.width >= nlanes);
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
const uchar* s0 = src0[l];
const uchar* s1 = src1[l];
// vertical pass
vertical_anyLPI(s0, s1, tmp, inLength, nlanes, beta0);
//split channels to planes and store
channels2planes_store<chanNum>(dst, tmp, outSz.width, nlanes, l);
}
} else {
GAPI_Assert(xRatioEq && yRatioEq);
GAPI_Assert(outSz.width >= nlanes);
//split channels to planes and store
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
channels2planes_store<chanNum>(dst, src, outSz.width, nlanes, l);
}
}
}
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3>& dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 3;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 4;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
}
} // namespace neon
} // namespace kernels
} // namespace gapi


@@ -904,34 +904,33 @@ void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 3;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 4;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}


@@ -1223,6 +1223,23 @@ static void calcRowLinearC(const cv::gapi::fluid::View & in,
}
#endif // HAVE_SSE
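// NEON path for 8U inputs: taken only when the row is wide enough for the vector
// kernels (at least 16 input pixels and 8 output pixels).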
#ifdef HAVE_NEON
if (std::is_same<T, uint8_t>::value) {
if (inSz.width >= 16 && outSz.width >= 8) {
neon::calcRowLinear_8UC<numChan>(dst,
reinterpret_cast<const uint8_t**>(src0),
reinterpret_cast<const uint8_t**>(src1),
reinterpret_cast<const short*>(alpha),
reinterpret_cast<const short*>(clone),
reinterpret_cast<const short*>(mapsx),
reinterpret_cast<const short*>(beta),
reinterpret_cast<uint8_t*>(tmp),
inSz, outSz, lpi);
return;
}
}
#endif // HAVE_NEON
auto length = out[0].get().length();
for (int l = 0; l < lpi; l++) {


@@ -188,6 +188,16 @@ inline unsigned int trailingZeros32(unsigned int value) {
// access from within opencv code more accessible
namespace cv {
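// Define CV_ALWAYS_INLINE if the OpenCV headers in use do not provide it:
// force inlining on GCC/Clang and MSVC, plain inline otherwise.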
#ifndef CV_ALWAYS_INLINE
#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
#define CV_ALWAYS_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
#define CV_ALWAYS_INLINE __forceinline
#else
#define CV_ALWAYS_INLINE inline
#endif
#endif
namespace hal {
enum StoreMode


@@ -2323,7 +2323,157 @@ inline v_uint16x8 v_mulhi(const v_uint16x8& a, uint16_t b) {
return result;
}
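// Rounded Q15 multiply-high, the NEON counterpart of SSE _mm_mulhrs_epi16:
// per lane, result[i] = (int16_t)((a[i] * b[i] + (1 << 14)) >> 15).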
CV_ALWAYS_INLINE v_int16x8 v_mulhrs(const v_int16x8& a, const v_int16x8& b)
{
// Multiply
int32x4_t mul_lo = vmull_s16(vget_low_s16(a.val),
vget_low_s16(b.val));
int32x4_t mul_hi = vmull_s16(vget_high_s16(a.val),
vget_high_s16(b.val));
// Rounding narrowing shift right
// narrow = (int16_t)((mul + 16384) >> 15);
int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
// Join together
return v_int16x8(vcombine_s16(narrow_lo, narrow_hi));
}
CV_ALWAYS_INLINE v_int16x8 v_mulhrs(const v_int16x8& a, const short b)
{
return v_mulhrs(a, v_setall_s16(b));
}
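// Gathers channel c for the 4-LPI horizontal pass: tmp interleaves the 4 lines, so every
// pixel/channel occupies 4 consecutive bytes. Lanes 0-1 hold the 4-line groups of pixels
// index[shift] and index[shift + 1]; lanes 2-3 hold their right neighbours (+1).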
CV_ALWAYS_INLINE void v_gather_channel(v_uint8x16& vec, const uint8_t tmp[],
const short* index, const int chanNum,
const int c, const int shift)
{
int32x4_t result = {};
result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 0)) + c)]), result, 0);
result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 1)) + c)]), result, 1);
result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 0) + 1) + c)]), result, 2);
result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 1) + 1) + c)]), result, 3);
vec.val = vreinterpretq_u8_s32(result);
}
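// Gathers one channel of 16 consecutive interleaved pixels (stride chanNum bytes)
// into a u8 vector; used to split interleaved channels into planes.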
template<int chanNum>
CV_ALWAYS_INLINE void v_gather_channel(v_uint8x16& vec, const uchar src[], const int channel)
{
uint8x16_t result = {};
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + channel), result, 0);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + chanNum + channel), result, 1);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 2 * chanNum + channel), result, 2);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 3 * chanNum + channel), result, 3);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 4 * chanNum + channel), result, 4);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 5 * chanNum + channel), result, 5);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 6 * chanNum + channel), result, 6);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 7 * chanNum + channel), result, 7);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 8 * chanNum + channel), result, 8);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 9 * chanNum + channel), result, 9);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 10 * chanNum + channel), result, 10);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 11 * chanNum + channel), result, 11);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 12 * chanNum + channel), result, 12);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 13 * chanNum + channel), result, 13);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 14 * chanNum + channel), result, 14);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 15 * chanNum + channel), result, 15);
vec.val = result;
}
namespace {
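// Gathers channel `channel` of 8 pixels addressed by index[0..7] + pos (pos = 0 for the
// left neighbour, 1 for the right one) and widens the bytes into 16-bit lanes.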
template<int chanNum>
CV_ALWAYS_INLINE void v_gather_channel(v_int16x8& vec, const uchar src[], const short* index, const int channel, const int pos)
{
int16x8_t result = {};
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*index + pos) + channel]), result, 0);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 1) + pos) + channel]), result, 1);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 2) + pos) + channel]), result, 2);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 3) + pos) + channel]), result, 3);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 4) + pos) + channel]), result, 4);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 5) + pos) + channel]), result, 5);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 6) + pos) + channel]), result, 6);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 7) + pos) + channel]), result, 7);
vec.val = result;
}
} // namespace
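// 16-bit blend in the spirit of SSE _mm_blend_epi16: for each of the eight 16-bit lanes,
// the lane comes from b if the corresponding bit of imm is set, otherwise from a.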
template<int imm>
CV_ALWAYS_INLINE v_uint8x16 v_blend(const v_uint8x16& a, const v_uint8x16& b)
{
const uint16_t _mask[8] = { ((imm) & (1 << 0)) ? 0xFFFF : 0x0000,
((imm) & (1 << 1)) ? 0xFFFF : 0x0000,
((imm) & (1 << 2)) ? 0xFFFF : 0x0000,
((imm) & (1 << 3)) ? 0xFFFF : 0x0000,
((imm) & (1 << 4)) ? 0xFFFF : 0x0000,
((imm) & (1 << 5)) ? 0xFFFF : 0x0000,
((imm) & (1 << 6)) ? 0xFFFF : 0x0000,
((imm) & (1 << 7)) ? 0xFFFF : 0x0000 };
uint16x8_t _mask_vec = vld1q_u16(_mask);
uint16x8_t _a = vreinterpretq_u16_u8(a.val);
uint16x8_t _b = vreinterpretq_u16_u8(b.val);
return v_uint8x16(vreinterpretq_u8_u16(vbslq_u16(_mask_vec, _b, _a)));
}
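// Byte shuffle analogous to SSE _mm_shuffle_epi8: result[i] = a[mask[i] & 0x0F],
// or zero when the high bit of mask[i] is set.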
CV_ALWAYS_INLINE v_uint8x16 v_shuffle(const v_uint8x16& a, const v_uint8x16& mask)
{
uint8x16_t tbl = a.val; // input a
uint8x16_t idx = mask.val; // input mask
uint8x16_t idx_masked =
vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
#if defined(__aarch64__)
return v_uint8x16(vqtbl1q_u8(tbl, idx_masked));
#elif defined(__GNUC__)
uint8x16_t ret;
// %e and %f represent the even and odd D registers
// respectively.
__asm__ __volatile__(
"vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
"vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
: [ret] "=&w"(ret)
: [tbl] "w"(tbl), [idx] "w"(idx_masked));
return v_uint8x16(ret);
#else
uint8x8x2_t a_split = { vget_low_u8(tbl), vget_high_u8(tbl) };
return v_uint8x16(vcombine_u8(
vtbl2_u8(a_split, vget_low_u8(idx_masked)),
vtbl2_u8(a_split, vget_high_u8(idx_masked))));
#endif
}
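// Shifts the whole 128-bit register left by imm bytes, zero-filling (cf. _mm_slli_si128).
// Note: vextq_u8 needs an immediate, so imm must fold to a constant after inlining.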
CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a, const int imm)
{
uint8x16_t ret = {};
if (imm <= 0) {
ret = a.val;
} else if (imm > 15) {
ret = vdupq_n_u8(0);
} else {
ret = vextq_u8(vdupq_n_u8(0), a.val, 16 - (imm));
}
return v_uint8x16(ret);
}
CV_ALWAYS_INLINE v_uint8x16 v_srli_si128(const v_uint8x16& a, const int imm)
{
uint8x16_t ret = {};
if (imm <= 0) {
ret = a.val;
} else if (imm > 15) {
ret = vdupq_n_u8(0);
} else {
ret = vextq_u8(a.val, vdupq_n_u8(0), imm);
}
return v_uint8x16(ret);
}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END