Improve performance of the Resize 3c/3p and Resize 8UC1 (#4945)

* Scratch buffer
* Refactoring horizontal pass
* Refactoring horizontal pass. Step 2
* Refactoring horizontal pass. Step 3
* Refactoring vertical pass. Step 2
* Refactoring horizontal pass. Step 4
* Clean
* Applied comments
* Applied comments. Part 2
Anna Khakimova 2021-04-19 21:11:58 +03:00 committed by GitHub
parent 40eba6a2ef
commit 068229c815
4 changed files with 196 additions and 205 deletions

View File

@ -228,20 +228,90 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chan
}
}
CV_ALWAYS_INLINE void vertical_4LPI(const uint8_t* src0[], const uint8_t* src1[],
uchar tmp[], const short beta[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
v_int16 lo1, hi1, lo2, hi2;
v_int32 res1_s32, res2_s32;
int w = 0;
for (;;) {
for (; w <= length - half_nlanes; w += half_nlanes) {
v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_interleave(r0, r1, lo1, hi1);
v_interleave(r2, r3, lo2, hi2);
v_int32 lo1_s32 = v_reinterpret_as_s32(lo1);
v_int32 hi1_s32 = v_reinterpret_as_s32(hi1);
v_int32 lo2_s32 = v_reinterpret_as_s32(lo2);
v_int32 hi2_s32 = v_reinterpret_as_s32(hi2);
v_interleave(lo1_s32, lo2_s32, res1_s32, res2_s32);
v_int16 res1 = v_reinterpret_as_s16(res1_s32);
v_int16 res2 = v_reinterpret_as_s16(res2_s32);
v_pack_u_store(&tmp[4 * w + 0], res1);
v_pack_u_store(&tmp[4 * w + half_nlanes], res2);
v_interleave(hi1_s32, hi2_s32, res1_s32, res2_s32);
v_int16 res3 = v_reinterpret_as_s16(res1_s32);
v_int16 res4 = v_reinterpret_as_s16(res2_s32);
v_pack_u_store(&tmp[4 * w + 2*half_nlanes], res3);
v_pack_u_store(&tmp[4 * w + 3*half_nlanes], res4);
}
if (w < length) {
w = length - half_nlanes;
continue;
}
break;
}
}
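For reference, a scalar model of what vertical_4LPI computes (an editor's sketch, not part of the patch; it assumes v_mulhrs rounds like the SSSE3 mulhrs instruction, i.e. (a*b + 2^14) >> 15). The interleave/pack sequence above stores the four blended lines pixel-interleaved, so tmp[4*w + l] holds line l at source column w and the horizontal pass can consume four lines per iteration:

#include <cstdint>

// Hypothetical scalar equivalent of the SIMD vertical 4-LPI blend above.
static void vertical_4lpi_scalar(const uint8_t* src0[], const uint8_t* src1[],
                                 uint8_t tmp[], const short beta[], int length) {
    for (int w = 0; w < length; ++w) {
        for (int l = 0; l < 4; ++l) {
            int d = int(src0[l][w]) - int(src1[l][w]);
            int v = int(src1[l][w]) + ((d * beta[l] + (1 << 14)) >> 15);  // Q15 blend, as v_mulhrs
            tmp[4 * w + l] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // v_pack_u_store saturates
        }
    }
}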
template<int chanNum>
CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* tmp, const short mapsx[],
const short clone[], const int length) {
const uchar* tmp, const short mapsx[], const uchar _mask_horizontal[],
const short clone[],
const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
const int half_nlanes = nlanes / 2;
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
const int shift = static_cast<int>(half_nlanes / 4);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15 };
v_uint8 hmask = vx_load(_mask_horizontal);
v_uint8 val_0, val_1, val_2, val_3;
int x = 0;
for (;;) {
for (; x <= length - half_nlanes && x >= 0; x += half_nlanes) {
@ -315,71 +385,19 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
static_assert(v_uint8::nlanes == 16,
"The width of a NEON vector is 128 bits, so one vector contains 16 uchars");
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
int inLength = inSz.width * chanNum;
GAPI_Assert(inLength >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15 };
v_uint8 vmask = vx_load(_mask_vertical);
int w = 0;
for (;;) {
for (; w <= inLength - half_nlanes && w >= 0; w += half_nlanes) {
v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_pack_u(r0, r1);
v_uint8 q1 = v_pack_u(r2, r3);
v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1));
v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1);
v_uint8 q4 = v_shuffle(q2, vmask);
v_uint8 q5 = v_shuffle(q3, vmask);
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
if (w < inLength) {
w = inLength - half_nlanes;
continue;
}
break;
}
vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum);
// horizontal pass
horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width);
horizontal_4LPI<chanNum>(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
} else { // if any lpi
int inLength = inSz.width * chanNum;
@ -397,6 +415,8 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
}
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
int inLength = inSz.width * chanNum;
@ -422,7 +442,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
}
// horizontal pass
horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width);
horizontal_4LPI<chanNum>(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
@ -469,9 +489,8 @@ void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3>& dst,
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 3;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
calcRowLinear_8UC_Impl_<3>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 8UC4)
@ -486,20 +505,18 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 4;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
calcRowLinear_8UC_Impl_<4>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
}
CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
const uchar* tmp, const short mapsx[],
const uchar _mask_horizontal[],
const short clone[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
const int half_nlanes = nlanes / 2;
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
v_uint8 hmask = vx_load(_mask_horizontal);
int x = 0;
for (;;) {
@ -557,12 +574,11 @@ CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
}
}
CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[],
CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst,
const uchar* src, const short mapsx[],
const short alpha[], const int length,
const int line) {
const short alpha[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
const int half_nlanes = nlanes / 2;
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
v_int16 t0, t1;
int x = 0;
@ -573,7 +589,7 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[],
v_deinterleave_expand(t, t0, t1);
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[line][x], d);
v_pack_u_store(&dst[x], d);
}
if (x < length) {
@ -608,79 +624,34 @@ void calcRowLinear_8UC1(uint8_t* dst[],
if (!xRatioEq && !yRatioEq) {
GAPI_Assert(inSz.width >= half_nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15 };
v_uint8 vmask = vx_load(_mask_vertical);
int w = 0;
for (;;) {
for (; w <= inSz.width - half_nlanes; w += half_nlanes) {
v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_pack_u(r0, r1);
v_uint8 q1 = v_pack_u(r2, r3);
v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1));
v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1);
v_uint8 q4 = v_shuffle(q2, vmask);
v_uint8 q5 = v_shuffle(q3, vmask);
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
if (w < inSz.width) {
w = inSz.width - half_nlanes;
continue;
}
break;
}
vertical_4LPI(src0, src1, tmp, beta, inSz.width);
// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
} else { // if any lpi
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
const uchar* s0 = src0[l];
const uchar* s1 = src1[l];
uchar* _dst = dst[l];
// vertical pass
vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);
// horizontal pass
horizontal_anyLPI(dst, tmp, mapsx, alpha, outSz.width, l);
horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
GAPI_Assert(inSz.width >= nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
@ -702,14 +673,15 @@ void calcRowLinear_8UC1(uint8_t* dst[],
}
// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
} else { // any LPI
GAPI_Assert(outSz.width >= half_nlanes);
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
uchar* _dst = dst[l];
// horizontal pass
horizontal_anyLPI(dst, src, mapsx, alpha, outSz.width, l);
horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width);
}
}
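The horizontal passes follow the same fixed-point scheme. A scalar model of horizontal_anyLPI (editor's sketch, not taken from the patch): mapsx[x] is the left source column for output column x, alpha[x] its Q15 weight, and the same v_mulhrs-style rounding is assumed:

#include <cstdint>

// Hypothetical scalar equivalent of horizontal_anyLPI for one output row.
static void horizontal_anylpi_scalar(uint8_t* dst, const uint8_t* src,
                                     const short mapsx[], const short alpha[], int length) {
    for (int x = 0; x < length; ++x) {
        int sx = mapsx[x];                                              // left neighbour; sx+1 is the right one
        int d  = int(src[sx]) - int(src[sx + 1]);
        int v  = int(src[sx + 1]) + ((d * alpha[x] + (1 << 14)) >> 15); // Q15 blend
        dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}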

View File

@ -895,7 +895,7 @@ struct linearScratchDesc {
tmp = reinterpret_cast<T*> (mapsy + outH*2);
}
static int bufSize(int inW, int inH, int outW, int outH, int lpi) {
static int bufSize(int inW, int /*inH*/, int outW, int outH, int lpi) {
auto size = outW * sizeof(alpha_t) +
outW * sizeof(alpha_t) * 4 + // alpha clones // previous alpha is redundant?
outW * sizeof(index_t) +
@ -910,7 +910,7 @@ struct linearScratchDesc {
template<typename T, typename Mapper, int chanNum = 1>
static void initScratchLinear(const cv::GMatDesc& in,
const Size& outSz,
cv::gapi::fluid::Buffer& scratch,
cv::gapi::fluid::Buffer& scratch,
int lpi) {
using alpha_type = typename Mapper::alpha_type;
static const auto unity = Mapper::unity;
@ -1171,7 +1171,7 @@ static void calcRowLinear(const cv::gapi::fluid::View & in,
template<typename T, class Mapper, int numChan>
static void calcRowLinearC(const cv::gapi::fluid::View & in,
std::array<std::reference_wrapper<cv::gapi::fluid::Buffer>, numChan>& out,
cv::gapi::fluid::Buffer& scratch) {
cv::gapi::fluid::Buffer& scratch) {
using alpha_type = typename Mapper::alpha_type;
auto inSz = in.meta().size;

View File

@ -18,12 +18,12 @@ namespace gapi {
namespace kernels {
inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
uint8_t out[], int length) {
CV_ALWAYS_INLINE void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
uint8_t out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -46,12 +46,12 @@ inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
}
}
inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
const uint8_t in2[], uint8_t out[], int length) {
CV_ALWAYS_INLINE void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
const uint8_t in2[], uint8_t out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -75,12 +75,13 @@ inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
}
}
inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
const uint8_t in3[], uint8_t out[], int length) {
CV_ALWAYS_INLINE void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[],
const uint8_t in2[], const uint8_t in3[],
uint8_t out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -106,12 +107,12 @@ inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const u
}
}
inline void mergeRow_32FC2_Impl(const float in0[], const float in1[],
float out[], int length) {
CV_ALWAYS_INLINE void mergeRow_32FC2_Impl(const float in0[], const float in1[],
float out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -133,12 +134,12 @@ inline void mergeRow_32FC2_Impl(const float in0[], const float in1[],
}
}
inline void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[],
float out[], int length) {
CV_ALWAYS_INLINE void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[],
float out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -162,13 +163,13 @@ inline void mergeRow_32FC3_Impl(const float in0[], const float in1[], const floa
}
}
inline void mergeRow_32FC4_Impl(const float in0[], const float in1[],
const float in2[], const float in3[],
float out[], int length) {
CV_ALWAYS_INLINE void mergeRow_32FC4_Impl(const float in0[], const float in1[],
const float in2[], const float in3[],
float out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -196,12 +197,12 @@ inline void mergeRow_32FC4_Impl(const float in0[], const float in1[],
//------------------------------------------------------------------------------
inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
uint8_t out1[], int length) {
CV_ALWAYS_INLINE void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
uint8_t out1[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -223,12 +224,12 @@ inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
}
}
inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
uint8_t out1[], uint8_t out2[], int length) {
CV_ALWAYS_INLINE void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
uint8_t out1[], uint8_t out2[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -252,12 +253,12 @@ inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
}
}
inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[],
uint8_t out2[], uint8_t out3[], int length) {
CV_ALWAYS_INLINE void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[],
uint8_t out2[], uint8_t out3[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -283,12 +284,12 @@ inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[
}
}
inline void splitRow_32FC2_Impl(const float in[], float out0[],
CV_ALWAYS_INLINE void splitRow_32FC2_Impl(const float in[], float out0[],
float out1[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -311,12 +312,12 @@ inline void splitRow_32FC2_Impl(const float in[], float out0[],
}
}
inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
float out2[], int length) {
CV_ALWAYS_INLINE void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
float out2[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -340,12 +341,12 @@ inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
}
}
inline void splitRow_32FC4_Impl(const float in[], float out0[], float out1[],
float out2[], float out3[], int length) {
CV_ALWAYS_INLINE void splitRow_32FC4_Impl(const float in[], float out0[], float out1[],
float out2[], float out3[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -380,7 +381,7 @@ static const int ITUR_BT_601_CVG = -852492;
static const int ITUR_BT_601_CVR = 1673527;
static const int ITUR_BT_601_SHIFT = 20;
static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) {
CV_ALWAYS_INLINE void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) {
int uu, vv;
uu = static_cast<int>(u) - 128;
vv = static_cast<int>(v) - 128;
@ -390,9 +391,9 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i
buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
}
static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 (&ruv)[4], v_int32 (&guv)[4],
v_int32 (&buv)[4]) {
CV_ALWAYS_INLINE void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 (&ruv)[4], v_int32 (&guv)[4],
v_int32 (&buv)[4]) {
v_uint8 v128 = vx_setall_u8(128);
v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128));
v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128));
@ -417,8 +418,8 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
}
}
static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, const int buv,
uchar& r, uchar& g, uchar& b) {
CV_ALWAYS_INLINE void yRGBuvToRGB(const uchar vy, const int ruv, const int guv,
const int buv, uchar& r, uchar& g, uchar& b) {
int yy = static_cast<int>(vy);
int y = std::max(0, yy - 16) * ITUR_BT_601_CY;
r = saturate_cast<uchar>((y + ruv) >> ITUR_BT_601_SHIFT);
@ -426,11 +427,11 @@ static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, con
b = saturate_cast<uchar>((y + buv) >> ITUR_BT_601_SHIFT);
}
static inline void yRGBuvToRGB(const v_uint8& vy,
const v_int32 (&ruv)[4],
const v_int32 (&guv)[4],
const v_int32 (&buv)[4],
v_uint8& rr, v_uint8& gg, v_uint8& bb) {
CV_ALWAYS_INLINE void yRGBuvToRGB(const v_uint8& vy,
const v_int32 (&ruv)[4],
const v_int32 (&guv)[4],
const v_int32 (&buv)[4],
v_uint8& rr, v_uint8& gg, v_uint8& bb) {
v_uint8 v16 = vx_setall_u8(16);
v_uint8 posY = vy - v16;
v_uint16 yy0, yy1;
@ -463,15 +464,14 @@ static inline void yRGBuvToRGB(const v_uint8& vy,
bb = v_pack_u(b0, b1);
}
inline void calculate_nv12_to_rgb_impl(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width) {
CV_ALWAYS_INLINE void calculate_nv12_to_rgb_impl(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width) {
int i = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
for ( ; i <= width - 2*nlanes; i += 2*nlanes) {
v_uint8 u, v;
@ -535,14 +535,13 @@ inline void calculate_nv12_to_rgb_impl(const uchar **srcY,
}
}
inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU,
const uchar *srcV, uchar **dstRGBx,
int width) {
CV_ALWAYS_INLINE void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU,
const uchar *srcV, uchar **dstRGBx,
int width) {
int i = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
for ( ; i <= width - 2*nlanes; i += 2*nlanes) {
v_uint8 u = vx_load(srcU + i/2);
@ -610,8 +609,8 @@ inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU,
// vertical pass
template<typename T, typename A, typename I, typename W>
static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap, A yalpha,
W vbuf[]) {
CV_ALWAYS_INLINE void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap,
A yalpha, W vbuf[]) {
int y_1st = ymap.index0;
int ylast = ymap.index1 - 1;
@ -619,7 +618,7 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ym
GAPI_DbgAssert(y_1st < ylast);
#if MANUAL_SIMD
const int nlanes = v_uint16::nlanes;
constexpr int nlanes = v_uint16::nlanes;
#endif
// 1st and last rows
@ -667,8 +666,8 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ym
// horizontal pass
template<typename T, typename A, typename I, typename W>
static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[],
const A xalpha[], const W vbuf[]) {
CV_ALWAYS_INLINE void downx(T dst[], int outWidth, int xmaxdf, const I xindex[],
const A xalpha[], const W vbuf[]) {
// TO DO: try lambda here
#define HSUM(xmaxdf) \
for (int x = 0; x < outWidth; x++) { \
@ -704,9 +703,11 @@ static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[],
}
template<typename T, typename A, typename I, typename W>
static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Size& outSz,
A yalpha, const MapperUnit<A, I>& ymap, int xmaxdf, const I xindex[], const A xalpha[],
W vbuf[]) {
CV_ALWAYS_INLINE void calcRowArea_impl(T dst[], const T *src[], const Size& inSz,
const Size& outSz, A yalpha,
const MapperUnit<A, I>& ymap, int xmaxdf,
const I xindex[], const A xalpha[],
W vbuf[]) {
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
@ -738,18 +739,18 @@ static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Si
#if MANUAL_SIMD
template <typename VecT, typename T>
void copyRow_impl(const T in[], T out[], int l) {
CV_ALWAYS_INLINE void copyRow_impl(const T in[], T out[], int l) {
VecT r;
r = vx_load(&in[l]);
vx_store(&out[l], r);
}
#endif
inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) {
CV_ALWAYS_INLINE void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
for (; l <= length - nlanes; l += nlanes) {
copyRow_impl<v_uint8>(in, out, l);
@ -766,11 +767,11 @@ inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) {
}
}
inline void copyRow_32F_impl(const float in[], float out[], int length) {
CV_ALWAYS_INLINE void copyRow_32F_impl(const float in[], float out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
for (; l <= length - nlanes; l += nlanes) {
copyRow_impl<v_float32>(in, out, l);
@ -801,7 +802,7 @@ CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[],
bool yRatioEq1 = inSz.height == outSz.height;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
#endif
if (!xRatioEq1 && !yRatioEq1) {
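A pattern that recurs in all of these kernels (the cycle: labels in this file, and the "w = length - half_nlanes; continue;" blocks in the resize kernels) is to finish a row by stepping back and re-running one full, possibly overlapping vector instead of falling through to a scalar tail; the GAPI_Assert(length >= half_nlanes) checks in the resize kernels are what make the step-back safe. A standalone sketch with hypothetical names, using memcpy in place of vx_load/vx_store:

#include <cstdint>
#include <cstring>

// Illustration of the overlap-the-tail loop; the last iteration may rewrite up to
// nlanes-1 already-written elements, which is harmless for element-wise kernels.
static void copy_row_sketch(const uint8_t in[], uint8_t out[], int length) {
    constexpr int nlanes = 16;                        // v_uint8::nlanes on 128-bit SIMD
    int l = 0;
    for (;;) {
        for (; l <= length - nlanes; l += nlanes) {
            std::memcpy(&out[l], &in[l], nlanes);     // stands in for vx_store(vx_load(...))
        }
        if (l < length) {                             // leftover tail shorter than one vector
            l = length - nlanes;                      // step back to a full, overlapping vector
            continue;
        }
        break;
    }
}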

View File

@ -2606,6 +2606,24 @@ CV_ALWAYS_INLINE v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8
return v_uint8x16(v);
}
CV_ALWAYS_INLINE void v_interleave(const v_int16x8& a, const v_int16x8& b,
v_int16x8& v1, v_int16x8& v2)
{
int16x8x2_t p = vzipq_s16(a.val, b.val);
v1.val = p.val[0];
v2.val = p.val[1];
return;
}
CV_ALWAYS_INLINE void v_interleave(const v_int32x4& a, const v_int32x4& b,
v_int32x4& v1, v_int32x4& v2)
{
int32x4x2_t p = vzipq_s32(a.val, b.val);
v1.val = p.val[0];
v2.val = p.val[1];
return;
}
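The new v_interleave overloads wrap NEON's vzipq: the low halves of a and b are zipped lane by lane into v1, the high halves into v2. A scalar model of the 16-bit variant (illustration only, not part of the header):

// v1 = {a0,b0,a1,b1,a2,b2,a3,b3}, v2 = {a4,b4,a5,b5,a6,b6,a7,b7}
static void interleave_s16_model(const short a[8], const short b[8], short v1[8], short v2[8]) {
    for (int i = 0; i < 4; ++i) {
        v1[2 * i]     = a[i];
        v1[2 * i + 1] = b[i];
        v2[2 * i]     = a[i + 4];
        v2[2 * i + 1] = b[i + 4];
    }
}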
template<int shift>
CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a)
{