From 068229c8151338f448c29f00ccbc82ce5b42e006 Mon Sep 17 00:00:00 2001 From: Anna Khakimova Date: Mon, 19 Apr 2021 21:11:58 +0300 Subject: [PATCH] Improve performance of the Resize 3c/3p and Resize 8UC1 (#4945) * scratch buffer * Refactoring horizontal path * * Refactoring horizontal pass. Step2 * * Refactoring horizontal pass. Step 3 * * Refactoring vertical pass. Step2 * Refactoring horizontal pass. Step4 * * Clean * Applied comments. * * Applied comments. Part 2 --- .../ie_preprocess_gapi_kernels_neon.cpp | 232 ++++++++---------- .../ie_preprocess_gapi_kernels.cpp | 6 +- .../ie_preprocess_gapi_kernels_simd_impl.hpp | 145 +++++------ .../thirdparty/ocv/opencv_hal_neon.hpp | 18 ++ 4 files changed, 196 insertions(+), 205 deletions(-) diff --git a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp index 493ed365e45..779db927c32 100644 --- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp +++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp @@ -228,20 +228,90 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(std::array, chan } } +CV_ALWAYS_INLINE void vertical_4LPI(const uint8_t* src0[], const uint8_t* src1[], + uchar tmp[], const short beta[], const int length) { + constexpr int nlanes = static_cast(v_uint8::nlanes); + constexpr int half_nlanes = nlanes / 2; + GAPI_Assert(length >= half_nlanes); + + v_int16 b0 = vx_setall_s16(beta[0]); + v_int16 b1 = vx_setall_s16(beta[1]); + v_int16 b2 = vx_setall_s16(beta[2]); + v_int16 b3 = vx_setall_s16(beta[3]); + + v_int16 lo1, hi1, lo2, hi2; + v_int32 res1_s32, res2_s32; + int w = 0; + for (;;) { + for (; w <= length - half_nlanes; w += half_nlanes) { + v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w])); + v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w])); + v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w])); + v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w])); + + v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w])); + v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w])); + v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w])); + v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w])); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_interleave(r0, r1, lo1, hi1); + v_interleave(r2, r3, lo2, hi2); + + v_int32 lo1_s32 = v_reinterpret_as_s32(lo1); + v_int32 hi1_s32 = v_reinterpret_as_s32(hi1); + v_int32 lo2_s32 = v_reinterpret_as_s32(lo2); + v_int32 hi2_s32 = v_reinterpret_as_s32(hi2); + + v_interleave(lo1_s32, lo2_s32, res1_s32, res2_s32); + + v_int16 res1 = v_reinterpret_as_s16(res1_s32); + v_int16 res2 = v_reinterpret_as_s16(res2_s32); + + v_pack_u_store(&tmp[4 * w + 0], res1); + v_pack_u_store(&tmp[4 * w + half_nlanes], res2); + + v_interleave(hi1_s32, hi2_s32, res1_s32, res2_s32); + + v_int16 res3 = v_reinterpret_as_s16(res1_s32); + v_int16 res4 = v_reinterpret_as_s16(res2_s32); + + v_pack_u_store(&tmp[4 * w + 2*half_nlanes], res3); + v_pack_u_store(&tmp[4 * w + 3*half_nlanes], res4); + } + + if (w < length) { + w = length - 
half_nlanes; + continue; + } + break; + } +} + template CV_ALWAYS_INLINE void horizontal_4LPI(std::array, chanNum>& dst, - const uchar* tmp, const short mapsx[], - const short clone[], const int length) { + const uchar* tmp, const short mapsx[], const uchar _mask_horizontal[], + const short clone[], + const int length) { constexpr int nlanes = static_cast(v_uint8::nlanes); - const int half_nlanes = nlanes / 2; + constexpr int half_nlanes = nlanes / 2; GAPI_Assert(length >= half_nlanes); const int shift = static_cast(half_nlanes / 4); - uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15 }; v_uint8 hmask = vx_load(_mask_horizontal); v_uint8 val_0, val_1, val_2, val_3; + int x = 0; for (;;) { for (; x <= length - half_nlanes && x >= 0; x += half_nlanes) { @@ -315,71 +385,19 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array static_assert(v_uint8::nlanes == 16, "The wide of NEON vector is 128 bits, so one vector contains 16 uchars"); constexpr int nlanes = static_cast(v_uint8::nlanes); - constexpr int half_nlanes = nlanes / 2; bool xRatioEq = inSz.width == outSz.width; bool yRatioEq = inSz.height == outSz.height; if (!xRatioEq && !yRatioEq) { + uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15 }; if (4 == lpi) { // vertical pass - int inLength = inSz.width * chanNum; - GAPI_Assert(inLength >= half_nlanes); - - v_int16 b0 = vx_setall_s16(beta[0]); - v_int16 b1 = vx_setall_s16(beta[1]); - v_int16 b2 = vx_setall_s16(beta[2]); - v_int16 b3 = vx_setall_s16(beta[3]); - - uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15 }; - v_uint8 vmask = vx_load(_mask_vertical); - - int w = 0; - for (;;) { - for (; w <= inLength - half_nlanes && w >= 0; w += half_nlanes) { - v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w])); - v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w])); - v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w])); - v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w])); - - v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w])); - v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w])); - v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w])); - v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w])); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_pack_u(r0, r1); - v_uint8 q1 = v_pack_u(r2, r3); - - v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1)); - v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1); - - v_uint8 q4 = v_shuffle(q2, vmask); - v_uint8 q5 = v_shuffle(q3, vmask); - - vx_store(&tmp[4 * w + 0], q4); - vx_store(&tmp[4 * w + 2 * half_nlanes], q5); - } - - if (w < inLength) { - w = inLength - half_nlanes; - continue; - } - break; - } + vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum); // horizontal pass - horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width); + horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width); } else { // if any lpi int inLength = inSz.width * chanNum; @@ -397,6 +415,8 @@ CV_ALWAYS_INLINE void 
calcRowLinear_8UC_Impl_(std::array } } else if (!xRatioEq) { GAPI_DbgAssert(yRatioEq); + uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15 }; if (4 == lpi) { int inLength = inSz.width * chanNum; @@ -422,7 +442,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array } // horizontal pass - horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width); + horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width); } else { // any LPI for (int l = 0; l < lpi; ++l) { const uchar* src = src0[l]; @@ -469,9 +489,8 @@ void calcRowLinear_8U(C3, std::array, 3>& dst, const Size& inSz, const Size& outSz, const int lpi) { - constexpr int chanNum = 3; - calcRowLinear_8UC_Impl_(dst, src0, src1, alpha, clone, mapsx, - beta, tmp, inSz, outSz, lpi); + calcRowLinear_8UC_Impl_<3>(dst, src0, src1, alpha, clone, mapsx, + beta, tmp, inSz, outSz, lpi); } // Resize (bi-linear, 8UC4) @@ -486,20 +505,18 @@ void calcRowLinear_8U(C4, std::array, 4>& dst, const Size& inSz, const Size& outSz, const int lpi) { - constexpr int chanNum = 4; - calcRowLinear_8UC_Impl_(dst, src0, src1, alpha, clone, mapsx, - beta, tmp, inSz, outSz, lpi); + calcRowLinear_8UC_Impl_<4>(dst, src0, src1, alpha, clone, mapsx, + beta, tmp, inSz, outSz, lpi); } CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[], const uchar* tmp, const short mapsx[], + const uchar _mask_horizontal[], const short clone[], const int length) { constexpr int nlanes = static_cast(v_uint8::nlanes); - const int half_nlanes = nlanes / 2; + constexpr int half_nlanes = nlanes / 2; GAPI_Assert(length >= half_nlanes); - uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, - 1, 5, 9, 13, 3, 7, 11, 15 }; v_uint8 hmask = vx_load(_mask_horizontal); int x = 0; for (;;) { @@ -557,12 +574,11 @@ CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[], } } -CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[], +CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst, const uchar* src, const short mapsx[], - const short alpha[], const int length, - const int line) { + const short alpha[], const int length) { constexpr int nlanes = static_cast(v_uint8::nlanes); - const int half_nlanes = nlanes / 2; + constexpr int half_nlanes = nlanes / 2; GAPI_Assert(length >= half_nlanes); v_int16 t0, t1; int x = 0; @@ -573,7 +589,7 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[], v_deinterleave_expand(t, t0, t1); v_int16 d = v_mulhrs(t0 - t1, a0) + t1; - v_pack_u_store(&dst[line][x], d); + v_pack_u_store(&dst[x], d); } if (x < length) { @@ -608,79 +624,34 @@ void calcRowLinear_8UC1(uint8_t* dst[], if (!xRatioEq && !yRatioEq) { GAPI_Assert(inSz.width >= half_nlanes); + uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15 }; if (4 == lpi) { // vertical pass - v_int16 b0 = vx_setall_s16(beta[0]); - v_int16 b1 = vx_setall_s16(beta[1]); - v_int16 b2 = vx_setall_s16(beta[2]); - v_int16 b3 = vx_setall_s16(beta[3]); - - uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15 }; - v_uint8 vmask = vx_load(_mask_vertical); - - int w = 0; - for (;;) { - for (; w <= inSz.width - half_nlanes; w += half_nlanes) { - v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w])); - v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w])); - v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w])); - v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w])); - - v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w])); - v_int16 val1_1 = 
v_reinterpret_as_s16(vx_load_expand(&src1[1][w])); - v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w])); - v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w])); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_pack_u(r0, r1); - v_uint8 q1 = v_pack_u(r2, r3); - - v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1)); - v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1); - - v_uint8 q4 = v_shuffle(q2, vmask); - v_uint8 q5 = v_shuffle(q3, vmask); - - vx_store(&tmp[4 * w + 0], q4); - vx_store(&tmp[4 * w + 2 * half_nlanes], q5); - } - - if (w < inSz.width) { - w = inSz.width - half_nlanes; - continue; - } - break; - } + vertical_4LPI(src0, src1, tmp, beta, inSz.width); // horizontal pass - horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width); + horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width); } else { // if any lpi for (int l = 0; l < lpi; ++l) { short beta0 = beta[l]; const uchar* s0 = src0[l]; const uchar* s1 = src1[l]; + uchar* _dst = dst[l]; // vertical pass vertical_anyLPI(s0, s1, tmp, inSz.width, beta0); // horizontal pass - horizontal_anyLPI(dst, tmp, mapsx, alpha, outSz.width, l); + horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width); } } // if lpi == 4 } else if (!xRatioEq) { GAPI_DbgAssert(yRatioEq); GAPI_Assert(inSz.width >= nlanes); + uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, + 1, 5, 9, 13, 3, 7, 11, 15 }; if (4 == lpi) { // vertical pass @@ -702,14 +673,15 @@ void calcRowLinear_8UC1(uint8_t* dst[], } // horizontal pass - horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width); + horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width); } else { // any LPI GAPI_Assert(outSz.width >= half_nlanes); for (int l = 0; l < lpi; ++l) { const uchar* src = src0[l]; + uchar* _dst = dst[l]; // horizontal pass - horizontal_anyLPI(dst, src, mapsx, alpha, outSz.width, l); + horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width); } } diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index 0e49f4116ec..2b4c53f4d9a 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -895,7 +895,7 @@ struct linearScratchDesc { tmp = reinterpret_cast (mapsy + outH*2); } - static int bufSize(int inW, int inH, int outW, int outH, int lpi) { + static int bufSize(int inW, int /*inH*/, int outW, int outH, int lpi) { auto size = outW * sizeof(alpha_t) + outW * sizeof(alpha_t) * 4 + // alpha clones // previous alpha is redundant? 
outW * sizeof(index_t) + @@ -910,7 +910,7 @@ struct linearScratchDesc { template static void initScratchLinear(const cv::GMatDesc& in, const Size& outSz, - cv::gapi::fluid::Buffer& scratch, + cv::gapi::fluid::Buffer& scratch, int lpi) { using alpha_type = typename Mapper::alpha_type; static const auto unity = Mapper::unity; @@ -1171,7 +1171,7 @@ static void calcRowLinear(const cv::gapi::fluid::View & in, template static void calcRowLinearC(const cv::gapi::fluid::View & in, std::array, numChan>& out, - cv::gapi::fluid::Buffer& scratch) { + cv::gapi::fluid::Buffer& scratch) { using alpha_type = typename Mapper::alpha_type; auto inSz = in.meta().size; diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp index 3a68bd4a980..a59111b86b6 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp @@ -18,12 +18,12 @@ namespace gapi { namespace kernels { -inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[], - uint8_t out[], int length) { +CV_ALWAYS_INLINE void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[], + uint8_t out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -46,12 +46,12 @@ inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[], } } -inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[], - const uint8_t in2[], uint8_t out[], int length) { +CV_ALWAYS_INLINE void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[], + const uint8_t in2[], uint8_t out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -75,12 +75,13 @@ inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[], } } -inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[], - const uint8_t in3[], uint8_t out[], int length) { +CV_ALWAYS_INLINE void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], + const uint8_t in2[], const uint8_t in3[], + uint8_t out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -106,12 +107,12 @@ inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const u } } -inline void mergeRow_32FC2_Impl(const float in0[], const float in1[], - float out[], int length) { +CV_ALWAYS_INLINE void mergeRow_32FC2_Impl(const float in0[], const float in1[], + float out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -133,12 +134,12 @@ inline void mergeRow_32FC2_Impl(const float in0[], const float in1[], } } -inline void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[], - float out[], int length) { +CV_ALWAYS_INLINE void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[], + float out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -162,13 +163,13 @@ inline 
void mergeRow_32FC3_Impl(const float in0[], const float in1[], const floa } } -inline void mergeRow_32FC4_Impl(const float in0[], const float in1[], - const float in2[], const float in3[], - float out[], int length) { +CV_ALWAYS_INLINE void mergeRow_32FC4_Impl(const float in0[], const float in1[], + const float in2[], const float in3[], + float out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -196,12 +197,12 @@ inline void mergeRow_32FC4_Impl(const float in0[], const float in1[], //------------------------------------------------------------------------------ -inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[], - uint8_t out1[], int length) { +CV_ALWAYS_INLINE void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[], + uint8_t out1[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -223,12 +224,12 @@ inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[], } } -inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[], - uint8_t out1[], uint8_t out2[], int length) { +CV_ALWAYS_INLINE void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[], + uint8_t out1[], uint8_t out2[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -252,12 +253,12 @@ inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[], } } -inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[], - uint8_t out2[], uint8_t out3[], int length) { +CV_ALWAYS_INLINE void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[], + uint8_t out2[], uint8_t out3[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -283,12 +284,12 @@ inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[ } } -inline void splitRow_32FC2_Impl(const float in[], float out0[], +CV_ALWAYS_INLINE void splitRow_32FC2_Impl(const float in[], float out0[], float out1[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -311,12 +312,12 @@ inline void splitRow_32FC2_Impl(const float in[], float out0[], } } -inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[], - float out2[], int length) { +CV_ALWAYS_INLINE void splitRow_32FC3_Impl(const float in[], float out0[], float out1[], + float out2[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -340,12 +341,12 @@ inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[], } } -inline void splitRow_32FC4_Impl(const float in[], float out0[], float out1[], - float out2[], float out3[], int length) { +CV_ALWAYS_INLINE void splitRow_32FC4_Impl(const float in[], float out0[], float out1[], + float out2[], float out3[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; cycle: for (; l <= length - nlanes; l += nlanes) { @@ -380,7 +381,7 @@ static 
const int ITUR_BT_601_CVG = -852492; static const int ITUR_BT_601_CVR = 1673527; static const int ITUR_BT_601_SHIFT = 20; -static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) { +CV_ALWAYS_INLINE void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) { int uu, vv; uu = static_cast(u) - 128; vv = static_cast(v) - 128; @@ -390,9 +391,9 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; } -static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, - v_int32 (&ruv)[4], v_int32 (&guv)[4], - v_int32 (&buv)[4]) { +CV_ALWAYS_INLINE void uvToRGBuv(const v_uint8& u, const v_uint8& v, + v_int32 (&ruv)[4], v_int32 (&guv)[4], + v_int32 (&buv)[4]) { v_uint8 v128 = vx_setall_u8(128); v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128)); v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128)); @@ -417,8 +418,8 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, } } -static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, const int buv, - uchar& r, uchar& g, uchar& b) { +CV_ALWAYS_INLINE void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, + const int buv, uchar& r, uchar& g, uchar& b) { int yy = static_cast(vy); int y = std::max(0, yy - 16) * ITUR_BT_601_CY; r = saturate_cast((y + ruv) >> ITUR_BT_601_SHIFT); @@ -426,11 +427,11 @@ static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, con b = saturate_cast((y + buv) >> ITUR_BT_601_SHIFT); } -static inline void yRGBuvToRGB(const v_uint8& vy, - const v_int32 (&ruv)[4], - const v_int32 (&guv)[4], - const v_int32 (&buv)[4], - v_uint8& rr, v_uint8& gg, v_uint8& bb) { +CV_ALWAYS_INLINE void yRGBuvToRGB(const v_uint8& vy, + const v_int32 (&ruv)[4], + const v_int32 (&guv)[4], + const v_int32 (&buv)[4], + v_uint8& rr, v_uint8& gg, v_uint8& bb) { v_uint8 v16 = vx_setall_u8(16); v_uint8 posY = vy - v16; v_uint16 yy0, yy1; @@ -463,15 +464,14 @@ static inline void yRGBuvToRGB(const v_uint8& vy, bb = v_pack_u(b0, b1); } -inline void calculate_nv12_to_rgb_impl(const uchar **srcY, - const uchar *srcUV, - uchar **dstRGBx, - int width) { +CV_ALWAYS_INLINE void calculate_nv12_to_rgb_impl(const uchar **srcY, + const uchar *srcUV, + uchar **dstRGBx, + int width) { int i = 0; #if MANUAL_SIMD - - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; for ( ; i <= width - 2*nlanes; i += 2*nlanes) { v_uint8 u, v; @@ -535,14 +535,13 @@ inline void calculate_nv12_to_rgb_impl(const uchar **srcY, } } -inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU, - const uchar *srcV, uchar **dstRGBx, - int width) { +CV_ALWAYS_INLINE void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU, + const uchar *srcV, uchar **dstRGBx, + int width) { int i = 0; #if MANUAL_SIMD - - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; for ( ; i <= width - 2*nlanes; i += 2*nlanes) { v_uint8 u = vx_load(srcU + i/2); @@ -610,8 +609,8 @@ inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU, // vertical pass template -static inline void downy(const T *src[], int inWidth, const MapperUnit& ymap, A yalpha, - W vbuf[]) { +CV_ALWAYS_INLINE void downy(const T *src[], int inWidth, const MapperUnit& ymap, + A yalpha, W vbuf[]) { int y_1st = ymap.index0; int ylast = ymap.index1 - 1; @@ -619,7 +618,7 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit& ym 
GAPI_DbgAssert(y_1st < ylast); #if MANUAL_SIMD - const int nlanes = v_uint16::nlanes; + constexpr int nlanes = v_uint16::nlanes; #endif // 1st and last rows @@ -667,8 +666,8 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit& ym // horizontal pass template -static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], - const A xalpha[], const W vbuf[]) { +CV_ALWAYS_INLINE void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], + const A xalpha[], const W vbuf[]) { // TO DO: try lambda here #define HSUM(xmaxdf) \ for (int x = 0; x < outWidth; x++) { \ @@ -704,9 +703,11 @@ static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], } template -static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Size& outSz, - A yalpha, const MapperUnit& ymap, int xmaxdf, const I xindex[], const A xalpha[], - W vbuf[]) { +CV_ALWAYS_INLINE void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, + const Size& outSz, A yalpha, + const MapperUnit& ymap, int xmaxdf, + const I xindex[], const A xalpha[], + W vbuf[]) { bool xRatioEq1 = inSz.width == outSz.width; bool yRatioEq1 = inSz.height == outSz.height; @@ -738,18 +739,18 @@ static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Si #if MANUAL_SIMD template -void copyRow_impl(const T in[], T out[], int l) { +CV_ALWAYS_INLINE void copyRow_impl(const T in[], T out[], int l) { VecT r; r = vx_load(&in[l]); vx_store(&out[l], r); } #endif -inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) { +CV_ALWAYS_INLINE void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_uint8::nlanes; + constexpr int nlanes = v_uint8::nlanes; for (; l <= length - nlanes; l += nlanes) { copyRow_impl(in, out, l); @@ -766,11 +767,11 @@ inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) { } } -inline void copyRow_32F_impl(const float in[], float out[], int length) { +CV_ALWAYS_INLINE void copyRow_32F_impl(const float in[], float out[], int length) { int l = 0; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; for (; l <= length - nlanes; l += nlanes) { copyRow_impl(in, out, l); @@ -801,7 +802,7 @@ CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[], bool yRatioEq1 = inSz.height == outSz.height; #if MANUAL_SIMD - const int nlanes = v_float32::nlanes; + constexpr int nlanes = v_float32::nlanes; #endif if (!xRatioEq1 && !yRatioEq1) { diff --git a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp index 9c005626572..83d561d2115 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp @@ -2606,6 +2606,24 @@ CV_ALWAYS_INLINE v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8 return v_uint8x16(v); } +CV_ALWAYS_INLINE void v_interleave(const v_int16x8& a, const v_int16x8& b, + v_int16x8& v1, v_int16x8& v2) +{ + int16x8x2_t p = vzipq_s16(a.val, b.val); + v1.val = p.val[0]; + v2.val = p.val[1]; + return; +} + +CV_ALWAYS_INLINE void v_interleave(const v_int32x4& a, const v_int32x4& b, + v_int32x4& v1, v_int32x4& v2) +{ + int32x4x2_t p = vzipq_s32(a.val, b.val); + v1.val = p.val[0]; + v2.val = p.val[1]; + return; +} + template CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a) {
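
Reviewer note on the vertical pass: vertical_4LPI interpolates each output row between its two source rows with Q15 fixed-point weights (beta), handling four lines per iteration and storing the four per-line results interleaved in tmp. The sketch below is a minimal scalar model of that computation, not the shipped implementation; it assumes v_mulhrs follows the usual Q15 rounding-multiply semantics, i.e. (a * b + 2^14) >> 15, and that the final unsigned saturation comes from v_pack_u_store.

#include <algorithm>
#include <cstdint>

// Scalar model of one Q15 blend: dst = src1 + mulhrs(src0 - src1, beta).
static inline uint8_t blend_q15(uint8_t s0, uint8_t s1, short beta) {
    int t = ((int(s0) - int(s1)) * int(beta) + (1 << 14)) >> 15;  // v_mulhrs
    int r = int(s1) + t;                                          // v_add_wrap
    return (uint8_t)std::min(255, std::max(0, r));                // v_pack_u_store
}

// Scalar model of vertical_4LPI: the four interpolated lines are stored
// interleaved per column, i.e. tmp[4 * w + l] is line l at column w.
static void vertical_4LPI_ref(const uint8_t* src0[4], const uint8_t* src1[4],
                              uint8_t tmp[], const short beta[4], int length) {
    for (int w = 0; w < length; ++w)
        for (int l = 0; l < 4; ++l)
            tmp[4 * w + l] = blend_q15(src0[l][w], src1[l][w], beta[l]);
}

The interleaved layout is what lets horizontal_4LPI fetch all four lines of a given source column with one contiguous load; producing it via v_interleave is what replaces the old v_blend/v_shift/v_shuffle sequence removed by this patch.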
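
The horizontal passes perform the mirror-image blend along x: each output pixel mixes two source pixels selected through mapsx, weighted by the Q15 alpha coefficients. A scalar sketch under the same Q15 assumption follows; it additionally assumes that the deinterleaved pair t0/t1 in horizontal_anyLPI corresponds to the pixels at mapsx[x] and mapsx[x] + 1, which is how the visible arithmetic reads but is not fully shown in the hunk.

#include <algorithm>
#include <cstdint>

// Scalar model of the single-plane horizontal pass (any LPI):
// dst[x] = src[sx + 1] + mulhrs(src[sx] - src[sx + 1], alpha[x]), sx = mapsx[x].
static void horizontal_anyLPI_ref(uint8_t* dst, const uint8_t* src,
                                  const short mapsx[], const short alpha[],
                                  int length) {
    for (int x = 0; x < length; ++x) {
        const int sx = mapsx[x];
        const int t0 = src[sx];
        const int t1 = src[sx + 1];
        const int d  = t1 + (((t0 - t1) * int(alpha[x]) + (1 << 14)) >> 15);
        dst[x] = (uint8_t)std::min(255, std::max(0, d));
    }
}

Passing a single row pointer (dst[l] at the call site) instead of the whole dst[] array plus a line index is exactly the signature change this patch makes to horizontal_anyLPI.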
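
The two v_interleave overloads added to opencv_hal_neon.hpp are thin wrappers over NEON's vzipq. The stand-alone snippet below illustrates the lane order they produce; it uses raw NEON intrinsics rather than the universal-intrinsic wrapper types, since those are internal to the HAL header.

#include <arm_neon.h>
#include <cstdio>

int main() {
    const int16_t a_data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    const int16_t b_data[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    const int16x8_t a = vld1q_s16(a_data);
    const int16x8_t b = vld1q_s16(b_data);

    // vzipq_s16 is what v_interleave(v_int16x8, ...) forwards to:
    //   lo = {a0, b0, a1, b1, a2, b2, a3, b3}
    //   hi = {a4, b4, a5, b5, a6, b6, a7, b7}
    const int16x8x2_t zipped = vzipq_s16(a, b);

    int16_t lo[8], hi[8];
    vst1q_s16(lo, zipped.val[0]);
    vst1q_s16(hi, zipped.val[1]);
    for (int i = 0; i < 8; ++i)
        printf("lo[%d]=%d  hi[%d]=%d\n", i, lo[i], i, hi[i]);
    return 0;
}

vertical_4LPI chains the 16-bit and 32-bit variants of this zip to turn four row vectors into the tmp[4 * w + l] layout described above.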