diff --git a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
index 86dc6b22fea..0ee824c8748 100644
--- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
+++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
@@ -4,6 +4,7 @@
 #include "ie_preprocess_gapi_kernels.hpp"
 #include "ie_preprocess_gapi_kernels_impl.hpp"
+#include "ie_preprocess_gapi_kernels_neon.hpp"
 
 #include <arm_neon.h>
@@ -126,6 +127,351 @@ void copyRow_32F(const float in[], float out[], int length) {
     copyRow_32F_impl(in, out, length);
 }
 
+template <int chanNum>
+CV_ALWAYS_INLINE void channels2planes_store(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
+                                            const uchar* src, const int width,
+                                            const int nlanes, const int line) {
+    v_uint8 chan;
+    int x = 0;
+    for (;;) {
+        for (; x <= width - nlanes && x >= 0; x += nlanes) {
+            for (int c = 0; c < chanNum; ++c) {
+                v_gather_channel<chanNum>(chan, &src[chanNum * x], c);
+                vx_store(&dst[c][line][x], chan);
+            }
+        }
+
+        if (x < width) {
+            x = width - nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+CV_ALWAYS_INLINE void vertical_anyLPI(const uchar* src0, const uchar* src1,
+                                      uchar* tmp, const int inLength,
+                                      const int nlanes, const short beta) {
+    const int half_nlanes = nlanes / 2;
+    int w = 0;
+    for (;;) {
+        for (; w <= inLength - nlanes; w += nlanes) {
+            v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[w]));
+            v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[w]));
+            v_int16 s2 = v_reinterpret_as_s16(vx_load_expand(&src0[w + half_nlanes]));
+            v_int16 s3 = v_reinterpret_as_s16(vx_load_expand(&src1[w + half_nlanes]));
+            v_int16 res1 = v_mulhrs(s0 - s1, beta) + s1;
+            v_int16 res2 = v_mulhrs(s2 - s3, beta) + s3;
+
+            vx_store(tmp + w, v_pack_u(res1, res2));
+        }
+
+        if (w < inLength) {
+            w = inLength - nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+template <int chanNum>
+CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
+                                        const uchar* src, const short mapsx[],
+                                        const short alpha[], const int nlanes,
+                                        const int width, const int line) {
+    const int half_nlanes = nlanes / 2;
+    v_int16 t0, t1;
+    int x = 0;
+    for (;;) {
+        for (; x <= width - half_nlanes && x >= 0; x += half_nlanes) {
+            v_int16 a0 = vx_load(&alpha[x]);
+            for (int c = 0; c < chanNum; ++c) {
+                v_gather_channel<chanNum>(t0, src, &mapsx[x], c, 0);
+                v_gather_channel<chanNum>(t1, src, &mapsx[x], c, 1);
+                v_int16 res1 = v_mulhrs(t0 - t1, a0) + t1;
+                v_pack_u_store(&dst[c][line][x], res1);
+            }
+        }
+
+        if (x < width) {
+            x = width - half_nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+template <int chanNum>
+CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
+                                      const uchar* tmp, const short mapsx[], const short clone[],
+                                      const int width, const int nlanes) {
+    v_uint8 val_0, val_1, val_2, val_3;
+    const int half_nlanes = nlanes / 2;
+    const int shift = static_cast<int>(half_nlanes / 4);
+
+    uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15 };
+    v_uint8 hmask = vx_load(_mask_horizontal);
+
+    int x = 0;
+    for (;;) {
+        for (; x <= width - half_nlanes && x >= 0; x += half_nlanes) {
+            v_int16 a10 = vx_load(&clone[4 * x]);
+            v_int16 a32 = vx_load(&clone[4 * (x + 2)]);
+            v_int16 a54 = vx_load(&clone[4 * (x + 4)]);
+            v_int16 a76 = vx_load(&clone[4 * (x + 6)]);
+
+            for (int c = 0; c < chanNum; ++c) {
+                v_gather_channel(val_0, tmp, &mapsx[x], chanNum, c, 0);
+                v_gather_channel(val_1, tmp, &mapsx[x], chanNum, c, shift);
+                v_gather_channel(val_2, tmp, &mapsx[x], chanNum, c, shift * 2);
+                v_gather_channel(val_3, tmp, &mapsx[x], chanNum, c, shift * 3);
+
+                v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
+                v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
+                v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
+                v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
+
+                v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
+                v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
+                v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
+                v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
+
+                v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
+                v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
+                v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
+                v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
+
+                v_int16 r0 = v_add_wrap(val1_0, t0);
+                v_int16 r1 = v_add_wrap(val1_1, t1);
+                v_int16 r2 = v_add_wrap(val1_2, t2);
+                v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                v_uint8 q0 = v_pack_u(r0, r1);
+                v_uint8 q1 = v_pack_u(r2, r3);
+
+                v_uint8 q2 = v_shuffle(q0, hmask);
+                v_uint8 q3 = v_shuffle(q1, hmask);
+
+                v_uint8 q4 = v_blend<0xCC /*0b11001100*/>(q2, v_slli_si128(q3, 4));
+                v_uint8 q5 = v_blend<0xCC /*0b11001100*/>(v_srli_si128(q2, 4), q3);
+
+                v_store_low(&dst[c][0][x], q4);
+                v_store_high(&dst[c][1][x], q4);
+                v_store_low(&dst[c][2][x], q5);
+                v_store_high(&dst[c][3][x], q5);
+            }
+        }
+
+        if (x < width) {
+            x = width - half_nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+template <int chanNum>
+CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
+                                              const uint8_t* src0[],
+                                              const uint8_t* src1[],
+                                              const short alpha[],
+                                              const short clone[],  // 4 clones of alpha
+                                              const short mapsx[],
+                                              const short beta[],
+                                              uint8_t tmp[],
+                                              const Size& inSz,
+                                              const Size& outSz,
+                                              const int lpi) {
+    constexpr int nlanes = static_cast<int>(v_int8::nlanes);
+    constexpr int half_nlanes = nlanes / 2;
+
+    bool xRatioEq = inSz.width == outSz.width;
+    bool yRatioEq = inSz.height == outSz.height;
+
+    if (!xRatioEq && !yRatioEq) {
+        if (4 == lpi) {
+            // vertical pass: the 4 interpolated rows are stored into tmp interleaved
+            // by column, so the horizontal pass can gather all 4 lines at once
+            int inLength = inSz.width * chanNum;
+            GAPI_Assert(inLength >= half_nlanes);
+
+            v_int16 b0 = vx_setall_s16(beta[0]);
+            v_int16 b1 = vx_setall_s16(beta[1]);
+            v_int16 b2 = vx_setall_s16(beta[2]);
+            v_int16 b3 = vx_setall_s16(beta[3]);
+
+            uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
+                                             2, 10, 6, 14, 3, 11, 7, 15 };
+            v_uint8 vmask = vx_load(_mask_vertical);
+
+            int w = 0;
+            for (;;) {
+                for (; w <= inLength - half_nlanes && w >= 0; w += half_nlanes) {
+                    v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
+                    v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
+                    v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
+                    v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
+
+                    v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
+                    v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
+                    v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
+                    v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
+
+                    v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+                    v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+                    v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+                    v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+                    v_int16 r0 = v_add_wrap(val1_0, t0);
+                    v_int16 r1 = v_add_wrap(val1_1, t1);
+                    v_int16 r2 = v_add_wrap(val1_2, t2);
+                    v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                    v_uint8 q0 = v_pack_u(r0, r1);
+                    v_uint8 q1 = v_pack_u(r2, r3);
+
+                    v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_slli_si128(q1, 4));
+                    v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_srli_si128(q0, 4), q1);
+
+                    v_uint8 q4 = v_shuffle(q2, vmask);
+                    v_uint8 q5 = v_shuffle(q3, vmask);
+
+                    vx_store(&tmp[4 * w + 0], q4);
+                    vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
+                }
+
+                if (w < inLength) {
+                    w = inLength - half_nlanes;
+                    continue;
+                }
+                break;
+            }
+
+            // horizontal pass
+            GAPI_Assert(outSz.width >= half_nlanes);
+            horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width, nlanes);
+        } else {  // if any lpi
+            int inLength = inSz.width * chanNum;
+            GAPI_Assert(inLength >= half_nlanes);
+            GAPI_Assert(outSz.width >= half_nlanes);
+
+            for (int l = 0; l < lpi; ++l) {
+                short beta0 = beta[l];
+                const uchar* s0 = src0[l];
+                const uchar* s1 = src1[l];
+
+                // vertical pass
+                vertical_anyLPI(s0, s1, tmp, inLength, nlanes, beta0);
+
+                // horizontal pass
+                horizontal_anyLPI(dst, tmp, mapsx, alpha, nlanes, outSz.width, l);
+            }
+        }
+    } else if (!xRatioEq) {
+        GAPI_Assert(yRatioEq);
+
+        if (4 == lpi) {
+            int inLength = inSz.width * chanNum;
+
+            // vertical pass
+            GAPI_DbgAssert(inLength >= nlanes);
+            v_uint8 s0, s1, s2, s3;
+            int w = 0;
+            for (;;) {
+                for (; w <= inLength - nlanes; w += nlanes) {
+                    s0 = vx_load(&src0[0][w]);
+                    s1 = vx_load(&src0[1][w]);
+                    s2 = vx_load(&src0[2][w]);
+                    s3 = vx_load(&src0[3][w]);
+                    v_store_interleave(&tmp[lpi * w], s0, s1, s2, s3);
+                }
+
+                if (w < inLength) {
+                    w = inLength - nlanes;
+                    continue;
+                }
+                break;
+            }
+
+            // horizontal pass
+            GAPI_Assert(outSz.width >= half_nlanes);
+            horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width, nlanes);
+        } else {  // any LPI
+            GAPI_Assert(outSz.width >= half_nlanes);
+            for (int l = 0; l < lpi; ++l) {
+                const uchar* src = src0[l];
+
+                // horizontal pass
+                horizontal_anyLPI(dst, src, mapsx, alpha, nlanes, outSz.width, l);
+            }
+        }
+    } else if (!yRatioEq) {
+        GAPI_Assert(xRatioEq);
+        int inLength = inSz.width * chanNum;  // == outSz.width
+
+        GAPI_Assert(inLength >= half_nlanes);
+        GAPI_Assert(outSz.width >= nlanes);
+
+        for (int l = 0; l < lpi; ++l) {
+            short beta0 = beta[l];
+            const uchar* s0 = src0[l];
+            const uchar* s1 = src1[l];
+
+            // vertical pass
+            vertical_anyLPI(s0, s1, tmp, inLength, nlanes, beta0);
+
+            // split channels to planes and store
+            channels2planes_store(dst, tmp, outSz.width, nlanes, l);
+        }
+    } else {
+        GAPI_Assert(xRatioEq && yRatioEq);
+        GAPI_Assert(outSz.width >= nlanes);
+
+        // split channels to planes and store
+        for (int l = 0; l < lpi; ++l) {
+            const uchar* src = src0[l];
+            channels2planes_store(dst, src, outSz.width, nlanes, l);
+        }
+    }
+}
+
+// Resize (bi-linear, 8UC3)
+void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3>& dst,
+                      const uint8_t* src0[],
+                      const uint8_t* src1[],
+                      const short alpha[],
+                      const short clone[],  // 4 clones of alpha
+                      const short mapsx[],
+                      const short beta[],
+                      uint8_t tmp[],
+                      const Size& inSz,
+                      const Size& outSz,
+                      const int lpi) {
+    constexpr int chanNum = 3;
+    calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
+                                     beta, tmp, inSz, outSz, lpi);
+}
+
+// Resize (bi-linear, 8UC4)
+void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
+                      const uint8_t* src0[],
+                      const uint8_t* src1[],
+                      const short alpha[],
+                      const short clone[],  // 4 clones of alpha
+                      const short mapsx[],
+                      const short beta[],
+                      uint8_t tmp[],
+                      const Size& inSz,
+                      const Size& outSz,
+                      const int lpi) {
+    constexpr int chanNum = 4;
+    calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
+                                     beta, tmp, inSz, outSz, lpi);
+}
 }  // namespace neon
 }  // namespace kernels
 }  // namespace gapi
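Note on the arithmetic in the kernels above: both the vertical and the horizontal passes reduce to a fixed-point linear blend `res = s1 + v_mulhrs(s0 - s1, w)`, with the weight `w` (beta or alpha) given in Q15, which approximates `(w * s0 + (32768 - w) * s1) >> 15` with rounding. A minimal scalar sketch of the same computation, for illustration only (not part of the patch; `mulhrs_scalar` and `blend_row` are placeholder names):

    #include <cstdint>

    // Same rounding as SSE's _mm_mulhrs_epi16, which v_mulhrs emulates on NEON.
    static inline int16_t mulhrs_scalar(int16_t a, int16_t b) {
        return static_cast<int16_t>((static_cast<int32_t>(a) * b + (1 << 14)) >> 15);
    }

    // One interpolated sample: w is a Q15 weight for s0, (1 << 15) - w for s1.
    static inline uint8_t blend_row(uint8_t s0, uint8_t s1, int16_t w) {
        int v = s1 + mulhrs_scalar(static_cast<int16_t>(s0 - s1), w);
        return static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));  // saturate, as v_pack_u does
    }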
diff --git a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
index e01c569f505..e9f8bb9ffbe 100644
--- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
+++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
@@ -904,34 +904,33 @@ void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
 
 // Resize (bi-linear, 8UC3)
 void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
-                      const uint8_t *src0[],
-                      const uint8_t *src1[],
-                      const short alpha[],
-                      const short clone[],  // 4 clones of alpha
-                      const short mapsx[],
-                      const short beta[],
-                      uint8_t tmp[],
-                      const Size &inSz,
-                      const Size &outSz,
-                      int lpi) {
-    constexpr const int chanNum = 3;
-
+                      const uint8_t* src0[],
+                      const uint8_t* src1[],
+                      const short alpha[],
+                      const short clone[],  // 4 clones of alpha
+                      const short mapsx[],
+                      const short beta[],
+                      uint8_t tmp[],
+                      const Size& inSz,
+                      const Size& outSz,
+                      const int lpi) {
+    constexpr int chanNum = 3;
     calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
                                      beta, tmp, inSz, outSz, lpi);
 }
 
 // Resize (bi-linear, 8UC4)
 void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
-                      const uint8_t *src0[],
-                      const uint8_t *src1[],
-                      const short alpha[],
-                      const short clone[],  // 4 clones of alpha
-                      const short mapsx[],
-                      const short beta[],
-                      uint8_t tmp[],
-                      const Size &inSz,
-                      const Size &outSz,
-                      int lpi) {
-    constexpr const int chanNum = 4;
+                      const uint8_t* src0[],
+                      const uint8_t* src1[],
+                      const short alpha[],
+                      const short clone[],  // 4 clones of alpha
+                      const short mapsx[],
+                      const short beta[],
+                      uint8_t tmp[],
+                      const Size& inSz,
+                      const Size& outSz,
+                      const int lpi) {
+    constexpr int chanNum = 4;
 
     calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
                                      beta, tmp, inSz, outSz, lpi);
 }
diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
index 0d52c378e2e..f40d579ffe5 100644
--- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
+++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
@@ -1223,6 +1223,23 @@ static void calcRowLinearC(const cv::gapi::fluid::View & in,
     }
 #endif  // HAVE_SSE
 
+#ifdef HAVE_NEON
+    if (std::is_same<T, uint8_t>::value) {
+        if (inSz.width >= 16 && outSz.width >= 8) {
+            neon::calcRowLinear_8UC(dst,
+                                    reinterpret_cast<const uint8_t**>(src0),
+                                    reinterpret_cast<const uint8_t**>(src1),
+                                    reinterpret_cast<const short*>(alpha),
+                                    reinterpret_cast<const short*>(clone),
+                                    reinterpret_cast<const short*>(mapsx),
+                                    reinterpret_cast<const short*>(beta),
+                                    reinterpret_cast<uint8_t*>(tmp),
+                                    inSz, outSz, lpi);
+            return;
+        }
+    }
+#endif  // HAVE_NEON
+
     auto length = out[0].get().length();
 
     for (int l = 0; l < lpi; l++) {
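The NEON branch above mirrors the SSE4.2 dispatch: it only fires when a full 128-bit vector fits in the input row and half a vector fits in the output row, which is what the GAPI_Assert() checks inside calcRowLinear_8UC_Impl_ rely on. A minimal sketch of that guard, assuming 128-bit universal intrinsics (`kNlanes` and `can_use_neon_linear` are placeholder names, not part of the patch):

    constexpr int kNlanes     = 16;           // v_uint8::nlanes for 128-bit NEON registers
    constexpr int kHalfNlanes = kNlanes / 2;  // 8

    inline bool can_use_neon_linear(int in_width, int out_width) {
        // same condition as `inSz.width >= 16 && outSz.width >= 8` above
        return in_width >= kNlanes && out_width >= kHalfNlanes;
    }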
diff --git a/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp b/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp
index c5b843f6c32..a10ed62f79c 100644
--- a/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp
+++ b/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp
@@ -188,6 +188,16 @@ inline unsigned int trailingZeros32(unsigned int value) {
 // access from within opencv code more accessible
 namespace cv {
 
+#ifndef CV_ALWAYS_INLINE
+#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#define CV_ALWAYS_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define CV_ALWAYS_INLINE __forceinline
+#else
+#define CV_ALWAYS_INLINE inline
+#endif
+#endif
+
 namespace hal {
 
 enum StoreMode
diff --git a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
index 3ee2d3fd3d9..d37d377b012 100644
--- a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
+++ b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
@@ -2323,7 +2323,157 @@ inline v_uint16x8 v_mulhi(const v_uint16x8& a, uint16_t b) {
     return result;
 }
 
+CV_ALWAYS_INLINE v_int16x8 v_mulhrs(const v_int16x8& a, const v_int16x8& b)
+{
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(a.val),
+                                 vget_low_s16(b.val));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(a.val),
+                                 vget_high_s16(b.val));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return v_int16x8(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+CV_ALWAYS_INLINE v_int16x8 v_mulhrs(const v_int16x8& a, const short b)
+{
+    return v_mulhrs(a, v_setall_s16(b));
+}
+
+CV_ALWAYS_INLINE void v_gather_channel(v_uint8x16& vec, const uint8_t tmp[],
+                                       const short* index, const int chanNum,
+                                       const int c, const int shift)
+{
+    int32x4_t result = {};
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 0)) + c)]), result, 0);
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 1)) + c)]), result, 1);
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 0) + 1) + c)]), result, 2);
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 1) + 1) + c)]), result, 3);
+
+    vec.val = vreinterpretq_u8_s32(result);
+}
+
+template <int chanNum>
+CV_ALWAYS_INLINE void v_gather_channel(v_uint8x16& vec, const uchar src[], const int channel)
+{
+    uint8x16_t result = {};
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + channel), result, 0);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + chanNum + channel), result, 1);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 2 * chanNum + channel), result, 2);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 3 * chanNum + channel), result, 3);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 4 * chanNum + channel), result, 4);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 5 * chanNum + channel), result, 5);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 6 * chanNum + channel), result, 6);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 7 * chanNum + channel), result, 7);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 8 * chanNum + channel), result, 8);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 9 * chanNum + channel), result, 9);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 10 * chanNum + channel), result, 10);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 11 * chanNum + channel), result, 11);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 12 * chanNum + channel), result, 12);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 13 * chanNum + channel), result, 13);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 14 * chanNum + channel), result, 14);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 15 * chanNum + channel), result, 15);
+
+    vec.val = result;
+}
+
+namespace {
+template <int chanNum>
+CV_ALWAYS_INLINE void v_gather_channel(v_int16x8& vec, const uchar src[], const short* index, const int channel, const int pos)
+{
+    int16x8_t result = {};
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*index + pos) + channel]), result, 0);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 1) + pos) + channel]), result, 1);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 2) + pos) + channel]), result, 2);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 3) + pos) + channel]), result, 3);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 4) + pos) + channel]), result, 4);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 5) + pos) + channel]), result, 5);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 6) + pos) + channel]), result, 6);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 7) + pos) + channel]), result, 7);
+
+    vec.val = result;
+}
+}  // namespace
+
+template <int imm>
+CV_ALWAYS_INLINE v_uint8x16 v_blend(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const uint16_t _mask[8] = { ((imm) & (1 << 0)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 7)) ? 0xFFFF : 0x0000 };
+    uint16x8_t _mask_vec = vld1q_u16(_mask);
+    uint16x8_t _a = vreinterpretq_u16_u8(a.val);
+    uint16x8_t _b = vreinterpretq_u16_u8(b.val);
+    return v_uint8x16(vreinterpretq_u8_u16(vbslq_u16(_mask_vec, _b, _a)));
+}
+
+CV_ALWAYS_INLINE v_uint8x16 v_shuffle(const v_uint8x16& a, const v_uint8x16& mask)
+{
+    uint8x16_t tbl = a.val;     // input a
+    uint8x16_t idx = mask.val;  // input mask
+    uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return v_uint8x16(vqtbl1q_u8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    uint8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return v_uint8x16(ret);
+#else
+    uint8x8x2_t a_split = { vget_low_u8(tbl), vget_high_u8(tbl) };
+    return v_uint8x16(vcombine_u8(
+        vtbl2_u8(a_split, vget_low_u8(idx_masked)),
+        vtbl2_u8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a, const int imm)
+{
+    uint8x16_t ret = {};
+    if (imm <= 0) {
+        ret = a.val;
+    } else if (imm > 15) {
+        ret = vdupq_n_u8(0);
+    } else {
+        ret = vextq_u8(vdupq_n_u8(0), a.val, 16 - (imm));
+    }
+    return v_uint8x16(ret);
+}
+
+CV_ALWAYS_INLINE v_uint8x16 v_srli_si128(const v_uint8x16& a, const int imm)
+{
+    uint8x16_t ret = {};
+    if (imm <= 0) {
+        ret = a.val;
+    } else if (imm > 15) {
+        ret = vdupq_n_u8(0);
+    } else {
+        ret = vextq_u8(a.val, vdupq_n_u8(0), imm);
+    }
+    return v_uint8x16(ret);
+}
 
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
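For readers coming from the x86 path: v_shuffle and v_blend<imm> added above are stand-ins for _mm_shuffle_epi8 (pshufb) and _mm_blend_epi16, while v_slli_si128/v_srli_si128 mimic the whole-register byte shifts. A scalar model of the two permute helpers, for reference only (`shuffle_scalar` and `blend16_scalar` are placeholder names, not part of the patch):

    #include <cstdint>
    #include <cstring>

    // v_shuffle: each output byte picks tbl[idx & 15]; a set bit 7 in idx zeroes the byte
    // (vqtbl1q_u8 returns 0 for out-of-range indices, matching pshufb).
    inline void shuffle_scalar(const uint8_t tbl[16], const uint8_t idx[16], uint8_t out[16]) {
        for (int i = 0; i < 16; ++i)
            out[i] = (idx[i] & 0x80) ? 0 : tbl[idx[i] & 0x0F];
    }

    // v_blend<imm>: bit k of imm selects the k-th 16-bit lane from b, otherwise from a.
    inline void blend16_scalar(int imm, const uint8_t a[16], const uint8_t b[16], uint8_t out[16]) {
        for (int k = 0; k < 8; ++k) {
            const uint8_t* src = (imm & (1 << k)) ? b : a;
            std::memcpy(out + 2 * k, src + 2 * k, 2);
        }
    }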