Preprocessing(GAPI): Universal intrinsics (AVX512) implementation of linear Resize 8UC1 (#1132)

Anna Khakimova 2020-07-27 19:04:51 +03:00 committed by GitHub
parent 48f5f524b8
commit 9b76b3ea39
6 changed files with 574 additions and 231 deletions

View File

@@ -266,10 +266,10 @@ static inline void horizontalPass_lpi4_8UC1(const short clone[], const short map
v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
v_setr64(val_0, val_1, val_2, val_3, mapsx, tmp, x, shift);
val_0 = v_permutevar8x32(val_0, idxs);
val_1 = v_permutevar8x32(val_1, idxs);
val_2 = v_permutevar8x32(val_2, idxs);
val_3 = v_permutevar8x32(val_3, idxs);
val_0 = v_permute32(val_0, idxs);
val_1 = v_permute32(val_1, idxs);
val_2 = v_permute32(val_2, idxs);
val_3 = v_permute32(val_3, idxs);
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
@@ -312,7 +312,7 @@ static inline void horizontalPass_anylpi_8U(const short alpha[], const short map
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx); // 8 pairs of src0 pixels
v_uint8 t = v_gather_pairs(tmp, sx); // 16 pairs of src0 pixels
v_int16 t0, t1;
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
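The blend above is the same fixed-point arithmetic used throughout these resize kernels: v_mulhrs wraps the mulhrs-style rounding multiply, and the result is packed with unsigned saturation. A minimal scalar sketch of that step, assuming the alpha weights are the signed 16-bit fixed-point values the comments describe (helper names are illustrative only):

#include <cstdint>

// Scalar model of v_mulhrs: round((a * b) / 2^15), kept in int16 range.
static inline int16_t mulhrs_scalar(int16_t a, int16_t b) {
    return static_cast<int16_t>((static_cast<int32_t>(a) * b + (1 << 14)) >> 15);
}

// One output pixel of the blend d = t1 + alpha * (t0 - t1), followed by the
// unsigned-saturating pack done by v_pack_u_store (illustrative helper).
static inline uint8_t blend_scalar(uint8_t t0, uint8_t t1, int16_t alpha) {
    int d = mulhrs_scalar(static_cast<int16_t>(t0 - t1), alpha) + t1;
    return d < 0 ? 0 : (d > 255 ? 255 : static_cast<uint8_t>(d));
}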

View File

@@ -125,26 +125,384 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *src1[],
uint8_t tmp[], v_int16& b0, v_int16& b1,
v_int16& b2, v_int16& b3, v_uint8& shuf_mask,
int half_nlanes, int width) {
v_uint32 permute_idxs1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0);
v_uint32 permute_idxs2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8);
for (int w = 0; w < width; ) {
for (; w <= width - half_nlanes; w += half_nlanes) {
v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]);
v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]);
v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]);
v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]);
v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]);
v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]);
v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]);
v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]);
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_packus(r0, r1);
v_uint8 q1 = v_packus(r2, r3);
#if 1
v_uint8 q2 = v_permutex2_s32(q0, q1, permute_idxs1);
v_uint8 q3 = v_permutex2_s32(q0, q1, permute_idxs2);
v_uint8 q4 = v_shuffle_s8(q2, shuf_mask);
v_uint8 q5 = v_shuffle_s8(q3, shuf_mask);
// Second variant of decomposition. It'll be useful in the future.
#else
v_uint8 q2 = v_mblend_shiftleft(q0, q1);
v_uint8 q3 = v_mblend_shiftright(q0, q1);
v_uint8 mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15);
v_uint8 q4 = v_shuffle_s8(q2, mask1);
v_uint8 q5 = v_shuffle_s8(q3, mask1);
v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0);
v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4);
v_uint8 q6 = v_permutex2_s64(q4, q5, idx1);
v_uint8 q7 = v_permutex2_s64(q4, q5, idx2);
#endif
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
if (w < width) {
w = width - half_nlanes;
}
}
}
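A scalar sketch of what this lpi==4 vertical pass is assumed to produce: the permute/shuffle network stores the four blended output lines pixel-interleaved in tmp (tmp[4*w + l] holding line l at column w), which is the layout the horizontal pass reads back with 64-bit loads. The helper name is illustrative, not part of the patch:

#include <cstdint>

// Hypothetical scalar equivalent of verticalPass_lpi4_8U: blend 4 pairs of
// source rows with their beta weights and store the results pixel-interleaved.
static void vertical_pass_lpi4_scalar(const uint8_t* src0[4], const uint8_t* src1[4],
                                      uint8_t tmp[], const int16_t beta[4], int width) {
    for (int w = 0; w < width; ++w) {
        for (int l = 0; l < 4; ++l) {
            int s0 = src0[l][w], s1 = src1[l][w];
            int d  = (((s0 - s1) * beta[l] + (1 << 14)) >> 15) + s1;  // mulhrs-style blend
            tmp[4 * w + l] = d < 0 ? 0 : (d > 255 ? 255 : static_cast<uint8_t>(d));
        }
    }
}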
static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
const v_uint8& val_1,
const v_uint8& val_2,
const v_uint8& val_3,
const v_int16& a10,
const v_int16& a32,
const v_int16& a54,
const v_int16& a76,
v_uint8& shuf_mask1,
v_uint8& shuf_mask2,
v_uint32& idxs1,
v_uint32& idxs2,
v_uint8& res1, v_uint8& res2) {
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_packus(r0, r1);
v_uint8 q1 = v_packus(r2, r3);
v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1);
v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1);
#if 1
v_uint8 q4 = v_permutex2_s32(q2, q3, idxs1);
v_uint8 q5 = v_permutex2_s32(q2, q3, idxs2);
res1 = v_shuffle_s8(q4, shuf_mask2);
res2 = v_shuffle_s8(q5, shuf_mask2);
// Second variant of decomposition. It'll be useful in the future.
#else
v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
v_uint8 q6 = v_permute32(idx, q4);
v_uint8 q7 = v_permute32(idx, q5);
v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
v_uint8 q8 = v_shuffle_s8(q6, mask2);
v_uint8 q9 = v_shuffle_s8(q7, mask2);
#endif
}
static inline void horizontalPass_lpi4_U8C1(const short clone[], const short mapsx[],
uint8_t tmp[], uint8_t *dst[],
v_uint8& shuf_mask1,
int width, int half_nlanes) {
v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
v_uint32 permute_idxs1 = v_set_s32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
v_uint32 permute_idxs2 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
v_uint32 permute_idxs3 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
const int shift = half_nlanes / 4;
for (int x = 0; x < width; ) {
for (; x <= width - half_nlanes; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
v_set(val_0, val_1, val_2, val_3, tmp, mapsx, x, shift);
val_0 = v_permute32(val_0, permute_idxs1);
val_1 = v_permute32(val_1, permute_idxs1);
val_2 = v_permute32(val_2, permute_idxs1);
val_3 = v_permute32(val_3, permute_idxs1);
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
permute_idxs2, permute_idxs3,
res1, res2);
v_store_low(&dst[0][x], res1);
v_store_high(&dst[1][x], res1);
v_store_low(&dst[2][x], res2);
v_store_high(&dst[3][x], res2);
}
if (x < width) {
x = width - half_nlanes;
}
}
}
static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int& beta0, const int& half_nlanes,
const int& l, const int& length1, const int& length2) {
for (int w = 0; w < length2; ) {
for (; w <= length1 - half_nlanes; w += half_nlanes) {
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
v_int16 t = v_mulhrs(s0 - s1, beta0) + s1;
v_pack_u_store(tmp + w, t);
}
if (w < length1) {
w = length1 - half_nlanes;
}
}
}
static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
uint8_t* dst[], const uchar tmp[], const int& l,
const int& half_nlanes, const int& length) {
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx);
v_int16 t0, t1;
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
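For a single output line, the horizontal pass above reduces to a gather of adjacent pixel pairs (what v_gather_pairs does) followed by the same fixed-point blend; a scalar sketch under that reading, with an illustrative helper name:

#include <cstdint>

// Hypothetical scalar equivalent of horizontalPass_anylpi_8U for one line:
// dst[x] = t1 + alpha[x] * (t0 - t1), where t0/t1 are the pair of source
// pixels selected by mapsx[x].
static void horizontal_pass_scalar(const int16_t alpha[], const int16_t mapsx[],
                                   uint8_t dst[], const uint8_t tmp[], int length) {
    for (int x = 0; x < length; ++x) {
        int t0 = tmp[mapsx[x]];      // "even" pixel of the gathered pair
        int t1 = tmp[mapsx[x] + 1];  // "odd" pixel of the gathered pair
        int d  = (((t0 - t1) * alpha[x] + (1 << 14)) >> 15) + t1;
        dst[x] = d < 0 ? 0 : (d > 255 ? 255 : static_cast<uint8_t>(d));
    }
}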
// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1( uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi) {
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (nlanes / 2);
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3, shuf_mask1,
half_nlanes, inSz.width);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width, half_nlanes);
} else { // if any lpi
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= nlanes);
for (int w = 0; w < inSz.width; ) {
for (; w <= inSz.width - nlanes; w += nlanes) {
v_uint8 s0, s1, s2, s3;
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
}
if (w < inSz.width) {
w = inSz.width - nlanes;
}
}
// horizontal pass
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width, half_nlanes);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar *src = src0[l];
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width);
}
}
} else if (!yRatioEq) {
GAPI_DbgAssert(xRatioEq);
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l,
inLength, outLength);
}
} else {
GAPI_DbgAssert(xRatioEq && yRatioEq);
int length = inSz.width;
for (int l = 0; l < lpi; ++l) {
memcpy(dst[l], src0[l], length);
}
}
}
// Resize (bi-linear, 8U, generic number of channels)
template<int chanNum>
void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
const int shift = (half_nlanes / 4);
constexpr int shift = (half_nlanes / 4);
if (4 == lpi) {
GAPI_DbgAssert(inSz.width >= half_nlanes);
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
@@ -154,6 +512,17 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
// vertical pass
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3,
shuf_mask1, half_nlanes, inSz.width*chanNum);
// horizontal pass
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
@@ -163,83 +532,8 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
v_uint32 idx1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0);
v_uint32 idx2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8);
v_uint32 idx3 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
v_uint32 idx4 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
// vertical pass
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
for (int w = 0; w < inSz.width*chanNum; ) {
for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) {
v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]);
v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]);
v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]);
v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]);
v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]);
v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]);
v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]);
v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]);
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_packus(r0, r1);
v_uint8 q1 = v_packus(r2, r3);
#if 1
v_uint8 q2 = v_permutex2_s32(q0, q1, idx1);
v_uint8 q3 = v_permutex2_s32(q0, q1, idx2);
v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1);
v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1);
// Second variant of decomposition. It'll be useful in the future.
#else
v_uint8 q2 = v_mblend_shiftleft(q0, q1);
v_uint8 q3 = v_mblend_shiftright(q0, q1);
v_uint8 mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15);
v_uint8 q4 = v_shuffle_s8(q2, mask1);
v_uint8 q5 = v_shuffle_s8(q3, mask1);
v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0);
v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4);
v_uint8 q6 = v_permutex2_s64(q4, q5, idx1);
v_uint8 q7 = v_permutex2_s64(q4, q5, idx2);
#endif
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
if (w < inSz.width*chanNum) {
w = inSz.width*chanNum - half_nlanes;
}
}
// horizontal pass
v_uint8 val_0, val_1, val_2, val_3;
v_uint32 idxs3 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
v_uint32 idxs4 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
GAPI_DbgAssert(outSz.width >= half_nlanes);
for (int x = 0; x < outSz.width; ) {
@@ -248,72 +542,23 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
for (int c = 0; c < chanNum; ++c) {
v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0);
v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift);
v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2);
v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3);
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
idxs3, idxs4,
res1, res2);
v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_packus(r0, r1);
v_uint8 q1 = v_packus(r2, r3);
v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1);
v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1);
#if 1
v_uint8 q4 = v_permutex2_s32(q2, q3, idx3);
v_uint8 q5 = v_permutex2_s32(q2, q3, idx4);
v_uint8 q6 = v_shuffle_s8(q4, shuf_mask2);
v_uint8 q7 = v_shuffle_s8(q5, shuf_mask2);
// Second variant of decomposition. It'll be useful in the future.
#else
v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
v_uint8 q6 = v_permutex_s32(idx, q4);
v_uint8 q7 = v_permutex_s32(idx, q5);
v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
v_uint8 q8 = v_shuffle_s8(q6, mask2);
v_uint8 q9 = v_shuffle_s8(q7, mask2);
#endif
v_store_low(&dst[c][0][x], q6);
v_store_high(&dst[c][1][x], q6);
v_store_low(&dst[c][2][x], q7);
v_store_high(&dst[c][3][x], q7);
v_store_low(&dst[c][0][x], res1);
v_store_high(&dst[c][1][x], res1);
v_store_low(&dst[c][2][x], res2);
v_store_high(&dst[c][3][x], res2);
}
}
@@ -325,41 +570,30 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
// vertical pass
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
for (int w = 0; w < inSz.width*chanNum; ) {
for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) {
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w]));
v_int16 t = v_mulhrs(s0 - s1, beta0) + s1;
v_pack_u_store(tmp + w, t);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l,
inSz.width*chanNum, inSz.width*chanNum);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
for (int x = 0; x < outSz.width; ) {
for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
for (int c = 0; c < chanNum; ++c) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_int16 t0 = v_gather_chan<chanNum>(tmp, sx, c, 0);
v_int16 t1 = v_gather_chan<chanNum>(tmp, sx, c, 1);
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[c][l][x], d);
}
}
if (w < inSz.width*chanNum) {
w = inSz.width*chanNum - half_nlanes;
if (x < outSz.width) {
x = outSz.width - half_nlanes;
}
}
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
for (int x = 0; x < outSz.width; ) {
for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) {
for (int c = 0; c < chanNum; ++c) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_int16 t0 = v_gather_chan<chanNum>(tmp, sx, c, 0);
v_int16 t1 = v_gather_chan<chanNum>(tmp, sx, c, 1);
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[c][l][x], d);
}
}
if (x < outSz.width) {
x = outSz.width - half_nlanes;
}
}
}
}
}
}

View File

@@ -26,34 +26,34 @@ void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Si
const float xalpha[], float vbuf[]);
#if USE_CVKL
void calcRowArea_CVKL_U8_SSE42(const uchar * src[],
uchar dst[],
const Size & inSz,
const Size & outSz,
int y,
const uint16_t xsi[],
const uint16_t ysi[],
const uint16_t xalpha[],
const uint16_t yalpha[],
int x_max_count,
int y_max_count,
uint16_t vert_sum[]);
void calcRowArea_CVKL_U8(const uchar * src[],
uchar dst[],
const Size & inSz,
const Size & outSz,
int y,
const uint16_t xsi[],
const uint16_t ysi[],
const uint16_t xalpha[],
const uint16_t yalpha[],
int x_max_count,
int y_max_count,
uint16_t vert_sum[]);
#endif
//-----------------------------------------------------------------------------
// Resize (bi-linear, 8U)
void calcRowLinear_8U(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size & inSz,
const Size & outSz,
int lpi);
// Resize (bi-linear, 8UC1)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size & inSz,
const Size & outSz,
int lpi);
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,

View File

@@ -910,6 +910,26 @@ static void calcRowLinear(const cv::gapi::fluid::View & in,
dst[l] = out.OutLine<T>(l);
}
#ifdef HAVE_AVX512
if (with_cpu_x86_avx512_core()) {
if (std::is_same<T, uint8_t>::value) {
if (inSz.width >= 64 && outSz.width >= 32) {
avx512::calcRowLinear_8UC1(reinterpret_cast<uint8_t**>(dst),
reinterpret_cast<const uint8_t**>(src0),
reinterpret_cast<const uint8_t**>(src1),
reinterpret_cast<const short*>(alpha),
reinterpret_cast<const short*>(clone),
reinterpret_cast<const short*>(mapsx),
reinterpret_cast<const short*>(beta),
reinterpret_cast<uint8_t*>(tmp),
inSz, outSz, lpi);
return;
}
}
}
#endif
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
if (std::is_same<T, uint8_t>::value) {

View File

@@ -3173,7 +3173,7 @@ static inline void v_setr64(v_uint8x32& val_0, v_uint8x32& val_1,v_uint8x32& val
*reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3 * shift + 3]]));
}
static inline v_uint8x32 v_permutevar8x32(v_uint8x32& a, v_uint32x8& idxs)
static inline v_uint8x32 v_permute32(v_uint8x32& a, v_uint32x8& idxs)
{
return v_uint8x32(_mm256_permutevar8x32_epi32(a.val, idxs.val));
}

View File

@@ -3033,23 +3033,6 @@ static inline v_int32x16 v_madd(const v_int16x32& a, const v_int16x32& b)
return r;
}
// This function calls the non-existent intrinsic _mm512_setr_epi8().
#if 0
static inline void v_deinterleave_expand(const v_uint8x64& src, v_int16x32& even, v_int16x32& odd)
{
static const __m512i mask_even =
_mm512_setr_epi8(0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10,
-1, 12, -1, 14, -1, 16, -1, 18, -1, 20,
-1, 22, -1, 24, -1, 26, -1, 28, -1, 30, -1);
static const __m512i mask_odd =
_mm512_setr_epi8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11,
-1, 13, -1, 15, -1, 17, -1, 19, -1, 21,
-1, 23, -1, 25, -1, 27, -1, 29, -1, 31, -1);
even.val = _mm512_shuffle_epi8(src.val, mask_even);
odd .val = _mm512_shuffle_epi8(src.val, mask_odd);
}
#endif
static inline v_int16x32 v_mulhi(const v_int16x32& a, short b)
{
v_int16x32 r;
@@ -3125,7 +3108,6 @@ static inline v_uint8x64 v_packus(const v_int16x32& a, const v_int16x32& b)
return v_uint8x64(_mm512_packus_epi16(a.val, b.val));
}
#define word(b0, b1, b2, b3) \
(((uint32_t)((uint8_t)(b0)) << 0*8) \
| ((uint32_t)((uint8_t)(b1)) << 1*8) \
@@ -3154,6 +3136,26 @@ static inline v_uint8x64 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
word(b60, b61, b62, b63)));
}
static inline void v_deinterleave_expand(const v_uint8x64& src, v_int16x32& even, v_int16x32& odd)
{
v_uint8x64 mask_even = v_setr_s8(0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1,
12, -1, 14, -1, 16, -1, 18, -1, 20, -1, 22,
-1, 24, -1, 26, -1, 28, -1, 30, -1, 32, -1,
34, -1, 36, -1, 38, -1, 40, -1, 42, -1, 44,
-1, 46, -1, 48, -1, 50, -1, 52, -1, 54, -1,
56, -1, 58, -1, 60, -1, 62, -1);
v_uint8x64 mask_odd = v_setr_s8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1,
13, -1, 15, -1, 17, -1, 19, -1, 21, -1, 23,
-1, 25, -1, 27, -1, 29, -1, 31, -1, 33, -1,
35, -1, 37, -1, 39, -1, 41, -1, 43, -1, 45,
-1, 47, -1, 49, -1, 51, -1, 53, -1, 55, -1,
57, -1, 59, -1, 61, -1, 63, -1);
even.val = _mm512_shuffle_epi8(src.val, mask_even.val);
odd .val = _mm512_shuffle_epi8(src.val, mask_odd.val);
}
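A scalar model of this deinterleave: the -1 mask bytes zero the high byte of every 16-bit lane, so the even and odd source bytes come out zero-extended to int16 (sketch only):

#include <cstdint>

// Scalar sketch of v_deinterleave_expand for 64 input bytes: even/odd bytes are
// split into two vectors and zero-extended to 16-bit lanes.
static void deinterleave_expand_scalar(const uint8_t src[64],
                                       int16_t even[32], int16_t odd[32]) {
    for (int i = 0; i < 32; ++i) {
        even[i] = src[2 * i];      // bytes 0, 2, 4, ... of the source
        odd[i]  = src[2 * i + 1];  // bytes 1, 3, 5, ... of the source
    }
}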
static inline v_uint64x8 v_set_s64(int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0)
{
return v_uint64x8(_mm512_set_epi64(b7, b6, b5, b4, b3, b2, b1, b0));
@@ -3173,11 +3175,11 @@ static inline v_int16x32 v_load_ccache_expand(const uchar* ptr)
{
return v_int16x32(_mm512_cvtepu8_epi16(_mm256_lddqu_si256((const __m256i*)ptr)));
}
static inline __m512i v512_insert_epi16(__m512i target, const uchar x, const int index)
static inline __m512i v512_insert_epi16(__m512i& target, const ushort x, const int index)
{
return _mm512_mask_set1_epi16(target, 1UL << index, x);
}
static inline __m512i v512_insert_epi32(__m512i target, const int32_t x, const int index)
static inline __m512i v512_insert_epi32(__m512i& target, const int32_t x, const int index)
{
return _mm512_mask_set1_epi32(target, 1UL << index, x);
}
@@ -3214,16 +3216,63 @@ static inline v_uint8x64 v_permutex2_s64(const v_uint8x64& a, const v_uint8x64&
return v_uint8x64(_mm512_permutex2var_epi64(a.val, idxs.val, b.val));
}
static inline v_uint8x64 v_permutex_s32(const v_uint8x64& a, const v_uint64x8 idxs)
static inline v_uint8x64 v_permute32(const v_uint8x64& a, const v_uint64x8& idxs)
{
return v_uint8x64(_mm512_permutexvar_epi32(idxs.val, a.val));
}
static inline v_uint8x64 v_permutex2_s32(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16 idxs)
static inline v_uint8x64 v_permutex2_s32(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& idxs)
{
return v_uint8x64(_mm512_permutex2var_epi32(a.val, idxs.val, b.val));
}
static inline v_uint8x64 v_permute32(const v_uint8x64& a, const v_uint32x16& idxs)
{
return v_uint8x64(_mm512_permutexvar_epi32(idxs.val, a.val));
}
static inline void v_set(v_uint8x64& val_0, v_uint8x64& val_1,
v_uint8x64& val_2, v_uint8x64& val_3,
uint8_t tmp[], const short mapsx[],
int x, int shift)
{
val_0.val = _mm512_setr_epi64(*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 0))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 1))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 2))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 3))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 4))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 5))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 6))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 7))]));
val_1.val = _mm512_setr_epi64(*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + shift + 0))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + shift + 1))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + shift + 2))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + shift + 3))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + shift + 4))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + shift + 5))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + shift + 6))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + shift + 7))]));
val_2.val = _mm512_setr_epi64(*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 2 * shift + 0))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 2 * shift + 1))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 2 * shift + 2))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 2 * shift + 3))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 2 * shift + 4))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 2 * shift + 5))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 2 * shift + 6))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 2 * shift + 7))]));
val_3.val = _mm512_setr_epi64(*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 3 * shift + 0))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 3 * shift + 1))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 3 * shift + 2))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 3 * shift + 3))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 3 * shift + 4))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 3 * shift + 5))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 3 * shift + 6))]),
*reinterpret_cast<int64_t*>(&tmp[4 * (*(mapsx + x + 3 * shift + 7))]));
}
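Read as a scalar operation, each _mm512_setr_epi64 element above copies eight consecutive bytes of tmp starting at 4 * mapsx[...], i.e. the four interleaved lines of columns sx and sx+1. A sketch of that view (helper name is illustrative):

#include <cstdint>

// Hypothetical scalar view of v_set: byte j of 64-bit element i in vector b
// comes from tmp[4 * mapsx[x + b*shift + i] + j].
static void set_scalar(uint8_t val[4][64], const uint8_t tmp[], const short mapsx[],
                       int x, int shift) {
    for (int b = 0; b < 4; ++b)          // val_0 .. val_3
        for (int i = 0; i < 8; ++i)      // 8 x 64-bit elements per 512-bit vector
            for (int j = 0; j < 8; ++j)  // 8 bytes per element
                val[b][8 * i + j] = tmp[4 * mapsx[x + b * shift + i] + j];
}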
#if defined(__GNUC__)
int _mm512_cvtsi512_si32(__m512i a)
@@ -3246,6 +3295,46 @@ static inline int v512_extract_epi16(__m512i target)
return (v512_extract_epi32<index/2>(target) >> (index % 2 ? 16 : 0)) & 0xFFFF;
}
static inline v_uint8x64 v_gather_pairs(const uchar src[], const v_int16x32& index) {
v_uint8x64 r;
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<0>(index.val)]), 0);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<1>(index.val)]), 1);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<2>(index.val)]), 2);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<3>(index.val)]), 3);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<4>(index.val)]), 4);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<5>(index.val)]), 5);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<6>(index.val)]), 6);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<7>(index.val)]), 7);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<8>(index.val)]), 8);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<9>(index.val)]), 9);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<10>(index.val)]), 10);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<11>(index.val)]), 11);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<12>(index.val)]), 12);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<13>(index.val)]), 13);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<14>(index.val)]), 14);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<15>(index.val)]), 15);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<16>(index.val)]), 16);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<17>(index.val)]), 17);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<18>(index.val)]), 18);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<19>(index.val)]), 19);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<20>(index.val)]), 20);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<21>(index.val)]), 21);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<22>(index.val)]), 22);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<23>(index.val)]), 23);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<24>(index.val)]), 24);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<25>(index.val)]), 25);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<26>(index.val)]), 26);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<27>(index.val)]), 27);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<28>(index.val)]), 28);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<29>(index.val)]), 29);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<30>(index.val)]), 30);
r.val = v512_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[v512_extract_epi16<31>(index.val)]), 31);
return r;
}
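The 32 extract/insert steps above amount to an indexed gather of byte pairs; a scalar model for reference:

#include <cstdint>

// Scalar sketch of v_gather_pairs: each of the 32 16-bit lanes receives the two
// adjacent bytes src[index[i]] and src[index[i] + 1].
static void gather_pairs_scalar(const uint8_t src[], const int16_t index[32],
                                uint8_t dst[64]) {
    for (int i = 0; i < 32; ++i) {
        dst[2 * i]     = src[index[i]];
        dst[2 * i + 1] = src[index[i] + 1];
    }
}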
namespace {
template<int chanNum>
static inline v_int16x32 v_gather_chan(const uchar src[], const v_int16x32& index, int channel, int pos) {