Pre-processing: Resize Linear 1C refactoring (#6330)

* Resize 8UC1 refactoring

* Resize 32FC1 refactoring

* Applied comments
Anna Khakimova 2021-06-24 09:39:09 +03:00 committed by GitHub
parent 123dd1d5ff
commit c24b302c45
10 changed files with 1301 additions and 1242 deletions
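This refactoring replaces the per-ISA free functions calcRowLinear_8UC1 and calcRowLinear_32F with ISA-tag dispatched templates calcRowLinear8UC1Impl and calcRowLinear32FC1Impl; the 8UC1 kernel now returns a bool, reporting false when the row is narrower than one vector so the caller can fall back. Below is a minimal sketch of that calling pattern; only the kernel declaration follows the diff, while the caller name, the scalar fallback, and the stand-in Size/neon_tag types are hypothetical illustrations.

// Sketch of the tag-dispatched entry point introduced by this PR (declaration as in the diff).
#include <cstdint>

// Minimal stand-ins so the sketch is self-contained; the real types live in the G-API headers.
struct Size { int width, height; };
struct neon_tag {};

template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
                           const short alpha[], const short clone[], const short mapsx[],
                           const short beta[], uint8_t tmp[], const Size& inSz,
                           const Size& outSz, const int lpi, const int l);

// Hypothetical caller (not from the diff): try the vector kernel first; it returns false
// when inSz.width < nlanes or outSz.width < half_nlanes, and the caller then falls back.
template<typename isa_tag_t>
void runResize8UC1(isa_tag_t tag, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
                   const short alpha[], const short clone[], const short mapsx[],
                   const short beta[], uint8_t tmp[], const Size& inSz, const Size& outSz,
                   const int lpi) {
    if (!calcRowLinear8UC1Impl(tag, dst, src0, src1, alpha, clone, mapsx, beta,
                               tmp, inSz, outSz, lpi, 0)) {
        // scalarResize8UC1(...);  // fallback path, provided elsewhere in the pipeline
    }
}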

View File

@ -43,25 +43,12 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float* dst[],
const float* src0[],
const float* src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}
template<int chanNum>
CV_ALWAYS_INLINE void channels2planes_store(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* src, const int width,
const int line) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
GAPI_Assert(width >= nlanes);
GAPI_DbgAssert(width >= nlanes);
v_uint8 chan;
int x = 0;
@ -85,7 +72,7 @@ CV_ALWAYS_INLINE void vertical_anyLPI(const uchar* src0, const uchar* src1,
uchar* tmp, const int inLength,
const short beta) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
GAPI_Assert(inLength >= nlanes);
GAPI_DbgAssert(inLength >= nlanes);
const int half_nlanes = nlanes/2;
int w = 0;
@ -116,7 +103,7 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chan
const int line) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
const int half_nlanes = nlanes/2;
GAPI_Assert(width >= half_nlanes);
GAPI_DbgAssert(width >= half_nlanes);
v_int16 t0, t1;//, t2, t3;
int x = 0;
@ -220,7 +207,7 @@ CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNu
const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
GAPI_DbgAssert(length >= half_nlanes);
const int shift = static_cast<int>(half_nlanes / 4);
@ -310,7 +297,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum);
neon::vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum);
// horizontal pass
horizontal_4LPI<chanNum>(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
@ -338,7 +325,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
int inLength = inSz.width * chanNum;
// vertical pass
GAPI_Assert(inLength >= nlanes);
GAPI_DbgAssert(inLength >= nlanes);
v_uint8 s0, s1, s2, s3;
int w = 0;
for (;;) {
@ -427,12 +414,13 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
const uchar* tmp, const short mapsx[],
const uchar _mask_horizontal[],
const short clone[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
GAPI_DbgAssert(length >= half_nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
v_uint8 hmask = vx_load(_mask_horizontal);
int x = 0;
for (;;) {
@ -495,7 +483,8 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst,
const short alpha[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
GAPI_DbgAssert(length >= half_nlanes);
v_int16 t0, t1;
int x = 0;
for (;;) {
@ -515,39 +504,42 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst,
break;
}
}
} // namespace neon
// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
template<>
bool calcRowLinear8UC1Impl(neon_tag,
uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
static_assert(v_uint8::nlanes == 16,
"The wide of NEON vector is 128 bits, so one vector contains 16 uchars");
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
constexpr int half_nlanes = v_uint8::nlanes / 2;
if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
GAPI_Assert(inSz.width >= half_nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
vertical_4LPI(src0, src1, tmp, beta, inSz.width);
neon::vertical_4LPI(src0, src1, tmp, beta, inSz.width);
// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
neon::horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
} else { // if any lpi
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
@ -556,18 +548,16 @@ void calcRowLinear_8UC1(uint8_t* dst[],
uchar* _dst = dst[l];
// vertical pass
vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);
neon::vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);
// horizontal pass
horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width);
neon::horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
GAPI_Assert(inSz.width >= nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
GAPI_DbgAssert(inSz.width >= nlanes);
if (4 == lpi) {
// vertical pass
@ -589,15 +579,15 @@ void calcRowLinear_8UC1(uint8_t* dst[],
}
// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
neon::horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
} else { // any LPI
GAPI_Assert(outSz.width >= half_nlanes);
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
uchar* _dst = dst[l];
// horizontal pass
horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width);
neon::horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width);
}
}
@ -611,7 +601,7 @@ void calcRowLinear_8UC1(uint8_t* dst[],
const uchar* s1 = src1[l];
// vertical pass
vertical_anyLPI(s0, s1, dst[l], length, beta0);
neon::vertical_anyLPI(s0, s1, dst[l], length, beta0);
}
} else {
@ -622,8 +612,8 @@ void calcRowLinear_8UC1(uint8_t* dst[],
memcpy(dst[l], src0[l], length);
}
}
return true;
}
} // namespace neon
template void chanToPlaneRowImpl(neon_tag, const uint8_t* in, int chan, int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(neon_tag, const float* in, int chan, int chs, float * out, const int length);
@ -646,6 +636,10 @@ template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::array<cons
template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template void calcRowLinear32FC1Impl(neon_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[], const float beta[],
const Size& inSz, const Size& outSz, const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -28,19 +28,6 @@ void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Si
float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[],
const float xalpha[], float vbuf[]);
// Resize (bi-linear, 8U)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
@ -81,17 +68,6 @@ void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
int lpi) {
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi);
} // namespace neon
template<typename isa_tag_t, typename T>
@ -131,6 +107,24 @@ extern template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::arr
extern template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);
template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
extern template void calcRowLinear32FC1Impl(neon_tag, float* dst[], const float* src0[],
const float* src1[], const float alpha[],
const int mapsx[], const float beta[],
const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -61,17 +61,17 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
const v_uint8& val_1,
const v_uint8& val_2,
const v_uint8& val_3,
const v_int16& a10,
const v_int16& a32,
const v_int16& a54,
const v_int16& a76,
v_uint8& shuf_mask1,
v_uint8& shuf_mask2,
v_uint8& res1, v_uint8& res2) {
CV_ALWAYS_INLINE void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
const v_uint8& val_1,
const v_uint8& val_2,
const v_uint8& val_3,
const v_int16& a10,
const v_int16& a32,
const v_int16& a54,
const v_int16& a76,
v_uint8& shuf_mask1,
v_uint8& shuf_mask2,
v_uint8& res1, v_uint8& res2) {
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
@ -108,17 +108,20 @@ static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
res2 = v_shuffle_s8(q7, shuf_mask2);
}
static inline void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const short beta[],
const int& length, const int& half_nlanes) {
CV_ALWAYS_INLINE void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const short beta[],
const int& length) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
v_uint8 shuf_mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
v_uint8 shuf_mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15);
for (int w = 0; w < length; ) {
for (; w <= length - half_nlanes; w += half_nlanes) {
@ -164,63 +167,26 @@ static inline void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* sr
}
}
static inline v_uint8 setHorizontalShufMask1() {
CV_ALWAYS_INLINE v_uint8 setHorizontalShufMask1() {
return v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15,
0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15);
}
static inline v_uint8 setHorizontalShufMask2() {
CV_ALWAYS_INLINE v_uint8 setHorizontalShufMask2() {
return v_setr_s8(0, 1, 8, 9, 2, 3, 10, 11,
4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11,
4, 5, 12, 13, 6, 7, 14, 15);
}
static inline void horizontalPass_lpi4_8UC1(const short clone[], const short mapsx[],
uint8_t tmp[], uint8_t* dst[], const int& length,
const int& half_nlanes) {
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
constexpr int shift = 4;
v_uint8 shuf_mask1 = setHorizontalShufMask1();
v_uint8 shuf_mask2 = setHorizontalShufMask2();
CV_ALWAYS_INLINE void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int& beta0,
const int l, const int length1, const int length2) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length1 >= half_nlanes);
v_uint32 idxs = v_setr_s32(0, 2, 4, 6, 1, 3, 5, 7);
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 4)]);
v_int16 a54 = vx_load(&clone[4 * (x + 8)]);
v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
v_setr64(val_0, val_1, val_2, val_3, mapsx, tmp, x, shift);
val_0 = v_permute32(val_0, idxs);
val_1 = v_permute32(val_1, idxs);
val_2 = v_permute32(val_2, idxs);
val_3 = v_permute32(val_3, idxs);
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
res1, res2);
v_store_low(&dst[0][x], res1);
v_store_high(&dst[1][x], res1);
v_store_low(&dst[2][x], res2);
v_store_high(&dst[3][x], res2);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int& beta0, const int& half_nlanes,
const int& l, const int& length1, const int& length2) {
for (int w = 0; w < length2; ) {
for (; w <= length1 - half_nlanes; w += half_nlanes) {
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
@ -235,148 +201,25 @@ static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t*
}
}
static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
uint8_t* dst[], const uchar tmp[], const int& l,
const int& half_nlanes, const int& length) {
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx); // 16 pairs of src0 pixels
v_int16 t0, t1;
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi) {
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (nlanes / 2);
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_lpi4_8U(src0, src1, tmp, beta, inSz.width, half_nlanes);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes);
} else { // if any lpi
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= nlanes);
for (int w = 0; w < inSz.width; ) {
for (; w <= inSz.width - nlanes; w += nlanes) {
v_uint8 s0, s1, s2, s3;
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
}
if (w < inSz.width) {
w = inSz.width - nlanes;
}
}
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar *src = src0[l];
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width);
}
}
} else if (!yRatioEq) {
GAPI_DbgAssert(xRatioEq);
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l,
inLength, outLength);
}
} else {
GAPI_DbgAssert(xRatioEq && yRatioEq);
int length = inSz.width;
for (int l = 0; l < lpi; ++l) {
memcpy(dst[l], src0[l], length);
}
}
}
template<int chanNum>
void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
const int shift = (half_nlanes / 4);
if (4 == lpi) {
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
verticalPass_lpi4_8U(src0, src1, tmp, beta,
inSz.width*chanNum, half_nlanes);
inSz.width*chanNum);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
@ -420,8 +263,7 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l,
verticalPass_anylpi_8U(src0, src1, tmp, beta0, l,
inSz.width*chanNum, inSz.width*chanNum);
// horizontal pass
@ -480,20 +322,176 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
CV_ALWAYS_INLINE void horizontalPass_lpi4_8UC1(const short clone[], const short mapsx[],
uint8_t tmp[], uint8_t* dst[], const int& length) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length >= half_nlanes);
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
constexpr int shift = 4;
v_uint8 shuf_mask1 = avx::setHorizontalShufMask1();
v_uint8 shuf_mask2 = avx::setHorizontalShufMask2();
v_uint32 idxs = v_setr_s32(0, 2, 4, 6, 1, 3, 5, 7);
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 4)]);
v_int16 a54 = vx_load(&clone[4 * (x + 8)]);
v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
v_setr64(val_0, val_1, val_2, val_3, mapsx, tmp, x, shift);
val_0 = v_permute32(val_0, idxs);
val_1 = v_permute32(val_1, idxs);
val_2 = v_permute32(val_2, idxs);
val_3 = v_permute32(val_3, idxs);
avx::main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
res1, res2);
v_store_low(&dst[0][x], res1);
v_store_high(&dst[1][x], res1);
v_store_low(&dst[2][x], res2);
v_store_high(&dst[3][x], res2);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
CV_ALWAYS_INLINE void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
uint8_t* dst[], const uchar tmp[], const int l,
const int length) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length >= half_nlanes);
v_int16 t0, t1;
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx); // 16 pairs of src0 pixels
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
} // namespace avx
// 8UC1 Resize (bi-linear)
template<>
bool calcRowLinear8UC1Impl(avx2_tag,
uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (v_uint8::nlanes / 2);
if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
// vertical pass
avx::verticalPass_lpi4_8U(src0, src1, tmp, beta, inSz.width);
// horizontal pass
avx::horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width);
} else { // if any lpi
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
avx::verticalPass_anylpi_8U(src0, src1, tmp, beta0, l, inLength, inLength);
// horizontal pass
avx::horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, outLength);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= nlanes);
v_uint8 s0, s1, s2, s3;
for (int w = 0; w < inSz.width; ) {
for (; w <= inSz.width - nlanes; w += nlanes) {
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
}
if (w < inSz.width) {
w = inSz.width - nlanes;
}
}
// horizontal pass
avx::horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
// horizontal pass
avx::horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, outSz.width);
}
}
} else if (!yRatioEq) {
GAPI_DbgAssert(xRatioEq);
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
avx::verticalPass_anylpi_8U(src0, src1, dst[l], beta0, l, inLength, outLength);
}
} else {
GAPI_DbgAssert(xRatioEq && yRatioEq);
int length = inSz.width;
for (int l = 0; l < lpi; ++l) {
memcpy(dst[l], src0[l], length);
}
}
return true;
}
template void chanToPlaneRowImpl(avx2_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(avx2_tag, const float* in, const int chan, const int chs, float* out, const int length);
@ -516,6 +514,11 @@ template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::array<cons
template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template void calcRowLinear32FC1Impl(avx2_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine
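The horizontal passes above blend the two gathered source pixels with a fixed-point weight (the comments call it signed Q1.1.14) via v_mulhrs, a rounded high multiply. A scalar sketch of the same blend follows, assuming v_mulhrs computes round(a*b / 2^15); the helper names are illustrative, not taken from the diff.

#include <cstdint>

// Rounded high multiply, assumed semantics of v_mulhrs: round(a*b / 2^15).
static inline int16_t mulhrs(int16_t a, int16_t b) {
    return static_cast<int16_t>((static_cast<int32_t>(a) * b + 0x4000) >> 15);
}

// d = t1 + alpha*(t0 - t1): linear interpolation between the two gathered pixels,
// mirroring "v_int16 d = v_mulhrs(t0 - t1, a0) + t1" from the diff.
static inline uint8_t blend_pixel(uint8_t t0, uint8_t t1, int16_t alpha_fx) {
    int d = mulhrs(static_cast<int16_t>(t0 - t1), alpha_fx) + t1;
    // v_pack_u_store saturates to [0, 255]; do the same here.
    return static_cast<uint8_t>(d < 0 ? 0 : (d > 255 ? 255 : d));
}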

View File

@ -41,20 +41,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[],
#endif
//-----------------------------------------------------------------------------
// Resize (bi-linear, 8UC1)
void calcRowLinear_8UC1(uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t* src0[],
@ -66,7 +52,7 @@ void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
const int lpi);
// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
@ -79,33 +65,22 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
const int lpi);
template<int numChan>
void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size & inSz,
const Size & outSz,
int lpi);
} // namespace avx
template<typename isa_tag_t, typename T>
@ -148,6 +123,23 @@ extern template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::arr
extern template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);
template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
extern template void calcRowLinear32FC1Impl(avx2_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -55,10 +55,17 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *src1[],
uint8_t tmp[], v_int16& b0, v_int16& b1,
v_int16& b2, v_int16& b3, v_uint8& shuf_mask,
int half_nlanes, int width) {
CV_ALWAYS_INLINE void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const short beta[], const v_uint8& shuf_mask,
const int width) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(width >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
v_uint32 permute_idxs1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0);
v_uint32 permute_idxs2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8);
@ -86,37 +93,13 @@ static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *sr
v_uint8 q0 = v_packus(r0, r1);
v_uint8 q1 = v_packus(r2, r3);
#if 1
v_uint8 q2 = v_permutex2_s32(q0, q1, permute_idxs1);
v_uint8 q3 = v_permutex2_s32(q0, q1, permute_idxs2);
v_uint8 q4 = v_shuffle_s8(q2, shuf_mask);
v_uint8 q5 = v_shuffle_s8(q3, shuf_mask);
// Second variant of the decomposition. It'll be useful in the future.
#else
v_uint8 q2 = v_mblend_shiftleft(q0, q1);
v_uint8 q3 = v_mblend_shiftright(q0, q1);
v_uint8 mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15);
v_uint8 q4 = v_shuffle_s8(q2, mask1);
v_uint8 q5 = v_shuffle_s8(q3, mask1);
v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0);
v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4);
v_uint8 q6 = v_permutex2_s64(q4, q5, permute_idxs1);
v_uint8 q7 = v_permutex2_s64(q4, q5, permute_idxs2);
#endif
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
@ -125,21 +108,21 @@ static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *sr
w = width - half_nlanes;
}
}
}
}
static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
const v_uint8& val_1,
const v_uint8& val_2,
const v_uint8& val_3,
const v_int16& a10,
const v_int16& a32,
const v_int16& a54,
const v_int16& a76,
v_uint8& shuf_mask1,
v_uint8& shuf_mask2,
v_uint32& idxs1,
v_uint32& idxs2,
v_uint8& res1, v_uint8& res2) {
CV_ALWAYS_INLINE void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
const v_uint8& val_1,
const v_uint8& val_2,
const v_uint8& val_3,
const v_int16& a10,
const v_int16& a32,
const v_int16& a54,
const v_int16& a76,
v_uint8& shuf_mask1,
v_uint8& shuf_mask2,
v_uint32& idxs1,
v_uint32& idxs2,
v_uint8& res1, v_uint8& res2) {
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
@ -165,91 +148,20 @@ static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1);
v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1);
#if 1
v_uint8 q4 = v_permutex2_s32(q2, q3, idxs1);
v_uint8 q5 = v_permutex2_s32(q2, q3, idxs2);
res1 = v_shuffle_s8(q4, shuf_mask2);
res2 = v_shuffle_s8(q5, shuf_mask2);
// Second variant of the decomposition. It'll be useful in the future.
#else
v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
v_uint8 q6 = v_permute32(idx, q4);
v_uint8 q7 = v_permute32(idx, q5);
v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
v_uint8 q8 = v_shuffle_s8(q6, mask2);
v_uint8 q9 = v_shuffle_s8(q7, mask2);
#endif
}
static inline void horizontalPass_lpi4_U8C1(const short clone[], const short mapsx[],
uint8_t tmp[], uint8_t *dst[],
v_uint8& shuf_mask1,
int width, int half_nlanes) {
v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
CV_ALWAYS_INLINE void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int beta0,
const int l, const int length1, const int length2) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length1 >= half_nlanes);
v_uint32 permute_idxs1 = v_set_s32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
v_uint32 permute_idxs2 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
v_uint32 permute_idxs3 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
const int shift = half_nlanes / 4;
for (int x = 0; x < width; ) {
for (; x <= width - half_nlanes; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
v_set(val_0, val_1, val_2, val_3, tmp, mapsx, x, shift);
val_0 = v_permute32(val_0, permute_idxs1);
val_1 = v_permute32(val_1, permute_idxs1);
val_2 = v_permute32(val_2, permute_idxs1);
val_3 = v_permute32(val_3, permute_idxs1);
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
permute_idxs2, permute_idxs3,
res1, res2);
v_store_low(&dst[0][x], res1);
v_store_high(&dst[1][x], res1);
v_store_low(&dst[2][x], res2);
v_store_high(&dst[3][x], res2);
}
if (x < width) {
x = width - half_nlanes;
}
}
}
static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int& beta0, const int& half_nlanes,
const int& l, const int& length1, const int& length2) {
for (int w = 0; w < length2; ) {
for (; w <= length1 - half_nlanes; w += half_nlanes) {
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
@ -264,169 +176,19 @@ static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t*
}
}
static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
uint8_t* dst[], const uchar tmp[], const int& l,
const int& half_nlanes, const int& length) {
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx);
v_int16 t0, t1;
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t * dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi) {
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (nlanes / 2);
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3, shuf_mask1,
half_nlanes, inSz.width);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width, half_nlanes);
} else { // if any lpi
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= nlanes);
for (int w = 0; w < inSz.width; ) {
for (; w <= inSz.width - nlanes; w += nlanes) {
v_uint8 s0, s1, s2, s3;
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
}
if (w < inSz.width) {
w = inSz.width - nlanes;
}
}
// horizontal pass
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width, half_nlanes);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar *src = src0[l];
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width);
}
}
} else if (!yRatioEq) {
GAPI_DbgAssert(xRatioEq);
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l,
inLength, outLength);
}
} else {
GAPI_DbgAssert(xRatioEq && yRatioEq);
int length = inSz.width;
for (int l = 0; l < lpi; ++l) {
memcpy(dst[l], src0[l], length);
}
}
}
// Resize (bi-linear, 8U, generic number of channels)
template<int chanNum>
static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
constexpr int shift = (half_nlanes / 4);
@ -443,13 +205,8 @@ static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, ch
2, 6, 10, 14, 3, 7, 11, 15);
// vertical pass
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3,
shuf_mask1, half_nlanes, inSz.width*chanNum);
verticalPass_lpi4_8U(src0, src1, tmp, beta,
shuf_mask1, inSz.width*chanNum);
// horizontal pass
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
@ -502,7 +259,7 @@ static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, ch
// vertical pass
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l,
verticalPass_anylpi_8U(src0, src1, tmp, beta0, l,
inSz.width*chanNum, inSz.width*chanNum);
// horizontal pass
@ -561,19 +318,207 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
CV_ALWAYS_INLINE void horizontalPass_lpi4_U8C1(const short clone[], const short mapsx[],
uint8_t tmp[], uint8_t* dst[],
v_uint8& shuf_mask1,
const int width) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(width >= half_nlanes);
v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
v_uint32 permute_idxs1 = v_set_s32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
v_uint32 permute_idxs2 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
v_uint32 permute_idxs3 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
const int shift = half_nlanes / 4;
for (int x = 0; x < width; ) {
for (; x <= width - half_nlanes; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
v_set(val_0, val_1, val_2, val_3, tmp, mapsx, x, shift);
val_0 = v_permute32(val_0, permute_idxs1);
val_1 = v_permute32(val_1, permute_idxs1);
val_2 = v_permute32(val_2, permute_idxs1);
val_3 = v_permute32(val_3, permute_idxs1);
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
permute_idxs2, permute_idxs3,
res1, res2);
v_store_low(&dst[0][x], res1);
v_store_high(&dst[1][x], res1);
v_store_low(&dst[2][x], res2);
v_store_high(&dst[3][x], res2);
}
if (x < width) {
x = width - half_nlanes;
}
}
}
CV_ALWAYS_INLINE void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
uint8_t* dst[], const uchar tmp[], const int l,
const int length) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length >= half_nlanes);
v_int16 t0, t1;
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx);
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
} // namespace avx512
// 8UC1 Resize (bi-linear)
template<>
bool calcRowLinear8UC1Impl(avx512_tag,
uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (v_uint8::nlanes / 2);
if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
// vertical pass
avx512::verticalPass_lpi4_8U(src0, src1, tmp, beta, shuf_mask1, inSz.width);
// horizontal pass
avx512::horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width);
} else { // if any lpi
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
avx512::verticalPass_anylpi_8U(src0, src1, tmp, beta0, l, inLength, inLength);
// horizontal pass
avx512::horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, outLength);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= nlanes);
for (int w = 0; w < inSz.width; ) {
for (; w <= inSz.width - nlanes; w += nlanes) {
v_uint8 s0, s1, s2, s3;
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
}
if (w < inSz.width) {
w = inSz.width - nlanes;
}
}
// horizontal pass
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
avx512::horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
// horizontal pass
avx512::horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, outSz.width);
}
}
} else if (!yRatioEq) {
GAPI_DbgAssert(xRatioEq);
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
avx512::verticalPass_anylpi_8U(src0, src1, dst[l], beta0, l, inLength, outLength);
}
} else {
GAPI_DbgAssert(xRatioEq && yRatioEq);
int length = inSz.width;
for (int l = 0; l < lpi; ++l) {
memcpy(dst[l], src0[l], length);
}
}
return true;
}
template void chanToPlaneRowImpl(avx512_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(avx512_tag, const float* in, const int chan, const int chs, float* out, const int length);
@ -595,6 +540,12 @@ template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std::array<
template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template void calcRowLinear32FC1Impl(avx512_tag, float* dst[], const float* src0[],
const float* src1[], const float alpha[],
const int mapsx[], const float beta[],
const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -42,70 +42,46 @@ void calcRowArea_CVKL_U8(const uchar * src[],
//-----------------------------------------------------------------------------
// Resize (bi-linear, 8UC1)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size & inSz,
const Size & outSz,
int lpi);
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi);
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi);
// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi);
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi);
template<int numChan>
void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size & inSz,
const Size & outSz,
int lpi);
} // namespace avx512
template<typename isa_tag_t, typename T>
@ -145,6 +121,23 @@ extern template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std:
extern template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);
template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
extern template void calcRowLinear32FC1Impl(avx512_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -56,17 +56,26 @@ namespace gapi {
namespace kernels {
// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi) {
template<>
bool calcRowLinear8UC1Impl(sse42_tag,
uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (v_uint8::nlanes / 2);
if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
@ -503,6 +512,7 @@ void calcRowLinear_8UC1(uint8_t *dst[],
memcpy(dst[l], src0[l], length);
}
}
return true;
}
// The universal-intrinsic 3C/4C resize implementation for SSE42 is sometimes a bit slower than the original.
@ -934,19 +944,6 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}
//------------------------------------------------------------------------------
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz,
@ -1289,6 +1286,11 @@ template void mergeRowImpl<sse42_tag, uchar, 3>(sse42_tag, const std::array<cons
template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<sse42_tag, uchar, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template void calcRowLinear32FC1Impl(sse42_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -41,19 +41,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[],
//----------------------------------------------------------------------
// Resize (bi-linear, 8U)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
@ -95,17 +82,6 @@ void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size & inSz,
const Size & outSz,
int lpi);
template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs,
T* out, const int length);
@ -145,6 +121,23 @@ extern template void mergeRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const std::a
extern template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);
template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
extern template void calcRowLinear32FC1Impl(sse42_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -577,15 +577,18 @@ CV_ALWAYS_INLINE void copyRow_Impl(const T in[], T out[], int length) {
}
// Resize (bi-linear, 32FC1)
CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi) {
template<typename isa_tag_t>
CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(isa_tag_t,
float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
@ -714,7 +717,8 @@ template<typename isa_tag_t> struct vector_type_of<isa_tag_t, uint8_t> { using t
template<typename isa_tag_t> struct vector_type_of<isa_tag_t, float> { using type = v_float32;};
template<typename isa_tag_t, typename T>
CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length) {
CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan,
const int chs, T* out, const int length) {
if (chs == 1) {
copyRow_Impl<vector_type_of_t<isa_tag_t, T>, T>(in, out, length);
return;