Pre-processing: Split and Merge kernels refactoring. (#6205)

* Split and Merge kernel refactoring
* SFINAE: replace conditional compilation macro with std::enable_if
2021-06-22 18:46:23 +03:00
parent 27ae3ec433
commit e00cee2fc6
11 changed files with 563 additions and 1425 deletions
--- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
+++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
@@ -29,67 +29,6 @@ namespace InferenceEngine {
 namespace gapi {
 namespace kernels {
 namespace neon {
-
-void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
-                   uint8_t out[], int length) {
-    mergeRow_8UC2_Impl(in0, in1, out, length);
-}
-
-void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[],
-                   const uint8_t in2[], uint8_t out[], int length) {
-    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
-}
-
-void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
-                   const uint8_t in3[], uint8_t out[], int length) {
-    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
-}
-
-void mergeRow_32FC2(const float in0[], const float in1[],
-                    float out[], int length) {
-    mergeRow_32FC2_Impl(in0, in1, out, length);
-}
-
-void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
-                    float out[], int length) {
-    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
-}
-
-void mergeRow_32FC4(const float in0[], const float in1[],
-                    const float in2[], const float in3[],
-                    float out[], int length) {
-    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
-}
-
-void splitRow_8UC2(const uint8_t in[], uint8_t out0[],
-                   uint8_t out1[], int length) {
-    splitRow_8UC2_Impl(in, out0, out1, length);
-}
-
-void splitRow_8UC3(const uint8_t in[], uint8_t out0[],
-                   uint8_t out1[], uint8_t out2[], int length) {
-    splitRow_8UC3_Impl(in, out0, out1, out2, length);
-}
-
-void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[],
-                   uint8_t out2[], uint8_t out3[], int length) {
-    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
-}
-
-void splitRow_32FC2(const float in[], float out0[], float out1[], int length) {
-    splitRow_32FC2_Impl(in, out0, out1, length);
-}
-
-void splitRow_32FC3(const float in[], float out0[], float out1[],
-                    float out2[], int length) {
-    splitRow_32FC3_Impl(in, out0, out1, out2, length);
-}
-
-void splitRow_32FC4(const float in[], float out0[], float out1[],
-                    float out2[], float out3[], int length) {
-    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
-}
-
 void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
                    const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
                    int xmaxdf, const short xindex[], const Q0_16 xalpha[],
@@ -693,6 +632,20 @@ template void nv12ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t*

 template void i420ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* u_row,
                               const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
+
+template void splitRowImpl<neon_tag, uint8_t, 2>(neon_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
+template void splitRowImpl<neon_tag, float, 2>(neon_tag, const float* in, std::array<float*, 2>& outs, const int length);
+template void splitRowImpl<neon_tag, uint8_t, 3>(neon_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
+template void splitRowImpl<neon_tag, float, 3>(neon_tag, const float* in, std::array<float*, 3>& outs, const int length);
+template void splitRowImpl<neon_tag, uint8_t, 4>(neon_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
+template void splitRowImpl<neon_tag, float, 4>(neon_tag, const float* in, std::array<float*, 4>& outs, const int length);
+
+template void mergeRowImpl<neon_tag, uint8_t, 2>(neon_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<neon_tag, float, 2>(neon_tag, const std::array<const float*, 2>& ins, float* out, const int length);
+template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
+template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
--- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp
+++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp
@@ -92,87 +92,6 @@ void calcRowLinear_32F(float       *dst[],
                       const Size& inSz,
                       const Size& outSz,
                               int lpi);
-
-//----------------------------------------------------------------------
-
-void mergeRow_8UC2(const uint8_t in0[],
-                   const uint8_t in1[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_8UC3(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_8UC4(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                   const uint8_t in3[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_32FC2(const float in0[],
-                    const float in1[],
-                          float out[],
-                            int length);
-
-void mergeRow_32FC3(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                          float out[],
-                            int length);
-
-void mergeRow_32FC4(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                    const float in3[],
-                          float out[],
-                            int length);
-
-void splitRow_8UC2(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                             int length);
-
-void splitRow_8UC3(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                             int length);
-
-void splitRow_8UC4(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                         uint8_t out3[],
-                             int length);
-
-void splitRow_32FC2(const float in[],
-                          float out0[],
-                          float out1[],
-                            int length);
-
-void splitRow_32FC3(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                            int length);
-
-void splitRow_32FC4(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                          float out3[],
-                            int length);
-
-void calculate_i420_to_rgb(const  uchar **srcY,
-                           const  uchar *srcU,
-                           const  uchar *srcV,
-                                  uchar **dstRGBx,
-                                    int width);
-
 }  // namespace neon

 template<typename isa_tag_t, typename T>
@@ -192,6 +111,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,

 extern template void i420ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* u_row,
                                      const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
+
+template<typename isa_tag_t, typename T, int chs>
+void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
+
+extern template void splitRowImpl<neon_tag, uint8_t, 2>(neon_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
+extern template void splitRowImpl<neon_tag, float, 2>(neon_tag, const float* in, std::array<float*, 2>& outs, const int length);
+extern template void splitRowImpl<neon_tag, uint8_t, 3>(neon_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
+extern template void splitRowImpl<neon_tag, float, 3>(neon_tag, const float* in, std::array<float*, 3>& outs, const int length);
+extern template void splitRowImpl<neon_tag, uint8_t, 4>(neon_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
+extern template void splitRowImpl<neon_tag, float, 4>(neon_tag, const float* in, std::array<float*, 4>& outs, const int length);
+
+template<typename isa_tag_t, typename T, int chs>
+void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
+
+extern template void mergeRowImpl<neon_tag, uint8_t, 2>(neon_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<neon_tag, float, 2>(neon_tag, const std::array<const float*, 2>& ins, float* out, const int length);
+extern template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
+extern template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
--- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp
+++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp
@@ -47,66 +47,6 @@ namespace kernels {

 namespace avx {

-void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
-                   uint8_t out[], int length) {
-    mergeRow_8UC2_Impl(in0, in1, out, length);
-}
-
-void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[],
-                   const uint8_t in2[], uint8_t out[], int length) {
-    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
-}
-
-void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
-                   const uint8_t in3[], uint8_t out[], int length) {
-    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
-}
-
-void mergeRow_32FC2(const float in0[], const float in1[],
-                    float out[], int length) {
-    mergeRow_32FC2_Impl(in0, in1, out, length);
-}
-
-void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
-                    float out[], int length) {
-    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
-}
-
-void mergeRow_32FC4(const float in0[], const float in1[],
-                    const float in2[], const float in3[],
-                    float out[], int length) {
-    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
-}
-
-void splitRow_8UC2(const uint8_t in[], uint8_t out0[],
-                   uint8_t out1[], int length) {
-    splitRow_8UC2_Impl(in, out0, out1, length);
-}
-
-void splitRow_8UC3(const uint8_t in[], uint8_t out0[],
-                   uint8_t out1[], uint8_t out2[], int length) {
-    splitRow_8UC3_Impl(in, out0, out1, out2, length);
-}
-
-void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[],
-                   uint8_t out2[], uint8_t out3[], int length) {
-    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
-}
-
-void splitRow_32FC2(const float in[], float out0[], float out1[], int length) {
-    splitRow_32FC2_Impl(in, out0, out1, length);
-}
-
-void splitRow_32FC3(const float in[], float out0[], float out1[],
-                    float out2[], int length) {
-    splitRow_32FC3_Impl(in, out0, out1, out2, length);
-}
-
-void splitRow_32FC4(const float in[], float out0[], float out1[],
-                    float out2[], float out3[], int length) {
-    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
-}
-
 void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
                    const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
                    int xmaxdf, const short xindex[], const Q0_16 xalpha[],
@@ -562,6 +502,20 @@ template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t*

 template void i420ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* u_row,
                               const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
+
+template void splitRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
+template void splitRowImpl<avx2_tag, float, 2>(avx2_tag, const float* in, std::array<float*, 2>& outs, const int length);
+template void splitRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
+template void splitRowImpl<avx2_tag, float, 3>(avx2_tag, const float* in, std::array<float*, 3>& outs, const int length);
+template void splitRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
+template void splitRowImpl<avx2_tag, float, 4>(avx2_tag, const float* in, std::array<float*, 4>& outs, const int length);
+
+template void mergeRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<avx2_tag, float, 2>(avx2_tag, const std::array<const float*, 2>& ins, float* out, const int length);
+template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
+template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
--- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp
+++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp
@@ -106,84 +106,8 @@ void calcRowLinear_32F(float *dst[],
                       const Size & inSz,
                       const Size & outSz,
                       int    lpi);
-
-//----------------------------------------------------------------------
-
-
-void mergeRow_8UC2(const uint8_t in0[],
-                   const uint8_t in1[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_8UC3(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_8UC4(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                   const uint8_t in3[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_32FC2(const float in0[],
-                    const float in1[],
-                          float out[],
-                            int length);
-
-void mergeRow_32FC3(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                          float out[],
-                            int length);
-
-void mergeRow_32FC4(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                    const float in3[],
-                          float out[],
-                            int length);
-
-void splitRow_8UC2(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                             int length);
-
-void splitRow_8UC3(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                             int length);
-
-void splitRow_8UC4(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                         uint8_t out3[],
-                             int length);
-
-void splitRow_32FC2(const float in[],
-                          float out0[],
-                          float out1[],
-                            int length);
-
-void splitRow_32FC3(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                            int length);
-
-void splitRow_32FC4(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                          float out3[],
-                            int length);
 }  // namespace avx

-
 template<typename isa_tag_t, typename T>
 void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length);

@@ -192,7 +116,7 @@ extern template void chanToPlaneRowImpl(avx2_tag, const float*   in, const int c

 template<typename isa_tag_t>
 void nv12ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* uv_row,
-                             uint8_t** out_rows, const int buf_width);
+                      uint8_t** out_rows, const int buf_width);

 extern template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows,
                                      const uint8_t* uv_row, uint8_t** out_rows,
@@ -200,10 +124,30 @@ extern template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows,

 template<typename isa_tag_t>
 void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
-                             const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
+                      const uint8_t* v_row, uint8_t** out_rows, const int buf_width);

 extern template void i420ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* u_row,
                                      const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
+
+template<typename isa_tag_t, typename T, int chs>
+void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
+
+extern template void splitRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
+extern template void splitRowImpl<avx2_tag, float, 2>(avx2_tag, const float* in, std::array<float*, 2>& outs, const int length);
+extern template void splitRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
+extern template void splitRowImpl<avx2_tag, float, 3>(avx2_tag, const float* in, std::array<float*, 3>& outs, const int length);
+extern template void splitRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
+extern template void splitRowImpl<avx2_tag, float, 4>(avx2_tag, const float* in, std::array<float*, 4>& outs, const int length);
+
+template<typename isa_tag_t, typename T, int chs>
+void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
+
+extern template void mergeRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<avx2_tag, float, 2>(avx2_tag, const std::array<const float*, 2>& ins, float* out, const int length);
+extern template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
+extern template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
--- a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp
+++ b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp
@@ -41,66 +41,6 @@ namespace kernels {

 namespace avx512 {

-void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
-                   uint8_t out[], int length) {
-    mergeRow_8UC2_Impl(in0, in1, out, length);
-}
-
-void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[],
-                   const uint8_t in2[], uint8_t out[], int length) {
-    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
-}
-
-void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
-                   const uint8_t in3[], uint8_t out[], int length) {
-    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
-}
-
-void mergeRow_32FC2(const float in0[], const float in1[],
-                    float out[], int length) {
-    mergeRow_32FC2_Impl(in0, in1, out, length);
-}
-
-void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
-                    float out[], int length) {
-    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
-}
-
-void mergeRow_32FC4(const float in0[], const float in1[],
-                    const float in2[], const float in3[],
-                    float out[], int length) {
-    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
-}
-
-void splitRow_8UC2(const uint8_t in[], uint8_t out0[],
-                   uint8_t out1[], int length) {
-    splitRow_8UC2_Impl(in, out0, out1, length);
-}
-
-void splitRow_8UC3(const uint8_t in[], uint8_t out0[],
-                   uint8_t out1[], uint8_t out2[], int length) {
-    splitRow_8UC3_Impl(in, out0, out1, out2, length);
-}
-
-void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[],
-                   uint8_t out2[], uint8_t out3[], int length) {
-    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
-}
-
-void splitRow_32FC2(const float in[], float out0[], float out1[], int length) {
-    splitRow_32FC2_Impl(in, out0, out1, length);
-}
-
-void splitRow_32FC3(const float in[], float out0[], float out1[],
-                    float out2[], int length) {
-    splitRow_32FC3_Impl(in, out0, out1, out2, length);
-}
-
-void splitRow_32FC4(const float in[], float out0[], float out1[],
-                    float out2[], float out3[], int length) {
-    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
-}
-
 void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
                    const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
                    int xmaxdf, const short xindex[], const Q0_16 xalpha[],
@@ -632,7 +572,6 @@ void calcRowLinear_32F(float *dst[],
                               int  lpi) {
    calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
 }
-
 }  // namespace avx512

 template void chanToPlaneRowImpl(avx512_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
@@ -642,6 +581,20 @@ template void nv12ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t

 template void i420ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* u_row,
                               const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
+
+template void splitRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
+template void splitRowImpl<avx512_tag, float, 2>(avx512_tag, const float* in, std::array<float*, 2>& outs, const int length);
+template void splitRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
+template void splitRowImpl<avx512_tag, float, 3>(avx512_tag, const float* in, std::array<float*, 3>& outs, const int length);
+template void splitRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
+template void splitRowImpl<avx512_tag, float, 4>(avx512_tag, const float* in, std::array<float*, 4>& outs, const int length);
+
+template void mergeRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<avx512_tag, float, 2>(avx512_tag, const std::array<const float*, 2>& ins, float* out, const int length);
+template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
+template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
--- a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp
+++ b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp
@@ -106,83 +106,8 @@ void calcRowLinear_32F(float *dst[],
                       const Size & inSz,
                       const Size & outSz,
                       int    lpi);
-
-//----------------------------------------------------------------------
-
-void mergeRow_8UC2(const uint8_t in0[],
-                   const uint8_t in1[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_8UC3(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_8UC4(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                   const uint8_t in3[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_32FC2(const float in0[],
-                    const float in1[],
-                          float out[],
-                            int length);
-
-void mergeRow_32FC3(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                          float out[],
-                            int length);
-
-void mergeRow_32FC4(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                    const float in3[],
-                          float out[],
-                            int length);
-
-void splitRow_8UC2(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                             int length);
-
-void splitRow_8UC3(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                             int length);
-
-void splitRow_8UC4(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                         uint8_t out3[],
-                             int length);
-
-void splitRow_32FC2(const float in[],
-                          float out0[],
-                          float out1[],
-                            int length);
-
-void splitRow_32FC3(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                            int length);
-
-void splitRow_32FC4(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                          float out3[],
-                            int length);
 }  // namespace avx512

-
 template<typename isa_tag_t, typename T>
 void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length);

@@ -200,6 +125,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,

 extern template void i420ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* u_row,
                                      const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
+
+template<typename isa_tag_t, typename T, int chs>
+void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
+
+extern template void splitRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
+extern template void splitRowImpl<avx512_tag, float, 2>(avx512_tag, const float* in, std::array<float*, 2>& outs, const int length);
+extern template void splitRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
+extern template void splitRowImpl<avx512_tag, float, 3>(avx512_tag, const float* in, std::array<float*, 3>& outs, const int length);
+extern template void splitRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
+extern template void splitRowImpl<avx512_tag, float, 4>(avx512_tag, const float* in, std::array<float*, 4>& outs, const int length);
+
+template<typename isa_tag_t, typename T, int chs>
+void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
+
+extern template void mergeRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<avx512_tag, float, 2>(avx512_tag, const std::array<const float*, 2>& ins, float* out, const int length);
+extern template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
+extern template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
--- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
+++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
@@ -1267,103 +1267,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar  * src[],
 }

 #endif  // CVKL
-//------------------------------------------------------------------------------
-
-void mergeRow_8UC2(const uint8_t in0[],
-                   const uint8_t in1[],
-                         uint8_t out[],
-                             int length) {
-    mergeRow_8UC2_Impl(in0, in1, out, length);
-}
-
-void mergeRow_8UC3(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                         uint8_t out[],
-                             int length) {
-    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
-}
-
-void mergeRow_8UC4(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                   const uint8_t in3[],
-                         uint8_t out[],
-                             int length) {
-    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
-}
-
-void mergeRow_32FC2(const float in0[],
-                    const float in1[],
-                          float out[],
-                            int length) {
-    mergeRow_32FC2_Impl(in0, in1, out, length);
-}
-
-void mergeRow_32FC3(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                          float out[],
-                            int length) {
-    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
-}
-
-void mergeRow_32FC4(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                    const float in3[],
-                          float out[],
-                            int length) {
-    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
-}
-
-void splitRow_8UC2(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                             int length) {
-    splitRow_8UC2_Impl(in, out0, out1, length);
-}
-
-void splitRow_8UC3(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                             int length) {
-    splitRow_8UC3_Impl(in, out0, out1, out2, length);
-}
-
-void splitRow_8UC4(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                         uint8_t out3[],
-                             int length) {
-    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
-}
-
-void splitRow_32FC2(const float in[],
-                          float out0[],
-                          float out1[],
-                            int length) {
-    splitRow_32FC2_Impl(in, out0, out1, length);
-}
-
-void splitRow_32FC3(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                            int length) {
-    splitRow_32FC3_Impl(in, out0, out1, out2, length);
-}
-
-void splitRow_32FC4(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                          float out3[],
-                            int length) {
-    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
-}

 template void chanToPlaneRowImpl(sse42_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
 template void chanToPlaneRowImpl(sse42_tag, const float* in, const int chan, const int chs, float* out, const int length);
@@ -1372,6 +1275,20 @@ template void nv12ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t*

 template void i420ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* u_row,
                               const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
+// Explicit SSE4.2 instantiations; template arguments are spelled uint8_t to match the extern declarations in the header.
+template void splitRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
+template void splitRowImpl<sse42_tag, float, 2>(sse42_tag, const float* in, std::array<float*, 2>& outs, const int length);
+template void splitRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
+template void splitRowImpl<sse42_tag, float, 3>(sse42_tag, const float* in, std::array<float*, 3>& outs, const int length);
+template void splitRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
+template void splitRowImpl<sse42_tag, float, 4>(sse42_tag, const float* in, std::array<float*, 4>& outs, const int length);
+
+template void mergeRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<sse42_tag, float, 2>(sse42_tag, const std::array<const float*, 2>& ins, float* out, const int length);
+template void mergeRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
+template void mergeRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
+template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
--- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp
+++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp
@@ -106,80 +106,6 @@ void calcRowLinear_32F(float *dst[],
                 const Size & outSz,
                       int    lpi);

-//----------------------------------------------------------------------
-
-void mergeRow_8UC2(const uint8_t in0[],
-                   const uint8_t in1[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_8UC3(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_8UC4(const uint8_t in0[],
-                   const uint8_t in1[],
-                   const uint8_t in2[],
-                   const uint8_t in3[],
-                         uint8_t out[],
-                             int length);
-
-void mergeRow_32FC2(const float in0[],
-                    const float in1[],
-                          float out[],
-                            int length);
-
-void mergeRow_32FC3(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                          float out[],
-                            int length);
-
-void mergeRow_32FC4(const float in0[],
-                    const float in1[],
-                    const float in2[],
-                    const float in3[],
-                          float out[],
-                            int length);
-
-void splitRow_8UC2(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                             int length);
-
-void splitRow_8UC3(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                             int length);
-
-void splitRow_8UC4(const uint8_t in[],
-                         uint8_t out0[],
-                         uint8_t out1[],
-                         uint8_t out2[],
-                         uint8_t out3[],
-                             int length);
-
-void splitRow_32FC2(const float in[],
-                          float out0[],
-                          float out1[],
-                            int length);
-
-void splitRow_32FC3(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                            int length);
-
-void splitRow_32FC4(const float in[],
-                          float out0[],
-                          float out1[],
-                          float out2[],
-                          float out3[],
-                            int length);
-
 template<typename isa_tag_t, typename T>
 void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs,
                        T* out, const int length);
@@ -199,6 +125,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,

 extern template void i420ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* u_row,
                                      const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
+// Row split kernel selected by ISA tag (one input row -> chs output rows); sse42_tag versions are instantiated in the matching .cpp.
+template<typename isa_tag_t, typename T, int chs>
+void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
+
+extern template void splitRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
+extern template void splitRowImpl<sse42_tag, float, 2>(sse42_tag, const float* in, std::array<float*, 2>& outs, const int length);
+extern template void splitRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
+extern template void splitRowImpl<sse42_tag, float, 3>(sse42_tag, const float* in, std::array<float*, 3>& outs, const int length);
+extern template void splitRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
+extern template void splitRowImpl<sse42_tag, float, 4>(sse42_tag, const float* in, std::array<float*, 4>& outs, const int length);
+// Row merge kernel selected by ISA tag (chs input rows -> one output row); sse42_tag versions are instantiated in the matching .cpp.
+template<typename isa_tag_t, typename T, int chs>
+void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
+
+extern template void mergeRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<sse42_tag, float, 2>(sse42_tag, const std::array<const float*, 2>& ins, float* out, const int length);
+extern template void mergeRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
+extern template void mergeRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
+extern template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
--- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
+++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
@@ -42,396 +42,10 @@

 namespace InferenceEngine {
 namespace gapi {
+
+//using namespace kernels;
+
 namespace kernels {
-
-template<typename T, int chs> static
-void mergeRow(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int length) {
-// AVX512 implementation of wide universal intrinsics is slower than AVX2.
-// It is turned off until the cause isn't found out.
-#if 0
-#ifdef HAVE_AVX512
-    if (with_cpu_x86_avx512f()) {
-        if (std::is_same<T, uint8_t>::value && chs == 2) {
-            avx512::mergeRow_8UC2(ins[0], ins[1], out, length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 3) {
-            avx512::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 4) {
-            avx512::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 2) {
-            avx512::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
-                                   reinterpret_cast<const float*>(ins[1]),
-                                   reinterpret_cast<float*>(out), length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 3) {
-            avx512::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
-                                   reinterpret_cast<const float*>(ins[1]),
-                                   reinterpret_cast<const float*>(ins[2]),
-                                   reinterpret_cast<float*>(out), length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 4) {
-            avx512::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
-                                   reinterpret_cast<const float*>(ins[1]),
-                                   reinterpret_cast<const float*>(ins[2]),
-                                   reinterpret_cast<const float*>(ins[3]),
-                                   reinterpret_cast<float*>(out), length);
-            return;
-        }
-    }
-#endif  // HAVE_AVX512
-#endif
-
-#ifdef HAVE_AVX2
-    if (with_cpu_x86_avx2()) {
-        if (std::is_same<T, uint8_t>::value && chs == 2) {
-            avx::mergeRow_8UC2(ins[0], ins[1], out, length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 3) {
-            avx::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 4) {
-            avx::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 2) {
-            avx::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
-                                reinterpret_cast<const float*>(ins[1]),
-                                reinterpret_cast<float*>(out), length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 3) {
-            avx::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
-                                reinterpret_cast<const float*>(ins[1]),
-                                reinterpret_cast<const float*>(ins[2]),
-                                reinterpret_cast<float*>(out), length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 4) {
-            avx::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
-                                reinterpret_cast<const float*>(ins[1]),
-                                reinterpret_cast<const float*>(ins[2]),
-                                reinterpret_cast<const float*>(ins[3]),
-                                reinterpret_cast<float*>(out), length);
-            return;
-        }
-    }
-#endif  // HAVE_AVX2
-
-#ifdef HAVE_SSE
-    if (with_cpu_x86_sse42()) {
-        if (std::is_same<T, uint8_t>::value && chs == 2) {
-            mergeRow_8UC2(ins[0], ins[1], out, length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 3) {
-            mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 4) {
-            mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 2) {
-            mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
-                           reinterpret_cast<const float*>(ins[1]),
-                           reinterpret_cast<float*>(out), length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 3) {
-            mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
-                           reinterpret_cast<const float*>(ins[1]),
-                           reinterpret_cast<const float*>(ins[2]),
-                           reinterpret_cast<float*>(out), length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 4) {
-            mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
-                           reinterpret_cast<const float*>(ins[1]),
-                           reinterpret_cast<const float*>(ins[2]),
-                           reinterpret_cast<const float*>(ins[3]),
-                           reinterpret_cast<float*>(out), length);
-            return;
-        }
-    }
-#endif  // HAVE_SSE
-
-#ifdef HAVE_NEON
-    if (std::is_same<T, uint8_t>::value && chs == 2) {
-        neon::mergeRow_8UC2(ins[0], ins[1], out, length);
-        return;
-    }
-
-    if (std::is_same<T, uint8_t>::value && chs == 3) {
-        neon::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
-        return;
-    }
-
-    if (std::is_same<T, uint8_t>::value && chs == 4) {
-        neon::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
-        return;
-    }
-
-    if (std::is_same<T, float>::value && chs == 2) {
-        neon::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
-                             reinterpret_cast<const float*>(ins[1]),
-                             reinterpret_cast<float*>(out), length);
-        return;
-    }
-
-    if (std::is_same<T, float>::value && chs == 3) {
-        neon::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
-                             reinterpret_cast<const float*>(ins[1]),
-                             reinterpret_cast<const float*>(ins[2]),
-                             reinterpret_cast<float*>(out), length);
-        return;
-    }
-
-    if (std::is_same<T, float>::value && chs == 4) {
-        neon::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
-                             reinterpret_cast<const float*>(ins[1]),
-                             reinterpret_cast<const float*>(ins[2]),
-                             reinterpret_cast<const float*>(ins[3]),
-                             reinterpret_cast<float*>(out), length);
-        return;
-    }
-#endif  // HAVE_NEON
-
-    const T* insT[chs];
-    for (int c = 0; c < chs; c++) {
-        insT[c] = reinterpret_cast<const T*>(ins[c]);
-    }
-    auto outT = reinterpret_cast<T*>(out);
-
-    for (int x = 0; x < length; x++) {
-        for (int c = 0; c < chs; c++) {
-            outT[chs*x + c] = insT[c][x];
-        }
-    }
-}
-
-template<typename T, int chs> static
-void splitRow(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length) {
-#ifdef HAVE_AVX512
-    if (with_cpu_x86_avx512f()) {
-        if (std::is_same<T, uint8_t>::value && chs == 2) {
-            avx512::splitRow_8UC2(in, outs[0], outs[1], length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 3) {
-            avx512::splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 4) {
-            avx512::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 2) {
-            avx512::splitRow_32FC2(reinterpret_cast<const float*>(in),
-                                   reinterpret_cast<float*>(outs[0]),
-                                   reinterpret_cast<float*>(outs[1]),
-                                   length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 3) {
-            avx512::splitRow_32FC3(reinterpret_cast<const float*>(in),
-                                   reinterpret_cast<float*>(outs[0]),
-                                   reinterpret_cast<float*>(outs[1]),
-                                   reinterpret_cast<float*>(outs[2]),
-                                   length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 4) {
-            avx512::splitRow_32FC4(reinterpret_cast<const float*>(in),
-                                   reinterpret_cast<float*>(outs[0]),
-                                   reinterpret_cast<float*>(outs[1]),
-                                   reinterpret_cast<float*>(outs[2]),
-                                   reinterpret_cast<float*>(outs[3]),
-                                   length);
-            return;
-        }
-    }
-#endif  // HAVE_AVX512
-
-#ifdef HAVE_AVX2
-
-    if (with_cpu_x86_avx2()) {
-        if (std::is_same<T, uint8_t>::value && chs == 2) {
-            avx::splitRow_8UC2(in, outs[0], outs[1], length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 3) {
-            avx::splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 4) {
-            avx::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 2) {
-            avx::splitRow_32FC2(reinterpret_cast<const float*>(in),
-                                reinterpret_cast<float*>(outs[0]),
-                                reinterpret_cast<float*>(outs[1]),
-                                length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 3) {
-            avx::splitRow_32FC3(reinterpret_cast<const float*>(in),
-                                reinterpret_cast<float*>(outs[0]),
-                                reinterpret_cast<float*>(outs[1]),
-                                reinterpret_cast<float*>(outs[2]),
-                                length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 4) {
-            avx::splitRow_32FC4(reinterpret_cast<const float*>(in),
-                                reinterpret_cast<float*>(outs[0]),
-                                reinterpret_cast<float*>(outs[1]),
-                                reinterpret_cast<float*>(outs[2]),
-                                reinterpret_cast<float*>(outs[3]),
-                                length);
-            return;
-        }
-    }
-#endif  // HAVE_AVX2
-
-#ifdef HAVE_SSE
-    if (with_cpu_x86_sse42()) {
-        if (std::is_same<T, uint8_t>::value && chs == 2) {
-            splitRow_8UC2(in, outs[0], outs[1], length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 3) {
-            splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
-            return;
-        }
-
-        if (std::is_same<T, uint8_t>::value && chs == 4) {
-            splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 2) {
-            splitRow_32FC2(reinterpret_cast<const float*>(in),
-                           reinterpret_cast<float*>(outs[0]),
-                           reinterpret_cast<float*>(outs[1]),
-                           length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 3) {
-            splitRow_32FC3(reinterpret_cast<const float*>(in),
-                           reinterpret_cast<float*>(outs[0]),
-                           reinterpret_cast<float*>(outs[1]),
-                           reinterpret_cast<float*>(outs[2]),
-                           length);
-            return;
-        }
-
-        if (std::is_same<T, float>::value && chs == 4) {
-            splitRow_32FC4(reinterpret_cast<const float*>(in),
-                           reinterpret_cast<float*>(outs[0]),
-                           reinterpret_cast<float*>(outs[1]),
-                           reinterpret_cast<float*>(outs[2]),
-                           reinterpret_cast<float*>(outs[3]),
-                           length);
-            return;
-        }
-    }
-#endif  // HAVE_SSE
-
-#ifdef HAVE_NEON
-    if (std::is_same<T, uint8_t>::value && chs == 2) {
-        neon::splitRow_8UC2(in, outs[0], outs[1], length);
-        return;
-    }
-
-    if (std::is_same<T, uint8_t>::value && chs == 3) {
-        neon::splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
-        return;
-    }
-
-    if (std::is_same<T, uint8_t>::value && chs == 4) {
-        neon::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
-        return;
-    }
-
-    if (std::is_same<T, float>::value && chs == 2) {
-        neon::splitRow_32FC2(reinterpret_cast<const float*>(in),
-                             reinterpret_cast<float*>(outs[0]),
-                             reinterpret_cast<float*>(outs[1]),
-                             length);
-        return;
-    }
-
-    if (std::is_same<T, float>::value && chs == 3) {
-        neon::splitRow_32FC3(reinterpret_cast<const float*>(in),
-                             reinterpret_cast<float*>(outs[0]),
-                             reinterpret_cast<float*>(outs[1]),
-                             reinterpret_cast<float*>(outs[2]),
-                             length);
-        return;
-    }
-
-    if (std::is_same<T, float>::value && chs == 4) {
-        neon::splitRow_32FC4(reinterpret_cast<const float*>(in),
-                             reinterpret_cast<float*>(outs[0]),
-                             reinterpret_cast<float*>(outs[1]),
-                             reinterpret_cast<float*>(outs[2]),
-                             reinterpret_cast<float*>(outs[3]),
-                             length);
-        return;
-    }
-#endif  // HAVE_NEON
-
-    auto inT = reinterpret_cast<const T*>(in);
-
-    T* outsT[chs];
-    for (int c = 0; c < chs; c++) {
-        outsT[c] = reinterpret_cast<T*>(outs[c]);
-    }
-
-    for (int x = 0; x < length; x++) {
-        for (int c = 0; c < chs; c++) {
-            outsT[c][x] = inT[chs*x + c];
-        }
-    }
-}
-
 namespace {

 struct fp_16_t {
@@ -583,168 +197,108 @@ bool is_cv_type_in_list(const int type_id) {
 }

 namespace {
-
 using merge_supported_types = typelist<uint8_t, int8_t, uint16_t, int16_t, int32_t, float, fp_16_t>;

-template<int chs>
+// Scalar fallback: packs chs planar rows into one interleaved row (out[chs*x + c] = ins[c][x]).
+template<typename T, int chs>
+void mergeRowImpl(scalar_tag, const std::array<const T*, chs>& ins, T* out, const int length) {
+    T* dst = out;  // next interleaved slot to fill
+    for (int px = 0; px < length; ++px) {
+        for (const T* plane : ins) *dst++ = plane[px];
+    }
+}
+// Picks the row-merge function pointer (p_f) for an element type; isa_tag_t selects the implementation, chs the channel count.
+template<typename isa_tag_t, int chs>
 struct typed_merge_row {
-    using p_f = void (*)(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int length);
+    using p_f = void (*)(const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length);

     template <typename type>
-    p_f operator()(type_to_type<type> ) { return mergeRow<type, chs>; }
+    typename std::enable_if<std::is_same<isa_tag_t, scalar_tag>::value ||
+            (!std::is_same<isa_tag_t, scalar_tag>::value && !std::is_same<type, uint8_t>::value &&
+             !std::is_same<type, float>::value), p_f>::type
+    operator()(type_to_type<type> ) {  // enabled for scalar_tag, or for any type other than uint8_t/float
+        return [](const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length) {
+            const auto inT = reinterpret_cast<const std::array<const type*, chs>&>(ins);  // plain auto: copies the chs row pointers
+            auto outT = reinterpret_cast<type*>(out);
+            scalar_tag t;
+            mergeRowImpl<type, chs>(t, inT, outT, length);  // always the scalar_tag overload
+        };
+    }

-    p_f operator()(type_to_type<fp_16_t> ) {
-        static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v),
-                "fp_16_t should be a plain wrap over FP16 implementation type");
-        return mergeRow<decltype(fp_16_t::v), chs>;
+    template<typename tag = isa_tag_t>
+    typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
+    operator()(type_to_type<uint8_t>) {  // non-scalar tag + uint8_t: forwards to the tag's mergeRowImpl
+        return [](const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length) {
+            tag t;
+            mergeRowImpl<tag, uint8_t, chs>(t, ins, out, length);
+        };
+    }
+
+    template<typename tag = isa_tag_t>
+    typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
+    operator()(type_to_type<float>) {  // non-scalar tag + float: reinterprets the byte rows as float rows
+        return [](const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length) {
+            const auto inT = reinterpret_cast<const std::array<const float*, chs>&>(ins);
+            auto outT = reinterpret_cast<float*>(out);
+            tag t;
+            mergeRowImpl<tag, float, chs>(t, inT, outT, length);
+        };
     }
 };

 }  // namespace

-GAPI_FLUID_KERNEL(FMerge2, Merge2, false) {
-    static const int LPI = 4;
-    static const int Window = 1;
-    static void run(const cv::gapi::fluid::View& a,
-                    const cv::gapi::fluid::View& b,
-                          cv::gapi::fluid::Buffer& out) {
-        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(out.meta().depth));
-
-        const auto rowFunc = type_dispatch<merge_supported_types>(out.meta().depth, cv_type_id{}, typed_merge_row<2>{}, nullptr);
-        for (int l = 0; l < out.lpi(); l++) {
-            rowFunc({a.InLineB(l), b.InLineB(l)}, out.OutLineB(l), a.length());
-        }
-    }
-};
-
-GAPI_FLUID_KERNEL(FMerge3, Merge3, false) {
-    static const int LPI = 4;
-    static const int Window = 1;
-    static void run(const cv::gapi::fluid::View& a,
-                    const cv::gapi::fluid::View& b,
-                    const cv::gapi::fluid::View& c,
-                          cv::gapi::fluid::Buffer& out) {
-        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(out.meta().depth));
-
-        const auto rowFunc = type_dispatch<merge_supported_types>(out.meta().depth, cv_type_id{}, typed_merge_row<3>{}, nullptr);
-        for (int l = 0; l < out.lpi(); l++) {
-            rowFunc({a.InLineB(l), b.InLineB(l), c.InLineB(l)}, out.OutLineB(l), a.length());
-        }
-    }
-};
-
-GAPI_FLUID_KERNEL(FMerge4, Merge4, false) {
-    static const int LPI = 4;
-    static const int Window = 1;
-    static void run(const cv::gapi::fluid::View& a,
-                    const cv::gapi::fluid::View& b,
-                    const cv::gapi::fluid::View& c,
-                    const cv::gapi::fluid::View& d,
-                          cv::gapi::fluid::Buffer& out) {
-        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(out.meta().depth));
-
-        const auto rowFunc = type_dispatch<merge_supported_types>(out.meta().depth, cv_type_id{}, typed_merge_row<4>{}, nullptr);
-        for (int l = 0; l < out.lpi(); l++) {
-            rowFunc({a.InLineB(l), b.InLineB(l), c.InLineB(l), d.InLineB(l)}, out.OutLineB(l), a.length());
-        }
-    }
-};
-
-
 namespace {
 using split_supported_types = typelist<uint8_t, int8_t, uint16_t, int16_t, int32_t, float, fp_16_t>;

-template<int chs>
+// Scalar fallback: scatters one interleaved row into chs planar rows (outs[c][x] = in[chs*x + c]).
+template<typename T, int chs>
+void splitRowImpl(scalar_tag, const T* in, std::array<T*, chs>& outs, const int length) {
+    const T* src = in;  // next interleaved element to read
+    for (int px = 0; px < length; ++px) {
+        for (T* plane : outs) plane[px] = *src++;
+    }
+}
+// Picks the row-split function pointer (p_f) for an element type; isa_tag_t selects the implementation, chs the channel count.
+template<typename isa_tag_t, int chs>
 struct typed_split_row {
-    using p_f = void (*)(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length);
+    using p_f = void (*)(const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length);

     template <typename type>
-    p_f operator()(type_to_type<type> ) { return splitRow<type, chs>; }
+    typename std::enable_if<std::is_same<isa_tag_t, scalar_tag>::value ||
+            (!std::is_same<isa_tag_t, scalar_tag>::value && !std::is_same<type, uint8_t>::value &&
+             !std::is_same<type, float>::value), p_f>::type
+    operator()(type_to_type<type> ) {  // enabled for scalar_tag, or for any type other than uint8_t/float
+        return [](const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length) {
+            const auto inT = reinterpret_cast<const type*>(in);
+            auto outT = reinterpret_cast<std::array<type*, chs>&>(outs);  // plain auto: copies the chs row pointers
+            scalar_tag t;
+            splitRowImpl<type, chs>(t, inT, outT, length);  // always the scalar_tag overload
+        };
+    }

-    p_f operator()(type_to_type<fp_16_t> ) {
-        static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v),
-                "fp_16_t should be a plain wrap over FP16 implementation type");
-        return splitRow<decltype(fp_16_t::v), chs>;
+    template<typename tag = isa_tag_t>
+    typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
+    operator()(type_to_type<uint8_t>) {  // non-scalar tag + uint8_t: forwards to the tag's splitRowImpl
+        return [](const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length) {
+            tag t;
+            splitRowImpl<tag, uint8_t, chs>(t, in, outs, length);
+        };
+    }
+
+    template<typename tag = isa_tag_t>
+    typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
+    operator()(type_to_type<float>) {  // non-scalar tag + float: reinterprets the byte rows as float rows
+        return [](const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length) {
+            const auto inT = reinterpret_cast<const float*>(in);
+            auto outT = reinterpret_cast<std::array<float*, chs>&>(outs);
+            tag t;
+            splitRowImpl<tag, float, chs>(t, inT, outT, length);
+        };
     }
 };
-
 }  // namespace

-GAPI_FLUID_KERNEL(FSplit2, Split2, false) {
-    static const int LPI = 4;
-    static const int Window = 1;
-    static void run(const cv::gapi::fluid::View  & in,
-                          cv::gapi::fluid::Buffer& out1,
-                          cv::gapi::fluid::Buffer& out2) {
-        GAPI_DbgAssert(2 == in.meta().chan);
-        GAPI_DbgAssert(1 == out1.meta().chan);
-        GAPI_DbgAssert(1 == out2.meta().chan);
-        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
-        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
-        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(in.meta().depth));
-
-        const auto rowFunc = type_dispatch<split_supported_types>(in.meta().depth, cv_type_id{}, typed_split_row<2>{}, nullptr);
-        for (int i = 0, lpi = out1.lpi(); i < lpi; i++) {
-            std::array<uint8_t*, 2> outs = {out1.OutLineB(i), out2.OutLineB(i)};
-            rowFunc(in.InLineB(i), outs, in.length());
-        }
-    }
-};
-
-GAPI_FLUID_KERNEL(FSplit3, Split3, false) {
-    static const int LPI = 4;
-    static const int Window = 1;
-    static void run(const cv::gapi::fluid::View  & in,
-                          cv::gapi::fluid::Buffer& out1,
-                          cv::gapi::fluid::Buffer& out2,
-                          cv::gapi::fluid::Buffer& out3) {
-        GAPI_DbgAssert(3 == in.meta().chan);
-        GAPI_DbgAssert(1 == out1.meta().chan);
-        GAPI_DbgAssert(1 == out2.meta().chan);
-        GAPI_DbgAssert(1 == out3.meta().chan);
-        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
-        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
-        GAPI_DbgAssert(in.meta().depth == out3.meta().depth);
-
-        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(in.meta().depth));
-
-        const auto rowFunc = type_dispatch<split_supported_types>(in.meta().depth, cv_type_id{}, typed_split_row<3>{}, nullptr);
-        for (int i = 0, lpi = out1.lpi(); i < lpi; i++) {
-            std::array<uint8_t*, 3> outs = {out1.OutLineB(i), out2.OutLineB(i),
-                                            out3.OutLineB(i)};
-            rowFunc(in.InLineB(i), outs, in.length());
-        }
-    }
-};
-
-GAPI_FLUID_KERNEL(FSplit4, Split4, false) {
-    static const int LPI = 4;
-    static const int Window = 1;
-    static void run(const cv::gapi::fluid::View  & in,
-                          cv::gapi::fluid::Buffer& out1,
-                          cv::gapi::fluid::Buffer& out2,
-                          cv::gapi::fluid::Buffer& out3,
-                          cv::gapi::fluid::Buffer& out4) {
-        GAPI_DbgAssert(4 == in.meta().chan);
-        GAPI_DbgAssert(1 == out1.meta().chan);
-        GAPI_DbgAssert(1 == out2.meta().chan);
-        GAPI_DbgAssert(1 == out3.meta().chan);
-        GAPI_DbgAssert(1 == out4.meta().chan);
-        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
-        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
-        GAPI_DbgAssert(in.meta().depth == out3.meta().depth);
-        GAPI_DbgAssert(in.meta().depth == out4.meta().depth);
-        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(in.meta().depth));
-
-        const auto rowFunc = type_dispatch<split_supported_types>(in.meta().depth, cv_type_id{}, typed_split_row<4>{}, nullptr);
-        for (int i = 0, lpi = out1.lpi(); i < lpi; i++) {
-            std::array<uint8_t*, 4> outs = {out1.OutLineB(i), out2.OutLineB(i),
-                                            out3.OutLineB(i), out4.OutLineB(i)};
-            rowFunc(in.InLineB(i), outs, in.length());
-        }
-    }
-};
-
 //----------------------------------------------------------------------
 using isas_set = typelist<
 #ifdef HAVE_AVX512
@@ -1005,36 +559,179 @@ GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) {
        rowFunc(y_rows, u_row, v_row, out_rows, buf_width);
    }
 };
+
+GAPI_FLUID_KERNEL(FSplit2, Split2, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View & in,
+        cv::gapi::fluid::Buffer & out1,
+        cv::gapi::fluid::Buffer & out2) {
+        const auto depth = in.meta().depth;  // input depth; the asserts below require both outputs to match it
+        GAPI_DbgAssert(2 == in.meta().chan);
+        GAPI_DbgAssert(1 == out1.meta().chan);
+        GAPI_DbgAssert(1 == out2.meta().chan);
+        GAPI_DbgAssert(depth == out1.meta().depth);
+        GAPI_DbgAssert(depth == out2.meta().depth);
+        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(depth));
+        const auto splitLine = type_dispatch<split_supported_types>(depth, cv_type_id{}, typed_split_row<isa_tag_t, 2>{}, nullptr);
+        for (int line = 0, lines = out1.lpi(); line < lines; ++line) {
+            std::array<uint8_t*, 2> planes = { out1.OutLineB(line), out2.OutLineB(line) };
+            splitLine(in.InLineB(line), planes, in.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FSplit3, Split3, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View & in,
+        cv::gapi::fluid::Buffer & out1,
+        cv::gapi::fluid::Buffer & out2,
+        cv::gapi::fluid::Buffer & out3) {
+        const auto depth = in.meta().depth;  // input depth; the asserts below require all outputs to match it
+        GAPI_DbgAssert(3 == in.meta().chan);
+        GAPI_DbgAssert(1 == out1.meta().chan);
+        GAPI_DbgAssert(1 == out2.meta().chan);
+        GAPI_DbgAssert(1 == out3.meta().chan);
+        GAPI_DbgAssert(depth == out1.meta().depth);
+        GAPI_DbgAssert(depth == out2.meta().depth);
+        GAPI_DbgAssert(depth == out3.meta().depth);
+        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(depth));
+
+        const auto splitLine = type_dispatch<split_supported_types>(depth, cv_type_id{}, typed_split_row<isa_tag_t, 3>{}, nullptr);
+        for (int line = 0, lines = out1.lpi(); line < lines; ++line) {
+            std::array<uint8_t*, 3> planes = { out1.OutLineB(line), out2.OutLineB(line),
+                                              out3.OutLineB(line) };
+            splitLine(in.InLineB(line), planes, in.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FSplit4, Split4, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View & in,
+        cv::gapi::fluid::Buffer & out1,
+        cv::gapi::fluid::Buffer & out2,
+        cv::gapi::fluid::Buffer & out3,
+        cv::gapi::fluid::Buffer & out4) {
+        const auto depth = in.meta().depth;  // input depth; the asserts below require all outputs to match it
+        GAPI_DbgAssert(4 == in.meta().chan);
+        GAPI_DbgAssert(1 == out1.meta().chan);
+        GAPI_DbgAssert(1 == out2.meta().chan);
+        GAPI_DbgAssert(1 == out3.meta().chan);
+        GAPI_DbgAssert(1 == out4.meta().chan);
+        GAPI_DbgAssert(depth == out1.meta().depth);
+        GAPI_DbgAssert(depth == out2.meta().depth);
+        GAPI_DbgAssert(depth == out3.meta().depth);
+        GAPI_DbgAssert(depth == out4.meta().depth);
+        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(depth));
+        const auto splitLine = type_dispatch<split_supported_types>(depth, cv_type_id{}, typed_split_row<isa_tag_t, 4>{}, nullptr);
+        for (int line = 0, lines = out1.lpi(); line < lines; ++line) {
+            std::array<uint8_t*, 4> planes = { out1.OutLineB(line), out2.OutLineB(line),
+                                              out3.OutLineB(line), out4.OutLineB(line) };
+            splitLine(in.InLineB(line), planes, in.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FMerge2, Merge2, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View & a,
+        const cv::gapi::fluid::View & b,
+        cv::gapi::fluid::Buffer & out) {
+        const auto depth = out.meta().depth;  // the output depth selects the row routine
+        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(depth));
+        const auto mergeLine = type_dispatch<merge_supported_types>(depth, cv_type_id{}, typed_merge_row<isa_tag_t, 2>{}, nullptr);
+        for (int line = 0, lines = out.lpi(); line < lines; ++line) {
+            mergeLine({ a.InLineB(line), b.InLineB(line) }, out.OutLineB(line), a.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FMerge3, Merge3, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View & a,
+        const cv::gapi::fluid::View & b,
+        const cv::gapi::fluid::View & c,
+        cv::gapi::fluid::Buffer & out) {
+        const auto depth = out.meta().depth;  // the output depth selects the row routine
+        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(depth));
+        const auto mergeLine = type_dispatch<merge_supported_types>(depth, cv_type_id{}, typed_merge_row<isa_tag_t, 3>{}, nullptr);
+        for (int line = 0, lines = out.lpi(); line < lines; ++line) {
+            mergeLine({ a.InLineB(line), b.InLineB(line), c.InLineB(line) }, out.OutLineB(line), a.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FMerge4, Merge4, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View & a,
+        const cv::gapi::fluid::View & b,
+        const cv::gapi::fluid::View & c,
+        const cv::gapi::fluid::View & d,
+        cv::gapi::fluid::Buffer & out) {
+        const auto depth = out.meta().depth;  // the output depth selects the row routine
+        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(depth));
+        const auto mergeLine = type_dispatch<merge_supported_types>(depth, cv_type_id{}, typed_merge_row<isa_tag_t, 4>{}, nullptr);
+        for (int line = 0, lines = out.lpi(); line < lines; ++line) {
+            mergeLine({ a.InLineB(line), b.InLineB(line), c.InLineB(line), d.InLineB(line) }, out.OutLineB(line), a.length());
+        }
+    }
+};
 };

 namespace {
-struct ColorConversionISA {
+struct CC_and_MergeISA {  // functor: adds the colour-conversion, ChanToPlane and merge kernels of one ISA to pckg
    cv::gapi::GKernelPackage& pckg;

-    ColorConversionISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}
+    CC_and_MergeISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}  // stores a reference; the package must outlive this object

    template<typename isa_tag_t>
    bool operator()(type_to_type<isa_tag_t>) {
        pckg.include<typename choose_impl<isa_tag_t>::FI420toRGB>();
        pckg.include<typename choose_impl<isa_tag_t>::FNV12toRGB>();
        pckg.include<typename choose_impl<isa_tag_t>::FChanToPlane>();
+        pckg.include<typename choose_impl<isa_tag_t>::FMerge2>();  // merge kernels are registered with the same ISA tag
+        pckg.include<typename choose_impl<isa_tag_t>::FMerge3>();
+        pckg.include<typename choose_impl<isa_tag_t>::FMerge4>();
+        //at the moment type_dispatch requires something to be returned by the lambda
+        return true;
+    }
+};
+
+struct SplitISA {  // functor: adds the split kernels of one ISA to pckg
+    cv::gapi::GKernelPackage& pckg;
+
+    SplitISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}  // stores a reference; the package must outlive this object
+
+    template<typename isa_tag_t>
+    bool operator()(type_to_type<isa_tag_t>) {
+        pckg.include<typename choose_impl<isa_tag_t>::FSplit2>();  // one kernel per supported channel count (2, 3, 4)
+        pckg.include<typename choose_impl<isa_tag_t>::FSplit3>();
+        pckg.include<typename choose_impl<isa_tag_t>::FSplit4>();
        //at the moment type_dispatch requires something to be returned by the lambda
        return true;
    }
 };
 }  //namespace

-cv::gapi::GKernelPackage FColorConversionChooseISA() {
+cv::gapi::GKernelPackage FKernelsChooseISA() {  // builds the ISA-specific package: colour conversion + merge, plus split
    // At the moment AVX512 implementation of wide universal intrinsics is slower than AVX2.
    // So, disable it for now.
    using isas = remove_t<isas_set, avx512_tag>;

-    cv::gapi::GKernelPackage pckg;
-    ColorConversionISA ctpISA{pckg};
+    cv::gapi::GKernelPackage pckg1, pckg2;  // filled by separate dispatches because the two use different ISA lists
+    CC_and_MergeISA ccISA{ pckg1 };
+    SplitISA sISA{ pckg2 };

-    type_dispatch<isas>(is_isa_present{}, ctpISA, false);
+    type_dispatch<isas>(is_isa_present{}, ccISA, false);  // avx512_tag removed from the candidate list
+    type_dispatch<isas_set>(is_isa_present{}, sISA, false);  // NOTE(review): full isas_set, so avx512_tag is NOT removed for split - confirm intended

-    return pckg;
+    return combine(pckg1, pckg2);
 }

 //----------------------------------------------------------------------
@@ -2601,7 +2298,7 @@ using namespace kernels;

 cv::gapi::GKernelPackage preprocKernels() {
    return combine(
-        FColorConversionChooseISA(),
+        FKernelsChooseISA(),
        cv::gapi::kernels
        <FScalePlanes
        , FScalePlanes4
@@ -2612,12 +2309,6 @@ cv::gapi::GKernelPackage preprocKernels() {
        , FUpscalePlaneArea32f
        , FScalePlaneArea8u
        , FScalePlaneArea32f
-        , FMerge2
-        , FMerge3
-        , FMerge4
-        , FSplit2
-        , FSplit3
-        , FSplit4
        , FConvertDepth
        , FSubC
        , FDivC
--- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp
+++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp
@@ -18,360 +18,204 @@ namespace gapi {

 namespace kernels {

-CV_ALWAYS_INLINE void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
-                                         uint8_t out[], int length) {
-    int l = 0;
+template <typename VecT, typename T>  // VecT: vector type providing ::nlanes; T: element type of the rows
+CV_ALWAYS_INLINE void mergeRowC2_Impl(const T in0[], const T in1[],
+                                      T out[], const int length) {
+    int x = 0;  // index of the next element (per input plane) to process

 #if MANUAL_SIMD
-    constexpr int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = VecT::nlanes;
+    GAPI_DbgAssert(length >= nlanes);  // NOTE(review): presumably debug-only; the loop condition below still guards short rows

-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_uint8 r0, r1;
-        r0 = vx_load(&in0[l]);
-        r1 = vx_load(&in1[l]);
-        v_store_interleave(&out[2*l], r0, r1);
-    }
+    VecT r0, r1;
+    for (; length >= nlanes;) {  // body runs at most twice: full vectors, then one overlapping tail vector
+        for (; x <= length - nlanes; x += nlanes) {
+            r0 = vx_load(&in0[x]);
+            r1 = vx_load(&in1[x]);
+            v_store_interleave(&out[2*x], r0, r1);
+        }

-    // to think about how to remove those ifs
-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
+        if (x < length) {  // fewer than nlanes elements are left
+            x = length - nlanes;  // back up so one more full vector ends exactly at length; overlapped elements are rewritten
+            continue;
+        }
+        break;
    }
 #endif

-    for (; l < length; ++l) {
-        out[2*l + 0] = in0[l];
-        out[2*l + 1] = in1[l];
+    for (; x < length; ++x) {  // scalar path: does work only when the vector section is compiled out or length < nlanes
+        out[2*x + 0] = in0[x];
+        out[2*x + 1] = in1[x];
    }
 }

-CV_ALWAYS_INLINE void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
-                                         const uint8_t in2[], uint8_t out[], int length) {
-    int l = 0;
+template <typename VecT, typename T>  // VecT: vector type providing ::nlanes; T: element type of the rows
+CV_ALWAYS_INLINE void mergeRowC3_Impl(const T in0[], const T in1[],
+                                      const T in2[], T out[], const int length) {
+    int x = 0;  // index of the next element (per input plane) to process

 #if MANUAL_SIMD
-    constexpr int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = VecT::nlanes;
+    GAPI_DbgAssert(length >= nlanes);  // NOTE(review): presumably debug-only; the loop condition below still guards short rows

-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_uint8 r0, r1, r2;
-        r0 = vx_load(&in0[l]);
-        r1 = vx_load(&in1[l]);
-        r2 = vx_load(&in2[l]);
-        v_store_interleave(&out[3*l], r0, r1, r2);
-    }
+    VecT r0, r1, r2;
+    for (; length >= nlanes;) {  // body runs at most twice: full vectors, then one overlapping tail vector
+        for (; x <= length - nlanes; x += nlanes) {
+            r0 = vx_load(&in0[x]);
+            r1 = vx_load(&in1[x]);
+            r2 = vx_load(&in2[x]);
+            v_store_interleave(&out[3*x], r0, r1, r2);
+        }

-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
+        if (x < length) {  // fewer than nlanes elements are left
+            x = length - nlanes;  // back up so one more full vector ends exactly at length; overlapped elements are rewritten
+            continue;
+        }
+        break;
    }
 #endif

-    for (; l < length; ++l) {
-        out[3*l + 0] = in0[l];
-        out[3*l + 1] = in1[l];
-        out[3*l + 2] = in2[l];
+    for (; x < length; ++x) {  // scalar path: does work only when the vector section is compiled out or length < nlanes
+        out[3*x + 0] = in0[x];
+        out[3*x + 1] = in1[x];
+        out[3*x + 2] = in2[x];
    }
 }

-CV_ALWAYS_INLINE void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[],
-                                         const uint8_t in2[], const uint8_t in3[],
-                                         uint8_t out[], int length) {
-    int l = 0;
+template <typename VecT, typename T>  // VecT: vector type providing ::nlanes; T: element type of the rows
+CV_ALWAYS_INLINE void mergeRowC4_Impl(const T in0[], const T in1[],
+                                      const T in2[], const T in3[],
+                                      T out[], const int length) {
+    int x = 0;  // index of the next element (per input plane) to process

 #if MANUAL_SIMD
-    constexpr int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = VecT::nlanes;
+    GAPI_DbgAssert(length >= nlanes);  // NOTE(review): presumably debug-only; the loop condition below still guards short rows

-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_uint8 r0, r1, r2, r3;
-        r0 = vx_load(&in0[l]);
-        r1 = vx_load(&in1[l]);
-        r2 = vx_load(&in2[l]);
-        r3 = vx_load(&in3[l]);
-        v_store_interleave(&out[4*l], r0, r1, r2, r3);
-    }
+    VecT r0, r1, r2, r3;
+    for (; length >= nlanes;) {  // body runs at most twice: full vectors, then one overlapping tail vector
+        for (; x <= length - nlanes; x += nlanes) {
+            r0 = vx_load(&in0[x]);
+            r1 = vx_load(&in1[x]);
+            r2 = vx_load(&in2[x]);
+            r3 = vx_load(&in3[x]);
+            v_store_interleave(&out[4* x], r0, r1, r2, r3);
+        }

-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
+        if (x < length) {  // fewer than nlanes elements are left
+            x = length - nlanes;  // back up so one more full vector ends exactly at length; overlapped elements are rewritten
+            continue;
+        }
+        break;
    }
 #endif

-    for (; l < length; ++l) {
-        out[4*l + 0] = in0[l];
-        out[4*l + 1] = in1[l];
-        out[4*l + 2] = in2[l];
-        out[4*l + 3] = in3[l];
+    for (; x < length; ++x) {  // scalar path: does work only when the vector section is compiled out or length < nlanes
+        out[4*x + 0] = in0[x];
+        out[4*x + 1] = in1[x];
+        out[4*x + 2] = in2[x];
+        out[4*x + 3] = in3[x];
    }
 }
-
-CV_ALWAYS_INLINE void mergeRow_32FC2_Impl(const float in0[], const float in1[],
-                                          float out[], int length) {
-    int l = 0;
-
-#if MANUAL_SIMD
-    constexpr int nlanes = v_float32::nlanes;
-
-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_float32 r0, r1;
-        r0 = vx_load(&in0[l]);
-        r1 = vx_load(&in1[l]);
-        v_store_interleave(&out[2*l], r0, r1);
-    }
-
-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
-    }
-#endif
-
-    for (; l < length; ++l) {
-        out[2*l + 0] = in0[l];
-        out[2*l + 1] = in1[l];
-    }
-}
-
-CV_ALWAYS_INLINE void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[],
-                                          float out[], int length) {
-    int l = 0;
-
-#if MANUAL_SIMD
-    constexpr int nlanes = v_float32::nlanes;
-
-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_float32 r0, r1, r2;
-        r0 = vx_load(&in0[l]);
-        r1 = vx_load(&in1[l]);
-        r2 = vx_load(&in2[l]);
-        v_store_interleave(&out[3*l], r0, r1, r2);
-    }
-
-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
-    }
-#endif
-
-    for (; l < length; ++l) {
-        out[3*l + 0] = in0[l];
-        out[3*l + 1] = in1[l];
-        out[3*l + 2] = in2[l];
-    }
-}
-
-CV_ALWAYS_INLINE void mergeRow_32FC4_Impl(const float in0[], const float in1[],
-                                          const float in2[], const float in3[],
-                                          float out[], int length) {
-    int l = 0;
-
-#if MANUAL_SIMD
-    constexpr int nlanes = v_float32::nlanes;
-
-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_float32 r0, r1, r2, r3;
-        r0 = vx_load(&in0[l]);
-        r1 = vx_load(&in1[l]);
-        r2 = vx_load(&in2[l]);
-        r3 = vx_load(&in3[l]);
-        v_store_interleave(&out[4*l], r0, r1, r2, r3);
-    }
-
-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
-    }
-#endif
-
-    for (; l < length; ++l) {
-        out[4*l + 0] = in0[l];
-        out[4*l + 1] = in1[l];
-        out[4*l + 2] = in2[l];
-        out[4*l + 3] = in3[l];
-    }
-}
-
 //------------------------------------------------------------------------------
-
-CV_ALWAYS_INLINE void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
-                                         uint8_t out1[], int length) {
-    int l = 0;
+template <typename VecT, typename T>  // VecT: vector type providing ::nlanes; T: element type of the rows
+CV_ALWAYS_INLINE void splitRowC2_Impl(const T in[], T out0[],
+                                      T out1[], const int length) {
+    int x = 0;  // index of the next element (per output plane) to produce

 #if MANUAL_SIMD
-    constexpr int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = VecT::nlanes;
+    GAPI_DbgAssert(length >= nlanes);  // NOTE(review): presumably debug-only; the loop condition below still guards short rows

-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_uint8 r0, r1;
-        v_load_deinterleave(&in[2*l], r0, r1);
-        vx_store(&out0[l], r0);
-        vx_store(&out1[l], r1);
-    }
+    VecT r0, r1;
+    for (; length >= nlanes;) {  // body runs at most twice: full vectors, then one overlapping tail vector
+        for (; x <= length - nlanes; x += nlanes) {
+            v_load_deinterleave(&in[2*x], r0, r1);
+            vx_store(&out0[x], r0);
+            vx_store(&out1[x], r1);
+        }

-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
+        if (x < length) {  // fewer than nlanes elements are left
+            x = length - nlanes;  // back up so one more full vector ends exactly at length; overlapped elements are rewritten
+            continue;
+        }
+        break;
    }
 #endif

-    for (; l < length; ++l) {
-        out0[l] = in[2*l + 0];
-        out1[l] = in[2*l + 1];
+    for (; x < length; ++x) {  // scalar path: does work only when the vector section is compiled out or length < nlanes
+        out0[x] = in[2*x + 0];
+        out1[x] = in[2*x + 1];
    }
 }

-CV_ALWAYS_INLINE void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
-                                         uint8_t out1[], uint8_t out2[], int length) {
-    int l = 0;
+template <typename VecT, typename T>  // VecT: vector type providing ::nlanes; T: element type of the rows
+CV_ALWAYS_INLINE void splitRowC3_Impl(const T in[], T out0[],
+                                      T out1[], T out2[], const int length) {
+    int x = 0;  // index of the next element (per output plane) to produce

 #if MANUAL_SIMD
-    constexpr int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = VecT::nlanes;
+    GAPI_DbgAssert(length >= nlanes);  // NOTE(review): presumably debug-only; the loop condition below still guards short rows

-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-         v_uint8 r0, r1, r2;
-         v_load_deinterleave(&in[3*l], r0, r1, r2);
-         vx_store(&out0[l], r0);
-         vx_store(&out1[l], r1);
-         vx_store(&out2[l], r2);
-    }
+    VecT r0, r1, r2;
+    for (; length >= nlanes;) {  // body runs at most twice: full vectors, then one overlapping tail vector
+        for (; x <= length - nlanes; x += nlanes) {
+             v_load_deinterleave(&in[3*x], r0, r1, r2);
+             vx_store(&out0[x], r0);
+             vx_store(&out1[x], r1);
+             vx_store(&out2[x], r2);
+        }

-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
+        if (x < length) {  // fewer than nlanes elements are left
+            x = length - nlanes;  // back up so one more full vector ends exactly at length; overlapped elements are rewritten
+            continue;
+        }
+        break;
    }
 #endif

-    for (; l < length; ++l) {
-        out0[l] = in[3*l + 0];
-        out1[l] = in[3*l + 1];
-        out2[l] = in[3*l + 2];
+    for (; x < length; ++x) {  // scalar path: does work only when the vector section is compiled out or length < nlanes
+        out0[x] = in[3*x + 0];
+        out1[x] = in[3*x + 1];
+        out2[x] = in[3*x + 2];
    }
 }

-CV_ALWAYS_INLINE void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[],
-                                         uint8_t out2[], uint8_t out3[], int length) {
-    int l = 0;
+template <typename VecT, typename T>  // VecT: vector type providing ::nlanes; T: element type of the rows
+CV_ALWAYS_INLINE void splitRowC4_Impl(const T in[], T out0[], T out1[],
+                                      T out2[], T out3[], const int length) {
+    int x = 0;  // index of the next element (per output plane) to produce

 #if MANUAL_SIMD
-    constexpr int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = VecT::nlanes;
+    GAPI_DbgAssert(length >= nlanes);  // NOTE(review): presumably debug-only; the loop condition below still guards short rows

-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_uint8 r0, r1, r2, r3;
-        v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
-        vx_store(&out0[l], r0);
-        vx_store(&out1[l], r1);
-        vx_store(&out2[l], r2);
-        vx_store(&out3[l], r3);
-    }
+    VecT r0, r1, r2, r3;
+    for (; length >= nlanes;) {  // body runs at most twice: full vectors, then one overlapping tail vector
+        for (; x <= length - nlanes; x += nlanes) {
+            v_load_deinterleave(&in[4*x], r0, r1, r2, r3);
+            vx_store(&out0[x], r0);
+            vx_store(&out1[x], r1);
+            vx_store(&out2[x], r2);
+            vx_store(&out3[x], r3);
+        }

-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
+        if (x < length) {  // fewer than nlanes elements are left
+            x = length - nlanes;  // back up so one more full vector ends exactly at length; overlapped elements are rewritten
+            continue;
+        }
+        break;
    }
 #endif

-    for (; l < length; ++l) {
-        out0[l] = in[4*l + 0];
-        out1[l] = in[4*l + 1];
-        out2[l] = in[4*l + 2];
-        out3[l] = in[4*l + 3];
+    for (; x < length; ++x) {  // scalar path: does work only when the vector section is compiled out or length < nlanes
+        out0[x] = in[4*x + 0];
+        out1[x] = in[4*x + 1];
+        out2[x] = in[4*x + 2];
+        out3[x] = in[4*x + 3];
    }
 }
-
-CV_ALWAYS_INLINE void splitRow_32FC2_Impl(const float in[], float out0[],
-                                float out1[], int length) {
-    int l = 0;
-
-#if MANUAL_SIMD
-    constexpr int nlanes = v_float32::nlanes;
-
-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_float32 r0, r1;
-        v_load_deinterleave(&in[2*l], r0, r1);
-        vx_store(&out0[l], r0);
-        vx_store(&out1[l], r1);
-    }
-
-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
-    }
-
-#endif
-
-    for (; l < length; ++l) {
-        out0[l] = in[2*l + 0];
-        out1[l] = in[2*l + 1];
-    }
-}
-
-CV_ALWAYS_INLINE void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
-                                          float out2[], int length) {
-    int l = 0;
-
-#if MANUAL_SIMD
-    constexpr int nlanes = v_float32::nlanes;
-
-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_float32 r0, r1, r2;
-        v_load_deinterleave(&in[3*l], r0, r1, r2);
-        vx_store(&out0[l], r0);
-        vx_store(&out1[l], r1);
-        vx_store(&out2[l], r2);
-    }
-
-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
-    }
-#endif
-
-    for (; l < length; ++l) {
-        out0[l] = in[3*l + 0];
-        out1[l] = in[3*l + 1];
-        out2[l] = in[3*l + 2];
-    }
-}
-
-CV_ALWAYS_INLINE void splitRow_32FC4_Impl(const float in[], float out0[], float out1[],
-                                          float out2[], float out3[], int length) {
-    int l = 0;
-
-#if MANUAL_SIMD
-    constexpr int nlanes = v_float32::nlanes;
-
-    cycle:
-    for (; l <= length - nlanes; l += nlanes) {
-        v_float32 r0, r1, r2, r3;
-        v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
-        vx_store(&out0[l], r0);
-        vx_store(&out1[l], r1);
-        vx_store(&out2[l], r2);
-        vx_store(&out3[l], r3);
-    }
-
-    if (l < length && length >= nlanes) {
-        l = length - nlanes;
-        goto cycle;
-    }
-#endif
-
-    for (; l < length; ++l) {
-        out0[l] = in[4*l + 0];
-        out1[l] = in[4*l + 1];
-        out2[l] = in[4*l + 2];
-        out3[l] = in[4*l + 3];
-    }
-}
-
 //------------------------------------------------------------------------------

 CV_ALWAYS_INLINE void uvToRGBuv(const v_uint8& u, const v_uint8& v,
@@ -880,6 +724,38 @@ CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan,
        out[x] = in[x*chs + chan];
    }
 }
+
+template<typename isa_tag_t, typename T, int chs>
+CV_ALWAYS_INLINE void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length) {
+    static_assert(chs > 1 && chs < 5, "This number of channels isn't supported.");
+    using vec_t = vector_type_of_t<isa_tag_t, T>;  // vector type for this ISA tag and element type
+    switch (chs) {  // chs is a template constant, so one case is live per instantiation
+    case 2:
+        splitRowC2_Impl<vec_t, T>(in, outs[0], outs[1], length);
+        break;
+    case 3:
+        splitRowC3_Impl<vec_t, T>(in, outs[0], outs[1], outs[2], length);
+        break;
+    default:  // 4 channels, the only value left by the static_assert
+        splitRowC4_Impl<vec_t, T>(in, outs[0], outs[1], outs[2], outs[3], length);
+    }
+}
+
+template<typename isa_tag_t, typename T, int chs>
+CV_ALWAYS_INLINE void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length) {
+    static_assert(chs > 1 && chs < 5, "This number of channels isn't supported.");
+    using vec_t = vector_type_of_t<isa_tag_t, T>;  // vector type for this ISA tag and element type
+    switch (chs) {  // chs is a template constant, so one case is live per instantiation
+    case 2:
+        mergeRowC2_Impl<vec_t, T>(ins[0], ins[1], out, length);
+        break;
+    case 3:
+        mergeRowC3_Impl<vec_t, T>(ins[0], ins[1], ins[2], out, length);
+        break;
+    default:  // 4 channels, the only value left by the static_assert
+        mergeRowC4_Impl<vec_t, T>(ins[0], ins[1], ins[2], ins[3], out, length);
+    }
+}
 }  // namespace kernels
 }  // namespace gapi
 }  // namespace InferenceEngine
--- a/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp
+++ b/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp
@@ -98,10 +98,30 @@ cv::String typeToString(int type)
    case CV_8UC2  : return "CV_8UC2";
    case CV_8UC3  : return "CV_8UC3";
    case CV_8UC4  : return "CV_8UC4";
+    case CV_16FC1 : return "CV_16FC1";
+    case CV_16FC2 : return "CV_16FC2";
+    case CV_16FC3 : return "CV_16FC3";
+    case CV_16FC4 : return "CV_16FC4";
    case CV_32FC1 : return "CV_32FC1";
    case CV_32FC2 : return "CV_32FC2";
    case CV_32FC3 : return "CV_32FC3";
    case CV_32FC4 : return "CV_32FC4";
+    case CV_8SC1  : return "CV_8SC1";
+    case CV_8SC2  : return "CV_8SC2";
+    case CV_8SC3  : return "CV_8SC3";
+    case CV_8SC4  : return "CV_8SC4";
+    case CV_16SC1 : return "CV_16SC1";
+    case CV_16SC2 : return "CV_16SC2";
+    case CV_16SC3 : return "CV_16SC3";
+    case CV_16SC4 : return "CV_16SC4";
+    case CV_16UC1 : return "CV_16UC1";
+    case CV_16UC2 : return "CV_16UC2";
+    case CV_16UC3 : return "CV_16UC3";
+    case CV_16UC4 : return "CV_16UC4";
+    case CV_32SC1 : return "CV_32SC1";
+    case CV_32SC2 : return "CV_32SC2";
+    case CV_32SC3 : return "CV_32SC3";
+    case CV_32SC4 : return "CV_32SC4";
    }
    CV_Assert(!"ERROR: unsupported type!");
    return nullptr;