diff --git a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp index 8290fda25ed..b0b3281748e 100644 --- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp +++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp @@ -29,67 +29,6 @@ namespace InferenceEngine { namespace gapi { namespace kernels { namespace neon { - -void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[], - uint8_t out[], int length) { - mergeRow_8UC2_Impl(in0, in1, out, length); -} - -void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[], - const uint8_t in2[], uint8_t out[], int length) { - mergeRow_8UC3_Impl(in0, in1, in2, out, length); -} - -void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[], - const uint8_t in3[], uint8_t out[], int length) { - mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length); -} - -void mergeRow_32FC2(const float in0[], const float in1[], - float out[], int length) { - mergeRow_32FC2_Impl(in0, in1, out, length); -} - -void mergeRow_32FC3(const float in0[], const float in1[], const float in2[], - float out[], int length) { - mergeRow_32FC3_Impl(in0, in1, in2, out, length); -} - -void mergeRow_32FC4(const float in0[], const float in1[], - const float in2[], const float in3[], - float out[], int length) { - mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length); -} - -void splitRow_8UC2(const uint8_t in[], uint8_t out0[], - uint8_t out1[], int length) { - splitRow_8UC2_Impl(in, out0, out1, length); -} - -void splitRow_8UC3(const uint8_t in[], uint8_t out0[], - uint8_t out1[], uint8_t out2[], int length) { - splitRow_8UC3_Impl(in, out0, out1, out2, length); -} - -void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[], - uint8_t out2[], uint8_t out3[], int length) { - splitRow_8UC4_Impl(in, out0, out1, out2, out3, length); -} - -void splitRow_32FC2(const 
float in[], float out0[], float out1[], int length) { - splitRow_32FC2_Impl(in, out0, out1, length); -} - -void splitRow_32FC3(const float in[], float out0[], float out1[], - float out2[], int length) { - splitRow_32FC3_Impl(in, out0, out1, out2, length); -} - -void splitRow_32FC4(const float in[], float out0[], float out1[], - float out2[], float out3[], int length) { - splitRow_32FC4_Impl(in, out0, out1, out2, out3, length); -} - void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[], @@ -693,6 +632,20 @@ template void nv12ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* template void i420ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row, uint8_t** out_rows, const int buf_width); + +template void splitRowImpl(neon_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(neon_tag, const float* in, std::array& outs, const int length); +template void splitRowImpl(neon_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(neon_tag, const float* in, std::array& outs, const int length); +template void splitRowImpl(neon_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(neon_tag, const float* in, std::array& outs, const int length); + +template void mergeRowImpl(neon_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(neon_tag, const std::array& ins, float* out, const int length); +template void mergeRowImpl(neon_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(neon_tag, const std::array& ins, float* out, const int length); +template void mergeRowImpl(neon_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(neon_tag, const std::array& ins, float* out, const int length); } // 
namespace kernels } // namespace gapi } // namespace InferenceEngine diff --git a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp index 4e0b82a6259..5f6d1b783db 100644 --- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp +++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp @@ -92,87 +92,6 @@ void calcRowLinear_32F(float *dst[], const Size& inSz, const Size& outSz, int lpi); - -//---------------------------------------------------------------------- - -void mergeRow_8UC2(const uint8_t in0[], - const uint8_t in1[], - uint8_t out[], - int length); - -void mergeRow_8UC3(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - uint8_t out[], - int length); - -void mergeRow_8UC4(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - const uint8_t in3[], - uint8_t out[], - int length); - -void mergeRow_32FC2(const float in0[], - const float in1[], - float out[], - int length); - -void mergeRow_32FC3(const float in0[], - const float in1[], - const float in2[], - float out[], - int length); - -void mergeRow_32FC4(const float in0[], - const float in1[], - const float in2[], - const float in3[], - float out[], - int length); - -void splitRow_8UC2(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - int length); - -void splitRow_8UC3(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - int length); - -void splitRow_8UC4(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - uint8_t out3[], - int length); - -void splitRow_32FC2(const float in[], - float out0[], - float out1[], - int length); - -void splitRow_32FC3(const float in[], - float out0[], - float out1[], - float out2[], - int length); - -void splitRow_32FC4(const float in[], - float out0[], - float out1[], - float out2[], - float out3[], - int length); - -void 
calculate_i420_to_rgb(const uchar **srcY, - const uchar *srcU, - const uchar *srcV, - uchar **dstRGBx, - int width); - } // namespace neon template @@ -192,6 +111,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row, extern template void i420ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row, uint8_t** out_rows, const int buf_width); + +template +void splitRowImpl(isa_tag_t, const T* in, std::array& outs, const int length); + +extern template void splitRowImpl(neon_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(neon_tag, const float* in, std::array& outs, const int length); +extern template void splitRowImpl(neon_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(neon_tag, const float* in, std::array& outs, const int length); +extern template void splitRowImpl(neon_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(neon_tag, const float* in, std::array& outs, const int length); + +template +void mergeRowImpl(isa_tag_t, const std::array& ins, T* out, const int length); + +extern template void mergeRowImpl(neon_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(neon_tag, const std::array& ins, float* out, const int length); +extern template void mergeRowImpl(neon_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(neon_tag, const std::array& ins, float* out, const int length); +extern template void mergeRowImpl(neon_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(neon_tag, const std::array& ins, float* out, const int length); } // namespace kernels } // namespace gapi } // namespace InferenceEngine diff --git a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp 
b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp index aefd2e2dfc3..d5b3d2664f4 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp @@ -47,66 +47,6 @@ namespace kernels { namespace avx { -void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[], - uint8_t out[], int length) { - mergeRow_8UC2_Impl(in0, in1, out, length); -} - -void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[], - const uint8_t in2[], uint8_t out[], int length) { - mergeRow_8UC3_Impl(in0, in1, in2, out, length); -} - -void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[], - const uint8_t in3[], uint8_t out[], int length) { - mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length); -} - -void mergeRow_32FC2(const float in0[], const float in1[], - float out[], int length) { - mergeRow_32FC2_Impl(in0, in1, out, length); -} - -void mergeRow_32FC3(const float in0[], const float in1[], const float in2[], - float out[], int length) { - mergeRow_32FC3_Impl(in0, in1, in2, out, length); -} - -void mergeRow_32FC4(const float in0[], const float in1[], - const float in2[], const float in3[], - float out[], int length) { - mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length); -} - -void splitRow_8UC2(const uint8_t in[], uint8_t out0[], - uint8_t out1[], int length) { - splitRow_8UC2_Impl(in, out0, out1, length); -} - -void splitRow_8UC3(const uint8_t in[], uint8_t out0[], - uint8_t out1[], uint8_t out2[], int length) { - splitRow_8UC3_Impl(in, out0, out1, out2, length); -} - -void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[], - uint8_t out2[], uint8_t out3[], int length) { - splitRow_8UC4_Impl(in, out0, out1, out2, out3, length); -} - -void splitRow_32FC2(const float in[], float out0[], float out1[], int length) { - splitRow_32FC2_Impl(in, out0, out1, length); -} - -void splitRow_32FC3(const 
float in[], float out0[], float out1[], - float out2[], int length) { - splitRow_32FC3_Impl(in, out0, out1, out2, length); -} - -void splitRow_32FC4(const float in[], float out0[], float out1[], - float out2[], float out3[], int length) { - splitRow_32FC4_Impl(in, out0, out1, out2, out3, length); -} - void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[], @@ -562,6 +502,20 @@ template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* template void i420ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row, uint8_t** out_rows, const int buf_width); + +template void splitRowImpl(avx2_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(avx2_tag, const float* in, std::array& outs, const int length); +template void splitRowImpl(avx2_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(avx2_tag, const float* in, std::array& outs, const int length); +template void splitRowImpl(avx2_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(avx2_tag, const float* in, std::array& outs, const int length); + +template void mergeRowImpl(avx2_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(avx2_tag, const std::array& ins, float* out, const int length); +template void mergeRowImpl(avx2_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(avx2_tag, const std::array& ins, float* out, const int length); +template void mergeRowImpl(avx2_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(avx2_tag, const std::array& ins, float* out, const int length); } // namespace kernels } // namespace gapi } // namespace InferenceEngine diff --git 
a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp index 512121b0f3d..5784abe0ea1 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.hpp @@ -106,84 +106,8 @@ void calcRowLinear_32F(float *dst[], const Size & inSz, const Size & outSz, int lpi); - -//---------------------------------------------------------------------- - - -void mergeRow_8UC2(const uint8_t in0[], - const uint8_t in1[], - uint8_t out[], - int length); - -void mergeRow_8UC3(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - uint8_t out[], - int length); - -void mergeRow_8UC4(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - const uint8_t in3[], - uint8_t out[], - int length); - -void mergeRow_32FC2(const float in0[], - const float in1[], - float out[], - int length); - -void mergeRow_32FC3(const float in0[], - const float in1[], - const float in2[], - float out[], - int length); - -void mergeRow_32FC4(const float in0[], - const float in1[], - const float in2[], - const float in3[], - float out[], - int length); - -void splitRow_8UC2(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - int length); - -void splitRow_8UC3(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - int length); - -void splitRow_8UC4(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - uint8_t out3[], - int length); - -void splitRow_32FC2(const float in[], - float out0[], - float out1[], - int length); - -void splitRow_32FC3(const float in[], - float out0[], - float out1[], - float out2[], - int length); - -void splitRow_32FC4(const float in[], - float out0[], - float out1[], - float out2[], - float out3[], - int length); } // namespace avx - template void chanToPlaneRowImpl(isa_tag_t, const T* 
in, const int chan, const int chs, T* out, const int length); @@ -192,7 +116,7 @@ extern template void chanToPlaneRowImpl(avx2_tag, const float* in, const int c template void nv12ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* uv_row, - uint8_t** out_rows, const int buf_width); + uint8_t** out_rows, const int buf_width); extern template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, @@ -200,10 +124,30 @@ extern template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, template void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row, - const uint8_t* v_row, uint8_t** out_rows, const int buf_width); + const uint8_t* v_row, uint8_t** out_rows, const int buf_width); extern template void i420ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row, uint8_t** out_rows, const int buf_width); + +template +void splitRowImpl(isa_tag_t, const T* in, std::array& outs, const int length); + +extern template void splitRowImpl(avx2_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(avx2_tag, const float* in, std::array& outs, const int length); +extern template void splitRowImpl(avx2_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(avx2_tag, const float* in, std::array& outs, const int length); +extern template void splitRowImpl(avx2_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(avx2_tag, const float* in, std::array& outs, const int length); + +template +void mergeRowImpl(isa_tag_t, const std::array& ins, T* out, const int length); + +extern template void mergeRowImpl(avx2_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(avx2_tag, const std::array& ins, float* out, const int length); +extern template void mergeRowImpl(avx2_tag, const std::array& ins, uint8_t* out, 
const int length); +extern template void mergeRowImpl(avx2_tag, const std::array& ins, float* out, const int length); +extern template void mergeRowImpl(avx2_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(avx2_tag, const std::array& ins, float* out, const int length); } // namespace kernels } // namespace gapi } // namespace InferenceEngine diff --git a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp index 78b74f532d6..ec8b52d3aad 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp @@ -41,66 +41,6 @@ namespace kernels { namespace avx512 { -void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[], - uint8_t out[], int length) { - mergeRow_8UC2_Impl(in0, in1, out, length); -} - -void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[], - const uint8_t in2[], uint8_t out[], int length) { - mergeRow_8UC3_Impl(in0, in1, in2, out, length); -} - -void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[], - const uint8_t in3[], uint8_t out[], int length) { - mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length); -} - -void mergeRow_32FC2(const float in0[], const float in1[], - float out[], int length) { - mergeRow_32FC2_Impl(in0, in1, out, length); -} - -void mergeRow_32FC3(const float in0[], const float in1[], const float in2[], - float out[], int length) { - mergeRow_32FC3_Impl(in0, in1, in2, out, length); -} - -void mergeRow_32FC4(const float in0[], const float in1[], - const float in2[], const float in3[], - float out[], int length) { - mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length); -} - -void splitRow_8UC2(const uint8_t in[], uint8_t out0[], - uint8_t out1[], int length) { - splitRow_8UC2_Impl(in, out0, out1, length); 
-} - -void splitRow_8UC3(const uint8_t in[], uint8_t out0[], - uint8_t out1[], uint8_t out2[], int length) { - splitRow_8UC3_Impl(in, out0, out1, out2, length); -} - -void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[], - uint8_t out2[], uint8_t out3[], int length) { - splitRow_8UC4_Impl(in, out0, out1, out2, out3, length); -} - -void splitRow_32FC2(const float in[], float out0[], float out1[], int length) { - splitRow_32FC2_Impl(in, out0, out1, length); -} - -void splitRow_32FC3(const float in[], float out0[], float out1[], - float out2[], int length) { - splitRow_32FC3_Impl(in, out0, out1, out2, length); -} - -void splitRow_32FC4(const float in[], float out0[], float out1[], - float out2[], float out3[], int length) { - splitRow_32FC4_Impl(in, out0, out1, out2, out3, length); -} - void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[], @@ -632,7 +572,6 @@ void calcRowLinear_32F(float *dst[], int lpi) { calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi); } - } // namespace avx512 template void chanToPlaneRowImpl(avx512_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length); @@ -642,6 +581,20 @@ template void nv12ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t template void i420ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row, uint8_t** out_rows, const int buf_width); + +template void splitRowImpl(avx512_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(avx512_tag, const float* in, std::array& outs, const int length); +template void splitRowImpl(avx512_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(avx512_tag, const float* in, std::array& outs, const int length); +template void splitRowImpl(avx512_tag, const uint8_t* in, 
std::array& outs, const int length); +template void splitRowImpl(avx512_tag, const float* in, std::array& outs, const int length); + +template void mergeRowImpl(avx512_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(avx512_tag, const std::array& ins, float* out, const int length); +template void mergeRowImpl(avx512_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(avx512_tag, const std::array& ins, float* out, const int length); +template void mergeRowImpl(avx512_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(avx512_tag, const std::array& ins, float* out, const int length); } // namespace kernels } // namespace gapi } // namespace InferenceEngine diff --git a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp index 8d2778781a3..23da40a8089 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp @@ -106,83 +106,8 @@ void calcRowLinear_32F(float *dst[], const Size & inSz, const Size & outSz, int lpi); - -//---------------------------------------------------------------------- - -void mergeRow_8UC2(const uint8_t in0[], - const uint8_t in1[], - uint8_t out[], - int length); - -void mergeRow_8UC3(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - uint8_t out[], - int length); - -void mergeRow_8UC4(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - const uint8_t in3[], - uint8_t out[], - int length); - -void mergeRow_32FC2(const float in0[], - const float in1[], - float out[], - int length); - -void mergeRow_32FC3(const float in0[], - const float in1[], - const float in2[], - float out[], - int length); - -void mergeRow_32FC4(const float in0[], - const 
float in1[], - const float in2[], - const float in3[], - float out[], - int length); - -void splitRow_8UC2(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - int length); - -void splitRow_8UC3(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - int length); - -void splitRow_8UC4(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - uint8_t out3[], - int length); - -void splitRow_32FC2(const float in[], - float out0[], - float out1[], - int length); - -void splitRow_32FC3(const float in[], - float out0[], - float out1[], - float out2[], - int length); - -void splitRow_32FC4(const float in[], - float out0[], - float out1[], - float out2[], - float out3[], - int length); } // namespace avx512 - template void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length); @@ -200,6 +125,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row, extern template void i420ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row, uint8_t** out_rows, const int buf_width); + +template +void splitRowImpl(isa_tag_t, const T* in, std::array& outs, const int length); + +extern template void splitRowImpl(avx512_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(avx512_tag, const float* in, std::array& outs, const int length); +extern template void splitRowImpl(avx512_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(avx512_tag, const float* in, std::array& outs, const int length); +extern template void splitRowImpl(avx512_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(avx512_tag, const float* in, std::array& outs, const int length); + +template +void mergeRowImpl(isa_tag_t, const std::array& ins, T* out, const int length); + +extern template void mergeRowImpl(avx512_tag, const std::array& ins, 
uint8_t* out, const int length); +extern template void mergeRowImpl(avx512_tag, const std::array& ins, float* out, const int length); +extern template void mergeRowImpl(avx512_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(avx512_tag, const std::array& ins, float* out, const int length); +extern template void mergeRowImpl(avx512_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(avx512_tag, const std::array& ins, float* out, const int length); } // namespace kernels } // namespace gapi } // namespace InferenceEngine diff --git a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp index 4823e335371..d3d448b9279 100644 --- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp @@ -1267,103 +1267,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[], } #endif // CVKL -//------------------------------------------------------------------------------ - -void mergeRow_8UC2(const uint8_t in0[], - const uint8_t in1[], - uint8_t out[], - int length) { - mergeRow_8UC2_Impl(in0, in1, out, length); -} - -void mergeRow_8UC3(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - uint8_t out[], - int length) { - mergeRow_8UC3_Impl(in0, in1, in2, out, length); -} - -void mergeRow_8UC4(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - const uint8_t in3[], - uint8_t out[], - int length) { - mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length); -} - -void mergeRow_32FC2(const float in0[], - const float in1[], - float out[], - int length) { - mergeRow_32FC2_Impl(in0, in1, out, length); -} - -void mergeRow_32FC3(const float in0[], - const float in1[], - const float in2[], - float out[], - int length) { - 
mergeRow_32FC3_Impl(in0, in1, in2, out, length); -} - -void mergeRow_32FC4(const float in0[], - const float in1[], - const float in2[], - const float in3[], - float out[], - int length) { - mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length); -} - -void splitRow_8UC2(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - int length) { - splitRow_8UC2_Impl(in, out0, out1, length); -} - -void splitRow_8UC3(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - int length) { - splitRow_8UC3_Impl(in, out0, out1, out2, length); -} - -void splitRow_8UC4(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - uint8_t out3[], - int length) { - splitRow_8UC4_Impl(in, out0, out1, out2, out3, length); -} - -void splitRow_32FC2(const float in[], - float out0[], - float out1[], - int length) { - splitRow_32FC2_Impl(in, out0, out1, length); -} - -void splitRow_32FC3(const float in[], - float out0[], - float out1[], - float out2[], - int length) { - splitRow_32FC3_Impl(in, out0, out1, out2, length); -} - -void splitRow_32FC4(const float in[], - float out0[], - float out1[], - float out2[], - float out3[], - int length) { - splitRow_32FC4_Impl(in, out0, out1, out2, out3, length); -} template void chanToPlaneRowImpl(sse42_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length); template void chanToPlaneRowImpl(sse42_tag, const float* in, const int chan, const int chs, float* out, const int length); @@ -1372,6 +1275,20 @@ template void nv12ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* template void i420ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row, uint8_t** out_rows, const int buf_width); + +template void splitRowImpl(sse42_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(sse42_tag, const float* in, std::array& outs, const int length); +template void splitRowImpl(sse42_tag, const uint8_t* in, std::array& 
outs, const int length); +template void splitRowImpl(sse42_tag, const float* in, std::array& outs, const int length); +template void splitRowImpl(sse42_tag, const uint8_t* in, std::array& outs, const int length); +template void splitRowImpl(sse42_tag, const float* in, std::array& outs, const int length); + +template void mergeRowImpl(sse42_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(sse42_tag, const std::array& ins, float* out, const int length); +template void mergeRowImpl(sse42_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(sse42_tag, const std::array& ins, float* out, const int length); +template void mergeRowImpl(sse42_tag, const std::array& ins, uint8_t* out, const int length); +template void mergeRowImpl(sse42_tag, const std::array& ins, float* out, const int length); } // namespace kernels } // namespace gapi } // namespace InferenceEngine diff --git a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp index 8726013357e..6dfa51515e7 100644 --- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp +++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp @@ -106,80 +106,6 @@ void calcRowLinear_32F(float *dst[], const Size & outSz, int lpi); -//---------------------------------------------------------------------- - -void mergeRow_8UC2(const uint8_t in0[], - const uint8_t in1[], - uint8_t out[], - int length); - -void mergeRow_8UC3(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - uint8_t out[], - int length); - -void mergeRow_8UC4(const uint8_t in0[], - const uint8_t in1[], - const uint8_t in2[], - const uint8_t in3[], - uint8_t out[], - int length); - -void mergeRow_32FC2(const float in0[], - const float in1[], - float out[], - int length); - -void 
mergeRow_32FC3(const float in0[], - const float in1[], - const float in2[], - float out[], - int length); - -void mergeRow_32FC4(const float in0[], - const float in1[], - const float in2[], - const float in3[], - float out[], - int length); - -void splitRow_8UC2(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - int length); - -void splitRow_8UC3(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - int length); - -void splitRow_8UC4(const uint8_t in[], - uint8_t out0[], - uint8_t out1[], - uint8_t out2[], - uint8_t out3[], - int length); - -void splitRow_32FC2(const float in[], - float out0[], - float out1[], - int length); - -void splitRow_32FC3(const float in[], - float out0[], - float out1[], - float out2[], - int length); - -void splitRow_32FC4(const float in[], - float out0[], - float out1[], - float out2[], - float out3[], - int length); - template void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length); @@ -199,6 +125,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row, extern template void i420ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row, uint8_t** out_rows, const int buf_width); + +template +void splitRowImpl(isa_tag_t, const T* in, std::array& outs, const int length); + +extern template void splitRowImpl(sse42_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(sse42_tag, const float* in, std::array& outs, const int length); +extern template void splitRowImpl(sse42_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(sse42_tag, const float* in, std::array& outs, const int length); +extern template void splitRowImpl(sse42_tag, const uint8_t* in, std::array& outs, const int length); +extern template void splitRowImpl(sse42_tag, const float* in, std::array& outs, const int length); + +template +void 
mergeRowImpl(isa_tag_t, const std::array& ins, T* out, const int length); + +extern template void mergeRowImpl(sse42_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(sse42_tag, const std::array& ins, float* out, const int length); +extern template void mergeRowImpl(sse42_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(sse42_tag, const std::array& ins, float* out, const int length); +extern template void mergeRowImpl(sse42_tag, const std::array& ins, uint8_t* out, const int length); +extern template void mergeRowImpl(sse42_tag, const std::array& ins, float* out, const int length); } // namespace kernels } // namespace gapi } // namespace InferenceEngine diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index 4cf4fffd74b..ffc62705c99 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -42,396 +42,10 @@ namespace InferenceEngine { namespace gapi { + +//using namespace kernels; + namespace kernels { - -template static -void mergeRow(const std::array& ins, uint8_t* out, int length) { -// AVX512 implementation of wide universal intrinsics is slower than AVX2. -// It is turned off until the cause isn't found out. 
-#if 0 -#ifdef HAVE_AVX512 - if (with_cpu_x86_avx512f()) { - if (std::is_same::value && chs == 2) { - avx512::mergeRow_8UC2(ins[0], ins[1], out, length); - return; - } - - if (std::is_same::value && chs == 3) { - avx512::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length); - return; - } - - if (std::is_same::value && chs == 4) { - avx512::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length); - return; - } - - if (std::is_same::value && chs == 2) { - avx512::mergeRow_32FC2(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(out), length); - return; - } - - if (std::is_same::value && chs == 3) { - avx512::mergeRow_32FC3(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(ins[2]), - reinterpret_cast(out), length); - return; - } - - if (std::is_same::value && chs == 4) { - avx512::mergeRow_32FC4(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(ins[2]), - reinterpret_cast(ins[3]), - reinterpret_cast(out), length); - return; - } - } -#endif // HAVE_AVX512 -#endif - -#ifdef HAVE_AVX2 - if (with_cpu_x86_avx2()) { - if (std::is_same::value && chs == 2) { - avx::mergeRow_8UC2(ins[0], ins[1], out, length); - return; - } - - if (std::is_same::value && chs == 3) { - avx::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length); - return; - } - - if (std::is_same::value && chs == 4) { - avx::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length); - return; - } - - if (std::is_same::value && chs == 2) { - avx::mergeRow_32FC2(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(out), length); - return; - } - - if (std::is_same::value && chs == 3) { - avx::mergeRow_32FC3(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(ins[2]), - reinterpret_cast(out), length); - return; - } - - if (std::is_same::value && chs == 4) { - avx::mergeRow_32FC4(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(ins[2]), - reinterpret_cast(ins[3]), - reinterpret_cast(out), 
length); - return; - } - } -#endif // HAVE_AVX2 - -#ifdef HAVE_SSE - if (with_cpu_x86_sse42()) { - if (std::is_same::value && chs == 2) { - mergeRow_8UC2(ins[0], ins[1], out, length); - return; - } - - if (std::is_same::value && chs == 3) { - mergeRow_8UC3(ins[0], ins[1], ins[2], out, length); - return; - } - - if (std::is_same::value && chs == 4) { - mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length); - return; - } - - if (std::is_same::value && chs == 2) { - mergeRow_32FC2(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(out), length); - return; - } - - if (std::is_same::value && chs == 3) { - mergeRow_32FC3(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(ins[2]), - reinterpret_cast(out), length); - return; - } - - if (std::is_same::value && chs == 4) { - mergeRow_32FC4(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(ins[2]), - reinterpret_cast(ins[3]), - reinterpret_cast(out), length); - return; - } - } -#endif // HAVE_SSE - -#ifdef HAVE_NEON - if (std::is_same::value && chs == 2) { - neon::mergeRow_8UC2(ins[0], ins[1], out, length); - return; - } - - if (std::is_same::value && chs == 3) { - neon::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length); - return; - } - - if (std::is_same::value && chs == 4) { - neon::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length); - return; - } - - if (std::is_same::value && chs == 2) { - neon::mergeRow_32FC2(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(out), length); - return; - } - - if (std::is_same::value && chs == 3) { - neon::mergeRow_32FC3(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(ins[2]), - reinterpret_cast(out), length); - return; - } - - if (std::is_same::value && chs == 4) { - neon::mergeRow_32FC4(reinterpret_cast(ins[0]), - reinterpret_cast(ins[1]), - reinterpret_cast(ins[2]), - reinterpret_cast(ins[3]), - reinterpret_cast(out), length); - return; - } -#endif // HAVE_NEON 
- - const T* insT[chs]; - for (int c = 0; c < chs; c++) { - insT[c] = reinterpret_cast(ins[c]); - } - auto outT = reinterpret_cast(out); - - for (int x = 0; x < length; x++) { - for (int c = 0; c < chs; c++) { - outT[chs*x + c] = insT[c][x]; - } - } -} - -template static -void splitRow(const uint8_t* in, std::array& outs, int length) { -#ifdef HAVE_AVX512 - if (with_cpu_x86_avx512f()) { - if (std::is_same::value && chs == 2) { - avx512::splitRow_8UC2(in, outs[0], outs[1], length); - return; - } - - if (std::is_same::value && chs == 3) { - avx512::splitRow_8UC3(in, outs[0], outs[1], outs[2], length); - return; - } - - if (std::is_same::value && chs == 4) { - avx512::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length); - return; - } - - if (std::is_same::value && chs == 2) { - avx512::splitRow_32FC2(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - length); - return; - } - - if (std::is_same::value && chs == 3) { - avx512::splitRow_32FC3(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - reinterpret_cast(outs[2]), - length); - return; - } - - if (std::is_same::value && chs == 4) { - avx512::splitRow_32FC4(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - reinterpret_cast(outs[2]), - reinterpret_cast(outs[3]), - length); - return; - } - } -#endif // HAVE_AVX512 - -#ifdef HAVE_AVX2 - - if (with_cpu_x86_avx2()) { - if (std::is_same::value && chs == 2) { - avx::splitRow_8UC2(in, outs[0], outs[1], length); - return; - } - - if (std::is_same::value && chs == 3) { - avx::splitRow_8UC3(in, outs[0], outs[1], outs[2], length); - return; - } - - if (std::is_same::value && chs == 4) { - avx::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length); - return; - } - - if (std::is_same::value && chs == 2) { - avx::splitRow_32FC2(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - length); - return; - } - - if (std::is_same::value && chs == 3) { 
- avx::splitRow_32FC3(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - reinterpret_cast(outs[2]), - length); - return; - } - - if (std::is_same::value && chs == 4) { - avx::splitRow_32FC4(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - reinterpret_cast(outs[2]), - reinterpret_cast(outs[3]), - length); - return; - } - } -#endif // HAVE_AVX2 - -#ifdef HAVE_SSE - if (with_cpu_x86_sse42()) { - if (std::is_same::value && chs == 2) { - splitRow_8UC2(in, outs[0], outs[1], length); - return; - } - - if (std::is_same::value && chs == 3) { - splitRow_8UC3(in, outs[0], outs[1], outs[2], length); - return; - } - - if (std::is_same::value && chs == 4) { - splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length); - return; - } - - if (std::is_same::value && chs == 2) { - splitRow_32FC2(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - length); - return; - } - - if (std::is_same::value && chs == 3) { - splitRow_32FC3(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - reinterpret_cast(outs[2]), - length); - return; - } - - if (std::is_same::value && chs == 4) { - splitRow_32FC4(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - reinterpret_cast(outs[2]), - reinterpret_cast(outs[3]), - length); - return; - } - } -#endif // HAVE_SSE - -#ifdef HAVE_NEON - if (std::is_same::value && chs == 2) { - neon::splitRow_8UC2(in, outs[0], outs[1], length); - return; - } - - if (std::is_same::value && chs == 3) { - neon::splitRow_8UC3(in, outs[0], outs[1], outs[2], length); - return; - } - - if (std::is_same::value && chs == 4) { - neon::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length); - return; - } - - if (std::is_same::value && chs == 2) { - neon::splitRow_32FC2(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - length); - return; - } - - if (std::is_same::value && chs == 3) { - 
neon::splitRow_32FC3(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - reinterpret_cast(outs[2]), - length); - return; - } - - if (std::is_same::value && chs == 4) { - neon::splitRow_32FC4(reinterpret_cast(in), - reinterpret_cast(outs[0]), - reinterpret_cast(outs[1]), - reinterpret_cast(outs[2]), - reinterpret_cast(outs[3]), - length); - return; - } -#endif // HAVE_NEON - - auto inT = reinterpret_cast(in); - - T* outsT[chs]; - for (int c = 0; c < chs; c++) { - outsT[c] = reinterpret_cast(outs[c]); - } - - for (int x = 0; x < length; x++) { - for (int c = 0; c < chs; c++) { - outsT[c][x] = inT[chs*x + c]; - } - } -} - namespace { struct fp_16_t { @@ -583,168 +197,108 @@ bool is_cv_type_in_list(const int type_id) { } namespace { - using merge_supported_types = typelist; -template +template +void mergeRowImpl(scalar_tag, const std::array& ins, T* out, const int length) { + for (int x = 0; x < length; ++x) { + for (int c = 0; c < chs; ++c) { + out[chs * x + c] = ins[c][x]; + } + } +} + +template struct typed_merge_row { - using p_f = void (*)(const std::array& ins, uint8_t* out, int length); + using p_f = void (*)(const std::array& ins, uint8_t* out, const int length); template - p_f operator()(type_to_type ) { return mergeRow; } + typename std::enable_if::value || + (!std::is_same::value && !std::is_same::value && + !std::is_same::value), p_f>::type + operator()(type_to_type ) { + return [](const std::array& ins, uint8_t* out, const int length) { + const auto inT = reinterpret_cast&>(ins); + auto outT = reinterpret_cast(out); + scalar_tag t; + mergeRowImpl(t, inT, outT, length); + }; + } - p_f operator()(type_to_type ) { - static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v), - "fp_16_t should be a plain wrap over FP16 implementation type"); - return mergeRow; + template + typename std::enable_if::value, p_f>::type + operator()(type_to_type) { + return [](const std::array& ins, uint8_t* out, const int length) { + tag t; + mergeRowImpl(t, 
ins, out, length); + }; + } + + template + typename std::enable_if::value, p_f>::type + operator()(type_to_type) { + return [](const std::array& ins, uint8_t* out, const int length) { + const auto inT = reinterpret_cast&>(ins); + auto outT = reinterpret_cast(out); + tag t; + mergeRowImpl(t, inT, outT, length); + }; } }; } // namespace -GAPI_FLUID_KERNEL(FMerge2, Merge2, false) { - static const int LPI = 4; - static const int Window = 1; - static void run(const cv::gapi::fluid::View& a, - const cv::gapi::fluid::View& b, - cv::gapi::fluid::Buffer& out) { - GAPI_DbgAssert(is_cv_type_in_list(out.meta().depth)); - - const auto rowFunc = type_dispatch(out.meta().depth, cv_type_id{}, typed_merge_row<2>{}, nullptr); - for (int l = 0; l < out.lpi(); l++) { - rowFunc({a.InLineB(l), b.InLineB(l)}, out.OutLineB(l), a.length()); - } - } -}; - -GAPI_FLUID_KERNEL(FMerge3, Merge3, false) { - static const int LPI = 4; - static const int Window = 1; - static void run(const cv::gapi::fluid::View& a, - const cv::gapi::fluid::View& b, - const cv::gapi::fluid::View& c, - cv::gapi::fluid::Buffer& out) { - GAPI_DbgAssert(is_cv_type_in_list(out.meta().depth)); - - const auto rowFunc = type_dispatch(out.meta().depth, cv_type_id{}, typed_merge_row<3>{}, nullptr); - for (int l = 0; l < out.lpi(); l++) { - rowFunc({a.InLineB(l), b.InLineB(l), c.InLineB(l)}, out.OutLineB(l), a.length()); - } - } -}; - -GAPI_FLUID_KERNEL(FMerge4, Merge4, false) { - static const int LPI = 4; - static const int Window = 1; - static void run(const cv::gapi::fluid::View& a, - const cv::gapi::fluid::View& b, - const cv::gapi::fluid::View& c, - const cv::gapi::fluid::View& d, - cv::gapi::fluid::Buffer& out) { - GAPI_DbgAssert(is_cv_type_in_list(out.meta().depth)); - - const auto rowFunc = type_dispatch(out.meta().depth, cv_type_id{}, typed_merge_row<4>{}, nullptr); - for (int l = 0; l < out.lpi(); l++) { - rowFunc({a.InLineB(l), b.InLineB(l), c.InLineB(l), d.InLineB(l)}, out.OutLineB(l), a.length()); - } - } -}; - - 
namespace { using split_supported_types = typelist; -template +template +void splitRowImpl(scalar_tag, const T* in, std::array& outs, const int length) { + for (int x = 0; x < length; ++x) { + for (int c = 0; c < chs; ++c) { + outs[c][x] = in[chs * x + c]; + } + } +} + +template struct typed_split_row { - using p_f = void (*)(const uint8_t* in, std::array& outs, int length); + using p_f = void (*)(const uint8_t* in, std::array& outs, const int length); template - p_f operator()(type_to_type ) { return splitRow; } + typename std::enable_if::value || + (!std::is_same::value && !std::is_same::value && + !std::is_same::value), p_f>::type + operator()(type_to_type ) { + return [](const uint8_t* in, std::array& outs, const int length) { + const auto inT = reinterpret_cast(in); + auto outT = reinterpret_cast&>(outs); + scalar_tag t; + splitRowImpl(t, inT, outT, length); + }; + } - p_f operator()(type_to_type ) { - static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v), - "fp_16_t should be a plain wrap over FP16 implementation type"); - return splitRow; + template + typename std::enable_if::value, p_f>::type + operator()(type_to_type) { + return [](const uint8_t* in, std::array& outs, const int length) { + tag t; + splitRowImpl(t, in, outs, length); + }; + } + + template + typename std::enable_if::value, p_f>::type + operator()(type_to_type) { + return [](const uint8_t* in, std::array& outs, const int length) { + const auto inT = reinterpret_cast(in); + auto outT = reinterpret_cast&>(outs); + tag t; + splitRowImpl(t, inT, outT, length); + }; } }; - } // namespace -GAPI_FLUID_KERNEL(FSplit2, Split2, false) { - static const int LPI = 4; - static const int Window = 1; - static void run(const cv::gapi::fluid::View & in, - cv::gapi::fluid::Buffer& out1, - cv::gapi::fluid::Buffer& out2) { - GAPI_DbgAssert(2 == in.meta().chan); - GAPI_DbgAssert(1 == out1.meta().chan); - GAPI_DbgAssert(1 == out2.meta().chan); - GAPI_DbgAssert(in.meta().depth == out1.meta().depth); - 
GAPI_DbgAssert(in.meta().depth == out2.meta().depth); - GAPI_DbgAssert(is_cv_type_in_list(in.meta().depth)); - - const auto rowFunc = type_dispatch(in.meta().depth, cv_type_id{}, typed_split_row<2>{}, nullptr); - for (int i = 0, lpi = out1.lpi(); i < lpi; i++) { - std::array outs = {out1.OutLineB(i), out2.OutLineB(i)}; - rowFunc(in.InLineB(i), outs, in.length()); - } - } -}; - -GAPI_FLUID_KERNEL(FSplit3, Split3, false) { - static const int LPI = 4; - static const int Window = 1; - static void run(const cv::gapi::fluid::View & in, - cv::gapi::fluid::Buffer& out1, - cv::gapi::fluid::Buffer& out2, - cv::gapi::fluid::Buffer& out3) { - GAPI_DbgAssert(3 == in.meta().chan); - GAPI_DbgAssert(1 == out1.meta().chan); - GAPI_DbgAssert(1 == out2.meta().chan); - GAPI_DbgAssert(1 == out3.meta().chan); - GAPI_DbgAssert(in.meta().depth == out1.meta().depth); - GAPI_DbgAssert(in.meta().depth == out2.meta().depth); - GAPI_DbgAssert(in.meta().depth == out3.meta().depth); - - GAPI_DbgAssert(is_cv_type_in_list(in.meta().depth)); - - const auto rowFunc = type_dispatch(in.meta().depth, cv_type_id{}, typed_split_row<3>{}, nullptr); - for (int i = 0, lpi = out1.lpi(); i < lpi; i++) { - std::array outs = {out1.OutLineB(i), out2.OutLineB(i), - out3.OutLineB(i)}; - rowFunc(in.InLineB(i), outs, in.length()); - } - } -}; - -GAPI_FLUID_KERNEL(FSplit4, Split4, false) { - static const int LPI = 4; - static const int Window = 1; - static void run(const cv::gapi::fluid::View & in, - cv::gapi::fluid::Buffer& out1, - cv::gapi::fluid::Buffer& out2, - cv::gapi::fluid::Buffer& out3, - cv::gapi::fluid::Buffer& out4) { - GAPI_DbgAssert(4 == in.meta().chan); - GAPI_DbgAssert(1 == out1.meta().chan); - GAPI_DbgAssert(1 == out2.meta().chan); - GAPI_DbgAssert(1 == out3.meta().chan); - GAPI_DbgAssert(1 == out4.meta().chan); - GAPI_DbgAssert(in.meta().depth == out1.meta().depth); - GAPI_DbgAssert(in.meta().depth == out2.meta().depth); - GAPI_DbgAssert(in.meta().depth == out3.meta().depth); - 
GAPI_DbgAssert(in.meta().depth == out4.meta().depth); - GAPI_DbgAssert(is_cv_type_in_list(in.meta().depth)); - - const auto rowFunc = type_dispatch(in.meta().depth, cv_type_id{}, typed_split_row<4>{}, nullptr); - for (int i = 0, lpi = out1.lpi(); i < lpi; i++) { - std::array outs = {out1.OutLineB(i), out2.OutLineB(i), - out3.OutLineB(i), out4.OutLineB(i)}; - rowFunc(in.InLineB(i), outs, in.length()); - } - } -}; - //---------------------------------------------------------------------- using isas_set = typelist< #ifdef HAVE_AVX512 @@ -1005,36 +559,179 @@ GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) { rowFunc(y_rows, u_row, v_row, out_rows, buf_width); } }; + +GAPI_FLUID_KERNEL(FSplit2, Split2, false) { + static const int LPI = 4; + static const int Window = 1; + static void run(const cv::gapi::fluid::View & in, + cv::gapi::fluid::Buffer & out1, + cv::gapi::fluid::Buffer & out2) { + GAPI_DbgAssert(2 == in.meta().chan); + GAPI_DbgAssert(1 == out1.meta().chan); + GAPI_DbgAssert(1 == out2.meta().chan); + GAPI_DbgAssert(in.meta().depth == out1.meta().depth); + GAPI_DbgAssert(in.meta().depth == out2.meta().depth); + GAPI_DbgAssert(is_cv_type_in_list(in.meta().depth)); + + const auto rowFunc = type_dispatch(in.meta().depth, cv_type_id{}, typed_split_row{}, nullptr); + for (int i = 0, lpi = out1.lpi(); i < lpi; i++) { + std::array outs = { out1.OutLineB(i), out2.OutLineB(i) }; + rowFunc(in.InLineB(i), outs, in.length()); + } + } +}; + +GAPI_FLUID_KERNEL(FSplit3, Split3, false) { + static const int LPI = 4; + static const int Window = 1; + static void run(const cv::gapi::fluid::View & in, + cv::gapi::fluid::Buffer & out1, + cv::gapi::fluid::Buffer & out2, + cv::gapi::fluid::Buffer & out3) { + GAPI_DbgAssert(3 == in.meta().chan); + GAPI_DbgAssert(1 == out1.meta().chan); + GAPI_DbgAssert(1 == out2.meta().chan); + GAPI_DbgAssert(1 == out3.meta().chan); + GAPI_DbgAssert(in.meta().depth == out1.meta().depth); + GAPI_DbgAssert(in.meta().depth == out2.meta().depth); + 
GAPI_DbgAssert(in.meta().depth == out3.meta().depth); + + GAPI_DbgAssert(is_cv_type_in_list(in.meta().depth)); + + const auto rowFunc = type_dispatch(in.meta().depth, cv_type_id{}, typed_split_row{}, nullptr); + for (int i = 0, lpi = out1.lpi(); i < lpi; i++) { + std::array outs = { out1.OutLineB(i), out2.OutLineB(i), + out3.OutLineB(i) }; + rowFunc(in.InLineB(i), outs, in.length()); + } + } +}; + +GAPI_FLUID_KERNEL(FSplit4, Split4, false) { + static const int LPI = 4; + static const int Window = 1; + static void run(const cv::gapi::fluid::View & in, + cv::gapi::fluid::Buffer & out1, + cv::gapi::fluid::Buffer & out2, + cv::gapi::fluid::Buffer & out3, + cv::gapi::fluid::Buffer & out4) { + GAPI_DbgAssert(4 == in.meta().chan); + GAPI_DbgAssert(1 == out1.meta().chan); + GAPI_DbgAssert(1 == out2.meta().chan); + GAPI_DbgAssert(1 == out3.meta().chan); + GAPI_DbgAssert(1 == out4.meta().chan); + GAPI_DbgAssert(in.meta().depth == out1.meta().depth); + GAPI_DbgAssert(in.meta().depth == out2.meta().depth); + GAPI_DbgAssert(in.meta().depth == out3.meta().depth); + GAPI_DbgAssert(in.meta().depth == out4.meta().depth); + GAPI_DbgAssert(is_cv_type_in_list(in.meta().depth)); + + const auto rowFunc = type_dispatch(in.meta().depth, cv_type_id{}, typed_split_row{}, nullptr); + for (int i = 0, lpi = out1.lpi(); i < lpi; i++) { + std::array outs = { out1.OutLineB(i), out2.OutLineB(i), + out3.OutLineB(i), out4.OutLineB(i) }; + rowFunc(in.InLineB(i), outs, in.length()); + } + } +}; + +GAPI_FLUID_KERNEL(FMerge2, Merge2, false) { + static const int LPI = 4; + static const int Window = 1; + static void run(const cv::gapi::fluid::View & a, + const cv::gapi::fluid::View & b, + cv::gapi::fluid::Buffer & out) { + GAPI_DbgAssert(is_cv_type_in_list(out.meta().depth)); + + const auto rowFunc = type_dispatch(out.meta().depth, cv_type_id{}, typed_merge_row{}, nullptr); + for (int l = 0; l < out.lpi(); l++) { + rowFunc({ a.InLineB(l), b.InLineB(l) }, out.OutLineB(l), a.length()); + } + } +}; + 
+GAPI_FLUID_KERNEL(FMerge3, Merge3, false) { + static const int LPI = 4; + static const int Window = 1; + static void run(const cv::gapi::fluid::View & a, + const cv::gapi::fluid::View & b, + const cv::gapi::fluid::View & c, + cv::gapi::fluid::Buffer & out) { + GAPI_DbgAssert(is_cv_type_in_list(out.meta().depth)); + + const auto rowFunc = type_dispatch(out.meta().depth, cv_type_id{}, typed_merge_row{}, nullptr); + for (int l = 0; l < out.lpi(); l++) { + rowFunc({ a.InLineB(l), b.InLineB(l), c.InLineB(l) }, out.OutLineB(l), a.length()); + } + } +}; + +GAPI_FLUID_KERNEL(FMerge4, Merge4, false) { + static const int LPI = 4; + static const int Window = 1; + static void run(const cv::gapi::fluid::View & a, + const cv::gapi::fluid::View & b, + const cv::gapi::fluid::View & c, + const cv::gapi::fluid::View & d, + cv::gapi::fluid::Buffer & out) { + GAPI_DbgAssert(is_cv_type_in_list(out.meta().depth)); + + const auto rowFunc = type_dispatch(out.meta().depth, cv_type_id{}, typed_merge_row{}, nullptr); + for (int l = 0; l < out.lpi(); l++) { + rowFunc({ a.InLineB(l), b.InLineB(l), c.InLineB(l), d.InLineB(l) }, out.OutLineB(l), a.length()); + } + } +}; }; namespace { -struct ColorConversionISA { +struct CC_and_MergeISA { cv::gapi::GKernelPackage& pckg; - ColorConversionISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {} + CC_and_MergeISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {} template bool operator()(type_to_type) { pckg.include::FI420toRGB>(); pckg.include::FNV12toRGB>(); pckg.include::FChanToPlane>(); + pckg.include::FMerge2>(); + pckg.include::FMerge3>(); + pckg.include::FMerge4>(); + //at the moment type_dispatch requires something to be returned by the lambda + return true; + } +}; + +struct SplitISA { + cv::gapi::GKernelPackage& pckg; + + SplitISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {} + + template + bool operator()(type_to_type) { + pckg.include::FSplit2>(); + pckg.include::FSplit3>(); + pckg.include::FSplit4>(); //at the moment type_dispatch 
requires something to be returned by the lambda return true; } }; } //namespace -cv::gapi::GKernelPackage FColorConversionChooseISA() { +cv::gapi::GKernelPackage FKernelsChooseISA() { // At the moment AVX512 implementation of wide universal intrinsics is slower than AVX2. // So, disable it for now. using isas = remove_t; - cv::gapi::GKernelPackage pckg; - ColorConversionISA ctpISA{pckg}; + cv::gapi::GKernelPackage pckg1, pckg2; + CC_and_MergeISA ccISA{ pckg1 }; + SplitISA sISA{ pckg2 }; - type_dispatch(is_isa_present{}, ctpISA, false); + type_dispatch(is_isa_present{}, ccISA, false); + type_dispatch(is_isa_present{}, sISA, false); - return pckg; + return combine(pckg1, pckg2); } //---------------------------------------------------------------------- @@ -2601,7 +2298,7 @@ using namespace kernels; cv::gapi::GKernelPackage preprocKernels() { return combine( - FColorConversionChooseISA(), + FKernelsChooseISA(), cv::gapi::kernels +CV_ALWAYS_INLINE void mergeRowC2_Impl(const T in0[], const T in1[], + T out[], const int length) { + int x = 0; #if MANUAL_SIMD - constexpr int nlanes = v_uint8::nlanes; + constexpr int nlanes = VecT::nlanes; + GAPI_DbgAssert(length >= nlanes); - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_uint8 r0, r1; - r0 = vx_load(&in0[l]); - r1 = vx_load(&in1[l]); - v_store_interleave(&out[2*l], r0, r1); - } + VecT r0, r1; + for (; length >= nlanes;) { + for (; x <= length - nlanes; x += nlanes) { + r0 = vx_load(&in0[x]); + r1 = vx_load(&in1[x]); + v_store_interleave(&out[2*x], r0, r1); + } - // to think about how to remove those ifs - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; + if (x < length) { + x = length - nlanes; + continue; + } + break; } #endif - for (; l < length; ++l) { - out[2*l + 0] = in0[l]; - out[2*l + 1] = in1[l]; + for (; x < length; ++x) { + out[2*x + 0] = in0[x]; + out[2*x + 1] = in1[x]; } } -CV_ALWAYS_INLINE void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[], - const uint8_t 
in2[], uint8_t out[], int length) { - int l = 0; +template +CV_ALWAYS_INLINE void mergeRowC3_Impl(const T in0[], const T in1[], + const T in2[], T out[], const int length) { + int x = 0; #if MANUAL_SIMD - constexpr int nlanes = v_uint8::nlanes; + constexpr int nlanes = VecT::nlanes; + GAPI_DbgAssert(length >= nlanes); - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_uint8 r0, r1, r2; - r0 = vx_load(&in0[l]); - r1 = vx_load(&in1[l]); - r2 = vx_load(&in2[l]); - v_store_interleave(&out[3*l], r0, r1, r2); - } + VecT r0, r1, r2; + for (; length >= nlanes;) { + for (; x <= length - nlanes; x += nlanes) { + r0 = vx_load(&in0[x]); + r1 = vx_load(&in1[x]); + r2 = vx_load(&in2[x]); + v_store_interleave(&out[3*x], r0, r1, r2); + } - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; + if (x < length) { + x = length - nlanes; + continue; + } + break; } #endif - for (; l < length; ++l) { - out[3*l + 0] = in0[l]; - out[3*l + 1] = in1[l]; - out[3*l + 2] = in2[l]; + for (; x < length; ++x) { + out[3*x + 0] = in0[x]; + out[3*x + 1] = in1[x]; + out[3*x + 2] = in2[x]; } } -CV_ALWAYS_INLINE void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], - const uint8_t in2[], const uint8_t in3[], - uint8_t out[], int length) { - int l = 0; +template +CV_ALWAYS_INLINE void mergeRowC4_Impl(const T in0[], const T in1[], + const T in2[], const T in3[], + T out[], const int length) { + int x = 0; #if MANUAL_SIMD - constexpr int nlanes = v_uint8::nlanes; + constexpr int nlanes = VecT::nlanes; + GAPI_DbgAssert(length >= nlanes); - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_uint8 r0, r1, r2, r3; - r0 = vx_load(&in0[l]); - r1 = vx_load(&in1[l]); - r2 = vx_load(&in2[l]); - r3 = vx_load(&in3[l]); - v_store_interleave(&out[4*l], r0, r1, r2, r3); - } + VecT r0, r1, r2, r3; + for (; length >= nlanes;) { + for (; x <= length - nlanes; x += nlanes) { + r0 = vx_load(&in0[x]); + r1 = vx_load(&in1[x]); + r2 = vx_load(&in2[x]); + r3 = vx_load(&in3[x]); + 
v_store_interleave(&out[4* x], r0, r1, r2, r3); + } - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; + if (x < length) { + x = length - nlanes; + continue; + } + break; } #endif - for (; l < length; ++l) { - out[4*l + 0] = in0[l]; - out[4*l + 1] = in1[l]; - out[4*l + 2] = in2[l]; - out[4*l + 3] = in3[l]; + for (; x < length; ++x) { + out[4*x + 0] = in0[x]; + out[4*x + 1] = in1[x]; + out[4*x + 2] = in2[x]; + out[4*x + 3] = in3[x]; } } - -CV_ALWAYS_INLINE void mergeRow_32FC2_Impl(const float in0[], const float in1[], - float out[], int length) { - int l = 0; - -#if MANUAL_SIMD - constexpr int nlanes = v_float32::nlanes; - - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_float32 r0, r1; - r0 = vx_load(&in0[l]); - r1 = vx_load(&in1[l]); - v_store_interleave(&out[2*l], r0, r1); - } - - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; - } -#endif - - for (; l < length; ++l) { - out[2*l + 0] = in0[l]; - out[2*l + 1] = in1[l]; - } -} - -CV_ALWAYS_INLINE void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[], - float out[], int length) { - int l = 0; - -#if MANUAL_SIMD - constexpr int nlanes = v_float32::nlanes; - - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_float32 r0, r1, r2; - r0 = vx_load(&in0[l]); - r1 = vx_load(&in1[l]); - r2 = vx_load(&in2[l]); - v_store_interleave(&out[3*l], r0, r1, r2); - } - - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; - } -#endif - - for (; l < length; ++l) { - out[3*l + 0] = in0[l]; - out[3*l + 1] = in1[l]; - out[3*l + 2] = in2[l]; - } -} - -CV_ALWAYS_INLINE void mergeRow_32FC4_Impl(const float in0[], const float in1[], - const float in2[], const float in3[], - float out[], int length) { - int l = 0; - -#if MANUAL_SIMD - constexpr int nlanes = v_float32::nlanes; - - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_float32 r0, r1, r2, r3; - r0 = vx_load(&in0[l]); - r1 = vx_load(&in1[l]); - r2 = 
vx_load(&in2[l]); - r3 = vx_load(&in3[l]); - v_store_interleave(&out[4*l], r0, r1, r2, r3); - } - - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; - } -#endif - - for (; l < length; ++l) { - out[4*l + 0] = in0[l]; - out[4*l + 1] = in1[l]; - out[4*l + 2] = in2[l]; - out[4*l + 3] = in3[l]; - } -} - //------------------------------------------------------------------------------ - -CV_ALWAYS_INLINE void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[], - uint8_t out1[], int length) { - int l = 0; +template +CV_ALWAYS_INLINE void splitRowC2_Impl(const T in[], T out0[], + T out1[], const int length) { + int x = 0; #if MANUAL_SIMD - constexpr int nlanes = v_uint8::nlanes; + constexpr int nlanes = VecT::nlanes; + GAPI_DbgAssert(length >= nlanes); - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_uint8 r0, r1; - v_load_deinterleave(&in[2*l], r0, r1); - vx_store(&out0[l], r0); - vx_store(&out1[l], r1); - } + VecT r0, r1; + for (; length >= nlanes;) { + for (; x <= length - nlanes; x += nlanes) { + v_load_deinterleave(&in[2*x], r0, r1); + vx_store(&out0[x], r0); + vx_store(&out1[x], r1); + } - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; + if (x < length) { + x = length - nlanes; + continue; + } + break; } #endif - for (; l < length; ++l) { - out0[l] = in[2*l + 0]; - out1[l] = in[2*l + 1]; + for (; x < length; ++x) { + out0[x] = in[2*x + 0]; + out1[x] = in[2*x + 1]; } } -CV_ALWAYS_INLINE void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[], - uint8_t out1[], uint8_t out2[], int length) { - int l = 0; +template +CV_ALWAYS_INLINE void splitRowC3_Impl(const T in[], T out0[], + T out1[], T out2[], const int length) { + int x = 0; #if MANUAL_SIMD - constexpr int nlanes = v_uint8::nlanes; + constexpr int nlanes = VecT::nlanes; + GAPI_DbgAssert(length >= nlanes); - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_uint8 r0, r1, r2; - v_load_deinterleave(&in[3*l], r0, r1, r2); - vx_store(&out0[l], 
r0); - vx_store(&out1[l], r1); - vx_store(&out2[l], r2); - } + VecT r0, r1, r2; + for (; length >= nlanes;) { + for (; x <= length - nlanes; x += nlanes) { + v_load_deinterleave(&in[3*x], r0, r1, r2); + vx_store(&out0[x], r0); + vx_store(&out1[x], r1); + vx_store(&out2[x], r2); + } - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; + if (x < length) { + x = length - nlanes; + continue; + } + break; } #endif - for (; l < length; ++l) { - out0[l] = in[3*l + 0]; - out1[l] = in[3*l + 1]; - out2[l] = in[3*l + 2]; + for (; x < length; ++x) { + out0[x] = in[3*x + 0]; + out1[x] = in[3*x + 1]; + out2[x] = in[3*x + 2]; } } -CV_ALWAYS_INLINE void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[], - uint8_t out2[], uint8_t out3[], int length) { - int l = 0; +template +CV_ALWAYS_INLINE void splitRowC4_Impl(const T in[], T out0[], T out1[], + T out2[], T out3[], const int length) { + int x = 0; #if MANUAL_SIMD - constexpr int nlanes = v_uint8::nlanes; + constexpr int nlanes = VecT::nlanes; + GAPI_DbgAssert(length >= nlanes); - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_uint8 r0, r1, r2, r3; - v_load_deinterleave(&in[4*l], r0, r1, r2, r3); - vx_store(&out0[l], r0); - vx_store(&out1[l], r1); - vx_store(&out2[l], r2); - vx_store(&out3[l], r3); - } + VecT r0, r1, r2, r3; + for (; length >= nlanes;) { + for (; x <= length - nlanes; x += nlanes) { + v_load_deinterleave(&in[4*x], r0, r1, r2, r3); + vx_store(&out0[x], r0); + vx_store(&out1[x], r1); + vx_store(&out2[x], r2); + vx_store(&out3[x], r3); + } - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; + if (x < length) { + x = length - nlanes; + continue; + } + break; } #endif - for (; l < length; ++l) { - out0[l] = in[4*l + 0]; - out1[l] = in[4*l + 1]; - out2[l] = in[4*l + 2]; - out3[l] = in[4*l + 3]; + for (; x < length; ++x) { + out0[x] = in[4*x + 0]; + out1[x] = in[4*x + 1]; + out2[x] = in[4*x + 2]; + out3[x] = in[4*x + 3]; } } - 
-CV_ALWAYS_INLINE void splitRow_32FC2_Impl(const float in[], float out0[], - float out1[], int length) { - int l = 0; - -#if MANUAL_SIMD - constexpr int nlanes = v_float32::nlanes; - - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_float32 r0, r1; - v_load_deinterleave(&in[2*l], r0, r1); - vx_store(&out0[l], r0); - vx_store(&out1[l], r1); - } - - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; - } - -#endif - - for (; l < length; ++l) { - out0[l] = in[2*l + 0]; - out1[l] = in[2*l + 1]; - } -} - -CV_ALWAYS_INLINE void splitRow_32FC3_Impl(const float in[], float out0[], float out1[], - float out2[], int length) { - int l = 0; - -#if MANUAL_SIMD - constexpr int nlanes = v_float32::nlanes; - - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_float32 r0, r1, r2; - v_load_deinterleave(&in[3*l], r0, r1, r2); - vx_store(&out0[l], r0); - vx_store(&out1[l], r1); - vx_store(&out2[l], r2); - } - - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; - } -#endif - - for (; l < length; ++l) { - out0[l] = in[3*l + 0]; - out1[l] = in[3*l + 1]; - out2[l] = in[3*l + 2]; - } -} - -CV_ALWAYS_INLINE void splitRow_32FC4_Impl(const float in[], float out0[], float out1[], - float out2[], float out3[], int length) { - int l = 0; - -#if MANUAL_SIMD - constexpr int nlanes = v_float32::nlanes; - - cycle: - for (; l <= length - nlanes; l += nlanes) { - v_float32 r0, r1, r2, r3; - v_load_deinterleave(&in[4*l], r0, r1, r2, r3); - vx_store(&out0[l], r0); - vx_store(&out1[l], r1); - vx_store(&out2[l], r2); - vx_store(&out3[l], r3); - } - - if (l < length && length >= nlanes) { - l = length - nlanes; - goto cycle; - } -#endif - - for (; l < length; ++l) { - out0[l] = in[4*l + 0]; - out1[l] = in[4*l + 1]; - out2[l] = in[4*l + 2]; - out3[l] = in[4*l + 3]; - } -} - //------------------------------------------------------------------------------ CV_ALWAYS_INLINE void uvToRGBuv(const v_uint8& u, const v_uint8& v, @@ -880,6 +724,38 @@ 
CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, out[x] = in[x*chs + chan]; } } + /* splitRowImpl: picks the splitter for chs channels. The static_assert restricts chs to 2, 3 or 4; the matching splitRowC2_Impl, splitRowC3_Impl or splitRowC4_Impl receives in, the first chs pointers of outs, and length. The leading isa_tag_t parameter is unnamed and never read in the body. NOTE(review): template parameter and argument lists look stripped in this text (bare template keyword, std::array without arguments) - confirm against the upstream file. */ +template +CV_ALWAYS_INLINE void splitRowImpl(isa_tag_t, const T* in, std::array& outs, const int length) { + static_assert(chs > 1 && chs < 5, "This number of channels isn't supported."); + + if (chs == 2) { /* chs is a compile-time constant (it is used in the static_assert above), so the same branch is taken on every call of a given instantiation */ + splitRowC2_Impl, T>(in, outs[0], outs[1], length); + return; + } else if (chs == 3) { + splitRowC3_Impl, T>(in, outs[0], outs[1], outs[2], length); + return; + } else { + splitRowC4_Impl, T>(in, outs[0], outs[1], outs[2], outs[3], length); + return; + } +} + /* mergeRowImpl: counterpart of splitRowImpl for interleaving. The static_assert restricts chs to 2, 3 or 4; the matching mergeRowC2_Impl, mergeRowC3_Impl or mergeRowC4_Impl receives the first chs pointers of ins, then out and length. The leading isa_tag_t parameter is unnamed and never read in the body. NOTE(review): template parameter and argument lists look stripped in this text - confirm against the upstream file. */ +template +CV_ALWAYS_INLINE void mergeRowImpl(isa_tag_t, const std::array& ins, T* out, const int length) { + static_assert(chs > 1 && chs < 5, "This number of channels isn't supported."); + + if (chs == 2) { /* chs is a compile-time constant (it is used in the static_assert above), so the same branch is taken on every call of a given instantiation */ + mergeRowC2_Impl, T>(ins[0], ins[1], out, length); + return; + } else if (chs == 3) { + mergeRowC3_Impl, T>(ins[0], ins[1], ins[2], out, length); + return; + } else { + mergeRowC4_Impl, T>(ins[0], ins[1], ins[2], ins[3], out, length); + return; + } +} } // namespace kernels } // namespace gapi } // namespace InferenceEngine diff --git a/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp b/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp index c7300214c7b..f883e722348 100644 --- a/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp +++ b/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp @@ -98,10 +98,30 @@ cv::String typeToString(int type) case CV_8UC2 : return "CV_8UC2"; case CV_8UC3 : return "CV_8UC3"; case CV_8UC4 : return "CV_8UC4"; + case CV_16FC1 : return "CV_16FC1"; + case CV_16FC2 : return "CV_16FC2"; + case CV_16FC3 : return "CV_16FC3"; + case CV_16FC4 : return "CV_16FC4"; case CV_32FC1 : return "CV_32FC1"; case CV_32FC2 : return "CV_32FC2"; case CV_32FC3 : return "CV_32FC3"; case CV_32FC4 : return "CV_32FC4"; + case CV_8SC1 : return "CV_8SC1"; + case CV_8SC2 : return "CV_8SC2"; + case CV_8SC3 : 
return "CV_8SC3"; + case CV_8SC4 : return "CV_8SC4"; + case CV_16SC1 : return "CV_16SC1"; + case CV_16SC2 : return "CV_16SC2"; + case CV_16SC3 : return "CV_16SC3"; + case CV_16SC4 : return "CV_16SC4"; + case CV_16UC1 : return "CV_16UC1"; + case CV_16UC2 : return "CV_16UC2"; + case CV_16UC3 : return "CV_16UC3"; + case CV_16UC4 : return "CV_16UC4"; + case CV_32SC1 : return "CV_32SC1"; + case CV_32SC2 : return "CV_32SC2"; + case CV_32SC3 : return "CV_32SC3"; + case CV_32SC4 : return "CV_32SC4"; } CV_Assert(!"ERROR: unsupported type!"); return nullptr;