Pre-processing: Split and Merge kernels refactoring. (#6205)

* Split and Merge kernel refactoring

* SFINAE: replace conditional-compilation macros with std::enable_if
This commit is contained in:
Anna Khakimova
2021-06-22 18:46:23 +03:00
committed by GitHub
parent 27ae3ec433
commit e00cee2fc6
11 changed files with 563 additions and 1425 deletions

View File

@@ -29,67 +29,6 @@ namespace InferenceEngine {
namespace gapi {
namespace kernels {
namespace neon {
// neon:: shim -- forwards every argument, unmodified, to mergeRow_8UC2_Impl.
void mergeRow_8UC2(const uint8_t in0[],
                   const uint8_t in1[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC2_Impl(in0, in1, out, length);
}
// neon:: shim -- forwards every argument, unmodified, to mergeRow_8UC3_Impl.
void mergeRow_8UC3(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
}
// neon:: shim -- forwards every argument, unmodified, to mergeRow_8UC4_Impl.
void mergeRow_8UC4(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   const uint8_t in3[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
}
// neon:: shim -- forwards every argument, unmodified, to mergeRow_32FC2_Impl.
void mergeRow_32FC2(const float in0[],
                    const float in1[],
                    float out[],
                    int length) {
    mergeRow_32FC2_Impl(in0, in1, out, length);
}
// neon:: shim -- forwards every argument, unmodified, to mergeRow_32FC3_Impl.
void mergeRow_32FC3(const float in0[],
                    const float in1[],
                    const float in2[],
                    float out[],
                    int length) {
    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
}
// neon:: shim -- forwards every argument, unmodified, to mergeRow_32FC4_Impl.
void mergeRow_32FC4(const float in0[],
                    const float in1[],
                    const float in2[],
                    const float in3[],
                    float out[],
                    int length) {
    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
}
// neon:: shim -- forwards every argument, unmodified, to splitRow_8UC2_Impl.
void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   int length) {
    splitRow_8UC2_Impl(in, out0, out1, length);
}
// neon:: shim -- forwards every argument, unmodified, to splitRow_8UC3_Impl.
void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   int length) {
    splitRow_8UC3_Impl(in, out0, out1, out2, length);
}
// neon:: shim -- forwards every argument, unmodified, to splitRow_8UC4_Impl.
void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   uint8_t out3[],
                   int length) {
    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
}
// neon:: shim -- forwards every argument, unmodified, to splitRow_32FC2_Impl.
void splitRow_32FC2(const float in[],
                    float out0[],
                    float out1[],
                    int length) {
    splitRow_32FC2_Impl(in, out0, out1, length);
}
// neon:: shim -- forwards every argument, unmodified, to splitRow_32FC3_Impl.
void splitRow_32FC3(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    int length) {
    splitRow_32FC3_Impl(in, out0, out1, out2, length);
}
// neon:: shim -- forwards every argument, unmodified, to splitRow_32FC4_Impl.
void splitRow_32FC4(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    float out3[],
                    int length) {
    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
int xmaxdf, const short xindex[], const Q0_16 xalpha[],
@@ -693,6 +632,20 @@ template void nv12ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t*
// Explicit instantiation definitions for the neon_tag overloads.
template void i420ToRgbRowImpl(
    neon_tag, const uint8_t** y_rows, const uint8_t* u_row,
    const uint8_t* v_row, uint8_t** out_rows, const int buf_width);

// splitRowImpl: uint8_t and float element types, chs = 2, 3, 4.
template void splitRowImpl<neon_tag, uint8_t, 2>(
    neon_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
template void splitRowImpl<neon_tag, float, 2>(
    neon_tag, const float* in, std::array<float*, 2>& outs, const int length);
template void splitRowImpl<neon_tag, uint8_t, 3>(
    neon_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
template void splitRowImpl<neon_tag, float, 3>(
    neon_tag, const float* in, std::array<float*, 3>& outs, const int length);
template void splitRowImpl<neon_tag, uint8_t, 4>(
    neon_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
template void splitRowImpl<neon_tag, float, 4>(
    neon_tag, const float* in, std::array<float*, 4>& outs, const int length);

// mergeRowImpl: uint8_t and float element types, chs = 2, 3, 4.
template void mergeRowImpl<neon_tag, uint8_t, 2>(
    neon_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
template void mergeRowImpl<neon_tag, float, 2>(
    neon_tag, const std::array<const float*, 2>& ins, float* out, const int length);
template void mergeRowImpl<neon_tag, uint8_t, 3>(
    neon_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
template void mergeRowImpl<neon_tag, float, 3>(
    neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<neon_tag, uint8_t, 4>(
    neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<neon_tag, float, 4>(
    neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -92,87 +92,6 @@ void calcRowLinear_32F(float *dst[],
const Size& inSz,
const Size& outSz,
int lpi);
//----------------------------------------------------------------------
// Prototypes of the row merge/split kernels (declarations only).
// mergeRow_* take one read-only input array per channel plus a single output array;
// splitRow_* take a single read-only input array plus one output array per channel.
// NOTE(review): the 8U/32F and C2/C3/C4 suffixes presumably encode element type and
// channel count -- consistent with the parameter lists, but confirm against the Impl.
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
                   uint8_t out[], int length);
void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
                   uint8_t out[], int length);
void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[],
                   const uint8_t in2[], const uint8_t in3[],
                   uint8_t out[], int length);

void mergeRow_32FC2(const float in0[], const float in1[],
                    float out[], int length);
void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
                    float out[], int length);
void mergeRow_32FC4(const float in0[], const float in1[],
                    const float in2[], const float in3[],
                    float out[], int length);

void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], int length);
void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], uint8_t out2[], int length);
void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], uint8_t out2[], uint8_t out3[],
                   int length);

void splitRow_32FC2(const float in[],
                    float out0[], float out1[], int length);
void splitRow_32FC3(const float in[],
                    float out0[], float out1[], float out2[], int length);
void splitRow_32FC4(const float in[],
                    float out0[], float out1[], float out2[], float out3[],
                    int length);
// Prototype only: takes read-only srcY (pointer to row pointers), srcU and srcV rows,
// a pointer to output row pointers, and a width.
// NOTE(review): presumably converts I420 (planar Y/U/V) rows to RGB(x) -- confirm at the definition.
void calculate_i420_to_rgb(const uchar **srcY, const uchar *srcU, const uchar *srcV,
                           uchar **dstRGBx, int width);
} // namespace neon
template<typename isa_tag_t, typename T>
@@ -192,6 +111,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
extern template void i420ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
template<typename isa_tag_t, typename T, int chs>
void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
extern template void splitRowImpl<neon_tag, uint8_t, 2>(neon_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
extern template void splitRowImpl<neon_tag, float, 2>(neon_tag, const float* in, std::array<float*, 2>& outs, const int length);
extern template void splitRowImpl<neon_tag, uint8_t, 3>(neon_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
extern template void splitRowImpl<neon_tag, float, 3>(neon_tag, const float* in, std::array<float*, 3>& outs, const int length);
extern template void splitRowImpl<neon_tag, uint8_t, 4>(neon_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
extern template void splitRowImpl<neon_tag, float, 4>(neon_tag, const float* in, std::array<float*, 4>& outs, const int length);
template<typename isa_tag_t, typename T, int chs>
void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
extern template void mergeRowImpl<neon_tag, uint8_t, 2>(neon_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<neon_tag, float, 2>(neon_tag, const std::array<const float*, 2>& ins, float* out, const int length);
extern template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -47,66 +47,6 @@ namespace kernels {
namespace avx {
// avx:: shim -- forwards every argument, unmodified, to mergeRow_8UC2_Impl.
void mergeRow_8UC2(const uint8_t in0[],
                   const uint8_t in1[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC2_Impl(in0, in1, out, length);
}
// avx:: shim -- forwards every argument, unmodified, to mergeRow_8UC3_Impl.
void mergeRow_8UC3(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
}
// avx:: shim -- forwards every argument, unmodified, to mergeRow_8UC4_Impl.
void mergeRow_8UC4(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   const uint8_t in3[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
}
// avx:: shim -- forwards every argument, unmodified, to mergeRow_32FC2_Impl.
void mergeRow_32FC2(const float in0[],
                    const float in1[],
                    float out[],
                    int length) {
    mergeRow_32FC2_Impl(in0, in1, out, length);
}
// avx:: shim -- forwards every argument, unmodified, to mergeRow_32FC3_Impl.
void mergeRow_32FC3(const float in0[],
                    const float in1[],
                    const float in2[],
                    float out[],
                    int length) {
    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
}
// avx:: shim -- forwards every argument, unmodified, to mergeRow_32FC4_Impl.
void mergeRow_32FC4(const float in0[],
                    const float in1[],
                    const float in2[],
                    const float in3[],
                    float out[],
                    int length) {
    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
}
// avx:: shim -- forwards every argument, unmodified, to splitRow_8UC2_Impl.
void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   int length) {
    splitRow_8UC2_Impl(in, out0, out1, length);
}
// avx:: shim -- forwards every argument, unmodified, to splitRow_8UC3_Impl.
void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   int length) {
    splitRow_8UC3_Impl(in, out0, out1, out2, length);
}
// avx:: shim -- forwards every argument, unmodified, to splitRow_8UC4_Impl.
void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   uint8_t out3[],
                   int length) {
    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
}
// avx:: shim -- forwards every argument, unmodified, to splitRow_32FC2_Impl.
void splitRow_32FC2(const float in[],
                    float out0[],
                    float out1[],
                    int length) {
    splitRow_32FC2_Impl(in, out0, out1, length);
}
// avx:: shim -- forwards every argument, unmodified, to splitRow_32FC3_Impl.
void splitRow_32FC3(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    int length) {
    splitRow_32FC3_Impl(in, out0, out1, out2, length);
}
// avx:: shim -- forwards every argument, unmodified, to splitRow_32FC4_Impl.
void splitRow_32FC4(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    float out3[],
                    int length) {
    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
int xmaxdf, const short xindex[], const Q0_16 xalpha[],
@@ -562,6 +502,20 @@ template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t*
// Explicit instantiation definitions for the avx2_tag overloads.
template void i420ToRgbRowImpl(
    avx2_tag, const uint8_t** y_rows, const uint8_t* u_row,
    const uint8_t* v_row, uint8_t** out_rows, const int buf_width);

// splitRowImpl: uint8_t and float element types, chs = 2, 3, 4.
template void splitRowImpl<avx2_tag, uint8_t, 2>(
    avx2_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
template void splitRowImpl<avx2_tag, float, 2>(
    avx2_tag, const float* in, std::array<float*, 2>& outs, const int length);
template void splitRowImpl<avx2_tag, uint8_t, 3>(
    avx2_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
template void splitRowImpl<avx2_tag, float, 3>(
    avx2_tag, const float* in, std::array<float*, 3>& outs, const int length);
template void splitRowImpl<avx2_tag, uint8_t, 4>(
    avx2_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
template void splitRowImpl<avx2_tag, float, 4>(
    avx2_tag, const float* in, std::array<float*, 4>& outs, const int length);

// mergeRowImpl: uint8_t and float element types, chs = 2, 3, 4.
template void mergeRowImpl<avx2_tag, uint8_t, 2>(
    avx2_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx2_tag, float, 2>(
    avx2_tag, const std::array<const float*, 2>& ins, float* out, const int length);
template void mergeRowImpl<avx2_tag, uint8_t, 3>(
    avx2_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx2_tag, float, 3>(
    avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<avx2_tag, uint8_t, 4>(
    avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx2_tag, float, 4>(
    avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -106,84 +106,8 @@ void calcRowLinear_32F(float *dst[],
const Size & inSz,
const Size & outSz,
int lpi);
//----------------------------------------------------------------------
// Prototypes of the row merge/split kernels (declarations only).
// mergeRow_* take one read-only input array per channel plus a single output array;
// splitRow_* take a single read-only input array plus one output array per channel.
// NOTE(review): the 8U/32F and C2/C3/C4 suffixes presumably encode element type and
// channel count -- consistent with the parameter lists, but confirm against the Impl.
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
                   uint8_t out[], int length);
void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
                   uint8_t out[], int length);
void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[],
                   const uint8_t in2[], const uint8_t in3[],
                   uint8_t out[], int length);

void mergeRow_32FC2(const float in0[], const float in1[],
                    float out[], int length);
void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
                    float out[], int length);
void mergeRow_32FC4(const float in0[], const float in1[],
                    const float in2[], const float in3[],
                    float out[], int length);

void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], int length);
void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], uint8_t out2[], int length);
void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], uint8_t out2[], uint8_t out3[],
                   int length);

void splitRow_32FC2(const float in[],
                    float out0[], float out1[], int length);
void splitRow_32FC3(const float in[],
                    float out0[], float out1[], float out2[], int length);
void splitRow_32FC4(const float in[],
                    float out0[], float out1[], float out2[], float out3[],
                    int length);
} // namespace avx
// Primary declaration of chanToPlaneRowImpl. The first parameter is an unnamed
// tag value of type isa_tag_t; `in` is read-only, `out` is writable.
// NOTE(review): presumably copies channel `chan` of `chs` interleaved channels
// into a planar row -- confirm at the definition.
template <typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t,
                        const T* in,
                        const int chan,
                        const int chs,
                        T* out,
                        const int length);
@@ -192,7 +116,7 @@ extern template void chanToPlaneRowImpl(avx2_tag, const float* in, const int c
template<typename isa_tag_t>
void nv12ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* uv_row,
uint8_t** out_rows, const int buf_width);
uint8_t** out_rows, const int buf_width);
extern template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows,
const uint8_t* uv_row, uint8_t** out_rows,
@@ -200,10 +124,30 @@ extern template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows,
template<typename isa_tag_t>
void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
extern template void i420ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
template<typename isa_tag_t, typename T, int chs>
void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
extern template void splitRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
extern template void splitRowImpl<avx2_tag, float, 2>(avx2_tag, const float* in, std::array<float*, 2>& outs, const int length);
extern template void splitRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
extern template void splitRowImpl<avx2_tag, float, 3>(avx2_tag, const float* in, std::array<float*, 3>& outs, const int length);
extern template void splitRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
extern template void splitRowImpl<avx2_tag, float, 4>(avx2_tag, const float* in, std::array<float*, 4>& outs, const int length);
template<typename isa_tag_t, typename T, int chs>
void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
extern template void mergeRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx2_tag, float, 2>(avx2_tag, const std::array<const float*, 2>& ins, float* out, const int length);
extern template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -41,66 +41,6 @@ namespace kernels {
namespace avx512 {
// avx512:: shim -- forwards every argument, unmodified, to mergeRow_8UC2_Impl.
void mergeRow_8UC2(const uint8_t in0[],
                   const uint8_t in1[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC2_Impl(in0, in1, out, length);
}
// avx512:: shim -- forwards every argument, unmodified, to mergeRow_8UC3_Impl.
void mergeRow_8UC3(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
}
// avx512:: shim -- forwards every argument, unmodified, to mergeRow_8UC4_Impl.
void mergeRow_8UC4(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   const uint8_t in3[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
}
// avx512:: shim -- forwards every argument, unmodified, to mergeRow_32FC2_Impl.
void mergeRow_32FC2(const float in0[],
                    const float in1[],
                    float out[],
                    int length) {
    mergeRow_32FC2_Impl(in0, in1, out, length);
}
// avx512:: shim -- forwards every argument, unmodified, to mergeRow_32FC3_Impl.
void mergeRow_32FC3(const float in0[],
                    const float in1[],
                    const float in2[],
                    float out[],
                    int length) {
    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
}
// avx512:: shim -- forwards every argument, unmodified, to mergeRow_32FC4_Impl.
void mergeRow_32FC4(const float in0[],
                    const float in1[],
                    const float in2[],
                    const float in3[],
                    float out[],
                    int length) {
    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
}
// avx512:: shim -- forwards every argument, unmodified, to splitRow_8UC2_Impl.
void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   int length) {
    splitRow_8UC2_Impl(in, out0, out1, length);
}
// avx512:: shim -- forwards every argument, unmodified, to splitRow_8UC3_Impl.
void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   int length) {
    splitRow_8UC3_Impl(in, out0, out1, out2, length);
}
// avx512:: shim -- forwards every argument, unmodified, to splitRow_8UC4_Impl.
void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   uint8_t out3[],
                   int length) {
    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
}
// avx512:: shim -- forwards every argument, unmodified, to splitRow_32FC2_Impl.
void splitRow_32FC2(const float in[],
                    float out0[],
                    float out1[],
                    int length) {
    splitRow_32FC2_Impl(in, out0, out1, length);
}
// avx512:: shim -- forwards every argument, unmodified, to splitRow_32FC3_Impl.
void splitRow_32FC3(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    int length) {
    splitRow_32FC3_Impl(in, out0, out1, out2, length);
}
// avx512:: shim -- forwards every argument, unmodified, to splitRow_32FC4_Impl.
void splitRow_32FC4(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    float out3[],
                    int length) {
    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
int xmaxdf, const short xindex[], const Q0_16 xalpha[],
@@ -632,7 +572,6 @@ void calcRowLinear_32F(float *dst[],
int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}
} // namespace avx512
// Explicit instantiation definition: chanToPlaneRowImpl for avx512_tag with uint8_t data.
template void chanToPlaneRowImpl(
    avx512_tag, const uint8_t* in, const int chan, const int chs,
    uint8_t* out, const int length);
@@ -642,6 +581,20 @@ template void nv12ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t
// Explicit instantiation definitions for the avx512_tag overloads.
template void i420ToRgbRowImpl(
    avx512_tag, const uint8_t** y_rows, const uint8_t* u_row,
    const uint8_t* v_row, uint8_t** out_rows, const int buf_width);

// splitRowImpl: uint8_t and float element types, chs = 2, 3, 4.
template void splitRowImpl<avx512_tag, uint8_t, 2>(
    avx512_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
template void splitRowImpl<avx512_tag, float, 2>(
    avx512_tag, const float* in, std::array<float*, 2>& outs, const int length);
template void splitRowImpl<avx512_tag, uint8_t, 3>(
    avx512_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
template void splitRowImpl<avx512_tag, float, 3>(
    avx512_tag, const float* in, std::array<float*, 3>& outs, const int length);
template void splitRowImpl<avx512_tag, uint8_t, 4>(
    avx512_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
template void splitRowImpl<avx512_tag, float, 4>(
    avx512_tag, const float* in, std::array<float*, 4>& outs, const int length);

// mergeRowImpl: uint8_t and float element types, chs = 2, 3, 4.
template void mergeRowImpl<avx512_tag, uint8_t, 2>(
    avx512_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx512_tag, float, 2>(
    avx512_tag, const std::array<const float*, 2>& ins, float* out, const int length);
template void mergeRowImpl<avx512_tag, uint8_t, 3>(
    avx512_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx512_tag, float, 3>(
    avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<avx512_tag, uint8_t, 4>(
    avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx512_tag, float, 4>(
    avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -106,83 +106,8 @@ void calcRowLinear_32F(float *dst[],
const Size & inSz,
const Size & outSz,
int lpi);
//----------------------------------------------------------------------
// Prototypes of the row merge/split kernels (declarations only).
// mergeRow_* take one read-only input array per channel plus a single output array;
// splitRow_* take a single read-only input array plus one output array per channel.
// NOTE(review): the 8U/32F and C2/C3/C4 suffixes presumably encode element type and
// channel count -- consistent with the parameter lists, but confirm against the Impl.
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
                   uint8_t out[], int length);
void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
                   uint8_t out[], int length);
void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[],
                   const uint8_t in2[], const uint8_t in3[],
                   uint8_t out[], int length);

void mergeRow_32FC2(const float in0[], const float in1[],
                    float out[], int length);
void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
                    float out[], int length);
void mergeRow_32FC4(const float in0[], const float in1[],
                    const float in2[], const float in3[],
                    float out[], int length);

void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], int length);
void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], uint8_t out2[], int length);
void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], uint8_t out2[], uint8_t out3[],
                   int length);

void splitRow_32FC2(const float in[],
                    float out0[], float out1[], int length);
void splitRow_32FC3(const float in[],
                    float out0[], float out1[], float out2[], int length);
void splitRow_32FC4(const float in[],
                    float out0[], float out1[], float out2[], float out3[],
                    int length);
} // namespace avx512
// Primary declaration of chanToPlaneRowImpl. The first parameter is an unnamed
// tag value of type isa_tag_t; `in` is read-only, `out` is writable.
// NOTE(review): presumably copies channel `chan` of `chs` interleaved channels
// into a planar row -- confirm at the definition.
template <typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t,
                        const T* in,
                        const int chan,
                        const int chs,
                        T* out,
                        const int length);
@@ -200,6 +125,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
extern template void i420ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
template<typename isa_tag_t, typename T, int chs>
void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
extern template void splitRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
extern template void splitRowImpl<avx512_tag, float, 2>(avx512_tag, const float* in, std::array<float*, 2>& outs, const int length);
extern template void splitRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
extern template void splitRowImpl<avx512_tag, float, 3>(avx512_tag, const float* in, std::array<float*, 3>& outs, const int length);
extern template void splitRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
extern template void splitRowImpl<avx512_tag, float, 4>(avx512_tag, const float* in, std::array<float*, 4>& outs, const int length);
template<typename isa_tag_t, typename T, int chs>
void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
extern template void mergeRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx512_tag, float, 2>(avx512_tag, const std::array<const float*, 2>& ins, float* out, const int length);
extern template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -1267,103 +1267,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[],
}
#endif // CVKL
//------------------------------------------------------------------------------
// Forwarding shim: passes all arguments straight through to mergeRow_8UC2_Impl.
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[], uint8_t out[], int length) {
    mergeRow_8UC2_Impl(in0, in1, out, length);
}
// Forwarding shim: passes all arguments straight through to mergeRow_8UC3_Impl.
void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
                   uint8_t out[], int length) {
    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
}
// Forwarding shim: passes all arguments straight through to mergeRow_8UC4_Impl.
void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[],
                   const uint8_t in2[], const uint8_t in3[],
                   uint8_t out[], int length) {
    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
}
// Forwarding shim: passes all arguments straight through to mergeRow_32FC2_Impl.
void mergeRow_32FC2(const float in0[], const float in1[], float out[], int length) {
    mergeRow_32FC2_Impl(in0, in1, out, length);
}
// Forwarding shim: passes all arguments straight through to mergeRow_32FC3_Impl.
void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
                    float out[], int length) {
    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
}
// Forwarding shim: passes all arguments straight through to mergeRow_32FC4_Impl.
void mergeRow_32FC4(const float in0[], const float in1[],
                    const float in2[], const float in3[],
                    float out[], int length) {
    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
}
// Forwarding shim: passes all arguments straight through to splitRow_8UC2_Impl.
void splitRow_8UC2(const uint8_t in[], uint8_t out0[], uint8_t out1[], int length) {
    splitRow_8UC2_Impl(in, out0, out1, length);
}
// Forwarding shim: passes all arguments straight through to splitRow_8UC3_Impl.
void splitRow_8UC3(const uint8_t in[], uint8_t out0[], uint8_t out1[],
                   uint8_t out2[], int length) {
    splitRow_8UC3_Impl(in, out0, out1, out2, length);
}
// Forwarding shim: passes all arguments straight through to splitRow_8UC4_Impl.
void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[],
                   uint8_t out2[], uint8_t out3[], int length) {
    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
}
// Forwarding shim: passes all arguments straight through to splitRow_32FC2_Impl.
void splitRow_32FC2(const float in[], float out0[], float out1[], int length) {
    splitRow_32FC2_Impl(in, out0, out1, length);
}
// Forwarding shim: passes all arguments straight through to splitRow_32FC3_Impl.
void splitRow_32FC3(const float in[], float out0[], float out1[],
                    float out2[], int length) {
    splitRow_32FC3_Impl(in, out0, out1, out2, length);
}
// Forwarding shim: passes all arguments straight through to splitRow_32FC4_Impl.
void splitRow_32FC4(const float in[], float out0[], float out1[],
                    float out2[], float out3[], int length) {
    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
// Explicit instantiation definitions: chanToPlaneRowImpl for sse42_tag,
// once with uint8_t data and once with float data.
template void chanToPlaneRowImpl(
    sse42_tag, const uint8_t* in, const int chan, const int chs,
    uint8_t* out, const int length);
template void chanToPlaneRowImpl(
    sse42_tag, const float* in, const int chan, const int chs,
    float* out, const int length);
@@ -1372,6 +1275,20 @@ template void nv12ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t*
// Explicit instantiation definitions for the sse42_tag overloads.
template void i420ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* u_row,
                               const uint8_t* v_row, uint8_t** out_rows, const int buf_width);

// The element-type template argument is spelled uint8_t (previously uchar) so it
// matches the uint8_t parameter types on the same line and the extern template
// declarations for sse42_tag. The parameter lists already required T == uint8_t,
// so the instantiated specializations are unchanged.

// splitRowImpl: uint8_t and float element types, chs = 2, 3, 4.
template void splitRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
template void splitRowImpl<sse42_tag, float, 2>(sse42_tag, const float* in, std::array<float*, 2>& outs, const int length);
template void splitRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
template void splitRowImpl<sse42_tag, float, 3>(sse42_tag, const float* in, std::array<float*, 3>& outs, const int length);
template void splitRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
template void splitRowImpl<sse42_tag, float, 4>(sse42_tag, const float* in, std::array<float*, 4>& outs, const int length);

// mergeRowImpl: uint8_t and float element types, chs = 2, 3, 4.
template void mergeRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
template void mergeRowImpl<sse42_tag, float, 2>(sse42_tag, const std::array<const float*, 2>& ins, float* out, const int length);
template void mergeRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -106,80 +106,6 @@ void calcRowLinear_32F(float *dst[],
const Size & outSz,
int lpi);
//----------------------------------------------------------------------
// Prototypes of the row merge/split kernels (declarations only).
// mergeRow_* take one read-only input array per channel plus a single output array;
// splitRow_* take a single read-only input array plus one output array per channel.
// NOTE(review): the 8U/32F and C2/C3/C4 suffixes presumably encode element type and
// channel count -- consistent with the parameter lists, but confirm against the Impl.
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
                   uint8_t out[], int length);
void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
                   uint8_t out[], int length);
void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[],
                   const uint8_t in2[], const uint8_t in3[],
                   uint8_t out[], int length);

void mergeRow_32FC2(const float in0[], const float in1[],
                    float out[], int length);
void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
                    float out[], int length);
void mergeRow_32FC4(const float in0[], const float in1[],
                    const float in2[], const float in3[],
                    float out[], int length);

void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], int length);
void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], uint8_t out2[], int length);
void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[], uint8_t out1[], uint8_t out2[], uint8_t out3[],
                   int length);

void splitRow_32FC2(const float in[],
                    float out0[], float out1[], int length);
void splitRow_32FC3(const float in[],
                    float out0[], float out1[], float out2[], int length);
void splitRow_32FC4(const float in[],
                    float out0[], float out1[], float out2[], float out3[],
                    int length);
// Primary declaration of chanToPlaneRowImpl. The first parameter is an unnamed
// tag value of type isa_tag_t; `in` is read-only, `out` is writable.
// NOTE(review): presumably copies channel `chan` of `chs` interleaved channels
// into a planar row -- confirm at the definition.
template <typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t,
                        const T* in,
                        const int chan,
                        const int chs,
                        T* out,
                        const int length);
@@ -199,6 +125,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
extern template void i420ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
template<typename isa_tag_t, typename T, int chs>
void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
extern template void splitRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
extern template void splitRowImpl<sse42_tag, float, 2>(sse42_tag, const float* in, std::array<float*, 2>& outs, const int length);
extern template void splitRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
extern template void splitRowImpl<sse42_tag, float, 3>(sse42_tag, const float* in, std::array<float*, 3>& outs, const int length);
extern template void splitRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
extern template void splitRowImpl<sse42_tag, float, 4>(sse42_tag, const float* in, std::array<float*, 4>& outs, const int length);
// ISA-tagged merge kernel: reads from the `chs` source pointers held in `ins`
// and writes the row `out`, for `length` pixels. The tag argument only selects
// the instruction-set specific implementation.
// NOTE(review): presumably out[chs*x + c] = ins[c][x]; confirm against the
// definition, which is not visible here.
template<typename isa_tag_t, typename T, int chs>
void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
// Explicit instantiation declarations for the SSE4.2 tag, covering
// {uint8_t, float} x {2, 3, 4} channels. `extern template` suppresses implicit
// instantiation in files that include this header, so a matching explicit
// instantiation definition has to be provided elsewhere.
extern template void mergeRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<sse42_tag, float, 2>(sse42_tag, const std::array<const float*, 2>& ins, float* out, const int length);
extern template void mergeRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -42,396 +42,10 @@
namespace InferenceEngine {
namespace gapi {
//using namespace kernels;
namespace kernels {
// Merges `chs` planar rows into one interleaved row of `length` pixels:
//     out[chs*x + c] = ins[c][x]
// The rows arrive as raw byte pointers; T is the real element type, and the
// pointers are reinterpret_cast to T (or float) before they are dereferenced.
//
// Dispatch order: AVX512 (currently compiled out), AVX2 and SSE4.2, each behind
// a runtime CPU check, then NEON (selected at compile time only), and finally
// the portable scalar loop at the bottom. Only (uint8_t | float) x (2 | 3 | 4
// channels) have SIMD paths; any other combination falls through to the
// scalar loop.
//
// T and chs are template parameters, so every `if` condition below is a
// compile-time constant. Branches that do not match the instantiation are
// still compiled, which is why they may index ins[k] with k >= chs, but they
// are never executed.
template<typename T, int chs> static
void mergeRow(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int length) {
// AVX512 implementation of wide universal intrinsics is slower than AVX2.
// It is turned off (via `#if 0`) until the cause is found.
#if 0
#ifdef HAVE_AVX512
if (with_cpu_x86_avx512f()) {
if (std::is_same<T, uint8_t>::value && chs == 2) {
avx512::mergeRow_8UC2(ins[0], ins[1], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
avx512::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
avx512::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
avx512::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
avx512::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
avx512::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<const float*>(ins[3]),
reinterpret_cast<float*>(out), length);
return;
}
}
#endif // HAVE_AVX512
#endif
// AVX2 path: taken only when the build has AVX2 support and the CPU reports it.
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
if (std::is_same<T, uint8_t>::value && chs == 2) {
avx::mergeRow_8UC2(ins[0], ins[1], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
avx::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
avx::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
avx::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
avx::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
avx::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<const float*>(ins[3]),
reinterpret_cast<float*>(out), length);
return;
}
}
#endif // HAVE_AVX2
// SSE4.2 path. The kernels are called unqualified.
// NOTE(review): presumably these resolve to the SSE4.2 kernels declared in the
// enclosing `kernels` namespace; confirm against the included headers.
#ifdef HAVE_SSE
if (with_cpu_x86_sse42()) {
if (std::is_same<T, uint8_t>::value && chs == 2) {
mergeRow_8UC2(ins[0], ins[1], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<const float*>(ins[3]),
reinterpret_cast<float*>(out), length);
return;
}
}
#endif // HAVE_SSE
// NEON path: no runtime CPU check, the choice is made at compile time only.
#ifdef HAVE_NEON
if (std::is_same<T, uint8_t>::value && chs == 2) {
neon::mergeRow_8UC2(ins[0], ins[1], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
neon::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
neon::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
neon::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
neon::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
neon::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<const float*>(ins[3]),
reinterpret_cast<float*>(out), length);
return;
}
#endif // HAVE_NEON
// Portable fallback: view the byte pointers as T and interleave one element
// at a time.
const T* insT[chs];
for (int c = 0; c < chs; c++) {
insT[c] = reinterpret_cast<const T*>(ins[c]);
}
auto outT = reinterpret_cast<T*>(out);
for (int x = 0; x < length; x++) {
for (int c = 0; c < chs; c++) {
outT[chs*x + c] = insT[c][x];
}
}
}
// Splits one interleaved row of `length` pixels into `chs` planar rows:
//     outs[c][x] = in[chs*x + c]
// The rows arrive as raw byte pointers; T is the real element type, and the
// pointers are reinterpret_cast to T (or float) before they are dereferenced.
//
// Dispatch order: AVX512, AVX2 and SSE4.2, each behind a runtime CPU check,
// then NEON (selected at compile time only), and finally the portable scalar
// loop at the bottom. The AVX512 branch is active here (there is no `#if 0`
// guard around it). Only (uint8_t | float) x (2 | 3 | 4 channels) have SIMD
// paths; any other combination falls through to the scalar loop.
//
// T and chs are template parameters, so every `if` condition below is a
// compile-time constant. Branches that do not match the instantiation are
// still compiled, which is why they may index outs[k] with k >= chs, but they
// are never executed.
template<typename T, int chs> static
void splitRow(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length) {
#ifdef HAVE_AVX512
if (with_cpu_x86_avx512f()) {
if (std::is_same<T, uint8_t>::value && chs == 2) {
avx512::splitRow_8UC2(in, outs[0], outs[1], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
avx512::splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
avx512::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
avx512::splitRow_32FC2(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
avx512::splitRow_32FC3(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
avx512::splitRow_32FC4(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
reinterpret_cast<float*>(outs[3]),
length);
return;
}
}
#endif // HAVE_AVX512
// AVX2 path: taken only when the build has AVX2 support and the CPU reports it.
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
if (std::is_same<T, uint8_t>::value && chs == 2) {
avx::splitRow_8UC2(in, outs[0], outs[1], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
avx::splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
avx::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
avx::splitRow_32FC2(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
avx::splitRow_32FC3(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
avx::splitRow_32FC4(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
reinterpret_cast<float*>(outs[3]),
length);
return;
}
}
#endif // HAVE_AVX2
// SSE4.2 path. The kernels are called unqualified.
// NOTE(review): presumably these resolve to the SSE4.2 kernels declared in the
// enclosing `kernels` namespace; confirm against the included headers.
#ifdef HAVE_SSE
if (with_cpu_x86_sse42()) {
if (std::is_same<T, uint8_t>::value && chs == 2) {
splitRow_8UC2(in, outs[0], outs[1], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
splitRow_32FC2(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
splitRow_32FC3(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
splitRow_32FC4(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
reinterpret_cast<float*>(outs[3]),
length);
return;
}
}
#endif // HAVE_SSE
// NEON path: no runtime CPU check, the choice is made at compile time only.
#ifdef HAVE_NEON
if (std::is_same<T, uint8_t>::value && chs == 2) {
neon::splitRow_8UC2(in, outs[0], outs[1], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
neon::splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
neon::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
neon::splitRow_32FC2(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
neon::splitRow_32FC3(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
neon::splitRow_32FC4(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
reinterpret_cast<float*>(outs[3]),
length);
return;
}
#endif // HAVE_NEON
// Portable fallback: view the byte pointers as T and de-interleave one
// element at a time.
auto inT = reinterpret_cast<const T*>(in);
T* outsT[chs];
for (int c = 0; c < chs; c++) {
outsT[c] = reinterpret_cast<T*>(outs[c]);
}
for (int x = 0; x < length; x++) {
for (int c = 0; c < chs; c++) {
outsT[c][x] = inT[chs*x + c];
}
}
}
namespace {
struct fp_16_t {
@@ -583,168 +197,108 @@ bool is_cv_type_in_list(const int type_id) {
}
namespace {
using merge_supported_types = typelist<uint8_t, int8_t, uint16_t, int16_t, int32_t, float, fp_16_t>;
template<int chs>
// Scalar (no SIMD) merge: interleaves `chs` planar rows into `out`, so that
// out[chs*x + c] == ins[c][x] for every x in [0, length).
// The first parameter is only a tag that selects this overload.
template<typename T, int chs>
void mergeRowImpl(scalar_tag, const std::array<const T*, chs>& ins, T* out, const int length) {
    T* dst = out;  // running write position in the interleaved row
    for (int x = 0; x < length; ++x) {
        // Emit pixel x: one element from every plane, in channel order.
        for (const T* plane : ins) {
            *dst++ = plane[x];
        }
    }
}
template<typename isa_tag_t, int chs>
struct typed_merge_row {
using p_f = void (*)(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int length);
using p_f = void (*)(const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length);
template <typename type>
p_f operator()(type_to_type<type> ) { return mergeRow<type, chs>; }
typename std::enable_if<std::is_same<isa_tag_t, scalar_tag>::value ||
(!std::is_same<isa_tag_t, scalar_tag>::value && !std::is_same<type, uint8_t>::value &&
!std::is_same<type, float>::value), p_f>::type
operator()(type_to_type<type> ) {
return [](const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length) {
const auto inT = reinterpret_cast<const std::array<const type*, chs>&>(ins);
auto outT = reinterpret_cast<type*>(out);
scalar_tag t;
mergeRowImpl<type, chs>(t, inT, outT, length);
};
}
p_f operator()(type_to_type<fp_16_t> ) {
static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v),
"fp_16_t should be a plain wrap over FP16 implementation type");
return mergeRow<decltype(fp_16_t::v), chs>;
template<typename tag = isa_tag_t>
typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
operator()(type_to_type<uint8_t>) {
return [](const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length) {
tag t;
mergeRowImpl<tag, uint8_t, chs>(t, ins, out, length);
};
}
template<typename tag = isa_tag_t>
typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
operator()(type_to_type<float>) {
return [](const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length) {
const auto inT = reinterpret_cast<const std::array<const float*, chs>&>(ins);
auto outT = reinterpret_cast<float*>(out);
tag t;
mergeRowImpl<tag, float, chs>(t, inT, outT, length);
};
}
};
} // namespace
GAPI_FLUID_KERNEL(FMerge2, Merge2, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Merges the views `a` and `b` into the 2-channel buffer `out`, line by
    // line, using the row function selected by the output depth.
    static void run(const cv::gapi::fluid::View& a,
                    const cv::gapi::fluid::View& b,
                    cv::gapi::fluid::Buffer& out) {
        const auto depth = out.meta().depth;
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(depth));

        const auto mergeLine = type_dispatch<merge_supported_types>(depth, cv_type_id{},
                                                                   typed_merge_row<2>{}, nullptr);

        for (int line = 0; line < out.lpi(); ++line) {
            const std::array<const uint8_t*, 2> planes = {{a.InLineB(line), b.InLineB(line)}};
            mergeLine(planes, out.OutLineB(line), a.length());
        }
    }
};
GAPI_FLUID_KERNEL(FMerge3, Merge3, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Merges the views `a`, `b` and `c` into the 3-channel buffer `out`, line
    // by line, using the row function selected by the output depth.
    static void run(const cv::gapi::fluid::View& a,
                    const cv::gapi::fluid::View& b,
                    const cv::gapi::fluid::View& c,
                    cv::gapi::fluid::Buffer& out) {
        const auto depth = out.meta().depth;
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(depth));

        const auto mergeLine = type_dispatch<merge_supported_types>(depth, cv_type_id{},
                                                                   typed_merge_row<3>{}, nullptr);

        for (int line = 0; line < out.lpi(); ++line) {
            const std::array<const uint8_t*, 3> planes = {{a.InLineB(line), b.InLineB(line),
                                                           c.InLineB(line)}};
            mergeLine(planes, out.OutLineB(line), a.length());
        }
    }
};
GAPI_FLUID_KERNEL(FMerge4, Merge4, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Merges the views `a`, `b`, `c` and `d` into the 4-channel buffer `out`,
    // line by line, using the row function selected by the output depth.
    static void run(const cv::gapi::fluid::View& a,
                    const cv::gapi::fluid::View& b,
                    const cv::gapi::fluid::View& c,
                    const cv::gapi::fluid::View& d,
                    cv::gapi::fluid::Buffer& out) {
        const auto depth = out.meta().depth;
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(depth));

        const auto mergeLine = type_dispatch<merge_supported_types>(depth, cv_type_id{},
                                                                   typed_merge_row<4>{}, nullptr);

        for (int line = 0; line < out.lpi(); ++line) {
            const std::array<const uint8_t*, 4> planes = {{a.InLineB(line), b.InLineB(line),
                                                           c.InLineB(line), d.InLineB(line)}};
            mergeLine(planes, out.OutLineB(line), a.length());
        }
    }
};
namespace {
using split_supported_types = typelist<uint8_t, int8_t, uint16_t, int16_t, int32_t, float, fp_16_t>;
template<int chs>
// Scalar (no SIMD) split: de-interleaves the row `in` into `chs` planar rows,
// so that outs[c][x] == in[chs*x + c] for every x in [0, length).
// The first parameter is only a tag that selects this overload.
template<typename T, int chs>
void splitRowImpl(scalar_tag, const T* in, std::array<T*, chs>& outs, const int length) {
    const T* src = in;  // running read position in the interleaved row
    for (int x = 0; x < length; ++x) {
        // Consume pixel x: one element goes to every plane, in channel order.
        for (T* plane : outs) {
            plane[x] = *src++;
        }
    }
}
template<typename isa_tag_t, int chs>
struct typed_split_row {
using p_f = void (*)(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length);
using p_f = void (*)(const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length);
template <typename type>
p_f operator()(type_to_type<type> ) { return splitRow<type, chs>; }
typename std::enable_if<std::is_same<isa_tag_t, scalar_tag>::value ||
(!std::is_same<isa_tag_t, scalar_tag>::value && !std::is_same<type, uint8_t>::value &&
!std::is_same<type, float>::value), p_f>::type
operator()(type_to_type<type> ) {
return [](const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length) {
const auto inT = reinterpret_cast<const type*>(in);
auto outT = reinterpret_cast<std::array<type*, chs>&>(outs);
scalar_tag t;
splitRowImpl<type, chs>(t, inT, outT, length);
};
}
p_f operator()(type_to_type<fp_16_t> ) {
static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v),
"fp_16_t should be a plain wrap over FP16 implementation type");
return splitRow<decltype(fp_16_t::v), chs>;
template<typename tag = isa_tag_t>
typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
operator()(type_to_type<uint8_t>) {
return [](const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length) {
tag t;
splitRowImpl<tag, uint8_t, chs>(t, in, outs, length);
};
}
template<typename tag = isa_tag_t>
typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
operator()(type_to_type<float>) {
return [](const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length) {
const auto inT = reinterpret_cast<const float*>(in);
auto outT = reinterpret_cast<std::array<float*, chs>&>(outs);
tag t;
splitRowImpl<tag, float, chs>(t, inT, outT, length);
};
}
};
} // namespace
GAPI_FLUID_KERNEL(FSplit2, Split2, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Splits the 2-channel view `in` into the single-channel buffers `out1`
    // and `out2`, line by line, using the row function selected by the input
    // depth.
    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2) {
        const auto depth = in.meta().depth;

        // Preconditions: channel counts, matching depths, supported depth.
        GAPI_DbgAssert(2 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(depth == out1.meta().depth);
        GAPI_DbgAssert(depth == out2.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(depth));

        const auto splitLine = type_dispatch<split_supported_types>(depth, cv_type_id{},
                                                                   typed_split_row<2>{}, nullptr);

        const int lines = out1.lpi();
        for (int line = 0; line < lines; ++line) {
            std::array<uint8_t*, 2> planes = {{out1.OutLineB(line), out2.OutLineB(line)}};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
GAPI_FLUID_KERNEL(FSplit3, Split3, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Splits the 3-channel view `in` into the single-channel buffers `out1`,
    // `out2` and `out3`, line by line, using the row function selected by the
    // input depth.
    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2,
                    cv::gapi::fluid::Buffer& out3) {
        const auto depth = in.meta().depth;

        // Preconditions: channel counts, matching depths, supported depth.
        GAPI_DbgAssert(3 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(1 == out3.meta().chan);
        GAPI_DbgAssert(depth == out1.meta().depth);
        GAPI_DbgAssert(depth == out2.meta().depth);
        GAPI_DbgAssert(depth == out3.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(depth));

        const auto splitLine = type_dispatch<split_supported_types>(depth, cv_type_id{},
                                                                   typed_split_row<3>{}, nullptr);

        const int lines = out1.lpi();
        for (int line = 0; line < lines; ++line) {
            std::array<uint8_t*, 3> planes = {{out1.OutLineB(line), out2.OutLineB(line),
                                               out3.OutLineB(line)}};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
GAPI_FLUID_KERNEL(FSplit4, Split4, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Splits the 4-channel view `in` into the single-channel buffers `out1`
    // .. `out4`, line by line, using the row function selected by the input
    // depth.
    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2,
                    cv::gapi::fluid::Buffer& out3,
                    cv::gapi::fluid::Buffer& out4) {
        const auto depth = in.meta().depth;

        // Preconditions: channel counts, matching depths, supported depth.
        GAPI_DbgAssert(4 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(1 == out3.meta().chan);
        GAPI_DbgAssert(1 == out4.meta().chan);
        GAPI_DbgAssert(depth == out1.meta().depth);
        GAPI_DbgAssert(depth == out2.meta().depth);
        GAPI_DbgAssert(depth == out3.meta().depth);
        GAPI_DbgAssert(depth == out4.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(depth));

        const auto splitLine = type_dispatch<split_supported_types>(depth, cv_type_id{},
                                                                   typed_split_row<4>{}, nullptr);

        const int lines = out1.lpi();
        for (int line = 0; line < lines; ++line) {
            std::array<uint8_t*, 4> planes = {{out1.OutLineB(line), out2.OutLineB(line),
                                               out3.OutLineB(line), out4.OutLineB(line)}};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
//----------------------------------------------------------------------
using isas_set = typelist<
#ifdef HAVE_AVX512
@@ -1005,36 +559,179 @@ GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) {
rowFunc(y_rows, u_row, v_row, out_rows, buf_width);
}
};
GAPI_FLUID_KERNEL(FSplit2, Split2, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Splits the 2-channel view `in` into the single-channel buffers `out1`
    // and `out2`, line by line. The row function is selected by the input
    // depth and by the enclosing scope's isa_tag_t.
    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2) {
        const auto depth = in.meta().depth;

        // Preconditions: channel counts, matching depths, supported depth.
        GAPI_DbgAssert(2 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(depth == out1.meta().depth);
        GAPI_DbgAssert(depth == out2.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(depth));

        const auto splitLine = type_dispatch<split_supported_types>(depth, cv_type_id{},
                                                                   typed_split_row<isa_tag_t, 2>{}, nullptr);

        const int lines = out1.lpi();
        for (int line = 0; line < lines; ++line) {
            std::array<uint8_t*, 2> planes = {{out1.OutLineB(line), out2.OutLineB(line)}};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
GAPI_FLUID_KERNEL(FSplit3, Split3, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Splits the 3-channel view `in` into the single-channel buffers `out1`,
    // `out2` and `out3`, line by line. The row function is selected by the
    // input depth and by the enclosing scope's isa_tag_t.
    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2,
                    cv::gapi::fluid::Buffer& out3) {
        const auto depth = in.meta().depth;

        // Preconditions: channel counts, matching depths, supported depth.
        GAPI_DbgAssert(3 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(1 == out3.meta().chan);
        GAPI_DbgAssert(depth == out1.meta().depth);
        GAPI_DbgAssert(depth == out2.meta().depth);
        GAPI_DbgAssert(depth == out3.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(depth));

        const auto splitLine = type_dispatch<split_supported_types>(depth, cv_type_id{},
                                                                   typed_split_row<isa_tag_t, 3>{}, nullptr);

        const int lines = out1.lpi();
        for (int line = 0; line < lines; ++line) {
            std::array<uint8_t*, 3> planes = {{out1.OutLineB(line), out2.OutLineB(line),
                                               out3.OutLineB(line)}};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
GAPI_FLUID_KERNEL(FSplit4, Split4, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Splits the 4-channel view `in` into the single-channel buffers `out1`
    // .. `out4`, line by line. The row function is selected by the input
    // depth and by the enclosing scope's isa_tag_t.
    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2,
                    cv::gapi::fluid::Buffer& out3,
                    cv::gapi::fluid::Buffer& out4) {
        const auto depth = in.meta().depth;

        // Preconditions: channel counts, matching depths, supported depth.
        GAPI_DbgAssert(4 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(1 == out3.meta().chan);
        GAPI_DbgAssert(1 == out4.meta().chan);
        GAPI_DbgAssert(depth == out1.meta().depth);
        GAPI_DbgAssert(depth == out2.meta().depth);
        GAPI_DbgAssert(depth == out3.meta().depth);
        GAPI_DbgAssert(depth == out4.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(depth));

        const auto splitLine = type_dispatch<split_supported_types>(depth, cv_type_id{},
                                                                   typed_split_row<isa_tag_t, 4>{}, nullptr);

        const int lines = out1.lpi();
        for (int line = 0; line < lines; ++line) {
            std::array<uint8_t*, 4> planes = {{out1.OutLineB(line), out2.OutLineB(line),
                                               out3.OutLineB(line), out4.OutLineB(line)}};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
GAPI_FLUID_KERNEL(FMerge2, Merge2, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Merges the views `a` and `b` into the 2-channel buffer `out`, line by
    // line. The row function is selected by the output depth and by the
    // enclosing scope's isa_tag_t.
    static void run(const cv::gapi::fluid::View& a,
                    const cv::gapi::fluid::View& b,
                    cv::gapi::fluid::Buffer& out) {
        const auto depth = out.meta().depth;
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(depth));

        const auto mergeLine = type_dispatch<merge_supported_types>(depth, cv_type_id{},
                                                                   typed_merge_row<isa_tag_t, 2>{}, nullptr);

        for (int line = 0; line < out.lpi(); ++line) {
            const std::array<const uint8_t*, 2> planes = {{a.InLineB(line), b.InLineB(line)}};
            mergeLine(planes, out.OutLineB(line), a.length());
        }
    }
};
GAPI_FLUID_KERNEL(FMerge3, Merge3, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Merges the views `a`, `b` and `c` into the 3-channel buffer `out`, line
    // by line. The row function is selected by the output depth and by the
    // enclosing scope's isa_tag_t.
    static void run(const cv::gapi::fluid::View& a,
                    const cv::gapi::fluid::View& b,
                    const cv::gapi::fluid::View& c,
                    cv::gapi::fluid::Buffer& out) {
        const auto depth = out.meta().depth;
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(depth));

        const auto mergeLine = type_dispatch<merge_supported_types>(depth, cv_type_id{},
                                                                   typed_merge_row<isa_tag_t, 3>{}, nullptr);

        for (int line = 0; line < out.lpi(); ++line) {
            const std::array<const uint8_t*, 3> planes = {{a.InLineB(line), b.InLineB(line),
                                                           c.InLineB(line)}};
            mergeLine(planes, out.OutLineB(line), a.length());
        }
    }
};
GAPI_FLUID_KERNEL(FMerge4, Merge4, false) {
    static const int LPI = 4;
    static const int Window = 1;

    // Merges the views `a`, `b`, `c` and `d` into the 4-channel buffer `out`,
    // line by line. The row function is selected by the output depth and by
    // the enclosing scope's isa_tag_t.
    static void run(const cv::gapi::fluid::View& a,
                    const cv::gapi::fluid::View& b,
                    const cv::gapi::fluid::View& c,
                    const cv::gapi::fluid::View& d,
                    cv::gapi::fluid::Buffer& out) {
        const auto depth = out.meta().depth;
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(depth));

        const auto mergeLine = type_dispatch<merge_supported_types>(depth, cv_type_id{},
                                                                   typed_merge_row<isa_tag_t, 4>{}, nullptr);

        for (int line = 0; line < out.lpi(); ++line) {
            const std::array<const uint8_t*, 4> planes = {{a.InLineB(line), b.InLineB(line),
                                                           c.InLineB(line), d.InLineB(line)}};
            mergeLine(planes, out.OutLineB(line), a.length());
        }
    }
};
};
namespace {
struct ColorConversionISA {
struct CC_and_MergeISA {
cv::gapi::GKernelPackage& pckg;
ColorConversionISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}
CC_and_MergeISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}
template<typename isa_tag_t>
bool operator()(type_to_type<isa_tag_t>) {
pckg.include<typename choose_impl<isa_tag_t>::FI420toRGB>();
pckg.include<typename choose_impl<isa_tag_t>::FNV12toRGB>();
pckg.include<typename choose_impl<isa_tag_t>::FChanToPlane>();
pckg.include<typename choose_impl<isa_tag_t>::FMerge2>();
pckg.include<typename choose_impl<isa_tag_t>::FMerge3>();
pckg.include<typename choose_impl<isa_tag_t>::FMerge4>();
//at the moment type_dispatch requires something to be returned by the lambda
return true;
}
};
struct SplitISA {
cv::gapi::GKernelPackage& pckg;
SplitISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}
template<typename isa_tag_t>
bool operator()(type_to_type<isa_tag_t>) {
pckg.include<typename choose_impl<isa_tag_t>::FSplit2>();
pckg.include<typename choose_impl<isa_tag_t>::FSplit3>();
pckg.include<typename choose_impl<isa_tag_t>::FSplit4>();
//at the moment type_dispatch requires something to be returned by the lambda
return true;
}
};
} //namespace
cv::gapi::GKernelPackage FColorConversionChooseISA() {
cv::gapi::GKernelPackage FKernelsChooseISA() {
// At the moment AVX512 implementation of wide universal intrinsics is slower than AVX2.
// So, disable it for now.
using isas = remove_t<isas_set, avx512_tag>;
cv::gapi::GKernelPackage pckg;
ColorConversionISA ctpISA{pckg};
cv::gapi::GKernelPackage pckg1, pckg2;
CC_and_MergeISA ccISA{ pckg1 };
SplitISA sISA{ pckg2 };
type_dispatch<isas>(is_isa_present{}, ctpISA, false);
type_dispatch<isas>(is_isa_present{}, ccISA, false);
type_dispatch<isas_set>(is_isa_present{}, sISA, false);
return pckg;
return combine(pckg1, pckg2);
}
//----------------------------------------------------------------------
@@ -2601,7 +2298,7 @@ using namespace kernels;
cv::gapi::GKernelPackage preprocKernels() {
return combine(
FColorConversionChooseISA(),
FKernelsChooseISA(),
cv::gapi::kernels
<FScalePlanes
, FScalePlanes4
@@ -2612,12 +2309,6 @@ cv::gapi::GKernelPackage preprocKernels() {
, FUpscalePlaneArea32f
, FScalePlaneArea8u
, FScalePlaneArea32f
, FMerge2
, FMerge3
, FMerge4
, FSplit2
, FSplit3
, FSplit4
, FConvertDepth
, FSubC
, FDivC

View File

@@ -18,360 +18,204 @@ namespace gapi {
namespace kernels {
CV_ALWAYS_INLINE void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
uint8_t out[], int length) {
int l = 0;
template <typename VecT, typename T>
CV_ALWAYS_INLINE void mergeRowC2_Impl(const T in0[], const T in1[],
T out[], const int length) {
int x = 0;
#if MANUAL_SIMD
constexpr int nlanes = v_uint8::nlanes;
constexpr int nlanes = VecT::nlanes;
GAPI_DbgAssert(length >= nlanes);
cycle:
for (; l <= length - nlanes; l += nlanes) {
v_uint8 r0, r1;
r0 = vx_load(&in0[l]);
r1 = vx_load(&in1[l]);
v_store_interleave(&out[2*l], r0, r1);
}
VecT r0, r1;
for (; length >= nlanes;) {
for (; x <= length - nlanes; x += nlanes) {
r0 = vx_load(&in0[x]);
r1 = vx_load(&in1[x]);
v_store_interleave(&out[2*x], r0, r1);
}
// to think about how to remove those ifs
if (l < length && length >= nlanes) {
l = length - nlanes;
goto cycle;
if (x < length) {
x = length - nlanes;
continue;
}
break;
}
#endif
for (; l < length; ++l) {
out[2*l + 0] = in0[l];
out[2*l + 1] = in1[l];
for (; x < length; ++x) {
out[2*x + 0] = in0[x];
out[2*x + 1] = in1[x];
}
}
CV_ALWAYS_INLINE void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
const uint8_t in2[], uint8_t out[], int length) {
int l = 0;
template <typename VecT, typename T>
CV_ALWAYS_INLINE void mergeRowC3_Impl(const T in0[], const T in1[],
const T in2[], T out[], const int length) {
int x = 0;
#if MANUAL_SIMD
constexpr int nlanes = v_uint8::nlanes;
constexpr int nlanes = VecT::nlanes;
GAPI_DbgAssert(length >= nlanes);
cycle:
for (; l <= length - nlanes; l += nlanes) {
v_uint8 r0, r1, r2;
r0 = vx_load(&in0[l]);
r1 = vx_load(&in1[l]);
r2 = vx_load(&in2[l]);
v_store_interleave(&out[3*l], r0, r1, r2);
}
VecT r0, r1, r2;
for (; length >= nlanes;) {
for (; x <= length - nlanes; x += nlanes) {
r0 = vx_load(&in0[x]);
r1 = vx_load(&in1[x]);
r2 = vx_load(&in2[x]);
v_store_interleave(&out[3*x], r0, r1, r2);
}
if (l < length && length >= nlanes) {
l = length - nlanes;
goto cycle;
if (x < length) {
x = length - nlanes;
continue;
}
break;
}
#endif
for (; l < length; ++l) {
out[3*l + 0] = in0[l];
out[3*l + 1] = in1[l];
out[3*l + 2] = in2[l];
for (; x < length; ++x) {
out[3*x + 0] = in0[x];
out[3*x + 1] = in1[x];
out[3*x + 2] = in2[x];
}
}
CV_ALWAYS_INLINE void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[],
const uint8_t in2[], const uint8_t in3[],
uint8_t out[], int length) {
int l = 0;
template <typename VecT, typename T>
CV_ALWAYS_INLINE void mergeRowC4_Impl(const T in0[], const T in1[],
const T in2[], const T in3[],
T out[], const int length) {
int x = 0;
#if MANUAL_SIMD
constexpr int nlanes = v_uint8::nlanes;
constexpr int nlanes = VecT::nlanes;
GAPI_DbgAssert(length >= nlanes);
cycle:
for (; l <= length - nlanes; l += nlanes) {
v_uint8 r0, r1, r2, r3;
r0 = vx_load(&in0[l]);
r1 = vx_load(&in1[l]);
r2 = vx_load(&in2[l]);
r3 = vx_load(&in3[l]);
v_store_interleave(&out[4*l], r0, r1, r2, r3);
}
VecT r0, r1, r2, r3;
for (; length >= nlanes;) {
for (; x <= length - nlanes; x += nlanes) {
r0 = vx_load(&in0[x]);
r1 = vx_load(&in1[x]);
r2 = vx_load(&in2[x]);
r3 = vx_load(&in3[x]);
v_store_interleave(&out[4* x], r0, r1, r2, r3);
}
if (l < length && length >= nlanes) {
l = length - nlanes;
goto cycle;
if (x < length) {
x = length - nlanes;
continue;
}
break;
}
#endif
for (; l < length; ++l) {
out[4*l + 0] = in0[l];
out[4*l + 1] = in1[l];
out[4*l + 2] = in2[l];
out[4*l + 3] = in3[l];
for (; x < length; ++x) {
out[4*x + 0] = in0[x];
out[4*x + 1] = in1[x];
out[4*x + 2] = in2[x];
out[4*x + 3] = in3[x];
}
}
// Interleaves two float planes into one 2-channel row:
//     out[2*x + 0] = in0[x],  out[2*x + 1] = in1[x]   for x in [0, length).
// With MANUAL_SIMD enabled and a row at least one vector long, the whole row
// is processed with vector loads/stores. A leftover tail is covered by
// re-running the vector body on the last full vector of the row, which
// overlaps elements that were already written. The scalar loop then only
// handles rows shorter than one vector (or builds without MANUAL_SIMD).
CV_ALWAYS_INLINE void mergeRow_32FC2_Impl(const float in0[], const float in1[],
                                          float out[], int length) {
    int x = 0;

#if MANUAL_SIMD
    constexpr int step = v_float32::nlanes;
    const int lastFull = length - step;  // start of the last full vector; negative for short rows

    if (lastFull >= 0) {
        for (;;) {
            while (x <= lastFull) {
                const v_float32 a = vx_load(in0 + x);
                const v_float32 b = vx_load(in1 + x);
                v_store_interleave(out + 2 * x, a, b);
                x += step;
            }
            if (x >= length) {
                break;
            }
            // Tail shorter than a vector: step back so the final vector ends
            // exactly at the end of the row.
            x = lastFull;
        }
    }
#endif

    while (x < length) {
        float* px = out + 2 * x;
        px[0] = in0[x];
        px[1] = in1[x];
        ++x;
    }
}
// Interleaves three float planes into one 3-channel row:
//     out[3*x + c] = in<c>[x]   for c in {0, 1, 2}, x in [0, length).
// With MANUAL_SIMD enabled and a row at least one vector long, the whole row
// is processed with vector loads/stores. A leftover tail is covered by
// re-running the vector body on the last full vector of the row, which
// overlaps elements that were already written. The scalar loop then only
// handles rows shorter than one vector (or builds without MANUAL_SIMD).
CV_ALWAYS_INLINE void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[],
                                          float out[], int length) {
    int x = 0;

#if MANUAL_SIMD
    constexpr int step = v_float32::nlanes;
    const int lastFull = length - step;  // start of the last full vector; negative for short rows

    if (lastFull >= 0) {
        for (;;) {
            while (x <= lastFull) {
                const v_float32 a = vx_load(in0 + x);
                const v_float32 b = vx_load(in1 + x);
                const v_float32 c = vx_load(in2 + x);
                v_store_interleave(out + 3 * x, a, b, c);
                x += step;
            }
            if (x >= length) {
                break;
            }
            // Tail shorter than a vector: step back so the final vector ends
            // exactly at the end of the row.
            x = lastFull;
        }
    }
#endif

    while (x < length) {
        float* px = out + 3 * x;
        px[0] = in0[x];
        px[1] = in1[x];
        px[2] = in2[x];
        ++x;
    }
}
// Interleaves four float planes into one 4-channel row:
//     out[4*x + c] = in<c>[x]   for c in {0, 1, 2, 3}, x in [0, length).
// With MANUAL_SIMD enabled and a row at least one vector long, the whole row
// is processed with vector loads/stores. A leftover tail is covered by
// re-running the vector body on the last full vector of the row, which
// overlaps elements that were already written. The scalar loop then only
// handles rows shorter than one vector (or builds without MANUAL_SIMD).
CV_ALWAYS_INLINE void mergeRow_32FC4_Impl(const float in0[], const float in1[],
                                          const float in2[], const float in3[],
                                          float out[], int length) {
    int x = 0;

#if MANUAL_SIMD
    constexpr int step = v_float32::nlanes;
    const int lastFull = length - step;  // start of the last full vector; negative for short rows

    if (lastFull >= 0) {
        for (;;) {
            while (x <= lastFull) {
                const v_float32 a = vx_load(in0 + x);
                const v_float32 b = vx_load(in1 + x);
                const v_float32 c = vx_load(in2 + x);
                const v_float32 d = vx_load(in3 + x);
                v_store_interleave(out + 4 * x, a, b, c, d);
                x += step;
            }
            if (x >= length) {
                break;
            }
            // Tail shorter than a vector: step back so the final vector ends
            // exactly at the end of the row.
            x = lastFull;
        }
    }
#endif

    while (x < length) {
        float* px = out + 4 * x;
        px[0] = in0[x];
        px[1] = in1[x];
        px[2] = in2[x];
        px[3] = in3[x];
        ++x;
    }
}
//------------------------------------------------------------------------------
// 8-bit specific kernel: de-interleaves a 2-channel uint8_t row into two planes,
//   out0[i] = in[2*i + 0], out1[i] = in[2*i + 1]   for every i in [0, length).
//
// in         - interleaved source row, 2 * `length` elements.
// out0, out1 - destination planes, `length` elements each.
// length     - number of elements written to each plane.
//
// With MANUAL_SIMD the row is processed `nlanes` elements at a time; when a
// partial tail remains and the row holds at least one full vector, the index is
// moved back to `length - nlanes` and the vector loop runs once more, so the
// last vector overlaps elements that were already written.
CV_ALWAYS_INLINE void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
                                         uint8_t out1[], int length) {
    int l = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_uint8::nlanes;

cycle:
    for (; l <= length - nlanes; l += nlanes) {
        v_uint8 r0, r1;
        v_load_deinterleave(&in[2*l], r0, r1);
        vx_store(&out0[l], r0);
        vx_store(&out1[l], r1);
    }

    if (l < length && length >= nlanes) {
        l = length - nlanes;
        goto cycle;
    }
#endif

    // Scalar path: the whole row without MANUAL_SIMD, or a row shorter than one vector.
    for (; l < length; ++l) {
        out0[l] = in[2*l + 0];
        out1[l] = in[2*l + 1];
    }
}

// Type-generic kernel: de-interleaves a 2-channel row of T into two planes,
//   out0[x] = in[2*x + 0], out1[x] = in[2*x + 1]   for every x in [0, length).
//
// VecT       - vector type used on the MANUAL_SIMD path; must expose a static
//              `nlanes` and be usable with v_load_deinterleave / vx_store.
// in         - interleaved source row, 2 * `length` elements.
// out0, out1 - destination planes, `length` elements each.
// length     - number of elements written to each plane.
//
// On the MANUAL_SIMD path GAPI_DbgAssert checks `length >= nlanes`
// (presumably active in debug builds only - confirm against its definition).
template <typename VecT, typename T>
CV_ALWAYS_INLINE void splitRowC2_Impl(const T in[], T out0[],
                                      T out1[], const int length) {
    int x = 0;

#if MANUAL_SIMD
    constexpr int nlanes = VecT::nlanes;
    GAPI_DbgAssert(length >= nlanes);

    VecT r0, r1;
    for (; length >= nlanes;) {
        for (; x <= length - nlanes; x += nlanes) {
            v_load_deinterleave(&in[2*x], r0, r1);
            vx_store(&out0[x], r0);
            vx_store(&out1[x], r1);
        }

        // Partial tail: redo the last full vector so that it ends at `length`.
        if (x < length) {
            x = length - nlanes;
            continue;
        }
        break;
    }
#endif

    // Scalar path: the whole row without MANUAL_SIMD, or whatever the vector path left over.
    for (; x < length; ++x) {
        out0[x] = in[2*x + 0];
        out1[x] = in[2*x + 1];
    }
}
// 8-bit specific kernel: de-interleaves a 3-channel uint8_t row into three planes,
//   out<c>[i] = in[3*i + c]   for c in {0, 1, 2} and every i in [0, length).
//
// in         - interleaved source row, 3 * `length` elements.
// out0..out2 - destination planes, `length` elements each.
// length     - number of elements written to each plane.
//
// With MANUAL_SIMD the row is processed `nlanes` elements at a time; when a
// partial tail remains and the row holds at least one full vector, the index is
// moved back to `length - nlanes` and the vector loop runs once more, so the
// last vector overlaps elements that were already written.
CV_ALWAYS_INLINE void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
                                         uint8_t out1[], uint8_t out2[], int length) {
    int l = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_uint8::nlanes;

cycle:
    for (; l <= length - nlanes; l += nlanes) {
        v_uint8 r0, r1, r2;
        v_load_deinterleave(&in[3*l], r0, r1, r2);
        vx_store(&out0[l], r0);
        vx_store(&out1[l], r1);
        vx_store(&out2[l], r2);
    }

    if (l < length && length >= nlanes) {
        l = length - nlanes;
        goto cycle;
    }
#endif

    // Scalar path: the whole row without MANUAL_SIMD, or a row shorter than one vector.
    for (; l < length; ++l) {
        out0[l] = in[3*l + 0];
        out1[l] = in[3*l + 1];
        out2[l] = in[3*l + 2];
    }
}

// Type-generic kernel: de-interleaves a 3-channel row of T into three planes,
//   out<c>[x] = in[3*x + c]   for c in {0, 1, 2} and every x in [0, length).
//
// VecT       - vector type used on the MANUAL_SIMD path; must expose a static
//              `nlanes` and be usable with v_load_deinterleave / vx_store.
// in         - interleaved source row, 3 * `length` elements.
// out0..out2 - destination planes, `length` elements each.
// length     - number of elements written to each plane.
//
// On the MANUAL_SIMD path GAPI_DbgAssert checks `length >= nlanes`
// (presumably active in debug builds only - confirm against its definition).
template <typename VecT, typename T>
CV_ALWAYS_INLINE void splitRowC3_Impl(const T in[], T out0[],
                                      T out1[], T out2[], const int length) {
    int x = 0;

#if MANUAL_SIMD
    constexpr int nlanes = VecT::nlanes;
    GAPI_DbgAssert(length >= nlanes);

    VecT r0, r1, r2;
    for (; length >= nlanes;) {
        for (; x <= length - nlanes; x += nlanes) {
            v_load_deinterleave(&in[3*x], r0, r1, r2);
            vx_store(&out0[x], r0);
            vx_store(&out1[x], r1);
            vx_store(&out2[x], r2);
        }

        // Partial tail: redo the last full vector so that it ends at `length`.
        if (x < length) {
            x = length - nlanes;
            continue;
        }
        break;
    }
#endif

    // Scalar path: the whole row without MANUAL_SIMD, or whatever the vector path left over.
    for (; x < length; ++x) {
        out0[x] = in[3*x + 0];
        out1[x] = in[3*x + 1];
        out2[x] = in[3*x + 2];
    }
}
// 8-bit specific kernel: de-interleaves a 4-channel uint8_t row into four planes,
//   out<c>[i] = in[4*i + c]   for c in {0, 1, 2, 3} and every i in [0, length).
//
// in         - interleaved source row, 4 * `length` elements.
// out0..out3 - destination planes, `length` elements each.
// length     - number of elements written to each plane.
//
// With MANUAL_SIMD the row is processed `nlanes` elements at a time; when a
// partial tail remains and the row holds at least one full vector, the index is
// moved back to `length - nlanes` and the vector loop runs once more, so the
// last vector overlaps elements that were already written.
CV_ALWAYS_INLINE void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[],
                                         uint8_t out2[], uint8_t out3[], int length) {
    int l = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_uint8::nlanes;

cycle:
    for (; l <= length - nlanes; l += nlanes) {
        v_uint8 r0, r1, r2, r3;
        v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
        vx_store(&out0[l], r0);
        vx_store(&out1[l], r1);
        vx_store(&out2[l], r2);
        vx_store(&out3[l], r3);
    }

    if (l < length && length >= nlanes) {
        l = length - nlanes;
        goto cycle;
    }
#endif

    // Scalar path: the whole row without MANUAL_SIMD, or a row shorter than one vector.
    for (; l < length; ++l) {
        out0[l] = in[4*l + 0];
        out1[l] = in[4*l + 1];
        out2[l] = in[4*l + 2];
        out3[l] = in[4*l + 3];
    }
}

// Type-generic kernel: de-interleaves a 4-channel row of T into four planes,
//   out<c>[x] = in[4*x + c]   for c in {0, 1, 2, 3} and every x in [0, length).
//
// VecT       - vector type used on the MANUAL_SIMD path; must expose a static
//              `nlanes` and be usable with v_load_deinterleave / vx_store.
// in         - interleaved source row, 4 * `length` elements.
// out0..out3 - destination planes, `length` elements each.
// length     - number of elements written to each plane.
//
// On the MANUAL_SIMD path GAPI_DbgAssert checks `length >= nlanes`
// (presumably active in debug builds only - confirm against its definition).
template <typename VecT, typename T>
CV_ALWAYS_INLINE void splitRowC4_Impl(const T in[], T out0[], T out1[],
                                      T out2[], T out3[], const int length) {
    int x = 0;

#if MANUAL_SIMD
    constexpr int nlanes = VecT::nlanes;
    GAPI_DbgAssert(length >= nlanes);

    VecT r0, r1, r2, r3;
    for (; length >= nlanes;) {
        for (; x <= length - nlanes; x += nlanes) {
            v_load_deinterleave(&in[4*x], r0, r1, r2, r3);
            vx_store(&out0[x], r0);
            vx_store(&out1[x], r1);
            vx_store(&out2[x], r2);
            vx_store(&out3[x], r3);
        }

        // Partial tail: redo the last full vector so that it ends at `length`.
        if (x < length) {
            x = length - nlanes;
            continue;
        }
        break;
    }
#endif

    // Scalar path: the whole row without MANUAL_SIMD, or whatever the vector path left over.
    for (; x < length; ++x) {
        out0[x] = in[4*x + 0];
        out1[x] = in[4*x + 1];
        out2[x] = in[4*x + 2];
        out3[x] = in[4*x + 3];
    }
}
// De-interleaves a 2-channel float row into two planes:
//   out0[i] = in[2*i + 0], out1[i] = in[2*i + 1]   for every i in [0, length).
//
// in         - interleaved source row, 2 * `length` elements.
// out0, out1 - destination planes, `length` elements each.
// length     - number of elements written to each plane.
CV_ALWAYS_INLINE void splitRow_32FC2_Impl(const float in[], float out0[],
                                          float out1[], int length) {
    int idx = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_float32::nlanes;

    // The vector path is entered only when the row holds at least one full vector.
    if (length >= nlanes) {
        for (;;) {
            while (idx <= length - nlanes) {
                v_float32 a, b;
                v_load_deinterleave(&in[2*idx], a, b);
                vx_store(&out0[idx], a);
                vx_store(&out1[idx], b);
                idx += nlanes;
            }

            if (idx >= length) {
                break;
            }

            // A partial tail is left: step back so that the last vector ends
            // exactly at `length`, re-processing a few elements already written.
            idx = length - nlanes;
        }
    }
#endif

    // Scalar path: the whole row without MANUAL_SIMD, or a row shorter than one vector.
    while (idx < length) {
        out0[idx] = in[2*idx + 0];
        out1[idx] = in[2*idx + 1];
        ++idx;
    }
}
// De-interleaves a 3-channel float row into three planes:
//   out<c>[i] = in[3*i + c]   for c in {0, 1, 2} and every i in [0, length).
//
// in         - interleaved source row, 3 * `length` elements.
// out0..out2 - destination planes, `length` elements each.
// length     - number of elements written to each plane.
CV_ALWAYS_INLINE void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
                                          float out2[], int length) {
    int idx = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_float32::nlanes;

    // The vector path is entered only when the row holds at least one full vector.
    if (length >= nlanes) {
        for (;;) {
            while (idx <= length - nlanes) {
                v_float32 a, b, c;
                v_load_deinterleave(&in[3*idx], a, b, c);
                vx_store(&out0[idx], a);
                vx_store(&out1[idx], b);
                vx_store(&out2[idx], c);
                idx += nlanes;
            }

            if (idx >= length) {
                break;
            }

            // A partial tail is left: step back so that the last vector ends
            // exactly at `length`, re-processing a few elements already written.
            idx = length - nlanes;
        }
    }
#endif

    // Scalar path: the whole row without MANUAL_SIMD, or a row shorter than one vector.
    while (idx < length) {
        out0[idx] = in[3*idx + 0];
        out1[idx] = in[3*idx + 1];
        out2[idx] = in[3*idx + 2];
        ++idx;
    }
}
// De-interleaves a 4-channel float row into four planes:
//   out<c>[i] = in[4*i + c]   for c in {0, 1, 2, 3} and every i in [0, length).
//
// in         - interleaved source row, 4 * `length` elements.
// out0..out3 - destination planes, `length` elements each.
// length     - number of elements written to each plane.
CV_ALWAYS_INLINE void splitRow_32FC4_Impl(const float in[], float out0[], float out1[],
                                          float out2[], float out3[], int length) {
    int idx = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_float32::nlanes;

    // The vector path is entered only when the row holds at least one full vector.
    if (length >= nlanes) {
        for (;;) {
            while (idx <= length - nlanes) {
                v_float32 a, b, c, d;
                v_load_deinterleave(&in[4*idx], a, b, c, d);
                vx_store(&out0[idx], a);
                vx_store(&out1[idx], b);
                vx_store(&out2[idx], c);
                vx_store(&out3[idx], d);
                idx += nlanes;
            }

            if (idx >= length) {
                break;
            }

            // A partial tail is left: step back so that the last vector ends
            // exactly at `length`, re-processing a few elements already written.
            idx = length - nlanes;
        }
    }
#endif

    // Scalar path: the whole row without MANUAL_SIMD, or a row shorter than one vector.
    while (idx < length) {
        out0[idx] = in[4*idx + 0];
        out1[idx] = in[4*idx + 1];
        out2[idx] = in[4*idx + 2];
        out3[idx] = in[4*idx + 3];
        ++idx;
    }
}
//------------------------------------------------------------------------------
CV_ALWAYS_INLINE void uvToRGBuv(const v_uint8& u, const v_uint8& v,
@@ -880,6 +724,38 @@ CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan,
out[x] = in[x*chs + chan];
}
}
template<typename isa_tag_t, typename T, int chs>
CV_ALWAYS_INLINE void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length) {
static_assert(chs > 1 && chs < 5, "This number of channels isn't supported.");
if (chs == 2) {
splitRowC2_Impl<vector_type_of_t<isa_tag_t, T>, T>(in, outs[0], outs[1], length);
return;
} else if (chs == 3) {
splitRowC3_Impl<vector_type_of_t<isa_tag_t, T>, T>(in, outs[0], outs[1], outs[2], length);
return;
} else {
splitRowC4_Impl<vector_type_of_t<isa_tag_t, T>, T>(in, outs[0], outs[1], outs[2], outs[3], length);
return;
}
}
template<typename isa_tag_t, typename T, int chs>
CV_ALWAYS_INLINE void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length) {
static_assert(chs > 1 && chs < 5, "This number of channels isn't supported.");
if (chs == 2) {
mergeRowC2_Impl<vector_type_of_t<isa_tag_t, T>, T>(ins[0], ins[1], out, length);
return;
} else if (chs == 3) {
mergeRowC3_Impl<vector_type_of_t<isa_tag_t, T>, T>(ins[0], ins[1], ins[2], out, length);
return;
} else {
mergeRowC4_Impl<vector_type_of_t<isa_tag_t, T>, T>(ins[0], ins[1], ins[2], ins[3], out, length);
return;
}
}
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -98,10 +98,30 @@ cv::String typeToString(int type)
case CV_8UC2 : return "CV_8UC2";
case CV_8UC3 : return "CV_8UC3";
case CV_8UC4 : return "CV_8UC4";
case CV_16FC1 : return "CV_16FC1";
case CV_16FC2 : return "CV_16FC2";
case CV_16FC3 : return "CV_16FC3";
case CV_16FC4 : return "CV_16FC4";
case CV_32FC1 : return "CV_32FC1";
case CV_32FC2 : return "CV_32FC2";
case CV_32FC3 : return "CV_32FC3";
case CV_32FC4 : return "CV_32FC4";
case CV_8SC1 : return "CV_8SC1";
case CV_8SC2 : return "CV_8SC2";
case CV_8SC3 : return "CV_8SC3";
case CV_8SC4 : return "CV_8SC4";
case CV_16SC1 : return "CV_16SC1";
case CV_16SC2 : return "CV_16SC2";
case CV_16SC3 : return "CV_16SC3";
case CV_16SC4 : return "CV_16SC4";
case CV_16UC1 : return "CV_16UC1";
case CV_16UC2 : return "CV_16UC2";
case CV_16UC3 : return "CV_16UC3";
case CV_16UC4 : return "CV_16UC4";
case CV_32SC1 : return "CV_32SC1";
case CV_32SC2 : return "CV_32SC2";
case CV_32SC3 : return "CV_32SC3";
case CV_32SC4 : return "CV_32SC4";
}
CV_Assert(!"ERROR: unsupported type!");
return nullptr;