Pre-processing: Split and Merge kernels refactoring. (#6205)
* Split and Merge kernel refactoring
* SFINAE: replace conditional compilation macro with std::enable_if
This commit is contained in:
@@ -29,67 +29,6 @@ namespace InferenceEngine {
|
||||
namespace gapi {
|
||||
namespace kernels {
|
||||
namespace neon {
|
||||
|
||||
// Thin wrapper: hands in0, in1, out and length to mergeRow_8UC2_Impl unchanged.
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
                   uint8_t out[], int length) {
    mergeRow_8UC2_Impl(in0, in1, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in2, out and length to mergeRow_8UC3_Impl unchanged.
void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[],
                   const uint8_t in2[], uint8_t out[], int length) {
    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in3, out and length to mergeRow_8UC4_Impl unchanged.
void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
                   const uint8_t in3[], uint8_t out[], int length) {
    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0, in1, out and length to mergeRow_32FC2_Impl unchanged.
void mergeRow_32FC2(const float in0[], const float in1[],
                    float out[], int length) {
    mergeRow_32FC2_Impl(in0, in1, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in2, out and length to mergeRow_32FC3_Impl unchanged.
void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
                    float out[], int length) {
    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in3, out and length to mergeRow_32FC4_Impl unchanged.
void mergeRow_32FC4(const float in0[], const float in1[],
                    const float in2[], const float in3[],
                    float out[], int length) {
    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0, out1 and length to splitRow_8UC2_Impl unchanged.
void splitRow_8UC2(const uint8_t in[], uint8_t out0[],
                   uint8_t out1[], int length) {
    splitRow_8UC2_Impl(in, out0, out1, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out2 and length to splitRow_8UC3_Impl unchanged.
void splitRow_8UC3(const uint8_t in[], uint8_t out0[],
                   uint8_t out1[], uint8_t out2[], int length) {
    splitRow_8UC3_Impl(in, out0, out1, out2, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out3 and length to splitRow_8UC4_Impl unchanged.
void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[],
                   uint8_t out2[], uint8_t out3[], int length) {
    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0, out1 and length to splitRow_32FC2_Impl unchanged.
void splitRow_32FC2(const float in[], float out0[], float out1[], int length) {
    splitRow_32FC2_Impl(in, out0, out1, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out2 and length to splitRow_32FC3_Impl unchanged.
void splitRow_32FC3(const float in[], float out0[], float out1[],
                    float out2[], int length) {
    splitRow_32FC3_Impl(in, out0, out1, out2, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out3 and length to splitRow_32FC4_Impl unchanged.
void splitRow_32FC4(const float in[], float out0[], float out1[],
                    float out2[], float out3[], int length) {
    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
|
||||
|
||||
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
|
||||
const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
|
||||
int xmaxdf, const short xindex[], const Q0_16 xalpha[],
|
||||
@@ -693,6 +632,20 @@ template void nv12ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t*
|
||||
|
||||
// Explicit instantiation definition of i420ToRgbRowImpl for neon_tag.
template void i420ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* u_row,
                               const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
|
||||
|
||||
// Explicit instantiation definitions of splitRowImpl for neon_tag:
// element types uint8_t and float, channel counts 2, 3 and 4.
template void splitRowImpl<neon_tag, uint8_t, 2>(neon_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
template void splitRowImpl<neon_tag, float, 2>(neon_tag, const float* in, std::array<float*, 2>& outs, const int length);
template void splitRowImpl<neon_tag, uint8_t, 3>(neon_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
template void splitRowImpl<neon_tag, float, 3>(neon_tag, const float* in, std::array<float*, 3>& outs, const int length);
template void splitRowImpl<neon_tag, uint8_t, 4>(neon_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
template void splitRowImpl<neon_tag, float, 4>(neon_tag, const float* in, std::array<float*, 4>& outs, const int length);
|
||||
|
||||
// Explicit instantiation definitions of mergeRowImpl for neon_tag:
// element types uint8_t and float, channel counts 2, 3 and 4.
template void mergeRowImpl<neon_tag, uint8_t, 2>(neon_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
template void mergeRowImpl<neon_tag, float, 2>(neon_tag, const std::array<const float*, 2>& ins, float* out, const int length);
template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
||||
@@ -92,87 +92,6 @@ void calcRowLinear_32F(float *dst[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
int lpi);
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
// Row merge/split entry points for uint8_t (8U) and float (32F) rows with
// 2, 3 or 4 channel pointers (C2/C3/C4). Declarations only; the definitions
// are not part of this view.
// NOTE(review): the names suggest "merge" combines the in* rows into `out`
// and "split" distributes `in` over the out* rows - confirm in the definitions.
void mergeRow_8UC2(const uint8_t in0[],
                   const uint8_t in1[],
                   uint8_t out[],
                   int length);

void mergeRow_8UC3(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   uint8_t out[],
                   int length);

void mergeRow_8UC4(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   const uint8_t in3[],
                   uint8_t out[],
                   int length);

void mergeRow_32FC2(const float in0[],
                    const float in1[],
                    float out[],
                    int length);

void mergeRow_32FC3(const float in0[],
                    const float in1[],
                    const float in2[],
                    float out[],
                    int length);

void mergeRow_32FC4(const float in0[],
                    const float in1[],
                    const float in2[],
                    const float in3[],
                    float out[],
                    int length);

void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   int length);

void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   int length);

void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   uint8_t out3[],
                   int length);

void splitRow_32FC2(const float in[],
                    float out0[],
                    float out1[],
                    int length);

void splitRow_32FC3(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    int length);

void splitRow_32FC4(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    float out3[],
                    int length);
|
||||
|
||||
void calculate_i420_to_rgb(const uchar **srcY,
|
||||
const uchar *srcU,
|
||||
const uchar *srcV,
|
||||
uchar **dstRGBx,
|
||||
int width);
|
||||
|
||||
} // namespace neon
|
||||
|
||||
template<typename isa_tag_t, typename T>
|
||||
@@ -192,6 +111,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
|
||||
|
||||
// Explicit instantiation declaration: including translation units do not
// implicitly instantiate i420ToRgbRowImpl for neon_tag.
extern template void i420ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* u_row,
                                      const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
|
||||
|
||||
// Declaration of the ISA-tagged row-split kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
//   T         - element type of the input row and of each output pointer
//   chs       - number of output pointers held in `outs`
// NOTE(review): presumably de-interleaves `length` pixels of `in` into the
// `outs` rows - confirm against the definition, which is not in this view.
template<typename isa_tag_t, typename T, int chs>
void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
|
||||
|
||||
// Explicit instantiation declarations of splitRowImpl for neon_tag
// (uint8_t and float, 2/3/4 channels): no implicit instantiation in includers.
extern template void splitRowImpl<neon_tag, uint8_t, 2>(neon_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
extern template void splitRowImpl<neon_tag, float, 2>(neon_tag, const float* in, std::array<float*, 2>& outs, const int length);
extern template void splitRowImpl<neon_tag, uint8_t, 3>(neon_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
extern template void splitRowImpl<neon_tag, float, 3>(neon_tag, const float* in, std::array<float*, 3>& outs, const int length);
extern template void splitRowImpl<neon_tag, uint8_t, 4>(neon_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
extern template void splitRowImpl<neon_tag, float, 4>(neon_tag, const float* in, std::array<float*, 4>& outs, const int length);
|
||||
|
||||
// Declaration of the ISA-tagged row-merge kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
//   T         - element type of each input pointer and of the output row
//   chs       - number of input pointers held in `ins`
// NOTE(review): presumably interleaves `length` pixels from the `ins` rows
// into `out` - confirm against the definition, which is not in this view.
template<typename isa_tag_t, typename T, int chs>
void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
|
||||
|
||||
// Explicit instantiation declarations of mergeRowImpl for neon_tag
// (uint8_t and float, 2/3/4 channels): no implicit instantiation in includers.
extern template void mergeRowImpl<neon_tag, uint8_t, 2>(neon_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<neon_tag, float, 2>(neon_tag, const std::array<const float*, 2>& ins, float* out, const int length);
extern template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
||||
@@ -47,66 +47,6 @@ namespace kernels {
|
||||
|
||||
namespace avx {
|
||||
|
||||
// Thin wrapper: hands in0, in1, out and length to mergeRow_8UC2_Impl unchanged.
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
                   uint8_t out[], int length) {
    mergeRow_8UC2_Impl(in0, in1, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in2, out and length to mergeRow_8UC3_Impl unchanged.
void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[],
                   const uint8_t in2[], uint8_t out[], int length) {
    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in3, out and length to mergeRow_8UC4_Impl unchanged.
void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
                   const uint8_t in3[], uint8_t out[], int length) {
    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0, in1, out and length to mergeRow_32FC2_Impl unchanged.
void mergeRow_32FC2(const float in0[], const float in1[],
                    float out[], int length) {
    mergeRow_32FC2_Impl(in0, in1, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in2, out and length to mergeRow_32FC3_Impl unchanged.
void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
                    float out[], int length) {
    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in3, out and length to mergeRow_32FC4_Impl unchanged.
void mergeRow_32FC4(const float in0[], const float in1[],
                    const float in2[], const float in3[],
                    float out[], int length) {
    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0, out1 and length to splitRow_8UC2_Impl unchanged.
void splitRow_8UC2(const uint8_t in[], uint8_t out0[],
                   uint8_t out1[], int length) {
    splitRow_8UC2_Impl(in, out0, out1, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out2 and length to splitRow_8UC3_Impl unchanged.
void splitRow_8UC3(const uint8_t in[], uint8_t out0[],
                   uint8_t out1[], uint8_t out2[], int length) {
    splitRow_8UC3_Impl(in, out0, out1, out2, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out3 and length to splitRow_8UC4_Impl unchanged.
void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[],
                   uint8_t out2[], uint8_t out3[], int length) {
    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0, out1 and length to splitRow_32FC2_Impl unchanged.
void splitRow_32FC2(const float in[], float out0[], float out1[], int length) {
    splitRow_32FC2_Impl(in, out0, out1, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out2 and length to splitRow_32FC3_Impl unchanged.
void splitRow_32FC3(const float in[], float out0[], float out1[],
                    float out2[], int length) {
    splitRow_32FC3_Impl(in, out0, out1, out2, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out3 and length to splitRow_32FC4_Impl unchanged.
void splitRow_32FC4(const float in[], float out0[], float out1[],
                    float out2[], float out3[], int length) {
    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
|
||||
|
||||
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
|
||||
const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
|
||||
int xmaxdf, const short xindex[], const Q0_16 xalpha[],
|
||||
@@ -562,6 +502,20 @@ template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t*
|
||||
|
||||
// Explicit instantiation definition of i420ToRgbRowImpl for avx2_tag.
template void i420ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* u_row,
                               const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
|
||||
|
||||
// Explicit instantiation definitions of splitRowImpl for avx2_tag:
// element types uint8_t and float, channel counts 2, 3 and 4.
template void splitRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
template void splitRowImpl<avx2_tag, float, 2>(avx2_tag, const float* in, std::array<float*, 2>& outs, const int length);
template void splitRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
template void splitRowImpl<avx2_tag, float, 3>(avx2_tag, const float* in, std::array<float*, 3>& outs, const int length);
template void splitRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
template void splitRowImpl<avx2_tag, float, 4>(avx2_tag, const float* in, std::array<float*, 4>& outs, const int length);
|
||||
|
||||
// Explicit instantiation definitions of mergeRowImpl for avx2_tag:
// element types uint8_t and float, channel counts 2, 3 and 4.
template void mergeRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx2_tag, float, 2>(avx2_tag, const std::array<const float*, 2>& ins, float* out, const int length);
template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
||||
@@ -106,84 +106,8 @@ void calcRowLinear_32F(float *dst[],
|
||||
const Size & inSz,
|
||||
const Size & outSz,
|
||||
int lpi);
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
|
||||
// Row merge/split entry points for uint8_t (8U) and float (32F) rows with
// 2, 3 or 4 channel pointers (C2/C3/C4). Declarations only; the definitions
// are not part of this view.
// NOTE(review): the names suggest "merge" combines the in* rows into `out`
// and "split" distributes `in` over the out* rows - confirm in the definitions.
void mergeRow_8UC2(const uint8_t in0[],
                   const uint8_t in1[],
                   uint8_t out[],
                   int length);

void mergeRow_8UC3(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   uint8_t out[],
                   int length);

void mergeRow_8UC4(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   const uint8_t in3[],
                   uint8_t out[],
                   int length);

void mergeRow_32FC2(const float in0[],
                    const float in1[],
                    float out[],
                    int length);

void mergeRow_32FC3(const float in0[],
                    const float in1[],
                    const float in2[],
                    float out[],
                    int length);

void mergeRow_32FC4(const float in0[],
                    const float in1[],
                    const float in2[],
                    const float in3[],
                    float out[],
                    int length);

void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   int length);

void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   int length);

void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   uint8_t out3[],
                   int length);

void splitRow_32FC2(const float in[],
                    float out0[],
                    float out1[],
                    int length);

void splitRow_32FC3(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    int length);

void splitRow_32FC4(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    float out3[],
                    int length);
|
||||
} // namespace avx
|
||||
|
||||
|
||||
// Declaration of the ISA-tagged channel-to-plane row kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
//   T         - element type shared by `in` and `out`
// NOTE(review): presumably copies channel `chan` of a `chs`-channel
// interleaved row into the planar `out` row - confirm in the definition.
template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length);
|
||||
|
||||
@@ -192,7 +116,7 @@ extern template void chanToPlaneRowImpl(avx2_tag, const float* in, const int c
|
||||
|
||||
// Declaration of the ISA-tagged NV12 -> RGB row kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
// NOTE(review): y_rows/out_rows are pointer-to-pointer, so several rows are
// presumably handled per call - confirm the row count in the definition.
template<typename isa_tag_t>
void nv12ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* uv_row,
                      uint8_t** out_rows, const int buf_width);
|
||||
|
||||
extern template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows,
|
||||
const uint8_t* uv_row, uint8_t** out_rows,
|
||||
@@ -200,10 +124,30 @@ extern template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows,
|
||||
|
||||
// Declaration of the ISA-tagged I420 -> RGB row kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
// NOTE(review): y_rows/out_rows are pointer-to-pointer, so several rows are
// presumably handled per call - confirm the row count in the definition.
template<typename isa_tag_t>
void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
                      const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
|
||||
|
||||
// Explicit instantiation declaration: including translation units do not
// implicitly instantiate i420ToRgbRowImpl for avx2_tag.
extern template void i420ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* u_row,
                                      const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
|
||||
|
||||
// Declaration of the ISA-tagged row-split kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
//   T         - element type of the input row and of each output pointer
//   chs       - number of output pointers held in `outs`
// NOTE(review): presumably de-interleaves `length` pixels of `in` into the
// `outs` rows - confirm against the definition, which is not in this view.
template<typename isa_tag_t, typename T, int chs>
void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
|
||||
|
||||
// Explicit instantiation declarations of splitRowImpl for avx2_tag
// (uint8_t and float, 2/3/4 channels): no implicit instantiation in includers.
extern template void splitRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
extern template void splitRowImpl<avx2_tag, float, 2>(avx2_tag, const float* in, std::array<float*, 2>& outs, const int length);
extern template void splitRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
extern template void splitRowImpl<avx2_tag, float, 3>(avx2_tag, const float* in, std::array<float*, 3>& outs, const int length);
extern template void splitRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
extern template void splitRowImpl<avx2_tag, float, 4>(avx2_tag, const float* in, std::array<float*, 4>& outs, const int length);
|
||||
|
||||
// Declaration of the ISA-tagged row-merge kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
//   T         - element type of each input pointer and of the output row
//   chs       - number of input pointers held in `ins`
// NOTE(review): presumably interleaves `length` pixels from the `ins` rows
// into `out` - confirm against the definition, which is not in this view.
template<typename isa_tag_t, typename T, int chs>
void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
|
||||
|
||||
// Explicit instantiation declarations of mergeRowImpl for avx2_tag
// (uint8_t and float, 2/3/4 channels): no implicit instantiation in includers.
extern template void mergeRowImpl<avx2_tag, uint8_t, 2>(avx2_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx2_tag, float, 2>(avx2_tag, const std::array<const float*, 2>& ins, float* out, const int length);
extern template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
||||
@@ -41,66 +41,6 @@ namespace kernels {
|
||||
|
||||
namespace avx512 {
|
||||
|
||||
// Thin wrapper: hands in0, in1, out and length to mergeRow_8UC2_Impl unchanged.
void mergeRow_8UC2(const uint8_t in0[], const uint8_t in1[],
                   uint8_t out[], int length) {
    mergeRow_8UC2_Impl(in0, in1, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in2, out and length to mergeRow_8UC3_Impl unchanged.
void mergeRow_8UC3(const uint8_t in0[], const uint8_t in1[],
                   const uint8_t in2[], uint8_t out[], int length) {
    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in3, out and length to mergeRow_8UC4_Impl unchanged.
void mergeRow_8UC4(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
                   const uint8_t in3[], uint8_t out[], int length) {
    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0, in1, out and length to mergeRow_32FC2_Impl unchanged.
void mergeRow_32FC2(const float in0[], const float in1[],
                    float out[], int length) {
    mergeRow_32FC2_Impl(in0, in1, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in2, out and length to mergeRow_32FC3_Impl unchanged.
void mergeRow_32FC3(const float in0[], const float in1[], const float in2[],
                    float out[], int length) {
    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in3, out and length to mergeRow_32FC4_Impl unchanged.
void mergeRow_32FC4(const float in0[], const float in1[],
                    const float in2[], const float in3[],
                    float out[], int length) {
    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0, out1 and length to splitRow_8UC2_Impl unchanged.
void splitRow_8UC2(const uint8_t in[], uint8_t out0[],
                   uint8_t out1[], int length) {
    splitRow_8UC2_Impl(in, out0, out1, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out2 and length to splitRow_8UC3_Impl unchanged.
void splitRow_8UC3(const uint8_t in[], uint8_t out0[],
                   uint8_t out1[], uint8_t out2[], int length) {
    splitRow_8UC3_Impl(in, out0, out1, out2, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out3 and length to splitRow_8UC4_Impl unchanged.
void splitRow_8UC4(const uint8_t in[], uint8_t out0[], uint8_t out1[],
                   uint8_t out2[], uint8_t out3[], int length) {
    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0, out1 and length to splitRow_32FC2_Impl unchanged.
void splitRow_32FC2(const float in[], float out0[], float out1[], int length) {
    splitRow_32FC2_Impl(in, out0, out1, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out2 and length to splitRow_32FC3_Impl unchanged.
void splitRow_32FC3(const float in[], float out0[], float out1[],
                    float out2[], int length) {
    splitRow_32FC3_Impl(in, out0, out1, out2, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out3 and length to splitRow_32FC4_Impl unchanged.
void splitRow_32FC4(const float in[], float out0[], float out1[],
                    float out2[], float out3[], int length) {
    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
|
||||
|
||||
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
|
||||
const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
|
||||
int xmaxdf, const short xindex[], const Q0_16 xalpha[],
|
||||
@@ -632,7 +572,6 @@ void calcRowLinear_32F(float *dst[],
|
||||
int lpi) {
|
||||
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
|
||||
}
|
||||
|
||||
} // namespace avx512
|
||||
|
||||
// Explicit instantiation definition of chanToPlaneRowImpl for avx512_tag with uint8_t rows.
template void chanToPlaneRowImpl(avx512_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
|
||||
@@ -642,6 +581,20 @@ template void nv12ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t
|
||||
|
||||
// Explicit instantiation definition of i420ToRgbRowImpl for avx512_tag.
template void i420ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* u_row,
                               const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
|
||||
|
||||
// Explicit instantiation definitions of splitRowImpl for avx512_tag:
// element types uint8_t and float, channel counts 2, 3 and 4.
template void splitRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
template void splitRowImpl<avx512_tag, float, 2>(avx512_tag, const float* in, std::array<float*, 2>& outs, const int length);
template void splitRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
template void splitRowImpl<avx512_tag, float, 3>(avx512_tag, const float* in, std::array<float*, 3>& outs, const int length);
template void splitRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
template void splitRowImpl<avx512_tag, float, 4>(avx512_tag, const float* in, std::array<float*, 4>& outs, const int length);
|
||||
|
||||
// Explicit instantiation definitions of mergeRowImpl for avx512_tag:
// element types uint8_t and float, channel counts 2, 3 and 4.
template void mergeRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx512_tag, float, 2>(avx512_tag, const std::array<const float*, 2>& ins, float* out, const int length);
template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
||||
@@ -106,83 +106,8 @@ void calcRowLinear_32F(float *dst[],
|
||||
const Size & inSz,
|
||||
const Size & outSz,
|
||||
int lpi);
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
// Row merge/split entry points for uint8_t (8U) and float (32F) rows with
// 2, 3 or 4 channel pointers (C2/C3/C4). Declarations only; the definitions
// are not part of this view.
// NOTE(review): the names suggest "merge" combines the in* rows into `out`
// and "split" distributes `in` over the out* rows - confirm in the definitions.
void mergeRow_8UC2(const uint8_t in0[],
                   const uint8_t in1[],
                   uint8_t out[],
                   int length);

void mergeRow_8UC3(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   uint8_t out[],
                   int length);

void mergeRow_8UC4(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   const uint8_t in3[],
                   uint8_t out[],
                   int length);

void mergeRow_32FC2(const float in0[],
                    const float in1[],
                    float out[],
                    int length);

void mergeRow_32FC3(const float in0[],
                    const float in1[],
                    const float in2[],
                    float out[],
                    int length);

void mergeRow_32FC4(const float in0[],
                    const float in1[],
                    const float in2[],
                    const float in3[],
                    float out[],
                    int length);

void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   int length);

void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   int length);

void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   uint8_t out3[],
                   int length);

void splitRow_32FC2(const float in[],
                    float out0[],
                    float out1[],
                    int length);

void splitRow_32FC3(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    int length);

void splitRow_32FC4(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    float out3[],
                    int length);
|
||||
} // namespace avx512
|
||||
|
||||
|
||||
// Declaration of the ISA-tagged channel-to-plane row kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
//   T         - element type shared by `in` and `out`
// NOTE(review): presumably copies channel `chan` of a `chs`-channel
// interleaved row into the planar `out` row - confirm in the definition.
template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length);
|
||||
|
||||
@@ -200,6 +125,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
|
||||
|
||||
// Explicit instantiation declaration: including translation units do not
// implicitly instantiate i420ToRgbRowImpl for avx512_tag.
extern template void i420ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* u_row,
                                      const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
|
||||
|
||||
// Declaration of the ISA-tagged row-split kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
//   T         - element type of the input row and of each output pointer
//   chs       - number of output pointers held in `outs`
// NOTE(review): presumably de-interleaves `length` pixels of `in` into the
// `outs` rows - confirm against the definition, which is not in this view.
template<typename isa_tag_t, typename T, int chs>
void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
|
||||
|
||||
// Explicit instantiation declarations of splitRowImpl for avx512_tag
// (uint8_t and float, 2/3/4 channels): no implicit instantiation in includers.
extern template void splitRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
extern template void splitRowImpl<avx512_tag, float, 2>(avx512_tag, const float* in, std::array<float*, 2>& outs, const int length);
extern template void splitRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
extern template void splitRowImpl<avx512_tag, float, 3>(avx512_tag, const float* in, std::array<float*, 3>& outs, const int length);
extern template void splitRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
extern template void splitRowImpl<avx512_tag, float, 4>(avx512_tag, const float* in, std::array<float*, 4>& outs, const int length);
|
||||
|
||||
// Declaration of the ISA-tagged row-merge kernel template.
//   isa_tag_t - tag type passed by value as the first (unnamed) argument
//   T         - element type of each input pointer and of the output row
//   chs       - number of input pointers held in `ins`
// NOTE(review): presumably interleaves `length` pixels from the `ins` rows
// into `out` - confirm against the definition, which is not in this view.
template<typename isa_tag_t, typename T, int chs>
void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
|
||||
|
||||
// Explicit instantiation declarations of mergeRowImpl for avx512_tag
// (uint8_t and float, 2/3/4 channels): no implicit instantiation in includers.
extern template void mergeRowImpl<avx512_tag, uint8_t, 2>(avx512_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx512_tag, float, 2>(avx512_tag, const std::array<const float*, 2>& ins, float* out, const int length);
extern template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
||||
@@ -1267,103 +1267,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[],
|
||||
}
|
||||
|
||||
#endif // CVKL
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// Thin wrapper: hands in0, in1, out and length to mergeRow_8UC2_Impl unchanged.
void mergeRow_8UC2(const uint8_t in0[],
                   const uint8_t in1[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC2_Impl(in0, in1, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in2, out and length to mergeRow_8UC3_Impl unchanged.
void mergeRow_8UC3(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC3_Impl(in0, in1, in2, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in3, out and length to mergeRow_8UC4_Impl unchanged.
void mergeRow_8UC4(const uint8_t in0[],
                   const uint8_t in1[],
                   const uint8_t in2[],
                   const uint8_t in3[],
                   uint8_t out[],
                   int length) {
    mergeRow_8UC4_Impl(in0, in1, in2, in3, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0, in1, out and length to mergeRow_32FC2_Impl unchanged.
void mergeRow_32FC2(const float in0[],
                    const float in1[],
                    float out[],
                    int length) {
    mergeRow_32FC2_Impl(in0, in1, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in2, out and length to mergeRow_32FC3_Impl unchanged.
void mergeRow_32FC3(const float in0[],
                    const float in1[],
                    const float in2[],
                    float out[],
                    int length) {
    mergeRow_32FC3_Impl(in0, in1, in2, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in0..in3, out and length to mergeRow_32FC4_Impl unchanged.
void mergeRow_32FC4(const float in0[],
                    const float in1[],
                    const float in2[],
                    const float in3[],
                    float out[],
                    int length) {
    mergeRow_32FC4_Impl(in0, in1, in2, in3, out, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0, out1 and length to splitRow_8UC2_Impl unchanged.
void splitRow_8UC2(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   int length) {
    splitRow_8UC2_Impl(in, out0, out1, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out2 and length to splitRow_8UC3_Impl unchanged.
void splitRow_8UC3(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   int length) {
    splitRow_8UC3_Impl(in, out0, out1, out2, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out3 and length to splitRow_8UC4_Impl unchanged.
void splitRow_8UC4(const uint8_t in[],
                   uint8_t out0[],
                   uint8_t out1[],
                   uint8_t out2[],
                   uint8_t out3[],
                   int length) {
    splitRow_8UC4_Impl(in, out0, out1, out2, out3, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0, out1 and length to splitRow_32FC2_Impl unchanged.
void splitRow_32FC2(const float in[],
                    float out0[],
                    float out1[],
                    int length) {
    splitRow_32FC2_Impl(in, out0, out1, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out2 and length to splitRow_32FC3_Impl unchanged.
void splitRow_32FC3(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    int length) {
    splitRow_32FC3_Impl(in, out0, out1, out2, length);
}
|
||||
|
||||
// Thin wrapper: hands in, out0..out3 and length to splitRow_32FC4_Impl unchanged.
void splitRow_32FC4(const float in[],
                    float out0[],
                    float out1[],
                    float out2[],
                    float out3[],
                    int length) {
    splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
|
||||
|
||||
// Explicit instantiation definitions of chanToPlaneRowImpl for sse42_tag
// with uint8_t rows and with float rows.
template void chanToPlaneRowImpl(sse42_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(sse42_tag, const float* in, const int chan, const int chs, float* out, const int length);
|
||||
@@ -1372,6 +1275,20 @@ template void nv12ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t*
|
||||
|
||||
// Explicit instantiation of i420ToRgbRowImpl for sse42_tag.
template void i420ToRgbRowImpl(sse42_tag,
                               const uint8_t** y_rows,
                               const uint8_t* u_row,
                               const uint8_t* v_row,
                               uint8_t** out_rows,
                               const int buf_width);
|
||||
|
||||
// Explicit instantiations of splitRowImpl for sse42_tag: uint8_t and float
// elements, each with 2, 3 and 4 channels.
// The element type is spelled uint8_t (rather than uchar) so that the template
// argument reads the same as the parameter types on the same line and as the
// matching `extern template` declarations.
template void splitRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
template void splitRowImpl<sse42_tag, float, 2>(sse42_tag, const float* in, std::array<float*, 2>& outs, const int length);
template void splitRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
template void splitRowImpl<sse42_tag, float, 3>(sse42_tag, const float* in, std::array<float*, 3>& outs, const int length);
template void splitRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
template void splitRowImpl<sse42_tag, float, 4>(sse42_tag, const float* in, std::array<float*, 4>& outs, const int length);
|
||||
|
||||
// Explicit instantiations of mergeRowImpl for sse42_tag: uint8_t and float
// elements, each with 2, 3 and 4 channels.
// The element type is spelled uint8_t (rather than uchar) so that the template
// argument reads the same as the parameter types on the same line and as the
// matching `extern template` declarations.
template void mergeRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
template void mergeRowImpl<sse42_tag, float, 2>(sse42_tag, const std::array<const float*, 2>& ins, float* out, const int length);
template void mergeRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
||||
@@ -106,80 +106,6 @@ void calcRowLinear_32F(float *dst[],
|
||||
const Size & outSz,
|
||||
int lpi);
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
// Merge entry points for uint8_t rows: N input pointers, one output pointer, row length.
void mergeRow_8UC2(const uint8_t* in0, const uint8_t* in1,
                   uint8_t* out, int length);

void mergeRow_8UC3(const uint8_t* in0, const uint8_t* in1, const uint8_t* in2,
                   uint8_t* out, int length);

void mergeRow_8UC4(const uint8_t* in0, const uint8_t* in1,
                   const uint8_t* in2, const uint8_t* in3,
                   uint8_t* out, int length);

// Merge entry points for float rows.
void mergeRow_32FC2(const float* in0, const float* in1,
                    float* out, int length);

void mergeRow_32FC3(const float* in0, const float* in1, const float* in2,
                    float* out, int length);

void mergeRow_32FC4(const float* in0, const float* in1,
                    const float* in2, const float* in3,
                    float* out, int length);

// Split entry points for uint8_t rows: one input pointer, N output pointers, row length.
void splitRow_8UC2(const uint8_t* in,
                   uint8_t* out0, uint8_t* out1, int length);

void splitRow_8UC3(const uint8_t* in,
                   uint8_t* out0, uint8_t* out1, uint8_t* out2, int length);

void splitRow_8UC4(const uint8_t* in,
                   uint8_t* out0, uint8_t* out1,
                   uint8_t* out2, uint8_t* out3, int length);

// Split entry points for float rows.
void splitRow_32FC2(const float* in,
                    float* out0, float* out1, int length);

void splitRow_32FC3(const float* in,
                    float* out0, float* out1, float* out2, int length);

void splitRow_32FC4(const float* in,
                    float* out0, float* out1,
                    float* out2, float* out3, int length);
|
||||
|
||||
// Declaration of the ISA-tagged chanToPlaneRowImpl template; the unnamed first
// parameter is the ISA tag object used to select the implementation.
template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t,
                        const T* in,
                        const int chan,
                        const int chs,
                        T* out,
                        const int length);
|
||||
@@ -199,6 +125,26 @@ void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
|
||||
|
||||
// The sse42_tag instantiation of i420ToRgbRowImpl is declared extern here,
// so it is not implicitly instantiated by code including this declaration.
extern template void i420ToRgbRowImpl(sse42_tag,
                                      const uint8_t** y_rows,
                                      const uint8_t* u_row,
                                      const uint8_t* v_row,
                                      uint8_t** out_rows,
                                      const int buf_width);
|
||||
|
||||
template<typename isa_tag_t, typename T, int chs>
|
||||
void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length);
|
||||
|
||||
extern template void splitRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 2>& outs, const int length);
|
||||
extern template void splitRowImpl<sse42_tag, float, 2>(sse42_tag, const float* in, std::array<float*, 2>& outs, const int length);
|
||||
extern template void splitRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 3>& outs, const int length);
|
||||
extern template void splitRowImpl<sse42_tag, float, 3>(sse42_tag, const float* in, std::array<float*, 3>& outs, const int length);
|
||||
extern template void splitRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const uint8_t* in, std::array<uint8_t*, 4>& outs, const int length);
|
||||
extern template void splitRowImpl<sse42_tag, float, 4>(sse42_tag, const float* in, std::array<float*, 4>& outs, const int length);
|
||||
|
||||
template<typename isa_tag_t, typename T, int chs>
|
||||
void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length);
|
||||
|
||||
extern template void mergeRowImpl<sse42_tag, uint8_t, 2>(sse42_tag, const std::array<const uint8_t*, 2>& ins, uint8_t* out, const int length);
|
||||
extern template void mergeRowImpl<sse42_tag, float, 2>(sse42_tag, const std::array<const float*, 2>& ins, float* out, const int length);
|
||||
extern template void mergeRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const std::array<const uint8_t*, 3>& ins, uint8_t* out, const int length);
|
||||
extern template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
|
||||
extern template void mergeRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
|
||||
extern template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
||||
@@ -42,396 +42,10 @@
|
||||
|
||||
namespace InferenceEngine {
|
||||
namespace gapi {
|
||||
|
||||
//using namespace kernels;
|
||||
|
||||
namespace kernels {
|
||||
|
||||
// Row-level merge dispatcher. `ins` holds `chs` byte pointers that are read as
// rows of T; `out` is written as T with element c of pixel x at index chs*x + c.
// ISA-specific routines are tried first (AVX512 is compiled out, then AVX2,
// SSE4.2, NEON). They exist only for uint8_t/float with 2, 3 or 4 channels;
// every other combination ends up in the scalar loop at the bottom.
template<typename T, int chs> static
void mergeRow(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int length) {
// The AVX512 implementation of wide universal intrinsics is slower than AVX2,
// so it stays disabled until the cause is found.
#if 0
#ifdef HAVE_AVX512
    if (with_cpu_x86_avx512f()) {
        if (std::is_same<T, uint8_t>::value) {
            switch (chs) {
            case 2: avx512::mergeRow_8UC2(ins[0], ins[1], out, length); return;
            case 3: avx512::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length); return;
            case 4: avx512::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length); return;
            default: break;
            }
        } else if (std::is_same<T, float>::value) {
            switch (chs) {
            case 2:
                avx512::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
                                       reinterpret_cast<const float*>(ins[1]),
                                       reinterpret_cast<float*>(out), length);
                return;
            case 3:
                avx512::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
                                       reinterpret_cast<const float*>(ins[1]),
                                       reinterpret_cast<const float*>(ins[2]),
                                       reinterpret_cast<float*>(out), length);
                return;
            case 4:
                avx512::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
                                       reinterpret_cast<const float*>(ins[1]),
                                       reinterpret_cast<const float*>(ins[2]),
                                       reinterpret_cast<const float*>(ins[3]),
                                       reinterpret_cast<float*>(out), length);
                return;
            default: break;
            }
        }
    }
#endif  // HAVE_AVX512
#endif

#ifdef HAVE_AVX2
    if (with_cpu_x86_avx2()) {
        if (std::is_same<T, uint8_t>::value) {
            switch (chs) {
            case 2: avx::mergeRow_8UC2(ins[0], ins[1], out, length); return;
            case 3: avx::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length); return;
            case 4: avx::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length); return;
            default: break;
            }
        } else if (std::is_same<T, float>::value) {
            switch (chs) {
            case 2:
                avx::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
                                    reinterpret_cast<const float*>(ins[1]),
                                    reinterpret_cast<float*>(out), length);
                return;
            case 3:
                avx::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
                                    reinterpret_cast<const float*>(ins[1]),
                                    reinterpret_cast<const float*>(ins[2]),
                                    reinterpret_cast<float*>(out), length);
                return;
            case 4:
                avx::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
                                    reinterpret_cast<const float*>(ins[1]),
                                    reinterpret_cast<const float*>(ins[2]),
                                    reinterpret_cast<const float*>(ins[3]),
                                    reinterpret_cast<float*>(out), length);
                return;
            default: break;
            }
        }
    }
#endif  // HAVE_AVX2

#ifdef HAVE_SSE
    if (with_cpu_x86_sse42()) {
        if (std::is_same<T, uint8_t>::value) {
            switch (chs) {
            case 2: mergeRow_8UC2(ins[0], ins[1], out, length); return;
            case 3: mergeRow_8UC3(ins[0], ins[1], ins[2], out, length); return;
            case 4: mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length); return;
            default: break;
            }
        } else if (std::is_same<T, float>::value) {
            switch (chs) {
            case 2:
                mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
                               reinterpret_cast<const float*>(ins[1]),
                               reinterpret_cast<float*>(out), length);
                return;
            case 3:
                mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
                               reinterpret_cast<const float*>(ins[1]),
                               reinterpret_cast<const float*>(ins[2]),
                               reinterpret_cast<float*>(out), length);
                return;
            case 4:
                mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
                               reinterpret_cast<const float*>(ins[1]),
                               reinterpret_cast<const float*>(ins[2]),
                               reinterpret_cast<const float*>(ins[3]),
                               reinterpret_cast<float*>(out), length);
                return;
            default: break;
            }
        }
    }
#endif  // HAVE_SSE

#ifdef HAVE_NEON
    // No runtime CPU check in this branch: the NEON routines are called
    // whenever HAVE_NEON is defined.
    if (std::is_same<T, uint8_t>::value) {
        switch (chs) {
        case 2: neon::mergeRow_8UC2(ins[0], ins[1], out, length); return;
        case 3: neon::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length); return;
        case 4: neon::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length); return;
        default: break;
        }
    } else if (std::is_same<T, float>::value) {
        switch (chs) {
        case 2:
            neon::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
                                 reinterpret_cast<const float*>(ins[1]),
                                 reinterpret_cast<float*>(out), length);
            return;
        case 3:
            neon::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
                                 reinterpret_cast<const float*>(ins[1]),
                                 reinterpret_cast<const float*>(ins[2]),
                                 reinterpret_cast<float*>(out), length);
            return;
        case 4:
            neon::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
                                 reinterpret_cast<const float*>(ins[1]),
                                 reinterpret_cast<const float*>(ins[2]),
                                 reinterpret_cast<const float*>(ins[3]),
                                 reinterpret_cast<float*>(out), length);
            return;
        default: break;
        }
    }
#endif  // HAVE_NEON

    // Scalar fallback: view every plane as T and emit the channels of each
    // pixel back to back.
    const T* planes[chs];
    for (int ch = 0; ch < chs; ++ch) {
        planes[ch] = reinterpret_cast<const T*>(ins[ch]);
    }

    T* dst = reinterpret_cast<T*>(out);
    for (int px = 0; px < length; ++px) {
        for (int ch = 0; ch < chs; ++ch) {
            *dst++ = planes[ch][px];
        }
    }
}
|
||||
|
||||
// Row-level split dispatcher. `in` is read as a packed row of T with element c
// of pixel x at index chs*x + c; `outs` holds `chs` byte pointers that are
// written as rows of T.
// ISA-specific routines are tried first (AVX512, AVX2, SSE4.2, NEON). They exist
// only for uint8_t/float with 2, 3 or 4 channels; every other combination ends
// up in the scalar loop at the bottom.
template<typename T, int chs> static
void splitRow(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length) {
#ifdef HAVE_AVX512
    if (with_cpu_x86_avx512f()) {
        if (std::is_same<T, uint8_t>::value) {
            switch (chs) {
            case 2: avx512::splitRow_8UC2(in, outs[0], outs[1], length); return;
            case 3: avx512::splitRow_8UC3(in, outs[0], outs[1], outs[2], length); return;
            case 4: avx512::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length); return;
            default: break;
            }
        } else if (std::is_same<T, float>::value) {
            switch (chs) {
            case 2:
                avx512::splitRow_32FC2(reinterpret_cast<const float*>(in),
                                       reinterpret_cast<float*>(outs[0]),
                                       reinterpret_cast<float*>(outs[1]),
                                       length);
                return;
            case 3:
                avx512::splitRow_32FC3(reinterpret_cast<const float*>(in),
                                       reinterpret_cast<float*>(outs[0]),
                                       reinterpret_cast<float*>(outs[1]),
                                       reinterpret_cast<float*>(outs[2]),
                                       length);
                return;
            case 4:
                avx512::splitRow_32FC4(reinterpret_cast<const float*>(in),
                                       reinterpret_cast<float*>(outs[0]),
                                       reinterpret_cast<float*>(outs[1]),
                                       reinterpret_cast<float*>(outs[2]),
                                       reinterpret_cast<float*>(outs[3]),
                                       length);
                return;
            default: break;
            }
        }
    }
#endif  // HAVE_AVX512

#ifdef HAVE_AVX2
    if (with_cpu_x86_avx2()) {
        if (std::is_same<T, uint8_t>::value) {
            switch (chs) {
            case 2: avx::splitRow_8UC2(in, outs[0], outs[1], length); return;
            case 3: avx::splitRow_8UC3(in, outs[0], outs[1], outs[2], length); return;
            case 4: avx::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length); return;
            default: break;
            }
        } else if (std::is_same<T, float>::value) {
            switch (chs) {
            case 2:
                avx::splitRow_32FC2(reinterpret_cast<const float*>(in),
                                    reinterpret_cast<float*>(outs[0]),
                                    reinterpret_cast<float*>(outs[1]),
                                    length);
                return;
            case 3:
                avx::splitRow_32FC3(reinterpret_cast<const float*>(in),
                                    reinterpret_cast<float*>(outs[0]),
                                    reinterpret_cast<float*>(outs[1]),
                                    reinterpret_cast<float*>(outs[2]),
                                    length);
                return;
            case 4:
                avx::splitRow_32FC4(reinterpret_cast<const float*>(in),
                                    reinterpret_cast<float*>(outs[0]),
                                    reinterpret_cast<float*>(outs[1]),
                                    reinterpret_cast<float*>(outs[2]),
                                    reinterpret_cast<float*>(outs[3]),
                                    length);
                return;
            default: break;
            }
        }
    }
#endif  // HAVE_AVX2

#ifdef HAVE_SSE
    if (with_cpu_x86_sse42()) {
        if (std::is_same<T, uint8_t>::value) {
            switch (chs) {
            case 2: splitRow_8UC2(in, outs[0], outs[1], length); return;
            case 3: splitRow_8UC3(in, outs[0], outs[1], outs[2], length); return;
            case 4: splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length); return;
            default: break;
            }
        } else if (std::is_same<T, float>::value) {
            switch (chs) {
            case 2:
                splitRow_32FC2(reinterpret_cast<const float*>(in),
                               reinterpret_cast<float*>(outs[0]),
                               reinterpret_cast<float*>(outs[1]),
                               length);
                return;
            case 3:
                splitRow_32FC3(reinterpret_cast<const float*>(in),
                               reinterpret_cast<float*>(outs[0]),
                               reinterpret_cast<float*>(outs[1]),
                               reinterpret_cast<float*>(outs[2]),
                               length);
                return;
            case 4:
                splitRow_32FC4(reinterpret_cast<const float*>(in),
                               reinterpret_cast<float*>(outs[0]),
                               reinterpret_cast<float*>(outs[1]),
                               reinterpret_cast<float*>(outs[2]),
                               reinterpret_cast<float*>(outs[3]),
                               length);
                return;
            default: break;
            }
        }
    }
#endif  // HAVE_SSE

#ifdef HAVE_NEON
    // No runtime CPU check in this branch: the NEON routines are called
    // whenever HAVE_NEON is defined.
    if (std::is_same<T, uint8_t>::value) {
        switch (chs) {
        case 2: neon::splitRow_8UC2(in, outs[0], outs[1], length); return;
        case 3: neon::splitRow_8UC3(in, outs[0], outs[1], outs[2], length); return;
        case 4: neon::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length); return;
        default: break;
        }
    } else if (std::is_same<T, float>::value) {
        switch (chs) {
        case 2:
            neon::splitRow_32FC2(reinterpret_cast<const float*>(in),
                                 reinterpret_cast<float*>(outs[0]),
                                 reinterpret_cast<float*>(outs[1]),
                                 length);
            return;
        case 3:
            neon::splitRow_32FC3(reinterpret_cast<const float*>(in),
                                 reinterpret_cast<float*>(outs[0]),
                                 reinterpret_cast<float*>(outs[1]),
                                 reinterpret_cast<float*>(outs[2]),
                                 length);
            return;
        case 4:
            neon::splitRow_32FC4(reinterpret_cast<const float*>(in),
                                 reinterpret_cast<float*>(outs[0]),
                                 reinterpret_cast<float*>(outs[1]),
                                 reinterpret_cast<float*>(outs[2]),
                                 reinterpret_cast<float*>(outs[3]),
                                 length);
            return;
        default: break;
        }
    }
#endif  // HAVE_NEON

    // Scalar fallback: view every destination as T and deal the packed
    // elements out to the planes in channel order.
    T* planes[chs];
    for (int ch = 0; ch < chs; ++ch) {
        planes[ch] = reinterpret_cast<T*>(outs[ch]);
    }

    const T* src = reinterpret_cast<const T*>(in);
    for (int px = 0; px < length; ++px) {
        for (int ch = 0; ch < chs; ++ch) {
            planes[ch][px] = *src++;
        }
    }
}
|
||||
|
||||
namespace {
|
||||
|
||||
struct fp_16_t {
|
||||
@@ -583,168 +197,108 @@ bool is_cv_type_in_list(const int type_id) {
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
using merge_supported_types = typelist<uint8_t, int8_t, uint16_t, int16_t, int32_t, float, fp_16_t>;
|
||||
|
||||
template<int chs>
|
||||
template<typename T, int chs>
|
||||
void mergeRowImpl(scalar_tag, const std::array<const T*, chs>& ins, T* out, const int length) {
|
||||
for (int x = 0; x < length; ++x) {
|
||||
for (int c = 0; c < chs; ++c) {
|
||||
out[chs * x + c] = ins[c][x];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename isa_tag_t, int chs>
|
||||
struct typed_merge_row {
|
||||
using p_f = void (*)(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int length);
|
||||
using p_f = void (*)(const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length);
|
||||
|
||||
template <typename type>
|
||||
p_f operator()(type_to_type<type> ) { return mergeRow<type, chs>; }
|
||||
typename std::enable_if<std::is_same<isa_tag_t, scalar_tag>::value ||
|
||||
(!std::is_same<isa_tag_t, scalar_tag>::value && !std::is_same<type, uint8_t>::value &&
|
||||
!std::is_same<type, float>::value), p_f>::type
|
||||
operator()(type_to_type<type> ) {
|
||||
return [](const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length) {
|
||||
const auto inT = reinterpret_cast<const std::array<const type*, chs>&>(ins);
|
||||
auto outT = reinterpret_cast<type*>(out);
|
||||
scalar_tag t;
|
||||
mergeRowImpl<type, chs>(t, inT, outT, length);
|
||||
};
|
||||
}
|
||||
|
||||
p_f operator()(type_to_type<fp_16_t> ) {
|
||||
static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v),
|
||||
"fp_16_t should be a plain wrap over FP16 implementation type");
|
||||
return mergeRow<decltype(fp_16_t::v), chs>;
|
||||
template<typename tag = isa_tag_t>
|
||||
typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
|
||||
operator()(type_to_type<uint8_t>) {
|
||||
return [](const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length) {
|
||||
tag t;
|
||||
mergeRowImpl<tag, uint8_t, chs>(t, ins, out, length);
|
||||
};
|
||||
}
|
||||
|
||||
template<typename tag = isa_tag_t>
|
||||
typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
|
||||
operator()(type_to_type<float>) {
|
||||
return [](const std::array<const uint8_t*, chs>& ins, uint8_t* out, const int length) {
|
||||
const auto inT = reinterpret_cast<const std::array<const float*, chs>&>(ins);
|
||||
auto outT = reinterpret_cast<float*>(out);
|
||||
tag t;
|
||||
mergeRowImpl<tag, float, chs>(t, inT, outT, length);
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Fluid kernel FMerge2 for Merge2: selects a row-merge function from the output
// depth and runs it once per output line on the matching lines of `a` and `b`.
GAPI_FLUID_KERNEL(FMerge2, Merge2, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View& a,
                    const cv::gapi::fluid::View& b,
                    cv::gapi::fluid::Buffer& out) {
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(out.meta().depth));

        // Resolve the per-type row routine once, before the line loop.
        const auto mergeLine = type_dispatch<merge_supported_types>(
            out.meta().depth, cv_type_id{}, typed_merge_row<2>{}, nullptr);

        int line = 0;
        while (line < out.lpi()) {
            mergeLine({a.InLineB(line), b.InLineB(line)}, out.OutLineB(line), a.length());
            ++line;
        }
    }
};
|
||||
|
||||
// Fluid kernel FMerge3 for Merge3: selects a row-merge function from the output
// depth and runs it once per output line on the matching lines of `a`, `b`, `c`.
GAPI_FLUID_KERNEL(FMerge3, Merge3, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View& a,
                    const cv::gapi::fluid::View& b,
                    const cv::gapi::fluid::View& c,
                    cv::gapi::fluid::Buffer& out) {
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(out.meta().depth));

        // Resolve the per-type row routine once, before the line loop.
        const auto mergeLine = type_dispatch<merge_supported_types>(
            out.meta().depth, cv_type_id{}, typed_merge_row<3>{}, nullptr);

        int line = 0;
        while (line < out.lpi()) {
            mergeLine({a.InLineB(line), b.InLineB(line), c.InLineB(line)},
                      out.OutLineB(line), a.length());
            ++line;
        }
    }
};
|
||||
|
||||
// Fluid kernel FMerge4 for Merge4: selects a row-merge function from the output
// depth and runs it once per output line on the matching lines of `a`..`d`.
GAPI_FLUID_KERNEL(FMerge4, Merge4, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View& a,
                    const cv::gapi::fluid::View& b,
                    const cv::gapi::fluid::View& c,
                    const cv::gapi::fluid::View& d,
                    cv::gapi::fluid::Buffer& out) {
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(out.meta().depth));

        // Resolve the per-type row routine once, before the line loop.
        const auto mergeLine = type_dispatch<merge_supported_types>(
            out.meta().depth, cv_type_id{}, typed_merge_row<4>{}, nullptr);

        int line = 0;
        while (line < out.lpi()) {
            mergeLine({a.InLineB(line), b.InLineB(line), c.InLineB(line), d.InLineB(line)},
                      out.OutLineB(line), a.length());
            ++line;
        }
    }
};
|
||||
|
||||
|
||||
namespace {
|
||||
using split_supported_types = typelist<uint8_t, int8_t, uint16_t, int16_t, int32_t, float, fp_16_t>;
|
||||
|
||||
template<int chs>
|
||||
template<typename T, int chs>
|
||||
void splitRowImpl(scalar_tag, const T* in, std::array<T*, chs>& outs, const int length) {
|
||||
for (int x = 0; x < length; ++x) {
|
||||
for (int c = 0; c < chs; ++c) {
|
||||
outs[c][x] = in[chs * x + c];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename isa_tag_t, int chs>
|
||||
struct typed_split_row {
|
||||
using p_f = void (*)(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length);
|
||||
using p_f = void (*)(const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length);
|
||||
|
||||
template <typename type>
|
||||
p_f operator()(type_to_type<type> ) { return splitRow<type, chs>; }
|
||||
typename std::enable_if<std::is_same<isa_tag_t, scalar_tag>::value ||
|
||||
(!std::is_same<isa_tag_t, scalar_tag>::value && !std::is_same<type, uint8_t>::value &&
|
||||
!std::is_same<type, float>::value), p_f>::type
|
||||
operator()(type_to_type<type> ) {
|
||||
return [](const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length) {
|
||||
const auto inT = reinterpret_cast<const type*>(in);
|
||||
auto outT = reinterpret_cast<std::array<type*, chs>&>(outs);
|
||||
scalar_tag t;
|
||||
splitRowImpl<type, chs>(t, inT, outT, length);
|
||||
};
|
||||
}
|
||||
|
||||
p_f operator()(type_to_type<fp_16_t> ) {
|
||||
static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v),
|
||||
"fp_16_t should be a plain wrap over FP16 implementation type");
|
||||
return splitRow<decltype(fp_16_t::v), chs>;
|
||||
template<typename tag = isa_tag_t>
|
||||
typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
|
||||
operator()(type_to_type<uint8_t>) {
|
||||
return [](const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length) {
|
||||
tag t;
|
||||
splitRowImpl<tag, uint8_t, chs>(t, in, outs, length);
|
||||
};
|
||||
}
|
||||
|
||||
template<typename tag = isa_tag_t>
|
||||
typename std::enable_if<!std::is_same<tag, scalar_tag>::value, p_f>::type
|
||||
operator()(type_to_type<float>) {
|
||||
return [](const uint8_t* in, std::array<uint8_t*, chs>& outs, const int length) {
|
||||
const auto inT = reinterpret_cast<const float*>(in);
|
||||
auto outT = reinterpret_cast<std::array<float*, chs>&>(outs);
|
||||
tag t;
|
||||
splitRowImpl<tag, float, chs>(t, inT, outT, length);
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Fluid kernel FSplit2 for Split2: checks channel counts and depths in debug
// builds, selects a row-split function from the input depth, and runs it once
// per line, writing into the matching lines of out1 and out2.
GAPI_FLUID_KERNEL(FSplit2, Split2, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2) {
        GAPI_DbgAssert(2 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(in.meta().depth));

        // Resolve the per-type row routine once, before the line loop.
        const auto splitLine = type_dispatch<split_supported_types>(
            in.meta().depth, cv_type_id{}, typed_split_row<2>{}, nullptr);

        const int lineCount = out1.lpi();
        for (int line = 0; line < lineCount; ++line) {
            std::array<uint8_t*, 2> planes = {out1.OutLineB(line), out2.OutLineB(line)};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
|
||||
|
||||
// Fluid kernel FSplit3 for Split3: checks channel counts and depths in debug
// builds, selects a row-split function from the input depth, and runs it once
// per line, writing into the matching lines of out1, out2 and out3.
GAPI_FLUID_KERNEL(FSplit3, Split3, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2,
                    cv::gapi::fluid::Buffer& out3) {
        GAPI_DbgAssert(3 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(1 == out3.meta().chan);
        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out3.meta().depth);

        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(in.meta().depth));

        // Resolve the per-type row routine once, before the line loop.
        const auto splitLine = type_dispatch<split_supported_types>(
            in.meta().depth, cv_type_id{}, typed_split_row<3>{}, nullptr);

        const int lineCount = out1.lpi();
        for (int line = 0; line < lineCount; ++line) {
            std::array<uint8_t*, 3> planes = {out1.OutLineB(line),
                                              out2.OutLineB(line),
                                              out3.OutLineB(line)};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
|
||||
|
||||
// Fluid kernel FSplit4 for Split4: checks channel counts and depths in debug
// builds, selects a row-split function from the input depth, and runs it once
// per line, writing into the matching lines of out1..out4.
GAPI_FLUID_KERNEL(FSplit4, Split4, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2,
                    cv::gapi::fluid::Buffer& out3,
                    cv::gapi::fluid::Buffer& out4) {
        GAPI_DbgAssert(4 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(1 == out3.meta().chan);
        GAPI_DbgAssert(1 == out4.meta().chan);
        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out3.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out4.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(in.meta().depth));

        // Resolve the per-type row routine once, before the line loop.
        const auto splitLine = type_dispatch<split_supported_types>(
            in.meta().depth, cv_type_id{}, typed_split_row<4>{}, nullptr);

        const int lineCount = out1.lpi();
        for (int line = 0; line < lineCount; ++line) {
            std::array<uint8_t*, 4> planes = {out1.OutLineB(line), out2.OutLineB(line),
                                              out3.OutLineB(line), out4.OutLineB(line)};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
using isas_set = typelist<
|
||||
#ifdef HAVE_AVX512
|
||||
@@ -1005,36 +559,179 @@ GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) {
|
||||
rowFunc(y_rows, u_row, v_row, out_rows, buf_width);
|
||||
}
|
||||
};
|
||||
|
||||
// Fluid kernel FSplit2 for Split2, parameterized by the enclosing isa_tag_t:
// checks channel counts and depths in debug builds, selects a row-split function
// from the input depth, and runs it once per line into out1 and out2.
GAPI_FLUID_KERNEL(FSplit2, Split2, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2) {
        GAPI_DbgAssert(2 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(in.meta().depth));

        // Resolve the per-type, per-ISA row routine once, before the line loop.
        const auto splitLine = type_dispatch<split_supported_types>(
            in.meta().depth, cv_type_id{}, typed_split_row<isa_tag_t, 2>{}, nullptr);

        const int lineCount = out1.lpi();
        for (int line = 0; line < lineCount; ++line) {
            std::array<uint8_t*, 2> planes = {out1.OutLineB(line), out2.OutLineB(line)};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
|
||||
|
||||
// Fluid kernel FSplit3 for Split3, parameterized by the enclosing isa_tag_t:
// checks channel counts and depths in debug builds, selects a row-split function
// from the input depth, and runs it once per line into out1, out2 and out3.
GAPI_FLUID_KERNEL(FSplit3, Split3, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View& in,
                    cv::gapi::fluid::Buffer& out1,
                    cv::gapi::fluid::Buffer& out2,
                    cv::gapi::fluid::Buffer& out3) {
        GAPI_DbgAssert(3 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(1 == out3.meta().chan);
        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out3.meta().depth);

        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(in.meta().depth));

        // Resolve the per-type, per-ISA row routine once, before the line loop.
        const auto splitLine = type_dispatch<split_supported_types>(
            in.meta().depth, cv_type_id{}, typed_split_row<isa_tag_t, 3>{}, nullptr);

        const int lineCount = out1.lpi();
        for (int line = 0; line < lineCount; ++line) {
            std::array<uint8_t*, 3> planes = {out1.OutLineB(line),
                                              out2.OutLineB(line),
                                              out3.OutLineB(line)};
            splitLine(in.InLineB(line), planes, in.length());
        }
    }
};
|
||||
|
||||
// Fluid kernel that splits a 4-channel input into four 1-channel outputs.
// The row routine is picked by type_dispatch from the input depth and is
// applied to every line of the current iteration.
GAPI_FLUID_KERNEL(FSplit4, Split4, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View & in,
                    cv::gapi::fluid::Buffer & out1,
                    cv::gapi::fluid::Buffer & out2,
                    cv::gapi::fluid::Buffer & out3,
                    cv::gapi::fluid::Buffer & out4) {
        // Debug-only contract: 4-channel input, 1-channel outputs, same depth everywhere.
        GAPI_DbgAssert(4 == in.meta().chan);
        GAPI_DbgAssert(1 == out1.meta().chan);
        GAPI_DbgAssert(1 == out2.meta().chan);
        GAPI_DbgAssert(1 == out3.meta().chan);
        GAPI_DbgAssert(1 == out4.meta().chan);
        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out3.meta().depth);
        GAPI_DbgAssert(in.meta().depth == out4.meta().depth);
        GAPI_DbgAssert(is_cv_type_in_list<split_supported_types>(in.meta().depth));

        const auto splitLine = type_dispatch<split_supported_types>(in.meta().depth, cv_type_id{}, typed_split_row<isa_tag_t, 4>{}, nullptr);

        const int lineCount = out1.lpi();
        const int width = in.length();
        for (int line = 0; line < lineCount; ++line) {
            // One destination pointer per output plane for this line.
            std::array<uint8_t*, 4> planes = { out1.OutLineB(line),
                                               out2.OutLineB(line),
                                               out3.OutLineB(line),
                                               out4.OutLineB(line) };
            splitLine(in.InLineB(line), planes, width);
        }
    }
};
|
||||
|
||||
// Fluid kernel that combines two input views into one output buffer.
// The row routine is picked by type_dispatch from the output depth and is
// applied to every line of the current iteration.
GAPI_FLUID_KERNEL(FMerge2, Merge2, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View & a,
                    const cv::gapi::fluid::View & b,
                    cv::gapi::fluid::Buffer & out) {
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(out.meta().depth));

        const auto mergeLine = type_dispatch<merge_supported_types>(out.meta().depth, cv_type_id{}, typed_merge_row<isa_tag_t, 2>{}, nullptr);

        const int lineCount = out.lpi();
        const int width = a.length();
        for (int line = 0; line < lineCount; ++line) {
            mergeLine({ a.InLineB(line), b.InLineB(line) }, out.OutLineB(line), width);
        }
    }
};
|
||||
|
||||
// Fluid kernel that combines three input views into one output buffer.
// The row routine is picked by type_dispatch from the output depth and is
// applied to every line of the current iteration.
GAPI_FLUID_KERNEL(FMerge3, Merge3, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View & a,
                    const cv::gapi::fluid::View & b,
                    const cv::gapi::fluid::View & c,
                    cv::gapi::fluid::Buffer & out) {
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(out.meta().depth));

        const auto mergeLine = type_dispatch<merge_supported_types>(out.meta().depth, cv_type_id{}, typed_merge_row<isa_tag_t, 3>{}, nullptr);

        const int lineCount = out.lpi();
        const int width = a.length();
        for (int line = 0; line < lineCount; ++line) {
            mergeLine({ a.InLineB(line), b.InLineB(line), c.InLineB(line) }, out.OutLineB(line), width);
        }
    }
};
|
||||
|
||||
// Fluid kernel that combines four input views into one output buffer.
// The row routine is picked by type_dispatch from the output depth and is
// applied to every line of the current iteration.
GAPI_FLUID_KERNEL(FMerge4, Merge4, false) {
    static const int LPI = 4;
    static const int Window = 1;

    static void run(const cv::gapi::fluid::View & a,
                    const cv::gapi::fluid::View & b,
                    const cv::gapi::fluid::View & c,
                    const cv::gapi::fluid::View & d,
                    cv::gapi::fluid::Buffer & out) {
        GAPI_DbgAssert(is_cv_type_in_list<merge_supported_types>(out.meta().depth));

        const auto mergeLine = type_dispatch<merge_supported_types>(out.meta().depth, cv_type_id{}, typed_merge_row<isa_tag_t, 4>{}, nullptr);

        const int lineCount = out.lpi();
        const int width = a.length();
        for (int line = 0; line < lineCount; ++line) {
            mergeLine({ a.InLineB(line), b.InLineB(line), c.InLineB(line), d.InLineB(line) }, out.OutLineB(line), width);
        }
    }
};
|
||||
};
|
||||
|
||||
namespace {
|
||||
struct ColorConversionISA {
|
||||
struct CC_and_MergeISA {
|
||||
cv::gapi::GKernelPackage& pckg;
|
||||
|
||||
ColorConversionISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}
|
||||
CC_and_MergeISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}
|
||||
|
||||
template<typename isa_tag_t>
|
||||
bool operator()(type_to_type<isa_tag_t>) {
|
||||
pckg.include<typename choose_impl<isa_tag_t>::FI420toRGB>();
|
||||
pckg.include<typename choose_impl<isa_tag_t>::FNV12toRGB>();
|
||||
pckg.include<typename choose_impl<isa_tag_t>::FChanToPlane>();
|
||||
pckg.include<typename choose_impl<isa_tag_t>::FMerge2>();
|
||||
pckg.include<typename choose_impl<isa_tag_t>::FMerge3>();
|
||||
pckg.include<typename choose_impl<isa_tag_t>::FMerge4>();
|
||||
//at the moment type_dispatch requires something to be returned by the lambda
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
struct SplitISA {
|
||||
cv::gapi::GKernelPackage& pckg;
|
||||
|
||||
SplitISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}
|
||||
|
||||
template<typename isa_tag_t>
|
||||
bool operator()(type_to_type<isa_tag_t>) {
|
||||
pckg.include<typename choose_impl<isa_tag_t>::FSplit2>();
|
||||
pckg.include<typename choose_impl<isa_tag_t>::FSplit3>();
|
||||
pckg.include<typename choose_impl<isa_tag_t>::FSplit4>();
|
||||
//at the moment type_dispatch requires something to be returned by the lambda
|
||||
return true;
|
||||
}
|
||||
};
|
||||
} //namespace
|
||||
|
||||
cv::gapi::GKernelPackage FColorConversionChooseISA() {
|
||||
cv::gapi::GKernelPackage FKernelsChooseISA() {
|
||||
// At the moment AVX512 implementation of wide universal intrinsics is slower than AVX2.
|
||||
// So, disable it for now.
|
||||
using isas = remove_t<isas_set, avx512_tag>;
|
||||
|
||||
cv::gapi::GKernelPackage pckg;
|
||||
ColorConversionISA ctpISA{pckg};
|
||||
cv::gapi::GKernelPackage pckg1, pckg2;
|
||||
CC_and_MergeISA ccISA{ pckg1 };
|
||||
SplitISA sISA{ pckg2 };
|
||||
|
||||
type_dispatch<isas>(is_isa_present{}, ctpISA, false);
|
||||
type_dispatch<isas>(is_isa_present{}, ccISA, false);
|
||||
type_dispatch<isas_set>(is_isa_present{}, sISA, false);
|
||||
|
||||
return pckg;
|
||||
return combine(pckg1, pckg2);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
@@ -2601,7 +2298,7 @@ using namespace kernels;
|
||||
|
||||
cv::gapi::GKernelPackage preprocKernels() {
|
||||
return combine(
|
||||
FColorConversionChooseISA(),
|
||||
FKernelsChooseISA(),
|
||||
cv::gapi::kernels
|
||||
<FScalePlanes
|
||||
, FScalePlanes4
|
||||
@@ -2612,12 +2309,6 @@ cv::gapi::GKernelPackage preprocKernels() {
|
||||
, FUpscalePlaneArea32f
|
||||
, FScalePlaneArea8u
|
||||
, FScalePlaneArea32f
|
||||
, FMerge2
|
||||
, FMerge3
|
||||
, FMerge4
|
||||
, FSplit2
|
||||
, FSplit3
|
||||
, FSplit4
|
||||
, FConvertDepth
|
||||
, FSubC
|
||||
, FDivC
|
||||
|
||||
@@ -18,360 +18,204 @@ namespace gapi {
|
||||
|
||||
namespace kernels {
|
||||
|
||||
CV_ALWAYS_INLINE void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
|
||||
uint8_t out[], int length) {
|
||||
int l = 0;
|
||||
template <typename VecT, typename T>
|
||||
CV_ALWAYS_INLINE void mergeRowC2_Impl(const T in0[], const T in1[],
|
||||
T out[], const int length) {
|
||||
int x = 0;
|
||||
|
||||
#if MANUAL_SIMD
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int nlanes = VecT::nlanes;
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
|
||||
cycle:
|
||||
for (; l <= length - nlanes; l += nlanes) {
|
||||
v_uint8 r0, r1;
|
||||
r0 = vx_load(&in0[l]);
|
||||
r1 = vx_load(&in1[l]);
|
||||
v_store_interleave(&out[2*l], r0, r1);
|
||||
}
|
||||
VecT r0, r1;
|
||||
for (; length >= nlanes;) {
|
||||
for (; x <= length - nlanes; x += nlanes) {
|
||||
r0 = vx_load(&in0[x]);
|
||||
r1 = vx_load(&in1[x]);
|
||||
v_store_interleave(&out[2*x], r0, r1);
|
||||
}
|
||||
|
||||
// to think about how to remove those ifs
|
||||
if (l < length && length >= nlanes) {
|
||||
l = length - nlanes;
|
||||
goto cycle;
|
||||
if (x < length) {
|
||||
x = length - nlanes;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; l < length; ++l) {
|
||||
out[2*l + 0] = in0[l];
|
||||
out[2*l + 1] = in1[l];
|
||||
for (; x < length; ++x) {
|
||||
out[2*x + 0] = in0[x];
|
||||
out[2*x + 1] = in1[x];
|
||||
}
|
||||
}
|
||||
|
||||
CV_ALWAYS_INLINE void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
|
||||
const uint8_t in2[], uint8_t out[], int length) {
|
||||
int l = 0;
|
||||
template <typename VecT, typename T>
|
||||
CV_ALWAYS_INLINE void mergeRowC3_Impl(const T in0[], const T in1[],
|
||||
const T in2[], T out[], const int length) {
|
||||
int x = 0;
|
||||
|
||||
#if MANUAL_SIMD
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int nlanes = VecT::nlanes;
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
|
||||
cycle:
|
||||
for (; l <= length - nlanes; l += nlanes) {
|
||||
v_uint8 r0, r1, r2;
|
||||
r0 = vx_load(&in0[l]);
|
||||
r1 = vx_load(&in1[l]);
|
||||
r2 = vx_load(&in2[l]);
|
||||
v_store_interleave(&out[3*l], r0, r1, r2);
|
||||
}
|
||||
VecT r0, r1, r2;
|
||||
for (; length >= nlanes;) {
|
||||
for (; x <= length - nlanes; x += nlanes) {
|
||||
r0 = vx_load(&in0[x]);
|
||||
r1 = vx_load(&in1[x]);
|
||||
r2 = vx_load(&in2[x]);
|
||||
v_store_interleave(&out[3*x], r0, r1, r2);
|
||||
}
|
||||
|
||||
if (l < length && length >= nlanes) {
|
||||
l = length - nlanes;
|
||||
goto cycle;
|
||||
if (x < length) {
|
||||
x = length - nlanes;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; l < length; ++l) {
|
||||
out[3*l + 0] = in0[l];
|
||||
out[3*l + 1] = in1[l];
|
||||
out[3*l + 2] = in2[l];
|
||||
for (; x < length; ++x) {
|
||||
out[3*x + 0] = in0[x];
|
||||
out[3*x + 1] = in1[x];
|
||||
out[3*x + 2] = in2[x];
|
||||
}
|
||||
}
|
||||
|
||||
CV_ALWAYS_INLINE void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[],
|
||||
const uint8_t in2[], const uint8_t in3[],
|
||||
uint8_t out[], int length) {
|
||||
int l = 0;
|
||||
template <typename VecT, typename T>
|
||||
CV_ALWAYS_INLINE void mergeRowC4_Impl(const T in0[], const T in1[],
|
||||
const T in2[], const T in3[],
|
||||
T out[], const int length) {
|
||||
int x = 0;
|
||||
|
||||
#if MANUAL_SIMD
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int nlanes = VecT::nlanes;
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
|
||||
cycle:
|
||||
for (; l <= length - nlanes; l += nlanes) {
|
||||
v_uint8 r0, r1, r2, r3;
|
||||
r0 = vx_load(&in0[l]);
|
||||
r1 = vx_load(&in1[l]);
|
||||
r2 = vx_load(&in2[l]);
|
||||
r3 = vx_load(&in3[l]);
|
||||
v_store_interleave(&out[4*l], r0, r1, r2, r3);
|
||||
}
|
||||
VecT r0, r1, r2, r3;
|
||||
for (; length >= nlanes;) {
|
||||
for (; x <= length - nlanes; x += nlanes) {
|
||||
r0 = vx_load(&in0[x]);
|
||||
r1 = vx_load(&in1[x]);
|
||||
r2 = vx_load(&in2[x]);
|
||||
r3 = vx_load(&in3[x]);
|
||||
v_store_interleave(&out[4* x], r0, r1, r2, r3);
|
||||
}
|
||||
|
||||
if (l < length && length >= nlanes) {
|
||||
l = length - nlanes;
|
||||
goto cycle;
|
||||
if (x < length) {
|
||||
x = length - nlanes;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; l < length; ++l) {
|
||||
out[4*l + 0] = in0[l];
|
||||
out[4*l + 1] = in1[l];
|
||||
out[4*l + 2] = in2[l];
|
||||
out[4*l + 3] = in3[l];
|
||||
for (; x < length; ++x) {
|
||||
out[4*x + 0] = in0[x];
|
||||
out[4*x + 1] = in1[x];
|
||||
out[4*x + 2] = in2[x];
|
||||
out[4*x + 3] = in3[x];
|
||||
}
|
||||
}
|
||||
|
||||
// Interleaves two float rows: out[2*i + 0] = in0[i], out[2*i + 1] = in1[i]
// for i in [0, length).
CV_ALWAYS_INLINE void mergeRow_32FC2_Impl(const float in0[], const float in1[],
                                          float out[], int length) {
    int pos = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_float32::nlanes;

    if (length >= nlanes) {
        for (;;) {
            for (; pos <= length - nlanes; pos += nlanes) {
                v_float32 a = vx_load(&in0[pos]);
                v_float32 b = vx_load(&in1[pos]);
                v_store_interleave(&out[2*pos], a, b);
            }

            if (pos >= length) {
                break;
            }

            // A tail shorter than one vector remains: step back so the last
            // full vector ends exactly at `length` and run the loop once more.
            pos = length - nlanes;
        }
    }
#endif

    // Scalar path: whole row without SIMD, or rows shorter than one vector.
    for (; pos < length; ++pos) {
        out[2*pos + 0] = in0[pos];
        out[2*pos + 1] = in1[pos];
    }
}
|
||||
|
||||
// Interleaves three float rows: out[3*i + k] = in<k>[i] for k = 0..2 and
// i in [0, length).
CV_ALWAYS_INLINE void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[],
                                          float out[], int length) {
    int pos = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_float32::nlanes;

    if (length >= nlanes) {
        for (;;) {
            for (; pos <= length - nlanes; pos += nlanes) {
                v_float32 a = vx_load(&in0[pos]);
                v_float32 b = vx_load(&in1[pos]);
                v_float32 c = vx_load(&in2[pos]);
                v_store_interleave(&out[3*pos], a, b, c);
            }

            if (pos >= length) {
                break;
            }

            // A tail shorter than one vector remains: step back so the last
            // full vector ends exactly at `length` and run the loop once more.
            pos = length - nlanes;
        }
    }
#endif

    // Scalar path: whole row without SIMD, or rows shorter than one vector.
    for (; pos < length; ++pos) {
        out[3*pos + 0] = in0[pos];
        out[3*pos + 1] = in1[pos];
        out[3*pos + 2] = in2[pos];
    }
}
|
||||
|
||||
// Interleaves four float rows: out[4*i + k] = in<k>[i] for k = 0..3 and
// i in [0, length).
CV_ALWAYS_INLINE void mergeRow_32FC4_Impl(const float in0[], const float in1[],
                                          const float in2[], const float in3[],
                                          float out[], int length) {
    int pos = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_float32::nlanes;

    if (length >= nlanes) {
        for (;;) {
            for (; pos <= length - nlanes; pos += nlanes) {
                v_float32 a = vx_load(&in0[pos]);
                v_float32 b = vx_load(&in1[pos]);
                v_float32 c = vx_load(&in2[pos]);
                v_float32 d = vx_load(&in3[pos]);
                v_store_interleave(&out[4*pos], a, b, c, d);
            }

            if (pos >= length) {
                break;
            }

            // A tail shorter than one vector remains: step back so the last
            // full vector ends exactly at `length` and run the loop once more.
            pos = length - nlanes;
        }
    }
#endif

    // Scalar path: whole row without SIMD, or rows shorter than one vector.
    for (; pos < length; ++pos) {
        out[4*pos + 0] = in0[pos];
        out[4*pos + 1] = in1[pos];
        out[4*pos + 2] = in2[pos];
        out[4*pos + 3] = in3[pos];
    }
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
CV_ALWAYS_INLINE void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
|
||||
uint8_t out1[], int length) {
|
||||
int l = 0;
|
||||
template <typename VecT, typename T>
|
||||
CV_ALWAYS_INLINE void splitRowC2_Impl(const T in[], T out0[],
|
||||
T out1[], const int length) {
|
||||
int x = 0;
|
||||
|
||||
#if MANUAL_SIMD
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int nlanes = VecT::nlanes;
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
|
||||
cycle:
|
||||
for (; l <= length - nlanes; l += nlanes) {
|
||||
v_uint8 r0, r1;
|
||||
v_load_deinterleave(&in[2*l], r0, r1);
|
||||
vx_store(&out0[l], r0);
|
||||
vx_store(&out1[l], r1);
|
||||
}
|
||||
VecT r0, r1;
|
||||
for (; length >= nlanes;) {
|
||||
for (; x <= length - nlanes; x += nlanes) {
|
||||
v_load_deinterleave(&in[2*x], r0, r1);
|
||||
vx_store(&out0[x], r0);
|
||||
vx_store(&out1[x], r1);
|
||||
}
|
||||
|
||||
if (l < length && length >= nlanes) {
|
||||
l = length - nlanes;
|
||||
goto cycle;
|
||||
if (x < length) {
|
||||
x = length - nlanes;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; l < length; ++l) {
|
||||
out0[l] = in[2*l + 0];
|
||||
out1[l] = in[2*l + 1];
|
||||
for (; x < length; ++x) {
|
||||
out0[x] = in[2*x + 0];
|
||||
out1[x] = in[2*x + 1];
|
||||
}
|
||||
}
|
||||
|
||||
CV_ALWAYS_INLINE void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
|
||||
uint8_t out1[], uint8_t out2[], int length) {
|
||||
int l = 0;
|
||||
template <typename VecT, typename T>
|
||||
CV_ALWAYS_INLINE void splitRowC3_Impl(const T in[], T out0[],
|
||||
T out1[], T out2[], const int length) {
|
||||
int x = 0;
|
||||
|
||||
#if MANUAL_SIMD
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int nlanes = VecT::nlanes;
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
|
||||
cycle:
|
||||
for (; l <= length - nlanes; l += nlanes) {
|
||||
v_uint8 r0, r1, r2;
|
||||
v_load_deinterleave(&in[3*l], r0, r1, r2);
|
||||
vx_store(&out0[l], r0);
|
||||
vx_store(&out1[l], r1);
|
||||
vx_store(&out2[l], r2);
|
||||
}
|
||||
VecT r0, r1, r2;
|
||||
for (; length >= nlanes;) {
|
||||
for (; x <= length - nlanes; x += nlanes) {
|
||||
v_load_deinterleave(&in[3*x], r0, r1, r2);
|
||||
vx_store(&out0[x], r0);
|
||||
vx_store(&out1[x], r1);
|
||||
vx_store(&out2[x], r2);
|
||||
}
|
||||
|
||||
if (l < length && length >= nlanes) {
|
||||
l = length - nlanes;
|
||||
goto cycle;
|
||||
if (x < length) {
|
||||
x = length - nlanes;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; l < length; ++l) {
|
||||
out0[l] = in[3*l + 0];
|
||||
out1[l] = in[3*l + 1];
|
||||
out2[l] = in[3*l + 2];
|
||||
for (; x < length; ++x) {
|
||||
out0[x] = in[3*x + 0];
|
||||
out1[x] = in[3*x + 1];
|
||||
out2[x] = in[3*x + 2];
|
||||
}
|
||||
}
|
||||
|
||||
CV_ALWAYS_INLINE void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[],
|
||||
uint8_t out2[], uint8_t out3[], int length) {
|
||||
int l = 0;
|
||||
template <typename VecT, typename T>
|
||||
CV_ALWAYS_INLINE void splitRowC4_Impl(const T in[], T out0[], T out1[],
|
||||
T out2[], T out3[], const int length) {
|
||||
int x = 0;
|
||||
|
||||
#if MANUAL_SIMD
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int nlanes = VecT::nlanes;
|
||||
GAPI_DbgAssert(length >= nlanes);
|
||||
|
||||
cycle:
|
||||
for (; l <= length - nlanes; l += nlanes) {
|
||||
v_uint8 r0, r1, r2, r3;
|
||||
v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
|
||||
vx_store(&out0[l], r0);
|
||||
vx_store(&out1[l], r1);
|
||||
vx_store(&out2[l], r2);
|
||||
vx_store(&out3[l], r3);
|
||||
}
|
||||
VecT r0, r1, r2, r3;
|
||||
for (; length >= nlanes;) {
|
||||
for (; x <= length - nlanes; x += nlanes) {
|
||||
v_load_deinterleave(&in[4*x], r0, r1, r2, r3);
|
||||
vx_store(&out0[x], r0);
|
||||
vx_store(&out1[x], r1);
|
||||
vx_store(&out2[x], r2);
|
||||
vx_store(&out3[x], r3);
|
||||
}
|
||||
|
||||
if (l < length && length >= nlanes) {
|
||||
l = length - nlanes;
|
||||
goto cycle;
|
||||
if (x < length) {
|
||||
x = length - nlanes;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; l < length; ++l) {
|
||||
out0[l] = in[4*l + 0];
|
||||
out1[l] = in[4*l + 1];
|
||||
out2[l] = in[4*l + 2];
|
||||
out3[l] = in[4*l + 3];
|
||||
for (; x < length; ++x) {
|
||||
out0[x] = in[4*x + 0];
|
||||
out1[x] = in[4*x + 1];
|
||||
out2[x] = in[4*x + 2];
|
||||
out3[x] = in[4*x + 3];
|
||||
}
|
||||
}
|
||||
|
||||
// De-interleaves a 2-channel float row: out0[i] = in[2*i + 0],
// out1[i] = in[2*i + 1] for i in [0, length).
CV_ALWAYS_INLINE void splitRow_32FC2_Impl(const float in[], float out0[],
                                          float out1[], int length) {
    int pos = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_float32::nlanes;

    if (length >= nlanes) {
        for (;;) {
            for (; pos <= length - nlanes; pos += nlanes) {
                v_float32 c0, c1;
                v_load_deinterleave(&in[2*pos], c0, c1);
                vx_store(&out0[pos], c0);
                vx_store(&out1[pos], c1);
            }

            if (pos >= length) {
                break;
            }

            // A tail shorter than one vector remains: step back so the last
            // full vector ends exactly at `length` and run the loop once more.
            pos = length - nlanes;
        }
    }
#endif

    // Scalar path: whole row without SIMD, or rows shorter than one vector.
    for (; pos < length; ++pos) {
        out0[pos] = in[2*pos + 0];
        out1[pos] = in[2*pos + 1];
    }
}
|
||||
|
||||
// De-interleaves a 3-channel float row: out<k>[i] = in[3*i + k] for k = 0..2
// and i in [0, length).
CV_ALWAYS_INLINE void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
                                          float out2[], int length) {
    int pos = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_float32::nlanes;

    if (length >= nlanes) {
        for (;;) {
            for (; pos <= length - nlanes; pos += nlanes) {
                v_float32 c0, c1, c2;
                v_load_deinterleave(&in[3*pos], c0, c1, c2);
                vx_store(&out0[pos], c0);
                vx_store(&out1[pos], c1);
                vx_store(&out2[pos], c2);
            }

            if (pos >= length) {
                break;
            }

            // A tail shorter than one vector remains: step back so the last
            // full vector ends exactly at `length` and run the loop once more.
            pos = length - nlanes;
        }
    }
#endif

    // Scalar path: whole row without SIMD, or rows shorter than one vector.
    for (; pos < length; ++pos) {
        out0[pos] = in[3*pos + 0];
        out1[pos] = in[3*pos + 1];
        out2[pos] = in[3*pos + 2];
    }
}
|
||||
|
||||
// De-interleaves a 4-channel float row: out<k>[i] = in[4*i + k] for k = 0..3
// and i in [0, length).
CV_ALWAYS_INLINE void splitRow_32FC4_Impl(const float in[], float out0[], float out1[],
                                          float out2[], float out3[], int length) {
    int pos = 0;

#if MANUAL_SIMD
    constexpr int nlanes = v_float32::nlanes;

    if (length >= nlanes) {
        for (;;) {
            for (; pos <= length - nlanes; pos += nlanes) {
                v_float32 c0, c1, c2, c3;
                v_load_deinterleave(&in[4*pos], c0, c1, c2, c3);
                vx_store(&out0[pos], c0);
                vx_store(&out1[pos], c1);
                vx_store(&out2[pos], c2);
                vx_store(&out3[pos], c3);
            }

            if (pos >= length) {
                break;
            }

            // A tail shorter than one vector remains: step back so the last
            // full vector ends exactly at `length` and run the loop once more.
            pos = length - nlanes;
        }
    }
#endif

    // Scalar path: whole row without SIMD, or rows shorter than one vector.
    for (; pos < length; ++pos) {
        out0[pos] = in[4*pos + 0];
        out1[pos] = in[4*pos + 1];
        out2[pos] = in[4*pos + 2];
        out3[pos] = in[4*pos + 3];
    }
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
CV_ALWAYS_INLINE void uvToRGBuv(const v_uint8& u, const v_uint8& v,
|
||||
@@ -880,6 +724,38 @@ CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan,
|
||||
out[x] = in[x*chs + chan];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename isa_tag_t, typename T, int chs>
|
||||
CV_ALWAYS_INLINE void splitRowImpl(isa_tag_t, const T* in, std::array<T*, chs>& outs, const int length) {
|
||||
static_assert(chs > 1 && chs < 5, "This number of channels isn't supported.");
|
||||
|
||||
if (chs == 2) {
|
||||
splitRowC2_Impl<vector_type_of_t<isa_tag_t, T>, T>(in, outs[0], outs[1], length);
|
||||
return;
|
||||
} else if (chs == 3) {
|
||||
splitRowC3_Impl<vector_type_of_t<isa_tag_t, T>, T>(in, outs[0], outs[1], outs[2], length);
|
||||
return;
|
||||
} else {
|
||||
splitRowC4_Impl<vector_type_of_t<isa_tag_t, T>, T>(in, outs[0], outs[1], outs[2], outs[3], length);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename isa_tag_t, typename T, int chs>
|
||||
CV_ALWAYS_INLINE void mergeRowImpl(isa_tag_t, const std::array<const T*, chs>& ins, T* out, const int length) {
|
||||
static_assert(chs > 1 && chs < 5, "This number of channels isn't supported.");
|
||||
|
||||
if (chs == 2) {
|
||||
mergeRowC2_Impl<vector_type_of_t<isa_tag_t, T>, T>(ins[0], ins[1], out, length);
|
||||
return;
|
||||
} else if (chs == 3) {
|
||||
mergeRowC3_Impl<vector_type_of_t<isa_tag_t, T>, T>(ins[0], ins[1], ins[2], out, length);
|
||||
return;
|
||||
} else {
|
||||
mergeRowC4_Impl<vector_type_of_t<isa_tag_t, T>, T>(ins[0], ins[1], ins[2], ins[3], out, length);
|
||||
return;
|
||||
}
|
||||
}
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
||||
@@ -98,10 +98,30 @@ cv::String typeToString(int type)
|
||||
case CV_8UC2 : return "CV_8UC2";
|
||||
case CV_8UC3 : return "CV_8UC3";
|
||||
case CV_8UC4 : return "CV_8UC4";
|
||||
case CV_16FC1 : return "CV_16FC1";
|
||||
case CV_16FC2 : return "CV_16FC2";
|
||||
case CV_16FC3 : return "CV_16FC3";
|
||||
case CV_16FC4 : return "CV_16FC4";
|
||||
case CV_32FC1 : return "CV_32FC1";
|
||||
case CV_32FC2 : return "CV_32FC2";
|
||||
case CV_32FC3 : return "CV_32FC3";
|
||||
case CV_32FC4 : return "CV_32FC4";
|
||||
case CV_8SC1 : return "CV_8SC1";
|
||||
case CV_8SC2 : return "CV_8SC2";
|
||||
case CV_8SC3 : return "CV_8SC3";
|
||||
case CV_8SC4 : return "CV_8SC4";
|
||||
case CV_16SC1 : return "CV_16SC1";
|
||||
case CV_16SC2 : return "CV_16SC2";
|
||||
case CV_16SC3 : return "CV_16SC3";
|
||||
case CV_16SC4 : return "CV_16SC4";
|
||||
case CV_16UC1 : return "CV_16UC1";
|
||||
case CV_16UC2 : return "CV_16UC2";
|
||||
case CV_16UC3 : return "CV_16UC3";
|
||||
case CV_16UC4 : return "CV_16UC4";
|
||||
case CV_32SC1 : return "CV_32SC1";
|
||||
case CV_32SC2 : return "CV_32SC2";
|
||||
case CV_32SC3 : return "CV_32SC3";
|
||||
case CV_32SC4 : return "CV_32SC4";
|
||||
}
|
||||
CV_Assert(!"ERROR: unsupported type!");
|
||||
return nullptr;
|
||||
|
||||
Reference in New Issue
Block a user