Pre-processing(GAPI): ARM(NEON) integration + Split, Merge, Color conversion kernels on NEON (#1315)

This commit is contained in:
Anna Khakimova
2020-07-21 14:19:15 +03:00
committed by GitHub
parent 14d371849d
commit eecd03aa85
6 changed files with 373 additions and 39 deletions

View File

@@ -0,0 +1,192 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ie_preprocess_gapi_kernels.hpp"
#include "ie_preprocess_gapi_kernels_impl.hpp"
#include <type_traits>
namespace InferenceEngine {
namespace gapi {
namespace kernels {
namespace neon {
using C3 = std::integral_constant<int, 3>;
using C4 = std::integral_constant<int, 4>;
//-----------------------------------------------------------------------------
// MapperUnit instantiations taken as the `ymap` argument by the area row
// kernels declared below: MapperUnit32F by calcRowArea_32F and MapperUnit8U
// by calcRowArea_8U. Written as alias-declarations to match C3/C4 above.
using MapperUnit32F = MapperUnit<float, int>;
using MapperUnit8U = MapperUnit<Q0_16, short>;
// NEON "area" row kernel for 8-bit data (declaration only; the definition is
// not part of this header).
// The vertical/horizontal coefficients use the Q0_16 type and the scratch
// buffer `vbuf` uses the Q8_8 type, both declared elsewhere.
// NOTE(review): presumably implements one output row of INTER_AREA resize;
// the exact meaning of xmaxdf/xindex/xalpha is not visible here — confirm
// against the kernel implementation.
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size &inSz, const Size &outSz,
Q0_16 yalpha, const MapperUnit8U& ymap, int xmaxdf, const short xindex[],
const Q0_16 xalpha[], Q8_8 vbuf[]);
// NEON "area" row kernel for 32-bit float data (declaration only; the
// definition is not part of this header).
// Mirrors calcRowArea_8U with float coefficients/scratch and int indices.
// NOTE(review): presumably implements one output row of INTER_AREA resize;
// the exact meaning of xmaxdf/xindex/xalpha is not visible here — confirm
// against the kernel implementation.
void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Size &outSz,
float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[],
const float xalpha[], float vbuf[]);
// Resize (bi-linear, 8U)
// Untagged overload: `dst` is a plain array of uint8_t row pointers, unlike
// the C3/C4 tag overloads below, which take a std::array of row pointers per
// channel.
// NOTE(review): `lpi` is presumably the number of output lines produced per
// call and `tmp` a caller-provided scratch buffer — confirm against the
// kernel implementation.
void calcRowLinear_8U(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
// Resize (bi-linear, 8UC3)
// Overload selected by the C3 tag (std::integral_constant<int, 3>); this is
// the overload calcRowLinear_8UC<3> forwards to.
// `dst` holds 3 entries of 4 uint8_t pointers each.
// NOTE(review): presumably 3 channels x up to 4 output rows — confirm.
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
// Resize (bi-linear, 8UC4)
// Overload selected by the C4 tag (std::integral_constant<int, 4>); this is
// the overload calcRowLinear_8UC<4> forwards to.
// `dst` holds 4 entries of 4 uint8_t pointers each.
// NOTE(review): presumably 4 channels x up to 4 output rows — confirm.
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
// Channel-count front end for the multi-channel bi-linear 8U resize.
// Converts the compile-time channel count `numChan` into a
// std::integral_constant tag and forwards every argument unchanged, so that
// overload resolution picks the calcRowLinear_8U overload declared for that
// tag (C3 and C4 overloads are declared in this header).
template<int numChan>
void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
                       const uint8_t *src0[],
                       const uint8_t *src1[],
                       const short alpha[],
                       const short clone[],
                       const short mapsx[],
                       const short beta[],
                       uint8_t tmp[],
                       const Size& inSz,
                       const Size& outSz,
                       int lpi) {
    // Tag type identifying the channel count at overload-resolution time.
    using ChannelTag = std::integral_constant<int, numChan>;

    calcRowLinear_8U(ChannelTag(),
                     dst,
                     src0, src1,
                     alpha, clone, mapsx, beta,
                     tmp,
                     inSz, outSz,
                     lpi);
}
// Resize (bi-linear, 32F)
// Float counterpart of the 8U kernel above: takes float alpha/beta
// coefficients and int `mapsx` indices, and has no `clone`/`tmp` arguments.
// NOTE(review): `lpi` is presumably the number of output lines produced per
// call — confirm against the kernel implementation.
void calcRowLinear_32F( float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi);
//----------------------------------------------------------------------
// NEON merge kernel, 2 x uint8_t channels.
// In HAVE_NEON builds the generic mergeRow() calls this when T is uint8_t and
// chs == 2, passing ins[0], ins[1], out and length through unchanged.
// NOTE(review): presumably interleaves in0/in1 into `out` for `length`
// pixels — confirm against the kernel implementation.
void mergeRow_8UC2(const uint8_t in0[],
const uint8_t in1[],
uint8_t out[],
int length);
// NEON merge kernel, 3 x uint8_t channels.
// In HAVE_NEON builds the generic mergeRow() calls this when T is uint8_t and
// chs == 3, passing ins[0..2], out and length through unchanged.
// NOTE(review): presumably interleaves the inputs into `out` for `length`
// pixels — confirm against the kernel implementation.
void mergeRow_8UC3(const uint8_t in0[],
const uint8_t in1[],
const uint8_t in2[],
uint8_t out[],
int length);
// NEON merge kernel, 4 x uint8_t channels.
// In HAVE_NEON builds the generic mergeRow() calls this when T is uint8_t and
// chs == 4, passing ins[0..3], out and length through unchanged.
// NOTE(review): presumably interleaves the inputs into `out` for `length`
// pixels — confirm against the kernel implementation.
void mergeRow_8UC4(const uint8_t in0[],
const uint8_t in1[],
const uint8_t in2[],
const uint8_t in3[],
uint8_t out[],
int length);
// NEON merge kernel, 2 x float channels.
// In HAVE_NEON builds the generic mergeRow() calls this when T is float and
// chs == 2, after reinterpret_cast-ing its uint8_t pointers to float pointers.
// NOTE(review): presumably interleaves in0/in1 into `out` for `length`
// pixels — confirm against the kernel implementation.
void mergeRow_32FC2(const float in0[],
const float in1[],
float out[],
int length);
// NEON merge kernel, 3 x float channels.
// In HAVE_NEON builds the generic mergeRow() calls this when T is float and
// chs == 3, after reinterpret_cast-ing its uint8_t pointers to float pointers.
// NOTE(review): presumably interleaves the inputs into `out` for `length`
// pixels — confirm against the kernel implementation.
void mergeRow_32FC3(const float in0[],
const float in1[],
const float in2[],
float out[],
int length);
// NEON merge kernel, 4 x float channels.
// In HAVE_NEON builds the generic mergeRow() calls this when T is float and
// chs == 4, after reinterpret_cast-ing its uint8_t pointers to float pointers.
// NOTE(review): presumably interleaves the inputs into `out` for `length`
// pixels — confirm against the kernel implementation.
void mergeRow_32FC4(const float in0[],
const float in1[],
const float in2[],
const float in3[],
float out[],
int length);
// NEON split kernel, 2 x uint8_t channels.
// In HAVE_NEON builds the generic splitRow() calls this when T is uint8_t and
// chs == 2, passing in, outs[0], outs[1] and length through unchanged.
// NOTE(review): presumably de-interleaves `in` into out0/out1 for `length`
// pixels — confirm against the kernel implementation.
void splitRow_8UC2(const uint8_t in[],
uint8_t out0[],
uint8_t out1[],
int length);
// NEON split kernel, 3 x uint8_t channels.
// In HAVE_NEON builds the generic splitRow() calls this when T is uint8_t and
// chs == 3, passing in, outs[0..2] and length through unchanged.
// NOTE(review): presumably de-interleaves `in` into the outputs for `length`
// pixels — confirm against the kernel implementation.
void splitRow_8UC3(const uint8_t in[],
uint8_t out0[],
uint8_t out1[],
uint8_t out2[],
int length);
// NEON split kernel, 4 x uint8_t channels.
// In HAVE_NEON builds the generic splitRow() calls this when T is uint8_t and
// chs == 4, passing in, outs[0..3] and length through unchanged.
// NOTE(review): presumably de-interleaves `in` into the outputs for `length`
// pixels — confirm against the kernel implementation.
void splitRow_8UC4(const uint8_t in[],
uint8_t out0[],
uint8_t out1[],
uint8_t out2[],
uint8_t out3[],
int length);
// NEON split kernel, 2 x float channels.
// In HAVE_NEON builds the generic splitRow() calls this when T is float and
// chs == 2, after reinterpret_cast-ing its uint8_t pointers to float pointers.
// NOTE(review): presumably de-interleaves `in` into out0/out1 for `length`
// pixels — confirm against the kernel implementation.
void splitRow_32FC2(const float in[],
float out0[],
float out1[],
int length);
// NEON split kernel, 3 x float channels.
// In HAVE_NEON builds the generic splitRow() calls this when T is float and
// chs == 3, after reinterpret_cast-ing its uint8_t pointers to float pointers.
// NOTE(review): presumably de-interleaves `in` into the outputs for `length`
// pixels — confirm against the kernel implementation.
void splitRow_32FC3(const float in[],
float out0[],
float out1[],
float out2[],
int length);
// NEON split kernel, 4 x float channels.
// In HAVE_NEON builds the generic splitRow() calls this when T is float and
// chs == 4, after reinterpret_cast-ing its uint8_t pointers to float pointers.
// NOTE(review): presumably de-interleaves `in` into the outputs for `length`
// pixels — confirm against the kernel implementation.
void splitRow_32FC4(const float in[],
float out0[],
float out1[],
float out2[],
float out3[],
int length);
// NEON NV12 -> RGB conversion kernel.
// In HAVE_NEON builds the FNV12toRGB fluid kernel calls this with
// (y_rows, uv_row, out_rows, buf_width) instead of the scalar fallback.
// NOTE(review): srcY/dstRGBx are arrays of row pointers; how many rows are
// consumed per call and the layout of srcUV are not visible here — confirm
// against the kernel implementation.
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width);
// NEON I420 -> RGB conversion kernel.
// In HAVE_NEON builds the FI420toRGB fluid kernel calls this with
// (y_rows, u_row, v_row, out_rows, buf_width) instead of the scalar fallback;
// buf_width there is the output view's length().
// NOTE(review): srcY/dstRGBx are arrays of row pointers; how many rows are
// consumed per call is not visible here — confirm against the kernel
// implementation.
void calculate_i420_to_rgb(const uchar **srcY,
const uchar *srcU,
const uchar *srcV,
uchar **dstRGBx,
int width);
// NEON row copy for uint8_t data.
// In HAVE_NEON builds chanToPlaneRow() calls this when T is uint8_t and
// chs == 1, passing in, out and length through unchanged.
// NOTE(review): presumably copies `length` elements from `in` to `out` —
// confirm against the kernel implementation.
void copyRow_8U(const uint8_t in[],
uint8_t out[],
int length);
// NEON row copy for float data.
// In HAVE_NEON builds chanToPlaneRow() calls this when T is float and
// chs == 1, after reinterpret_cast-ing its uint8_t pointers to float pointers.
// NOTE(review): presumably copies `length` elements from `in` to `out` —
// confirm against the kernel implementation.
void copyRow_32F(const float in[],
float out[],
int length);
} // namespace neon
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@@ -22,6 +22,10 @@
#endif
#ifdef HAVE_NEON
#include "arm_neon/ie_preprocess_gapi_kernels_neon.hpp"
#endif
#include <opencv2/gapi/opencv_includes.hpp>
#include <opencv2/gapi/fluid/gfluidkernel.hpp>
#include <opencv2/gapi/gcompoundkernel.hpp>
@@ -174,6 +178,47 @@ void mergeRow(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int leng
}
#endif // HAVE_SSE
#ifdef HAVE_NEON
if (std::is_same<T, uint8_t>::value && chs == 2) {
neon::mergeRow_8UC2(ins[0], ins[1], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
neon::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
neon::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
neon::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
neon::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<float*>(out), length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
neon::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
reinterpret_cast<const float*>(ins[1]),
reinterpret_cast<const float*>(ins[2]),
reinterpret_cast<const float*>(ins[3]),
reinterpret_cast<float*>(out), length);
return;
}
#endif // HAVE_NEON
const T* insT[chs];
for (int c = 0; c < chs; c++) {
insT[c] = reinterpret_cast<const T*>(ins[c]);
@@ -328,6 +373,50 @@ void splitRow(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length) {
}
#endif // HAVE_SSE
#ifdef HAVE_NEON
if (std::is_same<T, uint8_t>::value && chs == 2) {
neon::splitRow_8UC2(in, outs[0], outs[1], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 3) {
neon::splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
return;
}
if (std::is_same<T, uint8_t>::value && chs == 4) {
neon::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
return;
}
if (std::is_same<T, float>::value && chs == 2) {
neon::splitRow_32FC2(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 3) {
neon::splitRow_32FC3(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
length);
return;
}
if (std::is_same<T, float>::value && chs == 4) {
neon::splitRow_32FC4(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(outs[0]),
reinterpret_cast<float*>(outs[1]),
reinterpret_cast<float*>(outs[2]),
reinterpret_cast<float*>(outs[3]),
length);
return;
}
#endif // HAVE_NEON
auto inT = reinterpret_cast<const T*>(in);
T* outsT[chs];
@@ -484,6 +573,7 @@ static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, i
}
#endif // HAVE_AVX512
#endif
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
if (std::is_same<T, uint8_t>::value && chs == 1) {
@@ -515,6 +605,20 @@ static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, i
}
#endif // HAVE_SSE
#ifdef HAVE_NEON
if (std::is_same<T, uint8_t>::value && chs == 1) {
neon::copyRow_8U(in, out, length);
return;
}
if (std::is_same<T, float>::value && chs == 1) {
neon::copyRow_32F(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(out),
length);
return;
}
#endif // HAVE_NEON
const auto inT = reinterpret_cast<const T*>(in);
auto outT = reinterpret_cast< T*>(out);
@@ -831,14 +935,14 @@ static void calcRowLinear(const cv::gapi::fluid::View & in,
if (std::is_same<T, uint8_t>::value) {
if (inSz.width >= 16 && outSz.width >= 8) {
calcRowLinear_8UC1(reinterpret_cast<uint8_t**>(dst),
reinterpret_cast<const uint8_t**>(src0),
reinterpret_cast<const uint8_t**>(src1),
reinterpret_cast<const short*>(alpha),
reinterpret_cast<const short*>(clone),
reinterpret_cast<const short*>(mapsx),
reinterpret_cast<const short*>(beta),
reinterpret_cast<uint8_t*>(tmp),
inSz, outSz, lpi);
reinterpret_cast<const uint8_t**>(src0),
reinterpret_cast<const uint8_t**>(src1),
reinterpret_cast<const short*>(alpha),
reinterpret_cast<const short*>(clone),
reinterpret_cast<const short*>(mapsx),
reinterpret_cast<const short*>(beta),
reinterpret_cast<uint8_t*>(tmp),
inSz, outSz, lpi);
return;
}
}
@@ -2011,6 +2115,7 @@ GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) {
}
#endif // HAVE_AVX512
#endif
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
avx::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
@@ -2024,6 +2129,11 @@ GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) {
}
#endif // HAVE_SSE
#ifdef HAVE_NEON
neon::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
return;
#endif // HAVE_NEON
calculate_nv12_to_rgb_fallback(y_rows, uv_row, out_rows, buf_width);
}
};
@@ -2045,29 +2155,35 @@ GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) {
int buf_width = out.length();
GAPI_DbgAssert(in_u.length() == in_v.length());
// AVX512 implementation of wide universal intrinsics is slower than AVX2.
// It is turned off until the cause is found.
#if 0
#ifdef HAVE_AVX512
if (with_cpu_x86_avx512_core()) {
#define CV_AVX_512DQ 1
avx512::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
}
#endif // HAVE_AVX512
#endif
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
avx::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
}
#endif // HAVE_AVX2
#ifdef HAVE_SSE
if (with_cpu_x86_sse42()) {
calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
}
#endif // HAVE_SSE
// AVX512 implementation of wide universal intrinsics is slower than AVX2.
// It is turned off until the cause is found.
#if 0
#ifdef HAVE_AVX512
if (with_cpu_x86_avx512_core()) {
#define CV_AVX_512DQ 1
avx512::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
}
#endif // HAVE_AVX512
#endif
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
avx::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
}
#endif // HAVE_AVX2
#ifdef HAVE_SSE
if (with_cpu_x86_sse42()) {
calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
}
#endif // HAVE_SSE
#ifdef HAVE_NEON
neon::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
#endif // HAVE_NEON
calculate_i420_to_rgb_fallback(y_rows, u_row, v_row, out_rows, buf_width);
}

View File

@@ -1058,7 +1058,11 @@ TEST_P(PreprocTest, Performance)
std::tie(in_size, out_size) = sizes;
int in_ocv_chan = -1, out_ocv_chan = -1;
std::tie(in_ocv_chan, out_ocv_chan) = ocv_channels;
#if defined(__arm__) || defined(__aarch64__)
double tolerance = Precision::U8 ? 4 : 0.015;
#else
double tolerance = Precision::U8 ? 1 : 0.015;
#endif
const int ocv_depth = prec == Precision::U8 ? CV_8U :
prec == Precision::FP32 ? CV_32F : -1;

View File

@@ -97,24 +97,38 @@
std::make_pair(cv::Size( 96, 256), cv::Size( 128, 384))
using namespace testing;
#if defined(__arm__) || defined(__aarch64__)
INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI,
Combine(Values(CV_8UC1, CV_8UC3),
Values(cv::INTER_LINEAR, cv::INTER_AREA),
Values(TEST_RESIZE_PAIRS),
Values(4))); // error not more than 4 unit
INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI,
Combine(Values(CV_8UC3, CV_8UC4),
Values(cv::INTER_LINEAR),
Values(TEST_RESIZE_PAIRS),
Values(4))); // error not more than 4 unit
#else
INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI,
Combine(Values(CV_8UC1, CV_8UC3),
Values(cv::INTER_LINEAR, cv::INTER_AREA),
Values(TEST_RESIZE_PAIRS),
Values(1))); // error not more than 1 unit
INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI,
Combine(Values(CV_8UC3, CV_8UC4),
Values(cv::INTER_LINEAR),
Values(TEST_RESIZE_PAIRS),
Values(1))); // error not more than 1 unit
#endif
INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestGAPI,
Combine(Values(CV_32FC1, CV_32FC3),
Values(cv::INTER_LINEAR, cv::INTER_AREA),
Values(TEST_RESIZE_PAIRS),
Values(0.015))); // accuracy like ~1.5%
INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI,
Combine(Values(CV_8UC3, CV_8UC4),
Values(cv::INTER_LINEAR),
Values(TEST_RESIZE_PAIRS),
Values(1))); // error not more than 1 unit
INSTANTIATE_TEST_CASE_P(SplitTestFluid, SplitTestGAPI,
Combine(Values(2, 3, 4),
@@ -179,11 +193,19 @@ INSTANTIATE_TEST_CASE_P(ResizeRGB8URoiTestFluid, ResizeRGB8URoiTestGAPI,
//----------------------------------------------------------------------
#if defined(__arm__) || defined(__aarch64__)
INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestIE,
Combine(Values(CV_8UC1, CV_8UC3),
Values(cv::INTER_LINEAR, cv::INTER_AREA),
Values(TEST_RESIZE_PAIRS),
Values(4))); // error not more than 4 unit
#else
INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestIE,
Combine(Values(CV_8UC1, CV_8UC3),
Values(cv::INTER_LINEAR, cv::INTER_AREA),
Values(TEST_RESIZE_PAIRS),
Values(1))); // error not more than 1 unit
#endif
INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestIE,
Combine(Values(CV_32FC1, CV_32FC3),

View File

@@ -268,7 +268,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
# undef CV_MSA
#endif*/
#if CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD
#if CV_SSE2 || CV_NEON
#define CV__SIMD_FORWARD 128
#include "opencv_hal_intrin_forward.hpp"
#endif

View File

@@ -511,7 +511,7 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
}
#endif
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
static inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& a)
{
@@ -2224,7 +2224,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
#endif
////// FP16 support ///////
// Currently disabled
// Unsupported. Currently disabled.
#if 0
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)