* SIMD opt for the Resize 32F1C (#5025)

This commit is contained in:
Anna Khakimova 2021-04-05 19:00:25 +03:00 committed by GitHub
parent e2ada66826
commit 26801c14e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 88 additions and 16 deletions

View File

@ -127,6 +127,19 @@ void copyRow_32F(const float in[], float out[], int length) {
copyRow_32F_impl(in, out, length);
}
// Resize (bi-linear, 32F)
//
// Out-of-line entry point for the single-channel float row resize.
// It performs no work of its own: every argument is passed, unchanged and
// in the same order, to the calcRowLinear_32FC1 implementation.
void calcRowLinear_32F(float*       dst[],
                       const float* src0[],
                       const float* src1[],
                       const float  alpha[],
                       const int    mapsx[],
                       const float  beta[],
                       const Size&  inSz,
                       const Size&  outSz,
                       const int    lpi) {
    calcRowLinear_32FC1(dst,
                        src0, src1,
                        alpha, mapsx, beta,
                        inSz, outSz,
                        lpi);
}
template<int chanNum>
CV_ALWAYS_INLINE void channels2planes_store(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* src, const int width,

View File

@ -1120,6 +1120,17 @@ static void calcRowLinear(const cv::gapi::fluid::View & in,
return;
}
}
if (std::is_same<T, float>::value) {
neon::calcRowLinear_32F(reinterpret_cast<float**>(dst),
reinterpret_cast<const float**>(src0),
reinterpret_cast<const float**>(src1),
reinterpret_cast<const float*>(alpha),
reinterpret_cast<const int*>(mapsx),
reinterpret_cast<const float*>(beta),
inSz, outSz, lpi);
return;
}
#endif
for (int l = 0; l < lpi; l++) {

View File

@ -788,19 +788,19 @@ inline void copyRow_32F_impl(const float in[], float out[], int length) {
}
// Resize (bi-linear, 32FC1)
static inline void calcRowLinear_32FC1(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi) {
CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi) {
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
#if CPU_SIMD
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
#endif
@ -811,19 +811,19 @@ static inline void calcRowLinear_32FC1(float *dst[],
int x = 0;
#if CPU_SIMD
#if MANUAL_SIMD
v_float32 low1, high1, s00, s01;
v_float32 low2, high2, s10, s11;
for (; x <= outSz.width - nlanes; x += nlanes) {
v_float32 alpha0 = vx_load(&alpha[x]);
// v_float32 alpha1 = 1.f - alpha0;
v_float32 low1, high1, s00, s01;
v_gather_pairs(src0[line], mapsx, x, low1, high1);
v_deinterleave(low1, high1, s00, s01);
// v_float32 res0 = s00*alpha0 + s01*alpha1;
v_float32 res0 = v_fma(s00 - s01, alpha0, s01);
v_float32 low2, high2, s10, s11;
v_gather_pairs(src1[line], mapsx, x, low2, high2);
v_deinterleave(low2, high2, s10, s11);
@ -854,12 +854,12 @@ static inline void calcRowLinear_32FC1(float *dst[],
for (int line = 0; line < lpi; ++line) {
int x = 0;
#if CPU_SIMD
#if MANUAL_SIMD
v_float32 low, high, s00, s01;
for (; x <= outSz.width - nlanes; x += nlanes) {
v_float32 alpha0 = vx_load(&alpha[x]);
// v_float32 alpha1 = 1.f - alpha0;
v_float32 low, high, s00, s01;
v_gather_pairs(src0[line], mapsx, x, low, high);
v_deinterleave(low, high, s00, s01);
@ -889,7 +889,7 @@ static inline void calcRowLinear_32FC1(float *dst[],
int x = 0;
#if CPU_SIMD
#if MANUAL_SIMD
for (; x <= length - nlanes; x += nlanes) {
v_float32 s0 = vx_load(&src0[line][x]);
v_float32 s1 = vx_load(&src1[line][x]);

View File

@ -2426,6 +2426,42 @@ CV_ALWAYS_INLINE v_uint8x16 v_gather_lines(const uchar src[], const short* mapsx
return v_uint8x16(vreinterpretq_u8_s32(result));
}
// Gathers four pairs of adjacent floats from `src` into two NEON registers.
//
// With mi = mapsx[x + i], the result is:
//   low  = { src[m0], src[m0 + 1], src[m1], src[m1 + 1] }
//   high = { src[m2], src[m2 + 1], src[m3], src[m3 + 1] }
//
// Parameters:
//   src   - source row of floats; src[mi] and src[mi + 1] must be readable.
//   mapsx - index table; entries x .. x + 3 are used.
//   x     - position of the first index to use in `mapsx`.
//   low   - receives pairs 0 and 1.
//   high  - receives pairs 2 and 3.
//
// Each pair is fetched with vld1_f32 (a 2 x float load) rather than by
// dereferencing the float storage through a `const double*`: that cast breaks
// the strict-aliasing rule and assumes 8-byte alignment that a float array
// does not guarantee. The same code now serves both AArch64 and 32-bit ARM.
CV_ALWAYS_INLINE void v_gather_pairs(const float src[], const int mapsx[], const int x,
                                     v_float32x4& low, v_float32x4& high)
{
    const float32x2_t pair0 = vld1_f32(&src[mapsx[x]]);
    const float32x2_t pair1 = vld1_f32(&src[mapsx[x + 1]]);
    const float32x2_t pair2 = vld1_f32(&src[mapsx[x + 2]]);
    const float32x2_t pair3 = vld1_f32(&src[mapsx[x + 3]]);

    // vcombine_f32 places its first argument in lanes 0-1 and its second in lanes 2-3.
    low.val  = vcombine_f32(pair0, pair1);
    high.val = vcombine_f32(pair2, pair3);
}
// Fused multiply-add overload taking a scalar multiplier.
// The scalar `b` is broadcast with v_setall_f32 and the call is then
// delegated to the all-vector v_fma(a, b, c) overload.
CV_ALWAYS_INLINE v_float32x4 v_fma(const v_float32x4& a, float b, const v_float32x4& c) {
    const v_float32x4 multiplier = v_setall_f32(b);
    return v_fma(a, multiplier, c);
}
template<int imm>
CV_ALWAYS_INLINE v_uint8x16 v_blend(const v_uint8x16& a, const v_uint8x16& b)
{
@ -2473,6 +2509,18 @@ CV_ALWAYS_INLINE v_uint8x16 v_shuffle(const v_uint8x16& a, const v_uint8x16& mas
#endif
}
// Splits the eight floats held in (low, high) into even- and odd-indexed lanes:
//   even = { low[0], low[2], high[0], high[2] }
//   odd  = { low[1], low[3], high[1], high[3] }
// Implemented as two consecutive zips: the first interleaves low with high,
// the second interleaves the two halves of that result, which leaves the
// even-indexed source lanes in one register and the odd-indexed in the other.
CV_ALWAYS_INLINE void v_deinterleave(const v_float32x4& low, const v_float32x4& high,
                                     v_float32x4& even, v_float32x4& odd) {
    // { l0, h0, l1, h1 } and { l2, h2, l3, h3 }
    const float32x4x2_t firstZip = vzipq_f32(low.val, high.val);

    // { l0, l2, h0, h2 } and { l1, l3, h1, h3 }
    const float32x4x2_t secondZip = vzipq_f32(firstZip.val[0], firstZip.val[1]);

    even.val = secondZip.val[0];
    odd.val  = secondZip.val[1];
}
CV_ALWAYS_INLINE void v_deinterleave(const v_uint8x16& i0, const v_uint8x16& i1,
const v_uint8x16& i2, const v_uint8x16& i3,
v_uint8x16& res0, v_uint8x16& res1,