* SIMD opt for the Resize 32F1C (#5025)

2021-04-05 19:00:25 +03:00 · 2021-04-05 19:00:25 +03:00 · 26801c14e9
commit 26801c14e9
parent e2ada66826
4 changed files with 88 additions and 16 deletions
--- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
+++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
@ -127,6 +127,19 @@ void copyRow_32F(const float in[], float out[], int length) {
    copyRow_32F_impl(in, out, length);
 }

+// Resize (bi-linear, 32F).
+// NEON-facing entry point: hands every argument, unchanged and in the same
+// order, to calcRowLinear_32FC1 and does no work of its own.
+void calcRowLinear_32F(float* dst[],
+                       const float* src0[],
+                       const float* src1[],
+                       const float  alpha[],
+                       const int    mapsx[],
+                       const float  beta[],
+                       const Size& inSz,
+                       const Size& outSz,
+                       const int   lpi) {
+    calcRowLinear_32FC1(dst,
+                        src0, src1,
+                        alpha, mapsx, beta,
+                        inSz, outSz,
+                        lpi);
+}
+
 template<int chanNum>
 CV_ALWAYS_INLINE void channels2planes_store(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
                                            const uchar* src, const int width,
--- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
+++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
@ -1120,6 +1120,17 @@ static void calcRowLinear(const cv::gapi::fluid::View  & in,
            return;
        }
    }
+
+    if (std::is_same<T, float>::value) {
+        neon::calcRowLinear_32F(reinterpret_cast<float**>(dst),
+                                reinterpret_cast<const float**>(src0),
+                                reinterpret_cast<const float**>(src1),
+                                reinterpret_cast<const float*>(alpha),
+                                reinterpret_cast<const int*>(mapsx),
+                                reinterpret_cast<const float*>(beta),
+                                inSz, outSz, lpi);
+        return;
+    }
 #endif

    for (int l = 0; l < lpi; l++) {
--- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp
+++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp
@ -788,19 +788,19 @@ inline void copyRow_32F_impl(const float in[], float out[], int length) {
 }

 // Resize (bi-linear, 32FC1)
-static inline void calcRowLinear_32FC1(float *dst[],
-                                       const float *src0[],
-                                       const float *src1[],
-                                       const float  alpha[],
-                                       const int    mapsx[],
-                                       const float  beta[],
-                                       const Size& inSz,
-                                       const Size& outSz,
-                                               int lpi) {
+CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[],
+                                          const float *src0[],
+                                          const float *src1[],
+                                          const float  alpha[],
+                                          const int    mapsx[],
+                                          const float  beta[],
+                                          const Size& inSz,
+                                          const Size& outSz,
+                                          const int   lpi) {
    bool xRatioEq1 = inSz.width == outSz.width;
    bool yRatioEq1 = inSz.height == outSz.height;

-#if CPU_SIMD
+#if MANUAL_SIMD
    const int nlanes = v_float32::nlanes;
 #endif

@ -811,19 +811,19 @@ static inline void calcRowLinear_32FC1(float *dst[],

            int x = 0;

-#if CPU_SIMD
+#if MANUAL_SIMD
+            v_float32 low1, high1, s00, s01;
+            v_float32 low2, high2, s10, s11;
            for (; x <= outSz.width - nlanes; x += nlanes) {
                v_float32 alpha0 = vx_load(&alpha[x]);
                //  v_float32 alpha1 = 1.f - alpha0;

-                v_float32 low1, high1, s00, s01;
                v_gather_pairs(src0[line], mapsx, x, low1, high1);
                v_deinterleave(low1, high1, s00, s01);

                //  v_float32 res0 = s00*alpha0 + s01*alpha1;
                v_float32 res0 = v_fma(s00 - s01, alpha0, s01);

-                v_float32 low2, high2, s10, s11;
                v_gather_pairs(src1[line], mapsx, x, low2, high2);
                v_deinterleave(low2, high2, s10, s11);

@ -854,12 +854,12 @@ static inline void calcRowLinear_32FC1(float *dst[],
        for (int line = 0; line < lpi; ++line) {
            int x = 0;

-#if CPU_SIMD
+#if MANUAL_SIMD
+            v_float32 low, high, s00, s01;
            for (; x <= outSz.width - nlanes; x += nlanes) {
                v_float32 alpha0 = vx_load(&alpha[x]);
                //  v_float32 alpha1 = 1.f - alpha0;

-                v_float32 low, high, s00, s01;
                v_gather_pairs(src0[line], mapsx, x, low, high);
                v_deinterleave(low, high, s00, s01);

@ -889,7 +889,7 @@ static inline void calcRowLinear_32FC1(float *dst[],

            int x = 0;

-#if CPU_SIMD
+#if MANUAL_SIMD
            for (; x <= length - nlanes; x += nlanes) {
                v_float32 s0 = vx_load(&src0[line][x]);
                v_float32 s1 = vx_load(&src1[line][x]);
--- a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
+++ b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
@ -2426,6 +2426,42 @@ CV_ALWAYS_INLINE v_uint8x16 v_gather_lines(const uchar src[], const short* mapsx
    return v_uint8x16(vreinterpretq_u8_s32(result));
 }

+// Gathers four pairs of adjacent floats from src.
+//
+// For i = 0..3 the pair { src[mapsx[x + i]], src[mapsx[x + i] + 1] } is read.
+// Resulting lane layout:
+//   low  = { pair0.first, pair0.second, pair1.first, pair1.second }
+//   high = { pair2.first, pair2.second, pair3.first, pair3.second }
+//
+// The caller must guarantee that src[mapsx[x + i] + 1] is readable for every i.
+CV_ALWAYS_INLINE void v_gather_pairs(const float src[], const int mapsx[], const int x,
+                                     v_float32x4& low, v_float32x4& high)
+{
+    // vld1_f32 loads two consecutive floats through a float pointer, so the
+    // gather neither aliases the data as double nor requires 8-byte alignment.
+    // The same code path serves both AArch64 and 32-bit ARM.
+    const float32x2_t pair0 = vld1_f32(&src[mapsx[x]]);
+    const float32x2_t pair1 = vld1_f32(&src[mapsx[x + 1]]);
+    const float32x2_t pair2 = vld1_f32(&src[mapsx[x + 2]]);
+    const float32x2_t pair3 = vld1_f32(&src[mapsx[x + 3]]);
+
+    low.val  = vcombine_f32(pair0, pair1);
+    high.val = vcombine_f32(pair2, pair3);
+}
+
+// Scalar-multiplier overload of v_fma: replicates b into every lane with
+// v_setall_f32 and forwards to the all-vector v_fma(a, b, c).
+CV_ALWAYS_INLINE v_float32x4 v_fma(const v_float32x4& a, float b, const v_float32x4& c) {
+    const v_float32x4 bAllLanes = v_setall_f32(b);
+    return v_fma(a, bAllLanes, c);
+}
+
 template<int imm>
 CV_ALWAYS_INLINE v_uint8x16 v_blend(const v_uint8x16& a, const v_uint8x16& b)
 {
@ -2473,6 +2509,18 @@ CV_ALWAYS_INLINE v_uint8x16 v_shuffle(const v_uint8x16& a, const v_uint8x16& mas
 #endif
 }

+// Splits eight interleaved floats, held in two registers, into their
+// even-indexed and odd-indexed elements.
+//
+// With low = { l0, l1, l2, l3 } and high = { h0, h1, h2, h3 }:
+//   even = { l0, l2, h0, h2 }
+//   odd  = { l1, l3, h1, h3 }
+CV_ALWAYS_INLINE void v_deinterleave(const v_float32x4& low, const v_float32x4& high,
+                                     v_float32x4& even, v_float32x4& odd) {
+    // A single unzip yields exactly the even/odd split; two chained zips are
+    // not needed to reach the same lane order.
+    const float32x4x2_t unzipped = vuzpq_f32(low.val, high.val);
+    even.val = unzipped.val[0];
+    odd.val  = unzipped.val[1];
+}
+
 CV_ALWAYS_INLINE void v_deinterleave(const v_uint8x16& i0, const v_uint8x16& i1,
                                     const v_uint8x16& i2, const v_uint8x16& i3,
                                     v_uint8x16& res0, v_uint8x16& res1,