Pre-processing(GAPI): ARM(NEON) integration + Split, Merge, Color conversion kernels on NEON (#1315)

2020-07-21 14:19:15 +03:00
parent 14d371849d
commit eecd03aa85
6 changed files with 373 additions and 39 deletions
--- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp
+++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp
@@ -0,0 +1,192 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_preprocess_gapi_kernels.hpp"
+#include "ie_preprocess_gapi_kernels_impl.hpp"
+#include  <type_traits>
+
+namespace InferenceEngine {
+namespace gapi {
+namespace kernels {
+namespace neon {
+
+using C3 = std::integral_constant<int, 3>;
+using C4 = std::integral_constant<int, 4>;
+//-----------------------------------------------------------------------------
+
+typedef MapperUnit<float,   int> MapperUnit32F;
+typedef MapperUnit<Q0_16, short> MapperUnit8U;
+
+void calcRowArea_8U(uchar dst[], const uchar *src[], const Size &inSz, const Size &outSz,
+                    Q0_16 yalpha, const MapperUnit8U& ymap, int xmaxdf, const short xindex[],
+                    const Q0_16 xalpha[], Q8_8 vbuf[]);
+
+void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Size &outSz,
+                     float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[],
+                     const float xalpha[], float vbuf[]);
+
+// Resize (bi-linear, 8U)
+void calcRowLinear_8U(uint8_t *dst[],
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                        const short  alpha[],
+                        const short  clone[],
+                        const short  mapsx[],
+                        const short  beta[],
+                            uint8_t  tmp[],
+                        const Size&  inSz,
+                        const Size&  outSz,
+                                int  lpi);
+
+// Resize (bi-linear, 8UC3)
+void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                        const short  alpha[],
+                        const short  clone[],
+                        const short  mapsx[],
+                        const short  beta[],
+                            uint8_t  tmp[],
+                        const Size&  inSz,
+                        const Size&  outSz,
+                                int  lpi);
+
+// Resize (bi-linear, 8UC4)
+void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
+                      const uint8_t *src0[],
+                      const uint8_t *src1[],
+                        const short  alpha[],
+                        const short  clone[],
+                        const short  mapsx[],
+                        const short  beta[],
+                            uint8_t  tmp[],
+                        const Size&  inSz,
+                        const Size&  outSz,
+                                int  lpi);
+
+template<int numChan>
+void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
+                       const uint8_t *src0[],
+                       const uint8_t *src1[],
+                         const short  alpha[],
+                         const short  clone[],
+                         const short  mapsx[],
+                         const short  beta[],
+                             uint8_t  tmp[],
+                         const Size&  inSz,
+                         const Size&  outSz,
+                                 int  lpi) {
+    calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
+}
+
+// Resize (bi-linear, 32F)
+void calcRowLinear_32F(      float *dst[],
+                       const float *src0[],
+                       const float *src1[],
+                       const float  alpha[],
+                       const int    mapsx[],
+                       const float  beta[],
+                       const Size& inSz,
+                       const Size& outSz,
+                               int lpi);
+
+//----------------------------------------------------------------------
+
+void mergeRow_8UC2(const uint8_t in0[],
+                   const uint8_t in1[],
+                         uint8_t out[],
+                             int length);
+
+void mergeRow_8UC3(const uint8_t in0[],
+                   const uint8_t in1[],
+                   const uint8_t in2[],
+                         uint8_t out[],
+                             int length);
+
+void mergeRow_8UC4(const uint8_t in0[],
+                   const uint8_t in1[],
+                   const uint8_t in2[],
+                   const uint8_t in3[],
+                         uint8_t out[],
+                             int length);
+
+void mergeRow_32FC2(const float in0[],
+                    const float in1[],
+                          float out[],
+                            int length);
+
+void mergeRow_32FC3(const float in0[],
+                    const float in1[],
+                    const float in2[],
+                          float out[],
+                            int length);
+
+void mergeRow_32FC4(const float in0[],
+                    const float in1[],
+                    const float in2[],
+                    const float in3[],
+                          float out[],
+                            int length);
+
+void splitRow_8UC2(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                             int length);
+
+void splitRow_8UC3(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                         uint8_t out2[],
+                             int length);
+
+void splitRow_8UC4(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                         uint8_t out2[],
+                         uint8_t out3[],
+                             int length);
+
+void splitRow_32FC2(const float in[],
+                          float out0[],
+                          float out1[],
+                            int length);
+
+void splitRow_32FC3(const float in[],
+                          float out0[],
+                          float out1[],
+                          float out2[],
+                            int length);
+
+void splitRow_32FC4(const float in[],
+                          float out0[],
+                          float out1[],
+                          float out2[],
+                          float out3[],
+                            int length);
+
+void calculate_nv12_to_rgb(const  uchar **srcY,
+                           const  uchar *srcUV,
+                                  uchar **dstRGBx,
+                                    int width);
+
+void calculate_i420_to_rgb(const  uchar **srcY,
+                           const  uchar *srcU,
+                           const  uchar *srcV,
+                                  uchar **dstRGBx,
+                                    int width);
+
+void copyRow_8U(const uint8_t in[],
+                uint8_t out[],
+                int length);
+
+void copyRow_32F(const float in[],
+                 float out[],
+                 int length);
+
+}  // namespace neon
+}  // namespace kernels
+}  // namespace gapi
+}  // namespace InferenceEngine
--- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
+++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
@@ -22,6 +22,10 @@

 #endif

+#ifdef HAVE_NEON
+  #include "arm_neon/ie_preprocess_gapi_kernels_neon.hpp"
+#endif
+
 #include <opencv2/gapi/opencv_includes.hpp>
 #include <opencv2/gapi/fluid/gfluidkernel.hpp>
 #include <opencv2/gapi/gcompoundkernel.hpp>
@@ -174,6 +178,47 @@ void mergeRow(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int leng
    }
 #endif  // HAVE_SSE

+#ifdef HAVE_NEON
+    if (std::is_same<T, uint8_t>::value && chs == 2) {
+        neon::mergeRow_8UC2(ins[0], ins[1], out, length);
+        return;
+    }
+
+    if (std::is_same<T, uint8_t>::value && chs == 3) {
+        neon::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
+        return;
+    }
+
+    if (std::is_same<T, uint8_t>::value && chs == 4) {
+        neon::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 2) {
+        neon::mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
+                             reinterpret_cast<const float*>(ins[1]),
+                             reinterpret_cast<float*>(out), length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 3) {
+        neon::mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
+                             reinterpret_cast<const float*>(ins[1]),
+                             reinterpret_cast<const float*>(ins[2]),
+                             reinterpret_cast<float*>(out), length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 4) {
+        neon::mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
+                             reinterpret_cast<const float*>(ins[1]),
+                             reinterpret_cast<const float*>(ins[2]),
+                             reinterpret_cast<const float*>(ins[3]),
+                             reinterpret_cast<float*>(out), length);
+        return;
+    }
+#endif  // HAVE_NEON
+
    const T* insT[chs];
    for (int c = 0; c < chs; c++) {
        insT[c] = reinterpret_cast<const T*>(ins[c]);
@@ -328,6 +373,50 @@ void splitRow(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length) {
    }
 #endif  // HAVE_SSE

+#ifdef HAVE_NEON
+    if (std::is_same<T, uint8_t>::value && chs == 2) {
+        neon::splitRow_8UC2(in, outs[0], outs[1], length);
+        return;
+    }
+
+    if (std::is_same<T, uint8_t>::value && chs == 3) {
+        neon::splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
+        return;
+    }
+
+    if (std::is_same<T, uint8_t>::value && chs == 4) {
+        neon::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 2) {
+        neon::splitRow_32FC2(reinterpret_cast<const float*>(in),
+                             reinterpret_cast<float*>(outs[0]),
+                             reinterpret_cast<float*>(outs[1]),
+                             length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 3) {
+        neon::splitRow_32FC3(reinterpret_cast<const float*>(in),
+                             reinterpret_cast<float*>(outs[0]),
+                             reinterpret_cast<float*>(outs[1]),
+                             reinterpret_cast<float*>(outs[2]),
+                             length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 4) {
+        neon::splitRow_32FC4(reinterpret_cast<const float*>(in),
+                             reinterpret_cast<float*>(outs[0]),
+                             reinterpret_cast<float*>(outs[1]),
+                             reinterpret_cast<float*>(outs[2]),
+                             reinterpret_cast<float*>(outs[3]),
+                             length);
+        return;
+    }
+#endif  // HAVE_NEON
+
    auto inT = reinterpret_cast<const T*>(in);

    T* outsT[chs];
@@ -484,6 +573,7 @@ static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, i
    }
    #endif  // HAVE_AVX512
 #endif
+
    #ifdef HAVE_AVX2
    if (with_cpu_x86_avx2()) {
        if (std::is_same<T, uint8_t>::value && chs == 1) {
@@ -515,6 +605,20 @@ static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, i
    }
    #endif  // HAVE_SSE

+    #ifdef HAVE_NEON
+    if (std::is_same<T, uint8_t>::value && chs == 1) {
+        neon::copyRow_8U(in, out, length);
+        return;
+    }
+
+    if (std::is_same<T, float>::value && chs == 1) {
+        neon::copyRow_32F(reinterpret_cast<const float*>(in),
+                          reinterpret_cast<float*>(out),
+                          length);
+        return;
+    }
+    #endif  // HAVE_NEON
+
    const auto inT  = reinterpret_cast<const T*>(in);
          auto outT = reinterpret_cast<      T*>(out);

@@ -831,14 +935,14 @@ static void calcRowLinear(const cv::gapi::fluid::View  & in,
        if (std::is_same<T, uint8_t>::value) {
            if (inSz.width >= 16 && outSz.width >= 8) {
                calcRowLinear_8UC1(reinterpret_cast<uint8_t**>(dst),
-                                 reinterpret_cast<const uint8_t**>(src0),
-                                 reinterpret_cast<const uint8_t**>(src1),
-                                 reinterpret_cast<const short*>(alpha),
-                                 reinterpret_cast<const short*>(clone),
-                                 reinterpret_cast<const short*>(mapsx),
-                                 reinterpret_cast<const short*>(beta),
-                                 reinterpret_cast<uint8_t*>(tmp),
-                                 inSz, outSz, lpi);
+                                   reinterpret_cast<const uint8_t**>(src0),
+                                   reinterpret_cast<const uint8_t**>(src1),
+                                   reinterpret_cast<const short*>(alpha),
+                                   reinterpret_cast<const short*>(clone),
+                                   reinterpret_cast<const short*>(mapsx),
+                                   reinterpret_cast<const short*>(beta),
+                                   reinterpret_cast<uint8_t*>(tmp),
+                                   inSz, outSz, lpi);
                return;
            }
        }
@@ -2011,6 +2115,7 @@ GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) {
        }
    #endif  // HAVE_AVX512
    #endif
+
    #ifdef HAVE_AVX2
        if (with_cpu_x86_avx2()) {
            avx::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
@@ -2024,6 +2129,11 @@ GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) {
        }
    #endif  // HAVE_SSE

+    #ifdef HAVE_NEON
+        neon::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
+        return;
+    #endif  // HAVE_NEON
+
        calculate_nv12_to_rgb_fallback(y_rows, uv_row, out_rows, buf_width);
    }
 };
@@ -2045,29 +2155,35 @@ GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) {
        int buf_width = out.length();
        GAPI_DbgAssert(in_u.length() ==  in_v.length());

-// AVX512 implementation of wide universal intrinsics is slower than AVX2.
-// It is turned off until the cause isn't found out.
-    #if 0
-    #ifdef HAVE_AVX512
-        if (with_cpu_x86_avx512_core()) {
-           #define CV_AVX_512DQ 1
-           avx512::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
-           return;
-        }
-    #endif  // HAVE_AVX512
-    #endif
-    #ifdef HAVE_AVX2
-        if (with_cpu_x86_avx2()) {
-           avx::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
-           return;
-        }
-    #endif  // HAVE_AVX2
-    #ifdef HAVE_SSE
-        if (with_cpu_x86_sse42()) {
-           calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
-           return;
-        }
-    #endif  // HAVE_SSE
+        // AVX512 implementation of wide universal intrinsics is slower than AVX2.
+        // It is turned off until the cause is found.
+        #if 0
+        #ifdef HAVE_AVX512
+            if (with_cpu_x86_avx512_core()) {
+               #define CV_AVX_512DQ 1
+               avx512::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
+               return;
+            }
+        #endif  // HAVE_AVX512
+        #endif
+
+        #ifdef HAVE_AVX2
+            if (with_cpu_x86_avx2()) {
+               avx::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
+               return;
+            }
+        #endif  // HAVE_AVX2
+        #ifdef HAVE_SSE
+            if (with_cpu_x86_sse42()) {
+               calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
+               return;
+            }
+        #endif  // HAVE_SSE
+
+        #ifdef HAVE_NEON
+            neon::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
+            return;
+        #endif  // HAVE_NEON

        calculate_i420_to_rgb_fallback(y_rows, u_row, v_row, out_rows, buf_width);
    }
--- a/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp
+++ b/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp
@@ -1058,7 +1058,11 @@ TEST_P(PreprocTest, Performance)
    std::tie(in_size, out_size) = sizes;
    int in_ocv_chan = -1, out_ocv_chan = -1;
    std::tie(in_ocv_chan, out_ocv_chan) = ocv_channels;
+#if defined(__arm__) || defined(__aarch64__)
+    double tolerance = (prec == Precision::U8) ? 4 : 0.015;  // ARM: U8 may deviate by up to 4
+#else
    double tolerance = Precision::U8 ? 1 : 0.015;
+#endif

    const int ocv_depth = prec == Precision::U8 ? CV_8U :
        prec == Precision::FP32 ? CV_32F : -1;
--- a/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp
+++ b/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp
@@ -97,24 +97,38 @@
    std::make_pair(cv::Size(  96,  256), cv::Size( 128,  384))

 using namespace testing;
+#if defined(__arm__) || defined(__aarch64__)
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(4))); // error not more than 4 unit

+INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI,
+                        Combine(Values(CV_8UC3, CV_8UC4),
+                                Values(cv::INTER_LINEAR),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(4))); // error not more than 4 unit
+#else
 INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI,
                        Combine(Values(CV_8UC1, CV_8UC3),
                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
                                Values(TEST_RESIZE_PAIRS),
                                Values(1))); // error not more than 1 unit

+INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI,
+                        Combine(Values(CV_8UC3, CV_8UC4),
+                                Values(cv::INTER_LINEAR),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(1))); // error not more than 1 unit
+#endif
+
 INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestGAPI,
                        Combine(Values(CV_32FC1, CV_32FC3),
                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
                                Values(TEST_RESIZE_PAIRS),
                                Values(0.015))); // accuracy like ~1.5%

-INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI,
-                        Combine(Values(CV_8UC3, CV_8UC4),
-                                Values(cv::INTER_LINEAR),
-                                Values(TEST_RESIZE_PAIRS),
-                                Values(1))); // error not more than 1 unit

 INSTANTIATE_TEST_CASE_P(SplitTestFluid, SplitTestGAPI,
                        Combine(Values(2, 3, 4),
@@ -179,11 +193,19 @@ INSTANTIATE_TEST_CASE_P(ResizeRGB8URoiTestFluid, ResizeRGB8URoiTestGAPI,

 //----------------------------------------------------------------------

+#if defined(__arm__) || defined(__aarch64__)
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestIE,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(4))); // error not more than 4 unit
+#else
 INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestIE,
                        Combine(Values(CV_8UC1, CV_8UC3),
                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
                                Values(TEST_RESIZE_PAIRS),
                                Values(1))); // error not more than 1 unit
+#endif

 INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestIE,
                        Combine(Values(CV_32FC1, CV_32FC3),
--- a/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp
+++ b/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp
@@ -268,7 +268,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #   undef CV_MSA
 #endif*/

-#if CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD
+#if CV_SSE2 || CV_NEON
 #define CV__SIMD_FORWARD 128
 #include "opencv_hal_intrin_forward.hpp"
 #endif
--- a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
+++ b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
@@ -511,7 +511,7 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
 }
 #endif

-inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+static inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
 {
@@ -2224,7 +2224,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
 #endif

 ////// FP16 support ///////
-// Currently disabled
+// Unsupported. Currently disabled.
 #if 0
 #if CV_FP16
 inline v_float32x4 v_load_expand(const float16_t* ptr)