From eecd03aa859a17dd08257646b6c8a36ae01890b7 Mon Sep 17 00:00:00 2001 From: Anna Khakimova Date: Tue, 21 Jul 2020 14:19:15 +0300 Subject: [PATCH] Pre-processing(GAPI): ARM(NEON) integration + Split, Merge, Color conversion kernels on NEON (#1315) --- .../ie_preprocess_gapi_kernels_neon.hpp | 192 ++++++++++++++++++ .../ie_preprocess_gapi_kernels.cpp | 178 +++++++++++++--- .../fluid_preproc/common/fluid_tests.cpp | 4 + .../fluid_preproc/cpu/fluid_tests_cpu.cpp | 32 ++- .../thirdparty/ocv/opencv_hal_intrin.hpp | 2 +- .../thirdparty/ocv/opencv_hal_neon.hpp | 4 +- 6 files changed, 373 insertions(+), 39 deletions(-) create mode 100644 inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp diff --git a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp new file mode 100644 index 00000000000..92b9da7cd7e --- /dev/null +++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.hpp @@ -0,0 +1,192 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ie_preprocess_gapi_kernels.hpp" +#include "ie_preprocess_gapi_kernels_impl.hpp" +#include + +namespace InferenceEngine { +namespace gapi { +namespace kernels { +namespace neon { + +using C3 = std::integral_constant; +using C4 = std::integral_constant; +//----------------------------------------------------------------------------- + +typedef MapperUnit MapperUnit32F; +typedef MapperUnit MapperUnit8U; + +void calcRowArea_8U(uchar dst[], const uchar *src[], const Size &inSz, const Size &outSz, + Q0_16 yalpha, const MapperUnit8U& ymap, int xmaxdf, const short xindex[], + const Q0_16 xalpha[], Q8_8 vbuf[]); + +void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Size &outSz, + float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[], + const float xalpha[], float vbuf[]); + +// Resize (bi-linear, 8U) +void calcRowLinear_8U(uint8_t *dst[], + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); + +// Resize (bi-linear, 8UC3) +void calcRowLinear_8U(C3, std::array, 3> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); + +// Resize (bi-linear, 8UC4) +void calcRowLinear_8U(C4, std::array, 4> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi); + +template +void calcRowLinear_8UC(std::array, numChan> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi) { + calcRowLinear_8U(std::integral_constant{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi); +} + +// Resize (bi-linear, 32F) +void calcRowLinear_32F( float *dst[], + const float *src0[], + const float *src1[], + const float alpha[], + const int mapsx[], + const float beta[], + const Size& inSz, + const Size& outSz, + int lpi); + +//---------------------------------------------------------------------- + +void mergeRow_8UC2(const uint8_t in0[], + const uint8_t in1[], + uint8_t out[], + int length); + +void mergeRow_8UC3(const uint8_t in0[], + const uint8_t in1[], + const uint8_t in2[], + uint8_t out[], + int length); + +void mergeRow_8UC4(const uint8_t in0[], + const uint8_t in1[], + const uint8_t in2[], + const uint8_t in3[], + uint8_t out[], + int length); + +void mergeRow_32FC2(const float in0[], + const float in1[], + float out[], + int length); + +void mergeRow_32FC3(const float in0[], + const float in1[], + const float in2[], + float out[], + int length); + +void mergeRow_32FC4(const float in0[], + const float in1[], + const float in2[], + const float in3[], + float out[], + int length); + +void splitRow_8UC2(const uint8_t in[], + uint8_t out0[], + uint8_t out1[], + int length); + +void splitRow_8UC3(const uint8_t in[], + uint8_t out0[], + uint8_t out1[], + uint8_t out2[], + int length); + +void splitRow_8UC4(const uint8_t in[], + uint8_t out0[], + uint8_t out1[], + uint8_t out2[], + uint8_t out3[], + int length); + +void splitRow_32FC2(const float in[], + float out0[], + float out1[], + int length); + +void splitRow_32FC3(const float in[], + float out0[], + float out1[], + float out2[], + int length); + +void splitRow_32FC4(const float in[], + float out0[], + float out1[], + float out2[], + float out3[], + int length); + +void calculate_nv12_to_rgb(const uchar **srcY, + const uchar *srcUV, + uchar **dstRGBx, + int width); + +void calculate_i420_to_rgb(const uchar **srcY, + const uchar *srcU, + const uchar *srcV, + uchar **dstRGBx, + int width); + +void copyRow_8U(const uint8_t in[], + uint8_t out[], + int length); + +void copyRow_32F(const float in[], + float out[], + int length); + +} // namespace neon +} // namespace kernels +} // namespace gapi +} // namespace InferenceEngine diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index ab9db8a5ea9..e6a3dbffca9 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -22,6 +22,10 @@ #endif +#ifdef HAVE_NEON + #include "arm_neon/ie_preprocess_gapi_kernels_neon.hpp" +#endif + #include #include #include @@ -174,6 +178,47 @@ void mergeRow(const std::array& ins, uint8_t* out, int leng } #endif // HAVE_SSE +#ifdef HAVE_NEON + if (std::is_same::value && chs == 2) { + neon::mergeRow_8UC2(ins[0], ins[1], out, length); + return; + } + + if (std::is_same::value && chs == 3) { + neon::mergeRow_8UC3(ins[0], ins[1], ins[2], out, length); + return; + } + + if (std::is_same::value && chs == 4) { + neon::mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length); + return; + } + + if (std::is_same::value && chs == 2) { + neon::mergeRow_32FC2(reinterpret_cast(ins[0]), + reinterpret_cast(ins[1]), + reinterpret_cast(out), length); + return; + } + + if (std::is_same::value && chs == 3) { + neon::mergeRow_32FC3(reinterpret_cast(ins[0]), + reinterpret_cast(ins[1]), + reinterpret_cast(ins[2]), + reinterpret_cast(out), length); + return; + } + + if (std::is_same::value && chs == 4) { + neon::mergeRow_32FC4(reinterpret_cast(ins[0]), + reinterpret_cast(ins[1]), + reinterpret_cast(ins[2]), + reinterpret_cast(ins[3]), + reinterpret_cast(out), length); + return; + } +#endif // HAVE_NEON + const T* insT[chs]; for (int c = 0; c < chs; c++) { insT[c] = reinterpret_cast(ins[c]); @@ -328,6 +373,50 @@ void splitRow(const uint8_t* in, std::array& outs, int length) { } #endif // HAVE_SSE +#ifdef HAVE_NEON + if (std::is_same::value && chs == 2) { + neon::splitRow_8UC2(in, outs[0], outs[1], length); + return; + } + + if (std::is_same::value && chs == 3) { + neon::splitRow_8UC3(in, outs[0], outs[1], outs[2], length); + return; + } + + if (std::is_same::value && chs == 4) { + neon::splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length); + return; + } + + if (std::is_same::value && chs == 2) { + neon::splitRow_32FC2(reinterpret_cast(in), + reinterpret_cast(outs[0]), + reinterpret_cast(outs[1]), + length); + return; + } + + if (std::is_same::value && chs == 3) { + neon::splitRow_32FC3(reinterpret_cast(in), + reinterpret_cast(outs[0]), + reinterpret_cast(outs[1]), + reinterpret_cast(outs[2]), + length); + return; + } + + if (std::is_same::value && chs == 4) { + neon::splitRow_32FC4(reinterpret_cast(in), + reinterpret_cast(outs[0]), + reinterpret_cast(outs[1]), + reinterpret_cast(outs[2]), + reinterpret_cast(outs[3]), + length); + return; + } +#endif // HAVE_NEON + auto inT = reinterpret_cast(in); T* outsT[chs]; @@ -484,6 +573,7 @@ static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, i } #endif // HAVE_AVX512 #endif + #ifdef HAVE_AVX2 if (with_cpu_x86_avx2()) { if (std::is_same::value && chs == 1) { @@ -515,6 +605,20 @@ static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, i } #endif // HAVE_SSE + #ifdef HAVE_NEON + if (std::is_same::value && chs == 1) { + neon::copyRow_8U(in, out, length); + return; + } + + if (std::is_same::value && chs == 1) { + neon::copyRow_32F(reinterpret_cast(in), + reinterpret_cast(out), + length); + return; + } + #endif // HAVE_NEON + const auto inT = reinterpret_cast(in); auto outT = reinterpret_cast< T*>(out); @@ -831,14 +935,14 @@ static void calcRowLinear(const cv::gapi::fluid::View & in, if (std::is_same::value) { if (inSz.width >= 16 && outSz.width >= 8) { calcRowLinear_8UC1(reinterpret_cast(dst), - reinterpret_cast(src0), - reinterpret_cast(src1), - reinterpret_cast(alpha), - reinterpret_cast(clone), - reinterpret_cast(mapsx), - reinterpret_cast(beta), - reinterpret_cast(tmp), - inSz, outSz, lpi); + reinterpret_cast(src0), + reinterpret_cast(src1), + reinterpret_cast(alpha), + reinterpret_cast(clone), + reinterpret_cast(mapsx), + reinterpret_cast(beta), + reinterpret_cast(tmp), + inSz, outSz, lpi); return; } } @@ -2011,6 +2115,7 @@ GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) { } #endif // HAVE_AVX512 #endif + #ifdef HAVE_AVX2 if (with_cpu_x86_avx2()) { avx::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width); @@ -2024,6 +2129,11 @@ GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) { } #endif // HAVE_SSE + #ifdef HAVE_NEON + neon::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width); + return; + #endif // HAVE_NEON + calculate_nv12_to_rgb_fallback(y_rows, uv_row, out_rows, buf_width); } }; @@ -2045,29 +2155,35 @@ GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) { int buf_width = out.length(); GAPI_DbgAssert(in_u.length() == in_v.length()); -// AVX512 implementation of wide universal intrinsics is slower than AVX2. -// It is turned off until the cause isn't found out. - #if 0 - #ifdef HAVE_AVX512 - if (with_cpu_x86_avx512_core()) { - #define CV_AVX_512DQ 1 - avx512::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width); - return; - } - #endif // HAVE_AVX512 - #endif - #ifdef HAVE_AVX2 - if (with_cpu_x86_avx2()) { - avx::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width); - return; - } - #endif // HAVE_AVX2 - #ifdef HAVE_SSE - if (with_cpu_x86_sse42()) { - calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width); - return; - } - #endif // HAVE_SSE + // AVX512 implementation of wide universal intrinsics is slower than AVX2. + // It is turned off until the cause isn't found out. + #if 0 + #ifdef HAVE_AVX512 + if (with_cpu_x86_avx512_core()) { + #define CV_AVX_512DQ 1 + avx512::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width); + return; + } + #endif // HAVE_AVX512 + #endif + + #ifdef HAVE_AVX2 + if (with_cpu_x86_avx2()) { + avx::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width); + return; + } + #endif // HAVE_AVX2 + #ifdef HAVE_SSE + if (with_cpu_x86_sse42()) { + calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width); + return; + } + #endif // HAVE_SSE + + #ifdef HAVE_NEON + neon::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width); + return; + #endif // HAVE_NEON calculate_i420_to_rgb_fallback(y_rows, u_row, v_row, out_rows, buf_width); } diff --git a/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp b/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp index b38bb5b8101..071faa610f9 100644 --- a/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp +++ b/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp @@ -1058,7 +1058,11 @@ TEST_P(PreprocTest, Performance) std::tie(in_size, out_size) = sizes; int in_ocv_chan = -1, out_ocv_chan = -1; std::tie(in_ocv_chan, out_ocv_chan) = ocv_channels; +#if defined(__arm__) || defined(__aarch64__) + double tolerance = Precision::U8 ? 4 : 0.015; +#else double tolerance = Precision::U8 ? 1 : 0.015; +#endif const int ocv_depth = prec == Precision::U8 ? CV_8U : prec == Precision::FP32 ? CV_32F : -1; diff --git a/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp b/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp index e28c6129865..6684cd3424f 100644 --- a/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp +++ b/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp @@ -97,24 +97,38 @@ std::make_pair(cv::Size( 96, 256), cv::Size( 128, 384)) using namespace testing; +#if defined(__arm__) || defined(__aarch64__) +INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI, + Combine(Values(CV_8UC1, CV_8UC3), + Values(cv::INTER_LINEAR, cv::INTER_AREA), + Values(TEST_RESIZE_PAIRS), + Values(4))); // error not more than 4 unit +INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI, + Combine(Values(CV_8UC3, CV_8UC4), + Values(cv::INTER_LINEAR), + Values(TEST_RESIZE_PAIRS), + Values(4))); // error not more than 4 unit +#else INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI, Combine(Values(CV_8UC1, CV_8UC3), Values(cv::INTER_LINEAR, cv::INTER_AREA), Values(TEST_RESIZE_PAIRS), Values(1))); // error not more than 1 unit +INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI, + Combine(Values(CV_8UC3, CV_8UC4), + Values(cv::INTER_LINEAR), + Values(TEST_RESIZE_PAIRS), + Values(1))); // error not more than 1 unit +#endif + INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestGAPI, Combine(Values(CV_32FC1, CV_32FC3), Values(cv::INTER_LINEAR, cv::INTER_AREA), Values(TEST_RESIZE_PAIRS), Values(0.015))); // accuracy like ~1.5% -INSTANTIATE_TEST_CASE_P(ResizeRGB8UTestFluid_U8, ResizeRGB8UTestGAPI, - Combine(Values(CV_8UC3, CV_8UC4), - Values(cv::INTER_LINEAR), - Values(TEST_RESIZE_PAIRS), - Values(1))); // error not more than 1 unit INSTANTIATE_TEST_CASE_P(SplitTestFluid, SplitTestGAPI, Combine(Values(2, 3, 4), @@ -179,11 +193,19 @@ INSTANTIATE_TEST_CASE_P(ResizeRGB8URoiTestFluid, ResizeRGB8URoiTestGAPI, //---------------------------------------------------------------------- +#if defined(__arm__) || defined(__aarch64__) +INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestIE, + Combine(Values(CV_8UC1, CV_8UC3), + Values(cv::INTER_LINEAR, cv::INTER_AREA), + Values(TEST_RESIZE_PAIRS), + Values(4))); // error not more than 4 unit +#else INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestIE, Combine(Values(CV_8UC1, CV_8UC3), Values(cv::INTER_LINEAR, cv::INTER_AREA), Values(TEST_RESIZE_PAIRS), Values(1))); // error not more than 1 unit +#endif INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestIE, Combine(Values(CV_32FC1, CV_32FC3), diff --git a/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp b/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp index 13bd390218b..c5b843f6c32 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp @@ -268,7 +268,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; # undef CV_MSA #endif*/ -#if CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD +#if CV_SSE2 || CV_NEON #define CV__SIMD_FORWARD 128 #include "opencv_hal_intrin_forward.hpp" #endif diff --git a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp index a4df685e9d5..3ee2d3fd3d9 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp @@ -511,7 +511,7 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) } #endif -inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, +static inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a) { @@ -2224,7 +2224,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo #endif ////// FP16 support /////// -// Currently disabled +// Unsupported. Currently disabled. #if 0 #if CV_FP16 inline v_float32x4 v_load_expand(const float16_t* ptr)