diff --git a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
index 86dc6b22fea..0ee824c8748 100644
--- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
+++ b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp
@@ -4,6 +4,7 @@
 #include "ie_preprocess_gapi_kernels.hpp"
 #include "ie_preprocess_gapi_kernels_impl.hpp"
+#include "ie_preprocess_gapi_kernels_neon.hpp"
 
 #include <arm_neon.h>
@@ -126,6 +127,351 @@ void copyRow_32F(const float in[], float out[], int length) {
     copyRow_32F_impl(in, out, length);
 }
 
+template <int chanNum>
+CV_ALWAYS_INLINE void channels2planes_store(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
+                                            const uchar* src, const int width,
+                                            const int nlanes, const int line) {
+    v_uint8 chan;
+    int x = 0;
+    for (;;) {
+        for (; x <= width - nlanes && x >= 0; x += nlanes) {
+            for (int c = 0; c < chanNum; ++c) {
+                v_gather_channel<chanNum>(chan, &src[chanNum * x], c);
+                vx_store(&dst[c][line][x], chan);
+            }
+        }
+
+        if (x < width) {
+            x = width - nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+CV_ALWAYS_INLINE void vertical_anyLPI(const uchar* src0, const uchar* src1,
+                                      uchar* tmp, const int inLength,
+                                      const int nlanes, const short beta) {
+    const int half_nlanes = nlanes / 2;
+    int w = 0;
+    for (;;) {
+        for (; w <= inLength - nlanes; w += nlanes) {
+            v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[w]));
+            v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[w]));
+            v_int16 s2 = v_reinterpret_as_s16(vx_load_expand(&src0[w + half_nlanes]));
+            v_int16 s3 = v_reinterpret_as_s16(vx_load_expand(&src1[w + half_nlanes]));
+            v_int16 res1 = v_mulhrs(s0 - s1, beta) + s1;
+            v_int16 res2 = v_mulhrs(s2 - s3, beta) + s3;
+
+            vx_store(tmp + w, v_pack_u(res1, res2));
+        }
+
+        if (w < inLength) {
+            w = inLength - nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+template <int chanNum>
+CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
+                                        const uchar* src, const short mapsx[],
+                                        const short alpha[], const int nlanes,
+                                        const int width, const int line) {
+    const int half_nlanes = nlanes / 2;
+    v_int16 t0, t1;
+    int x = 0;
+    for (;;) {
+        for (; x <= width - half_nlanes && x >= 0; x += half_nlanes) {
+            v_int16 a0 = vx_load(&alpha[x]);
+            for (int c = 0; c < chanNum; ++c) {
+                v_gather_channel<chanNum>(t0, src, &mapsx[x], c, 0);
+                v_gather_channel<chanNum>(t1, src, &mapsx[x], c, 1);
+                v_int16 res1 = v_mulhrs(t0 - t1, a0) + t1;
+                v_pack_u_store(&dst[c][line][x], res1);
+            }
+        }
+
+        if (x < width) {
+            x = width - half_nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+template <int chanNum>
+CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
+                                      const uchar* tmp, const short mapsx[], const short clone[],
+                                      const int width, const int nlanes) {
+    v_uint8 val_0, val_1, val_2, val_3;
+    const int half_nlanes = nlanes / 2;
+    const int shift = static_cast<int>(half_nlanes / 4);
+
+    uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15 };
+    v_uint8 hmask = vx_load(_mask_horizontal);
+
+    int x = 0;
+    for (;;) {
+        for (; x <= width - half_nlanes && x >= 0; x += half_nlanes) {
+            v_int16 a10 = vx_load(&clone[4 * x]);
+            v_int16 a32 = vx_load(&clone[4 * (x + 2)]);
+            v_int16 a54 = vx_load(&clone[4 * (x + 4)]);
+            v_int16 a76 = vx_load(&clone[4 * (x + 6)]);
+
+            for (int c = 0; c < chanNum; ++c) {
+                v_gather_channel(val_0, tmp, &mapsx[x], chanNum, c, 0);
+                v_gather_channel(val_1, tmp, &mapsx[x], chanNum, c, shift);
+                v_gather_channel(val_2, tmp, &mapsx[x], chanNum, c, shift * 2);
+                v_gather_channel(val_3, tmp, &mapsx[x], chanNum, c, shift * 3);
+
+                v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
+                v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
+                v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
+                v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
+
+                v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
+                v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
+                v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
+                v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
+
+                v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
+                v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
+                v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
+                v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
+
+                v_int16 r0 = v_add_wrap(val1_0, t0);
+                v_int16 r1 = v_add_wrap(val1_1, t1);
+                v_int16 r2 = v_add_wrap(val1_2, t2);
+                v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                v_uint8 q0 = v_pack_u(r0, r1);
+                v_uint8 q1 = v_pack_u(r2, r3);
+
+                v_uint8 q2 = v_shuffle(q0, hmask);
+                v_uint8 q3 = v_shuffle(q1, hmask);
+
+                v_uint8 q4 = v_blend<0xCC /*0b11001100*/>(q2, v_slli_si128(q3, 4));
+                v_uint8 q5 = v_blend<0xCC /*0b11001100*/>(v_srli_si128(q2, 4), q3);
+
+                v_store_low(&dst[c][0][x], q4);
+                v_store_high(&dst[c][1][x], q4);
+                v_store_low(&dst[c][2][x], q5);
+                v_store_high(&dst[c][3][x], q5);
+            }
+        }
+
+        if (x < width) {
+            x = width - half_nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
+template <int chanNum>
+CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
+                                              const uint8_t* src0[],
+                                              const uint8_t* src1[],
+                                              const short alpha[],
+                                              const short clone[],  // 4 clones of alpha
+                                              const short mapsx[],
+                                              const short beta[],
+                                              uint8_t tmp[],
+                                              const Size& inSz,
+                                              const Size& outSz,
+                                              const int lpi) {
+    constexpr int nlanes = static_cast<int>(v_int8::nlanes);
+    constexpr int half_nlanes = nlanes / 2;
+
+    bool xRatioEq = inSz.width == outSz.width;
+    bool yRatioEq = inSz.height == outSz.height;
+
+    if (!xRatioEq && !yRatioEq) {
+        if (4 == lpi) {
+            // vertical pass: the 4 interpolated rows are stored into tmp interleaved
+            // by column, so the horizontal pass can gather all 4 lines at once
+            int inLength = inSz.width * chanNum;
+            GAPI_Assert(inLength >= half_nlanes);
+
+            v_int16 b0 = vx_setall_s16(beta[0]);
+            v_int16 b1 = vx_setall_s16(beta[1]);
+            v_int16 b2 = vx_setall_s16(beta[2]);
+            v_int16 b3 = vx_setall_s16(beta[3]);
+
+            uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
+                                             2, 10, 6, 14, 3, 11, 7, 15 };
+            v_uint8 vmask = vx_load(_mask_vertical);
+
+            int w = 0;
+            for (;;) {
+                for (; w <= inLength - half_nlanes && w >= 0; w += half_nlanes) {
+                    v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
+                    v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
+                    v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
+                    v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
+
+                    v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
+                    v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
+                    v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
+                    v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
+
+                    v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+                    v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+                    v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+                    v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+                    v_int16 r0 = v_add_wrap(val1_0, t0);
+                    v_int16 r1 = v_add_wrap(val1_1, t1);
+                    v_int16 r2 = v_add_wrap(val1_2, t2);
+                    v_int16 r3 = v_add_wrap(val1_3, t3);
+
+                    v_uint8 q0 = v_pack_u(r0, r1);
+                    v_uint8 q1 = v_pack_u(r2, r3);
+
+                    v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_slli_si128(q1, 4));
+                    v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_srli_si128(q0, 4), q1);
+
+                    v_uint8 q4 = v_shuffle(q2, vmask);
+                    v_uint8 q5 = v_shuffle(q3, vmask);
+
+                    vx_store(&tmp[4 * w + 0], q4);
+                    vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
+                }
+
+                if (w < inLength) {
+                    w = inLength - half_nlanes;
+                    continue;
+                }
+                break;
+            }
+
+            // horizontal pass
+            GAPI_Assert(outSz.width >= half_nlanes);
+            horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width, nlanes);
+        } else {  // if any lpi
+            int inLength = inSz.width * chanNum;
+            GAPI_Assert(inLength >= half_nlanes);
+            GAPI_Assert(outSz.width >= half_nlanes);
+
+            for (int l = 0; l < lpi; ++l) {
+                short beta0 = beta[l];
+                const uchar* s0 = src0[l];
+                const uchar* s1 = src1[l];
+
+                // vertical pass
+                vertical_anyLPI(s0, s1, tmp, inLength, nlanes, beta0);
+
+                // horizontal pass
+                horizontal_anyLPI(dst, tmp, mapsx, alpha, nlanes, outSz.width, l);
+            }
+        }
+    } else if (!xRatioEq) {
+        GAPI_Assert(yRatioEq);
+
+        if (4 == lpi) {
+            int inLength = inSz.width * chanNum;
+
+            // vertical pass
+            GAPI_DbgAssert(inLength >= nlanes);
+            v_uint8 s0, s1, s2, s3;
+            int w = 0;
+            for (;;) {
+                for (; w <= inLength - nlanes; w += nlanes) {
+                    s0 = vx_load(&src0[0][w]);
+                    s1 = vx_load(&src0[1][w]);
+                    s2 = vx_load(&src0[2][w]);
+                    s3 = vx_load(&src0[3][w]);
+                    v_store_interleave(&tmp[lpi * w], s0, s1, s2, s3);
+                }
+
+                if (w < inLength) {
+                    w = inLength - nlanes;
+                    continue;
+                }
+                break;
+            }
+
+            // horizontal pass
+            GAPI_Assert(outSz.width >= half_nlanes);
+            horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width, nlanes);
+        } else {  // any LPI
+            GAPI_Assert(outSz.width >= half_nlanes);
+            for (int l = 0; l < lpi; ++l) {
+                const uchar* src = src0[l];
+
+                // horizontal pass
+                horizontal_anyLPI(dst, src, mapsx, alpha, nlanes, outSz.width, l);
+            }
+        }
+    } else if (!yRatioEq) {
+        GAPI_Assert(xRatioEq);
+        int inLength = inSz.width * chanNum;  // == outSz.width
+
+        GAPI_Assert(inLength >= half_nlanes);
+        GAPI_Assert(outSz.width >= nlanes);
+
+        for (int l = 0; l < lpi; ++l) {
+            short beta0 = beta[l];
+            const uchar* s0 = src0[l];
+            const uchar* s1 = src1[l];
+
+            // vertical pass
+            vertical_anyLPI(s0, s1, tmp, inLength, nlanes, beta0);
+
+            // split channels to planes and store
+            channels2planes_store(dst, tmp, outSz.width, nlanes, l);
+        }
+    } else {
+        GAPI_Assert(xRatioEq && yRatioEq);
+        GAPI_Assert(outSz.width >= nlanes);
+
+        // split channels to planes and store
+        for (int l = 0; l < lpi; ++l) {
+            const uchar* src = src0[l];
+            channels2planes_store(dst, src, outSz.width, nlanes, l);
+        }
+    }
+}
+
+// Resize (bi-linear, 8UC3)
+void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3>& dst,
+                      const uint8_t* src0[],
+                      const uint8_t* src1[],
+                      const short alpha[],
+                      const short clone[],  // 4 clones of alpha
+                      const short mapsx[],
+                      const short beta[],
+                      uint8_t tmp[],
+                      const Size& inSz,
+                      const Size& outSz,
+                      const int lpi) {
+    constexpr int chanNum = 3;
+    calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
+                                     beta, tmp, inSz, outSz, lpi);
+}
+
+// Resize (bi-linear, 8UC4)
+void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
+                      const uint8_t* src0[],
+                      const uint8_t* src1[],
+                      const short alpha[],
+                      const short clone[],  // 4 clones of alpha
+                      const short mapsx[],
+                      const short beta[],
+                      uint8_t tmp[],
+                      const Size& inSz,
+                      const Size& outSz,
+                      const int lpi) {
+    constexpr int chanNum = 4;
+    calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
+                                     beta, tmp, inSz, outSz, lpi);
+}
 }  // namespace neon
 }  // namespace kernels
 }  // namespace gapi
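Note on the arithmetic in the kernels above: both the vertical and the horizontal passes reduce to a fixed-point linear blend `res = s1 + v_mulhrs(s0 - s1, w)`, with the weight `w` (beta or alpha) given in Q15, which approximates `(w * s0 + (32768 - w) * s1) >> 15` with rounding. A minimal scalar sketch of the same computation, for illustration only (not part of the patch; `mulhrs_scalar` and `blend_row` are placeholder names):

    #include <cstdint>

    // Same rounding as SSE's _mm_mulhrs_epi16, which v_mulhrs emulates on NEON.
    static inline int16_t mulhrs_scalar(int16_t a, int16_t b) {
        return static_cast<int16_t>((static_cast<int32_t>(a) * b + (1 << 14)) >> 15);
    }

    // One interpolated sample: w is a Q15 weight for s0, (1 << 15) - w for s1.
    static inline uint8_t blend_row(uint8_t s0, uint8_t s1, int16_t w) {
        int v = s1 + mulhrs_scalar(static_cast<int16_t>(s0 - s1), w);
        return static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));  // saturate, as v_pack_u does
    }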
diff --git a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
index e01c569f505..e9f8bb9ffbe 100644
--- a/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
+++ b/inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
@@ -904,34 +904,33 @@ void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
 
 // Resize (bi-linear, 8UC3)
 void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
-                      const uint8_t *src0[],
-                      const uint8_t *src1[],
-                      const short alpha[],
-                      const short clone[],  // 4 clones of alpha
-                      const short mapsx[],
-                      const short beta[],
-                      uint8_t tmp[],
-                      const Size &inSz,
-                      const Size &outSz,
-                      int lpi) {
-    constexpr const int chanNum = 3;
-
+                      const uint8_t* src0[],
+                      const uint8_t* src1[],
+                      const short alpha[],
+                      const short clone[],  // 4 clones of alpha
+                      const short mapsx[],
+                      const short beta[],
+                      uint8_t tmp[],
+                      const Size& inSz,
+                      const Size& outSz,
+                      const int lpi) {
+    constexpr int chanNum = 3;
     calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
                                      beta, tmp, inSz, outSz, lpi);
 }
 
 // Resize (bi-linear, 8UC4)
 void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
-                      const uint8_t *src0[],
-                      const uint8_t *src1[],
-                      const short alpha[],
-                      const short clone[],  // 4 clones of alpha
-                      const short mapsx[],
-                      const short beta[],
-                      uint8_t tmp[],
-                      const Size &inSz,
-                      const Size &outSz,
-                      int lpi) {
-    constexpr const int chanNum = 4;
+                      const uint8_t* src0[],
+                      const uint8_t* src1[],
+                      const short alpha[],
+                      const short clone[],  // 4 clones of alpha
+                      const short mapsx[],
+                      const short beta[],
+                      uint8_t tmp[],
+                      const Size& inSz,
+                      const Size& outSz,
+                      const int lpi) {
+    constexpr int chanNum = 4;
 
     calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
                                      beta, tmp, inSz, outSz, lpi);
 }
diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
index 0d52c378e2e..f40d579ffe5 100644
--- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
+++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
@@ -1223,6 +1223,23 @@ static void calcRowLinearC(const cv::gapi::fluid::View & in,
     }
 #endif  // HAVE_SSE
 
+#ifdef HAVE_NEON
+    if (std::is_same<T, uint8_t>::value) {
+        if (inSz.width >= 16 && outSz.width >= 8) {
+            neon::calcRowLinear_8UC(dst,
+                                    reinterpret_cast<const uint8_t**>(src0),
+                                    reinterpret_cast<const uint8_t**>(src1),
+                                    reinterpret_cast<const short*>(alpha),
+                                    reinterpret_cast<const short*>(clone),
+                                    reinterpret_cast<const short*>(mapsx),
+                                    reinterpret_cast<const short*>(beta),
+                                    reinterpret_cast<uint8_t*>(tmp),
+                                    inSz, outSz, lpi);
+            return;
+        }
+    }
+#endif  // HAVE_NEON
+
     auto length = out[0].get().length();
 
     for (int l = 0; l < lpi; l++) {
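The NEON branch above mirrors the SSE4.2 dispatch: it only fires when a full 128-bit vector fits in the input row and half a vector fits in the output row, which is what the GAPI_Assert() checks inside calcRowLinear_8UC_Impl_ rely on. A minimal sketch of that guard, assuming 128-bit universal intrinsics (`kNlanes` and `can_use_neon_linear` are placeholder names, not part of the patch):

    constexpr int kNlanes     = 16;           // v_uint8::nlanes for 128-bit NEON registers
    constexpr int kHalfNlanes = kNlanes / 2;  // 8

    inline bool can_use_neon_linear(int in_width, int out_width) {
        // same condition as `inSz.width >= 16 && outSz.width >= 8` above
        return in_width >= kNlanes && out_width >= kHalfNlanes;
    }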
diff --git a/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp b/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp
index c5b843f6c32..a10ed62f79c 100644
--- a/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp
+++ b/inference-engine/thirdparty/ocv/opencv_hal_intrin.hpp
@@ -188,6 +188,16 @@ inline unsigned int trailingZeros32(unsigned int value) {
 // access from within opencv code more accessible
 namespace cv {
 
+#ifndef CV_ALWAYS_INLINE
+#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#define CV_ALWAYS_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define CV_ALWAYS_INLINE __forceinline
+#else
+#define CV_ALWAYS_INLINE inline
+#endif
+#endif
+
 namespace hal {
 
 enum StoreMode
diff --git a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
index 3ee2d3fd3d9..d37d377b012 100644
--- a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
+++ b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp
@@ -2323,7 +2323,157 @@ inline v_uint16x8 v_mulhi(const v_uint16x8& a, uint16_t b) {
     return result;
 }
 
+CV_ALWAYS_INLINE v_int16x8 v_mulhrs(const v_int16x8& a, const v_int16x8& b)
+{
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(a.val),
+                                 vget_low_s16(b.val));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(a.val),
+                                 vget_high_s16(b.val));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return v_int16x8(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+CV_ALWAYS_INLINE v_int16x8 v_mulhrs(const v_int16x8& a, const short b)
+{
+    return v_mulhrs(a, v_setall_s16(b));
+}
+
+CV_ALWAYS_INLINE void v_gather_channel(v_uint8x16& vec, const uint8_t tmp[],
+                                       const short* index, const int chanNum,
+                                       const int c, const int shift)
+{
+    int32x4_t result = {};
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 0)) + c)]), result, 0);
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 1)) + c)]), result, 1);
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 0) + 1) + c)]), result, 2);
+    result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 1) + 1) + c)]), result, 3);
+
+    vec.val = vreinterpretq_u8_s32(result);
+}
+
+template <int chanNum>
+CV_ALWAYS_INLINE void v_gather_channel(v_uint8x16& vec, const uchar src[], const int channel)
+{
+    uint8x16_t result = {};
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + channel), result, 0);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + chanNum + channel), result, 1);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 2 * chanNum + channel), result, 2);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 3 * chanNum + channel), result, 3);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 4 * chanNum + channel), result, 4);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 5 * chanNum + channel), result, 5);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 6 * chanNum + channel), result, 6);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 7 * chanNum + channel), result, 7);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 8 * chanNum + channel), result, 8);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 9 * chanNum + channel), result, 9);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 10 * chanNum + channel), result, 10);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 11 * chanNum + channel), result, 11);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 12 * chanNum + channel), result, 12);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 13 * chanNum + channel), result, 13);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 14 * chanNum + channel), result, 14);
+    result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 15 * chanNum + channel), result, 15);
+
+    vec.val = result;
+}
+
+namespace {
+template <int chanNum>
+CV_ALWAYS_INLINE void v_gather_channel(v_int16x8& vec, const uchar src[], const short* index, const int channel, const int pos)
+{
+    int16x8_t result = {};
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*index + pos) + channel]), result, 0);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 1) + pos) + channel]), result, 1);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 2) + pos) + channel]), result, 2);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 3) + pos) + channel]), result, 3);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 4) + pos) + channel]), result, 4);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 5) + pos) + channel]), result, 5);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 6) + pos) + channel]), result, 6);
+    result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 7) + pos) + channel]), result, 7);
+
+    vec.val = result;
+}
+}  // namespace
+
+template <int imm>
+CV_ALWAYS_INLINE v_uint8x16 v_blend(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const uint16_t _mask[8] = { ((imm) & (1 << 0)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,
+                                ((imm) & (1 << 7)) ? 0xFFFF : 0x0000 };
+    uint16x8_t _mask_vec = vld1q_u16(_mask);
+    uint16x8_t _a = vreinterpretq_u16_u8(a.val);
+    uint16x8_t _b = vreinterpretq_u16_u8(b.val);
+    return v_uint8x16(vreinterpretq_u8_u16(vbslq_u16(_mask_vec, _b, _a)));
+}
+
+CV_ALWAYS_INLINE v_uint8x16 v_shuffle(const v_uint8x16& a, const v_uint8x16& mask)
+{
+    uint8x16_t tbl = a.val;     // input a
+    uint8x16_t idx = mask.val;  // input mask
+    uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return v_uint8x16(vqtbl1q_u8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    uint8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return v_uint8x16(ret);
+#else
+    uint8x8x2_t a_split = { vget_low_u8(tbl), vget_high_u8(tbl) };
+    return v_uint8x16(vcombine_u8(
+        vtbl2_u8(a_split, vget_low_u8(idx_masked)),
+        vtbl2_u8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a, const int imm)
+{
+    uint8x16_t ret = {};
+    if (imm <= 0) {
+        ret = a.val;
+    } else if (imm > 15) {
+        ret = vdupq_n_u8(0);
+    } else {
+        ret = vextq_u8(vdupq_n_u8(0), a.val, 16 - (imm));
+    }
+    return v_uint8x16(ret);
+}
+
+CV_ALWAYS_INLINE v_uint8x16 v_srli_si128(const v_uint8x16& a, const int imm)
+{
+    uint8x16_t ret = {};
+    if (imm <= 0) {
+        ret = a.val;
+    } else if (imm > 15) {
+        ret = vdupq_n_u8(0);
+    } else {
+        ret = vextq_u8(a.val, vdupq_n_u8(0), imm);
+    }
+    return v_uint8x16(ret);
+}
 
 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
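For readers coming from the x86 path: v_shuffle and v_blend<imm> added above are stand-ins for _mm_shuffle_epi8 (pshufb) and _mm_blend_epi16, while v_slli_si128/v_srli_si128 mimic the whole-register byte shifts. A scalar model of the two permute helpers, for reference only (`shuffle_scalar` and `blend16_scalar` are placeholder names, not part of the patch):

    #include <cstdint>
    #include <cstring>

    // v_shuffle: each output byte picks tbl[idx & 15]; a set bit 7 in idx zeroes the byte
    // (vqtbl1q_u8 returns 0 for out-of-range indices, matching pshufb).
    inline void shuffle_scalar(const uint8_t tbl[16], const uint8_t idx[16], uint8_t out[16]) {
        for (int i = 0; i < 16; ++i)
            out[i] = (idx[i] & 0x80) ? 0 : tbl[idx[i] & 0x0F];
    }

    // v_blend<imm>: bit k of imm selects the k-th 16-bit lane from b, otherwise from a.
    inline void blend16_scalar(int imm, const uint8_t a[16], const uint8_t b[16], uint8_t out[16]) {
        for (int k = 0; k < 8; ++k) {
            const uint8_t* src = (imm & (1 << k)) ? b : a;
            std::memcpy(out + 2 * k, src + 2 * k, 2);
        }
    }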