SIMD optimization of 8U3C/4C Resize for ARM via universal intrinsics. (#4331)

* SIMD opt of 8U3C/4C Resize for ARM

* Fix issue.

* Applied comments.

* Applied comments. Step 2.
Anna Khakimova 2021-03-05 19:58:46 +03:00 committed by GitHub
parent 3dd5bfcfdd
commit 3656e1c564
5 changed files with 545 additions and 23 deletions


@@ -4,6 +4,7 @@
#include "ie_preprocess_gapi_kernels.hpp"
#include "ie_preprocess_gapi_kernels_impl.hpp"
#include "ie_preprocess_gapi_kernels_neon.hpp"
#include <arm_neon.h>
@@ -126,6 +127,351 @@ void copyRow_32F(const float in[], float out[], int length) {
copyRow_32F_impl(in, out, length);
}
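// Splits an interleaved chanNum-channel row into separate planes: each channel is
// gathered with v_gather_channel<chanNum> and stored to dst[c][line]. The tail is
// handled by re-running the last full vector (x = width - nlanes), so stores may overlap.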
template<int chanNum>
CV_ALWAYS_INLINE void channels2planes_store(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* src, const int width,
const int nlanes, const int line) {
v_uint8 chan;
int x = 0;
for (;;) {
for (; x <= width - nlanes && x >= 0; x += nlanes) {
for (int c = 0; c < chanNum; ++c) {
v_gather_channel<chanNum>(chan, &src[chanNum * x], c);
vx_store(&dst[c][line][x], chan);
}
}
if (x < width) {
x = width - nlanes;
continue;
}
break;
}
}
CV_ALWAYS_INLINE void vertical_anyLPI(const uchar* src0, const uchar* src1,
uchar* tmp, const int inLength,
const int nlanes, const short beta) {
int w = 0;
const int half_nlanes = nlanes/2;
for (;;) {
for (; w <= inLength - nlanes; w += nlanes) {
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[w]));
v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[w]));
v_int16 s2 = v_reinterpret_as_s16(vx_load_expand(&src0[w + half_nlanes]));
v_int16 s3 = v_reinterpret_as_s16(vx_load_expand(&src1[w + half_nlanes]));
v_int16 res1 = v_mulhrs(s0 - s1, beta) + s1;
v_int16 res2 = v_mulhrs(s2 - s3, beta) + s3;
vx_store(tmp + w, v_pack_u(res1, res2));
}
if (w < inLength) {
w = inLength - nlanes;
continue;
}
break;
}
}
template<int chanNum>
CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* src, const short mapsx[],
const short alpha[], const int nlanes,
const int width, const int line) {
const int half_nlanes = nlanes/2;
v_int16 t0, t1;//, t2, t3;
int x = 0;
for (;;) {
for (; x <= width - half_nlanes && x >= 0; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]);
for (int c = 0; c < chanNum; ++c) {
v_gather_channel<chanNum>(t0, src, &mapsx[x], c, 0);
v_gather_channel<chanNum>(t1, src, &mapsx[x], c, 1);
//v_gather_channel<chanNum>(t2, src, &mapsx[x + half_nlanes], c, 0);
//v_gather_channel<chanNum>(t3, src, &mapsx[x + half_nlanes], c, 1);
v_int16 res1 = v_mulhrs(t0 - t1, a0) + t1;
//v_int16 res2 = v_mulhrs(t2 - t3, a0) + t3;
//vx_store(&dst[c][line][x], v_pack_u(res1, res2));
v_pack_u_store(&dst[c][line][x], res1);
}
}
if (x < width) {
//x = width - nlanes;
x = width - half_nlanes;
continue;
}
break;
}
}
template<int chanNum>
CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* tmp, const short mapsx[], const short clone[],
const int width, const int nlanes) {
v_uint8 val_0, val_1, val_2, val_3;
const int half_nlanes = nlanes / 2;
const int shift = static_cast<int>(half_nlanes / 4);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15 };
v_uint8 hmask = vx_load(_mask_horizontal);
int x = 0;
for (;;) {
for (; x <= width - half_nlanes && x >= 0; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 2)]);
v_int16 a54 = vx_load(&clone[4 * (x + 4)]);
v_int16 a76 = vx_load(&clone[4 * (x + 6)]);
for (int c = 0; c < chanNum; ++c) {
v_gather_channel(val_0, tmp, &mapsx[x], chanNum, c, 0);
v_gather_channel(val_1, tmp, &mapsx[x], chanNum, c, shift);
v_gather_channel(val_2, tmp, &mapsx[x], chanNum, c, shift * 2);
v_gather_channel(val_3, tmp, &mapsx[x], chanNum, c, shift * 3);
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3));
v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0));
v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1));
v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2));
v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_pack_u(r0, r1);
v_uint8 q1 = v_pack_u(r2, r3);
v_uint8 q2 = v_shuffle(q0, hmask);
v_uint8 q3 = v_shuffle(q1, hmask);
v_uint8 q4 = v_blend<0xCC /*0b11001100*/>(q2, v_slli_si128(q3, 4));
v_uint8 q5 = v_blend<0xCC /*0b11001100*/>(v_srli_si128(q2, 4), q3);
v_store_low(&dst[c][0][x], q4);
v_store_high(&dst[c][1][x], q4);
v_store_low(&dst[c][2][x], q5);
v_store_high(&dst[c][3][x], q5);
}
}
if (x < width) {
x = width - half_nlanes;
continue;
}
break;
}
}
template<int chanNum>
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int nlanes = static_cast<int>(v_int8::nlanes);
constexpr int half_nlanes = nlanes / 2;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
// vertical pass
int inLength = inSz.width * chanNum;
GAPI_Assert(inLength >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15 };
v_uint8 vmask = vx_load(_mask_vertical);
int w = 0;
for (;;) {
for (; w <= inLength - half_nlanes && w >= 0; w += half_nlanes) {
v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_pack_u(r0, r1);
v_uint8 q1 = v_pack_u(r2, r3);
v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_slli_si128(q1, 4));
v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_srli_si128(q0, 4), q1);
v_uint8 q4 = v_shuffle(q2, vmask);
v_uint8 q5 = v_shuffle(q3, vmask);
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
if (w < inLength) {
w = inLength - half_nlanes;
continue;
}
break;
}
// horizontal pass
GAPI_Assert(outSz.width >= half_nlanes);
horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width, nlanes);
} else { // if any lpi
int inLength = inSz.width * chanNum;
GAPI_Assert(inLength >= half_nlanes);
GAPI_Assert(outSz.width >= half_nlanes);
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
const uchar* s0 = src0[l];
const uchar* s1 = src1[l];
// vertical pass
vertical_anyLPI(s0, s1, tmp, inLength, nlanes, beta0);
// horizontal pass
horizontal_anyLPI<chanNum>(dst, tmp, mapsx, alpha, nlanes, outSz.width, l);
}
}
} else if (!xRatioEq) {
GAPI_Assert(yRatioEq);
if (4 == lpi) {
int inLength = inSz.width * chanNum;
// vertical pass
GAPI_DbgAssert(inLength >= nlanes);
v_uint8 s0, s1, s2, s3;
int w = 0;
for (;;) {
for (; w <= inLength - nlanes; w += nlanes) {
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[lpi * w], s0, s1, s2, s3);
}
if (w < inLength) {
w = inLength - nlanes;
continue;
}
break;
}
// horizontal pass
GAPI_Assert(outSz.width >= half_nlanes);
horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width, nlanes);
} else { // any LPI
GAPI_Assert(outSz.width >= half_nlanes);
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
// horizontal pass
horizontal_anyLPI<chanNum>(dst, src, mapsx, alpha, nlanes, outSz.width, l);
}
}
} else if (!yRatioEq) {
GAPI_Assert(xRatioEq);
int inLength = inSz.width * chanNum; // inSz.width == outSz.width in this branch
GAPI_Assert(inLength >= half_nlanes);
GAPI_Assert(outSz.width >= nlanes);
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
const uchar* s0 = src0[l];
const uchar* s1 = src1[l];
// vertical pass
vertical_anyLPI(s0, s1, tmp, inLength, nlanes, beta0);
//split channels to planes and store
channels2planes_store<chanNum>(dst, tmp, outSz.width, nlanes, l);
}
} else {
GAPI_Assert(xRatioEq && yRatioEq);
GAPI_Assert(outSz.width >= nlanes);
//split channels to planes and store
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
channels2planes_store<chanNum>(dst, src, outSz.width, nlanes, l);
}
}
}
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3>& dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 3;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 4;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
}
} // namespace neon
} // namespace kernels
} // namespace gapi


@@ -904,34 +904,33 @@ void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 3;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 4;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}


@@ -1223,6 +1223,23 @@ static void calcRowLinearC(const cv::gapi::fluid::View & in,
}
#endif // HAVE_SSE
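// NEON path for 8U inputs: taken only when the row is wide enough for the vector
// kernels (at least 16 input pixels and 8 output pixels).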
#ifdef HAVE_NEON
if (std::is_same<T, uint8_t>::value) {
if (inSz.width >= 16 && outSz.width >= 8) {
neon::calcRowLinear_8UC<numChan>(dst,
reinterpret_cast<const uint8_t**>(src0),
reinterpret_cast<const uint8_t**>(src1),
reinterpret_cast<const short*>(alpha),
reinterpret_cast<const short*>(clone),
reinterpret_cast<const short*>(mapsx),
reinterpret_cast<const short*>(beta),
reinterpret_cast<uint8_t*>(tmp),
inSz, outSz, lpi);
return;
}
}
#endif // HAVE_NEON
auto length = out[0].get().length();
for (int l = 0; l < lpi; l++) {


@@ -188,6 +188,16 @@ inline unsigned int trailingZeros32(unsigned int value) {
// access from within opencv code more accessible
namespace cv {
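// Define CV_ALWAYS_INLINE if the OpenCV headers in use do not provide it:
// force inlining on GCC/Clang and MSVC, plain inline otherwise.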
#ifndef CV_ALWAYS_INLINE
#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
#define CV_ALWAYS_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
#define CV_ALWAYS_INLINE __forceinline
#else
#define CV_ALWAYS_INLINE inline
#endif
#endif
namespace hal {
enum StoreMode


@@ -2323,7 +2323,157 @@ inline v_uint16x8 v_mulhi(const v_uint16x8& a, uint16_t b) {
return result;
}
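// Rounded Q15 multiply-high, the NEON counterpart of SSE _mm_mulhrs_epi16:
// per lane, result[i] = (int16_t)((a[i] * b[i] + (1 << 14)) >> 15).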
CV_ALWAYS_INLINE v_int16x8 v_mulhrs(const v_int16x8& a, const v_int16x8& b)
{
// Multiply
int32x4_t mul_lo = vmull_s16(vget_low_s16(a.val),
vget_low_s16(b.val));
int32x4_t mul_hi = vmull_s16(vget_high_s16(a.val),
vget_high_s16(b.val));
// Rounding narrowing shift right
// narrow = (int16_t)((mul + 16384) >> 15);
int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
// Join together
return v_int16x8(vcombine_s16(narrow_lo, narrow_hi));
}
CV_ALWAYS_INLINE v_int16x8 v_mulhrs(const v_int16x8& a, const short b)
{
return v_mulhrs(a, v_setall_s16(b));
}
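// Gathers channel c for the 4-LPI horizontal pass: tmp interleaves the 4 lines, so every
// pixel/channel occupies 4 consecutive bytes. Lanes 0-1 hold the 4-line groups of pixels
// index[shift] and index[shift + 1]; lanes 2-3 hold their right neighbours (+1).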
CV_ALWAYS_INLINE void v_gather_channel(v_uint8x16& vec, const uint8_t tmp[],
const short* index, const int chanNum,
const int c, const int shift)
{
int32x4_t result = {};
result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 0)) + c)]), result, 0);
result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 1)) + c)]), result, 1);
result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 0) + 1) + c)]), result, 2);
result = vsetq_lane_s32(*reinterpret_cast<const int*>(&tmp[4 * (chanNum * (*(index + shift + 1) + 1) + c)]), result, 3);
vec.val = vreinterpretq_u8_s32(result);
}
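// Gathers one channel of 16 consecutive interleaved pixels (stride chanNum bytes)
// into a u8 vector; used to split interleaved channels into planes.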
template<int chanNum>
CV_ALWAYS_INLINE void v_gather_channel(v_uint8x16& vec, const uchar src[], const int channel)
{
uint8x16_t result = {};
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + channel), result, 0);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + chanNum + channel), result, 1);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 2 * chanNum + channel), result, 2);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 3 * chanNum + channel), result, 3);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 4 * chanNum + channel), result, 4);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 5 * chanNum + channel), result, 5);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 6 * chanNum + channel), result, 6);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 7 * chanNum + channel), result, 7);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 8 * chanNum + channel), result, 8);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 9 * chanNum + channel), result, 9);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 10 * chanNum + channel), result, 10);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 11 * chanNum + channel), result, 11);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 12 * chanNum + channel), result, 12);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 13 * chanNum + channel), result, 13);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 14 * chanNum + channel), result, 14);
result = vsetq_lane_u8(*reinterpret_cast<const uchar*>(src + 15 * chanNum + channel), result, 15);
vec.val = result;
}
namespace {
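// Gathers channel `channel` of 8 pixels addressed by index[0..7] + pos (pos = 0 for the
// left neighbour, 1 for the right one) and widens the bytes into 16-bit lanes.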
template<int chanNum>
CV_ALWAYS_INLINE void v_gather_channel(v_int16x8& vec, const uchar src[], const short* index, const int channel, const int pos)
{
int16x8_t result = {};
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*index + pos) + channel]), result, 0);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 1) + pos) + channel]), result, 1);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 2) + pos) + channel]), result, 2);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 3) + pos) + channel]), result, 3);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 4) + pos) + channel]), result, 4);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 5) + pos) + channel]), result, 5);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 6) + pos) + channel]), result, 6);
result = vsetq_lane_s16(*reinterpret_cast<const uchar*>(&src[chanNum * (*(index + 7) + pos) + channel]), result, 7);
vec.val = result;
}
} // namespace
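// 16-bit blend in the spirit of SSE _mm_blend_epi16: for each of the eight 16-bit lanes,
// the lane comes from b if the corresponding bit of imm is set, otherwise from a.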
template<int imm>
CV_ALWAYS_INLINE v_uint8x16 v_blend(const v_uint8x16& a, const v_uint8x16& b)
{
const uint16_t _mask[8] = { ((imm) & (1 << 0)) ? 0xFFFF : 0x0000,
((imm) & (1 << 1)) ? 0xFFFF : 0x0000,
((imm) & (1 << 2)) ? 0xFFFF : 0x0000,
((imm) & (1 << 3)) ? 0xFFFF : 0x0000,
((imm) & (1 << 4)) ? 0xFFFF : 0x0000,
((imm) & (1 << 5)) ? 0xFFFF : 0x0000,
((imm) & (1 << 6)) ? 0xFFFF : 0x0000,
((imm) & (1 << 7)) ? 0xFFFF : 0x0000 };
uint16x8_t _mask_vec = vld1q_u16(_mask);
uint16x8_t _a = vreinterpretq_u16_u8(a.val);
uint16x8_t _b = vreinterpretq_u16_u8(b.val);
return v_uint8x16(vreinterpretq_u8_u16(vbslq_u16(_mask_vec, _b, _a)));
}
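// Byte shuffle analogous to SSE _mm_shuffle_epi8: result[i] = a[mask[i] & 0x0F],
// or zero when the high bit of mask[i] is set.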
CV_ALWAYS_INLINE v_uint8x16 v_shuffle(const v_uint8x16& a, const v_uint8x16& mask)
{
uint8x16_t tbl = a.val; // input a
uint8x16_t idx = mask.val; // input mask
uint8x16_t idx_masked =
vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
#if defined(__aarch64__)
return v_uint8x16(vqtbl1q_u8(tbl, idx_masked));
#elif defined(__GNUC__)
uint8x16_t ret;
// %e and %f represent the even and odd D registers
// respectively.
__asm__ __volatile__(
"vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
"vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
: [ret] "=&w"(ret)
: [tbl] "w"(tbl), [idx] "w"(idx_masked));
return v_uint8x16(ret);
#else
uint8x8x2_t a_split = { vget_low_u8(tbl), vget_high_u8(tbl) };
return v_uint8x16(vcombine_u8(
vtbl2_u8(a_split, vget_low_u8(idx_masked)),
vtbl2_u8(a_split, vget_high_u8(idx_masked))));
#endif
}
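// Shifts the whole 128-bit register left by imm bytes, zero-filling (cf. _mm_slli_si128).
// Note: vextq_u8 needs an immediate, so imm must fold to a constant after inlining.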
CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a, const int imm)
{
uint8x16_t ret = {};
if (imm <= 0) {
ret = a.val;
} else if (imm > 15) {
ret = vdupq_n_u8(0);
} else {
ret = vextq_u8(vdupq_n_u8(0), a.val, 16 - (imm));
}
return v_uint8x16(ret);
}
CV_ALWAYS_INLINE v_uint8x16 v_srli_si128(const v_uint8x16& a, const int imm)
{
uint8x16_t ret = {};
if (imm <= 0) {
ret = a.val;
} else if (imm > 15) {
ret = vdupq_n_u8(0);
} else {
ret = vextq_u8(a.val, vdupq_n_u8(0), imm);
}
return v_uint8x16(ret);
}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END