Improve performance of the Resize 3c/3p and Resize 8UC1 (#4945)

* Scratch buffer
* Refactoring horizontal pass
* Refactoring horizontal pass. Step 2
* Refactoring horizontal pass. Step 3
* Refactoring vertical pass. Step 2
* Refactoring horizontal pass. Step 4
* Clean
* Applied comments
* Applied comments. Part 2
Anna Khakimova 2021-04-19 21:11:58 +03:00 committed by GitHub
parent 40eba6a2ef
commit 068229c815
4 changed files with 196 additions and 205 deletions

View File

@ -228,20 +228,90 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chan
}
}
CV_ALWAYS_INLINE void vertical_4LPI(const uint8_t* src0[], const uint8_t* src1[],
uchar tmp[], const short beta[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
v_int16 lo1, hi1, lo2, hi2;
v_int32 res1_s32, res2_s32;
int w = 0;
for (;;) {
for (; w <= length - half_nlanes; w += half_nlanes) {
v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_interleave(r0, r1, lo1, hi1);
v_interleave(r2, r3, lo2, hi2);
v_int32 lo1_s32 = v_reinterpret_as_s32(lo1);
v_int32 hi1_s32 = v_reinterpret_as_s32(hi1);
v_int32 lo2_s32 = v_reinterpret_as_s32(lo2);
v_int32 hi2_s32 = v_reinterpret_as_s32(hi2);
v_interleave(lo1_s32, lo2_s32, res1_s32, res2_s32);
v_int16 res1 = v_reinterpret_as_s16(res1_s32);
v_int16 res2 = v_reinterpret_as_s16(res2_s32);
v_pack_u_store(&tmp[4 * w + 0], res1);
v_pack_u_store(&tmp[4 * w + half_nlanes], res2);
v_interleave(hi1_s32, hi2_s32, res1_s32, res2_s32);
v_int16 res3 = v_reinterpret_as_s16(res1_s32);
v_int16 res4 = v_reinterpret_as_s16(res2_s32);
v_pack_u_store(&tmp[4 * w + 2*half_nlanes], res3);
v_pack_u_store(&tmp[4 * w + 3*half_nlanes], res4);
}
if (w < length) {
w = length - half_nlanes;
continue;
}
break;
}
}
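For reference, a scalar model of what vertical_4LPI computes (an editor's sketch, not part of the patch; it assumes v_mulhrs rounds like the SSSE3 mulhrs instruction, i.e. (a*b + 2^14) >> 15). The interleave/pack sequence above stores the four blended lines pixel-interleaved, so tmp[4*w + l] holds line l at source column w and the horizontal pass can consume four lines per iteration:

#include <cstdint>

// Hypothetical scalar equivalent of the SIMD vertical 4-LPI blend above.
static void vertical_4lpi_scalar(const uint8_t* src0[], const uint8_t* src1[],
                                 uint8_t tmp[], const short beta[], int length) {
    for (int w = 0; w < length; ++w) {
        for (int l = 0; l < 4; ++l) {
            int d = int(src0[l][w]) - int(src1[l][w]);
            int v = int(src1[l][w]) + ((d * beta[l] + (1 << 14)) >> 15);  // Q15 blend, as v_mulhrs
            tmp[4 * w + l] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // v_pack_u_store saturates
        }
    }
}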
template<int chanNum>
CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* tmp, const short mapsx[],
const short clone[], const int length) {
const uchar* tmp, const short mapsx[], const uchar _mask_horizontal[],
const short clone[],
const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
const int half_nlanes = nlanes / 2;
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
const int shift = static_cast<int>(half_nlanes / 4);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15 };
v_uint8 hmask = vx_load(_mask_horizontal);
v_uint8 val_0, val_1, val_2, val_3;
int x = 0;
for (;;) {
for (; x <= length - half_nlanes && x >= 0; x += half_nlanes) {
@ -315,71 +385,19 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
static_assert(v_uint8::nlanes == 16,
"The width of a NEON vector is 128 bits, so one vector contains 16 uchars");
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
int inLength = inSz.width * chanNum;
GAPI_Assert(inLength >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15 };
v_uint8 vmask = vx_load(_mask_vertical);
int w = 0;
for (;;) {
for (; w <= inLength - half_nlanes && w >= 0; w += half_nlanes) {
v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_pack_u(r0, r1);
v_uint8 q1 = v_pack_u(r2, r3);
v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1));
v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1);
v_uint8 q4 = v_shuffle(q2, vmask);
v_uint8 q5 = v_shuffle(q3, vmask);
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
if (w < inLength) {
w = inLength - half_nlanes;
continue;
}
break;
}
vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum);
// horizontal pass
horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width);
horizontal_4LPI<chanNum>(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
} else { // if any lpi
int inLength = inSz.width * chanNum;
@ -397,6 +415,8 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
}
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
int inLength = inSz.width * chanNum;
@ -422,7 +442,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
}
// horizontal pass
horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width);
horizontal_4LPI<chanNum>(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
@ -469,9 +489,8 @@ void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3>& dst,
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 3;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
calcRowLinear_8UC_Impl_<3>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 8UC4)
@ -486,20 +505,18 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int chanNum = 4;
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
calcRowLinear_8UC_Impl_<4>(dst, src0, src1, alpha, clone, mapsx,
beta, tmp, inSz, outSz, lpi);
}
CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
const uchar* tmp, const short mapsx[],
const uchar _mask_horizontal[],
const short clone[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
const int half_nlanes = nlanes / 2;
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
v_uint8 hmask = vx_load(_mask_horizontal);
int x = 0;
for (;;) {
@ -557,12 +574,11 @@ CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
}
}
CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[],
CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst,
const uchar* src, const short mapsx[],
const short alpha[], const int length,
const int line) {
const short alpha[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
const int half_nlanes = nlanes / 2;
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
v_int16 t0, t1;
int x = 0;
@ -573,7 +589,7 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[],
v_deinterleave_expand(t, t0, t1);
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[line][x], d);
v_pack_u_store(&dst[x], d);
}
if (x < length) {
@ -608,79 +624,34 @@ void calcRowLinear_8UC1(uint8_t* dst[],
if (!xRatioEq && !yRatioEq) {
GAPI_Assert(inSz.width >= half_nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15 };
v_uint8 vmask = vx_load(_mask_vertical);
int w = 0;
for (;;) {
for (; w <= inSz.width - half_nlanes; w += half_nlanes) {
v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
v_int16 r0 = v_add_wrap(val1_0, t0);
v_int16 r1 = v_add_wrap(val1_1, t1);
v_int16 r2 = v_add_wrap(val1_2, t2);
v_int16 r3 = v_add_wrap(val1_3, t3);
v_uint8 q0 = v_pack_u(r0, r1);
v_uint8 q1 = v_pack_u(r2, r3);
v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1));
v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1);
v_uint8 q4 = v_shuffle(q2, vmask);
v_uint8 q5 = v_shuffle(q3, vmask);
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
if (w < inSz.width) {
w = inSz.width - half_nlanes;
continue;
}
break;
}
vertical_4LPI(src0, src1, tmp, beta, inSz.width);
// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
} else { // if any lpi
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
const uchar* s0 = src0[l];
const uchar* s1 = src1[l];
uchar* _dst = dst[l];
// vertical pass
vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);
// horizontal pass
horizontal_anyLPI(dst, tmp, mapsx, alpha, outSz.width, l);
horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
GAPI_Assert(inSz.width >= nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
@ -702,14 +673,15 @@ void calcRowLinear_8UC1(uint8_t* dst[],
}
// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
} else { // any LPI
GAPI_Assert(outSz.width >= half_nlanes);
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
uchar* _dst = dst[l];
// horizontal pass
horizontal_anyLPI(dst, src, mapsx, alpha, outSz.width, l);
horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width);
}
}
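The horizontal passes follow the same fixed-point scheme. A scalar model of horizontal_anyLPI (editor's sketch, not taken from the patch): mapsx[x] is the left source column for output column x, alpha[x] its Q15 weight, and the same v_mulhrs-style rounding is assumed:

#include <cstdint>

// Hypothetical scalar equivalent of horizontal_anyLPI for one output row.
static void horizontal_anylpi_scalar(uint8_t* dst, const uint8_t* src,
                                     const short mapsx[], const short alpha[], int length) {
    for (int x = 0; x < length; ++x) {
        int sx = mapsx[x];                                              // left neighbour; sx+1 is the right one
        int d  = int(src[sx]) - int(src[sx + 1]);
        int v  = int(src[sx + 1]) + ((d * alpha[x] + (1 << 14)) >> 15); // Q15 blend
        dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}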

View File

@ -895,7 +895,7 @@ struct linearScratchDesc {
tmp = reinterpret_cast<T*> (mapsy + outH*2);
}
static int bufSize(int inW, int inH, int outW, int outH, int lpi) {
static int bufSize(int inW, int /*inH*/, int outW, int outH, int lpi) {
auto size = outW * sizeof(alpha_t) +
outW * sizeof(alpha_t) * 4 + // alpha clones // previous alpha is redundant?
outW * sizeof(index_t) +
@ -910,7 +910,7 @@ struct linearScratchDesc {
template<typename T, typename Mapper, int chanNum = 1>
static void initScratchLinear(const cv::GMatDesc& in,
const Size& outSz,
cv::gapi::fluid::Buffer& scratch,
cv::gapi::fluid::Buffer& scratch,
int lpi) {
using alpha_type = typename Mapper::alpha_type;
static const auto unity = Mapper::unity;
@ -1171,7 +1171,7 @@ static void calcRowLinear(const cv::gapi::fluid::View & in,
template<typename T, class Mapper, int numChan>
static void calcRowLinearC(const cv::gapi::fluid::View & in,
std::array<std::reference_wrapper<cv::gapi::fluid::Buffer>, numChan>& out,
cv::gapi::fluid::Buffer& scratch) {
cv::gapi::fluid::Buffer& scratch) {
using alpha_type = typename Mapper::alpha_type;
auto inSz = in.meta().size;

View File

@ -18,12 +18,12 @@ namespace gapi {
namespace kernels {
inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
uint8_t out[], int length) {
CV_ALWAYS_INLINE void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
uint8_t out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -46,12 +46,12 @@ inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
}
}
inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
const uint8_t in2[], uint8_t out[], int length) {
CV_ALWAYS_INLINE void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
const uint8_t in2[], uint8_t out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -75,12 +75,13 @@ inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
}
}
inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
const uint8_t in3[], uint8_t out[], int length) {
CV_ALWAYS_INLINE void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[],
const uint8_t in2[], const uint8_t in3[],
uint8_t out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -106,12 +107,12 @@ inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const u
}
}
inline void mergeRow_32FC2_Impl(const float in0[], const float in1[],
float out[], int length) {
CV_ALWAYS_INLINE void mergeRow_32FC2_Impl(const float in0[], const float in1[],
float out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -133,12 +134,12 @@ inline void mergeRow_32FC2_Impl(const float in0[], const float in1[],
}
}
inline void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[],
float out[], int length) {
CV_ALWAYS_INLINE void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[],
float out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -162,13 +163,13 @@ inline void mergeRow_32FC3_Impl(const float in0[], const float in1[], const floa
}
}
inline void mergeRow_32FC4_Impl(const float in0[], const float in1[],
const float in2[], const float in3[],
float out[], int length) {
CV_ALWAYS_INLINE void mergeRow_32FC4_Impl(const float in0[], const float in1[],
const float in2[], const float in3[],
float out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -196,12 +197,12 @@ inline void mergeRow_32FC4_Impl(const float in0[], const float in1[],
//------------------------------------------------------------------------------
inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
uint8_t out1[], int length) {
CV_ALWAYS_INLINE void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
uint8_t out1[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -223,12 +224,12 @@ inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
}
}
inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
uint8_t out1[], uint8_t out2[], int length) {
CV_ALWAYS_INLINE void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
uint8_t out1[], uint8_t out2[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -252,12 +253,12 @@ inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
}
}
inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[],
uint8_t out2[], uint8_t out3[], int length) {
CV_ALWAYS_INLINE void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[],
uint8_t out2[], uint8_t out3[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -283,12 +284,12 @@ inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[
}
}
inline void splitRow_32FC2_Impl(const float in[], float out0[],
CV_ALWAYS_INLINE void splitRow_32FC2_Impl(const float in[], float out0[],
float out1[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -311,12 +312,12 @@ inline void splitRow_32FC2_Impl(const float in[], float out0[],
}
}
inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
float out2[], int length) {
CV_ALWAYS_INLINE void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
float out2[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -340,12 +341,12 @@ inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
}
}
inline void splitRow_32FC4_Impl(const float in[], float out0[], float out1[],
float out2[], float out3[], int length) {
CV_ALWAYS_INLINE void splitRow_32FC4_Impl(const float in[], float out0[], float out1[],
float out2[], float out3[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
cycle:
for (; l <= length - nlanes; l += nlanes) {
@ -380,7 +381,7 @@ static const int ITUR_BT_601_CVG = -852492;
static const int ITUR_BT_601_CVR = 1673527;
static const int ITUR_BT_601_SHIFT = 20;
static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) {
CV_ALWAYS_INLINE void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) {
int uu, vv;
uu = static_cast<int>(u) - 128;
vv = static_cast<int>(v) - 128;
@ -390,9 +391,9 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i
buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
}
static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 (&ruv)[4], v_int32 (&guv)[4],
v_int32 (&buv)[4]) {
CV_ALWAYS_INLINE void uvToRGBuv(const v_uint8& u, const v_uint8& v,
v_int32 (&ruv)[4], v_int32 (&guv)[4],
v_int32 (&buv)[4]) {
v_uint8 v128 = vx_setall_u8(128);
v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128));
v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128));
@ -417,8 +418,8 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
}
}
static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, const int buv,
uchar& r, uchar& g, uchar& b) {
CV_ALWAYS_INLINE void yRGBuvToRGB(const uchar vy, const int ruv, const int guv,
const int buv, uchar& r, uchar& g, uchar& b) {
int yy = static_cast<int>(vy);
int y = std::max(0, yy - 16) * ITUR_BT_601_CY;
r = saturate_cast<uchar>((y + ruv) >> ITUR_BT_601_SHIFT);
@ -426,11 +427,11 @@ static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, con
b = saturate_cast<uchar>((y + buv) >> ITUR_BT_601_SHIFT);
}
static inline void yRGBuvToRGB(const v_uint8& vy,
const v_int32 (&ruv)[4],
const v_int32 (&guv)[4],
const v_int32 (&buv)[4],
v_uint8& rr, v_uint8& gg, v_uint8& bb) {
CV_ALWAYS_INLINE void yRGBuvToRGB(const v_uint8& vy,
const v_int32 (&ruv)[4],
const v_int32 (&guv)[4],
const v_int32 (&buv)[4],
v_uint8& rr, v_uint8& gg, v_uint8& bb) {
v_uint8 v16 = vx_setall_u8(16);
v_uint8 posY = vy - v16;
v_uint16 yy0, yy1;
@ -463,15 +464,14 @@ static inline void yRGBuvToRGB(const v_uint8& vy,
bb = v_pack_u(b0, b1);
}
inline void calculate_nv12_to_rgb_impl(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width) {
CV_ALWAYS_INLINE void calculate_nv12_to_rgb_impl(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width) {
int i = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
for ( ; i <= width - 2*nlanes; i += 2*nlanes) {
v_uint8 u, v;
@ -535,14 +535,13 @@ inline void calculate_nv12_to_rgb_impl(const uchar **srcY,
}
}
inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU,
const uchar *srcV, uchar **dstRGBx,
int width) {
CV_ALWAYS_INLINE void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU,
const uchar *srcV, uchar **dstRGBx,
int width) {
int i = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
for ( ; i <= width - 2*nlanes; i += 2*nlanes) {
v_uint8 u = vx_load(srcU + i/2);
@ -610,8 +609,8 @@ inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU,
// vertical pass
template<typename T, typename A, typename I, typename W>
static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap, A yalpha,
W vbuf[]) {
CV_ALWAYS_INLINE void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap,
A yalpha, W vbuf[]) {
int y_1st = ymap.index0;
int ylast = ymap.index1 - 1;
@ -619,7 +618,7 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ym
GAPI_DbgAssert(y_1st < ylast);
#if MANUAL_SIMD
const int nlanes = v_uint16::nlanes;
constexpr int nlanes = v_uint16::nlanes;
#endif
// 1st and last rows
@ -667,8 +666,8 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ym
// horizontal pass
template<typename T, typename A, typename I, typename W>
static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[],
const A xalpha[], const W vbuf[]) {
CV_ALWAYS_INLINE void downx(T dst[], int outWidth, int xmaxdf, const I xindex[],
const A xalpha[], const W vbuf[]) {
// TO DO: try lambda here
#define HSUM(xmaxdf) \
for (int x = 0; x < outWidth; x++) { \
@ -704,9 +703,11 @@ static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[],
}
template<typename T, typename A, typename I, typename W>
static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Size& outSz,
A yalpha, const MapperUnit<A, I>& ymap, int xmaxdf, const I xindex[], const A xalpha[],
W vbuf[]) {
CV_ALWAYS_INLINE void calcRowArea_impl(T dst[], const T *src[], const Size& inSz,
const Size& outSz, A yalpha,
const MapperUnit<A, I>& ymap, int xmaxdf,
const I xindex[], const A xalpha[],
W vbuf[]) {
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
@ -738,18 +739,18 @@ static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Si
#if MANUAL_SIMD
template <typename VecT, typename T>
void copyRow_impl(const T in[], T out[], int l) {
CV_ALWAYS_INLINE void copyRow_impl(const T in[], T out[], int l) {
VecT r;
r = vx_load(&in[l]);
vx_store(&out[l], r);
}
#endif
inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) {
CV_ALWAYS_INLINE void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_uint8::nlanes;
constexpr int nlanes = v_uint8::nlanes;
for (; l <= length - nlanes; l += nlanes) {
copyRow_impl<v_uint8>(in, out, l);
@ -766,11 +767,11 @@ inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) {
}
}
inline void copyRow_32F_impl(const float in[], float out[], int length) {
CV_ALWAYS_INLINE void copyRow_32F_impl(const float in[], float out[], int length) {
int l = 0;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
for (; l <= length - nlanes; l += nlanes) {
copyRow_impl<v_float32>(in, out, l);
@ -801,7 +802,7 @@ CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[],
bool yRatioEq1 = inSz.height == outSz.height;
#if MANUAL_SIMD
const int nlanes = v_float32::nlanes;
constexpr int nlanes = v_float32::nlanes;
#endif
if (!xRatioEq1 && !yRatioEq1) {
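A pattern that recurs in all of these kernels (the cycle: labels in this file, and the "w = length - half_nlanes; continue;" blocks in the resize kernels) is to finish a row by stepping back and re-running one full, possibly overlapping vector instead of falling through to a scalar tail; the GAPI_Assert(length >= half_nlanes) checks in the resize kernels are what make the step-back safe. A standalone sketch with hypothetical names, using memcpy in place of vx_load/vx_store:

#include <cstdint>
#include <cstring>

// Illustration of the overlap-the-tail loop; the last iteration may rewrite up to
// nlanes-1 already-written elements, which is harmless for element-wise kernels.
static void copy_row_sketch(const uint8_t in[], uint8_t out[], int length) {
    constexpr int nlanes = 16;                        // v_uint8::nlanes on 128-bit SIMD
    int l = 0;
    for (;;) {
        for (; l <= length - nlanes; l += nlanes) {
            std::memcpy(&out[l], &in[l], nlanes);     // stands in for vx_store(vx_load(...))
        }
        if (l < length) {                             // leftover tail shorter than one vector
            l = length - nlanes;                      // step back to a full, overlapping vector
            continue;
        }
        break;
    }
}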

View File

@ -2606,6 +2606,24 @@ CV_ALWAYS_INLINE v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8
return v_uint8x16(v);
}
CV_ALWAYS_INLINE void v_interleave(const v_int16x8& a, const v_int16x8& b,
v_int16x8& v1, v_int16x8& v2)
{
int16x8x2_t p = vzipq_s16(a.val, b.val);
v1.val = p.val[0];
v2.val = p.val[1];
return;
}
CV_ALWAYS_INLINE void v_interleave(const v_int32x4& a, const v_int32x4& b,
v_int32x4& v1, v_int32x4& v2)
{
int32x4x2_t p = vzipq_s32(a.val, b.val);
v1.val = p.val[0];
v2.val = p.val[1];
return;
}
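The new v_interleave overloads wrap NEON's vzipq: the low halves of a and b are zipped lane by lane into v1, the high halves into v2. A scalar model of the 16-bit variant (illustration only, not part of the header):

// v1 = {a0,b0,a1,b1,a2,b2,a3,b3}, v2 = {a4,b4,a5,b5,a6,b6,a7,b7}
static void interleave_s16_model(const short a[8], const short b[8], short v1[8], short v2[8]) {
    for (int i = 0; i < 4; ++i) {
        v1[2 * i]     = a[i];
        v1[2 * i + 1] = b[i];
        v2[2 * i]     = a[i + 4];
        v2[2 * i + 1] = b[i + 4];
    }
}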
template<int shift>
CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a)
{