Improve performance of the Resize 3c/3p and Resize 8UC1 (#4945)
* scratch buffer
* Refactoring horizontal path
* Refactoring horizontal pass. Step2
* Refactoring horizontal pass. Step 3
* Refactoring vertical pass. Step2
* Refactoring horizontal pass. Step4
* Clean
* Applied comments.
* Applied comments. Part 2
parent 40eba6a2ef
commit 068229c815
@@ -228,20 +228,90 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chan
     }
 }
 
+CV_ALWAYS_INLINE void vertical_4LPI(const uint8_t* src0[], const uint8_t* src1[],
+                                    uchar tmp[], const short beta[], const int length) {
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
+    constexpr int half_nlanes = nlanes / 2;
+    GAPI_Assert(length >= half_nlanes);
+
+    v_int16 b0 = vx_setall_s16(beta[0]);
+    v_int16 b1 = vx_setall_s16(beta[1]);
+    v_int16 b2 = vx_setall_s16(beta[2]);
+    v_int16 b3 = vx_setall_s16(beta[3]);
+
+    v_int16 lo1, hi1, lo2, hi2;
+    v_int32 res1_s32, res2_s32;
+    int w = 0;
+    for (;;) {
+        for (; w <= length - half_nlanes; w += half_nlanes) {
+            v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
+            v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
+            v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
+            v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
+
+            v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
+            v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
+            v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
+            v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
+
+            v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
+            v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
+            v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
+            v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
+
+            v_int16 r0 = v_add_wrap(val1_0, t0);
+            v_int16 r1 = v_add_wrap(val1_1, t1);
+            v_int16 r2 = v_add_wrap(val1_2, t2);
+            v_int16 r3 = v_add_wrap(val1_3, t3);
+
+            v_interleave(r0, r1, lo1, hi1);
+            v_interleave(r2, r3, lo2, hi2);
+
+            v_int32 lo1_s32 = v_reinterpret_as_s32(lo1);
+            v_int32 hi1_s32 = v_reinterpret_as_s32(hi1);
+            v_int32 lo2_s32 = v_reinterpret_as_s32(lo2);
+            v_int32 hi2_s32 = v_reinterpret_as_s32(hi2);
+
+            v_interleave(lo1_s32, lo2_s32, res1_s32, res2_s32);
+
+            v_int16 res1 = v_reinterpret_as_s16(res1_s32);
+            v_int16 res2 = v_reinterpret_as_s16(res2_s32);
+
+            v_pack_u_store(&tmp[4 * w + 0], res1);
+            v_pack_u_store(&tmp[4 * w + half_nlanes], res2);
+
+            v_interleave(hi1_s32, hi2_s32, res1_s32, res2_s32);
+
+            v_int16 res3 = v_reinterpret_as_s16(res1_s32);
+            v_int16 res4 = v_reinterpret_as_s16(res2_s32);
+
+            v_pack_u_store(&tmp[4 * w + 2*half_nlanes], res3);
+            v_pack_u_store(&tmp[4 * w + 3*half_nlanes], res4);
+        }
+
+        if (w < length) {
+            w = length - half_nlanes;
+            continue;
+        }
+        break;
+    }
+}
+
 template<int chanNum>
 CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
-                                      const uchar* tmp, const short mapsx[],
-                                      const short clone[], const int length) {
+                                      const uchar* tmp, const short mapsx[], const uchar _mask_horizontal[],
+                                      const short clone[],
+                                      const int length) {
     constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
-    const int half_nlanes = nlanes / 2;
+    constexpr int half_nlanes = nlanes / 2;
     GAPI_Assert(length >= half_nlanes);
 
     const int shift = static_cast<int>(half_nlanes / 4);
 
-    uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15 };
     v_uint8 hmask = vx_load(_mask_horizontal);
 
     v_uint8 val_0, val_1, val_2, val_3;
 
     int x = 0;
     for (;;) {
         for (; x <= length - half_nlanes && x >= 0; x += half_nlanes) {
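The helper above folds the 4-lines-per-iteration (LPI) vertical interpolation into one place; the per-call-site copies of this loop are deleted in the hunks below. As a reference for what it computes, here is a scalar model. It is an illustration only: it assumes v_mulhrs is the usual rounding Q15 multiply ((a * b + 0x4000) >> 15, i.e. NEON vqrdmulhq_s16) and that the interleave/pack sequence amounts to storing the four blended lines adjacently, tmp[4 * x + line].

#include <algorithm>
#include <cstdint>

// Scalar sketch of vertical_4LPI (illustration, not the production kernel).
static inline uint8_t saturate_u8(int v) {
    return static_cast<uint8_t>(std::min(255, std::max(0, v)));
}

void vertical_4LPI_scalar(const uint8_t* src0[4], const uint8_t* src1[4],
                          uint8_t tmp[], const short beta[4], int length) {
    for (int x = 0; x < length; ++x) {
        for (int line = 0; line < 4; ++line) {
            int diff = src0[line][x] - src1[line][x];
            int t = (diff * beta[line] + (1 << 14)) >> 15;        // v_mulhrs(diff, beta)
            tmp[4 * x + line] = saturate_u8(src1[line][x] + t);   // interleaved store
        }
    }
}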
@@ -315,71 +385,19 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
     static_assert(v_uint8::nlanes == 16,
                   "The wide of NEON vector is 128 bits, so one vector contains 16 uchars");
     constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
-    constexpr int half_nlanes = nlanes / 2;
 
     bool xRatioEq = inSz.width == outSz.width;
     bool yRatioEq = inSz.height == outSz.height;
 
     if (!xRatioEq && !yRatioEq) {
+        uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
+                                           1, 5, 9, 13, 3, 7, 11, 15 };
         if (4 == lpi) {
             // vertical pass
-            int inLength = inSz.width * chanNum;
-            GAPI_Assert(inLength >= half_nlanes);
-
-            v_int16 b0 = vx_setall_s16(beta[0]);
-            v_int16 b1 = vx_setall_s16(beta[1]);
-            v_int16 b2 = vx_setall_s16(beta[2]);
-            v_int16 b3 = vx_setall_s16(beta[3]);
-
-            uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
-                                             2, 10, 6, 14, 3, 11, 7, 15 };
-            v_uint8 vmask = vx_load(_mask_vertical);
-
-            int w = 0;
-            for (;;) {
-                for (; w <= inLength - half_nlanes && w >= 0; w += half_nlanes) {
-                    v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
-                    v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
-                    v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
-                    v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
-
-                    v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
-                    v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
-                    v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
-                    v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
-
-                    v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
-                    v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
-                    v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
-                    v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
-
-                    v_int16 r0 = v_add_wrap(val1_0, t0);
-                    v_int16 r1 = v_add_wrap(val1_1, t1);
-                    v_int16 r2 = v_add_wrap(val1_2, t2);
-                    v_int16 r3 = v_add_wrap(val1_3, t3);
-
-                    v_uint8 q0 = v_pack_u(r0, r1);
-                    v_uint8 q1 = v_pack_u(r2, r3);
-
-                    v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1));
-                    v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1);
-
-                    v_uint8 q4 = v_shuffle(q2, vmask);
-                    v_uint8 q5 = v_shuffle(q3, vmask);
-
-                    vx_store(&tmp[4 * w + 0], q4);
-                    vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
-                }
-
-                if (w < inLength) {
-                    w = inLength - half_nlanes;
-                    continue;
-                }
-                break;
-            }
+            vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum);
 
             // horizontal pass
-            horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width);
+            horizontal_4LPI<chanNum>(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
         } else {  // if any lpi
             int inLength = inSz.width * chanNum;
 
@@ -397,6 +415,8 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
         }
     } else if (!xRatioEq) {
         GAPI_DbgAssert(yRatioEq);
+        uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
+                                           1, 5, 9, 13, 3, 7, 11, 15 };
 
         if (4 == lpi) {
             int inLength = inSz.width * chanNum;
@@ -422,7 +442,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
             }
 
             // horizontal pass
-            horizontal_4LPI<chanNum>(dst, tmp, mapsx, clone, outSz.width);
+            horizontal_4LPI<chanNum>(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
         } else {  // any LPI
             for (int l = 0; l < lpi; ++l) {
                 const uchar* src = src0[l];
@@ -469,9 +489,8 @@ void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3>& dst,
                       const Size& inSz,
                       const Size& outSz,
                       const int lpi) {
-    constexpr int chanNum = 3;
-    calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
-                                     beta, tmp, inSz, outSz, lpi);
+    calcRowLinear_8UC_Impl_<3>(dst, src0, src1, alpha, clone, mapsx,
+                               beta, tmp, inSz, outSz, lpi);
 }
 
 // Resize (bi-linear, 8UC4)
@@ -486,20 +505,18 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
                       const Size& inSz,
                       const Size& outSz,
                       const int lpi) {
-    constexpr int chanNum = 4;
-    calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx,
-                                     beta, tmp, inSz, outSz, lpi);
+    calcRowLinear_8UC_Impl_<4>(dst, src0, src1, alpha, clone, mapsx,
+                               beta, tmp, inSz, outSz, lpi);
 }
 
 CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
                                       const uchar* tmp, const short mapsx[],
+                                      const uchar _mask_horizontal[],
                                       const short clone[], const int length) {
     constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
-    const int half_nlanes = nlanes / 2;
+    constexpr int half_nlanes = nlanes / 2;
     GAPI_Assert(length >= half_nlanes);
 
-    uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
-                                       1, 5, 9, 13, 3, 7, 11, 15 };
     v_uint8 hmask = vx_load(_mask_horizontal);
     int x = 0;
     for (;;) {
@@ -557,12 +574,11 @@ CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
     }
 }
 
-CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[],
+CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst,
                                         const uchar* src, const short mapsx[],
-                                        const short alpha[], const int length,
-                                        const int line) {
+                                        const short alpha[], const int length) {
     constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
-    const int half_nlanes = nlanes / 2;
+    constexpr int half_nlanes = nlanes / 2;
     GAPI_Assert(length >= half_nlanes);
     v_int16 t0, t1;
     int x = 0;
@@ -573,7 +589,7 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst[],
 
             v_deinterleave_expand(t, t0, t1);
             v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
-            v_pack_u_store(&dst[line][x], d);
+            v_pack_u_store(&dst[x], d);
         }
 
         if (x < length) {
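With the signature change above, horizontal_anyLPI now writes a single output row and the callers (updated in the next hunk) index dst[l] themselves. The per-pixel computation visible here, d = v_mulhrs(t0 - t1, a0) + t1, corresponds to the scalar model below. Illustration only: it assumes mapsx[x] is the left source column for output column x and alpha[x] is its Q15 weight.

#include <algorithm>
#include <cstdint>

// Scalar sketch of one row of the horizontal linear pass (assumptions above).
void horizontal_anyLPI_scalar(uint8_t* dst, const uint8_t* src,
                              const short mapsx[], const short alpha[], int length) {
    for (int x = 0; x < length; ++x) {
        int left  = src[mapsx[x]];        // t0 lane in the SIMD code
        int right = src[mapsx[x] + 1];    // t1 lane in the SIMD code
        int d = ((left - right) * alpha[x] + (1 << 14)) >> 15;   // v_mulhrs
        int v = right + d;
        dst[x] = static_cast<uint8_t>(std::min(255, std::max(0, v)));
    }
}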
@@ -608,79 +624,34 @@ void calcRowLinear_8UC1(uint8_t* dst[],
     if (!xRatioEq && !yRatioEq) {
         GAPI_Assert(inSz.width >= half_nlanes);
 
+        uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
+                                           1, 5, 9, 13, 3, 7, 11, 15 };
         if (4 == lpi) {
             // vertical pass
-            v_int16 b0 = vx_setall_s16(beta[0]);
-            v_int16 b1 = vx_setall_s16(beta[1]);
-            v_int16 b2 = vx_setall_s16(beta[2]);
-            v_int16 b3 = vx_setall_s16(beta[3]);
-
-            uchar _mask_vertical[nlanes] = { 0, 8, 4, 12, 1, 9, 5, 13,
-                                             2, 10, 6, 14, 3, 11, 7, 15 };
-            v_uint8 vmask = vx_load(_mask_vertical);
-
-            int w = 0;
-            for (;;) {
-                for (; w <= inSz.width - half_nlanes; w += half_nlanes) {
-                    v_int16 val0_0 = v_reinterpret_as_s16(vx_load_expand(&src0[0][w]));
-                    v_int16 val0_1 = v_reinterpret_as_s16(vx_load_expand(&src0[1][w]));
-                    v_int16 val0_2 = v_reinterpret_as_s16(vx_load_expand(&src0[2][w]));
-                    v_int16 val0_3 = v_reinterpret_as_s16(vx_load_expand(&src0[3][w]));
-
-                    v_int16 val1_0 = v_reinterpret_as_s16(vx_load_expand(&src1[0][w]));
-                    v_int16 val1_1 = v_reinterpret_as_s16(vx_load_expand(&src1[1][w]));
-                    v_int16 val1_2 = v_reinterpret_as_s16(vx_load_expand(&src1[2][w]));
-                    v_int16 val1_3 = v_reinterpret_as_s16(vx_load_expand(&src1[3][w]));
-
-                    v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0);
-                    v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1);
-                    v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2);
-                    v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3);
-
-                    v_int16 r0 = v_add_wrap(val1_0, t0);
-                    v_int16 r1 = v_add_wrap(val1_1, t1);
-                    v_int16 r2 = v_add_wrap(val1_2, t2);
-                    v_int16 r3 = v_add_wrap(val1_3, t3);
-
-                    v_uint8 q0 = v_pack_u(r0, r1);
-                    v_uint8 q1 = v_pack_u(r2, r3);
-
-                    v_uint8 q2 = v_blend<0xCC /*0b11001100*/>(q0, v_shift_left<4>(q1));
-                    v_uint8 q3 = v_blend<0xCC /*0b11001100*/>(v_shift_right<4>(q0), q1);
-
-                    v_uint8 q4 = v_shuffle(q2, vmask);
-                    v_uint8 q5 = v_shuffle(q3, vmask);
-
-                    vx_store(&tmp[4 * w + 0], q4);
-                    vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
-                }
-
-                if (w < inSz.width) {
-                    w = inSz.width - half_nlanes;
-                    continue;
-                }
-                break;
-            }
+            vertical_4LPI(src0, src1, tmp, beta, inSz.width);
 
             // horizontal pass
-            horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
+            horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
         } else {  // if any lpi
             for (int l = 0; l < lpi; ++l) {
                 short beta0 = beta[l];
                 const uchar* s0 = src0[l];
                 const uchar* s1 = src1[l];
+                uchar* _dst = dst[l];
 
                 // vertical pass
                 vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);
 
                 // horizontal pass
-                horizontal_anyLPI(dst, tmp, mapsx, alpha, outSz.width, l);
+                horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width);
             }
         }  // if lpi == 4
 
     } else if (!xRatioEq) {
         GAPI_DbgAssert(yRatioEq);
         GAPI_Assert(inSz.width >= nlanes);
+        uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
+                                           1, 5, 9, 13, 3, 7, 11, 15 };
 
         if (4 == lpi) {
             // vertical pass
@@ -702,14 +673,15 @@ void calcRowLinear_8UC1(uint8_t* dst[],
             }
 
             // horizontal pass
-            horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
+            horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
         } else {  // any LPI
             GAPI_Assert(outSz.width >= half_nlanes);
             for (int l = 0; l < lpi; ++l) {
                 const uchar* src = src0[l];
+                uchar* _dst = dst[l];
 
                 // horizontal pass
-                horizontal_anyLPI(dst, src, mapsx, alpha, outSz.width, l);
+                horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width);
             }
         }
 
@@ -895,7 +895,7 @@ struct linearScratchDesc {
         tmp = reinterpret_cast<T*> (mapsy + outH*2);
     }
 
-    static int bufSize(int inW, int inH, int outW, int outH, int lpi) {
+    static int bufSize(int inW, int /*inH*/, int outW, int outH, int lpi) {
        auto size = outW * sizeof(alpha_t) +
                    outW * sizeof(alpha_t) * 4 +  // alpha clones // previous alpha is redundant?
                    outW * sizeof(index_t) +
@@ -910,7 +910,7 @@ struct linearScratchDesc {
 template<typename T, typename Mapper, int chanNum = 1>
 static void initScratchLinear(const cv::GMatDesc& in,
                               const Size& outSz,
                               cv::gapi::fluid::Buffer& scratch,
                               int lpi) {
     using alpha_type = typename Mapper::alpha_type;
     static const auto unity = Mapper::unity;
@@ -1171,7 +1171,7 @@ static void calcRowLinear(const cv::gapi::fluid::View & in,
 template<typename T, class Mapper, int numChan>
 static void calcRowLinearC(const cv::gapi::fluid::View & in,
                            std::array<std::reference_wrapper<cv::gapi::fluid::Buffer>, numChan>& out,
                            cv::gapi::fluid::Buffer& scratch) {
     using alpha_type = typename Mapper::alpha_type;
 
     auto inSz = in.meta().size;
@@ -18,12 +18,12 @@ namespace gapi {
 
 namespace kernels {
 
-inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
+CV_ALWAYS_INLINE void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
                                uint8_t out[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = v_uint8::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -46,12 +46,12 @@ inline void mergeRow_8UC2_Impl(const uint8_t in0[], const uint8_t in1[],
     }
 }
 
-inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
+CV_ALWAYS_INLINE void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
                                const uint8_t in2[], uint8_t out[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = v_uint8::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -75,12 +75,13 @@ inline void mergeRow_8UC3_Impl(const uint8_t in0[], const uint8_t in1[],
     }
 }
 
-inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const uint8_t in2[],
-                               const uint8_t in3[], uint8_t out[], int length) {
+CV_ALWAYS_INLINE void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[],
+                                         const uint8_t in2[], const uint8_t in3[],
+                                         uint8_t out[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = v_uint8::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -106,12 +107,12 @@ inline void mergeRow_8UC4_Impl(const uint8_t in0[], const uint8_t in1[], const u
     }
 }
 
-inline void mergeRow_32FC2_Impl(const float in0[], const float in1[],
+CV_ALWAYS_INLINE void mergeRow_32FC2_Impl(const float in0[], const float in1[],
                                 float out[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_float32::nlanes;
+    constexpr int nlanes = v_float32::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -133,12 +134,12 @@ inline void mergeRow_32FC2_Impl(const float in0[], const float in1[],
     }
 }
 
-inline void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[],
+CV_ALWAYS_INLINE void mergeRow_32FC3_Impl(const float in0[], const float in1[], const float in2[],
                                 float out[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_float32::nlanes;
+    constexpr int nlanes = v_float32::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -162,13 +163,13 @@ inline void mergeRow_32FC3_Impl(const float in0[], const float in1[], const floa
     }
 }
 
-inline void mergeRow_32FC4_Impl(const float in0[], const float in1[],
+CV_ALWAYS_INLINE void mergeRow_32FC4_Impl(const float in0[], const float in1[],
                                 const float in2[], const float in3[],
                                 float out[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_float32::nlanes;
+    constexpr int nlanes = v_float32::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -196,12 +197,12 @@ inline void mergeRow_32FC4_Impl(const float in0[], const float in1[],
 
 //------------------------------------------------------------------------------
 
-inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
+CV_ALWAYS_INLINE void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
                                uint8_t out1[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = v_uint8::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -223,12 +224,12 @@ inline void splitRow_8UC2_Impl(const uint8_t in[], uint8_t out0[],
     }
 }
 
-inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
+CV_ALWAYS_INLINE void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
                                uint8_t out1[], uint8_t out2[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = v_uint8::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -252,12 +253,12 @@ inline void splitRow_8UC3_Impl(const uint8_t in[], uint8_t out0[],
     }
 }
 
-inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[],
+CV_ALWAYS_INLINE void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[],
                                uint8_t out2[], uint8_t out3[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = v_uint8::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -283,12 +284,12 @@ inline void splitRow_8UC4_Impl(const uint8_t in[], uint8_t out0[], uint8_t out1[
     }
 }
 
-inline void splitRow_32FC2_Impl(const float in[], float out0[],
+CV_ALWAYS_INLINE void splitRow_32FC2_Impl(const float in[], float out0[],
                                 float out1[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_float32::nlanes;
+    constexpr int nlanes = v_float32::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -311,12 +312,12 @@ inline void splitRow_32FC2_Impl(const float in[], float out0[],
     }
 }
 
-inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
+CV_ALWAYS_INLINE void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
                                 float out2[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_float32::nlanes;
+    constexpr int nlanes = v_float32::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -340,12 +341,12 @@ inline void splitRow_32FC3_Impl(const float in[], float out0[], float out1[],
     }
 }
 
-inline void splitRow_32FC4_Impl(const float in[], float out0[], float out1[],
+CV_ALWAYS_INLINE void splitRow_32FC4_Impl(const float in[], float out0[], float out1[],
                                 float out2[], float out3[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_float32::nlanes;
+    constexpr int nlanes = v_float32::nlanes;
 
 cycle:
     for (; l <= length - nlanes; l += nlanes) {
@@ -380,7 +381,7 @@ static const int ITUR_BT_601_CVG = -852492;
 static const int ITUR_BT_601_CVR = 1673527;
 static const int ITUR_BT_601_SHIFT = 20;
 
-static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) {
+CV_ALWAYS_INLINE void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) {
     int uu, vv;
     uu = static_cast<int>(u) - 128;
     vv = static_cast<int>(v) - 128;
@@ -390,9 +391,9 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i
     buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
 }
 
-static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
+CV_ALWAYS_INLINE void uvToRGBuv(const v_uint8& u, const v_uint8& v,
                              v_int32 (&ruv)[4], v_int32 (&guv)[4],
                              v_int32 (&buv)[4]) {
     v_uint8 v128 = vx_setall_u8(128);
     v_int8 su = v_reinterpret_as_s8(v_sub_wrap(u, v128));
     v_int8 sv = v_reinterpret_as_s8(v_sub_wrap(v, v128));
@@ -417,8 +418,8 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v,
     }
 }
 
-static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, const int buv,
-                               uchar& r, uchar& g, uchar& b) {
+CV_ALWAYS_INLINE void yRGBuvToRGB(const uchar vy, const int ruv, const int guv,
+                                  const int buv, uchar& r, uchar& g, uchar& b) {
     int yy = static_cast<int>(vy);
     int y = std::max(0, yy - 16) * ITUR_BT_601_CY;
     r = saturate_cast<uchar>((y + ruv) >> ITUR_BT_601_SHIFT);
@@ -426,11 +427,11 @@ static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, con
     b = saturate_cast<uchar>((y + buv) >> ITUR_BT_601_SHIFT);
 }
 
-static inline void yRGBuvToRGB(const v_uint8& vy,
+CV_ALWAYS_INLINE void yRGBuvToRGB(const v_uint8& vy,
                                const v_int32 (&ruv)[4],
                                const v_int32 (&guv)[4],
                                const v_int32 (&buv)[4],
                                v_uint8& rr, v_uint8& gg, v_uint8& bb) {
     v_uint8 v16 = vx_setall_u8(16);
     v_uint8 posY = vy - v16;
     v_uint16 yy0, yy1;
@@ -463,15 +464,14 @@ static inline void yRGBuvToRGB(const v_uint8& vy,
     bb = v_pack_u(b0, b1);
 }
 
-inline void calculate_nv12_to_rgb_impl(const uchar **srcY,
+CV_ALWAYS_INLINE void calculate_nv12_to_rgb_impl(const uchar **srcY,
                                        const uchar *srcUV,
                                        uchar **dstRGBx,
                                        int width) {
     int i = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint8::nlanes;
-
+    constexpr int nlanes = v_uint8::nlanes;
     for ( ; i <= width - 2*nlanes; i += 2*nlanes) {
         v_uint8 u, v;
@@ -535,14 +535,13 @@ inline void calculate_nv12_to_rgb_impl(const uchar **srcY,
     }
 }
 
-inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU,
+CV_ALWAYS_INLINE void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU,
                                        const uchar *srcV, uchar **dstRGBx,
                                        int width) {
     int i = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint8::nlanes;
-
+    constexpr int nlanes = v_uint8::nlanes;
     for ( ; i <= width - 2*nlanes; i += 2*nlanes) {
         v_uint8 u = vx_load(srcU + i/2);
@@ -610,8 +609,8 @@ inline void calculate_i420_to_rgb_impl(const uchar **srcY, const uchar *srcU,
 
 // vertical pass
 template<typename T, typename A, typename I, typename W>
-static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap, A yalpha,
-                         W vbuf[]) {
+CV_ALWAYS_INLINE void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap,
+                            A yalpha, W vbuf[]) {
     int y_1st = ymap.index0;
     int ylast = ymap.index1 - 1;
 
@@ -619,7 +618,7 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ym
     GAPI_DbgAssert(y_1st < ylast);
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint16::nlanes;
+    constexpr int nlanes = v_uint16::nlanes;
 #endif
 
     // 1st and last rows
@@ -667,8 +666,8 @@ static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ym
 
 // horizontal pass
 template<typename T, typename A, typename I, typename W>
-static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[],
+CV_ALWAYS_INLINE void downx(T dst[], int outWidth, int xmaxdf, const I xindex[],
                          const A xalpha[], const W vbuf[]) {
     // TO DO: try lambda here
     #define HSUM(xmaxdf) \
     for (int x = 0; x < outWidth; x++) { \
@@ -704,9 +703,11 @@ static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[],
 }
 
 template<typename T, typename A, typename I, typename W>
-static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Size& outSz,
-    A yalpha, const MapperUnit<A, I>& ymap, int xmaxdf, const I xindex[], const A xalpha[],
-    W vbuf[]) {
+CV_ALWAYS_INLINE void calcRowArea_impl(T dst[], const T *src[], const Size& inSz,
+                                       const Size& outSz, A yalpha,
+                                       const MapperUnit<A, I>& ymap, int xmaxdf,
+                                       const I xindex[], const A xalpha[],
+                                       W vbuf[]) {
     bool xRatioEq1 = inSz.width == outSz.width;
     bool yRatioEq1 = inSz.height == outSz.height;
 
@@ -738,18 +739,18 @@ static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Si
 
 #if MANUAL_SIMD
 template <typename VecT, typename T>
-void copyRow_impl(const T in[], T out[], int l) {
+CV_ALWAYS_INLINE void copyRow_impl(const T in[], T out[], int l) {
     VecT r;
     r = vx_load(&in[l]);
     vx_store(&out[l], r);
 }
 #endif
 
-inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) {
+CV_ALWAYS_INLINE void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_uint8::nlanes;
+    constexpr int nlanes = v_uint8::nlanes;
 
     for (; l <= length - nlanes; l += nlanes) {
         copyRow_impl<v_uint8>(in, out, l);
@@ -766,11 +767,11 @@ inline void copyRow_8U_impl(const uint8_t in[], uint8_t out[], int length) {
     }
 }
 
-inline void copyRow_32F_impl(const float in[], float out[], int length) {
+CV_ALWAYS_INLINE void copyRow_32F_impl(const float in[], float out[], int length) {
     int l = 0;
 
 #if MANUAL_SIMD
-    const int nlanes = v_float32::nlanes;
+    constexpr int nlanes = v_float32::nlanes;
 
     for (; l <= length - nlanes; l += nlanes) {
         copyRow_impl<v_float32>(in, out, l);
@@ -801,7 +802,7 @@ CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[],
     bool yRatioEq1 = inSz.height == outSz.height;
 
 #if MANUAL_SIMD
-    const int nlanes = v_float32::nlanes;
+    constexpr int nlanes = v_float32::nlanes;
 #endif
 
     if (!xRatioEq1 && !yRatioEq1) {
@@ -2606,6 +2606,24 @@ CV_ALWAYS_INLINE v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8
     return v_uint8x16(v);
 }
 
+CV_ALWAYS_INLINE void v_interleave(const v_int16x8& a, const v_int16x8& b,
+                                   v_int16x8& v1, v_int16x8& v2)
+{
+    int16x8x2_t p = vzipq_s16(a.val, b.val);
+    v1.val = p.val[0];
+    v2.val = p.val[1];
+    return;
+}
+
+CV_ALWAYS_INLINE void v_interleave(const v_int32x4& a, const v_int32x4& b,
+                                   v_int32x4& v1, v_int32x4& v2)
+{
+    int32x4x2_t p = vzipq_s32(a.val, b.val);
+    v1.val = p.val[0];
+    v2.val = p.val[1];
+    return;
+}
+
 template<int shift>
 CV_ALWAYS_INLINE v_uint8x16 v_slli_si128(const v_uint8x16& a)
 {
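The two v_interleave overloads added here are thin wrappers over the NEON zip intrinsics (vzipq_s16 / vzipq_s32); vertical_4LPI relies on them to interleave the four blended rows. A minimal standalone sketch of the same zip semantics on plain arrays (illustration only, not the universal-intrinsics API):

#include <cstdint>
#include <cstdio>

// Zip two 8-lane int16 "vectors": out1 gets the interleaved low halves,
// out2 the interleaved high halves, mirroring vzipq_s16.
static void zip_s16x8(const int16_t a[8], const int16_t b[8],
                      int16_t out1[8], int16_t out2[8]) {
    for (int i = 0; i < 4; ++i) {
        out1[2 * i]     = a[i];
        out1[2 * i + 1] = b[i];
        out2[2 * i]     = a[i + 4];
        out2[2 * i + 1] = b[i + 4];
    }
}

int main() {
    int16_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    int16_t b[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    int16_t lo[8], hi[8];
    zip_s16x8(a, b, lo, hi);
    for (int i = 0; i < 8; ++i) std::printf("%d ", lo[i]);   // 0 10 1 11 2 12 3 13
    std::printf("\n");
    for (int i = 0; i < 8; ++i) std::printf("%d ", hi[i]);   // 4 14 5 15 6 16 7 17
    std::printf("\n");
    return 0;
}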
|
Loading…
Reference in New Issue
Block a user