Pre-processing: Resize Linear 1C refactoring (#6330)
* Resize 8UC1 refactoring
* Resize 32FC1 refactoring
* Applied comments
parent 123dd1d5ff
commit c24b302c45
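The diff below replaces the per-ISA entry points (`calcRowLinear_8UC1`, `calcRowLinear_32F`) with ISA-tag-dispatched templates (`calcRowLinear8UC1Impl`, `calcRowLinear32FC1Impl`); the 8UC1 variant now returns `bool` so the caller can fall back to a generic path when the row is narrower than the vector width. A minimal, self-contained sketch of that dispatch pattern follows; the reduced parameter list and `calcRowLinearGeneric()` fallback are illustrative only and not part of the change, while the tag types and the bool return mirror the diff.

// Sketch of the ISA-tag dispatch introduced by this refactoring: one templated
// row kernel per ISA tag that returns false when the row is too narrow for the
// vector width, so the caller can fall back to a generic path. The real
// functions also take alpha/clone/mapsx/beta/tmp and Size arguments.
#include <cstdint>
#include <cstring>
#include <iostream>

struct neon_tag {};                 // ISA tag, as in the diff (neon_tag, avx2_tag, avx512_tag)

constexpr int kNeonLanes8U = 16;    // a 128-bit NEON vector holds 16 uchars

template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst, const uint8_t* src,
                           int inWidth, int outWidth);

template<>
bool calcRowLinear8UC1Impl(neon_tag, uint8_t* dst, const uint8_t* src,
                           int inWidth, int outWidth) {
    if (inWidth < kNeonLanes8U || outWidth < kNeonLanes8U / 2)
        return false;                 // too narrow to vectorize: report failure
    std::memcpy(dst, src, outWidth);  // stand-in for the real NEON kernel
    return true;
}

// Hypothetical scalar fallback (not part of the diff).
void calcRowLinearGeneric(uint8_t* dst, const uint8_t* src, int outWidth) {
    std::memcpy(dst, src, outWidth);
}

int main() {
    uint8_t src[8] = {0, 1, 2, 3, 4, 5, 6, 7}, dst[8] = {};
    if (!calcRowLinear8UC1Impl(neon_tag{}, dst, src, 8, 8))
        calcRowLinearGeneric(dst, src, 8);      // width 8 < 16 lanes, so this path runs
    std::cout << static_cast<int>(dst[3]) << "\n";  // prints 3
    return 0;
}

Dispatching on an empty tag type keeps a single declaration in the shared header while each ISA translation unit supplies its own specialization, and the bool return lets narrow images degrade to the generic path instead of asserting.
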
@@ -43,25 +43,12 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}

// Resize (bi-linear, 32F)
void calcRowLinear_32F(float* dst[],
const float* src0[],
const float* src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}

template<int chanNum>
CV_ALWAYS_INLINE void channels2planes_store(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* src, const int width,
const int line) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
GAPI_Assert(width >= nlanes);
GAPI_DbgAssert(width >= nlanes);

v_uint8 chan;
int x = 0;
@@ -85,7 +72,7 @@ CV_ALWAYS_INLINE void vertical_anyLPI(const uchar* src0, const uchar* src1,
uchar* tmp, const int inLength,
const short beta) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
GAPI_Assert(inLength >= nlanes);
GAPI_DbgAssert(inLength >= nlanes);

const int half_nlanes = nlanes/2;
int w = 0;
@@ -116,7 +103,7 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chan
const int line) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
const int half_nlanes = nlanes/2;
GAPI_Assert(width >= half_nlanes);
GAPI_DbgAssert(width >= half_nlanes);

v_int16 t0, t1;//, t2, t3;
int x = 0;
@@ -220,7 +207,7 @@ CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNu
const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
GAPI_DbgAssert(length >= half_nlanes);

const int shift = static_cast<int>(half_nlanes / 4);

@@ -310,7 +297,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum);
neon::vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum);

// horizontal pass
horizontal_4LPI<chanNum>(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
@@ -338,7 +325,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
int inLength = inSz.width * chanNum;

// vertical pass
GAPI_Assert(inLength >= nlanes);
GAPI_DbgAssert(inLength >= nlanes);
v_uint8 s0, s1, s2, s3;
int w = 0;
for (;;) {
@@ -427,12 +414,13 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,

CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
const uchar* tmp, const short mapsx[],
const uchar _mask_horizontal[],
const short clone[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
GAPI_DbgAssert(length >= half_nlanes);

uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
v_uint8 hmask = vx_load(_mask_horizontal);
int x = 0;
for (;;) {
@@ -495,7 +483,8 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst,
const short alpha[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
GAPI_DbgAssert(length >= half_nlanes);

v_int16 t0, t1;
int x = 0;
for (;;) {
@@ -515,39 +504,42 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst,
break;
}
}
} // namespace neon

// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
template<>
bool calcRowLinear8UC1Impl(neon_tag,
uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
static_assert(v_uint8::nlanes == 16,
"The width of the NEON vector is 128 bits, so one vector contains 16 uchars");

constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
constexpr int half_nlanes = v_uint8::nlanes / 2;

if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;

bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;

if (!xRatioEq && !yRatioEq) {
GAPI_Assert(inSz.width >= half_nlanes);

uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
vertical_4LPI(src0, src1, tmp, beta, inSz.width);
neon::vertical_4LPI(src0, src1, tmp, beta, inSz.width);

// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
neon::horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
} else { // if any lpi
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
@@ -556,18 +548,16 @@ void calcRowLinear_8UC1(uint8_t* dst[],
uchar* _dst = dst[l];

// vertical pass
vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);
neon::vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);

// horizontal pass
horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width);
neon::horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width);
}
} // if lpi == 4

} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
GAPI_Assert(inSz.width >= nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
GAPI_DbgAssert(inSz.width >= nlanes);

if (4 == lpi) {
// vertical pass
@@ -589,15 +579,15 @@ void calcRowLinear_8UC1(uint8_t* dst[],
}

// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
neon::horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);

} else { // any LPI
GAPI_Assert(outSz.width >= half_nlanes);
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
uchar* _dst = dst[l];

// horizontal pass
horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width);
neon::horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width);
}
}

@@ -611,7 +601,7 @@ void calcRowLinear_8UC1(uint8_t* dst[],
const uchar* s1 = src1[l];

// vertical pass
vertical_anyLPI(s0, s1, dst[l], length, beta0);
neon::vertical_anyLPI(s0, s1, dst[l], length, beta0);
}

} else {
@@ -622,8 +612,8 @@ void calcRowLinear_8UC1(uint8_t* dst[],
memcpy(dst[l], src0[l], length);
}
}
return true;
}
} // namespace neon

template void chanToPlaneRowImpl(neon_tag, const uint8_t* in, int chan, int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(neon_tag, const float* in, int chan, int chs, float * out, const int length);
@@ -646,6 +636,10 @@ template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::array<cons
template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);

template void calcRowLinear32FC1Impl(neon_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[], const float beta[],
const Size& inSz, const Size& outSz, const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

@@ -28,19 +28,6 @@ void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Si
float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[],
const float xalpha[], float vbuf[]);

// Resize (bi-linear, 8U)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);

// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
@@ -81,17 +68,6 @@ void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
int lpi) {
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}

// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi);
} // namespace neon

template<typename isa_tag_t, typename T>
@@ -131,6 +107,24 @@ extern template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::arr
extern template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);

template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);

template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);

extern template void calcRowLinear32FC1Impl(neon_tag, float* dst[], const float* src0[],
const float* src1[], const float alpha[],
const int mapsx[], const float beta[],
const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

@ -61,17 +61,17 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
|
||||
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
|
||||
}
|
||||
|
||||
static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
|
||||
const v_uint8& val_1,
|
||||
const v_uint8& val_2,
|
||||
const v_uint8& val_3,
|
||||
const v_int16& a10,
|
||||
const v_int16& a32,
|
||||
const v_int16& a54,
|
||||
const v_int16& a76,
|
||||
v_uint8& shuf_mask1,
|
||||
v_uint8& shuf_mask2,
|
||||
v_uint8& res1, v_uint8& res2) {
|
||||
CV_ALWAYS_INLINE void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
|
||||
const v_uint8& val_1,
|
||||
const v_uint8& val_2,
|
||||
const v_uint8& val_3,
|
||||
const v_int16& a10,
|
||||
const v_int16& a32,
|
||||
const v_int16& a54,
|
||||
const v_int16& a76,
|
||||
v_uint8& shuf_mask1,
|
||||
v_uint8& shuf_mask2,
|
||||
v_uint8& res1, v_uint8& res2) {
|
||||
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
|
||||
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
|
||||
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
|
||||
@ -108,17 +108,20 @@ static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
|
||||
res2 = v_shuffle_s8(q7, shuf_mask2);
|
||||
}
|
||||
|
||||
static inline void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
|
||||
uint8_t tmp[], const short beta[],
|
||||
const int& length, const int& half_nlanes) {
|
||||
CV_ALWAYS_INLINE void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
|
||||
uint8_t tmp[], const short beta[],
|
||||
const int& length) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
GAPI_DbgAssert(length >= half_nlanes);
|
||||
|
||||
v_int16 b0 = vx_setall_s16(beta[0]);
|
||||
v_int16 b1 = vx_setall_s16(beta[1]);
|
||||
v_int16 b2 = vx_setall_s16(beta[2]);
|
||||
v_int16 b3 = vx_setall_s16(beta[3]);
|
||||
|
||||
v_uint8 shuf_mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
|
||||
v_uint8 shuf_mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
|
||||
2, 10, 6, 14, 3, 11, 7, 15,
|
||||
0, 8, 4, 12, 1, 9, 5, 13,
|
||||
0, 8, 4, 12, 1, 9, 5, 13,
|
||||
2, 10, 6, 14, 3, 11, 7, 15);
|
||||
for (int w = 0; w < length; ) {
|
||||
for (; w <= length - half_nlanes; w += half_nlanes) {
|
||||
@ -164,63 +167,26 @@ static inline void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* sr
|
||||
}
|
||||
}
|
||||
|
||||
static inline v_uint8 setHorizontalShufMask1() {
|
||||
CV_ALWAYS_INLINE v_uint8 setHorizontalShufMask1() {
|
||||
return v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14,
|
||||
1, 5, 9, 13, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 2, 6, 10, 14,
|
||||
1, 5, 9, 13, 3, 7, 11, 15);
|
||||
}
|
||||
|
||||
static inline v_uint8 setHorizontalShufMask2() {
|
||||
CV_ALWAYS_INLINE v_uint8 setHorizontalShufMask2() {
|
||||
return v_setr_s8(0, 1, 8, 9, 2, 3, 10, 11,
|
||||
4, 5, 12, 13, 6, 7, 14, 15,
|
||||
0, 1, 8, 9, 2, 3, 10, 11,
|
||||
4, 5, 12, 13, 6, 7, 14, 15);
|
||||
}
|
||||
|
||||
static inline void horizontalPass_lpi4_8UC1(const short clone[], const short mapsx[],
|
||||
uint8_t tmp[], uint8_t* dst[], const int& length,
|
||||
const int& half_nlanes) {
|
||||
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
|
||||
constexpr int shift = 4;
|
||||
v_uint8 shuf_mask1 = setHorizontalShufMask1();
|
||||
v_uint8 shuf_mask2 = setHorizontalShufMask2();
|
||||
CV_ALWAYS_INLINE void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
|
||||
uint8_t tmp[], const int& beta0,
|
||||
const int l, const int length1, const int length2) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
GAPI_DbgAssert(length1 >= half_nlanes);
|
||||
|
||||
v_uint32 idxs = v_setr_s32(0, 2, 4, 6, 1, 3, 5, 7);
|
||||
|
||||
for (int x = 0; x < length; ) {
|
||||
for (; x <= length - half_nlanes; x += half_nlanes) {
|
||||
v_int16 a10 = vx_load(&clone[4 * x]);
|
||||
v_int16 a32 = vx_load(&clone[4 * (x + 4)]);
|
||||
v_int16 a54 = vx_load(&clone[4 * (x + 8)]);
|
||||
v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
|
||||
|
||||
v_setr64(val_0, val_1, val_2, val_3, mapsx, tmp, x, shift);
|
||||
val_0 = v_permute32(val_0, idxs);
|
||||
val_1 = v_permute32(val_1, idxs);
|
||||
val_2 = v_permute32(val_2, idxs);
|
||||
val_3 = v_permute32(val_3, idxs);
|
||||
|
||||
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
|
||||
a10, a32, a54, a76,
|
||||
shuf_mask1, shuf_mask2,
|
||||
res1, res2);
|
||||
|
||||
v_store_low(&dst[0][x], res1);
|
||||
v_store_high(&dst[1][x], res1);
|
||||
v_store_low(&dst[2][x], res2);
|
||||
v_store_high(&dst[3][x], res2);
|
||||
}
|
||||
|
||||
if (x < length) {
|
||||
x = length - half_nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
|
||||
uint8_t tmp[], const int& beta0, const int& half_nlanes,
|
||||
const int& l, const int& length1, const int& length2) {
|
||||
for (int w = 0; w < length2; ) {
|
||||
for (; w <= length1 - half_nlanes; w += half_nlanes) {
|
||||
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
|
||||
@ -235,148 +201,25 @@ static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t*
|
||||
}
|
||||
}
|
||||
|
||||
static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
|
||||
uint8_t* dst[], const uchar tmp[], const int& l,
|
||||
const int& half_nlanes, const int& length) {
|
||||
for (int x = 0; x < length; ) {
|
||||
for (; x <= length - half_nlanes; x += half_nlanes) {
|
||||
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
|
||||
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
|
||||
v_uint8 t = v_gather_pairs(tmp, sx); // 16 pairs of src0 pixels
|
||||
v_int16 t0, t1;
|
||||
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
|
||||
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
|
||||
v_pack_u_store(&dst[l][x], d);
|
||||
}
|
||||
|
||||
if (x < length) {
|
||||
x = length - half_nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 8UC1 Resize (bi-linear)
|
||||
void calcRowLinear_8UC1(uint8_t* dst[],
|
||||
const uint8_t* src0[],
|
||||
const uint8_t* src1[],
|
||||
const short alpha[],
|
||||
const short clone[], // 4 clones of alpha
|
||||
const short mapsx[],
|
||||
const short beta[],
|
||||
uint8_t tmp[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
int lpi) {
|
||||
bool xRatioEq = inSz.width == outSz.width;
|
||||
bool yRatioEq = inSz.height == outSz.height;
|
||||
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int half_nlanes = (nlanes / 2);
|
||||
|
||||
if (!xRatioEq && !yRatioEq) {
|
||||
if (4 == lpi) {
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= half_nlanes);
|
||||
verticalPass_lpi4_8U(src0, src1, tmp, beta, inSz.width, half_nlanes);
|
||||
|
||||
// horizontal pass
|
||||
GAPI_DbgAssert(outSz.width >= half_nlanes);
|
||||
horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes);
|
||||
|
||||
} else { // if any lpi
|
||||
int inLength = inSz.width;
|
||||
int outLength = outSz.width;
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
short beta0 = beta[l];
|
||||
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= half_nlanes);
|
||||
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength);
|
||||
|
||||
// horizontal pass
|
||||
GAPI_DbgAssert(outSz.width >= half_nlanes);
|
||||
horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength);
|
||||
}
|
||||
} // if lpi == 4
|
||||
|
||||
} else if (!xRatioEq) {
|
||||
GAPI_DbgAssert(yRatioEq);
|
||||
|
||||
if (4 == lpi) {
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= nlanes);
|
||||
for (int w = 0; w < inSz.width; ) {
|
||||
for (; w <= inSz.width - nlanes; w += nlanes) {
|
||||
v_uint8 s0, s1, s2, s3;
|
||||
s0 = vx_load(&src0[0][w]);
|
||||
s1 = vx_load(&src0[1][w]);
|
||||
s2 = vx_load(&src0[2][w]);
|
||||
s3 = vx_load(&src0[3][w]);
|
||||
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
|
||||
}
|
||||
|
||||
if (w < inSz.width) {
|
||||
w = inSz.width - nlanes;
|
||||
}
|
||||
}
|
||||
|
||||
// horizontal pass
|
||||
GAPI_DbgAssert(outSz.width >= half_nlanes);
|
||||
horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes);
|
||||
|
||||
} else { // any LPI
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
const uchar *src = src0[l];
|
||||
|
||||
// horizontal pass
|
||||
GAPI_DbgAssert(outSz.width >= half_nlanes);
|
||||
horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (!yRatioEq) {
|
||||
GAPI_DbgAssert(xRatioEq);
|
||||
int inLength = inSz.width;
|
||||
int outLength = outSz.width;
|
||||
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
short beta0 = beta[l];
|
||||
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= half_nlanes);
|
||||
verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l,
|
||||
inLength, outLength);
|
||||
}
|
||||
|
||||
} else {
|
||||
GAPI_DbgAssert(xRatioEq && yRatioEq);
|
||||
int length = inSz.width;
|
||||
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
memcpy(dst[l], src0[l], length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int chanNum>
|
||||
void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
|
||||
const uint8_t *src0[],
|
||||
const uint8_t *src1[],
|
||||
const short alpha[],
|
||||
const short clone[], // 4 clones of alpha
|
||||
const short mapsx[],
|
||||
const short beta[],
|
||||
uint8_t tmp[],
|
||||
const Size &inSz,
|
||||
const Size &outSz,
|
||||
int lpi) {
|
||||
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
|
||||
const uint8_t* src0[],
|
||||
const uint8_t* src1[],
|
||||
const short alpha[],
|
||||
const short clone[], // 4 clones of alpha
|
||||
const short mapsx[],
|
||||
const short beta[],
|
||||
uint8_t tmp[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
const int lpi) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
const int shift = (half_nlanes / 4);
|
||||
|
||||
if (4 == lpi) {
|
||||
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
|
||||
verticalPass_lpi4_8U(src0, src1, tmp, beta,
|
||||
inSz.width*chanNum, half_nlanes);
|
||||
inSz.width*chanNum);
|
||||
|
||||
// horizontal pass
|
||||
GAPI_DbgAssert(outSz.width >= half_nlanes);
|
||||
@ -420,8 +263,7 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
|
||||
short beta0 = beta[l];
|
||||
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
|
||||
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l,
|
||||
verticalPass_anylpi_8U(src0, src1, tmp, beta0, l,
|
||||
inSz.width*chanNum, inSz.width*chanNum);
|
||||
|
||||
// horizontal pass
|
||||
@ -480,20 +322,176 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
|
||||
calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
|
||||
}
|
||||
|
||||
void calcRowLinear_32F(float *dst[],
|
||||
const float *src0[],
|
||||
const float *src1[],
|
||||
const float alpha[],
|
||||
const int mapsx[],
|
||||
const float beta[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
int lpi) {
|
||||
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
|
||||
CV_ALWAYS_INLINE void horizontalPass_lpi4_8UC1(const short clone[], const short mapsx[],
|
||||
uint8_t tmp[], uint8_t* dst[], const int& length) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
GAPI_DbgAssert(length >= half_nlanes);
|
||||
|
||||
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
|
||||
constexpr int shift = 4;
|
||||
v_uint8 shuf_mask1 = avx::setHorizontalShufMask1();
|
||||
v_uint8 shuf_mask2 = avx::setHorizontalShufMask2();
|
||||
|
||||
v_uint32 idxs = v_setr_s32(0, 2, 4, 6, 1, 3, 5, 7);
|
||||
|
||||
for (int x = 0; x < length; ) {
|
||||
for (; x <= length - half_nlanes; x += half_nlanes) {
|
||||
v_int16 a10 = vx_load(&clone[4 * x]);
|
||||
v_int16 a32 = vx_load(&clone[4 * (x + 4)]);
|
||||
v_int16 a54 = vx_load(&clone[4 * (x + 8)]);
|
||||
v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
|
||||
|
||||
v_setr64(val_0, val_1, val_2, val_3, mapsx, tmp, x, shift);
|
||||
val_0 = v_permute32(val_0, idxs);
|
||||
val_1 = v_permute32(val_1, idxs);
|
||||
val_2 = v_permute32(val_2, idxs);
|
||||
val_3 = v_permute32(val_3, idxs);
|
||||
|
||||
avx::main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
|
||||
a10, a32, a54, a76,
|
||||
shuf_mask1, shuf_mask2,
|
||||
res1, res2);
|
||||
|
||||
v_store_low(&dst[0][x], res1);
|
||||
v_store_high(&dst[1][x], res1);
|
||||
v_store_low(&dst[2][x], res2);
|
||||
v_store_high(&dst[3][x], res2);
|
||||
}
|
||||
|
||||
if (x < length) {
|
||||
x = length - half_nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CV_ALWAYS_INLINE void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
|
||||
uint8_t* dst[], const uchar tmp[], const int l,
|
||||
const int length) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
GAPI_DbgAssert(length >= half_nlanes);
|
||||
|
||||
v_int16 t0, t1;
|
||||
for (int x = 0; x < length; ) {
|
||||
for (; x <= length - half_nlanes; x += half_nlanes) {
|
||||
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
|
||||
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
|
||||
v_uint8 t = v_gather_pairs(tmp, sx); // 16 pairs of src0 pixels
|
||||
|
||||
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
|
||||
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
|
||||
v_pack_u_store(&dst[l][x], d);
|
||||
}
|
||||
|
||||
if (x < length) {
|
||||
x = length - half_nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace avx
|
||||
|
||||
|
||||
// 8UC1 Resize (bi-linear)
|
||||
template<>
|
||||
bool calcRowLinear8UC1Impl(avx2_tag,
|
||||
uint8_t* dst[],
|
||||
const uint8_t* src0[],
|
||||
const uint8_t* src1[],
|
||||
const short alpha[],
|
||||
const short clone[], // 4 clones of alpha
|
||||
const short mapsx[],
|
||||
const short beta[],
|
||||
uint8_t tmp[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
const int lpi,
|
||||
const int) {
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
|
||||
if (inSz.width < nlanes || outSz.width < half_nlanes)
|
||||
return false;
|
||||
|
||||
bool xRatioEq = inSz.width == outSz.width;
|
||||
bool yRatioEq = inSz.height == outSz.height;
|
||||
|
||||
if (!xRatioEq && !yRatioEq) {
|
||||
if (4 == lpi) {
|
||||
// vertical pass
|
||||
avx::verticalPass_lpi4_8U(src0, src1, tmp, beta, inSz.width);
|
||||
|
||||
// horizontal pass
|
||||
avx::horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width);
|
||||
|
||||
} else { // if any lpi
|
||||
int inLength = inSz.width;
|
||||
int outLength = outSz.width;
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
short beta0 = beta[l];
|
||||
|
||||
// vertical pass
|
||||
avx::verticalPass_anylpi_8U(src0, src1, tmp, beta0, l, inLength, inLength);
|
||||
|
||||
// horizontal pass
|
||||
avx::horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, outLength);
|
||||
}
|
||||
} // if lpi == 4
|
||||
|
||||
} else if (!xRatioEq) {
|
||||
GAPI_DbgAssert(yRatioEq);
|
||||
|
||||
if (4 == lpi) {
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= nlanes);
|
||||
v_uint8 s0, s1, s2, s3;
|
||||
for (int w = 0; w < inSz.width; ) {
|
||||
for (; w <= inSz.width - nlanes; w += nlanes) {
|
||||
s0 = vx_load(&src0[0][w]);
|
||||
s1 = vx_load(&src0[1][w]);
|
||||
s2 = vx_load(&src0[2][w]);
|
||||
s3 = vx_load(&src0[3][w]);
|
||||
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
|
||||
}
|
||||
|
||||
if (w < inSz.width) {
|
||||
w = inSz.width - nlanes;
|
||||
}
|
||||
}
|
||||
|
||||
// horizontal pass
|
||||
avx::horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width);
|
||||
|
||||
} else { // any LPI
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
const uchar* src = src0[l];
|
||||
|
||||
// horizontal pass
|
||||
avx::horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, outSz.width);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (!yRatioEq) {
|
||||
GAPI_DbgAssert(xRatioEq);
|
||||
int inLength = inSz.width;
|
||||
int outLength = outSz.width;
|
||||
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
short beta0 = beta[l];
|
||||
|
||||
// vertical pass
|
||||
avx::verticalPass_anylpi_8U(src0, src1, dst[l], beta0, l, inLength, outLength);
|
||||
}
|
||||
|
||||
} else {
|
||||
GAPI_DbgAssert(xRatioEq && yRatioEq);
|
||||
int length = inSz.width;
|
||||
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
memcpy(dst[l], src0[l], length);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template void chanToPlaneRowImpl(avx2_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
|
||||
template void chanToPlaneRowImpl(avx2_tag, const float* in, const int chan, const int chs, float* out, const int length);
|
||||
|
||||
@ -516,6 +514,11 @@ template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::array<cons
|
||||
template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
|
||||
template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
|
||||
template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
|
||||
template void calcRowLinear32FC1Impl(avx2_tag, float* dst[], const float* src0[], const float* src1[],
|
||||
const float alpha[], const int mapsx[],
|
||||
const float beta[], const Size& inSz, const Size& outSz,
|
||||
const int lpi, const int l);
|
||||
} // namespace kernels
|
||||
} // namespace gapi
|
||||
} // namespace InferenceEngine
|
||||
|
@@ -41,20 +41,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[],
#endif

//-----------------------------------------------------------------------------

// Resize (bi-linear, 8UC1)
void calcRowLinear_8UC1(uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);

// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t* src0[],
@@ -66,7 +52,7 @@ void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
const int lpi);

// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
@@ -79,33 +65,22 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
const int lpi);

template<int numChan>
void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}

// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size & inSz,
const Size & outSz,
int lpi);
} // namespace avx

template<typename isa_tag_t, typename T>
@@ -148,6 +123,23 @@ extern template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::arr
extern template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);

template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);

template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);

extern template void calcRowLinear32FC1Impl(avx2_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

@ -55,10 +55,17 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
|
||||
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
|
||||
}
|
||||
|
||||
static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *src1[],
|
||||
uint8_t tmp[], v_int16& b0, v_int16& b1,
|
||||
v_int16& b2, v_int16& b3, v_uint8& shuf_mask,
|
||||
int half_nlanes, int width) {
|
||||
CV_ALWAYS_INLINE void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
|
||||
uint8_t tmp[], const short beta[], const v_uint8& shuf_mask,
|
||||
const int width) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
GAPI_DbgAssert(width >= half_nlanes);
|
||||
|
||||
v_int16 b0 = vx_setall_s16(beta[0]);
|
||||
v_int16 b1 = vx_setall_s16(beta[1]);
|
||||
v_int16 b2 = vx_setall_s16(beta[2]);
|
||||
v_int16 b3 = vx_setall_s16(beta[3]);
|
||||
|
||||
v_uint32 permute_idxs1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0);
|
||||
v_uint32 permute_idxs2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8);
|
||||
|
||||
@ -86,37 +93,13 @@ static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *sr
|
||||
|
||||
v_uint8 q0 = v_packus(r0, r1);
|
||||
v_uint8 q1 = v_packus(r2, r3);
|
||||
#if 1
|
||||
|
||||
v_uint8 q2 = v_permutex2_s32(q0, q1, permute_idxs1);
|
||||
v_uint8 q3 = v_permutex2_s32(q0, q1, permute_idxs2);
|
||||
|
||||
v_uint8 q4 = v_shuffle_s8(q2, shuf_mask);
|
||||
v_uint8 q5 = v_shuffle_s8(q3, shuf_mask);
|
||||
|
||||
// Second variant of decompose. It'll be useful in the future.
|
||||
#else
|
||||
v_uint8 q2 = v_mblend_shiftleft(q0, q1);
|
||||
v_uint8 q3 = v_mblend_shiftright(q0, q1);
|
||||
|
||||
v_uint8 mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
|
||||
2, 10, 6, 14, 3, 11, 7, 15,
|
||||
0, 8, 4, 12, 1, 9, 5, 13,
|
||||
2, 10, 6, 14, 3, 11, 7, 15,
|
||||
0, 8, 4, 12, 1, 9, 5, 13,
|
||||
2, 10, 6, 14, 3, 11, 7, 15,
|
||||
0, 8, 4, 12, 1, 9, 5, 13,
|
||||
2, 10, 6, 14, 3, 11, 7, 15);
|
||||
|
||||
v_uint8 q4 = v_shuffle_s8(q2, mask1);
|
||||
v_uint8 q5 = v_shuffle_s8(q3, mask1);
|
||||
|
||||
v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0);
|
||||
v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4);
|
||||
|
||||
v_uint8 q6 = v_permutex2_s64(q4, q5, permute_idxs1);
|
||||
v_uint8 q7 = v_permutex2_s64(q4, q5, permute_idxs2);
|
||||
#endif
|
||||
|
||||
vx_store(&tmp[4 * w + 0], q4);
|
||||
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
|
||||
}
|
||||
@ -125,21 +108,21 @@ static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *sr
|
||||
w = width - half_nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
|
||||
const v_uint8& val_1,
|
||||
const v_uint8& val_2,
|
||||
const v_uint8& val_3,
|
||||
const v_int16& a10,
|
||||
const v_int16& a32,
|
||||
const v_int16& a54,
|
||||
const v_int16& a76,
|
||||
v_uint8& shuf_mask1,
|
||||
v_uint8& shuf_mask2,
|
||||
v_uint32& idxs1,
|
||||
v_uint32& idxs2,
|
||||
v_uint8& res1, v_uint8& res2) {
|
||||
CV_ALWAYS_INLINE void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
|
||||
const v_uint8& val_1,
|
||||
const v_uint8& val_2,
|
||||
const v_uint8& val_3,
|
||||
const v_int16& a10,
|
||||
const v_int16& a32,
|
||||
const v_int16& a54,
|
||||
const v_int16& a76,
|
||||
v_uint8& shuf_mask1,
|
||||
v_uint8& shuf_mask2,
|
||||
v_uint32& idxs1,
|
||||
v_uint32& idxs2,
|
||||
v_uint8& res1, v_uint8& res2) {
|
||||
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
|
||||
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
|
||||
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
|
||||
@ -165,91 +148,20 @@ static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
|
||||
|
||||
v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1);
|
||||
v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1);
|
||||
#if 1
|
||||
|
||||
v_uint8 q4 = v_permutex2_s32(q2, q3, idxs1);
|
||||
v_uint8 q5 = v_permutex2_s32(q2, q3, idxs2);
|
||||
|
||||
res1 = v_shuffle_s8(q4, shuf_mask2);
|
||||
res2 = v_shuffle_s8(q5, shuf_mask2);
|
||||
|
||||
// Second variant of decompose. It'll be useful in the future.
|
||||
#else
|
||||
v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
|
||||
v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
|
||||
|
||||
v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
|
||||
|
||||
v_uint8 q6 = v_permute32(idx, q4);
|
||||
v_uint8 q7 = v_permute32(idx, q5);
|
||||
|
||||
v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15);
|
||||
|
||||
v_uint8 q8 = v_shuffle_s8(q6, mask2);
|
||||
v_uint8 q9 = v_shuffle_s8(q7, mask2);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void horizontalPass_lpi4_U8C1(const short clone[], const short mapsx[],
|
||||
uint8_t tmp[], uint8_t *dst[],
|
||||
v_uint8& shuf_mask1,
|
||||
int width, int half_nlanes) {
|
||||
v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15);
|
||||
CV_ALWAYS_INLINE void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
|
||||
uint8_t tmp[], const int beta0,
|
||||
const int l, const int length1, const int length2) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
GAPI_DbgAssert(length1 >= half_nlanes);
|
||||
|
||||
v_uint32 permute_idxs1 = v_set_s32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
|
||||
v_uint32 permute_idxs2 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
|
||||
v_uint32 permute_idxs3 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
|
||||
|
||||
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
|
||||
const int shift = half_nlanes / 4;
|
||||
|
||||
for (int x = 0; x < width; ) {
|
||||
for (; x <= width - half_nlanes; x += half_nlanes) {
|
||||
v_int16 a10 = vx_load(&clone[4 * x]);
|
||||
v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
|
||||
v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
|
||||
v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
|
||||
|
||||
v_set(val_0, val_1, val_2, val_3, tmp, mapsx, x, shift);
|
||||
|
||||
val_0 = v_permute32(val_0, permute_idxs1);
|
||||
val_1 = v_permute32(val_1, permute_idxs1);
|
||||
val_2 = v_permute32(val_2, permute_idxs1);
|
||||
val_3 = v_permute32(val_3, permute_idxs1);
|
||||
|
||||
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
|
||||
a10, a32, a54, a76,
|
||||
shuf_mask1, shuf_mask2,
|
||||
permute_idxs2, permute_idxs3,
|
||||
res1, res2);
|
||||
v_store_low(&dst[0][x], res1);
|
||||
v_store_high(&dst[1][x], res1);
|
||||
v_store_low(&dst[2][x], res2);
|
||||
v_store_high(&dst[3][x], res2);
|
||||
}
|
||||
|
||||
if (x < width) {
|
||||
x = width - half_nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
|
||||
uint8_t tmp[], const int& beta0, const int& half_nlanes,
|
||||
const int& l, const int& length1, const int& length2) {
|
||||
for (int w = 0; w < length2; ) {
|
||||
for (; w <= length1 - half_nlanes; w += half_nlanes) {
|
||||
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
|
||||
@ -264,169 +176,19 @@ static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t*
|
||||
}
|
||||
}
|
||||
|
||||
static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
|
||||
uint8_t* dst[], const uchar tmp[], const int& l,
|
||||
const int& half_nlanes, const int& length) {
|
||||
for (int x = 0; x < length; ) {
|
||||
for (; x <= length - half_nlanes; x += half_nlanes) {
|
||||
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
|
||||
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
|
||||
v_uint8 t = v_gather_pairs(tmp, sx);
|
||||
v_int16 t0, t1;
|
||||
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
|
||||
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
|
||||
v_pack_u_store(&dst[l][x], d);
|
||||
}
|
||||
|
||||
if (x < length) {
|
||||
x = length - half_nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 8UC1 Resize (bi-linear)
|
||||
void calcRowLinear_8UC1(uint8_t * dst[],
|
||||
const uint8_t* src0[],
|
||||
const uint8_t* src1[],
|
||||
const short alpha[],
|
||||
const short clone[], // 4 clones of alpha
|
||||
const short mapsx[],
|
||||
const short beta[],
|
||||
uint8_t tmp[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
int lpi) {
|
||||
bool xRatioEq = inSz.width == outSz.width;
|
||||
bool yRatioEq = inSz.height == outSz.height;
|
||||
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int half_nlanes = (nlanes / 2);
|
||||
|
||||
if (!xRatioEq && !yRatioEq) {
|
||||
if (4 == lpi) {
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= half_nlanes);
|
||||
|
||||
v_int16 b0 = vx_setall_s16(beta[0]);
|
||||
v_int16 b1 = vx_setall_s16(beta[1]);
|
||||
v_int16 b2 = vx_setall_s16(beta[2]);
|
||||
v_int16 b3 = vx_setall_s16(beta[3]);
|
||||
|
||||
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15);
|
||||
|
||||
verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3, shuf_mask1,
|
||||
half_nlanes, inSz.width);
|
||||
|
||||
|
||||
// horizontal pass
|
||||
GAPI_DbgAssert(outSz.width >= half_nlanes);
|
||||
horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
|
||||
outSz.width, half_nlanes);
|
||||
|
||||
} else { // if any lpi
|
||||
int inLength = inSz.width;
|
||||
int outLength = outSz.width;
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
short beta0 = beta[l];
|
||||
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= half_nlanes);
|
||||
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength);
|
||||
|
||||
// horizontal pass
|
||||
GAPI_DbgAssert(outSz.width >= half_nlanes);
|
||||
horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength);
|
||||
}
|
||||
} // if lpi == 4
|
||||
|
||||
} else if (!xRatioEq) {
|
||||
GAPI_DbgAssert(yRatioEq);
|
||||
|
||||
if (4 == lpi) {
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= nlanes);
|
||||
for (int w = 0; w < inSz.width; ) {
|
||||
for (; w <= inSz.width - nlanes; w += nlanes) {
|
||||
v_uint8 s0, s1, s2, s3;
|
||||
s0 = vx_load(&src0[0][w]);
|
||||
s1 = vx_load(&src0[1][w]);
|
||||
s2 = vx_load(&src0[2][w]);
|
||||
s3 = vx_load(&src0[3][w]);
|
||||
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
|
||||
}
|
||||
|
||||
if (w < inSz.width) {
|
||||
w = inSz.width - nlanes;
|
||||
}
|
||||
}
|
||||
|
||||
// horizontal pass
|
||||
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15);
|
||||
|
||||
horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
|
||||
outSz.width, half_nlanes);
|
||||
|
||||
} else { // any LPI
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
const uchar *src = src0[l];
|
||||
|
||||
// horizontal pass
|
||||
GAPI_DbgAssert(outSz.width >= half_nlanes);
|
||||
horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (!yRatioEq) {
|
||||
GAPI_DbgAssert(xRatioEq);
|
||||
int inLength = inSz.width;
|
||||
int outLength = outSz.width;
|
||||
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
short beta0 = beta[l];
|
||||
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= half_nlanes);
|
||||
verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l,
|
||||
inLength, outLength);
|
||||
}
|
||||
|
||||
} else {
|
||||
GAPI_DbgAssert(xRatioEq && yRatioEq);
|
||||
int length = inSz.width;
|
||||
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
memcpy(dst[l], src0[l], length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Resize (bi-linear, 8U, generic number of channels)
|
||||
template<int chanNum>
|
||||
static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
|
||||
const uint8_t *src0[],
|
||||
const uint8_t *src1[],
|
||||
const short alpha[],
|
||||
const short clone[], // 4 clones of alpha
|
||||
const short mapsx[],
|
||||
const short beta[],
|
||||
uint8_t tmp[],
|
||||
const Size &inSz,
|
||||
const Size &outSz,
|
||||
int lpi) {
|
||||
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
|
||||
const uint8_t* src0[],
|
||||
const uint8_t* src1[],
|
||||
const short alpha[],
|
||||
const short clone[], // 4 clones of alpha
|
||||
const short mapsx[],
|
||||
const short beta[],
|
||||
uint8_t tmp[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
const int lpi) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
constexpr int shift = (half_nlanes / 4);
|
||||
|
||||
@ -443,13 +205,8 @@ static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, ch
|
||||
2, 6, 10, 14, 3, 7, 11, 15);
|
||||
|
||||
// vertical pass
|
||||
v_int16 b0 = vx_setall_s16(beta[0]);
|
||||
v_int16 b1 = vx_setall_s16(beta[1]);
|
||||
v_int16 b2 = vx_setall_s16(beta[2]);
|
||||
v_int16 b3 = vx_setall_s16(beta[3]);
|
||||
|
||||
verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3,
|
||||
shuf_mask1, half_nlanes, inSz.width*chanNum);
|
||||
verticalPass_lpi4_8U(src0, src1, tmp, beta,
|
||||
shuf_mask1, inSz.width*chanNum);
|
||||
|
||||
// horizontal pass
|
||||
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
|
||||
@ -502,7 +259,7 @@ static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, ch
|
||||
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
|
||||
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l,
|
||||
verticalPass_anylpi_8U(src0, src1, tmp, beta0, l,
|
||||
inSz.width*chanNum, inSz.width*chanNum);
|
||||
|
||||
// horizontal pass
|
||||
@ -561,19 +318,207 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
|
||||
calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
|
||||
}
|
||||
|
||||
void calcRowLinear_32F(float *dst[],
|
||||
const float *src0[],
|
||||
const float *src1[],
|
||||
const float alpha[],
|
||||
const int mapsx[],
|
||||
const float beta[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
int lpi) {
|
||||
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
|
||||
CV_ALWAYS_INLINE void horizontalPass_lpi4_U8C1(const short clone[], const short mapsx[],
|
||||
uint8_t tmp[], uint8_t* dst[],
|
||||
v_uint8& shuf_mask1,
|
||||
const int width) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
GAPI_DbgAssert(width >= half_nlanes);
|
||||
|
||||
v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15,
|
||||
0, 1, 4, 5, 8, 9, 12, 13,
|
||||
2, 3, 6, 7, 10, 11, 14, 15);
|
||||
|
||||
v_uint32 permute_idxs1 = v_set_s32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
|
||||
v_uint32 permute_idxs2 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
|
||||
v_uint32 permute_idxs3 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
|
||||
|
||||
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
|
||||
const int shift = half_nlanes / 4;
|
||||
|
||||
for (int x = 0; x < width; ) {
|
||||
for (; x <= width - half_nlanes; x += half_nlanes) {
|
||||
v_int16 a10 = vx_load(&clone[4 * x]);
|
||||
v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
|
||||
v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
|
||||
v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
|
||||
|
||||
v_set(val_0, val_1, val_2, val_3, tmp, mapsx, x, shift);
|
||||
|
||||
val_0 = v_permute32(val_0, permute_idxs1);
|
||||
val_1 = v_permute32(val_1, permute_idxs1);
|
||||
val_2 = v_permute32(val_2, permute_idxs1);
|
||||
val_3 = v_permute32(val_3, permute_idxs1);
|
||||
|
||||
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
|
||||
a10, a32, a54, a76,
|
||||
shuf_mask1, shuf_mask2,
|
||||
permute_idxs2, permute_idxs3,
|
||||
res1, res2);
|
||||
v_store_low(&dst[0][x], res1);
|
||||
v_store_high(&dst[1][x], res1);
|
||||
v_store_low(&dst[2][x], res2);
|
||||
v_store_high(&dst[3][x], res2);
|
||||
}
|
||||
|
||||
if (x < width) {
|
||||
x = width - half_nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CV_ALWAYS_INLINE void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
|
||||
uint8_t* dst[], const uchar tmp[], const int l,
|
||||
const int length) {
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
GAPI_DbgAssert(length >= half_nlanes);
|
||||
|
||||
v_int16 t0, t1;
|
||||
for (int x = 0; x < length; ) {
|
||||
for (; x <= length - half_nlanes; x += half_nlanes) {
|
||||
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
|
||||
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
|
||||
v_uint8 t = v_gather_pairs(tmp, sx);
|
||||
|
||||
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
|
||||
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
|
||||
v_pack_u_store(&dst[l][x], d);
|
||||
}
|
||||
|
||||
if (x < length) {
|
||||
x = length - half_nlanes;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace avx512
|
||||
|
||||
// 8UC1 Resize (bi-linear)
|
||||
template<>
|
||||
bool calcRowLinear8UC1Impl(avx512_tag,
|
||||
uint8_t* dst[],
|
||||
const uint8_t* src0[],
|
||||
const uint8_t* src1[],
|
||||
const short alpha[],
|
||||
const short clone[], // 4 clones of alpha
|
||||
const short mapsx[],
|
||||
const short beta[],
|
||||
uint8_t tmp[],
|
||||
const Size& inSz,
|
||||
const Size& outSz,
|
||||
const int lpi,
|
||||
const int) {
|
||||
constexpr int nlanes = v_uint8::nlanes;
|
||||
constexpr int half_nlanes = (v_uint8::nlanes / 2);
|
||||
|
||||
if (inSz.width < nlanes || outSz.width < half_nlanes)
|
||||
return false;
|
||||
|
||||
bool xRatioEq = inSz.width == outSz.width;
|
||||
bool yRatioEq = inSz.height == outSz.height;
|
||||
|
||||
if (!xRatioEq && !yRatioEq) {
|
||||
if (4 == lpi) {
|
||||
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15);
|
||||
// vertical pass
|
||||
avx512::verticalPass_lpi4_8U(src0, src1, tmp, beta, shuf_mask1, inSz.width);
|
||||
|
||||
// horizontal pass
|
||||
avx512::horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
|
||||
outSz.width);
|
||||
|
||||
} else { // if any lpi
|
||||
int inLength = inSz.width;
|
||||
int outLength = outSz.width;
|
||||
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
short beta0 = beta[l];
|
||||
|
||||
// vertical pass
|
||||
avx512::verticalPass_anylpi_8U(src0, src1, tmp, beta0, l, inLength, inLength);
|
||||
|
||||
// horizontal pass
|
||||
avx512::horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, outLength);
|
||||
}
|
||||
} // if lpi == 4
|
||||
|
||||
} else if (!xRatioEq) {
|
||||
GAPI_DbgAssert(yRatioEq);
|
||||
|
||||
if (4 == lpi) {
|
||||
// vertical pass
|
||||
GAPI_DbgAssert(inSz.width >= nlanes);
|
||||
for (int w = 0; w < inSz.width; ) {
|
||||
for (; w <= inSz.width - nlanes; w += nlanes) {
|
||||
v_uint8 s0, s1, s2, s3;
|
||||
s0 = vx_load(&src0[0][w]);
|
||||
s1 = vx_load(&src0[1][w]);
|
||||
s2 = vx_load(&src0[2][w]);
|
||||
s3 = vx_load(&src0[3][w]);
|
||||
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
|
||||
}
|
||||
|
||||
if (w < inSz.width) {
|
||||
w = inSz.width - nlanes;
|
||||
}
|
||||
}
|
||||
|
||||
// horizontal pass
|
||||
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15,
|
||||
0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15);
|
||||
|
||||
avx512::horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
|
||||
outSz.width);
|
||||
|
||||
} else { // any LPI
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
const uchar* src = src0[l];
|
||||
|
||||
// horizontal pass
|
||||
avx512::horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, outSz.width);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (!yRatioEq) {
|
||||
GAPI_DbgAssert(xRatioEq);
|
||||
int inLength = inSz.width;
|
||||
int outLength = outSz.width;
|
||||
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
short beta0 = beta[l];
|
||||
|
||||
// vertical pass
|
||||
avx512::verticalPass_anylpi_8U(src0, src1, dst[l], beta0, l, inLength, outLength);
|
||||
}
|
||||
|
||||
} else {
|
||||
GAPI_DbgAssert(xRatioEq && yRatioEq);
|
||||
int length = inSz.width;
|
||||
|
||||
for (int l = 0; l < lpi; ++l) {
|
||||
memcpy(dst[l], src0[l], length);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template void chanToPlaneRowImpl(avx512_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
|
||||
template void chanToPlaneRowImpl(avx512_tag, const float* in, const int chan, const int chs, float* out, const int length);
|
||||
|
||||
@ -595,6 +540,12 @@ template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std::array<
|
||||
template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
|
||||
template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
|
||||
template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
|
||||
|
||||
template void calcRowLinear32FC1Impl(avx512_tag, float* dst[], const float* src0[],
|
||||
const float* src1[], const float alpha[],
|
||||
const int mapsx[], const float beta[],
|
||||
const Size& inSz, const Size& outSz,
|
||||
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

@ -42,70 +42,46 @@ void calcRowArea_CVKL_U8(const uchar * src[],

//-----------------------------------------------------------------------------

// Resize (bi-linear, 8UC1)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size & inSz,
const Size & outSz,
int lpi);

// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi);
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi);

// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi);
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi);

template<int numChan>
void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
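// calcRowLinear_8UC forwards to the C3/C4 overloads of calcRowLinear_8U above, selecting
// the overload at compile time via the std::integral_constant<int, numChan> tag argument.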

// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size & inSz,
const Size & outSz,
int lpi);
} // namespace avx512

template<typename isa_tag_t, typename T>
@ -145,6 +121,23 @@ extern template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std:
extern template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);

template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);

template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);

extern template void calcRowLinear32FC1Impl(avx512_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine
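For context on how these entry points are intended to be used: calcRowLinear8UC1Impl returns bool so a caller can attempt the ISA-specific SIMD path first and fall back when it declines (for example, when the row is narrower than the vector width). Below is only a minimal sketch of that dispatch pattern, not code from this change; the simplified Size struct and the scalar helper calcRowLinear8UC1Scalar are hypothetical stand-ins.

#include <cstdint>

struct Size { int width, height; };  // simplified stand-in for the real Size type

// Hypothetical scalar reference path (not part of this change).
void calcRowLinear8UC1Scalar(uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
                             const short alpha[], const short mapsx[], const short beta[],
                             const Size& inSz, const Size& outSz, int lpi);

// Declaration mirroring the one above, repeated so the sketch is self-contained.
template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
                           const short alpha[], const short clone[], const short mapsx[],
                           const short beta[], uint8_t tmp[], const Size& inSz,
                           const Size& outSz, const int lpi, const int l);

template<typename isa_tag_t>
void resizeRow8UC1(isa_tag_t tag, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
                   const short alpha[], const short clone[], const short mapsx[],
                   const short beta[], uint8_t tmp[], const Size& inSz, const Size& outSz,
                   const int lpi, const int l) {
    // Try the vectorized implementation first; it reports false when the widths
    // are below the SIMD register width.
    if (!calcRowLinear8UC1Impl(tag, dst, src0, src1, alpha, clone, mapsx, beta,
                               tmp, inSz, outSz, lpi, l)) {
        calcRowLinear8UC1Scalar(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
    }
}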
@ -56,17 +56,26 @@ namespace gapi {
namespace kernels {

// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi) {
template<>
bool calcRowLinear8UC1Impl(sse42_tag,
uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (v_uint8::nlanes / 2);

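// Not enough pixels for a full SIMD register: report failure; the caller is expected
// to fall back to another implementation for this case.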
if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;

bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;

@ -503,6 +512,7 @@ void calcRowLinear_8UC1(uint8_t *dst[],
memcpy(dst[l], src0[l], length);
}
}
return true;
}

// Resize 3C/4C: the universal-intrinsic implementation for SSE42 is sometimes a bit slower than the original one.
@ -934,19 +944,6 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}

// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}

//------------------------------------------------------------------------------

void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz,
@ -1289,6 +1286,11 @@ template void mergeRowImpl<sse42_tag, uchar, 3>(sse42_tag, const std::array<cons
template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<sse42_tag, uchar, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);

template void calcRowLinear32FC1Impl(sse42_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

@ -41,19 +41,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[],

//----------------------------------------------------------------------

// Resize (bi-linear, 8U)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);

// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
@ -95,17 +82,6 @@ void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}

// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size & inSz,
const Size & outSz,
int lpi);

template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs,
T* out, const int length);
@ -145,6 +121,23 @@ extern template void mergeRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const std::a
extern template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);

template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);

template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);

extern template void calcRowLinear32FC1Impl(sse42_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine
File diff suppressed because it is too large
@ -577,15 +577,18 @@ CV_ALWAYS_INLINE void copyRow_Impl(const T in[], T out[], int length) {
}

// Resize (bi-linear, 32FC1)
CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi) {
template<typename isa_tag_t>
CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(isa_tag_t,
float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
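// The trailing parameter is left unnamed: it corresponds to the 'l' line index in the
// header declaration but is not used by this generic implementation.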
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;

@ -714,7 +717,8 @@ template<typename isa_tag_t> struct vector_type_of<isa_tag_t, uint8_t> { using t
template<typename isa_tag_t> struct vector_type_of<isa_tag_t, float> { using type = v_float32;};
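// vector_type_of_t<isa_tag_t, T> resolves to the universal-intrinsic vector type
// (v_uint8 for uint8_t, v_float32 for float) used by copyRow_Impl in chanToPlaneRowImpl below.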

template<typename isa_tag_t, typename T>
CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length) {
CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan,
const int chs, T* out, const int length) {
if (chs == 1) {
copyRow_Impl<vector_type_of_t<isa_tag_t, T>, T>(in, out, length);
return;