Pre-processing: Resize Linear 1C refactoring (#6330)

* Resize 8UC1 refactoring

* Resize 32FC1 refactoring

* Applied comments
Anna Khakimova 2021-06-24 09:39:09 +03:00 committed by GitHub
parent 123dd1d5ff
commit c24b302c45
10 changed files with 1301 additions and 1242 deletions
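This refactoring replaces the per-ISA free functions calcRowLinear_8UC1 and calcRowLinear_32F with ISA-tag dispatched templates calcRowLinear8UC1Impl and calcRowLinear32FC1Impl; the 8UC1 kernel now returns a bool, reporting false when the row is narrower than one vector so the caller can fall back. Below is a minimal sketch of that calling pattern; only the kernel declaration follows the diff, while the caller name, the scalar fallback, and the stand-in Size/neon_tag types are hypothetical illustrations.

// Sketch of the tag-dispatched entry point introduced by this PR (declaration as in the diff).
#include <cstdint>

// Minimal stand-ins so the sketch is self-contained; the real types live in the G-API headers.
struct Size { int width, height; };
struct neon_tag {};

template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
                           const short alpha[], const short clone[], const short mapsx[],
                           const short beta[], uint8_t tmp[], const Size& inSz,
                           const Size& outSz, const int lpi, const int l);

// Hypothetical caller (not from the diff): try the vector kernel first; it returns false
// when inSz.width < nlanes or outSz.width < half_nlanes, and the caller then falls back.
template<typename isa_tag_t>
void runResize8UC1(isa_tag_t tag, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
                   const short alpha[], const short clone[], const short mapsx[],
                   const short beta[], uint8_t tmp[], const Size& inSz, const Size& outSz,
                   const int lpi) {
    if (!calcRowLinear8UC1Impl(tag, dst, src0, src1, alpha, clone, mapsx, beta,
                               tmp, inSz, outSz, lpi, 0)) {
        // scalarResize8UC1(...);  // fallback path, provided elsewhere in the pipeline
    }
}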

View File

@ -43,25 +43,12 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float* dst[],
const float* src0[],
const float* src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}
template<int chanNum>
CV_ALWAYS_INLINE void channels2planes_store(std::array<std::array<uint8_t*, 4>, chanNum>& dst,
const uchar* src, const int width,
const int line) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
GAPI_Assert(width >= nlanes);
GAPI_DbgAssert(width >= nlanes);
v_uint8 chan;
int x = 0;
@ -85,7 +72,7 @@ CV_ALWAYS_INLINE void vertical_anyLPI(const uchar* src0, const uchar* src1,
uchar* tmp, const int inLength,
const short beta) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
GAPI_Assert(inLength >= nlanes);
GAPI_DbgAssert(inLength >= nlanes);
const int half_nlanes = nlanes/2;
int w = 0;
@ -116,7 +103,7 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(std::array<std::array<uint8_t*, 4>, chan
const int line) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
const int half_nlanes = nlanes/2;
GAPI_Assert(width >= half_nlanes);
GAPI_DbgAssert(width >= half_nlanes);
v_int16 t0, t1;//, t2, t3;
int x = 0;
@ -220,7 +207,7 @@ CV_ALWAYS_INLINE void horizontal_4LPI(std::array<std::array<uint8_t*, 4>, chanNu
const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
GAPI_DbgAssert(length >= half_nlanes);
const int shift = static_cast<int>(half_nlanes / 4);
@ -310,7 +297,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum);
neon::vertical_4LPI(src0, src1, tmp, beta, inSz.width * chanNum);
// horizontal pass
horizontal_4LPI<chanNum>(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
@ -338,7 +325,7 @@ CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl_(std::array<std::array<uint8_t*, 4>
int inLength = inSz.width * chanNum;
// vertical pass
GAPI_Assert(inLength >= nlanes);
GAPI_DbgAssert(inLength >= nlanes);
v_uint8 s0, s1, s2, s3;
int w = 0;
for (;;) {
@ -427,12 +414,13 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4>& dst,
CV_ALWAYS_INLINE void horizontal_4LPI(uint8_t* dst[],
const uchar* tmp, const short mapsx[],
const uchar _mask_horizontal[],
const short clone[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
GAPI_DbgAssert(length >= half_nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
v_uint8 hmask = vx_load(_mask_horizontal);
int x = 0;
for (;;) {
@ -495,7 +483,8 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst,
const short alpha[], const int length) {
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
GAPI_Assert(length >= half_nlanes);
GAPI_DbgAssert(length >= half_nlanes);
v_int16 t0, t1;
int x = 0;
for (;;) {
@ -515,39 +504,42 @@ CV_ALWAYS_INLINE void horizontal_anyLPI(uint8_t* dst,
break;
}
}
} // namespace neon
// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
template<>
bool calcRowLinear8UC1Impl(neon_tag,
uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
static_assert(v_uint8::nlanes == 16,
"The wide of NEON vector is 128 bits, so one vector contains 16 uchars");
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
constexpr int half_nlanes = nlanes / 2;
constexpr int half_nlanes = v_uint8::nlanes / 2;
if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
GAPI_Assert(inSz.width >= half_nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
if (4 == lpi) {
// vertical pass
vertical_4LPI(src0, src1, tmp, beta, inSz.width);
neon::vertical_4LPI(src0, src1, tmp, beta, inSz.width);
// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
neon::horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
} else { // if any lpi
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
@ -556,18 +548,16 @@ void calcRowLinear_8UC1(uint8_t* dst[],
uchar* _dst = dst[l];
// vertical pass
vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);
neon::vertical_anyLPI(s0, s1, tmp, inSz.width, beta0);
// horizontal pass
horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width);
neon::horizontal_anyLPI(_dst, tmp, mapsx, alpha, outSz.width);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
GAPI_Assert(inSz.width >= nlanes);
uchar _mask_horizontal[nlanes] = { 0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15 };
GAPI_DbgAssert(inSz.width >= nlanes);
if (4 == lpi) {
// vertical pass
@ -589,15 +579,15 @@ void calcRowLinear_8UC1(uint8_t* dst[],
}
// horizontal pass
horizontal_4LPI(dst, tmp, mapsx, _mask_horizontal, clone, outSz.width);
neon::horizontal_4LPI(dst, tmp, mapsx, clone, outSz.width);
} else { // any LPI
GAPI_Assert(outSz.width >= half_nlanes);
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
uchar* _dst = dst[l];
// horizontal pass
horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width);
neon::horizontal_anyLPI(_dst, src, mapsx, alpha, outSz.width);
}
}
@ -611,7 +601,7 @@ void calcRowLinear_8UC1(uint8_t* dst[],
const uchar* s1 = src1[l];
// vertical pass
vertical_anyLPI(s0, s1, dst[l], length, beta0);
neon::vertical_anyLPI(s0, s1, dst[l], length, beta0);
}
} else {
@ -622,8 +612,8 @@ void calcRowLinear_8UC1(uint8_t* dst[],
memcpy(dst[l], src0[l], length);
}
}
return true;
}
} // namespace neon
template void chanToPlaneRowImpl(neon_tag, const uint8_t* in, int chan, int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(neon_tag, const float* in, int chan, int chs, float * out, const int length);
@ -646,6 +636,10 @@ template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::array<cons
template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template void calcRowLinear32FC1Impl(neon_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[], const float beta[],
const Size& inSz, const Size& outSz, const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -28,19 +28,6 @@ void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Si
float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[],
const float xalpha[], float vbuf[]);
// Resize (bi-linear, 8U)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
@ -81,17 +68,6 @@ void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
int lpi) {
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi);
} // namespace neon
template<typename isa_tag_t, typename T>
@ -131,6 +107,24 @@ extern template void mergeRowImpl<neon_tag, uint8_t, 3>(neon_tag, const std::arr
extern template void mergeRowImpl<neon_tag, float, 3>(neon_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<neon_tag, uint8_t, 4>(neon_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<neon_tag, float, 4>(neon_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);
template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
extern template void calcRowLinear32FC1Impl(neon_tag, float* dst[], const float* src0[],
const float* src1[], const float alpha[],
const int mapsx[], const float beta[],
const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -61,17 +61,17 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
const v_uint8& val_1,
const v_uint8& val_2,
const v_uint8& val_3,
const v_int16& a10,
const v_int16& a32,
const v_int16& a54,
const v_int16& a76,
v_uint8& shuf_mask1,
v_uint8& shuf_mask2,
v_uint8& res1, v_uint8& res2) {
CV_ALWAYS_INLINE void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
const v_uint8& val_1,
const v_uint8& val_2,
const v_uint8& val_3,
const v_int16& a10,
const v_int16& a32,
const v_int16& a54,
const v_int16& a76,
v_uint8& shuf_mask1,
v_uint8& shuf_mask2,
v_uint8& res1, v_uint8& res2) {
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
@ -108,17 +108,20 @@ static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
res2 = v_shuffle_s8(q7, shuf_mask2);
}
static inline void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const short beta[],
const int& length, const int& half_nlanes) {
CV_ALWAYS_INLINE void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const short beta[],
const int& length) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
v_uint8 shuf_mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
v_uint8 shuf_mask = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15);
for (int w = 0; w < length; ) {
for (; w <= length - half_nlanes; w += half_nlanes) {
@ -164,63 +167,26 @@ static inline void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* sr
}
}
static inline v_uint8 setHorizontalShufMask1() {
CV_ALWAYS_INLINE v_uint8 setHorizontalShufMask1() {
return v_setr_s8(0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15,
0, 4, 8, 12, 2, 6, 10, 14,
1, 5, 9, 13, 3, 7, 11, 15);
}
static inline v_uint8 setHorizontalShufMask2() {
CV_ALWAYS_INLINE v_uint8 setHorizontalShufMask2() {
return v_setr_s8(0, 1, 8, 9, 2, 3, 10, 11,
4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11,
4, 5, 12, 13, 6, 7, 14, 15);
}
static inline void horizontalPass_lpi4_8UC1(const short clone[], const short mapsx[],
uint8_t tmp[], uint8_t* dst[], const int& length,
const int& half_nlanes) {
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
constexpr int shift = 4;
v_uint8 shuf_mask1 = setHorizontalShufMask1();
v_uint8 shuf_mask2 = setHorizontalShufMask2();
CV_ALWAYS_INLINE void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int& beta0,
const int l, const int length1, const int length2) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length1 >= half_nlanes);
v_uint32 idxs = v_setr_s32(0, 2, 4, 6, 1, 3, 5, 7);
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 4)]);
v_int16 a54 = vx_load(&clone[4 * (x + 8)]);
v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
v_setr64(val_0, val_1, val_2, val_3, mapsx, tmp, x, shift);
val_0 = v_permute32(val_0, idxs);
val_1 = v_permute32(val_1, idxs);
val_2 = v_permute32(val_2, idxs);
val_3 = v_permute32(val_3, idxs);
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
res1, res2);
v_store_low(&dst[0][x], res1);
v_store_high(&dst[1][x], res1);
v_store_low(&dst[2][x], res2);
v_store_high(&dst[3][x], res2);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int& beta0, const int& half_nlanes,
const int& l, const int& length1, const int& length2) {
for (int w = 0; w < length2; ) {
for (; w <= length1 - half_nlanes; w += half_nlanes) {
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
@ -235,148 +201,25 @@ static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t*
}
}
static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
uint8_t* dst[], const uchar tmp[], const int& l,
const int& half_nlanes, const int& length) {
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx); // 16 pairs of src0 pixels
v_int16 t0, t1;
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi) {
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (nlanes / 2);
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_lpi4_8U(src0, src1, tmp, beta, inSz.width, half_nlanes);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes);
} else { // if any lpi
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= nlanes);
for (int w = 0; w < inSz.width; ) {
for (; w <= inSz.width - nlanes; w += nlanes) {
v_uint8 s0, s1, s2, s3;
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
}
if (w < inSz.width) {
w = inSz.width - nlanes;
}
}
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width, half_nlanes);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar *src = src0[l];
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width);
}
}
} else if (!yRatioEq) {
GAPI_DbgAssert(xRatioEq);
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l,
inLength, outLength);
}
} else {
GAPI_DbgAssert(xRatioEq && yRatioEq);
int length = inSz.width;
for (int l = 0; l < lpi; ++l) {
memcpy(dst[l], src0[l], length);
}
}
}
template<int chanNum>
void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
const int shift = (half_nlanes / 4);
if (4 == lpi) {
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
verticalPass_lpi4_8U(src0, src1, tmp, beta,
inSz.width*chanNum, half_nlanes);
inSz.width*chanNum);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
@ -420,8 +263,7 @@ void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l,
verticalPass_anylpi_8U(src0, src1, tmp, beta0, l,
inSz.width*chanNum, inSz.width*chanNum);
// horizontal pass
@ -480,20 +322,176 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
CV_ALWAYS_INLINE void horizontalPass_lpi4_8UC1(const short clone[], const short mapsx[],
uint8_t tmp[], uint8_t* dst[], const int& length) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length >= half_nlanes);
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
constexpr int shift = 4;
v_uint8 shuf_mask1 = avx::setHorizontalShufMask1();
v_uint8 shuf_mask2 = avx::setHorizontalShufMask2();
v_uint32 idxs = v_setr_s32(0, 2, 4, 6, 1, 3, 5, 7);
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 4)]);
v_int16 a54 = vx_load(&clone[4 * (x + 8)]);
v_int16 a76 = vx_load(&clone[4 * (x + 12)]);
v_setr64(val_0, val_1, val_2, val_3, mapsx, tmp, x, shift);
val_0 = v_permute32(val_0, idxs);
val_1 = v_permute32(val_1, idxs);
val_2 = v_permute32(val_2, idxs);
val_3 = v_permute32(val_3, idxs);
avx::main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
res1, res2);
v_store_low(&dst[0][x], res1);
v_store_high(&dst[1][x], res1);
v_store_low(&dst[2][x], res2);
v_store_high(&dst[3][x], res2);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
CV_ALWAYS_INLINE void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
uint8_t* dst[], const uchar tmp[], const int l,
const int length) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length >= half_nlanes);
v_int16 t0, t1;
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx); // 16 pairs of src0 pixels
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
} // namespace avx
// 8UC1 Resize (bi-linear)
template<>
bool calcRowLinear8UC1Impl(avx2_tag,
uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (v_uint8::nlanes / 2);
if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
// vertical pass
avx::verticalPass_lpi4_8U(src0, src1, tmp, beta, inSz.width);
// horizontal pass
avx::horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width);
} else { // if any lpi
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
avx::verticalPass_anylpi_8U(src0, src1, tmp, beta0, l, inLength, inLength);
// horizontal pass
avx::horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, outLength);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= nlanes);
v_uint8 s0, s1, s2, s3;
for (int w = 0; w < inSz.width; ) {
for (; w <= inSz.width - nlanes; w += nlanes) {
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
}
if (w < inSz.width) {
w = inSz.width - nlanes;
}
}
// horizontal pass
avx::horizontalPass_lpi4_8UC1(clone, mapsx, tmp, dst, outSz.width);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
// horizontal pass
avx::horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, outSz.width);
}
}
} else if (!yRatioEq) {
GAPI_DbgAssert(xRatioEq);
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
avx::verticalPass_anylpi_8U(src0, src1, dst[l], beta0, l, inLength, outLength);
}
} else {
GAPI_DbgAssert(xRatioEq && yRatioEq);
int length = inSz.width;
for (int l = 0; l < lpi; ++l) {
memcpy(dst[l], src0[l], length);
}
}
return true;
}
template void chanToPlaneRowImpl(avx2_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(avx2_tag, const float* in, const int chan, const int chs, float* out, const int length);
@ -516,6 +514,11 @@ template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::array<cons
template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template void calcRowLinear32FC1Impl(avx2_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine
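The horizontal passes above blend the two gathered source pixels with a fixed-point weight (the comments call it signed Q1.1.14) via v_mulhrs, a rounded high multiply. A scalar sketch of the same blend follows, assuming v_mulhrs computes round(a*b / 2^15); the helper names are illustrative, not taken from the diff.

#include <cstdint>

// Rounded high multiply, assumed semantics of v_mulhrs: round(a*b / 2^15).
static inline int16_t mulhrs(int16_t a, int16_t b) {
    return static_cast<int16_t>((static_cast<int32_t>(a) * b + 0x4000) >> 15);
}

// d = t1 + alpha*(t0 - t1): linear interpolation between the two gathered pixels,
// mirroring "v_int16 d = v_mulhrs(t0 - t1, a0) + t1" from the diff.
static inline uint8_t blend_pixel(uint8_t t0, uint8_t t1, int16_t alpha_fx) {
    int d = mulhrs(static_cast<int16_t>(t0 - t1), alpha_fx) + t1;
    // v_pack_u_store saturates to [0, 255]; do the same here.
    return static_cast<uint8_t>(d < 0 ? 0 : (d > 255 ? 255 : d));
}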

View File

@ -41,20 +41,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[],
#endif
//-----------------------------------------------------------------------------
// Resize (bi-linear, 8UC1)
void calcRowLinear_8UC1(uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t* src0[],
@ -66,7 +52,7 @@ void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
const int lpi);
// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
@ -79,33 +65,22 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
const int lpi);
template<int numChan>
void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size & inSz,
const Size & outSz,
int lpi);
} // namespace avx
template<typename isa_tag_t, typename T>
@ -148,6 +123,23 @@ extern template void mergeRowImpl<avx2_tag, uint8_t, 3>(avx2_tag, const std::arr
extern template void mergeRowImpl<avx2_tag, float, 3>(avx2_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx2_tag, uint8_t, 4>(avx2_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx2_tag, float, 4>(avx2_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);
template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
extern template void calcRowLinear32FC1Impl(avx2_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -55,10 +55,17 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *src1[],
uint8_t tmp[], v_int16& b0, v_int16& b1,
v_int16& b2, v_int16& b3, v_uint8& shuf_mask,
int half_nlanes, int width) {
CV_ALWAYS_INLINE void verticalPass_lpi4_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const short beta[], const v_uint8& shuf_mask,
const int width) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(width >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
v_uint32 permute_idxs1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0);
v_uint32 permute_idxs2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8);
@ -86,37 +93,13 @@ static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *sr
v_uint8 q0 = v_packus(r0, r1);
v_uint8 q1 = v_packus(r2, r3);
#if 1
v_uint8 q2 = v_permutex2_s32(q0, q1, permute_idxs1);
v_uint8 q3 = v_permutex2_s32(q0, q1, permute_idxs2);
v_uint8 q4 = v_shuffle_s8(q2, shuf_mask);
v_uint8 q5 = v_shuffle_s8(q3, shuf_mask);
// Second variant of the decomposition. It'll be useful in the future.
#else
v_uint8 q2 = v_mblend_shiftleft(q0, q1);
v_uint8 q3 = v_mblend_shiftright(q0, q1);
v_uint8 mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15,
0, 8, 4, 12, 1, 9, 5, 13,
2, 10, 6, 14, 3, 11, 7, 15);
v_uint8 q4 = v_shuffle_s8(q2, mask1);
v_uint8 q5 = v_shuffle_s8(q3, mask1);
v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0);
v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4);
v_uint8 q6 = v_permutex2_s64(q4, q5, permute_idxs1);
v_uint8 q7 = v_permutex2_s64(q4, q5, permute_idxs2);
#endif
vx_store(&tmp[4 * w + 0], q4);
vx_store(&tmp[4 * w + 2 * half_nlanes], q5);
}
@ -125,21 +108,21 @@ static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *sr
w = width - half_nlanes;
}
}
}
}
static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
const v_uint8& val_1,
const v_uint8& val_2,
const v_uint8& val_3,
const v_int16& a10,
const v_int16& a32,
const v_int16& a54,
const v_int16& a76,
v_uint8& shuf_mask1,
v_uint8& shuf_mask2,
v_uint32& idxs1,
v_uint32& idxs2,
v_uint8& res1, v_uint8& res2) {
CV_ALWAYS_INLINE void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
const v_uint8& val_1,
const v_uint8& val_2,
const v_uint8& val_3,
const v_int16& a10,
const v_int16& a32,
const v_int16& a54,
const v_int16& a76,
v_uint8& shuf_mask1,
v_uint8& shuf_mask2,
v_uint32& idxs1,
v_uint32& idxs2,
v_uint8& res1, v_uint8& res2) {
v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0));
v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1));
v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2));
@ -165,91 +148,20 @@ static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0,
v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1);
v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1);
#if 1
v_uint8 q4 = v_permutex2_s32(q2, q3, idxs1);
v_uint8 q5 = v_permutex2_s32(q2, q3, idxs2);
res1 = v_shuffle_s8(q4, shuf_mask2);
res2 = v_shuffle_s8(q5, shuf_mask2);
// Second variant of the decomposition. It'll be useful in the future.
#else
v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3);
v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
v_uint8 q6 = v_permute32(idx, q4);
v_uint8 q7 = v_permute32(idx, q5);
v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
v_uint8 q8 = v_shuffle_s8(q6, mask2);
v_uint8 q9 = v_shuffle_s8(q7, mask2);
#endif
}
static inline void horizontalPass_lpi4_U8C1(const short clone[], const short mapsx[],
uint8_t tmp[], uint8_t *dst[],
v_uint8& shuf_mask1,
int width, int half_nlanes) {
v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
CV_ALWAYS_INLINE void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int beta0,
const int l, const int length1, const int length2) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length1 >= half_nlanes);
v_uint32 permute_idxs1 = v_set_s32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
v_uint32 permute_idxs2 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
v_uint32 permute_idxs3 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
const int shift = half_nlanes / 4;
for (int x = 0; x < width; ) {
for (; x <= width - half_nlanes; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
v_set(val_0, val_1, val_2, val_3, tmp, mapsx, x, shift);
val_0 = v_permute32(val_0, permute_idxs1);
val_1 = v_permute32(val_1, permute_idxs1);
val_2 = v_permute32(val_2, permute_idxs1);
val_3 = v_permute32(val_3, permute_idxs1);
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
permute_idxs2, permute_idxs3,
res1, res2);
v_store_low(&dst[0][x], res1);
v_store_high(&dst[1][x], res1);
v_store_low(&dst[2][x], res2);
v_store_high(&dst[3][x], res2);
}
if (x < width) {
x = width - half_nlanes;
}
}
}
static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[],
uint8_t tmp[], const int& beta0, const int& half_nlanes,
const int& l, const int& length1, const int& length2) {
for (int w = 0; w < length2; ) {
for (; w <= length1 - half_nlanes; w += half_nlanes) {
v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w]));
@ -264,169 +176,19 @@ static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t*
}
}
static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
uint8_t* dst[], const uchar tmp[], const int& l,
const int& half_nlanes, const int& length) {
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx);
v_int16 t0, t1;
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t * dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi) {
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (nlanes / 2);
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3, shuf_mask1,
half_nlanes, inSz.width);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width, half_nlanes);
} else { // if any lpi
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength);
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= nlanes);
for (int w = 0; w < inSz.width; ) {
for (; w <= inSz.width - nlanes; w += nlanes) {
v_uint8 s0, s1, s2, s3;
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
}
if (w < inSz.width) {
w = inSz.width - nlanes;
}
}
// horizontal pass
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width, half_nlanes);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar *src = src0[l];
// horizontal pass
GAPI_DbgAssert(outSz.width >= half_nlanes);
horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width);
}
}
} else if (!yRatioEq) {
GAPI_DbgAssert(xRatioEq);
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
GAPI_DbgAssert(inSz.width >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l,
inLength, outLength);
}
} else {
GAPI_DbgAssert(xRatioEq && yRatioEq);
int length = inSz.width;
for (int l = 0; l < lpi; ++l) {
memcpy(dst[l], src0[l], length);
}
}
}
// Resize (bi-linear, 8U, generic number of channels)
template<int chanNum>
static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
CV_ALWAYS_INLINE void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, chanNum> &dst,
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
constexpr int shift = (half_nlanes / 4);
@ -443,13 +205,8 @@ static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, ch
2, 6, 10, 14, 3, 7, 11, 15);
// vertical pass
v_int16 b0 = vx_setall_s16(beta[0]);
v_int16 b1 = vx_setall_s16(beta[1]);
v_int16 b2 = vx_setall_s16(beta[2]);
v_int16 b3 = vx_setall_s16(beta[3]);
verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3,
shuf_mask1, half_nlanes, inSz.width*chanNum);
verticalPass_lpi4_8U(src0, src1, tmp, beta,
shuf_mask1, inSz.width*chanNum);
// horizontal pass
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
@ -502,7 +259,7 @@ static inline void calcRowLinear_8UC_Impl(std::array<std::array<uint8_t*, 4>, ch
// vertical pass
GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes);
verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l,
verticalPass_anylpi_8U(src0, src1, tmp, beta0, l,
inSz.width*chanNum, inSz.width*chanNum);
// horizontal pass
@ -561,19 +318,207 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
CV_ALWAYS_INLINE void horizontalPass_lpi4_U8C1(const short clone[], const short mapsx[],
uint8_t tmp[], uint8_t* dst[],
v_uint8& shuf_mask1,
const int width) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(width >= half_nlanes);
v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15,
0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15);
v_uint32 permute_idxs1 = v_set_s32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
v_uint32 permute_idxs2 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0);
v_uint32 permute_idxs3 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2);
v_uint8 val_0, val_1, val_2, val_3, res1, res2;
const int shift = half_nlanes / 4;
for (int x = 0; x < width; ) {
for (; x <= width - half_nlanes; x += half_nlanes) {
v_int16 a10 = vx_load(&clone[4 * x]);
v_int16 a32 = vx_load(&clone[4 * (x + 8)]);
v_int16 a54 = vx_load(&clone[4 * (x + 16)]);
v_int16 a76 = vx_load(&clone[4 * (x + 24)]);
v_set(val_0, val_1, val_2, val_3, tmp, mapsx, x, shift);
val_0 = v_permute32(val_0, permute_idxs1);
val_1 = v_permute32(val_1, permute_idxs1);
val_2 = v_permute32(val_2, permute_idxs1);
val_3 = v_permute32(val_3, permute_idxs1);
main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3,
a10, a32, a54, a76,
shuf_mask1, shuf_mask2,
permute_idxs2, permute_idxs3,
res1, res2);
v_store_low(&dst[0][x], res1);
v_store_high(&dst[1][x], res1);
v_store_low(&dst[2][x], res2);
v_store_high(&dst[3][x], res2);
}
if (x < width) {
x = width - half_nlanes;
}
}
}
CV_ALWAYS_INLINE void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[],
uint8_t* dst[], const uchar tmp[], const int l,
const int length) {
constexpr int half_nlanes = (v_uint8::nlanes / 2);
GAPI_DbgAssert(length >= half_nlanes);
v_int16 t0, t1;
for (int x = 0; x < length; ) {
for (; x <= length - half_nlanes; x += half_nlanes) {
v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14
v_int16 sx = vx_load(&mapsx[x]); // as integer (int16)
v_uint8 t = v_gather_pairs(tmp, sx);
v_deinterleave_expand(t, t0, t1); // tmp pixels as int16
v_int16 d = v_mulhrs(t0 - t1, a0) + t1;
v_pack_u_store(&dst[l][x], d);
}
if (x < length) {
x = length - half_nlanes;
}
}
}
} // namespace avx512
// 8UC1 Resize (bi-linear)
template<>
bool calcRowLinear8UC1Impl(avx512_tag,
uint8_t* dst[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (v_uint8::nlanes / 2);
if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;
bool xRatioEq = inSz.width == outSz.width;
bool yRatioEq = inSz.height == outSz.height;
if (!xRatioEq && !yRatioEq) {
if (4 == lpi) {
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
// vertical pass
avx512::verticalPass_lpi4_8U(src0, src1, tmp, beta, shuf_mask1, inSz.width);
// horizontal pass
avx512::horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width);
} else { // if any lpi
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
avx512::verticalPass_anylpi_8U(src0, src1, tmp, beta0, l, inLength, inLength);
// horizontal pass
avx512::horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, outLength);
}
} // if lpi == 4
} else if (!xRatioEq) {
GAPI_DbgAssert(yRatioEq);
if (4 == lpi) {
// vertical pass
GAPI_DbgAssert(inSz.width >= nlanes);
for (int w = 0; w < inSz.width; ) {
for (; w <= inSz.width - nlanes; w += nlanes) {
v_uint8 s0, s1, s2, s3;
s0 = vx_load(&src0[0][w]);
s1 = vx_load(&src0[1][w]);
s2 = vx_load(&src0[2][w]);
s3 = vx_load(&src0[3][w]);
v_store_interleave(&tmp[4 * w], s0, s1, s2, s3);
}
if (w < inSz.width) {
w = inSz.width - nlanes;
}
}
// horizontal pass
v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15);
avx512::horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1,
outSz.width);
} else { // any LPI
for (int l = 0; l < lpi; ++l) {
const uchar* src = src0[l];
// horizontal pass
avx512::horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, outSz.width);
}
}
} else if (!yRatioEq) {
GAPI_DbgAssert(xRatioEq);
int inLength = inSz.width;
int outLength = outSz.width;
for (int l = 0; l < lpi; ++l) {
short beta0 = beta[l];
// vertical pass
avx512::verticalPass_anylpi_8U(src0, src1, dst[l], beta0, l, inLength, outLength);
}
} else {
GAPI_DbgAssert(xRatioEq && yRatioEq);
int length = inSz.width;
for (int l = 0; l < lpi; ++l) {
memcpy(dst[l], src0[l], length);
}
}
return true;
}
template void chanToPlaneRowImpl(avx512_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(avx512_tag, const float* in, const int chan, const int chs, float* out, const int length);
@ -595,6 +540,12 @@ template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std::array<
template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template void calcRowLinear32FC1Impl(avx512_tag, float* dst[], const float* src0[],
const float* src1[], const float alpha[],
const int mapsx[], const float beta[],
const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -42,70 +42,46 @@ void calcRowArea_CVKL_U8(const uchar * src[],
//-----------------------------------------------------------------------------
// Resize (bi-linear, 8UC1)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size & inSz,
const Size & outSz,
int lpi);
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi);
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi);
// Resize (bi-linear, 8UC4)
void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi);
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi);
template<int numChan>
void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
const uint8_t *src0[],
const uint8_t *src1[],
const uint8_t* src0[],
const uint8_t* src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size &inSz,
const Size &outSz,
int lpi) {
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi) {
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size & inSz,
const Size & outSz,
int lpi);
} // namespace avx512
template<typename isa_tag_t, typename T>
@ -145,6 +121,23 @@ extern template void mergeRowImpl<avx512_tag, uint8_t, 3>(avx512_tag, const std:
extern template void mergeRowImpl<avx512_tag, float, 3>(avx512_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<avx512_tag, uint8_t, 4>(avx512_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<avx512_tag, float, 4>(avx512_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);
template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
extern template void calcRowLinear32FC1Impl(avx512_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -56,17 +56,26 @@ namespace gapi {
namespace kernels {
// 8UC1 Resize (bi-linear)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi) {
template<>
bool calcRowLinear8UC1Impl(sse42_tag,
uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[], // 4 clones of alpha
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
constexpr int nlanes = v_uint8::nlanes;
constexpr int half_nlanes = (v_uint8::nlanes / 2);
if (inSz.width < nlanes || outSz.width < half_nlanes)
return false;
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
@ -503,6 +512,7 @@ void calcRowLinear_8UC1(uint8_t *dst[],
memcpy(dst[l], src0[l], length);
}
}
return true;
}
// The universal-intrinsic 3C/4C resize implementation for SSE42 is sometimes a bit slower than the original.
@ -934,19 +944,6 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
calcRowLinear_8UC_Impl_<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
int lpi) {
calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}
//------------------------------------------------------------------------------
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz,
@ -1289,6 +1286,11 @@ template void mergeRowImpl<sse42_tag, uchar, 3>(sse42_tag, const std::array<cons
template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
template void mergeRowImpl<sse42_tag, uchar, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template void calcRowLinear32FC1Impl(sse42_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -41,19 +41,6 @@ void calcRowArea_CVKL_U8_SSE42(const uchar * src[],
//----------------------------------------------------------------------
// Resize (bi-linear, 8U)
void calcRowLinear_8UC1(uint8_t *dst[],
const uint8_t *src0[],
const uint8_t *src1[],
const short alpha[],
const short clone[],
const short mapsx[],
const short beta[],
uint8_t tmp[],
const Size& inSz,
const Size& outSz,
int lpi);
// Resize (bi-linear, 8UC3)
void calcRowLinear_8U(C3, std::array<std::array<uint8_t*, 4>, 3> &dst,
const uint8_t *src0[],
@ -95,17 +82,6 @@ void calcRowLinear_8UC(std::array<std::array<uint8_t*, 4>, numChan> &dst,
calcRowLinear_8U(std::integral_constant<int, numChan>{}, dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size & inSz,
const Size & outSz,
int lpi);
template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs,
T* out, const int length);
@ -145,6 +121,23 @@ extern template void mergeRowImpl<sse42_tag, uint8_t, 3>(sse42_tag, const std::a
extern template void mergeRowImpl<sse42_tag, float, 3>(sse42_tag, const std::array<const float*, 3>& ins, float* out, const int length);
extern template void mergeRowImpl<sse42_tag, uint8_t, 4>(sse42_tag, const std::array<const uint8_t*, 4>& ins, uint8_t* out, const int length);
extern template void mergeRowImpl<sse42_tag, float, 4>(sse42_tag, const std::array<const float*, 4>& ins, float* out, const int length);
template<typename isa_tag_t>
bool calcRowLinear8UC1Impl(isa_tag_t, uint8_t* dst[], const uint8_t* src0[], const uint8_t* src1[],
const short alpha[], const short clone[], const short mapsx[],
const short beta[], uint8_t tmp[], const Size& inSz,
const Size& outSz, const int lpi, const int l);
template<typename isa_tag_t>
void calcRowLinear32FC1Impl(isa_tag_t, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
extern template void calcRowLinear32FC1Impl(sse42_tag, float* dst[], const float* src0[], const float* src1[],
const float alpha[], const int mapsx[],
const float beta[], const Size& inSz, const Size& outSz,
const int lpi, const int l);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -577,15 +577,18 @@ CV_ALWAYS_INLINE void copyRow_Impl(const T in[], T out[], int length) {
}
// Resize (bi-linear, 32FC1)
CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi) {
template<typename isa_tag_t>
CV_ALWAYS_INLINE void calcRowLinear32FC1Impl(isa_tag_t,
float *dst[],
const float *src0[],
const float *src1[],
const float alpha[],
const int mapsx[],
const float beta[],
const Size& inSz,
const Size& outSz,
const int lpi,
const int) {
bool xRatioEq1 = inSz.width == outSz.width;
bool yRatioEq1 = inSz.height == outSz.height;
@ -714,7 +717,8 @@ template<typename isa_tag_t> struct vector_type_of<isa_tag_t, uint8_t> { using t
template<typename isa_tag_t> struct vector_type_of<isa_tag_t, float> { using type = v_float32;};
template<typename isa_tag_t, typename T>
CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length) {
CV_ALWAYS_INLINE void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan,
const int chs, T* out, const int length) {
if (chs == 1) {
copyRow_Impl<vector_type_of_t<isa_tag_t, T>, T>(in, out, length);
return;