diff --git a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp index a181561f716..1e9c3613c07 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx2/ie_preprocess_gapi_kernels_avx2.cpp @@ -266,10 +266,10 @@ static inline void horizontalPass_lpi4_8UC1(const short clone[], const short map v_int16 a76 = vx_load(&clone[4 * (x + 12)]); v_setr64(val_0, val_1, val_2, val_3, mapsx, tmp, x, shift); - val_0 = v_permutevar8x32(val_0, idxs); - val_1 = v_permutevar8x32(val_1, idxs); - val_2 = v_permutevar8x32(val_2, idxs); - val_3 = v_permutevar8x32(val_3, idxs); + val_0 = v_permute32(val_0, idxs); + val_1 = v_permute32(val_1, idxs); + val_2 = v_permute32(val_2, idxs); + val_3 = v_permute32(val_3, idxs); main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3, a10, a32, a54, a76, @@ -312,7 +312,7 @@ static inline void horizontalPass_anylpi_8U(const short alpha[], const short map for (; x <= length - half_nlanes; x += half_nlanes) { v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) - v_uint8 t = v_gather_pairs(tmp, sx); // 8 pairs of src0 pixels + v_uint8 t = v_gather_pairs(tmp, sx); // 16 pairs of src0 pixels v_int16 t0, t1; v_deinterleave_expand(t, t0, t1); // tmp pixels as int16 v_int16 d = v_mulhrs(t0 - t1, a0) + t1; diff --git a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp index 5b900d52c54..8563261fe78 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.cpp @@ -125,26 +125,384 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf); } +static inline void verticalPass_lpi4_8U(const uint8_t *src0[], const uint8_t *src1[], + uint8_t tmp[], v_int16& b0, v_int16& b1, + v_int16& b2, v_int16& b3, v_uint8& shuf_mask, + int half_nlanes, int width) { + v_uint32 permute_idxs1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0); + v_uint32 permute_idxs2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8); + + for (int w = 0; w < width; ) { + for (; w <= width - half_nlanes; w += half_nlanes) { + v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]); + v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]); + v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]); + v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]); + + v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]); + v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]); + v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]); + v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); +#if 1 + v_uint8 q2 = v_permutex2_s32(q0, q1, permute_idxs1); 
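+            // permute_idxs1/permute_idxs2 gather the packed line pairs of q0/q1 across
+            // lanes; the byte shuffle below then interleaves them so that the four
+            // interpolated lines of each input element land in four consecutive bytes of
+            // tmp[] (tmp[4*x + line]), which is the layout v_set() and v_gather_channel()
+            // read back in the horizontal pass.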
+ v_uint8 q3 = v_permutex2_s32(q0, q1, permute_idxs2); + + v_uint8 q4 = v_shuffle_s8(q2, shuf_mask); + v_uint8 q5 = v_shuffle_s8(q3, shuf_mask); + + //Second variant of decompose. It'll be usefull in the future. +#else + v_uint8 q2 = v_mblend_shiftleft(q0, q1); + v_uint8 q3 = v_mblend_shiftright(q0, q1); + + v_uint8 mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15, + 0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15, + 0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15, + 0, 8, 4, 12, 1, 9, 5, 13, + 2, 10, 6, 14, 3, 11, 7, 15); + + v_uint8 q4 = v_shuffle_s8(q2, mask1); + v_uint8 q5 = v_shuffle_s8(q3, mask1); + + v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0); + v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4); + + v_uint8 q6 = v_permutex2_s64(q4, q5, permute_idxs1); + v_uint8 q7 = v_permutex2_s64(q4, q5, permute_idxs2); +#endif + + vx_store(&tmp[4 * w + 0], q4); + vx_store(&tmp[4 * w + 2 * half_nlanes], q5); + } + + if (w < width) { + w = width - half_nlanes; + } + } + } + +static inline void main_computation_horizontalPass_lpi4(const v_uint8& val_0, + const v_uint8& val_1, + const v_uint8& val_2, + const v_uint8& val_3, + const v_int16& a10, + const v_int16& a32, + const v_int16& a54, + const v_int16& a76, + v_uint8& shuf_mask1, + v_uint8& shuf_mask2, + v_uint32& idxs1, + v_uint32& idxs2, + v_uint8& res1, v_uint8& res2) { + v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0)); + v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1)); + v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2)); + v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3)); + + v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0)); + v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1)); + v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2)); + v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3)); + + v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10); + v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32); + v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54); + v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76); + + v_int16 r0 = v_add_wrap(val1_0, t0); + v_int16 r1 = v_add_wrap(val1_1, t1); + v_int16 r2 = v_add_wrap(val1_2, t2); + v_int16 r3 = v_add_wrap(val1_3, t3); + + v_uint8 q0 = v_packus(r0, r1); + v_uint8 q1 = v_packus(r2, r3); + + v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1); + v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1); +#if 1 + v_uint8 q4 = v_permutex2_s32(q2, q3, idxs1); + v_uint8 q5 = v_permutex2_s32(q2, q3, idxs2); + + res1 = v_shuffle_s8(q4, shuf_mask2); + res2 = v_shuffle_s8(q5, shuf_mask2); + + //Second variant of decompose. It'll be usefull in the future. 
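+    // (The #else branch below is that second variant: the same reordering built from the
+    // mask-blend/shift helpers and single-source 32-bit permutes instead of the two-source
+    // permutex2var calls above; it is kept for reference and is not compiled.)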
+#else + v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3); + v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3); + + v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); + + v_uint8 q6 = v_permute32(idx, q4); + v_uint8 q7 = v_permute32(idx, q5); + + v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15); + + v_uint8 q8 = v_shuffle_s8(q6, mask2); + v_uint8 q9 = v_shuffle_s8(q7, mask2); +#endif +} + +static inline void horizontalPass_lpi4_U8C1(const short clone[], const short mapsx[], + uint8_t tmp[], uint8_t *dst[], + v_uint8& shuf_mask1, + int width, int half_nlanes) { + v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15); + + v_uint32 permute_idxs1 = v_set_s32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); + v_uint32 permute_idxs2 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + v_uint32 permute_idxs3 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2); + + v_uint8 val_0, val_1, val_2, val_3, res1, res2; + const int shift = half_nlanes / 4; + + for (int x = 0; x < width; ) { + for (; x <= width - half_nlanes; x += half_nlanes) { + v_int16 a10 = vx_load(&clone[4 * x]); + v_int16 a32 = vx_load(&clone[4 * (x + 8)]); + v_int16 a54 = vx_load(&clone[4 * (x + 16)]); + v_int16 a76 = vx_load(&clone[4 * (x + 24)]); + + v_set(val_0, val_1, val_2, val_3, tmp, mapsx, x, shift); + + val_0 = v_permute32(val_0, permute_idxs1); + val_1 = v_permute32(val_1, permute_idxs1); + val_2 = v_permute32(val_2, permute_idxs1); + val_3 = v_permute32(val_3, permute_idxs1); + + main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3, + a10, a32, a54, a76, + shuf_mask1, shuf_mask2, + permute_idxs2, permute_idxs3, + res1, res2); + v_store_low(&dst[0][x], res1); + v_store_high(&dst[1][x], res1); + v_store_low(&dst[2][x], res2); + v_store_high(&dst[3][x], res2); + } + + if (x < width) { + x = width - half_nlanes; + } + } +} + +static inline void verticalPass_anylpi_8U(const uint8_t* src0[], const uint8_t* src1[], + uint8_t tmp[], const int& beta0, const int& half_nlanes, + const int& l, const int& length1, const int& length2) { + for (int w = 0; w < length2; ) { + for (; w <= length1 - half_nlanes; w += half_nlanes) { + v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); + v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); + v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; + v_pack_u_store(tmp + w, t); + } + + if (w < length1) { + w = length1 - half_nlanes; + } + } +} + +static inline void horizontalPass_anylpi_8U(const short alpha[], const short mapsx[], + uint8_t* dst[], const uchar tmp[], const int& l, + const int& half_nlanes, const int& length) { + for (int x = 0; x < length; ) { + for (; x <= length - half_nlanes; x += half_nlanes) { + v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 + v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) + v_uint8 t = v_gather_pairs(tmp, sx); + v_int16 t0, t1; + v_deinterleave_expand(t, t0, t1); // tmp pixels as int16 + v_int16 d = v_mulhrs(t0 - t1, a0) + t1; + 
v_pack_u_store(&dst[l][x], d); + } + + if (x < length) { + x = length - half_nlanes; + } + } +} + +// 8UC1 Resize (bi-linear) +void calcRowLinear_8UC1( uint8_t* dst[], + const uint8_t* src0[], + const uint8_t* src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size& inSz, + const Size& outSz, + int lpi) { + bool xRatioEq = inSz.width == outSz.width; + bool yRatioEq = inSz.height == outSz.height; + + constexpr int nlanes = v_uint8::nlanes; + constexpr int half_nlanes = (nlanes / 2); + + if (!xRatioEq && !yRatioEq) { + if (4 == lpi) { + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + + v_int16 b0 = vx_setall_s16(beta[0]); + v_int16 b1 = vx_setall_s16(beta[1]); + v_int16 b2 = vx_setall_s16(beta[2]); + v_int16 b3 = vx_setall_s16(beta[3]); + + v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15); + + verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3, shuf_mask1, + half_nlanes, inSz.width); + + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1, + outSz.width, half_nlanes); + + } else { // if any lpi + int inLength = inSz.width; + int outLength = outSz.width; + for (int l = 0; l < lpi; ++l) { + short beta0 = beta[l]; + + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, inLength, inLength); + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_anylpi_8U(alpha, mapsx, dst, tmp, l, half_nlanes, outLength); + } + } // if lpi == 4 + + } else if (!xRatioEq) { + GAPI_DbgAssert(yRatioEq); + + if (4 == lpi) { + // vertical pass + GAPI_DbgAssert(inSz.width >= nlanes); + for (int w = 0; w < inSz.width; ) { + for (; w <= inSz.width - nlanes; w += nlanes) { + v_uint8 s0, s1, s2, s3; + s0 = vx_load(&src0[0][w]); + s1 = vx_load(&src0[1][w]); + s2 = vx_load(&src0[2][w]); + s3 = vx_load(&src0[3][w]); + v_store_interleave(&tmp[4 * w], s0, s1, s2, s3); + } + + if (w < inSz.width) { + w = inSz.width - nlanes; + } + } + + // horizontal pass + v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15); + + horizontalPass_lpi4_U8C1(clone, mapsx, tmp, dst, shuf_mask1, + outSz.width, half_nlanes); + + } else { // any LPI + for (int l = 0; l < lpi; ++l) { + const uchar *src = src0[l]; + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + horizontalPass_anylpi_8U(alpha, mapsx, dst, src, l, half_nlanes, outSz.width); + } + } + + } else if (!yRatioEq) { + GAPI_DbgAssert(xRatioEq); + int inLength = inSz.width; + int outLength = outSz.width; + + for (int l = 0; l < lpi; ++l) { + short beta0 = beta[l]; + + // vertical pass + GAPI_DbgAssert(inSz.width >= half_nlanes); + verticalPass_anylpi_8U(src0, src1, dst[l], beta0, half_nlanes, l, + inLength, outLength); + } + + } else { + GAPI_DbgAssert(xRatioEq && yRatioEq); + int length = inSz.width; + + for (int l = 0; l < lpi; ++l) { + memcpy(dst[l], src0[l], length); + } + } +} + // Resize (bi-linear, 8U, generic number of channels) template -void 
calcRowLinear_8UC_Impl(std::array, chanNum> &dst, - const uint8_t *src0[], - const uint8_t *src1[], - const short alpha[], - const short clone[], // 4 clones of alpha - const short mapsx[], - const short beta[], - uint8_t tmp[], - const Size &inSz, - const Size &outSz, - int lpi) { +static inline void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], // 4 clones of alpha + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size &inSz, + const Size &outSz, + int lpi) { constexpr int half_nlanes = (v_uint8::nlanes / 2); - const int shift = (half_nlanes / 4); + constexpr int shift = (half_nlanes / 4); if (4 == lpi) { GAPI_DbgAssert(inSz.width >= half_nlanes); - v_uint8 shuf_mask1 = v_setr_s8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, @@ -154,6 +512,17 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + // vertical pass + v_int16 b0 = vx_setall_s16(beta[0]); + v_int16 b1 = vx_setall_s16(beta[1]); + v_int16 b2 = vx_setall_s16(beta[2]); + v_int16 b3 = vx_setall_s16(beta[3]); + + verticalPass_lpi4_8U(src0, src1, tmp, b0, b1, b2, b3, + shuf_mask1, half_nlanes, inSz.width*chanNum); + + // horizontal pass + v_uint8 val_0, val_1, val_2, val_3, res1, res2; v_uint8 shuf_mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, @@ -163,83 +532,8 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - v_uint32 idx1 = v_set_s32(23, 21, 7, 5, 22, 20, 6, 4, 19, 17, 3, 1, 18, 16, 2, 0); - v_uint32 idx2 = v_set_s32(31, 29, 15, 13, 30, 28, 14, 12, 27, 25, 11, 9, 26, 24, 10, 8); - v_uint32 idx3 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); - v_uint32 idx4 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2); - - // vertical pass - v_int16 b0 = vx_setall_s16(beta[0]); - v_int16 b1 = vx_setall_s16(beta[1]); - v_int16 b2 = vx_setall_s16(beta[2]); - v_int16 b3 = vx_setall_s16(beta[3]); - - for (int w = 0; w < inSz.width*chanNum; ) { - for (; w <= inSz.width*chanNum - half_nlanes && w >= 0; w += half_nlanes) { - v_int16 val0_0 = v_load_ccache_expand(&src0[0][w]); - v_int16 val0_1 = v_load_ccache_expand(&src0[1][w]); - v_int16 val0_2 = v_load_ccache_expand(&src0[2][w]); - v_int16 val0_3 = v_load_ccache_expand(&src0[3][w]); - - v_int16 val1_0 = v_load_ccache_expand(&src1[0][w]); - v_int16 val1_1 = v_load_ccache_expand(&src1[1][w]); - v_int16 val1_2 = v_load_ccache_expand(&src1[2][w]); - v_int16 val1_3 = v_load_ccache_expand(&src1[3][w]); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), b0); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), b1); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), b2); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), b3); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_packus(r0, r1); - v_uint8 q1 = v_packus(r2, r3); -#if 1 - v_uint8 q2 = v_permutex2_s32(q0, q1, idx1); - v_uint8 q3 = v_permutex2_s32(q0, q1, idx2); - - v_uint8 q4 = v_shuffle_s8(q2, shuf_mask1); - v_uint8 q5 = v_shuffle_s8(q3, shuf_mask1); - - //Second variant of decompose. It'll be usefull in the future. 
-#else - v_uint8 q2 = v_mblend_shiftleft(q0, q1); - v_uint8 q3 = v_mblend_shiftright(q0, q1); - - v_uint8 mask1 = v_setr_s8(0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15, - 0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15, - 0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15, - 0, 8, 4, 12, 1, 9, 5, 13, - 2, 10, 6, 14, 3, 11, 7, 15); - - v_uint8 q4 = v_shuffle_s8(q2, mask1); - v_uint8 q5 = v_shuffle_s8(q3, mask1); - - v_uint64 idx1 = v_set_s64(11, 10, 3, 2, 9, 8, 1, 0); - v_uint64 idx2 = v_set_s64(15, 14, 7, 6, 13, 12, 5, 4); - - v_uint8 q6 = v_permutex2_s64(q4, q5, idx1); - v_uint8 q7 = v_permutex2_s64(q4, q5, idx2); -#endif - - vx_store(&tmp[4 * w + 0], q4); - vx_store(&tmp[4 * w + 2 * half_nlanes], q5); - } - - if (w < inSz.width*chanNum) { - w = inSz.width*chanNum - half_nlanes; - } - } - - // horizontal pass - v_uint8 val_0, val_1, val_2, val_3; + v_uint32 idxs3 = v_set_s32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + v_uint32 idxs4 = v_set_s32(31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2); GAPI_DbgAssert(outSz.width >= half_nlanes); for (int x = 0; x < outSz.width; ) { @@ -248,72 +542,23 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, v_int16 a32 = vx_load(&clone[4 * (x + 8)]); v_int16 a54 = vx_load(&clone[4 * (x + 16)]); v_int16 a76 = vx_load(&clone[4 * (x + 24)]); - + for (int c = 0; c < chanNum; ++c) { v_gather_channel(val_0, tmp, mapsx, chanNum, c, x, 0); v_gather_channel(val_1, tmp, mapsx, chanNum, c, x, shift); v_gather_channel(val_2, tmp, mapsx, chanNum, c, x, shift * 2); v_gather_channel(val_3, tmp, mapsx, chanNum, c, x, shift * 3); - v_int16 val0_0 = v_reinterpret_as_s16(v_expand_low(val_0)); - v_int16 val0_1 = v_reinterpret_as_s16(v_expand_low(val_1)); - v_int16 val0_2 = v_reinterpret_as_s16(v_expand_low(val_2)); - v_int16 val0_3 = v_reinterpret_as_s16(v_expand_low(val_3)); + main_computation_horizontalPass_lpi4(val_0, val_1, val_2, val_3, + a10, a32, a54, a76, + shuf_mask1, shuf_mask2, + idxs3, idxs4, + res1, res2); - v_int16 val1_0 = v_reinterpret_as_s16(v_expand_high(val_0)); - v_int16 val1_1 = v_reinterpret_as_s16(v_expand_high(val_1)); - v_int16 val1_2 = v_reinterpret_as_s16(v_expand_high(val_2)); - v_int16 val1_3 = v_reinterpret_as_s16(v_expand_high(val_3)); - - v_int16 t0 = v_mulhrs(v_sub_wrap(val0_0, val1_0), a10); - v_int16 t1 = v_mulhrs(v_sub_wrap(val0_1, val1_1), a32); - v_int16 t2 = v_mulhrs(v_sub_wrap(val0_2, val1_2), a54); - v_int16 t3 = v_mulhrs(v_sub_wrap(val0_3, val1_3), a76); - - v_int16 r0 = v_add_wrap(val1_0, t0); - v_int16 r1 = v_add_wrap(val1_1, t1); - v_int16 r2 = v_add_wrap(val1_2, t2); - v_int16 r3 = v_add_wrap(val1_3, t3); - - v_uint8 q0 = v_packus(r0, r1); - v_uint8 q1 = v_packus(r2, r3); - - v_uint8 q2 = v_shuffle_s8(q0, shuf_mask1); - v_uint8 q3 = v_shuffle_s8(q1, shuf_mask1); -#if 1 - v_uint8 q4 = v_permutex2_s32(q2, q3, idx3); - v_uint8 q5 = v_permutex2_s32(q2, q3, idx4); - - v_uint8 q6 = v_shuffle_s8(q4, shuf_mask2); - v_uint8 q7 = v_shuffle_s8(q5, shuf_mask2); - - - //Second variant of decompose. It'll be usefull in the future. 
-#else - v_uint8 q4 = v_mask_blend_shiftleft<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3); - v_uint8 q5 = v_mask_blend_shiftright<0xCCCCCCCC /*0b11001100110011001100110011001100*/, 4>(q2, q3); - - v_int32 idx = v_set_s32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); - - v_uint8 q6 = v_permutex_s32(idx, q4); - v_uint8 q7 = v_permutex_s32(idx, q5); - - v_uint8 mask2 = v_setr_s8(0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, - 2, 3, 6, 7, 10, 11, 14, 15); - - v_uint8 q8 = v_shuffle_s8(q6, mask2); - v_uint8 q9 = v_shuffle_s8(q7, mask2); -#endif - v_store_low(&dst[c][0][x], q6); - v_store_high(&dst[c][1][x], q6); - v_store_low(&dst[c][2][x], q7); - v_store_high(&dst[c][3][x], q7); + v_store_low(&dst[c][0][x], res1); + v_store_high(&dst[c][1][x], res1); + v_store_low(&dst[c][2][x], res2); + v_store_high(&dst[c][3][x], res2); } } @@ -325,41 +570,30 @@ void calcRowLinear_8UC_Impl(std::array, chanNum> &dst, for (int l = 0; l < lpi; ++l) { short beta0 = beta[l]; - // vertical pass + // vertical pass GAPI_DbgAssert(inSz.width*chanNum >= half_nlanes); - for (int w = 0; w < inSz.width*chanNum; ) { - for (; w <= inSz.width*chanNum - half_nlanes; w += half_nlanes) { - v_int16 s0 = v_reinterpret_as_s16(vx_load_expand(&src0[l][w])); - v_int16 s1 = v_reinterpret_as_s16(vx_load_expand(&src1[l][w])); - v_int16 t = v_mulhrs(s0 - s1, beta0) + s1; - v_pack_u_store(tmp + w, t); + verticalPass_anylpi_8U(src0, src1, tmp, beta0, half_nlanes, l, + inSz.width*chanNum, inSz.width*chanNum); + + // horizontal pass + GAPI_DbgAssert(outSz.width >= half_nlanes); + for (int x = 0; x < outSz.width; ) { + for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { + for (int c = 0; c < chanNum; ++c) { + v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 + v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) + v_int16 t0 = v_gather_chan(tmp, sx, c, 0); + v_int16 t1 = v_gather_chan(tmp, sx, c, 1); + v_int16 d = v_mulhrs(t0 - t1, a0) + t1; + v_pack_u_store(&dst[c][l][x], d); + } } - if (w < inSz.width*chanNum) { - w = inSz.width*chanNum - half_nlanes; + if (x < outSz.width) { + x = outSz.width - half_nlanes; } } - - // horizontal pass - GAPI_DbgAssert(outSz.width >= half_nlanes); - - for (int x = 0; x < outSz.width; ) { - for (; x <= outSz.width - half_nlanes && x >= 0; x += half_nlanes) { - for (int c = 0; c < chanNum; ++c) { - v_int16 a0 = vx_load(&alpha[x]); // as signed Q1.1.14 - v_int16 sx = vx_load(&mapsx[x]); // as integer (int16) - v_int16 t0 = v_gather_chan(tmp, sx, c, 0); - v_int16 t1 = v_gather_chan(tmp, sx, c, 1); - v_int16 d = v_mulhrs(t0 - t1, a0) + t1; - v_pack_u_store(&dst[c][l][x], d); - } - } - - if (x < outSz.width) { - x = outSz.width - half_nlanes; - } - } - } + } } } diff --git a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp index 7d21fb06d38..de34fd8028d 100644 --- a/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp +++ b/inference-engine/src/preprocessing/cpu_x86_avx512/ie_preprocess_gapi_kernels_avx512.hpp @@ -26,34 +26,34 @@ void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Si const float xalpha[], float vbuf[]); #if USE_CVKL -void calcRowArea_CVKL_U8_SSE42(const uchar * src[], - uchar dst[], - const Size & inSz, - const 
Size & outSz, - int y, - const uint16_t xsi[], - const uint16_t ysi[], - const uint16_t xalpha[], - const uint16_t yalpha[], - int x_max_count, - int y_max_count, - uint16_t vert_sum[]); +void calcRowArea_CVKL_U8(const uchar * src[], + uchar dst[], + const Size & inSz, + const Size & outSz, + int y, + const uint16_t xsi[], + const uint16_t ysi[], + const uint16_t xalpha[], + const uint16_t yalpha[], + int x_max_count, + int y_max_count, + uint16_t vert_sum[]); #endif //----------------------------------------------------------------------------- -// Resize (bi-linear, 8U) -void calcRowLinear_8U(uint8_t *dst[], - const uint8_t *src0[], - const uint8_t *src1[], - const short alpha[], - const short clone[], - const short mapsx[], - const short beta[], - uint8_t tmp[], - const Size & inSz, - const Size & outSz, - int lpi); +// Resize (bi-linear, 8UC1) +void calcRowLinear_8UC1(uint8_t *dst[], + const uint8_t *src0[], + const uint8_t *src1[], + const short alpha[], + const short clone[], + const short mapsx[], + const short beta[], + uint8_t tmp[], + const Size & inSz, + const Size & outSz, + int lpi); // Resize (bi-linear, 8UC3) void calcRowLinear_8U(C3, std::array, 3> &dst, diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index e6a3dbffca9..f36c76f74a3 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -910,6 +910,26 @@ static void calcRowLinear(const cv::gapi::fluid::View & in, dst[l] = out.OutLine(l); } + #ifdef HAVE_AVX512 + if (with_cpu_x86_avx512_core()) { + if (std::is_same::value) { + if (inSz.width >= 64 && outSz.width >= 32) { + avx512::calcRowLinear_8UC1(reinterpret_cast(dst), + reinterpret_cast(src0), + reinterpret_cast(src1), + reinterpret_cast(alpha), + reinterpret_cast(clone), + reinterpret_cast(mapsx), + reinterpret_cast(beta), + reinterpret_cast(tmp), + inSz, outSz, lpi); + + return; + } + } + } + #endif + #ifdef HAVE_AVX2 if (with_cpu_x86_avx2()) { if (std::is_same::value) { diff --git a/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp b/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp index 640d589f8ec..9b572a4fc8c 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_avx.hpp @@ -3173,7 +3173,7 @@ static inline void v_setr64(v_uint8x32& val_0, v_uint8x32& val_1,v_uint8x32& val *reinterpret_cast(&tmp[4 * mapsx[x + 3 * shift + 3]])); } -static inline v_uint8x32 v_permutevar8x32(v_uint8x32& a, v_uint32x8& idxs) +static inline v_uint8x32 v_permute32(v_uint8x32& a, v_uint32x8& idxs) { return v_uint8x32(_mm256_permutevar8x32_epi32(a.val, idxs.val)); } diff --git a/inference-engine/thirdparty/ocv/opencv_hal_avx512.hpp b/inference-engine/thirdparty/ocv/opencv_hal_avx512.hpp index 1f786b7bebb..73808dec73d 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_avx512.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_avx512.hpp @@ -3033,23 +3033,6 @@ static inline v_int32x16 v_madd(const v_int16x32& a, const v_int16x32& b) return r; } -// This function call non-existing intrinsic _mm512_setr_epi8(). 
-#if 0 -static inline void v_deinterleave_expand(const v_uint8x64& src, v_int16x32& even, v_int16x32& odd) -{ - static const __m512i mask_even = - _mm512_setr_epi8(0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, - -1, 12, -1, 14, -1, 16, -1, 18, -1, 20, - -1, 22, -1, 24, -1, 26, -1, 28, -1, 30, -1); - static const __m512i mask_odd = - _mm512_setr_epi8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, - -1, 13, -1, 15, -1, 17, -1, 19, -1, 21, - -1, 23, -1, 25, -1, 27, -1, 29, -1, 31, -1); - - even.val = _mm512_shuffle_epi8(src.val, mask_even); - odd .val = _mm512_shuffle_epi8(src.val, mask_odd); -} -#endif static inline v_int16x32 v_mulhi(const v_int16x32& a, short b) { v_int16x32 r; @@ -3125,7 +3108,6 @@ static inline v_uint8x64 v_packus(const v_int16x32& a, const v_int16x32& b) return v_uint8x64(_mm512_packus_epi16(a.val, b.val)); } - #define word(b0, b1, b2, b3) \ (((uint32_t)((uint8_t)(b0)) << 0*8) \ | ((uint32_t)((uint8_t)(b1)) << 1*8) \ @@ -3154,6 +3136,26 @@ static inline v_uint8x64 v_setr_s8(char b0, char b1, char b2, char b3, char b4, word(b60, b61, b62, b63))); } +static inline void v_deinterleave_expand(const v_uint8x64& src, v_int16x32& even, v_int16x32& odd) +{ + v_uint8x64 mask_even = v_setr_s8(0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, + 12, -1, 14, -1, 16, -1, 18, -1, 20, -1, 22, + -1, 24, -1, 26, -1, 28, -1, 30, -1, 32, -1, + 34, -1, 36, -1, 38, -1, 40, -1, 42, -1, 44, + -1, 46, -1, 48, -1, 50, -1, 52, -1, 54, -1, + 56, -1, 58, -1, 60, -1, 62, -1); + + v_uint8x64 mask_odd = v_setr_s8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, + 13, -1, 15, -1, 17, -1, 19, -1, 21, -1, 23, + -1, 25, -1, 27, -1, 29, -1, 31, -1, 33, -1, + 35, -1, 37, -1, 39, -1, 41, -1, 43, -1, 45, + -1, 47, -1, 49, -1, 51, -1, 53, -1, 55, -1, + 57, -1, 59, -1, 61, -1, 63, -1); + + even.val = _mm512_shuffle_epi8(src.val, mask_even.val); + odd .val = _mm512_shuffle_epi8(src.val, mask_odd.val); +} + static inline v_uint64x8 v_set_s64(int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0) { return v_uint64x8(_mm512_set_epi64(b7, b6, b5, b4, b3, b2, b1, b0)); @@ -3173,11 +3175,11 @@ static inline v_int16x32 v_load_ccache_expand(const uchar* ptr) { return v_int16x32(_mm512_cvtepu8_epi16(_mm256_lddqu_si256((const __m256i*)ptr))); \ } -static inline __m512i v512_insert_epi16(__m512i target, const uchar x, const int index) +static inline __m512i v512_insert_epi16(__m512i& target, const ushort x, const int index) { return _mm512_mask_set1_epi16(target, 1UL << index, x); } -static inline __m512i v512_insert_epi32(__m512i target, const int32_t x, const int index) +static inline __m512i v512_insert_epi32(__m512i& target, const int32_t x, const int index) { return _mm512_mask_set1_epi32(target, 1UL << index, x); } @@ -3214,16 +3216,63 @@ static inline v_uint8x64 v_permutex2_s64(const v_uint8x64& a, const v_uint8x64& return v_uint8x64(_mm512_permutex2var_epi64(a.val, idxs.val, b.val)); } -static inline v_uint8x64 v_permutex_s32(const v_uint8x64& a, const v_uint64x8 idxs) +static inline v_uint8x64 v_permute32(const v_uint8x64& a, const v_uint64x8& idxs) { return v_uint8x64(_mm512_permutexvar_epi32(idxs.val, a.val)); } -static inline v_uint8x64 v_permutex2_s32(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16 idxs) +static inline v_uint8x64 v_permutex2_s32(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& idxs) { return v_uint8x64(_mm512_permutex2var_epi32(a.val, idxs.val, b.val)); } +static inline v_uint8x64 v_permute32(const v_uint8x64& a, const v_uint32x16& idxs) +{ + return 
v_uint8x64(_mm512_permutexvar_epi32(idxs.val, a.val)); +} + +static inline void v_set(v_uint8x64& val_0, v_uint8x64& val_1, + v_uint8x64& val_2, v_uint8x64& val_3, + uint8_t tmp[], const short mapsx[], + int x, int shift) +{ + val_0.val = _mm512_setr_epi64(*reinterpret_cast(&tmp[4 * (*(mapsx + x + 0))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 1))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 2))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 3))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 4))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 5))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 6))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 7))])); + + val_1.val = _mm512_setr_epi64(*reinterpret_cast(&tmp[4 * (*(mapsx + x + shift + 0))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + shift + 1))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + shift + 2))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + shift + 3))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + shift + 4))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + shift + 5))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + shift + 6))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + shift + 7))])); + + val_2.val = _mm512_setr_epi64(*reinterpret_cast(&tmp[4 * (*(mapsx + x + 2 * shift + 0))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 2 * shift + 1))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 2 * shift + 2))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 2 * shift + 3))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 2 * shift + 4))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 2 * shift + 5))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 2 * shift + 6))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 2 * shift + 7))])); + + val_3.val = _mm512_setr_epi64(*reinterpret_cast(&tmp[4 * (*(mapsx + x + 3 * shift + 0))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 3 * shift + 1))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 3 * shift + 2))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 3 * shift + 3))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 3 * shift + 4))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 3 * shift + 5))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 3 * shift + 6))]), + *reinterpret_cast(&tmp[4 * (*(mapsx + x + 3 * shift + 7))])); +} + #if defined(__GNUC__) int _mm512_cvtsi512_si32(__m512i a) @@ -3246,6 +3295,46 @@ static inline int v512_extract_epi16(__m512i target) return (v512_extract_epi32(target) >> (index % 2 ? 
16 : 0)) & 0xFFFF; } +static inline v_uint8x64 v_gather_pairs(const uchar src[], const v_int16x32& index) { + v_uint8x64 r; + + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<0>(index.val)]), 0); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<1>(index.val)]), 1); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<2>(index.val)]), 2); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<3>(index.val)]), 3); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<4>(index.val)]), 4); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<5>(index.val)]), 5); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<6>(index.val)]), 6); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<7>(index.val)]), 7); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<8>(index.val)]), 8); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<9>(index.val)]), 9); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<10>(index.val)]), 10); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<11>(index.val)]), 11); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<12>(index.val)]), 12); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<13>(index.val)]), 13); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<14>(index.val)]), 14); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<15>(index.val)]), 15); + + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<16>(index.val)]), 16); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<17>(index.val)]), 17); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<18>(index.val)]), 18); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<19>(index.val)]), 19); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<20>(index.val)]), 20); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<21>(index.val)]), 21); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<22>(index.val)]), 22); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<23>(index.val)]), 23); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<24>(index.val)]), 24); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<25>(index.val)]), 25); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<26>(index.val)]), 26); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<27>(index.val)]), 27); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<28>(index.val)]), 28); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<29>(index.val)]), 29); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<30>(index.val)]), 30); + r.val = v512_insert_epi16(r.val, *reinterpret_cast(&src[v512_extract_epi16<31>(index.val)]), 31); + + return r; +} + namespace { template static inline v_int16x32 v_gather_chan(const uchar src[], const v_int16x32& index, int channel, int pos) {