AVX2/AVX512 32FC1 Resize (#1788)
parent f60b46f3d4
commit efad27d68c
@@ -557,6 +557,18 @@ void copyRow_32F(const float in[], float out[], int length) {
    copyRow_32F_impl(in, out, length);
}

void calcRowLinear_32F(float *dst[],
                       const float *src0[],
                       const float *src1[],
                       const float alpha[],
                       const int mapsx[],
                       const float beta[],
                       const Size& inSz,
                       const Size& outSz,
                       int lpi) {
    calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}

} // namespace avx
} // namespace kernels
} // namespace gapi

@@ -639,6 +639,18 @@ void copyRow_32F(const float in[], float out[], int length) {
    copyRow_32F_impl(in, out, length);
}

void calcRowLinear_32F(float *dst[],
                       const float *src0[],
                       const float *src1[],
                       const float alpha[],
                       const int mapsx[],
                       const float beta[],
                       const Size& inSz,
                       const Size& outSz,
                       int lpi) {
    calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}

} // namespace avx512
} // namespace kernels
} // namespace gapi

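Both ISA-specific entry points above are now thin wrappers over one shared calcRowLinear_32FC1, written once against OpenCV's width-agnostic universal intrinsics and compiled separately per target, so the vector width falls out of v_float32::nlanes (4/8/16 for SSE/AVX2/AVX512). A minimal sketch of that single-source pattern, assuming the classic universal-intrinsics API from opencv2/core/hal/intrin.hpp (the helper name scale_row is illustrative, not part of the patch):

#include <opencv2/core/hal/intrin.hpp>  // OpenCV "universal intrinsics"

// Illustrative only: the same source compiles to SSE/AVX2/AVX512 code in
// per-ISA translation units; only v_float32::nlanes changes.
static inline void scale_row(float dst[], const float src[], float k, int length) {
    int x = 0;
#if CV_SIMD
    const int nlanes = cv::v_float32::nlanes;
    cv::v_float32 vk = cv::vx_setall_f32(k);
    for (; x <= length - nlanes; x += nlanes)
        cv::vx_store(&dst[x], cv::vx_load(&src[x]) * vk);
#endif
    for (; x < length; ++x)  // scalar tail, same shape as the loops in this patch
        dst[x] = src[x] * k;
}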
@@ -892,130 +892,15 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,

// Resize (bi-linear, 32F)
void calcRowLinear_32F(float *dst[],
                       const float *src0[],
                       const float *src1[],
                       const float alpha[],
                       const int mapsx[],
                       const float beta[],
                       const Size & inSz,
                       const Size & outSz,
                       int lpi) {
    bool xRatioEq1 = inSz.width == outSz.width;
    bool yRatioEq1 = inSz.height == outSz.height;

    if (!xRatioEq1 && !yRatioEq1) {
        for (int l = 0; l < lpi; l++) {
            float beta0 = beta[l];
            float beta1 = 1 - beta0;

            int x = 0;

#if CV_SIMD128
            for (; x <= outSz.width - 4; x += 4) {
                v_float32x4 alpha0 = v_load(&alpha[x]);
                // v_float32x4 alpha1 = 1.f - alpha0;

                v_int32x4 sx = v_load(&mapsx[x]);

                v_float32x4 s0l, s0h, s00, s01;
                v_gather_pairs(src0[l], sx, s0l, s0h);
                v_deinterleave(s0l, s0h, s00, s01);

                // v_float32x4 res0 = s00*alpha0 + s01*alpha1;
                v_float32x4 res0 = v_fma(s00 - s01, alpha0, s01);

                v_float32x4 s1l, s1h, s10, s11;
                v_gather_pairs(src1[l], sx, s1l, s1h);
                v_deinterleave(s1l, s1h, s10, s11);

                // v_float32x4 res1 = s10*alpha0 + s11*alpha1;
                v_float32x4 res1 = v_fma(s10 - s11, alpha0, s11);

                // v_float32x4 d = res0*beta0 + res1*beta1;
                v_float32x4 d = v_fma(res0 - res1, beta0, res1);

                v_store(&dst[l][x], d);
            }
#endif

            for (; x < outSz.width; x++) {
                float alpha0 = alpha[x];
                float alpha1 = 1 - alpha0;
                int sx0 = mapsx[x];
                int sx1 = sx0 + 1;
                float res0 = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
                float res1 = src1[l][sx0]*alpha0 + src1[l][sx1]*alpha1;
                dst[l][x] = beta0*res0 + beta1*res1;
            }
        }

    } else if (!xRatioEq1) {
        GAPI_DbgAssert(yRatioEq1);

        for (int l = 0; l < lpi; l++) {
            int x = 0;

#if CV_SIMD128
            for (; x <= outSz.width - 4; x += 4) {
                v_float32x4 alpha0 = v_load(&alpha[x]);
                // v_float32x4 alpha1 = 1.f - alpha0;

                v_int32x4 sx = v_load(&mapsx[x]);

                v_float32x4 s0l, s0h, s00, s01;
                v_gather_pairs(src0[l], sx, s0l, s0h);
                v_deinterleave(s0l, s0h, s00, s01);

                // v_float32x4 d = s00*alpha0 + s01*alpha1;
                v_float32x4 d = v_fma(s00 - s01, alpha0, s01);

                v_store(&dst[l][x], d);
            }
#endif

            for (; x < outSz.width; x++) {
                float alpha0 = alpha[x];
                float alpha1 = 1 - alpha0;
                int sx0 = mapsx[x];
                int sx1 = sx0 + 1;
                dst[l][x] = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
            }
        }

    } else if (!yRatioEq1) {
        GAPI_DbgAssert(xRatioEq1);
        int length = inSz.width;  // == outSz.width

        for (int l = 0; l < lpi; l++) {
            float beta0 = beta[l];
            float beta1 = 1 - beta0;

            int x = 0;

#if CV_SIMD128
            for (; x <= length - 4; x += 4) {
                v_float32x4 s0 = v_load(&src0[l][x]);
                v_float32x4 s1 = v_load(&src1[l][x]);

                // v_float32x4 d = s0*beta0 + s1*beta1;
                v_float32x4 d = v_fma(s0 - s1, beta0, s1);

                v_store(&dst[l][x], d);
            }
#endif

            for (; x < length; x++) {
                dst[l][x] = beta0*src0[l][x] + beta1*src1[l][x];
            }
        }

    } else {
        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
        int length = inSz.width;  // == outSz.width
        for (int l = 0; l < lpi; l++) {
            memcpy(dst[l], src0[l], length * sizeof(float));
        }
    }
}
void calcRowLinear_32F(float *dst[],
                       const float *src0[],
                       const float *src1[],
                       const float alpha[],
                       const int mapsx[],
                       const float beta[],
                       const Size& inSz,
                       const Size& outSz,
                       int lpi) {
    calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi);
}

//------------------------------------------------------------------------------

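Both the removed per-ISA body and the shared implementation it now delegates to lean on one algebraic rewrite, visible in every commented-out line above: the two-multiply blend a*alpha + b*(1-alpha) expands to a*alpha + b - b*alpha = (a-b)*alpha + b, which is a single fused multiply-add. A small self-contained scalar check of the identity (not from the patch):

#include <cassert>
#include <cmath>

// Scalar check of the lerp-as-FMA rewrite used throughout this commit:
//   a*alpha + b*(1 - alpha)  ==  fma(a - b, alpha, b)
int main() {
    float a = 3.5f, b = -1.25f;
    for (float alpha = 0.f; alpha <= 1.f; alpha += 0.125f) {
        float two_mul = a * alpha + b * (1.f - alpha);
        float one_fma = std::fma(a - b, alpha, b);
        assert(std::fabs(two_mul - one_fma) < 1e-6f);
    }
    return 0;
}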
@@ -927,6 +927,17 @@ static void calcRowLinear(const cv::gapi::fluid::View & in,
            return;
        }
    }

    if (std::is_same<T, float>::value) {
        avx512::calcRowLinear_32F(reinterpret_cast<float**>(dst),
                                  reinterpret_cast<const float**>(src0),
                                  reinterpret_cast<const float**>(src1),
                                  reinterpret_cast<const float*>(alpha),
                                  reinterpret_cast<const int*>(mapsx),
                                  reinterpret_cast<const float*>(beta),
                                  inSz, outSz, lpi);
        return;
    }
}
#endif

@@ -947,6 +958,17 @@ static void calcRowLinear(const cv::gapi::fluid::View & in,
            return;
        }
    }

    if (std::is_same<T, float>::value) {
        avx::calcRowLinear_32F(reinterpret_cast<float**>(dst),
                               reinterpret_cast<const float**>(src0),
                               reinterpret_cast<const float**>(src1),
                               reinterpret_cast<const float*>(alpha),
                               reinterpret_cast<const int*>(mapsx),
                               reinterpret_cast<const float*>(beta),
                               inSz, outSz, lpi);
        return;
    }
}
#endif

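The reinterpret_cast chain in these two dispatch hunks exists because the template body must compile for every T it is instantiated with; on the taken branch the casts are no-ops. A hedged sketch of the same dispatch under C++17, where "if constexpr" discards the non-matching branch at compile time so no casts are needed (hypothetical refactor, helper names are not from this codebase):

#include <cstdint>
#include <type_traits>

static void rowLinear8U(uint8_t* /*dst*/) { /* 8-bit path  */ }
static void rowLinear32F(float* /*dst*/)  { /* float path  */ }

// The untaken branch is never type-checked against the wrong T,
// so rowLinear32F can take float* directly.
template <typename T>
static void dispatchRow(T* dst) {
    if constexpr (std::is_same_v<T, float>)
        rowLinear32F(dst);
    else
        rowLinear8U(dst);
}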
@@ -783,6 +783,134 @@ inline void copyRow_32F_impl(const float in[], float out[], int length) {
    }
}

// Resize (bi-linear, 32FC1)
static inline void calcRowLinear_32FC1(float *dst[],
                                       const float *src0[],
                                       const float *src1[],
                                       const float alpha[],
                                       const int mapsx[],
                                       const float beta[],
                                       const Size& inSz,
                                       const Size& outSz,
                                       int lpi) {
    bool xRatioEq1 = inSz.width == outSz.width;
    bool yRatioEq1 = inSz.height == outSz.height;

#if CPU_SIMD
    const int nlanes = v_float32::nlanes;
#endif

    if (!xRatioEq1 && !yRatioEq1) {
        for (int line = 0; line < lpi; ++line) {
            float beta0 = beta[line];
            float beta1 = 1 - beta0;

            int x = 0;

#if CPU_SIMD
            for (; x <= outSz.width - nlanes; x += nlanes) {
                v_float32 alpha0 = vx_load(&alpha[x]);
                // v_float32 alpha1 = 1.f - alpha0;

                v_float32 low1, high1, s00, s01;
                v_gather_pairs(src0[line], mapsx, x, low1, high1);
                v_deinterleave(low1, high1, s00, s01);

                // v_float32 res0 = s00*alpha0 + s01*alpha1;
                v_float32 res0 = v_fma(s00 - s01, alpha0, s01);

                v_float32 low2, high2, s10, s11;
                v_gather_pairs(src1[line], mapsx, x, low2, high2);
                v_deinterleave(low2, high2, s10, s11);

                // v_float32 res1 = s10*alpha0 + s11*alpha1;
                v_float32 res1 = v_fma(s10 - s11, alpha0, s11);

                // v_float32 d = res0*beta0 + res1*beta1;
                v_float32 d = v_fma(res0 - res1, beta0, res1);

                vx_store(&dst[line][x], d);
            }
#endif

            for (; x < outSz.width; ++x) {
                float alpha0 = alpha[x];
                float alpha1 = 1 - alpha0;
                int sx0 = mapsx[x];
                int sx1 = sx0 + 1;
                float res0 = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
                float res1 = src1[line][sx0] * alpha0 + src1[line][sx1] * alpha1;
                dst[line][x] = beta0 * res0 + beta1 * res1;
            }
        }

    } else if (!xRatioEq1) {
        GAPI_DbgAssert(yRatioEq1);

        for (int line = 0; line < lpi; ++line) {
            int x = 0;

#if CPU_SIMD
            for (; x <= outSz.width - nlanes; x += nlanes) {
                v_float32 alpha0 = vx_load(&alpha[x]);
                // v_float32 alpha1 = 1.f - alpha0;

                v_float32 low, high, s00, s01;
                v_gather_pairs(src0[line], mapsx, x, low, high);
                v_deinterleave(low, high, s00, s01);

                // v_float32 d = s00*alpha0 + s01*alpha1;
                v_float32 d = v_fma(s00 - s01, alpha0, s01);

                vx_store(&dst[line][x], d);
            }
#endif

            for (; x < outSz.width; ++x) {
                float alpha0 = alpha[x];
                float alpha1 = 1 - alpha0;
                int sx0 = mapsx[x];
                int sx1 = sx0 + 1;
                dst[line][x] = src0[line][sx0] * alpha0 + src0[line][sx1] * alpha1;
            }
        }

    } else if (!yRatioEq1) {
        GAPI_DbgAssert(xRatioEq1);
        int length = inSz.width;  // == outSz.width

        for (int line = 0; line < lpi; ++line) {
            float beta0 = beta[line];
            float beta1 = 1 - beta0;

            int x = 0;

#if CPU_SIMD
            for (; x <= length - nlanes; x += nlanes) {
                v_float32 s0 = vx_load(&src0[line][x]);
                v_float32 s1 = vx_load(&src1[line][x]);

                // v_float32 d = s0*beta0 + s1*beta1;
                v_float32 d = v_fma(s0 - s1, beta0, s1);

                vx_store(&dst[line][x], d);
            }
#endif

            for (; x < length; ++x) {
                dst[line][x] = beta0 * src0[line][x] + beta1 * src1[line][x];
            }
        }

    } else {
        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
        int length = inSz.width;  // == outSz.width
        for (int line = 0; line < lpi; ++line) {
            memcpy(dst[line], src0[line], length * sizeof(float));
        }
    }
}

} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

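calcRowLinear_32FC1 splits into four cases on (xRatioEq1, yRatioEq1): full bilinear, horizontal-only, vertical-only, and a plain row copy. In each SIMD branch the scalar remainder loop doubles as the specification of the kernel; the vector path matches it up to FMA rounding (one rounding step instead of two). A compact scalar reference for one output row of the general case, useful for checking the SIMD path (the helper name rowLinearRef and its flat-row layout are illustrative, not from the patch):

// srcTop/srcBot: the two source rows bracketing the output row;
// mapsx[x]: left source column; alpha[x]: its horizontal weight;
// beta0: vertical weight of the top row.
static void rowLinearRef(float dst[], const float srcTop[], const float srcBot[],
                         const float alpha[], const int mapsx[],
                         float beta0, int outWidth) {
    float beta1 = 1.f - beta0;
    for (int x = 0; x < outWidth; ++x) {
        int sx = mapsx[x];
        float a0 = alpha[x], a1 = 1.f - a0;
        float top = srcTop[sx] * a0 + srcTop[sx + 1] * a1;  // horizontal blend, top
        float bot = srcBot[sx] * a0 + srcBot[sx + 1] * a1;  // horizontal blend, bottom
        dst[x] = beta0 * top + beta1 * bot;                 // vertical blend
    }
}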
@@ -1317,16 +1317,16 @@ OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
////////// Other math /////////

/** Some frequent operations **/
#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
static inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val));} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
inline _Tpvec v_sqrt(const _Tpvec& x) \
{ return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_fma(a, a, b * b); } \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ return v_sqrt(v_fma(a, a, b*b)); }

OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)

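In the macro above, v_fma and v_muladd both lower to a single _mm256_fmadd_ps: d = a*b + c per float lane with one rounding step. The added `static` gives v_fma internal linkage, plausibly to keep these per-backend headers from colliding with other definitions at link time (an assumption; the commit does not say). A stand-alone demo of the underlying intrinsic (compile with -mavx2 -mfma):

#include <immintrin.h>
#include <cstdio>

int main() {
    __m256 a = _mm256_set1_ps(2.0f);
    __m256 b = _mm256_set1_ps(3.0f);
    __m256 c = _mm256_set1_ps(1.0f);
    __m256 d = _mm256_fmadd_ps(a, b, c);  // 2*3 + 1 = 7 in every lane
    float out[8];
    _mm256_storeu_ps(out, d);
    printf("%g\n", out[0]);  // prints 7
    return 0;
}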
@@ -1947,6 +1947,18 @@ static inline v_uint8x32 v_blend_shiftright(const v_uint8x32& a, const v_uint8x32& b)
    return v_uint8x32(_mm256_blend_epi16(_mm256_srli_si256(a.val, shift), b.val, mask));
}

template<int mask, int shift>
static inline __m256 v_blend_shiftleft(const v_float32x8& a, const v_float32x8& b)
{
    return _mm256_castsi256_ps(_mm256_blend_epi32(_mm256_castps_si256(a.val), _mm256_slli_si256(_mm256_castps_si256(b.val), shift), mask));
}

template<int mask, int shift>
static inline __m256 v_blend_shiftright(const v_float32x8& a, const v_float32x8& b)
{
    return _mm256_castsi256_ps(_mm256_blend_epi32(_mm256_srli_si256(_mm256_castps_si256(a.val), shift), _mm256_castps_si256(b.val), mask));
}

static inline v_uint8x32 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
                                   char b5, char b6, char b7, char b8, char b9,
                                   char b10, char b11, char b12, char b13, char b14,

@@ -3002,8 +3014,10 @@ static inline void v_deinterleave(const v_float32x8& low, const v_float32x8& high,
{
    __m256 tmp0 = _mm256_unpacklo_ps(low.val, high.val);
    __m256 tmp1 = _mm256_unpackhi_ps(low.val, high.val);
    even.val = _mm256_unpacklo_ps(tmp0, tmp1);
    odd .val = _mm256_unpackhi_ps(tmp0, tmp1);
    __m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
    __m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
    even.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp2), 216 /*11011000*/));
    odd.val = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(tmp3), 216 /*11011000*/));
}

static inline void v_deinterleave(const v_uint8x32& v0, const v_uint8x32& v1,

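This hunk fixes a classic AVX2 pitfall: _mm256_unpacklo_ps/_mm256_unpackhi_ps interleave within each 128-bit lane, so the old two-unpack version left even/odd elements shuffled across lanes. The added _mm256_permute4x64_epi64 with control 216 = 11011000b reorders the four 64-bit chunks as (0, 2, 1, 3), restoring full-register order. A self-contained check of the fixed sequence (compile with -mavx2; pair inputs a_i = i, b_i = 100 + i):

#include <immintrin.h>
#include <cstdio>

int main() {
    alignas(32) float lo[8] = {0, 100, 1, 101, 2, 102, 3, 103};  // a0,b0..a3,b3
    alignas(32) float hi[8] = {4, 104, 5, 105, 6, 106, 7, 107};  // a4,b4..a7,b7
    __m256 low = _mm256_load_ps(lo), high = _mm256_load_ps(hi);

    __m256 t0 = _mm256_unpacklo_ps(low, high);  // per-lane interleave only
    __m256 t1 = _mm256_unpackhi_ps(low, high);
    __m256 t2 = _mm256_unpacklo_ps(t0, t1);     // [a0 a1 a4 a5 | a2 a3 a6 a7]
    __m256 t3 = _mm256_unpackhi_ps(t0, t1);     // [b0 b1 b4 b5 | b2 b3 b6 b7]
    // 0xD8 selects 64-bit chunks (0,2,1,3): the cross-lane fix
    __m256 even = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(t2), 0xD8));
    __m256 odd  = _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(t3), 0xD8));

    alignas(32) float e[8], o[8];
    _mm256_store_ps(e, even);
    _mm256_store_ps(o, odd);
    for (int i = 0; i < 8; ++i)
        printf("%g %g\n", e[i], o[i]);  // 0..7 and 100..107, in order
    return 0;
}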
@@ -3215,6 +3229,18 @@ static inline v_uint8x32 v_gather_pairs(const uchar src[], const v_int16x16& index)
    return r;
}

static inline void v_gather_pairs(const float src[], const int mapsx[], int x,
                                  v_float32x8& low, v_float32x8& high) {
    low.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[x + 0]]),
                                                     *reinterpret_cast<const int64_t*>(&src[mapsx[x + 1]]),
                                                     *reinterpret_cast<const int64_t*>(&src[mapsx[x + 2]]),
                                                     *reinterpret_cast<const int64_t*>(&src[mapsx[x + 3]])));
    high.val = _mm256_castsi256_ps(_mm256_setr_epi64x(*reinterpret_cast<const int64_t*>(&src[mapsx[x + 4]]),
                                                      *reinterpret_cast<const int64_t*>(&src[mapsx[x + 5]]),
                                                      *reinterpret_cast<const int64_t*>(&src[mapsx[x + 6]]),
                                                      *reinterpret_cast<const int64_t*>(&src[mapsx[x + 7]])));
}

namespace {
template<int chanNum>
static inline v_int16x16 v_gather_chan(const uchar src[], const v_int16x16& index, int channel, int pos) {

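Each mapsx[x + i] names the left pixel of a horizontally adjacent pair, so the gather reads src[i] and src[i+1] together as one 64-bit scalar. The reinterpret_cast form above relies on the compiler tolerating the float/int64 aliasing; a memcpy-based load is the strictly conforming equivalent and typically compiles to the same single 8-byte move (sketch, helper name not from the patch):

#include <cstdint>
#include <cstring>

// Alias-safe equivalent of one pair load in v_gather_pairs.
static inline int64_t load_pair_bits(const float* src, int i) {
    int64_t bits;
    std::memcpy(&bits, src + i, sizeof(bits));  // src[i], src[i+1] as raw bits
    return bits;
}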
@@ -2955,41 +2955,11 @@ inline void v512_cleanup() { _mm256_zeroall(); }
static inline void v_deinterleave(const v_float32x16& low, const v_float32x16& high,
                                  v_float32x16& even, v_float32x16& odd)
{
    __m512 tmp0 = _mm512_unpacklo_ps(low.val, high.val);
    __m512 tmp1 = _mm512_unpackhi_ps(low.val, high.val);
    even.val = _mm512_unpacklo_ps(tmp0, tmp1);
    odd .val = _mm512_unpackhi_ps(tmp0, tmp1);
}

static inline void v_deinterleave(const v_uint8x64& i0, const v_uint8x64& i1,
                                  const v_uint8x64& i2, const v_uint8x64& i3,
                                  v_uint8x64& o0, v_uint8x64& o1,
                                  v_uint8x64& o2, v_uint8x64& o3)
{
    __m512i u0 = i0.val;  // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m512i u1 = i1.val;  // a4 b4 c4 d4 ...
    __m512i u2 = i2.val;  // a8 b8 c8 d8 ...
    __m512i u3 = i3.val;  // a12 b12 c12 d12 ...

    __m512i v0 = _mm512_unpacklo_epi8(u0, u2);  // a0 a8 b0 b8 ...
    __m512i v1 = _mm512_unpackhi_epi8(u0, u2);  // a2 a10 b2 b10 ...
    __m512i v2 = _mm512_unpacklo_epi8(u1, u3);  // a4 a12 b4 b12 ...
    __m512i v3 = _mm512_unpackhi_epi8(u1, u3);  // a6 a14 b6 b14 ...

    u0 = _mm512_unpacklo_epi8(v0, v2);  // a0 a4 a8 a12 ...
    u1 = _mm512_unpacklo_epi8(v1, v3);  // a2 a6 a10 a14 ...
    u2 = _mm512_unpackhi_epi8(v0, v2);  // a1 a5 a9 a13 ...
    u3 = _mm512_unpackhi_epi8(v1, v3);  // a3 a7 a11 a15 ...

    v0 = _mm512_unpacklo_epi8(u0, u1);  // a0 a2 a4 a6 ...
    v1 = _mm512_unpacklo_epi8(u2, u3);  // a1 a3 a5 a7 ...
    v2 = _mm512_unpackhi_epi8(u0, u1);  // c0 c2 c4 c6 ...
    v3 = _mm512_unpackhi_epi8(u2, u3);  // c1 c3 c5 c7 ...

    o0.val = _mm512_unpacklo_epi8(v0, v1);  // a0 a1 a2 a3 ...
    o1.val = _mm512_unpackhi_epi8(v0, v1);  // b0 b1 b2 b3 ...
    o2.val = _mm512_unpacklo_epi8(v2, v3);  // c0 c1 c2 c3 ...
    o3.val = _mm512_unpackhi_epi8(v2, v3);  // d0 d1 d2 d3 ...
}
static inline void v_deinterleave(const v_float32x16& low, const v_float32x16& high,
                                  v_float32x16& even, v_float32x16& odd)
{
    __m512i permute_mask1 = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
    __m512i permute_mask2 = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);

    even.val = _mm512_permutex2var_ps(low.val, permute_mask1, high.val);
    odd.val = _mm512_permutex2var_ps(low.val, permute_mask2, high.val);
}

static inline v_uint8x64 v_interleave_low(const v_uint8x64& a, const v_uint8x64& b)

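On AVX-512, _mm512_permutex2var_ps treats (low, high) as one 32-element table and selects any 16 floats by index, so the even/odd split falls out of two index vectors and two instructions, replacing the whole unpack cascade and its lane-ordering pitfalls. A stand-alone demo (compile with -mavx512f):

#include <immintrin.h>
#include <cstdio>

int main() {
    alignas(64) float buf[32];
    for (int i = 0; i < 32; ++i) buf[i] = (float)i;  // 0,1,2,...,31
    __m512 low  = _mm512_load_ps(buf);
    __m512 high = _mm512_load_ps(buf + 16);

    // Indices 0..15 pick from `low`, 16..31 from `high`.
    __m512i even_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
                                         16, 18, 20, 22, 24, 26, 28, 30);
    __m512 even = _mm512_permutex2var_ps(low, even_idx, high);

    alignas(64) float out[16];
    _mm512_store_ps(out, even);
    for (int i = 0; i < 16; ++i)
        printf("%g ", out[i]);  // 0 2 4 ... 30
    return 0;
}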
@@ -3093,14 +3063,14 @@ template<int mask, int shift>
static inline v_uint8x64 v_mask_blend_shiftleft(const v_uint8x64& a, const v_uint8x64& b)
{
    return v_uint8x64(_mm512_mask_blend_epi16(mask,
                                              a.val, _mm512_bslli_epi128(b.val, shift)));
}

template<int mask, int shift>
static inline v_uint8x64 v_mask_blend_shiftright(const v_uint8x64& a, const v_uint8x64& b)
{
    return v_uint8x64(_mm512_mask_blend_epi16(mask,
                                              _mm512_bsrli_epi128(a.val, shift), b.val));
}

static inline v_uint8x64 v_packus(const v_int16x32& a, const v_int16x32& b)

@@ -3115,25 +3085,25 @@ static inline v_uint8x64 v_packus(const v_int16x32& a, const v_int16x32& b)
            | ((uint32_t)((uint8_t)(b3)) << 3*8))

static inline v_uint8x64 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
                                   char b5, char b6, char b7, char b8, char b9,
                                   char b10, char b11, char b12, char b13, char b14,
                                   char b15, char b16, char b17, char b18, char b19,
                                   char b20, char b21, char b22, char b23, char b24,
                                   char b25, char b26, char b27, char b28, char b29,
                                   char b30, char b31, char b32, char b33, char b34,
                                   char b35, char b36, char b37, char b38, char b39,
                                   char b40, char b41, char b42, char b43, char b44,
                                   char b45, char b46, char b47, char b48, char b49,
                                   char b50, char b51, char b52, char b53, char b54,
                                   char b55, char b56, char b57, char b58, char b59,
                                   char b60, char b61, char b62, char b63)
{
    return v_uint8x64(_mm512_setr_epi32(word(b0, b1, b2, b3), word(b4, b5, b6, b7), word(b8, b9, b10, b11),
                                        word(b12, b13, b14, b15), word(b16, b17, b18, b19), word(b20, b21, b22, b23),
                                        word(b24, b25, b26, b27), word(b28, b29, b30, b31), word(b32, b33, b34, b35),
                                        word(b36, b37, b38, b39), word(b40, b41, b42, b43), word(b44, b45, b46, b47),
                                        word(b48, b49, b50, b51), word(b52, b53, b54, b55), word(b56, b57, b58, b59),
                                        word(b60, b61, b62, b63)));
}

static inline void v_deinterleave_expand(const v_uint8x64& src, v_int16x32& even, v_int16x32& odd)

@@ -3145,15 +3115,15 @@ static inline void v_deinterleave_expand(const v_uint8x64& src, v_int16x32& even, v_int16x32& odd)
                                    -1, 46, -1, 48, -1, 50, -1, 52, -1, 54, -1,
                                    56, -1, 58, -1, 60, -1, 62, -1);

    v_uint8x64 mask_odd = v_setr_s8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1,
                                    13, -1, 15, -1, 17, -1, 19, -1, 21, -1, 23,
                                    -1, 25, -1, 27, -1, 29, -1, 31, -1, 33, -1,
                                    35, -1, 37, -1, 39, -1, 41, -1, 43, -1, 45,
                                    -1, 47, -1, 49, -1, 51, -1, 53, -1, 55, -1,
                                    57, -1, 59, -1, 61, -1, 63, -1);

    even.val = _mm512_shuffle_epi8(src.val, mask_even.val);
    odd .val = _mm512_shuffle_epi8(src.val, mask_odd.val);
    odd.val = _mm512_shuffle_epi8(src.val, mask_odd.val);
}

static inline v_uint64x8 v_set_s64(int b7, int b6, int b5, int b4, int b3, int b2, int b1, int b0)

@@ -3167,25 +3137,39 @@ static inline v_uint32x16 v_set_s32(int b15, int b14, int b13, int b12, int b11,
    return v_uint32x16(_mm512_set_epi32(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0));
}

static inline v_uint32x16 v_setr_s32(int b1, int b2, int b3, int b4, int b5, int b6, int b7, int b8,
                                     int b9, int b10, int b11, int b12, int b13, int b14, int b15, int b16)
{
    return v_uint32x16(_mm512_setr_epi32(b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15, b16));
}

static inline v_uint8x64 v_shuffle_s8(const v_uint8x64& a, const v_uint8x64& mask)
{
    return v_uint8x64(_mm512_shuffle_epi8(a.val, mask.val));
}

static inline v_int16x32 v_load_ccache_expand(const uchar* ptr)
{
    return v_int16x32(_mm512_cvtepu8_epi16(_mm256_lddqu_si256((const __m256i*)ptr)));
}

static inline __m512i v512_insert_epi16(__m512i& target, const ushort x, const int index)
{
    return _mm512_mask_set1_epi16(target, 1UL << index, x);
}

static inline __m512i v512_insert_epi32(__m512i& target, const int32_t x, const int index)
{
    return _mm512_mask_set1_epi32(target, 1UL << index, x);
}

static inline __m512i v512_insert_epi64(__m512i& target, const int64_t x, const int index)
{
    return _mm512_mask_set1_epi64(target, 1UL << index, x);
}

static inline void v_gather_channel(v_uint8x64& vec, const uint8_t tmp[], const short mapsx[],
                                    int chanNum, int c, int x, int shift)
{
    __m256i vec1 = _mm256_setzero_si256();
    __m256i vec2 = _mm256_setzero_si256();

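AVX-512 has no direct per-element insert for wide vectors; the v512_insert_epi* helpers above emulate it with a masked broadcast: _mm512_mask_set1_epi64 replicates x into every 64-bit lane, but the one-hot write mask lets only lane `index` through, leaving the rest of `target` intact. Usage sketch (compile with -mavx512f):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
    __m512i v = _mm512_setzero_si512();
    // Lane 3 := 42; all other lanes keep their old (zero) contents.
    v = _mm512_mask_set1_epi64(v, (__mmask8)(1u << 3), 42);

    alignas(64) int64_t out[8];
    _mm512_store_si512((__m512i*)out, v);
    for (int i = 0; i < 8; ++i)
        printf("%lld ", (long long)out[i]);  // 0 0 0 42 0 0 0 0
    return 0;
}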
@@ -3292,7 +3276,7 @@ static inline int v512_extract_epi32(__m512i target)
template <int index>
static inline int v512_extract_epi16(__m512i target)
{
    return (v512_extract_epi32<index/2>(target) >> (index % 2 ? 16 : 0)) & 0xFFFF;
    return (v512_extract_epi32<index / 2>(target) >> (index % 2 ? 16 : 0)) & 0xFFFF;
}

static inline v_uint8x64 v_gather_pairs(const uchar src[], const v_int16x32& index) {

@@ -3335,6 +3319,33 @@ static inline v_uint8x64 v_gather_pairs(const uchar src[], const v_int16x32& index)
    return r;
}

static inline void v_gather_pairs(const float src[], const int mapsx[], int x,
                                  v_float32x16& low, v_float32x16& high) {
    __m512i lo = _mm512_castps_si512(low.val);
    __m512i hi = _mm512_castps_si512(high.val);

    lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x]]), 0);
    lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 1]]), 1);
    lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 2]]), 2);
    lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 3]]), 3);
    lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 4]]), 4);
    lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 5]]), 5);
    lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 6]]), 6);
    lo = v512_insert_epi64(lo, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 7]]), 7);

    hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 8]]), 0);
    hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 9]]), 1);
    hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 10]]), 2);
    hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 11]]), 3);
    hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 12]]), 4);
    hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 13]]), 5);
    hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 14]]), 6);
    hi = v512_insert_epi64(hi, *reinterpret_cast<const int64_t*>(&src[mapsx[x + 15]]), 7);

    low.val = _mm512_castsi512_ps(lo);
    high.val = _mm512_castsi512_ps(hi);
}

namespace {
template<int chanNum>
static inline v_int16x32 v_gather_chan(const uchar src[], const v_int16x32& index, int channel, int pos) {

@@ -2973,19 +2973,16 @@ namespace {
    }
} // namespace

static inline void v_gather_pairs(const float src[], const v_int32x4& index,
static inline void v_gather_pairs(const float src[], const int mapsx[], int x,
                                  v_float32x4& low, v_float32x4& high) {
    int i[4];
    v_store(i, index);

    __m128 l = _mm_setzero_ps();
    l = _mm_loadl_pi(l, (const __m64*)&src[i[0]]);  // pair of floats
    l = _mm_loadh_pi(l, (const __m64*)&src[i[1]]);
    l = _mm_loadl_pi(l, (const __m64*)&src[mapsx[x + 0]]);  // pair of floats
    l = _mm_loadh_pi(l, (const __m64*)&src[mapsx[x + 1]]);
    low.val = l;

    __m128 h = _mm_setzero_ps();
    h = _mm_loadl_pi(h, (const __m64*)&src[i[2]]);
    h = _mm_loadh_pi(h, (const __m64*)&src[i[3]]);
    h = _mm_loadl_pi(h, (const __m64*)&src[mapsx[x + 2]]);
    h = _mm_loadh_pi(h, (const __m64*)&src[mapsx[x + 3]]);
    high.val = h;
}

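The SSE signature change mirrors the AVX2/AVX512 ones: taking (mapsx, x) directly drops the `int i[4]; v_store(i, index);` spill of the index vector to the stack before the scalar loads. The resulting pattern, for reference (a sketch; gather_two_pairs is an illustrative name, not from the patch):

#include <xmmintrin.h>

// Loads src[mapsx[x]], src[mapsx[x]+1] into the low half and
// src[mapsx[x+1]], src[mapsx[x+1]+1] into the high half of one __m128.
static inline __m128 gather_two_pairs(const float* src, const int* mapsx, int x) {
    __m128 v = _mm_setzero_ps();
    v = _mm_loadl_pi(v, (const __m64*)&src[mapsx[x + 0]]);  // lanes 0..1
    v = _mm_loadh_pi(v, (const __m64*)&src[mapsx[x + 1]]);  // lanes 2..3
    return v;
}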
@@ -3070,17 +3067,14 @@ static inline v_uint8x16 v_setr_s8(char b0, char b1, char b2, char b3, char b4,
                                   char b5, char b6, char b7, char b8, char b9,
                                   char b10, char b11, char b12, char b13, char b14,
                                   char b15) {
    v_uint8x16 res;
    res.val = _mm_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8,
                            b9, b10, b11, b12, b13, b14, b15);
    return res;
    return v_uint8x16(_mm_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8,
                                    b9, b10, b11, b12, b13, b14, b15));
}

static inline v_uint8x16 v_shuffle_s8(const v_uint8x16& a, const v_uint8x16& mask) {
    v_uint8x16 res;
    res.val = _mm_shuffle_epi8(a.val, mask.val);
    return res;
static inline v_uint8x16 v_shuffle_s8(const v_uint8x16& a, const v_uint8x16& mask)
{
    return v_uint8x16(_mm_shuffle_epi8(a.val, mask.val));
}

static inline void v_gather_channel(v_uint8x16& vec, const uint8_t tmp[], const short mapsx[],