[PP] Removed old (non GAPI) preprocessing code (#3664)
parent e490dfc161
commit 977c3dda23
@@ -1,682 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ie_preprocess_data.hpp"
#include "ie_preprocess_data_sse42.hpp"

#include <nmmintrin.h> // SSE 4.2

#include <stdint.h>

namespace InferenceEngine {
namespace Resize {

static inline int ceil(double value) {
|
||||
__m128d t = _mm_set_sd(value);
|
||||
int i = _mm_cvtsd_si32(t);
|
||||
return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t, i), t));
|
||||
}
|
||||
|
||||
|
||||
static inline int floor(double value) {
|
||||
__m128d t = _mm_set_sd(value);
|
||||
int i = _mm_cvtsd_si32(t);
|
||||
return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t, i)));
|
||||
}
|
||||
|
||||
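// Rounding Q15 fixed-point multiply; scalar counterpart of the _mm_mulhrs_epi16 used in the SIMD paths below.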
static inline int16_t mulq15(int16_t a, int16_t b) {
|
||||
return static_cast<int16_t>(((1 << 14) + (int32_t)a * (int32_t)b) >> 15);
|
||||
}
|
||||
|
||||
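// Q16 fixed-point multiply (high 16 bits of the unsigned 32-bit product); scalar counterpart of _mm_mulhi_epu16.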
static inline uint16_t mulq16(uint16_t a, uint16_t b) {
|
||||
return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16);
|
||||
}
|
||||
|
||||
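// Bilinear resize for U8 NCHW blobs (SSE4.2 path). "buffer" holds the precomputed source offsets and Q15 weights;
// the vertical pass blends source row pairs into an interleaved temporary block (four destination rows at a time),
// and the horizontal pass then writes the output rows.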
void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
Border border = {BORDER_REPLICATE, 0};
|
||||
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
const int src_go_x = 0;
|
||||
const int src_go_y = 0;
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
const uint8_t *sptr = static_cast<uint8_t *>(inBlob->buffer()) +
|
||||
inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
uint8_t *dptr = static_cast<uint8_t *>(outBlob->buffer()) +
|
||||
outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
|
||||
auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
auto scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
|
||||
const int BITS = 15;
|
||||
const int SCALE = (1 << BITS);
|
||||
const int alpha_clones_num = 4;
|
||||
const int cols_block_size = 8;
|
||||
const int kRowsBlockSize = 4;
|
||||
|
||||
auto *pxofs1 = reinterpret_cast<int32_t *>(buffer);
|
||||
auto *alpha = reinterpret_cast<int16_t *>(pxofs1 + dwidth);
|
||||
auto *yofs = reinterpret_cast<int32_t *>(alpha + dwidth * alpha_clones_num);
|
||||
auto *beta = reinterpret_cast<int16_t *>(yofs + dheight);
|
||||
auto *tptr = reinterpret_cast<uint8_t *>(beta + dheight);
|
||||
|
||||
auto tptr_ = tptr;
|
||||
|
||||
tptr_[0] = (uint8_t) border.value;
|
||||
tptr_[1] = (uint8_t) border.value;
|
||||
tptr_[2] = (uint8_t) border.value;
|
||||
tptr_[3] = (uint8_t) border.value;
|
||||
tptr_[swidth + 0 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth + 1 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth + 2 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth + 3 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth * kRowsBlockSize + 0 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth * kRowsBlockSize + 1 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth * kRowsBlockSize + 2 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth * kRowsBlockSize + 3 + 4] = (uint8_t) border.value;
|
||||
|
||||
for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) {
|
||||
auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
|
||||
int32_t sx = floor(fx);
|
||||
fx -= sx;
|
||||
|
||||
int32_t sx0 = sx;
|
||||
if (sx < 0 && border.type == BORDER_REPLICATE) {
|
||||
fx = 0;
|
||||
sx0 = 0;
|
||||
}
|
||||
|
||||
fx = fx * SCALE;
|
||||
|
||||
if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) {
|
||||
fx = 1.f * SCALE - 1;
|
||||
sx0 = (std::max)(src_full_width - 2, 0);
|
||||
}
|
||||
|
||||
pxofs1[dx - dst_go_x] = kRowsBlockSize * (sx0 - src_go_x);
|
||||
for (int i = 0; i < alpha_clones_num; i++) {
|
||||
alpha[(dx - dst_go_x) * alpha_clones_num + i] = (int16_t) fx;
|
||||
}
|
||||
}
|
||||
|
||||
for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) {
|
||||
float fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
|
||||
int32_t sy = floor(fy);
|
||||
fy -= sy;
|
||||
|
||||
int32_t sy0 = sy;
|
||||
if (sy < 0 && border.type == BORDER_REPLICATE) {
|
||||
fy = 0;
|
||||
sy0 = 0;
|
||||
}
|
||||
|
||||
fy = fy * SCALE;
|
||||
|
||||
if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) {
|
||||
fy = 1.f * SCALE - 1;
|
||||
sy0 = (std::max)(src_full_height - 2, 0);
|
||||
}
|
||||
|
||||
yofs[dy - dst_go_y] = (sy0 - src_go_y) * sstep;
|
||||
beta[dy - dst_go_y] = (int16_t) fy;
|
||||
}
|
||||
|
||||
if (swidth < cols_block_size || dwidth < cols_block_size || dheight < kRowsBlockSize) {
|
||||
auto full_pass = [&](int c, int y) {
|
||||
auto sptr_ = sptr + c * origSrcW * origSrcH;
|
||||
auto dptr_ = dptr + c * origDstW * origDstH;
|
||||
auto tptr_ = tptr;
|
||||
|
||||
for (int x = 0; x < swidth; x++) {
|
||||
int val0 = (yofs[y] < 0) ? border.value : sptr_[yofs[y] + x + 0];
|
||||
int val1 = (yofs[y] / sstep + 1 >= src_full_height - src_go_y) ? border.value : sptr_[yofs[y] + x + sstep];
|
||||
|
||||
int res = val0 + mulq15(beta[y], (int16_t) (val1 - val0));
|
||||
tptr_[x + 4] = (uint8_t) res;
|
||||
}
|
||||
|
||||
for (int x = 0; x < dwidth; x++) {
|
||||
int val0 = tptr_[pxofs1[x] / kRowsBlockSize + 0 + 4];
|
||||
int val1 = tptr_[pxofs1[x] / kRowsBlockSize + 1 + 4];
|
||||
|
||||
int res = val0 + mulq15(alpha[x * alpha_clones_num], (int16_t) (val1 - val0));
|
||||
dptr_[y * dstep + x] = (uint8_t) res;
|
||||
}
|
||||
};
|
||||
|
||||
for (int c = 0; c < channels; c++) {
|
||||
for (int y = 0; y < dheight; y++) {
|
||||
full_pass(c, y);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
auto full_pass_vec = [&](const uint8_t* sptr_, uint8_t* dptr_, uint8_t* tptr_, int y) {
|
||||
int32_t filtered_rows_id[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
filtered_rows_id[i] = (yofs[y + i] < 0) ? 0 :
|
||||
(yofs[y + i] / sstep >= src_full_height - src_go_y - 1) ? 0 : yofs[y + i];
|
||||
}
|
||||
|
||||
__m128i b0 = _mm_set1_epi16(beta[y + 0]);
|
||||
__m128i b1 = _mm_set1_epi16(beta[y + 1]);
|
||||
__m128i b2 = _mm_set1_epi16(beta[y + 2]);
|
||||
__m128i b3 = _mm_set1_epi16(beta[y + 3]);
|
||||
|
||||
int x = 0;
|
||||
vertical_pass:
|
||||
for (; x <= swidth - cols_block_size; x += cols_block_size) {
|
||||
__m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0])),
|
||||
*(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1])), 1);
|
||||
__m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2])),
|
||||
*(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3])), 1);
|
||||
__m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0] + sstep)),
|
||||
*(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1] + sstep)), 1);
|
||||
__m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2] + sstep)),
|
||||
*(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3] + sstep)), 1);
|
||||
|
||||
__m128i val0_0 = _mm_unpacklo_epi8(val0lo, _mm_setzero_si128());
|
||||
__m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
|
||||
__m128i val0_2 = _mm_unpacklo_epi8(val0hi, _mm_setzero_si128());
|
||||
__m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
|
||||
|
||||
__m128i val1_0 = _mm_unpacklo_epi8(val1lo, _mm_setzero_si128());
|
||||
__m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
|
||||
__m128i val1_2 = _mm_unpacklo_epi8(val1hi, _mm_setzero_si128());
|
||||
__m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());
|
||||
|
||||
__m128i s0_0 = _mm_sub_epi16(val1_0, val0_0);
|
||||
__m128i s0_1 = _mm_sub_epi16(val1_1, val0_1);
|
||||
__m128i s0_2 = _mm_sub_epi16(val1_2, val0_2);
|
||||
__m128i s0_3 = _mm_sub_epi16(val1_3, val0_3);
|
||||
|
||||
__m128i t0 = _mm_mulhrs_epi16(s0_0, b0);
|
||||
__m128i t1 = _mm_mulhrs_epi16(s0_1, b1);
|
||||
__m128i t2 = _mm_mulhrs_epi16(s0_2, b2);
|
||||
__m128i t3 = _mm_mulhrs_epi16(s0_3, b3);
|
||||
|
||||
__m128i r0 = _mm_add_epi16(val0_0, t0);
|
||||
__m128i r1 = _mm_add_epi16(val0_1, t1);
|
||||
__m128i r2 = _mm_add_epi16(val0_2, t2);
|
||||
__m128i r3 = _mm_add_epi16(val0_3, t3);
|
||||
|
||||
__m128i q0 = _mm_packus_epi16(r0, r1);
|
||||
__m128i q1 = _mm_packus_epi16(r2, r3);
|
||||
|
||||
__m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
|
||||
__m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);
|
||||
|
||||
__m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
|
||||
__m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
|
||||
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 0) * kRowsBlockSize + 4), q4);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 4) * kRowsBlockSize + 4), q5);
|
||||
}
|
||||
|
||||
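// Tail handling: when swidth is not a multiple of the block size, redo the last (overlapping) block
// instead of falling back to a scalar remainder loop.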
if (x < swidth) {
|
||||
x = swidth - cols_block_size;
|
||||
goto vertical_pass;
|
||||
}
|
||||
|
||||
if (border.type == BORDER_CONSTANT) {
|
||||
for (int i = 0; i < kRowsBlockSize; i++) {
|
||||
if (yofs[y + i] < 0) {
|
||||
for (x = 0; x < swidth; x++) {
|
||||
int val0 = border.value;
|
||||
int val1 = sptr_[yofs[y + i] + x + sstep];
|
||||
|
||||
int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
|
||||
tptr_[x * 4 + i + 4] = (uint8_t) res;
|
||||
}
|
||||
}
|
||||
|
||||
if (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) {
|
||||
for (x = 0; x < swidth; x++) {
|
||||
int val0 = sptr_[yofs[y + i] + x];
|
||||
int val1 = border.value;
|
||||
|
||||
int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
|
||||
tptr_[x * 4 + i + 4] = (uint8_t) res;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
x = 0;
|
||||
horizontal_pass:
|
||||
for (; x <= dwidth - cols_block_size; x += cols_block_size) {
|
||||
__m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 0) * alpha_clones_num));
|
||||
__m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 2) * alpha_clones_num));
|
||||
__m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 4) * alpha_clones_num));
|
||||
__m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 6) * alpha_clones_num));
|
||||
|
||||
__m128i val_0 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 0] + 4)),
|
||||
*(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 1] + 4)), 1);
|
||||
__m128i val_1 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 2] + 4)),
|
||||
*(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 3] + 4)), 1);
|
||||
__m128i val_2 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 4] + 4)),
|
||||
*(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 5] + 4)), 1);
|
||||
__m128i val_3 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 6] + 4)),
|
||||
*(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 7] + 4)), 1);
|
||||
|
||||
val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
|
||||
__m128i val0_0 = _mm_unpacklo_epi8(val_0, _mm_setzero_si128());
|
||||
__m128i val0_1 = _mm_unpacklo_epi8(val_1, _mm_setzero_si128());
|
||||
__m128i val0_2 = _mm_unpacklo_epi8(val_2, _mm_setzero_si128());
|
||||
__m128i val0_3 = _mm_unpacklo_epi8(val_3, _mm_setzero_si128());
|
||||
|
||||
__m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
|
||||
__m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
|
||||
__m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
|
||||
__m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());
|
||||
|
||||
val1_0 = _mm_sub_epi16(val1_0, val0_0);
|
||||
val1_1 = _mm_sub_epi16(val1_1, val0_1);
|
||||
val1_2 = _mm_sub_epi16(val1_2, val0_2);
|
||||
val1_3 = _mm_sub_epi16(val1_3, val0_3);
|
||||
|
||||
__m128i t0 = _mm_mulhrs_epi16(val1_0, a10);
|
||||
__m128i t1 = _mm_mulhrs_epi16(val1_1, a32);
|
||||
__m128i t2 = _mm_mulhrs_epi16(val1_2, a54);
|
||||
__m128i t3 = _mm_mulhrs_epi16(val1_3, a76);
|
||||
|
||||
__m128i r0 = _mm_add_epi16(val0_0, t0);
|
||||
__m128i r1 = _mm_add_epi16(val0_1, t1);
|
||||
__m128i r2 = _mm_add_epi16(val0_2, t2);
|
||||
__m128i r3 = _mm_add_epi16(val0_3, t3);
|
||||
|
||||
__m128i q0 = _mm_packus_epi16(r0, r1);
|
||||
__m128i q1 = _mm_packus_epi16(r2, r3);
|
||||
|
||||
__m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
|
||||
__m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
|
||||
|
||||
__m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
|
||||
__m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);
|
||||
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 0) * dstep + x), q4);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 1) * dstep + x), _mm_srli_si128(q4, 8));
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 2) * dstep + x), q5);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 3) * dstep + x), _mm_srli_si128(q5, 8));
|
||||
}
|
||||
|
||||
if (x < dwidth) {
|
||||
x = dwidth - cols_block_size;
|
||||
goto horizontal_pass;
|
||||
}
|
||||
};
|
||||
|
||||
for (int c = 0; c < channels; c++) {
|
||||
for (int y = 0; y <= dheight - kRowsBlockSize; y += kRowsBlockSize) {
|
||||
auto sptr_ = sptr + c * origSrcW * origSrcH;
|
||||
auto dptr_ = dptr + c * origDstW * origDstH;
|
||||
auto tptr_ = tptr;
|
||||
|
||||
full_pass_vec(sptr_, dptr_, tptr_, y);
|
||||
|
||||
if (y + kRowsBlockSize > dheight - kRowsBlockSize)
|
||||
full_pass_vec(sptr_, dptr_, tptr_, dheight - kRowsBlockSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
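// Area (box-filter) downscale for U8 NCHW blobs (SSE4.2 path). Builds per-axis contribution tables
// (source index + Q16 weight), accumulates weighted vertical sums per output row, then applies the
// horizontal weights, using pshufb-based gathers when at most four source pixels contribute per column.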
void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto sheight = static_cast<const int>(srcDims[2]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
const int src_go_x = 0;
|
||||
const int src_go_y = 0;
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
auto sptr = static_cast<uint8_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto dptr = static_cast<uint8_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
|
||||
float scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
float scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
|
||||
int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, scale_x);
|
||||
int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, scale_y);
|
||||
|
||||
auto* xsi = reinterpret_cast<uint16_t*>(buffer);
|
||||
auto* ysi = xsi + dwidth;
|
||||
auto* xalpha = ysi + dheight;
|
||||
auto* yalpha = xalpha + dwidth*x_max_count + 8*16;
|
||||
|
||||
computeResizeAreaTab(src_go_x, dst_go_x, src_full_width, dwidth, scale_x, xsi, xalpha, x_max_count);
|
||||
computeResizeAreaTab(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, yalpha, y_max_count);
|
||||
|
||||
int vert_sum_size = 2*swidth;
|
||||
uint16_t* vert_sum = yalpha + dheight*y_max_count;
|
||||
uint16_t* alpha0 = vert_sum + vert_sum_size;
|
||||
uint16_t* alpha1 = alpha0 + dwidth;
|
||||
uint16_t* alpha2 = alpha1 + dwidth;
|
||||
uint16_t* alpha3 = alpha2 + dwidth;
|
||||
uint16_t* sxid0 = alpha3 + dwidth;
|
||||
uint16_t* sxid1 = sxid0 + 4*dwidth;
|
||||
uint16_t* sxid2 = sxid1 + 4*dwidth;
|
||||
uint16_t* sxid3 = sxid2 + 4*dwidth;
|
||||
|
||||
uint16_t* alpha[] = {alpha0, alpha1, alpha2, alpha3};
|
||||
uint16_t* sxid[] = {sxid0, sxid1, sxid2, sxid3};
|
||||
generate_alpha_and_id_arrays(x_max_count, dwidth, xalpha, xsi, alpha, sxid);
|
||||
|
||||
auto full_pass = [&](int c, int y) {
|
||||
uint8_t* pdst_row = dptr + (y * dstep) + c * origDstW * origDstH;
|
||||
uint16_t* vert_sum_ = vert_sum;
|
||||
|
||||
int ysi_row = ysi[y];
|
||||
|
||||
memset(vert_sum_, 0, swidth * sizeof(uint16_t));
|
||||
|
||||
for (int dy = 0; dy < y_max_count; dy++) {
|
||||
uint16_t yalpha_dy = yalpha[y * y_max_count + dy];
|
||||
const uint8_t *sptr_dy = sptr + ((ysi_row + dy) * sstep) + c * origSrcW * origSrcH;
|
||||
if (ysi_row + dy >= sheight) break;
|
||||
|
||||
int x = 0;
|
||||
|
||||
__m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy);
|
||||
for (; x <= swidth - 16; x += 16) {
|
||||
__m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x));
|
||||
|
||||
// sptr_dy[x] << 8
|
||||
__m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval);
|
||||
__m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval);
|
||||
|
||||
__m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0));
|
||||
__m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8));
|
||||
|
||||
vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo));
|
||||
vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi));
|
||||
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi);
|
||||
}
|
||||
|
||||
for (; x < swidth; x++) {
|
||||
vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8));
|
||||
}
|
||||
}
|
||||
|
||||
if (x_max_count == 2) {
|
||||
int x = 0;
|
||||
for (; x <= dwidth - 8; x += 8) {
|
||||
__m128i res = _mm_set1_epi16(1 << (8 - 1));
|
||||
|
||||
int id0 = xsi[x];
|
||||
|
||||
__m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
|
||||
__m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
|
||||
|
||||
__m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2));
|
||||
__m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8));
|
||||
|
||||
__m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2));
|
||||
__m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8));
|
||||
|
||||
__m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx0_id1));
|
||||
__m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx1_id1));
|
||||
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
|
||||
|
||||
res = _mm_srli_epi16(res, 8);
|
||||
res = _mm_packus_epi16(res, res);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
|
||||
}
|
||||
|
||||
for (; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
int id = xsi[x];
|
||||
res += mulq16(alpha0[x], vert_sum_[id + 0]);
|
||||
res += mulq16(alpha1[x], vert_sum_[id + 1]);
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
} else if (x_max_count == 3) {
|
||||
int x = 0;
|
||||
for (; x <= dwidth - 8; x += 8) {
|
||||
__m128i res = _mm_set1_epi16(1 << (8 - 1));
|
||||
|
||||
int id0 = xsi[x];
|
||||
|
||||
__m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
|
||||
__m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
|
||||
__m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
|
||||
|
||||
__m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3));
|
||||
__m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8));
|
||||
__m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16));
|
||||
|
||||
__m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3));
|
||||
__m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8));
|
||||
__m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16));
|
||||
|
||||
__m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3));
|
||||
__m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8));
|
||||
__m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16));
|
||||
|
||||
__m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx0_id1)),
|
||||
_mm_shuffle_epi8(chunk2, sx0_id2));
|
||||
__m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx1_id1)),
|
||||
_mm_shuffle_epi8(chunk2, sx1_id2));
|
||||
__m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx2_id1)),
|
||||
_mm_shuffle_epi8(chunk2, sx2_id2));
|
||||
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
|
||||
|
||||
res = _mm_srli_epi16(res, 8);
|
||||
res = _mm_packus_epi16(res, res);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
|
||||
}
|
||||
|
||||
for (; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
int id = xsi[x];
|
||||
res += mulq16(alpha0[x], vert_sum_[id + 0]);
|
||||
res += mulq16(alpha1[x], vert_sum_[id + 1]);
|
||||
res += mulq16(alpha2[x], vert_sum_[id + 2]);
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
} else if (x_max_count == 4) {
|
||||
int x = 0;
|
||||
for (; x <= dwidth - 8; x += 8) {
|
||||
__m128i res = _mm_set1_epi16(1 << (8 - 1));
|
||||
|
||||
int id0 = xsi[x];
|
||||
|
||||
__m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
|
||||
__m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
|
||||
__m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
|
||||
__m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24));
|
||||
|
||||
__m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4));
|
||||
__m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8));
|
||||
__m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16));
|
||||
__m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24));
|
||||
|
||||
__m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4));
|
||||
__m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8));
|
||||
__m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16));
|
||||
__m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24));
|
||||
|
||||
__m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4));
|
||||
__m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8));
|
||||
__m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16));
|
||||
__m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24));
|
||||
|
||||
__m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4));
|
||||
__m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8));
|
||||
__m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16));
|
||||
__m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24));
|
||||
|
||||
__m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx0_id1)),
|
||||
_mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2),
|
||||
_mm_shuffle_epi8(chunk3, sx0_id3)));
|
||||
__m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx1_id1)),
|
||||
_mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2),
|
||||
_mm_shuffle_epi8(chunk3, sx1_id3)));
|
||||
__m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx2_id1)),
|
||||
_mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2),
|
||||
_mm_shuffle_epi8(chunk3, sx2_id3)));
|
||||
__m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx3_id1)),
|
||||
_mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2),
|
||||
_mm_shuffle_epi8(chunk3, sx3_id3)));
|
||||
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3));
|
||||
|
||||
res = _mm_srli_epi16(res, 8);
|
||||
res = _mm_packus_epi16(res, res);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
|
||||
}
|
||||
|
||||
for (; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
int id = xsi[x];
|
||||
res += mulq16(alpha0[x], vert_sum_[id + 0]);
|
||||
res += mulq16(alpha1[x], vert_sum_[id + 1]);
|
||||
res += mulq16(alpha2[x], vert_sum_[id + 2]);
|
||||
res += mulq16(alpha3[x], vert_sum_[id + 3]);
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
} else if (x_max_count <= 7) {
|
||||
int x = 0;
|
||||
for (; x <= dwidth - 8; x += 8) {
|
||||
__m128i res = _mm_set1_epi16(1 << (16 - 8 - 1));
|
||||
for (int i = 0; i < x_max_count; i++) {
|
||||
__m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 1 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 2 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 3 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 4 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 5 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 6 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 7 + i]);
|
||||
__m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i],
|
||||
vert_sum_[xsi[x + 1] + i],
|
||||
vert_sum_[xsi[x + 2] + i],
|
||||
vert_sum_[xsi[x + 3] + i],
|
||||
vert_sum_[xsi[x + 4] + i],
|
||||
vert_sum_[xsi[x + 5] + i],
|
||||
vert_sum_[xsi[x + 6] + i],
|
||||
vert_sum_[xsi[x + 7] + i]);
|
||||
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum));
|
||||
}
|
||||
res = _mm_srli_epi16(res, 8);
|
||||
res = _mm_packus_epi16(res, res);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
|
||||
}
|
||||
|
||||
for (; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
for (int i = 0; i < x_max_count; i++) {
|
||||
uint16_t a = xalpha[x * x_max_count + i];
|
||||
int sx = xsi[x] + i;
|
||||
|
||||
res += mulq16(a, vert_sum_[sx]);
|
||||
}
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
} else {
|
||||
for (int x = 0; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
__m128i vres = _mm_setzero_si128();
|
||||
int id = xsi[x];
|
||||
|
||||
int i = 0;
|
||||
for (; i <= x_max_count - 8; i += 8) {
|
||||
__m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i));
|
||||
__m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i));
|
||||
|
||||
vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s));
|
||||
}
|
||||
vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2));
|
||||
vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4));
|
||||
vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8));
|
||||
res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7));
|
||||
|
||||
for (; i < x_max_count; i++) {
|
||||
uint16_t a = xalpha[x * x_max_count + i];
|
||||
uint16_t s = vert_sum_[id + i];
|
||||
|
||||
res += mulq16(a, s);
|
||||
}
|
||||
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (int c = 0; c < channels; c++) {
|
||||
for (int y = 0; y < dheight; y++) {
|
||||
full_pass(c, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Resize
|
||||
} // namespace InferenceEngine
|
@@ -1,17 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "ie_blob.h"

#include <stdint.h>

namespace InferenceEngine {

void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer);

void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer);

}
@@ -4,753 +4,16 @@

#include "ie_preprocess_gapi.hpp"
#include "ie_system_conf.h"
#include "blob_transform.hpp"
#include "ie_preprocess_data.hpp"
#include "ie_preprocess_itt.hpp"

#ifdef HAVE_SSE
# include "cpu_x86_sse42/ie_preprocess_data_sse42.hpp"
#endif

#include "debug.h"
#include "ie_compound_blob.h"
#include <ie_input_info.hpp>

#include <memory>
#include <algorithm>

namespace InferenceEngine {


namespace Resize {

template<typename data_t> static inline data_t saturate_cast(float res);
|
||||
|
||||
template<> inline float saturate_cast(float res) {
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> inline uint8_t saturate_cast(float res) {
|
||||
int ires = static_cast<int>((std::round)(res));
|
||||
return static_cast<uint8_t>((std::max)(0, (std::min)(255, ires)));
|
||||
}
|
||||
|
||||
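// Scalar bilinear resize; used for FP32 blobs and as the U8 fallback when SSE4.2 is not available (see resize() below).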
template<typename data_t = float>
|
||||
void resize_bilinear(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
Border border = {BORDER_REPLICATE, 0};
|
||||
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
const int src_go_x = 0;
|
||||
const int src_go_y = 0;
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
auto *sptr = static_cast<data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto *dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
auto scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
|
||||
auto* xofs = reinterpret_cast<int32_t*>(buffer);
|
||||
auto* yofs = xofs + dwidth;
|
||||
auto* alpha = reinterpret_cast<float*>(yofs + dheight);
|
||||
auto* beta = alpha + dwidth;
|
||||
auto* tptr = beta + dheight;
|
||||
|
||||
for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) {
|
||||
auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
|
||||
int32_t sx = static_cast<int32_t>(floor(fx));
|
||||
fx -= sx;
|
||||
|
||||
int32_t sx0 = sx;
|
||||
if (sx < 0 && border.type == BORDER_REPLICATE) {
|
||||
fx = 0;
|
||||
sx0 = 0;
|
||||
}
|
||||
|
||||
if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) {
|
||||
fx = 1.f;
|
||||
sx0 = (std::max)(src_full_width - 2, 0);
|
||||
}
|
||||
|
||||
xofs[dx - dst_go_x] = sx0 - src_go_x;
|
||||
alpha[dx - dst_go_x] = fx;
|
||||
}
|
||||
|
||||
for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) {
|
||||
auto fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
|
||||
int32_t sy = static_cast<int32_t>(floor(fy));
|
||||
fy -= sy;
|
||||
|
||||
int32_t sy0 = sy;
|
||||
if (sy < 0 && border.type == BORDER_REPLICATE) {
|
||||
fy = 0;
|
||||
sy0 = 0;
|
||||
}
|
||||
|
||||
if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) {
|
||||
fy = 1.f;
|
||||
sy0 = (std::max)(src_full_height - 2, 0);
|
||||
}
|
||||
|
||||
yofs[dy - dst_go_y] = sy0 - src_go_y;
|
||||
beta[dy - dst_go_y] = fy;
|
||||
}
|
||||
|
||||
auto full_pass = [&](int c, int y) {
|
||||
auto sptr_ = sptr + c * origSrcW * origSrcH;
|
||||
auto dptr_ = dptr + c * origDstW * origDstH;
|
||||
auto tptr_ = tptr;
|
||||
|
||||
for (int x = 0; x < swidth; x++) {
|
||||
bool use_constant0 = yofs[y] + 0 < 0 || yofs[y] + 0 >= src_full_height;
|
||||
bool use_constant1 = yofs[y] + 1 < 0 || yofs[y] + 1 >= src_full_height;
|
||||
float val0 = static_cast<float>(use_constant0 ? border.value : sptr_[(yofs[y] + 0) * sstep + x]);
|
||||
float val1 = static_cast<float>(use_constant1 ? border.value : sptr_[(yofs[y] + 1) * sstep + x]);
|
||||
|
||||
float res = val0 + beta[y] * (val1 - val0);
|
||||
tptr_[x] = res;
|
||||
}
|
||||
|
||||
for (int x = 0; x < dwidth; x++) {
|
||||
bool use_constant0 = xofs[x] + 0 < 0 || xofs[x] + 0 >= src_full_width;
|
||||
bool use_constant1 = xofs[x] + 1 < 0 || xofs[x] + 1 >= src_full_width;
|
||||
float val0 = use_constant0 ? border.value : tptr_[xofs[x] + 0];
|
||||
float val1 = use_constant1 ? border.value : tptr_[xofs[x] + 1];
|
||||
|
||||
float res = val0 + alpha[x] * (val1 - val0);
|
||||
dptr_[y * dstep + x] = saturate_cast<data_t>(res);
|
||||
}
|
||||
};
|
||||
|
||||
for (int c = 0; c < channels; c++) {
|
||||
for (int y = 0; y < dheight; y++) {
|
||||
full_pass(c, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
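// Returns the maximum number of source pixels that contribute to any single destination pixel for the
// given scale, i.e. the per-column width of the area-resize coefficient tables.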
int getResizeAreaTabSize(int dst_go, int ssize, int dsize, float scale) {
|
||||
static const float threshold = 1e-3f;
|
||||
int max_count = 0;
|
||||
|
||||
for (int col = dst_go; col < dst_go + dsize; col++) {
|
||||
int count = 0;
|
||||
|
||||
float fsx1 = col * scale;
|
||||
float fsx2 = fsx1 + scale;
|
||||
|
||||
int sx1 = static_cast<int>(ceil(fsx1));
|
||||
int sx2 = static_cast<int>(floor(fsx2));
|
||||
|
||||
sx2 = (std::min)(sx2, ssize - 1);
|
||||
sx1 = (std::min)(sx1, sx2);
|
||||
|
||||
if (sx1 - fsx1 > threshold) {
|
||||
count++;
|
||||
}
|
||||
|
||||
for (int sx = sx1; sx < sx2; sx++) {
|
||||
count++;
|
||||
}
|
||||
|
||||
if (fsx2 - sx2 > threshold) {
|
||||
count++;
|
||||
}
|
||||
max_count = (std::max)(max_count, count);
|
||||
}
|
||||
|
||||
return max_count;
|
||||
}
|
||||
|
||||
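// Fills the per-destination-column source index (si) and Q16 weight (alpha) tables for the U8 area
// downscale; columns with fewer than max_count contributions are padded with a zero weight.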
void computeResizeAreaTab(int src_go, int dst_go, int ssize, int dsize, float scale,
|
||||
uint16_t* si, uint16_t* alpha, int max_count) {
|
||||
static const float threshold = 1e-3f;
|
||||
int k = 0;
|
||||
|
||||
for (int col = dst_go; col < dst_go + dsize; col++) {
|
||||
int count = 0;
|
||||
|
||||
float fsx1 = col * scale;
|
||||
float fsx2 = fsx1 + scale;
|
||||
float cellWidth = (std::min)(scale, ssize - fsx1);
|
||||
|
||||
int sx1 = static_cast<int>(ceil(fsx1));
|
||||
int sx2 = static_cast<int>(floor(fsx2));
|
||||
|
||||
sx2 = (std::min)(sx2, ssize - 1);
|
||||
sx1 = (std::min)(sx1, sx2);
|
||||
|
||||
si[col - dst_go] = (uint16_t)(sx1 - src_go);
|
||||
|
||||
if (sx1 - fsx1 > threshold) {
|
||||
si[col - dst_go] = (uint16_t)(sx1 - src_go - 1);
|
||||
alpha[k++] = (uint16_t)((1 << 16) * ((sx1 - fsx1) / cellWidth));
|
||||
count++;
|
||||
}
|
||||
|
||||
for (int sx = sx1; sx < sx2; sx++) {
|
||||
alpha[k++] = (uint16_t)((1 << 16) * (1.0f / cellWidth));
|
||||
count++;
|
||||
}
|
||||
|
||||
if (fsx2 - sx2 > threshold) {
|
||||
alpha[k++] = (uint16_t)((1 << 16) * ((std::min)((std::min)(fsx2 - sx2, 1.f), cellWidth) / cellWidth));
|
||||
count++;
|
||||
}
|
||||
|
||||
if (count != max_count) {
|
||||
alpha[k++] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
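// Repacks the horizontal weights into per-tap arrays and builds the byte-shuffle index tables (sxid*)
// that let the SSE path gather vertical sums with _mm_shuffle_epi8; only used when x_max_count <= 4.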
void generate_alpha_and_id_arrays(int x_max_count, int dcols, const uint16_t* xalpha, uint16_t* xsi,
|
||||
uint16_t** alpha, uint16_t** sxid) {
|
||||
if (x_max_count <= 4) {
|
||||
for (int col = 0; col < dcols; col++) {
|
||||
for (int x = 0; x < x_max_count; x++) {
|
||||
alpha[x][col] = xalpha[col*x_max_count + x];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (x_max_count <= 4) {
|
||||
for (int col = 0; col <= dcols - 8; col += 8) {
|
||||
for (int chunk_num_h = 0; chunk_num_h < x_max_count; chunk_num_h++) {
|
||||
for (int i = 0; i < 128 / 16; i++) {
|
||||
int id_diff = xsi[col + i] - xsi[col];
|
||||
|
||||
for (int chunk_num_v = 0; chunk_num_v < x_max_count; chunk_num_v++) {
|
||||
uint16_t* sxidp = sxid[chunk_num_v] + col * x_max_count + chunk_num_h * 8;
|
||||
|
||||
int id0 = (id_diff + chunk_num_v) * 2 + 0;
|
||||
int id1 = (id_diff + chunk_num_v) * 2 + 1;
|
||||
|
||||
(reinterpret_cast<int8_t*>(sxidp + i))[0] = static_cast<int8_t>(id0 >= (chunk_num_h * 16) && id0 < (chunk_num_h + 1) * 16 ? id0 : -1);
|
||||
(reinterpret_cast<int8_t*>(sxidp + i))[1] = static_cast<int8_t>(id1 >= (chunk_num_h * 16) && id1 < (chunk_num_h + 1) * 16 ? id1 : -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
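// Builds the floating-point area-resize tables (source index, destination index, weight) and returns
// the number of entries written.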
int computeResizeAreaTabFP32(int src_go, int dst_go, int ssize, int dsize, float scale, uint16_t* si, uint16_t* di, float* alpha) {
|
||||
static const float threshold = 1e-3f;
|
||||
int k = 0;
|
||||
|
||||
for (int col = dst_go; col < dst_go + dsize; col++) {
|
||||
float fsx1 = col * scale;
|
||||
float fsx2 = fsx1 + scale;
|
||||
float cellWidth = (std::min)(scale, ssize - fsx1);
|
||||
|
||||
int sx1 = static_cast<int>(ceil(fsx1));
|
||||
int sx2 = static_cast<int>(floor(fsx2));
|
||||
|
||||
sx2 = (std::min)(sx2, ssize - 1);
|
||||
sx1 = (std::min)(sx1, sx2);
|
||||
|
||||
if (sx1 - fsx1 > threshold) {
|
||||
di[k] = (uint16_t)(col - dst_go);
|
||||
si[k] = (uint16_t)(sx1 - src_go - 1);
|
||||
alpha[k++] = (sx1 - fsx1) / cellWidth;
|
||||
}
|
||||
|
||||
for (int sx = sx1; sx < sx2; sx++) {
|
||||
di[k] = (uint16_t)(col - dst_go);
|
||||
si[k] = (uint16_t)(sx - src_go);
|
||||
alpha[k++] = 1.0f / cellWidth;
|
||||
}
|
||||
|
||||
if (fsx2 - sx2 > threshold) {
|
||||
di[k] = (uint16_t)(col - dst_go);
|
||||
si[k] = (uint16_t)(sx2 - src_go);
|
||||
alpha[k++] = (std::min)((std::min)(fsx2 - sx2, 1.f), cellWidth) / cellWidth;
|
||||
}
|
||||
}
|
||||
return k;
|
||||
}
|
||||
|
||||
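// Scalar area downscale (FP32 blobs and the non-SSE U8 fallback): accumulates the weighted source rows
// of each destination row into vert_sum, then applies the horizontal weights per destination column.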
template<typename data_t = float>
|
||||
void resize_area_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto sheight = static_cast<const int>(srcDims[2]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
const int src_go_x = 0;
|
||||
const int src_go_y = 0;
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
auto* sptr = static_cast<const data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto* dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
|
||||
auto sstep = static_cast<const int>(src_strides[2]);
|
||||
auto dstep = static_cast<const int>(dst_strides[2]);
|
||||
|
||||
float scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
float scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
|
||||
int vert_sum_size = swidth;
|
||||
int tabofs_size = (std::max)(2*swidth, 2*dwidth);
|
||||
int xsi_size = (std::max)(2*swidth, 2*dwidth);
|
||||
int xdi_size = (std::max)(2*swidth, 2*dwidth);
|
||||
int ysi_size = (std::max)(2*sheight, 2*dheight);
|
||||
int ydi_size = (std::max)(2*sheight, 2*dheight);
|
||||
int xalpha_size = (std::max)(2*swidth, 2*dwidth);
|
||||
|
||||
auto vert_sum = reinterpret_cast<float*>(buffer);
|
||||
auto tabofs = reinterpret_cast<int*>(vert_sum + vert_sum_size);
|
||||
auto xsi = reinterpret_cast<uint16_t*>(tabofs + tabofs_size + 1);
|
||||
auto xdi = xsi + xsi_size;
|
||||
auto ysi = xdi + xdi_size;
|
||||
auto ydi = ysi + ysi_size;
|
||||
auto xalpha = reinterpret_cast<float*>(ydi + ydi_size);
|
||||
auto yalpha = xalpha + xalpha_size;
|
||||
|
||||
int ytab_size = computeResizeAreaTabFP32(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, ydi, yalpha);
|
||||
int xtab_size = computeResizeAreaTabFP32(src_go_x, dst_go_x, src_full_width, dwidth, scale_x, xsi, xdi, xalpha);
|
||||
|
||||
int dy_ = 0;
|
||||
for (int i = 0; i < ytab_size && dy_ < dwidth*2; i++) {
|
||||
if (i == 0 || ydi[i] != ydi[i-1]) {
|
||||
tabofs[dy_++] = i;
|
||||
}
|
||||
}
|
||||
tabofs[dy_] = ytab_size;
|
||||
|
||||
auto full_pass = [&](const data_t* sptr_, data_t* dptr_, int y) {
|
||||
auto vert_sum_ = vert_sum;
|
||||
|
||||
memset(vert_sum_, 0, swidth * sizeof(float));
|
||||
|
||||
data_t *pdst = dptr_ + y * dstep;
|
||||
|
||||
for (int dy = tabofs[y]; dy < tabofs[y + 1] && dy < ytab_size; dy++) {
|
||||
float beta = yalpha[dy];
|
||||
int sy = ysi[dy];
|
||||
|
||||
const data_t *psrc = sptr_ + sy * sstep;
|
||||
for (int x = 0; x < swidth; x++) {
|
||||
vert_sum_[x] += beta * psrc[x];
|
||||
}
|
||||
}
|
||||
|
||||
int xtab_ind = 0;
|
||||
for (int x = 0; x < dwidth; x++) {
|
||||
float res = 0.f;
|
||||
int dx = 0;
|
||||
for (; x == xdi[xtab_ind + dx] && xtab_ind + dx < xtab_size; dx++) {
|
||||
float alpha = xalpha[xtab_ind + dx];
|
||||
int sx = xsi[xtab_ind + dx];
|
||||
|
||||
res += alpha * vert_sum_[sx];
|
||||
}
|
||||
|
||||
pdst[x] = saturate_cast<data_t>(res);
|
||||
xtab_ind += dx;
|
||||
}
|
||||
};
|
||||
|
||||
for (int ch = 0; ch < channels; ch++) {
|
||||
for (int y = 0; y < dheight; y++) {
|
||||
auto sptr_ = sptr + ch * origSrcH * origSrcW;
|
||||
auto dptr_ = dptr + ch * origDstH * origDstW;
|
||||
|
||||
full_pass(sptr_, dptr_, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline int clip(int x, int a, int b) {
|
||||
return x >= a ? (x < b ? x : b-1) : a;
|
||||
}
|
||||
|
||||
const int MAX_ESIZE = 16;
|
||||
|
||||
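// Horizontal pass of the separable linear filter used by resize_area_upscale: interpolates the selected
// source rows into float intermediate rows, handling rows in pairs where possible.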
template<typename data_t>
|
||||
void HResizeLinear(const data_t** src, float** dst, int count, const int* xofs, const float* alpha,
|
||||
int swidth, int dwidth, int cn, int xmin, int xmax ) {
|
||||
int dx, k;
|
||||
int dx0 = 0;
|
||||
|
||||
for (k = 0; k <= count - 2; k++) {
|
||||
const data_t *S0 = src[k], *S1 = src[k+1];
|
||||
float *D0 = dst[k], *D1 = dst[k+1];
|
||||
for (dx = dx0; dx < xmax; dx++) {
|
||||
int sx = xofs[dx];
|
||||
float a0 = alpha[dx*2], a1 = alpha[dx*2+1];
|
||||
float t0 = static_cast<float>(S0[sx])*a0 + static_cast<float>(S0[sx + cn])*a1;
|
||||
float t1 = static_cast<float>(S1[sx])*a0 + static_cast<float>(S1[sx + cn])*a1;
|
||||
D0[dx] = t0; D1[dx] = t1;
|
||||
}
|
||||
|
||||
for (; dx < dwidth; dx++) {
|
||||
int sx = xofs[dx];
|
||||
D0[dx] = static_cast<float>(S0[sx]); D1[dx] = static_cast<float>(S1[sx]);
|
||||
}
|
||||
}
|
||||
|
||||
for (; k < count; k++) {
|
||||
const data_t *S = src[k];
|
||||
float *D = dst[k];
|
||||
for (dx = 0; dx < xmax; dx++) {
|
||||
int sx = xofs[dx];
|
||||
D[dx] = static_cast<float>(S[sx])*alpha[dx*2] + static_cast<float>(S[sx+cn])*alpha[dx*2+1];
|
||||
}
|
||||
|
||||
for (; dx < dwidth; dx++)
|
||||
D[dx] = static_cast<float>(S[xofs[dx]]);
|
||||
}
|
||||
}
|
||||
|
||||
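// Vertical pass: blends two intermediate float rows into one destination row, saturating to U8 when
// data_t is an 8-bit type.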
template<typename data_t>
|
||||
void VResizeLinear(float** src, data_t* dst, const float* beta, int width) {
|
||||
float b0 = beta[0], b1 = beta[1];
|
||||
const float *S0 = src[0], *S1 = src[1];
|
||||
|
||||
if (sizeof(data_t) == 4) {
|
||||
for (int x = 0; x < width; x++)
|
||||
dst[x] = static_cast<data_t>(S0[x] * b0 + S1[x] * b1);
|
||||
} else {
|
||||
for (int x = 0; x < width; x++)
|
||||
dst[x] = saturateU32toU8(static_cast<uint32_t>(S0[x] * b0 + S1[x] * b1));
|
||||
}
|
||||
}
|
||||
|
||||
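// Area resize for upscale factors, implemented as a separable 2-tap (ksize == 2) linear filter over
// precomputed per-axis offset/weight tables.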
template<typename data_t>
|
||||
static void resize_area_upscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto sheight = static_cast<const int>(srcDims[2]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
auto sptr = static_cast<const data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
|
||||
auto sstep = static_cast<const int>(src_strides[2]);
|
||||
auto dstep = static_cast<const int>(dst_strides[2]);
|
||||
|
||||
float scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
float scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
float inv_scale_x = static_cast<float>(dst_full_width) / src_full_width;
|
||||
float inv_scale_y = static_cast<float>(dst_full_height) / src_full_height;
|
||||
|
||||
int xmin = 0, xmax = dwidth, width = dwidth;
|
||||
int ksize = 2;
|
||||
int ksize2 = ksize/2;
|
||||
|
||||
auto xofs = reinterpret_cast<int*>(buffer);
|
||||
auto yofs = xofs + width;
|
||||
auto alpha = reinterpret_cast<float*>(yofs + dheight);
|
||||
auto beta = alpha + width*ksize;
|
||||
float cbuf[2] = {0};
|
||||
|
||||
for (int dx = 0; dx < dwidth; dx++) {
|
||||
int sx = static_cast<int>(floor(dx*scale_x));
|
||||
float fx = (dx+1) - (sx+1)*inv_scale_x;
|
||||
fx = fx <= 0 ? 0.f : fx - floor(fx);
|
||||
|
||||
if (sx < ksize2-1) {
|
||||
xmin = dx+1;
|
||||
if (sx < 0)
|
||||
fx = 0, sx = 0;
|
||||
}
|
||||
|
||||
if (sx + ksize2 >= swidth) {
|
||||
xmax = (std::min)(xmax, dx);
|
||||
if (sx >= swidth-1)
|
||||
fx = 0, sx = swidth-1;
|
||||
}
|
||||
|
||||
xofs[dx] = sx;
|
||||
|
||||
cbuf[0] = 1.f - fx;
|
||||
cbuf[1] = fx;
|
||||
|
||||
for (int k = 0; k < ksize; k++)
|
||||
alpha[dx*ksize + k] = cbuf[k];
|
||||
}
|
||||
|
||||
for (int dy = 0; dy < dheight; dy++) {
|
||||
int sy = static_cast<int>(floor(dy*scale_y));
|
||||
float fy = (dy+1) - (sy+1)*inv_scale_y;
|
||||
fy = fy <= 0 ? 0.f : fy - floor(fy);
|
||||
|
||||
yofs[dy] = sy;
|
||||
cbuf[0] = 1.f - fy;
|
||||
cbuf[1] = fy;
|
||||
|
||||
for (int k = 0; k < ksize; k++)
|
||||
beta[dy*ksize + k] = cbuf[k];
|
||||
}
|
||||
|
||||
auto full_pass = [&](const data_t* sptr_, data_t* dptr_, int dy) {
|
||||
int bufstep = dwidth;
|
||||
const data_t* srows[MAX_ESIZE]={0};
|
||||
float* rows[MAX_ESIZE]={0};
|
||||
int prev_sy[MAX_ESIZE];
|
||||
|
||||
for (int k = 0; k < ksize; k++) {
|
||||
prev_sy[k] = -1;
|
||||
rows[k] = reinterpret_cast<float*>(buffer + (width + dheight)*(sizeof(int) + sizeof(float)*ksize))
|
||||
+ k*bufstep;
|
||||
}
|
||||
|
||||
int sy0 = yofs[dy], k0 = ksize, k1 = 0;
|
||||
|
||||
for (int k = 0; k < ksize; k++) {
|
||||
int sy = clip(sy0 - ksize2 + 1 + k, 0, sheight);
|
||||
for (k1 = (std::max)(k1, k); k1 < ksize; k1++) {
|
||||
if (k1 < MAX_ESIZE && sy == prev_sy[k1]) {
|
||||
if (k1 > k)
|
||||
memcpy(rows[k], rows[k1], bufstep*sizeof(rows[0][0]));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (k1 == ksize)
|
||||
k0 = (std::min)(k0, k);
|
||||
srows[k] = sptr_ + sy * sstep;
|
||||
prev_sy[k] = sy;
|
||||
}
|
||||
|
||||
if (k0 < ksize)
|
||||
HResizeLinear<data_t>(srows + k0, reinterpret_cast<float**>(rows + k0), ksize - k0, xofs,
|
||||
reinterpret_cast<const float*>(alpha), swidth, dwidth, 1, xmin, xmax);
|
||||
|
||||
VResizeLinear<data_t>(reinterpret_cast<float**>(rows), dptr_ + dstep*dy, beta + dy*ksize, dwidth);
|
||||
};
|
||||
|
||||
for (int ch = 0; ch < channels; ch++) {
|
||||
for (int dy = 0; dy < dheight; dy++) {
|
||||
auto sptr_ = sptr + ch * origSrcH * origSrcW;
|
||||
auto dptr_ = dptr + ch * origDstH * origDstW;
|
||||
|
||||
full_pass(sptr_, dptr_, dy);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
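// Computes the scratch-buffer size required by the resize path that will actually be executed
// (offset/weight tables plus intermediate rows).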
size_t resize_get_buffer_size(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorithm) {
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
SizeVector strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
size_t origW = strides[2];
|
||||
size_t origH = strides[1] / strides[2];
|
||||
|
||||
const int src_full_width = static_cast<int>(origW);
|
||||
const int src_full_height = static_cast<int>(origH);
|
||||
const int dst_full_width = static_cast<int>(dstDims[3]);
|
||||
const int dst_full_height = static_cast<int>(dstDims[2]);
|
||||
|
||||
float scale_x = static_cast<float>(dstDims[3]) / srcDims[3];
|
||||
float scale_y = static_cast<float>(dstDims[2]) / srcDims[2];
|
||||
|
||||
auto resize_bilinear_u8_buffer_size = [&]() {
|
||||
size_t buffer_size = (sizeof(int16_t) * 4 + sizeof(uint8_t *)) * dstDims[3] +
|
||||
(sizeof(int32_t) + sizeof(int16_t)) * dstDims[2] +
|
||||
sizeof(uint32_t) * dstDims[3] +
|
||||
(((srcDims[3] + 7) / 8) * 8 * 8) +
|
||||
sizeof(uint8_t) * 12;
|
||||
|
||||
return buffer_size;
|
||||
};
|
||||
|
||||
auto resize_bilinear_fp32_buffer_size = [&]() {
|
||||
size_t buffer_size = (sizeof(float) + sizeof(float *)) * dstDims[3] +
|
||||
(sizeof(int32_t) + sizeof(float)) * dstDims[2] +
|
||||
(((srcDims[3] + 1) / 2) * 2 * 2) * sizeof(float);
|
||||
|
||||
return buffer_size;
|
||||
};
|
||||
|
||||
auto resize_area_u8_downscale_sse_buffer_size = [&]() {
|
||||
const int dwidth = static_cast<int>(dstDims[3]);
|
||||
const int dheight = static_cast<int>(dstDims[2]);
|
||||
const int swidth = static_cast<int>(srcDims[3]);
|
||||
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
|
||||
int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, static_cast<float>(src_full_width) / dst_full_width) + 1;
|
||||
int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, static_cast<float>(src_full_height) / dst_full_height) + 1;
|
||||
|
||||
size_t si_buf_size = sizeof(uint16_t) * dwidth + sizeof(uint16_t) * dheight;
|
||||
size_t alpha_buf_size =
|
||||
sizeof(uint16_t) * (dwidth * x_max_count + 8 * 16) + sizeof(uint16_t) * dheight * y_max_count;
|
||||
size_t vert_sum_buf_size = sizeof(uint16_t) * (swidth * 2);
|
||||
size_t alpha_array_buf_size = sizeof(uint16_t) * 4 * dwidth;
|
||||
size_t sxid_array_buf_size = sizeof(uint16_t) * 4 * 4 * dwidth;
|
||||
|
||||
size_t buffer_size = si_buf_size +
|
||||
alpha_buf_size +
|
||||
vert_sum_buf_size +
|
||||
alpha_array_buf_size +
|
||||
sxid_array_buf_size;
|
||||
|
||||
return buffer_size;
|
||||
};

    auto resize_area_downscale_buffer_size = [&]() {
        size_t buffer_size = sizeof(float) * (srcDims[3]) +
                             sizeof(uint32_t) * (dstDims[3] * 2 + 1) +
                             sizeof(float) * ((srcDims[3] + srcDims[2]) * 4) +
                             sizeof(float) * ((srcDims[3] + srcDims[2]) * 2);

        return buffer_size;
    };

    auto resize_area_upscale_buffer_size = [&]() {
        size_t buffer_size = (dstDims[3] + dstDims[2])*(sizeof(int) + sizeof(float)*2) + 2*dstDims[3] * sizeof(float);

        return buffer_size;
    };

    if (algorithm == RESIZE_BILINEAR) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            return resize_bilinear_u8_buffer_size();
        } else {
            return resize_bilinear_fp32_buffer_size();
        }
    } else if (algorithm == RESIZE_AREA) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            if (scale_x <= 1 && scale_y <= 1) {
#ifdef HAVE_SSE
                if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
                    return resize_area_u8_downscale_sse_buffer_size();
                else
#endif
                    return resize_area_downscale_buffer_size();
            } else {
                return resize_area_upscale_buffer_size();
            }
        } else {
            if (scale_x <= 1 && scale_y <= 1)
                return resize_area_downscale_buffer_size();
            else
                return resize_area_upscale_buffer_size();
        }
    }

    return 0;
}

void resize(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorithm) {
    if (inBlob->getTensorDesc().getLayout() != NCHW || outBlob->getTensorDesc().getLayout() != NCHW)
        THROW_IE_EXCEPTION << "Resize supports only NCHW layout";

    if (!((inBlob->getTensorDesc().getPrecision() == Precision::U8 && outBlob->getTensorDesc().getPrecision() == Precision::U8) ||
          (inBlob->getTensorDesc().getPrecision() == Precision::FP32 && outBlob->getTensorDesc().getPrecision() == Precision::FP32)))
        THROW_IE_EXCEPTION << "Resize supports only U8 and FP32 precisions";

    if (algorithm != RESIZE_BILINEAR && algorithm != RESIZE_AREA)
        THROW_IE_EXCEPTION << "Unsupported resize algorithm type";

    size_t buffer_size = resize_get_buffer_size(inBlob, outBlob, algorithm);
    auto* buffer = static_cast<uint8_t *>(malloc(buffer_size));
    if (buffer == nullptr) {
        THROW_IE_EXCEPTION << "Could not allocate memory for blob";
    }
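    // The scratch buffer is plain malloc'd memory sized by resize_get_buffer_size();
    // it is released by the free() call at the end of this function.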

    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();
    float scale_x = static_cast<float>(dstDims[3]) / srcDims[3];
    float scale_y = static_cast<float>(dstDims[2]) / srcDims[2];

    if (algorithm == RESIZE_BILINEAR) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
#ifdef HAVE_SSE
            if (with_cpu_x86_sse42())
                Resize::resize_bilinear_u8(inBlob, outBlob, buffer);
            else
#endif
                resize_bilinear<uint8_t>(inBlob, outBlob, buffer);
        } else {
            resize_bilinear<float>(inBlob, outBlob, buffer);
        }
    } else if (algorithm == RESIZE_AREA) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            if (scale_x <= 1 && scale_y <= 1) {
#ifdef HAVE_SSE
                if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
                    Resize::resize_area_u8_downscale(inBlob, outBlob, buffer);
                else
#endif
                    resize_area_downscale<uint8_t>(inBlob, outBlob, buffer);
            } else {
                resize_area_upscale<uint8_t>(inBlob, outBlob, buffer);
            }
        } else {
            if (scale_x <= 1 && scale_y <= 1)
                resize_area_downscale<float>(inBlob, outBlob, buffer);
            else
                resize_area_upscale<float>(inBlob, outBlob, buffer);
        }
    }

    free(buffer);
}

} // namespace Resize

//----------------------------------------------------------------------

using namespace Resize;

/**
 * @brief This class stores pre-processing information for a particular input
 */
@ -759,8 +22,6 @@ class PreProcessData : public IPreProcessData {
     * @brief ROI blob.
     */
    Blob::Ptr _userBlob = nullptr;
    Blob::Ptr _tmp1 = nullptr;
    Blob::Ptr _tmp2 = nullptr;

    /**
     * @brief Pointer-to-implementation (PIMPL) hiding preprocessing implementation details.
@ -814,97 +75,12 @@ void PreProcessData::execute(Blob::Ptr &preprocessedBlob, const PreProcessInfo &
        _preproc.reset(new PreprocEngine);
    }

    if (_preproc->preprocessWithGAPI(_userBlob, preprocessedBlob, algorithm, fmt, serial, batchSize)) {
        return;
    }
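    // If the G-API engine handled the request we are done; the code below is the
    // legacy fallback path.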

    if (algorithm == NO_RESIZE) {
        THROW_IE_EXCEPTION << "Input pre-processing is called without the pre-processing info set: "
                              "there's nothing to be done";
    }

    if (batchSize > 1) {
        THROW_IE_EXCEPTION << "Batch pre-processing is unsupported in this mode. "
                              "Use default pre-processing instead to process batches.";
    }

    if (fmt != ColorFormat::RAW) {
        THROW_IE_EXCEPTION << "Non-default (not ColorFormat::RAW) color formats are unsupported "
                              "in this mode. Use default pre-processing instead to process color "
                              "formats.";
    }

    Blob::Ptr res_in, res_out;
    if (_userBlob->getTensorDesc().getLayout() == NHWC) {
        if (!_tmp1 || _tmp1->size() != _userBlob->size()) {
            if (_userBlob->getTensorDesc().getPrecision() == Precision::FP32) {
                _tmp1 = make_shared_blob<float>({Precision::FP32, _userBlob->getTensorDesc().getDims(), Layout::NCHW});
            } else {
                _tmp1 = make_shared_blob<uint8_t>({Precision::U8, _userBlob->getTensorDesc().getDims(), Layout::NCHW});
            }
            _tmp1->allocate();
        }

        {
            OV_ITT_SCOPED_TASK(itt::domains::IEPreproc, "Reorder before");
            blob_copy(_userBlob, _tmp1);
        }
        res_in = _tmp1;
    } else {
        res_in = _userBlob;
    }

    if (preprocessedBlob->getTensorDesc().getLayout() == NHWC) {
        if (!_tmp2 || _tmp2->size() != preprocessedBlob->size()) {
            if (preprocessedBlob->getTensorDesc().getPrecision() == Precision::FP32) {
                _tmp2 = make_shared_blob<float>({Precision::FP32, preprocessedBlob->getTensorDesc().getDims(), Layout::NCHW});
            } else {
                _tmp2 = make_shared_blob<uint8_t>({Precision::U8, preprocessedBlob->getTensorDesc().getDims(), Layout::NCHW});
            }
            _tmp2->allocate();
        }
        res_out = _tmp2;
    } else {
        res_out = preprocessedBlob;
    }
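    // res_in/res_out are NCHW views of the user and network blobs: NHWC data is staged
    // through the _tmp1/_tmp2 temporaries because resize() accepts NCHW only, and the
    // result is copied back afterwards (see "Reorder after" below).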

    {
        OV_ITT_SCOPED_TASK(itt::domains::IEPreproc, "Resize");
        resize(res_in, res_out, algorithm);
    }

    if (res_out == _tmp2) {
        OV_ITT_SCOPED_TASK(itt::domains::IEPreproc, "Reorder after");
        blob_copy(_tmp2, preprocessedBlob);
    }
    _preproc->preprocessWithGAPI(_userBlob, preprocessedBlob, algorithm, fmt, serial, batchSize);
}

void PreProcessData::isApplicable(const Blob::Ptr &src, const Blob::Ptr &dst) {
    // if G-API pre-processing is used, let it check that pre-processing is applicable
    if (PreprocEngine::useGAPI()) {
        PreprocEngine::checkApplicabilityGAPI(src, dst);
        return;
    }

    if (!src->is<MemoryBlob>() || !dst->is<MemoryBlob>()) {
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Source and destination blobs must "
                              "be memory blobs";
    }

    auto &src_dims = src->getTensorDesc().getDims();
    auto &dst_dims = dst->getTensorDesc().getDims();

    if (src_dims.size() != dst_dims.size())
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Source and destination blobs have different "
                              "number of dimensions";

    if (src_dims.size() != 4)
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Only 4D tensors are supported.";

    if (src_dims[0] != dst_dims[0] || src_dims[1] != dst_dims[1])
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Wrong shape. Network expected 4D input tensor with "
                              "shape [" << dst_dims[0] << "," << dst_dims[1] << ",H,W] but provided tensor has "
                              "shape " << details::dumpVec(src_dims) << ".";
    PreprocEngine::checkApplicabilityGAPI(src, dst);
}

} // namespace InferenceEngine

@ -97,42 +97,4 @@ inline PreProcessDataPtr CreatePreprocDataHelper() {
    return PreProcessDataPtr(preprocLibraryPath);
}

//----------------------------------------------------------------------
//
// Implementation-internal types and functions and macros
//
//----------------------------------------------------------------------

namespace Resize {

static inline uint8_t saturateU32toU8(uint32_t v) {
    return static_cast<uint8_t>(v > UINT8_MAX ? UINT8_MAX : v);
}

void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer);

void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer);

int getResizeAreaTabSize(int dst_go, int ssize, int dsize, float scale);

void computeResizeAreaTab(int src_go, int dst_go, int ssize, int dsize, float scale,
                          uint16_t* si, uint16_t* alpha, int max_count);

void generate_alpha_and_id_arrays(int x_max_count, int dcols, const uint16_t* xalpha, uint16_t* xsi,
                                  uint16_t** alpha, uint16_t** sxid);

enum BorderType {
    BORDER_CONSTANT = 0,
    BORDER_REPLICATE = 1,
};

struct Border {
    BorderType type;
    int32_t value;
};
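// Presumably the usual border semantics apply: BORDER_CONSTANT pads out-of-range
// reads with 'value', while BORDER_REPLICATE repeats the nearest edge pixel
// (as in OpenCV's border modes).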

} // namespace Resize

//----------------------------------------------------------------------

} // namespace InferenceEngine

@ -757,15 +757,6 @@ PreprocEngine::Update PreprocEngine::needUpdate(const CallDesc &newCallOrig) con
    return Update::NOTHING;
}

bool PreprocEngine::useGAPI() {
    static const bool NO_GAPI = [](const char *str) -> bool {
        std::string var(str ? str : "");
        return var == "N" || var == "NO" || var == "OFF" || var == "0";
    } (getenv("USE_GAPI"));

    return !NO_GAPI;
}
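// Usage sketch (assumed): the G-API path is on by default and is disabled only via
// the USE_GAPI environment variable, e.g.
//   USE_GAPI=NO ./your_app      -> useGAPI() returns false (legacy code path)
//   variable unset or "YES"     -> useGAPI() returns true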

void PreprocEngine::checkApplicabilityGAPI(const Blob::Ptr &src, const Blob::Ptr &dst) {
    // Note: src blob is the ROI blob, dst blob is the network's input blob

@ -904,7 +895,7 @@ void PreprocEngine::executeGraph(Opt<cv::GComputation>& lastComputation,
}

template<typename BlobTypePtr>
bool PreprocEngine::preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &outBlob,
void PreprocEngine::preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &outBlob,
    ResizeAlgorithm algorithm, ColorFormat in_fmt, ColorFormat out_fmt, bool omp_serial,
    int batch_size) {

@ -953,7 +944,6 @@ bool PreprocEngine::preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &o
    if (algorithm == NO_RESIZE && std::get<0>(thisCall) == std::get<1>(thisCall)) {
        // If the requested output parameters match the input blob, there is nothing to do
        THROW_IE_EXCEPTION << "No job to do in the PreProcessing ?";
        return true;
    }

    const Update update = needUpdate(thisCall);
@ -983,15 +973,10 @@ bool PreprocEngine::preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &o

    executeGraph(_lastComputation, batched_input_plane_mats, batched_output_plane_mats, batch_size,
                 omp_serial, update);

    return true;
}

bool PreprocEngine::preprocessWithGAPI(const Blob::Ptr &inBlob, Blob::Ptr &outBlob,
void PreprocEngine::preprocessWithGAPI(const Blob::Ptr &inBlob, Blob::Ptr &outBlob,
    const ResizeAlgorithm& algorithm, ColorFormat in_fmt, bool omp_serial, int batch_size) {
    if (!useGAPI()) {
        return false;
    }

    const auto out_fmt = (in_fmt == ColorFormat::RAW) ? ColorFormat::RAW : ColorFormat::BGR; // FIXME: get expected color format from network

@ -45,16 +45,15 @@ class PreprocEngine {
        Update update);

    template<typename BlobTypePtr>
    bool preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &outBlob,
    void preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &outBlob,
        ResizeAlgorithm algorithm, ColorFormat in_fmt, ColorFormat out_fmt, bool omp_serial,
        int batch_size);

 public:
    PreprocEngine();
    static bool useGAPI();
    static void checkApplicabilityGAPI(const Blob::Ptr &src, const Blob::Ptr &dst);
    static int getCorrectBatchSize(int batch_size, const Blob::Ptr& roiBlob);
    bool preprocessWithGAPI(const Blob::Ptr &inBlob, Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm,
    void preprocessWithGAPI(const Blob::Ptr &inBlob, Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm,
        ColorFormat in_fmt, bool omp_serial, int batch_size = -1);
};