[PP] Removed old (non GAPI) preprocessing code (#3664)
parent e490dfc161
commit 977c3dda23
@@ -1,682 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ie_preprocess_data.hpp"
#include "ie_preprocess_data_sse42.hpp"

#include <nmmintrin.h> // SSE 4.2

#include <stdint.h>

namespace InferenceEngine {
namespace Resize {

static inline int ceil(double value) {
|
||||
__m128d t = _mm_set_sd(value);
|
||||
int i = _mm_cvtsd_si32(t);
|
||||
return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t, i), t));
|
||||
}
|
||||
|
||||
|
||||
static inline int floor(double value) {
|
||||
__m128d t = _mm_set_sd(value);
|
||||
int i = _mm_cvtsd_si32(t);
|
||||
return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t, i)));
|
||||
}
|
||||
|
||||
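// Rounding Q15 fixed-point multiply; scalar counterpart of the _mm_mulhrs_epi16 used in the SIMD paths below.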
static inline int16_t mulq15(int16_t a, int16_t b) {
|
||||
return static_cast<int16_t>(((1 << 14) + (int32_t)a * (int32_t)b) >> 15);
|
||||
}
|
||||
|
||||
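// Q16 fixed-point multiply (high 16 bits of the unsigned 32-bit product); scalar counterpart of _mm_mulhi_epu16.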
static inline uint16_t mulq16(uint16_t a, uint16_t b) {
|
||||
return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16);
|
||||
}
|
||||
|
||||
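// Bilinear resize for U8 NCHW blobs (SSE4.2 path). "buffer" holds the precomputed source offsets and Q15 weights;
// the vertical pass blends source row pairs into an interleaved temporary block (four destination rows at a time),
// and the horizontal pass then writes the output rows.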
void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
Border border = {BORDER_REPLICATE, 0};
|
||||
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
const int src_go_x = 0;
|
||||
const int src_go_y = 0;
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
const uint8_t *sptr = static_cast<uint8_t *>(inBlob->buffer()) +
|
||||
inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
uint8_t *dptr = static_cast<uint8_t *>(outBlob->buffer()) +
|
||||
outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
|
||||
auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
auto scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
|
||||
const int BITS = 15;
|
||||
const int SCALE = (1 << BITS);
|
||||
const int alpha_clones_num = 4;
|
||||
const int cols_block_size = 8;
|
||||
const int kRowsBlockSize = 4;
|
||||
|
||||
auto *pxofs1 = reinterpret_cast<int32_t *>(buffer);
|
||||
auto *alpha = reinterpret_cast<int16_t *>(pxofs1 + dwidth);
|
||||
auto *yofs = reinterpret_cast<int32_t *>(alpha + dwidth * alpha_clones_num);
|
||||
auto *beta = reinterpret_cast<int16_t *>(yofs + dheight);
|
||||
auto *tptr = reinterpret_cast<uint8_t *>(beta + dheight);
|
||||
|
||||
auto tptr_ = tptr;
|
||||
|
||||
tptr_[0] = (uint8_t) border.value;
|
||||
tptr_[1] = (uint8_t) border.value;
|
||||
tptr_[2] = (uint8_t) border.value;
|
||||
tptr_[3] = (uint8_t) border.value;
|
||||
tptr_[swidth + 0 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth + 1 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth + 2 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth + 3 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth * kRowsBlockSize + 0 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth * kRowsBlockSize + 1 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth * kRowsBlockSize + 2 + 4] = (uint8_t) border.value;
|
||||
tptr_[swidth * kRowsBlockSize + 3 + 4] = (uint8_t) border.value;
|
||||
|
||||
for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) {
|
||||
auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
|
||||
int32_t sx = floor(fx);
|
||||
fx -= sx;
|
||||
|
||||
int32_t sx0 = sx;
|
||||
if (sx < 0 && border.type == BORDER_REPLICATE) {
|
||||
fx = 0;
|
||||
sx0 = 0;
|
||||
}
|
||||
|
||||
fx = fx * SCALE;
|
||||
|
||||
if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) {
|
||||
fx = 1.f * SCALE - 1;
|
||||
sx0 = (std::max)(src_full_width - 2, 0);
|
||||
}
|
||||
|
||||
pxofs1[dx - dst_go_x] = kRowsBlockSize * (sx0 - src_go_x);
|
||||
for (int i = 0; i < alpha_clones_num; i++) {
|
||||
alpha[(dx - dst_go_x) * alpha_clones_num + i] = (int16_t) fx;
|
||||
}
|
||||
}
|
||||
|
||||
for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) {
|
||||
float fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
|
||||
int32_t sy = floor(fy);
|
||||
fy -= sy;
|
||||
|
||||
int32_t sy0 = sy;
|
||||
if (sy < 0 && border.type == BORDER_REPLICATE) {
|
||||
fy = 0;
|
||||
sy0 = 0;
|
||||
}
|
||||
|
||||
fy = fy * SCALE;
|
||||
|
||||
if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) {
|
||||
fy = 1.f * SCALE - 1;
|
||||
sy0 = (std::max)(src_full_height - 2, 0);
|
||||
}
|
||||
|
||||
yofs[dy - dst_go_y] = (sy0 - src_go_y) * sstep;
|
||||
beta[dy - dst_go_y] = (int16_t) fy;
|
||||
}
|
||||
|
||||
if (swidth < cols_block_size || dwidth < cols_block_size || dheight < kRowsBlockSize) {
|
||||
auto full_pass = [&](int c, int y) {
|
||||
auto sptr_ = sptr + c * origSrcW * origSrcH;
|
||||
auto dptr_ = dptr + c * origDstW * origDstH;
|
||||
auto tptr_ = tptr;
|
||||
|
||||
for (int x = 0; x < swidth; x++) {
|
||||
int val0 = (yofs[y] < 0) ? border.value : sptr_[yofs[y] + x + 0];
|
||||
int val1 = (yofs[y] / sstep + 1 >= src_full_height - src_go_y) ? border.value : sptr_[yofs[y] + x + sstep];
|
||||
|
||||
int res = val0 + mulq15(beta[y], (int16_t) (val1 - val0));
|
||||
tptr_[x + 4] = (uint8_t) res;
|
||||
}
|
||||
|
||||
for (int x = 0; x < dwidth; x++) {
|
||||
int val0 = tptr_[pxofs1[x] / kRowsBlockSize + 0 + 4];
|
||||
int val1 = tptr_[pxofs1[x] / kRowsBlockSize + 1 + 4];
|
||||
|
||||
int res = val0 + mulq15(alpha[x * alpha_clones_num], (int16_t) (val1 - val0));
|
||||
dptr_[y * dstep + x] = (uint8_t) res;
|
||||
}
|
||||
};
|
||||
|
||||
for (int c = 0; c < channels; c++) {
|
||||
for (int y = 0; y < dheight; y++) {
|
||||
full_pass(c, y);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
auto full_pass_vec = [&](const uint8_t* sptr_, uint8_t* dptr_, uint8_t* tptr_, int y) {
|
||||
int32_t filtered_rows_id[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
filtered_rows_id[i] = (yofs[y + i] < 0) ? 0 :
|
||||
(yofs[y + i] / sstep >= src_full_height - src_go_y - 1) ? 0 : yofs[y + i];
|
||||
}
|
||||
|
||||
__m128i b0 = _mm_set1_epi16(beta[y + 0]);
|
||||
__m128i b1 = _mm_set1_epi16(beta[y + 1]);
|
||||
__m128i b2 = _mm_set1_epi16(beta[y + 2]);
|
||||
__m128i b3 = _mm_set1_epi16(beta[y + 3]);
|
||||
|
||||
int x = 0;
|
||||
vertical_pass:
|
||||
for (; x <= swidth - cols_block_size; x += cols_block_size) {
|
||||
__m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0])),
|
||||
*(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1])), 1);
|
||||
__m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2])),
|
||||
*(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3])), 1);
|
||||
__m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0] + sstep)),
|
||||
*(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1] + sstep)), 1);
|
||||
__m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2] + sstep)),
|
||||
*(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3] + sstep)), 1);
|
||||
|
||||
__m128i val0_0 = _mm_unpacklo_epi8(val0lo, _mm_setzero_si128());
|
||||
__m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
|
||||
__m128i val0_2 = _mm_unpacklo_epi8(val0hi, _mm_setzero_si128());
|
||||
__m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
|
||||
|
||||
__m128i val1_0 = _mm_unpacklo_epi8(val1lo, _mm_setzero_si128());
|
||||
__m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
|
||||
__m128i val1_2 = _mm_unpacklo_epi8(val1hi, _mm_setzero_si128());
|
||||
__m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());
|
||||
|
||||
__m128i s0_0 = _mm_sub_epi16(val1_0, val0_0);
|
||||
__m128i s0_1 = _mm_sub_epi16(val1_1, val0_1);
|
||||
__m128i s0_2 = _mm_sub_epi16(val1_2, val0_2);
|
||||
__m128i s0_3 = _mm_sub_epi16(val1_3, val0_3);
|
||||
|
||||
__m128i t0 = _mm_mulhrs_epi16(s0_0, b0);
|
||||
__m128i t1 = _mm_mulhrs_epi16(s0_1, b1);
|
||||
__m128i t2 = _mm_mulhrs_epi16(s0_2, b2);
|
||||
__m128i t3 = _mm_mulhrs_epi16(s0_3, b3);
|
||||
|
||||
__m128i r0 = _mm_add_epi16(val0_0, t0);
|
||||
__m128i r1 = _mm_add_epi16(val0_1, t1);
|
||||
__m128i r2 = _mm_add_epi16(val0_2, t2);
|
||||
__m128i r3 = _mm_add_epi16(val0_3, t3);
|
||||
|
||||
__m128i q0 = _mm_packus_epi16(r0, r1);
|
||||
__m128i q1 = _mm_packus_epi16(r2, r3);
|
||||
|
||||
__m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
|
||||
__m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);
|
||||
|
||||
__m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
|
||||
__m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
|
||||
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 0) * kRowsBlockSize + 4), q4);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 4) * kRowsBlockSize + 4), q5);
|
||||
}
|
||||
|
||||
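// Tail handling: when swidth is not a multiple of the block size, redo the last (overlapping) block
// instead of falling back to a scalar remainder loop.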
if (x < swidth) {
|
||||
x = swidth - cols_block_size;
|
||||
goto vertical_pass;
|
||||
}
|
||||
|
||||
if (border.type == BORDER_CONSTANT) {
|
||||
for (int i = 0; i < kRowsBlockSize; i++) {
|
||||
if (yofs[y + i] < 0) {
|
||||
for (x = 0; x < swidth; x++) {
|
||||
int val0 = border.value;
|
||||
int val1 = sptr_[yofs[y + i] + x + sstep];
|
||||
|
||||
int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
|
||||
tptr_[x * 4 + i + 4] = (uint8_t) res;
|
||||
}
|
||||
}
|
||||
|
||||
if (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) {
|
||||
for (x = 0; x < swidth; x++) {
|
||||
int val0 = sptr_[yofs[y + i] + x];
|
||||
int val1 = border.value;
|
||||
|
||||
int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
|
||||
tptr_[x * 4 + i + 4] = (uint8_t) res;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
x = 0;
|
||||
horizontal_pass:
|
||||
for (; x <= dwidth - cols_block_size; x += cols_block_size) {
|
||||
__m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 0) * alpha_clones_num));
|
||||
__m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 2) * alpha_clones_num));
|
||||
__m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 4) * alpha_clones_num));
|
||||
__m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 6) * alpha_clones_num));
|
||||
|
||||
__m128i val_0 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 0] + 4)),
|
||||
*(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 1] + 4)), 1);
|
||||
__m128i val_1 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 2] + 4)),
|
||||
*(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 3] + 4)), 1);
|
||||
__m128i val_2 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 4] + 4)),
|
||||
*(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 5] + 4)), 1);
|
||||
__m128i val_3 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 6] + 4)),
|
||||
*(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 7] + 4)), 1);
|
||||
|
||||
val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
|
||||
__m128i val0_0 = _mm_unpacklo_epi8(val_0, _mm_setzero_si128());
|
||||
__m128i val0_1 = _mm_unpacklo_epi8(val_1, _mm_setzero_si128());
|
||||
__m128i val0_2 = _mm_unpacklo_epi8(val_2, _mm_setzero_si128());
|
||||
__m128i val0_3 = _mm_unpacklo_epi8(val_3, _mm_setzero_si128());
|
||||
|
||||
__m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
|
||||
__m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
|
||||
__m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
|
||||
__m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());
|
||||
|
||||
val1_0 = _mm_sub_epi16(val1_0, val0_0);
|
||||
val1_1 = _mm_sub_epi16(val1_1, val0_1);
|
||||
val1_2 = _mm_sub_epi16(val1_2, val0_2);
|
||||
val1_3 = _mm_sub_epi16(val1_3, val0_3);
|
||||
|
||||
__m128i t0 = _mm_mulhrs_epi16(val1_0, a10);
|
||||
__m128i t1 = _mm_mulhrs_epi16(val1_1, a32);
|
||||
__m128i t2 = _mm_mulhrs_epi16(val1_2, a54);
|
||||
__m128i t3 = _mm_mulhrs_epi16(val1_3, a76);
|
||||
|
||||
__m128i r0 = _mm_add_epi16(val0_0, t0);
|
||||
__m128i r1 = _mm_add_epi16(val0_1, t1);
|
||||
__m128i r2 = _mm_add_epi16(val0_2, t2);
|
||||
__m128i r3 = _mm_add_epi16(val0_3, t3);
|
||||
|
||||
__m128i q0 = _mm_packus_epi16(r0, r1);
|
||||
__m128i q1 = _mm_packus_epi16(r2, r3);
|
||||
|
||||
__m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
|
||||
__m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
|
||||
|
||||
__m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
|
||||
__m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);
|
||||
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 0) * dstep + x), q4);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 1) * dstep + x), _mm_srli_si128(q4, 8));
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 2) * dstep + x), q5);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 3) * dstep + x), _mm_srli_si128(q5, 8));
|
||||
}
|
||||
|
||||
if (x < dwidth) {
|
||||
x = dwidth - cols_block_size;
|
||||
goto horizontal_pass;
|
||||
}
|
||||
};
|
||||
|
||||
for (int c = 0; c < channels; c++) {
|
||||
for (int y = 0; y <= dheight - kRowsBlockSize; y += kRowsBlockSize) {
|
||||
auto sptr_ = sptr + c * origSrcW * origSrcH;
|
||||
auto dptr_ = dptr + c * origDstW * origDstH;
|
||||
auto tptr_ = tptr;
|
||||
|
||||
full_pass_vec(sptr_, dptr_, tptr_, y);
|
||||
|
||||
if (y + kRowsBlockSize > dheight - kRowsBlockSize)
|
||||
full_pass_vec(sptr_, dptr_, tptr_, dheight - kRowsBlockSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
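// Area (box-filter) downscale for U8 NCHW blobs (SSE4.2 path). Builds per-axis contribution tables
// (source index + Q16 weight), accumulates weighted vertical sums per output row, then applies the
// horizontal weights, using pshufb-based gathers when at most four source pixels contribute per column.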
void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto sheight = static_cast<const int>(srcDims[2]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
const int src_go_x = 0;
|
||||
const int src_go_y = 0;
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
auto sptr = static_cast<uint8_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto dptr = static_cast<uint8_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
|
||||
float scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
float scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
|
||||
int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, scale_x);
|
||||
int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, scale_y);
|
||||
|
||||
auto* xsi = reinterpret_cast<uint16_t*>(buffer);
|
||||
auto* ysi = xsi + dwidth;
|
||||
auto* xalpha = ysi + dheight;
|
||||
auto* yalpha = xalpha + dwidth*x_max_count + 8*16;
|
||||
|
||||
computeResizeAreaTab(src_go_x, dst_go_x, src_full_width, dwidth, scale_x, xsi, xalpha, x_max_count);
|
||||
computeResizeAreaTab(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, yalpha, y_max_count);
|
||||
|
||||
int vert_sum_size = 2*swidth;
|
||||
uint16_t* vert_sum = yalpha + dheight*y_max_count;
|
||||
uint16_t* alpha0 = vert_sum + vert_sum_size;
|
||||
uint16_t* alpha1 = alpha0 + dwidth;
|
||||
uint16_t* alpha2 = alpha1 + dwidth;
|
||||
uint16_t* alpha3 = alpha2 + dwidth;
|
||||
uint16_t* sxid0 = alpha3 + dwidth;
|
||||
uint16_t* sxid1 = sxid0 + 4*dwidth;
|
||||
uint16_t* sxid2 = sxid1 + 4*dwidth;
|
||||
uint16_t* sxid3 = sxid2 + 4*dwidth;
|
||||
|
||||
uint16_t* alpha[] = {alpha0, alpha1, alpha2, alpha3};
|
||||
uint16_t* sxid[] = {sxid0, sxid1, sxid2, sxid3};
|
||||
generate_alpha_and_id_arrays(x_max_count, dwidth, xalpha, xsi, alpha, sxid);
|
||||
|
||||
auto full_pass = [&](int c, int y) {
|
||||
uint8_t* pdst_row = dptr + (y * dstep) + c * origDstW * origDstH;
|
||||
uint16_t* vert_sum_ = vert_sum;
|
||||
|
||||
int ysi_row = ysi[y];
|
||||
|
||||
memset(vert_sum_, 0, swidth * sizeof(uint16_t));
|
||||
|
||||
for (int dy = 0; dy < y_max_count; dy++) {
|
||||
uint16_t yalpha_dy = yalpha[y * y_max_count + dy];
|
||||
const uint8_t *sptr_dy = sptr + ((ysi_row + dy) * sstep) + c * origSrcW * origSrcH;
|
||||
if (ysi_row + dy >= sheight) break;
|
||||
|
||||
int x = 0;
|
||||
|
||||
__m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy);
|
||||
for (; x <= swidth - 16; x += 16) {
|
||||
__m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x));
|
||||
|
||||
// sptr_dy[x] << 8
|
||||
__m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval);
|
||||
__m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval);
|
||||
|
||||
__m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0));
|
||||
__m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8));
|
||||
|
||||
vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo));
|
||||
vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi));
|
||||
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi);
|
||||
}
|
||||
|
||||
for (; x < swidth; x++) {
|
||||
vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8));
|
||||
}
|
||||
}
|
||||
|
||||
if (x_max_count == 2) {
|
||||
int x = 0;
|
||||
for (; x <= dwidth - 8; x += 8) {
|
||||
__m128i res = _mm_set1_epi16(1 << (8 - 1));
|
||||
|
||||
int id0 = xsi[x];
|
||||
|
||||
__m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
|
||||
__m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
|
||||
|
||||
__m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2));
|
||||
__m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8));
|
||||
|
||||
__m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2));
|
||||
__m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8));
|
||||
|
||||
__m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx0_id1));
|
||||
__m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx1_id1));
|
||||
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
|
||||
|
||||
res = _mm_srli_epi16(res, 8);
|
||||
res = _mm_packus_epi16(res, res);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
|
||||
}
|
||||
|
||||
for (; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
int id = xsi[x];
|
||||
res += mulq16(alpha0[x], vert_sum_[id + 0]);
|
||||
res += mulq16(alpha1[x], vert_sum_[id + 1]);
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
} else if (x_max_count == 3) {
|
||||
int x = 0;
|
||||
for (; x <= dwidth - 8; x += 8) {
|
||||
__m128i res = _mm_set1_epi16(1 << (8 - 1));
|
||||
|
||||
int id0 = xsi[x];
|
||||
|
||||
__m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
|
||||
__m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
|
||||
__m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
|
||||
|
||||
__m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3));
|
||||
__m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8));
|
||||
__m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16));
|
||||
|
||||
__m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3));
|
||||
__m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8));
|
||||
__m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16));
|
||||
|
||||
__m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3));
|
||||
__m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8));
|
||||
__m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16));
|
||||
|
||||
__m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx0_id1)),
|
||||
_mm_shuffle_epi8(chunk2, sx0_id2));
|
||||
__m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx1_id1)),
|
||||
_mm_shuffle_epi8(chunk2, sx1_id2));
|
||||
__m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx2_id1)),
|
||||
_mm_shuffle_epi8(chunk2, sx2_id2));
|
||||
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
|
||||
|
||||
res = _mm_srli_epi16(res, 8);
|
||||
res = _mm_packus_epi16(res, res);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
|
||||
}
|
||||
|
||||
for (; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
int id = xsi[x];
|
||||
res += mulq16(alpha0[x], vert_sum_[id + 0]);
|
||||
res += mulq16(alpha1[x], vert_sum_[id + 1]);
|
||||
res += mulq16(alpha2[x], vert_sum_[id + 2]);
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
} else if (x_max_count == 4) {
|
||||
int x = 0;
|
||||
for (; x <= dwidth - 8; x += 8) {
|
||||
__m128i res = _mm_set1_epi16(1 << (8 - 1));
|
||||
|
||||
int id0 = xsi[x];
|
||||
|
||||
__m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
|
||||
__m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
|
||||
__m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
|
||||
__m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24));
|
||||
|
||||
__m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4));
|
||||
__m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8));
|
||||
__m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16));
|
||||
__m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24));
|
||||
|
||||
__m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4));
|
||||
__m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8));
|
||||
__m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16));
|
||||
__m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24));
|
||||
|
||||
__m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4));
|
||||
__m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8));
|
||||
__m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16));
|
||||
__m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24));
|
||||
|
||||
__m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4));
|
||||
__m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8));
|
||||
__m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16));
|
||||
__m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24));
|
||||
|
||||
__m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx0_id1)),
|
||||
_mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2),
|
||||
_mm_shuffle_epi8(chunk3, sx0_id3)));
|
||||
__m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx1_id1)),
|
||||
_mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2),
|
||||
_mm_shuffle_epi8(chunk3, sx1_id3)));
|
||||
__m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx2_id1)),
|
||||
_mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2),
|
||||
_mm_shuffle_epi8(chunk3, sx2_id3)));
|
||||
__m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0),
|
||||
_mm_shuffle_epi8(chunk1, sx3_id1)),
|
||||
_mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2),
|
||||
_mm_shuffle_epi8(chunk3, sx3_id3)));
|
||||
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3));
|
||||
|
||||
res = _mm_srli_epi16(res, 8);
|
||||
res = _mm_packus_epi16(res, res);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
|
||||
}
|
||||
|
||||
for (; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
int id = xsi[x];
|
||||
res += mulq16(alpha0[x], vert_sum_[id + 0]);
|
||||
res += mulq16(alpha1[x], vert_sum_[id + 1]);
|
||||
res += mulq16(alpha2[x], vert_sum_[id + 2]);
|
||||
res += mulq16(alpha3[x], vert_sum_[id + 3]);
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
} else if (x_max_count <= 7) {
|
||||
int x = 0;
|
||||
for (; x <= dwidth - 8; x += 8) {
|
||||
__m128i res = _mm_set1_epi16(1 << (16 - 8 - 1));
|
||||
for (int i = 0; i < x_max_count; i++) {
|
||||
__m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 1 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 2 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 3 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 4 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 5 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 6 + i],
|
||||
xalpha[x * x_max_count + x_max_count * 7 + i]);
|
||||
__m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i],
|
||||
vert_sum_[xsi[x + 1] + i],
|
||||
vert_sum_[xsi[x + 2] + i],
|
||||
vert_sum_[xsi[x + 3] + i],
|
||||
vert_sum_[xsi[x + 4] + i],
|
||||
vert_sum_[xsi[x + 5] + i],
|
||||
vert_sum_[xsi[x + 6] + i],
|
||||
vert_sum_[xsi[x + 7] + i]);
|
||||
|
||||
res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum));
|
||||
}
|
||||
res = _mm_srli_epi16(res, 8);
|
||||
res = _mm_packus_epi16(res, res);
|
||||
_mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
|
||||
}
|
||||
|
||||
for (; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
for (int i = 0; i < x_max_count; i++) {
|
||||
uint16_t a = xalpha[x * x_max_count + i];
|
||||
int sx = xsi[x] + i;
|
||||
|
||||
res += mulq16(a, vert_sum_[sx]);
|
||||
}
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
} else {
|
||||
for (int x = 0; x < dwidth; x++) {
|
||||
uint16_t res = 1 << (8 - 1);
|
||||
__m128i vres = _mm_setzero_si128();
|
||||
int id = xsi[x];
|
||||
|
||||
int i = 0;
|
||||
for (; i <= x_max_count - 8; i += 8) {
|
||||
__m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i));
|
||||
__m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i));
|
||||
|
||||
vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s));
|
||||
}
|
||||
vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2));
|
||||
vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4));
|
||||
vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8));
|
||||
res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7));
|
||||
|
||||
for (; i < x_max_count; i++) {
|
||||
uint16_t a = xalpha[x * x_max_count + i];
|
||||
uint16_t s = vert_sum_[id + i];
|
||||
|
||||
res += mulq16(a, s);
|
||||
}
|
||||
|
||||
pdst_row[x] = saturateU32toU8(res >> 8);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for (int c = 0; c < channels; c++) {
|
||||
for (int y = 0; y < dheight; y++) {
|
||||
full_pass(c, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Resize
|
||||
} // namespace InferenceEngine
|
@@ -1,17 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "ie_blob.h"

#include <stdint.h>

namespace InferenceEngine {

void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer);

void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer);

}
@@ -4,753 +4,16 @@

#include "ie_preprocess_gapi.hpp"
#include "ie_system_conf.h"
#include "blob_transform.hpp"
#include "ie_preprocess_data.hpp"
#include "ie_preprocess_itt.hpp"

#ifdef HAVE_SSE
# include "cpu_x86_sse42/ie_preprocess_data_sse42.hpp"
#endif

#include "debug.h"
#include "ie_compound_blob.h"
#include <ie_input_info.hpp>

#include <memory>
#include <algorithm>

namespace InferenceEngine {


namespace Resize {

template<typename data_t> static inline data_t saturate_cast(float res);
|
||||
|
||||
template<> inline float saturate_cast(float res) {
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> inline uint8_t saturate_cast(float res) {
|
||||
int ires = static_cast<int>((std::round)(res));
|
||||
return static_cast<uint8_t>((std::max)(0, (std::min)(255, ires)));
|
||||
}
|
||||
|
||||
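// Scalar bilinear resize; used for FP32 blobs and as the U8 fallback when SSE4.2 is not available (see resize() below).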
template<typename data_t = float>
|
||||
void resize_bilinear(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
Border border = {BORDER_REPLICATE, 0};
|
||||
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
const int src_go_x = 0;
|
||||
const int src_go_y = 0;
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
auto *sptr = static_cast<data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto *dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
|
||||
auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
auto scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
|
||||
auto* xofs = reinterpret_cast<int32_t*>(buffer);
|
||||
auto* yofs = xofs + dwidth;
|
||||
auto* alpha = reinterpret_cast<float*>(yofs + dheight);
|
||||
auto* beta = alpha + dwidth;
|
||||
auto* tptr = beta + dheight;
|
||||
|
||||
for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) {
|
||||
auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
|
||||
int32_t sx = static_cast<int32_t>(floor(fx));
|
||||
fx -= sx;
|
||||
|
||||
int32_t sx0 = sx;
|
||||
if (sx < 0 && border.type == BORDER_REPLICATE) {
|
||||
fx = 0;
|
||||
sx0 = 0;
|
||||
}
|
||||
|
||||
if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) {
|
||||
fx = 1.f;
|
||||
sx0 = (std::max)(src_full_width - 2, 0);
|
||||
}
|
||||
|
||||
xofs[dx - dst_go_x] = sx0 - src_go_x;
|
||||
alpha[dx - dst_go_x] = fx;
|
||||
}
|
||||
|
||||
for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) {
|
||||
auto fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
|
||||
int32_t sy = static_cast<int32_t>(floor(fy));
|
||||
fy -= sy;
|
||||
|
||||
int32_t sy0 = sy;
|
||||
if (sy < 0 && border.type == BORDER_REPLICATE) {
|
||||
fy = 0;
|
||||
sy0 = 0;
|
||||
}
|
||||
|
||||
if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) {
|
||||
fy = 1.f;
|
||||
sy0 = (std::max)(src_full_height - 2, 0);
|
||||
}
|
||||
|
||||
yofs[dy - dst_go_y] = sy0 - src_go_y;
|
||||
beta[dy - dst_go_y] = fy;
|
||||
}
|
||||
|
||||
auto full_pass = [&](int c, int y) {
|
||||
auto sptr_ = sptr + c * origSrcW * origSrcH;
|
||||
auto dptr_ = dptr + c * origDstW * origDstH;
|
||||
auto tptr_ = tptr;
|
||||
|
||||
for (int x = 0; x < swidth; x++) {
|
||||
bool use_constant0 = yofs[y] + 0 < 0 || yofs[y] + 0 >= src_full_height;
|
||||
bool use_constant1 = yofs[y] + 1 < 0 || yofs[y] + 1 >= src_full_height;
|
||||
float val0 = static_cast<float>(use_constant0 ? border.value : sptr_[(yofs[y] + 0) * sstep + x]);
|
||||
float val1 = static_cast<float>(use_constant1 ? border.value : sptr_[(yofs[y] + 1) * sstep + x]);
|
||||
|
||||
float res = val0 + beta[y] * (val1 - val0);
|
||||
tptr_[x] = res;
|
||||
}
|
||||
|
||||
for (int x = 0; x < dwidth; x++) {
|
||||
bool use_constant0 = xofs[x] + 0 < 0 || xofs[x] + 0 >= src_full_width;
|
||||
bool use_constant1 = xofs[x] + 1 < 0 || xofs[x] + 1 >= src_full_width;
|
||||
float val0 = use_constant0 ? border.value : tptr_[xofs[x] + 0];
|
||||
float val1 = use_constant1 ? border.value : tptr_[xofs[x] + 1];
|
||||
|
||||
float res = val0 + alpha[x] * (val1 - val0);
|
||||
dptr_[y * dstep + x] = saturate_cast<data_t>(res);
|
||||
}
|
||||
};
|
||||
|
||||
for (int c = 0; c < channels; c++) {
|
||||
for (int y = 0; y < dheight; y++) {
|
||||
full_pass(c, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
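// Returns the maximum number of source pixels that contribute to any single destination pixel for the
// given scale, i.e. the per-column width of the area-resize coefficient tables.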
int getResizeAreaTabSize(int dst_go, int ssize, int dsize, float scale) {
|
||||
static const float threshold = 1e-3f;
|
||||
int max_count = 0;
|
||||
|
||||
for (int col = dst_go; col < dst_go + dsize; col++) {
|
||||
int count = 0;
|
||||
|
||||
float fsx1 = col * scale;
|
||||
float fsx2 = fsx1 + scale;
|
||||
|
||||
int sx1 = static_cast<int>(ceil(fsx1));
|
||||
int sx2 = static_cast<int>(floor(fsx2));
|
||||
|
||||
sx2 = (std::min)(sx2, ssize - 1);
|
||||
sx1 = (std::min)(sx1, sx2);
|
||||
|
||||
if (sx1 - fsx1 > threshold) {
|
||||
count++;
|
||||
}
|
||||
|
||||
for (int sx = sx1; sx < sx2; sx++) {
|
||||
count++;
|
||||
}
|
||||
|
||||
if (fsx2 - sx2 > threshold) {
|
||||
count++;
|
||||
}
|
||||
max_count = (std::max)(max_count, count);
|
||||
}
|
||||
|
||||
return max_count;
|
||||
}
|
||||
|
||||
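// Fills the per-destination-column source index (si) and Q16 weight (alpha) tables for the U8 area
// downscale; columns with fewer than max_count contributions are padded with a zero weight.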
void computeResizeAreaTab(int src_go, int dst_go, int ssize, int dsize, float scale,
|
||||
uint16_t* si, uint16_t* alpha, int max_count) {
|
||||
static const float threshold = 1e-3f;
|
||||
int k = 0;
|
||||
|
||||
for (int col = dst_go; col < dst_go + dsize; col++) {
|
||||
int count = 0;
|
||||
|
||||
float fsx1 = col * scale;
|
||||
float fsx2 = fsx1 + scale;
|
||||
float cellWidth = (std::min)(scale, ssize - fsx1);
|
||||
|
||||
int sx1 = static_cast<int>(ceil(fsx1));
|
||||
int sx2 = static_cast<int>(floor(fsx2));
|
||||
|
||||
sx2 = (std::min)(sx2, ssize - 1);
|
||||
sx1 = (std::min)(sx1, sx2);
|
||||
|
||||
si[col - dst_go] = (uint16_t)(sx1 - src_go);
|
||||
|
||||
if (sx1 - fsx1 > threshold) {
|
||||
si[col - dst_go] = (uint16_t)(sx1 - src_go - 1);
|
||||
alpha[k++] = (uint16_t)((1 << 16) * ((sx1 - fsx1) / cellWidth));
|
||||
count++;
|
||||
}
|
||||
|
||||
for (int sx = sx1; sx < sx2; sx++) {
|
||||
alpha[k++] = (uint16_t)((1 << 16) * (1.0f / cellWidth));
|
||||
count++;
|
||||
}
|
||||
|
||||
if (fsx2 - sx2 > threshold) {
|
||||
alpha[k++] = (uint16_t)((1 << 16) * ((std::min)((std::min)(fsx2 - sx2, 1.f), cellWidth) / cellWidth));
|
||||
count++;
|
||||
}
|
||||
|
||||
if (count != max_count) {
|
||||
alpha[k++] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
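// Repacks the horizontal weights into per-tap arrays and builds the byte-shuffle index tables (sxid*)
// that let the SSE path gather vertical sums with _mm_shuffle_epi8; only used when x_max_count <= 4.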
void generate_alpha_and_id_arrays(int x_max_count, int dcols, const uint16_t* xalpha, uint16_t* xsi,
|
||||
uint16_t** alpha, uint16_t** sxid) {
|
||||
if (x_max_count <= 4) {
|
||||
for (int col = 0; col < dcols; col++) {
|
||||
for (int x = 0; x < x_max_count; x++) {
|
||||
alpha[x][col] = xalpha[col*x_max_count + x];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (x_max_count <= 4) {
|
||||
for (int col = 0; col <= dcols - 8; col += 8) {
|
||||
for (int chunk_num_h = 0; chunk_num_h < x_max_count; chunk_num_h++) {
|
||||
for (int i = 0; i < 128 / 16; i++) {
|
||||
int id_diff = xsi[col + i] - xsi[col];
|
||||
|
||||
for (int chunk_num_v = 0; chunk_num_v < x_max_count; chunk_num_v++) {
|
||||
uint16_t* sxidp = sxid[chunk_num_v] + col * x_max_count + chunk_num_h * 8;
|
||||
|
||||
int id0 = (id_diff + chunk_num_v) * 2 + 0;
|
||||
int id1 = (id_diff + chunk_num_v) * 2 + 1;
|
||||
|
||||
(reinterpret_cast<int8_t*>(sxidp + i))[0] = static_cast<int8_t>(id0 >= (chunk_num_h * 16) && id0 < (chunk_num_h + 1) * 16 ? id0 : -1);
|
||||
(reinterpret_cast<int8_t*>(sxidp + i))[1] = static_cast<int8_t>(id1 >= (chunk_num_h * 16) && id1 < (chunk_num_h + 1) * 16 ? id1 : -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
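// Builds the floating-point area-resize tables (source index, destination index, weight) and returns
// the number of entries written.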
int computeResizeAreaTabFP32(int src_go, int dst_go, int ssize, int dsize, float scale, uint16_t* si, uint16_t* di, float* alpha) {
|
||||
static const float threshold = 1e-3f;
|
||||
int k = 0;
|
||||
|
||||
for (int col = dst_go; col < dst_go + dsize; col++) {
|
||||
float fsx1 = col * scale;
|
||||
float fsx2 = fsx1 + scale;
|
||||
float cellWidth = (std::min)(scale, ssize - fsx1);
|
||||
|
||||
int sx1 = static_cast<int>(ceil(fsx1));
|
||||
int sx2 = static_cast<int>(floor(fsx2));
|
||||
|
||||
sx2 = (std::min)(sx2, ssize - 1);
|
||||
sx1 = (std::min)(sx1, sx2);
|
||||
|
||||
if (sx1 - fsx1 > threshold) {
|
||||
di[k] = (uint16_t)(col - dst_go);
|
||||
si[k] = (uint16_t)(sx1 - src_go - 1);
|
||||
alpha[k++] = (sx1 - fsx1) / cellWidth;
|
||||
}
|
||||
|
||||
for (int sx = sx1; sx < sx2; sx++) {
|
||||
di[k] = (uint16_t)(col - dst_go);
|
||||
si[k] = (uint16_t)(sx - src_go);
|
||||
alpha[k++] = 1.0f / cellWidth;
|
||||
}
|
||||
|
||||
if (fsx2 - sx2 > threshold) {
|
||||
di[k] = (uint16_t)(col - dst_go);
|
||||
si[k] = (uint16_t)(sx2 - src_go);
|
||||
alpha[k++] = (std::min)((std::min)(fsx2 - sx2, 1.f), cellWidth) / cellWidth;
|
||||
}
|
||||
}
|
||||
return k;
|
||||
}
|
||||
|
||||
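// Scalar area downscale (FP32 blobs and the non-SSE U8 fallback): accumulates the weighted source rows
// of each destination row into vert_sum, then applies the horizontal weights per destination column.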
template<typename data_t = float>
|
||||
void resize_area_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto sheight = static_cast<const int>(srcDims[2]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
const int src_go_x = 0;
|
||||
const int src_go_y = 0;
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
auto* sptr = static_cast<const data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto* dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
|
||||
auto sstep = static_cast<const int>(src_strides[2]);
|
||||
auto dstep = static_cast<const int>(dst_strides[2]);
|
||||
|
||||
float scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
float scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
|
||||
int vert_sum_size = swidth;
|
||||
int tabofs_size = (std::max)(2*swidth, 2*dwidth);
|
||||
int xsi_size = (std::max)(2*swidth, 2*dwidth);
|
||||
int xdi_size = (std::max)(2*swidth, 2*dwidth);
|
||||
int ysi_size = (std::max)(2*sheight, 2*dheight);
|
||||
int ydi_size = (std::max)(2*sheight, 2*dheight);
|
||||
int xalpha_size = (std::max)(2*swidth, 2*dwidth);
|
||||
|
||||
auto vert_sum = reinterpret_cast<float*>(buffer);
|
||||
auto tabofs = reinterpret_cast<int*>(vert_sum + vert_sum_size);
|
||||
auto xsi = reinterpret_cast<uint16_t*>(tabofs + tabofs_size + 1);
|
||||
auto xdi = xsi + xsi_size;
|
||||
auto ysi = xdi + xdi_size;
|
||||
auto ydi = ysi + ysi_size;
|
||||
auto xalpha = reinterpret_cast<float*>(ydi + ydi_size);
|
||||
auto yalpha = xalpha + xalpha_size;
|
||||
|
||||
int ytab_size = computeResizeAreaTabFP32(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, ydi, yalpha);
|
||||
int xtab_size = computeResizeAreaTabFP32(src_go_x, dst_go_x, src_full_width, dwidth, scale_x, xsi, xdi, xalpha);
|
||||
|
||||
int dy_ = 0;
|
||||
for (int i = 0; i < ytab_size && dy_ < dwidth*2; i++) {
|
||||
if (i == 0 || ydi[i] != ydi[i-1]) {
|
||||
tabofs[dy_++] = i;
|
||||
}
|
||||
}
|
||||
tabofs[dy_] = ytab_size;
|
||||
|
||||
auto full_pass = [&](const data_t* sptr_, data_t* dptr_, int y) {
|
||||
auto vert_sum_ = vert_sum;
|
||||
|
||||
memset(vert_sum_, 0, swidth * sizeof(float));
|
||||
|
||||
data_t *pdst = dptr_ + y * dstep;
|
||||
|
||||
for (int dy = tabofs[y]; dy < tabofs[y + 1] && dy < ytab_size; dy++) {
|
||||
float beta = yalpha[dy];
|
||||
int sy = ysi[dy];
|
||||
|
||||
const data_t *psrc = sptr_ + sy * sstep;
|
||||
for (int x = 0; x < swidth; x++) {
|
||||
vert_sum_[x] += beta * psrc[x];
|
||||
}
|
||||
}
|
||||
|
||||
int xtab_ind = 0;
|
||||
for (int x = 0; x < dwidth; x++) {
|
||||
float res = 0.f;
|
||||
int dx = 0;
|
||||
for (; x == xdi[xtab_ind + dx] && xtab_ind + dx < xtab_size; dx++) {
|
||||
float alpha = xalpha[xtab_ind + dx];
|
||||
int sx = xsi[xtab_ind + dx];
|
||||
|
||||
res += alpha * vert_sum_[sx];
|
||||
}
|
||||
|
||||
pdst[x] = saturate_cast<data_t>(res);
|
||||
xtab_ind += dx;
|
||||
}
|
||||
};
|
||||
|
||||
for (int ch = 0; ch < channels; ch++) {
|
||||
for (int y = 0; y < dheight; y++) {
|
||||
auto sptr_ = sptr + ch * origSrcH * origSrcW;
|
||||
auto dptr_ = dptr + ch * origDstH * origDstW;
|
||||
|
||||
full_pass(sptr_, dptr_, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline int clip(int x, int a, int b) {
|
||||
return x >= a ? (x < b ? x : b-1) : a;
|
||||
}
|
||||
|
||||
const int MAX_ESIZE = 16;
|
||||
|
||||
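// Horizontal pass of the separable linear filter used by resize_area_upscale: interpolates the selected
// source rows into float intermediate rows, handling rows in pairs where possible.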
template<typename data_t>
|
||||
void HResizeLinear(const data_t** src, float** dst, int count, const int* xofs, const float* alpha,
|
||||
int swidth, int dwidth, int cn, int xmin, int xmax ) {
|
||||
int dx, k;
|
||||
int dx0 = 0;
|
||||
|
||||
for (k = 0; k <= count - 2; k++) {
|
||||
const data_t *S0 = src[k], *S1 = src[k+1];
|
||||
float *D0 = dst[k], *D1 = dst[k+1];
|
||||
for (dx = dx0; dx < xmax; dx++) {
|
||||
int sx = xofs[dx];
|
||||
float a0 = alpha[dx*2], a1 = alpha[dx*2+1];
|
||||
float t0 = static_cast<float>(S0[sx])*a0 + static_cast<float>(S0[sx + cn])*a1;
|
||||
float t1 = static_cast<float>(S1[sx])*a0 + static_cast<float>(S1[sx + cn])*a1;
|
||||
D0[dx] = t0; D1[dx] = t1;
|
||||
}
|
||||
|
||||
for (; dx < dwidth; dx++) {
|
||||
int sx = xofs[dx];
|
||||
D0[dx] = static_cast<float>(S0[sx]); D1[dx] = static_cast<float>(S1[sx]);
|
||||
}
|
||||
}
|
||||
|
||||
for (; k < count; k++) {
|
||||
const data_t *S = src[k];
|
||||
float *D = dst[k];
|
||||
for (dx = 0; dx < xmax; dx++) {
|
||||
int sx = xofs[dx];
|
||||
D[dx] = static_cast<float>(S[sx])*alpha[dx*2] + static_cast<float>(S[sx+cn])*alpha[dx*2+1];
|
||||
}
|
||||
|
||||
for (; dx < dwidth; dx++)
|
||||
D[dx] = static_cast<float>(S[xofs[dx]]);
|
||||
}
|
||||
}
|
||||
|
||||
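// Vertical pass: blends two intermediate float rows into one destination row, saturating to U8 when
// data_t is an 8-bit type.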
template<typename data_t>
|
||||
void VResizeLinear(float** src, data_t* dst, const float* beta, int width) {
|
||||
float b0 = beta[0], b1 = beta[1];
|
||||
const float *S0 = src[0], *S1 = src[1];
|
||||
|
||||
if (sizeof(data_t) == 4) {
|
||||
for (int x = 0; x < width; x++)
|
||||
dst[x] = static_cast<data_t>(S0[x] * b0 + S1[x] * b1);
|
||||
} else {
|
||||
for (int x = 0; x < width; x++)
|
||||
dst[x] = saturateU32toU8(static_cast<uint32_t>(S0[x] * b0 + S1[x] * b1));
|
||||
}
|
||||
}
|
||||
|
||||
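// Area resize for upscale factors, implemented as a separable 2-tap (ksize == 2) linear filter over
// precomputed per-axis offset/weight tables.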
template<typename data_t>
|
||||
static void resize_area_upscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
auto origSrcW = src_strides[2];
|
||||
auto origSrcH = src_strides[1] / src_strides[2];
|
||||
auto origDstW = dst_strides[2];
|
||||
auto origDstH = dst_strides[1] / dst_strides[2];
|
||||
|
||||
auto dwidth = static_cast<const int>(dstDims[3]);
|
||||
auto dheight = static_cast<const int>(dstDims[2]);
|
||||
auto swidth = static_cast<const int>(srcDims[3]);
|
||||
auto sheight = static_cast<const int>(srcDims[2]);
|
||||
auto channels = static_cast<const int>(srcDims[1]);
|
||||
|
||||
auto src_full_width = static_cast<const int>(srcDims[3]);
|
||||
auto src_full_height = static_cast<const int>(srcDims[2]);
|
||||
auto dst_full_width = static_cast<const int>(dstDims[3]);
|
||||
auto dst_full_height = static_cast<const int>(dstDims[2]);
|
||||
|
||||
auto sptr = static_cast<const data_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
auto dptr = static_cast<data_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
||||
|
||||
auto sstep = static_cast<const int>(src_strides[2]);
|
||||
auto dstep = static_cast<const int>(dst_strides[2]);
|
||||
|
||||
float scale_x = static_cast<float>(src_full_width) / dst_full_width;
|
||||
float scale_y = static_cast<float>(src_full_height) / dst_full_height;
|
||||
float inv_scale_x = static_cast<float>(dst_full_width) / src_full_width;
|
||||
float inv_scale_y = static_cast<float>(dst_full_height) / src_full_height;
|
||||
|
||||
int xmin = 0, xmax = dwidth, width = dwidth;
|
||||
int ksize = 2;
|
||||
int ksize2 = ksize/2;
|
||||
|
||||
auto xofs = reinterpret_cast<int*>(buffer);
|
||||
auto yofs = xofs + width;
|
||||
auto alpha = reinterpret_cast<float*>(yofs + dheight);
|
||||
auto beta = alpha + width*ksize;
|
||||
float cbuf[2] = {0};
|
||||
|
||||
for (int dx = 0; dx < dwidth; dx++) {
|
||||
int sx = static_cast<int>(floor(dx*scale_x));
|
||||
float fx = (dx+1) - (sx+1)*inv_scale_x;
|
||||
fx = fx <= 0 ? 0.f : fx - floor(fx);
|
||||
|
||||
if (sx < ksize2-1) {
|
||||
xmin = dx+1;
|
||||
if (sx < 0)
|
||||
fx = 0, sx = 0;
|
||||
}
|
||||
|
||||
if (sx + ksize2 >= swidth) {
|
||||
xmax = (std::min)(xmax, dx);
|
||||
if (sx >= swidth-1)
|
||||
fx = 0, sx = swidth-1;
|
||||
}
|
||||
|
||||
xofs[dx] = sx;
|
||||
|
||||
cbuf[0] = 1.f - fx;
|
||||
cbuf[1] = fx;
|
||||
|
||||
for (int k = 0; k < ksize; k++)
|
||||
alpha[dx*ksize + k] = cbuf[k];
|
||||
}
|
||||
|
||||
for (int dy = 0; dy < dheight; dy++) {
|
||||
int sy = static_cast<int>(floor(dy*scale_y));
|
||||
float fy = (dy+1) - (sy+1)*inv_scale_y;
|
||||
fy = fy <= 0 ? 0.f : fy - floor(fy);
|
||||
|
||||
yofs[dy] = sy;
|
||||
cbuf[0] = 1.f - fy;
|
||||
cbuf[1] = fy;
|
||||
|
||||
for (int k = 0; k < ksize; k++)
|
||||
beta[dy*ksize + k] = cbuf[k];
|
||||
}
|
||||
|
||||
auto full_pass = [&](const data_t* sptr_, data_t* dptr_, int dy) {
|
||||
int bufstep = dwidth;
|
||||
const data_t* srows[MAX_ESIZE]={0};
|
||||
float* rows[MAX_ESIZE]={0};
|
||||
int prev_sy[MAX_ESIZE];
|
||||
|
||||
for (int k = 0; k < ksize; k++) {
|
||||
prev_sy[k] = -1;
|
||||
rows[k] = reinterpret_cast<float*>(buffer + (width + dheight)*(sizeof(int) + sizeof(float)*ksize))
|
||||
+ k*bufstep;
|
||||
}
|
||||
|
||||
int sy0 = yofs[dy], k0 = ksize, k1 = 0;
|
||||
|
||||
for (int k = 0; k < ksize; k++) {
|
||||
int sy = clip(sy0 - ksize2 + 1 + k, 0, sheight);
|
||||
for (k1 = (std::max)(k1, k); k1 < ksize; k1++) {
|
||||
if (k1 < MAX_ESIZE && sy == prev_sy[k1]) {
|
||||
if (k1 > k)
|
||||
memcpy(rows[k], rows[k1], bufstep*sizeof(rows[0][0]));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (k1 == ksize)
|
||||
k0 = (std::min)(k0, k);
|
||||
srows[k] = sptr_ + sy * sstep;
|
||||
prev_sy[k] = sy;
|
||||
}
|
||||
|
||||
if (k0 < ksize)
|
||||
HResizeLinear<data_t>(srows + k0, reinterpret_cast<float**>(rows + k0), ksize - k0, xofs,
|
||||
reinterpret_cast<const float*>(alpha), swidth, dwidth, 1, xmin, xmax);
|
||||
|
||||
VResizeLinear<data_t>(reinterpret_cast<float**>(rows), dptr_ + dstep*dy, beta + dy*ksize, dwidth);
|
||||
};
|
||||
|
||||
for (int ch = 0; ch < channels; ch++) {
|
||||
for (int dy = 0; dy < dheight; dy++) {
|
||||
auto sptr_ = sptr + ch * origSrcH * origSrcW;
|
||||
auto dptr_ = dptr + ch * origDstH * origDstW;
|
||||
|
||||
full_pass(sptr_, dptr_, dy);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
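// Computes the scratch-buffer size required by the resize path that will actually be executed
// (offset/weight tables plus intermediate rows).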
size_t resize_get_buffer_size(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorithm) {
|
||||
auto dstDims = outBlob->getTensorDesc().getDims();
|
||||
auto srcDims = inBlob->getTensorDesc().getDims();
|
||||
|
||||
SizeVector strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
|
||||
size_t origW = strides[2];
|
||||
size_t origH = strides[1] / strides[2];
|
||||
|
||||
const int src_full_width = static_cast<int>(origW);
|
||||
const int src_full_height = static_cast<int>(origH);
|
||||
const int dst_full_width = static_cast<int>(dstDims[3]);
|
||||
const int dst_full_height = static_cast<int>(dstDims[2]);
|
||||
|
||||
float scale_x = static_cast<float>(dstDims[3]) / srcDims[3];
|
||||
float scale_y = static_cast<float>(dstDims[2]) / srcDims[2];
|
||||
|
||||
auto resize_bilinear_u8_buffer_size = [&]() {
|
||||
size_t buffer_size = (sizeof(int16_t) * 4 + sizeof(uint8_t *)) * dstDims[3] +
|
||||
(sizeof(int32_t) + sizeof(int16_t)) * dstDims[2] +
|
||||
sizeof(uint32_t) * dstDims[3] +
|
||||
(((srcDims[3] + 7) / 8) * 8 * 8) +
|
||||
sizeof(uint8_t) * 12;
|
||||
|
||||
return buffer_size;
|
||||
};
|
||||
|
||||
auto resize_bilinear_fp32_buffer_size = [&]() {
|
||||
size_t buffer_size = (sizeof(float) + sizeof(float *)) * dstDims[3] +
|
||||
(sizeof(int32_t) + sizeof(float)) * dstDims[2] +
|
||||
(((srcDims[3] + 1) / 2) * 2 * 2) * sizeof(float);
|
||||
|
||||
return buffer_size;
|
||||
};
|
||||
|
||||
auto resize_area_u8_downscale_sse_buffer_size = [&]() {
|
||||
const int dwidth = static_cast<int>(dstDims[3]);
|
||||
const int dheight = static_cast<int>(dstDims[2]);
|
||||
const int swidth = static_cast<int>(srcDims[3]);
|
||||
|
||||
const int dst_go_x = 0;
|
||||
const int dst_go_y = 0;
|
||||
|
||||
int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, static_cast<float>(src_full_width) / dst_full_width) + 1;
|
||||
int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, static_cast<float>(src_full_height) / dst_full_height) + 1;
|
||||
|
||||
size_t si_buf_size = sizeof(uint16_t) * dwidth + sizeof(uint16_t) * dheight;
|
||||
size_t alpha_buf_size =
|
||||
sizeof(uint16_t) * (dwidth * x_max_count + 8 * 16) + sizeof(uint16_t) * dheight * y_max_count;
|
||||
size_t vert_sum_buf_size = sizeof(uint16_t) * (swidth * 2);
|
||||
size_t alpha_array_buf_size = sizeof(uint16_t) * 4 * dwidth;
|
||||
size_t sxid_array_buf_size = sizeof(uint16_t) * 4 * 4 * dwidth;
|
||||
|
||||
size_t buffer_size = si_buf_size +
|
||||
alpha_buf_size +
|
||||
vert_sum_buf_size +
|
||||
alpha_array_buf_size +
|
||||
sxid_array_buf_size;
|
||||
|
||||
return buffer_size;
|
||||
};

    auto resize_area_downscale_buffer_size = [&]() {
        size_t buffer_size = sizeof(float) * (srcDims[3]) +
                             sizeof(uint32_t) * (dstDims[3] * 2 + 1) +
                             sizeof(float) * ((srcDims[3] + srcDims[2]) * 4) +
                             sizeof(float) * ((srcDims[3] + srcDims[2]) * 2);

        return buffer_size;
    };

    auto resize_area_upscale_buffer_size = [&]() {
        size_t buffer_size = (dstDims[3] + dstDims[2])*(sizeof(int) + sizeof(float)*2) + 2*dstDims[3] * sizeof(float);

        return buffer_size;
    };

    if (algorithm == RESIZE_BILINEAR) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            return resize_bilinear_u8_buffer_size();
        } else {
            return resize_bilinear_fp32_buffer_size();
        }
    } else if (algorithm == RESIZE_AREA) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            if (scale_x <= 1 && scale_y <= 1) {
#ifdef HAVE_SSE
                if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
                    return resize_area_u8_downscale_sse_buffer_size();
                else
#endif
                    return resize_area_downscale_buffer_size();
            } else {
                return resize_area_upscale_buffer_size();
            }
        } else {
            if (scale_x <= 1 && scale_y <= 1)
                return resize_area_downscale_buffer_size();
            else
                return resize_area_upscale_buffer_size();
        }
    }

    return 0;
}

void resize(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorithm) {
    if (inBlob->getTensorDesc().getLayout() != NCHW || outBlob->getTensorDesc().getLayout() != NCHW)
        THROW_IE_EXCEPTION << "Resize supports only NCHW layout";

    if (!((inBlob->getTensorDesc().getPrecision() == Precision::U8 && outBlob->getTensorDesc().getPrecision() == Precision::U8) ||
          (inBlob->getTensorDesc().getPrecision() == Precision::FP32 && outBlob->getTensorDesc().getPrecision() == Precision::FP32)))
        THROW_IE_EXCEPTION << "Resize supports only U8 and FP32 precisions";

    if (algorithm != RESIZE_BILINEAR && algorithm != RESIZE_AREA)
        THROW_IE_EXCEPTION << "Unsupported resize algorithm type";

    size_t buffer_size = resize_get_buffer_size(inBlob, outBlob, algorithm);
    auto* buffer = static_cast<uint8_t *>(malloc(buffer_size));
    if (buffer == nullptr) {
        THROW_IE_EXCEPTION << "Could not allocate memory for blob";
    }
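    // The scratch buffer is plain malloc'd memory sized by resize_get_buffer_size();
    // it is released by the free() call at the end of this function.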

    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();
    float scale_x = static_cast<float>(dstDims[3]) / srcDims[3];
    float scale_y = static_cast<float>(dstDims[2]) / srcDims[2];

    if (algorithm == RESIZE_BILINEAR) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
#ifdef HAVE_SSE
            if (with_cpu_x86_sse42())
                Resize::resize_bilinear_u8(inBlob, outBlob, buffer);
            else
#endif
                resize_bilinear<uint8_t>(inBlob, outBlob, buffer);
        } else {
            resize_bilinear<float>(inBlob, outBlob, buffer);
        }
    } else if (algorithm == RESIZE_AREA) {
        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
            if (scale_x <= 1 && scale_y <= 1) {
#ifdef HAVE_SSE
                if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
                    Resize::resize_area_u8_downscale(inBlob, outBlob, buffer);
                else
#endif
                    resize_area_downscale<uint8_t>(inBlob, outBlob, buffer);
            } else {
                resize_area_upscale<uint8_t>(inBlob, outBlob, buffer);
            }
        } else {
            if (scale_x <= 1 && scale_y <= 1)
                resize_area_downscale<float>(inBlob, outBlob, buffer);
            else
                resize_area_upscale<float>(inBlob, outBlob, buffer);
        }
    }

    free(buffer);
}

} // namespace Resize

//----------------------------------------------------------------------

using namespace Resize;

/**
 * @brief This class stores pre-processing information for a particular input
 */
@ -759,8 +22,6 @@ class PreProcessData : public IPreProcessData {
     * @brief ROI blob.
     */
    Blob::Ptr _userBlob = nullptr;
    Blob::Ptr _tmp1 = nullptr;
    Blob::Ptr _tmp2 = nullptr;

    /**
     * @brief Pointer-to-implementation (PIMPL) hiding preprocessing implementation details.
@ -814,97 +75,12 @@ void PreProcessData::execute(Blob::Ptr &preprocessedBlob, const PreProcessInfo &
        _preproc.reset(new PreprocEngine);
    }

    if (_preproc->preprocessWithGAPI(_userBlob, preprocessedBlob, algorithm, fmt, serial, batchSize)) {
        return;
    }
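    // If the G-API engine handled the request we are done; the code below is the
    // legacy fallback path.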

    if (algorithm == NO_RESIZE) {
        THROW_IE_EXCEPTION << "Input pre-processing is called without the pre-processing info set: "
                              "there's nothing to be done";
    }

    if (batchSize > 1) {
        THROW_IE_EXCEPTION << "Batch pre-processing is unsupported in this mode. "
                              "Use default pre-processing instead to process batches.";
    }

    if (fmt != ColorFormat::RAW) {
        THROW_IE_EXCEPTION << "Non-default (not ColorFormat::RAW) color formats are unsupported "
                              "in this mode. Use default pre-processing instead to process color "
                              "formats.";
    }

    Blob::Ptr res_in, res_out;
    if (_userBlob->getTensorDesc().getLayout() == NHWC) {
        if (!_tmp1 || _tmp1->size() != _userBlob->size()) {
            if (_userBlob->getTensorDesc().getPrecision() == Precision::FP32) {
                _tmp1 = make_shared_blob<float>({Precision::FP32, _userBlob->getTensorDesc().getDims(), Layout::NCHW});
            } else {
                _tmp1 = make_shared_blob<uint8_t>({Precision::U8, _userBlob->getTensorDesc().getDims(), Layout::NCHW});
            }
            _tmp1->allocate();
        }

        {
            OV_ITT_SCOPED_TASK(itt::domains::IEPreproc, "Reorder before");
            blob_copy(_userBlob, _tmp1);
        }
        res_in = _tmp1;
    } else {
        res_in = _userBlob;
    }

    if (preprocessedBlob->getTensorDesc().getLayout() == NHWC) {
        if (!_tmp2 || _tmp2->size() != preprocessedBlob->size()) {
            if (preprocessedBlob->getTensorDesc().getPrecision() == Precision::FP32) {
                _tmp2 = make_shared_blob<float>({Precision::FP32, preprocessedBlob->getTensorDesc().getDims(), Layout::NCHW});
            } else {
                _tmp2 = make_shared_blob<uint8_t>({Precision::U8, preprocessedBlob->getTensorDesc().getDims(), Layout::NCHW});
            }
            _tmp2->allocate();
        }
        res_out = _tmp2;
    } else {
        res_out = preprocessedBlob;
    }
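    // res_in/res_out are NCHW views of the user and network blobs: NHWC data is staged
    // through the _tmp1/_tmp2 temporaries because resize() accepts NCHW only, and the
    // result is copied back afterwards (see "Reorder after" below).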

    {
        OV_ITT_SCOPED_TASK(itt::domains::IEPreproc, "Resize");
        resize(res_in, res_out, algorithm);
    }

    if (res_out == _tmp2) {
        OV_ITT_SCOPED_TASK(itt::domains::IEPreproc, "Reorder after");
        blob_copy(_tmp2, preprocessedBlob);
    }
    _preproc->preprocessWithGAPI(_userBlob, preprocessedBlob, algorithm, fmt, serial, batchSize);
}

void PreProcessData::isApplicable(const Blob::Ptr &src, const Blob::Ptr &dst) {
    // if G-API pre-processing is used, let it check that pre-processing is applicable
    if (PreprocEngine::useGAPI()) {
        PreprocEngine::checkApplicabilityGAPI(src, dst);
        return;
    }

    if (!src->is<MemoryBlob>() || !dst->is<MemoryBlob>()) {
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Source and destination blobs must "
                              "be memory blobs";
    }

    auto &src_dims = src->getTensorDesc().getDims();
    auto &dst_dims = dst->getTensorDesc().getDims();

    if (src_dims.size() != dst_dims.size())
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Source and destination blobs have different "
                              "number of dimensions";

    if (src_dims.size() != 4)
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Only 4D tensors are supported.";

    if (src_dims[0] != dst_dims[0] || src_dims[1] != dst_dims[1])
        THROW_IE_EXCEPTION << "Preprocessing is not applicable. Wrong shape. Network expected 4D input tensor with "
                              "shape [" << dst_dims[0] << "," << dst_dims[1] << ",H,W] but provided tensor has "
                              "shape " << details::dumpVec(src_dims) << ".";
    PreprocEngine::checkApplicabilityGAPI(src, dst);
}

} // namespace InferenceEngine

@ -97,42 +97,4 @@ inline PreProcessDataPtr CreatePreprocDataHelper() {
    return PreProcessDataPtr(preprocLibraryPath);
}

//----------------------------------------------------------------------
//
// Implementation-internal types and functions and macros
//
//----------------------------------------------------------------------

namespace Resize {

static inline uint8_t saturateU32toU8(uint32_t v) {
    return static_cast<uint8_t>(v > UINT8_MAX ? UINT8_MAX : v);
}

void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer);

void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer);

int getResizeAreaTabSize(int dst_go, int ssize, int dsize, float scale);

void computeResizeAreaTab(int src_go, int dst_go, int ssize, int dsize, float scale,
                          uint16_t* si, uint16_t* alpha, int max_count);

void generate_alpha_and_id_arrays(int x_max_count, int dcols, const uint16_t* xalpha, uint16_t* xsi,
                                  uint16_t** alpha, uint16_t** sxid);

enum BorderType {
    BORDER_CONSTANT = 0,
    BORDER_REPLICATE = 1,
};

struct Border {
    BorderType type;
    int32_t value;
};
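// Presumably the usual border semantics apply: BORDER_CONSTANT pads out-of-range
// reads with 'value', while BORDER_REPLICATE repeats the nearest edge pixel
// (as in OpenCV's border modes).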

} // namespace Resize

//----------------------------------------------------------------------

} // namespace InferenceEngine

@ -757,15 +757,6 @@ PreprocEngine::Update PreprocEngine::needUpdate(const CallDesc &newCallOrig) con
    return Update::NOTHING;
}

bool PreprocEngine::useGAPI() {
    static const bool NO_GAPI = [](const char *str) -> bool {
        std::string var(str ? str : "");
        return var == "N" || var == "NO" || var == "OFF" || var == "0";
    } (getenv("USE_GAPI"));

    return !NO_GAPI;
}
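// Usage sketch (assumed): the G-API path is on by default and is disabled only via
// the USE_GAPI environment variable, e.g.
//   USE_GAPI=NO ./your_app      -> useGAPI() returns false (legacy code path)
//   variable unset or "YES"     -> useGAPI() returns true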

void PreprocEngine::checkApplicabilityGAPI(const Blob::Ptr &src, const Blob::Ptr &dst) {
    // Note: src blob is the ROI blob, dst blob is the network's input blob

@ -904,7 +895,7 @@ void PreprocEngine::executeGraph(Opt<cv::GComputation>& lastComputation,
}

template<typename BlobTypePtr>
bool PreprocEngine::preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &outBlob,
void PreprocEngine::preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &outBlob,
    ResizeAlgorithm algorithm, ColorFormat in_fmt, ColorFormat out_fmt, bool omp_serial,
    int batch_size) {

@ -953,7 +944,6 @@ bool PreprocEngine::preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &o
    if (algorithm == NO_RESIZE && std::get<0>(thisCall) == std::get<1>(thisCall)) {
        // If the requested output parameters match the input blob, there is nothing to do
        THROW_IE_EXCEPTION << "No job to do in the PreProcessing ?";
        return true;
    }

    const Update update = needUpdate(thisCall);
@ -983,15 +973,10 @@ bool PreprocEngine::preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &o

    executeGraph(_lastComputation, batched_input_plane_mats, batched_output_plane_mats, batch_size,
                 omp_serial, update);

    return true;
}

bool PreprocEngine::preprocessWithGAPI(const Blob::Ptr &inBlob, Blob::Ptr &outBlob,
void PreprocEngine::preprocessWithGAPI(const Blob::Ptr &inBlob, Blob::Ptr &outBlob,
    const ResizeAlgorithm& algorithm, ColorFormat in_fmt, bool omp_serial, int batch_size) {
    if (!useGAPI()) {
        return false;
    }

    const auto out_fmt = (in_fmt == ColorFormat::RAW) ? ColorFormat::RAW : ColorFormat::BGR; // FIXME: get expected color format from network

@ -45,16 +45,15 @@ class PreprocEngine {
        Update update);

    template<typename BlobTypePtr>
    bool preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &outBlob,
    void preprocessBlob(const BlobTypePtr &inBlob, MemoryBlob::Ptr &outBlob,
        ResizeAlgorithm algorithm, ColorFormat in_fmt, ColorFormat out_fmt, bool omp_serial,
        int batch_size);

 public:
    PreprocEngine();
    static bool useGAPI();
    static void checkApplicabilityGAPI(const Blob::Ptr &src, const Blob::Ptr &dst);
    static int getCorrectBatchSize(int batch_size, const Blob::Ptr& roiBlob);
    bool preprocessWithGAPI(const Blob::Ptr &inBlob, Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm,
    void preprocessWithGAPI(const Blob::Ptr &inBlob, Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm,
        ColorFormat in_fmt, bool omp_serial, int batch_size = -1);
};