[CPU]disable and cleanup interp and resample that are covered by interpolate (#3164)
* [BF16] Interpolate layer and test were updated for support BF16 Co-authored-by: alexey-varyzgin <alexey.varyzgin@intel.com>
This commit is contained in:
parent
a7ede592c3
commit
d35e3e806b
@ -36,7 +36,6 @@ set(LAYERS
|
|||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_tensoriterator_node.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_tensoriterator_node.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_tile_node.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_tile_node.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_mvn_node.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_mvn_node.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_resample_node.cpp
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_normalize_node.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_normalize_node.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_scatter_update_node.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_scatter_update_node.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_interpolate_node.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_interpolate_node.cpp
|
||||||
@ -93,7 +92,6 @@ set(LAYERS
|
|||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/unsqueeze.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/unsqueeze.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/softmax.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/softmax.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/emitter.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/common/emitter.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/interp.cpp
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_eltwise_emitters.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_eltwise_emitters.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_mkldnn_emitters.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/nodes/jit_mkldnn_emitters.cpp
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@ namespace MKLDNNPlugin {
|
|||||||
|
|
||||||
class BF16Transformer {
|
class BF16Transformer {
|
||||||
const InferenceEngine::details::caseless_set<std::string> _initbf16 =
|
const InferenceEngine::details::caseless_set<std::string> _initbf16 =
|
||||||
{ "convolution", "fullyconnected", "innerproduct", "gemm", "RegionYolo" };
|
{ "convolution", "fullyconnected", "innerproduct", "gemm", "RegionYolo", "Interpolate" };
|
||||||
const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
|
const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
|
||||||
{ "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "normalize",
|
{ "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "normalize",
|
||||||
"sigmoid", "ReLU6", "not", "activation", "HSwish", "mish", "logistic", "mod", "resample",
|
"sigmoid", "ReLU6", "not", "activation", "HSwish", "mish", "logistic", "mod", "resample",
|
||||||
|
@ -15,7 +15,6 @@
|
|||||||
#include "nodes/mkldnn_quantize_node.h"
|
#include "nodes/mkldnn_quantize_node.h"
|
||||||
#include "nodes/mkldnn_mvn_node.h"
|
#include "nodes/mkldnn_mvn_node.h"
|
||||||
#include <nodes/mkldnn_permute_node.h>
|
#include <nodes/mkldnn_permute_node.h>
|
||||||
#include "nodes/mkldnn_resample_node.h"
|
|
||||||
#include "nodes/mkldnn_interpolate_node.h"
|
#include "nodes/mkldnn_interpolate_node.h"
|
||||||
#include "nodes/mkldnn_input_node.h"
|
#include "nodes/mkldnn_input_node.h"
|
||||||
|
|
||||||
@ -123,9 +122,6 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
|
|||||||
FuseMVNAndSimpleOperation(graph);
|
FuseMVNAndSimpleOperation(graph);
|
||||||
graph.RemoveDroppedNodes();
|
graph.RemoveDroppedNodes();
|
||||||
|
|
||||||
FuseResampleAndSimpleOperation(graph);
|
|
||||||
graph.RemoveDroppedNodes();
|
|
||||||
|
|
||||||
FuseInterpolateAndSimpleOperation(graph);
|
FuseInterpolateAndSimpleOperation(graph);
|
||||||
graph.RemoveDroppedNodes();
|
graph.RemoveDroppedNodes();
|
||||||
|
|
||||||
@ -1491,74 +1487,6 @@ void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void MKLDNNGraphOptimizer::FuseResampleAndSimpleOperation(MKLDNNGraph &graph) {
|
|
||||||
auto& graphNodes = graph.GetNodes();
|
|
||||||
|
|
||||||
auto isSutableParentNode = [](MKLDNNNodePtr node) {
|
|
||||||
bool isSutableResample = (node->getType() == Resample) && (node->inDims[0].ndims() == 4 || node->inDims[0].ndims() == 5);
|
|
||||||
|
|
||||||
if (isSutableResample) {
|
|
||||||
auto *resampleLayer = node->getCnnLayer().get();
|
|
||||||
if (resampleLayer == nullptr)
|
|
||||||
THROW_IE_EXCEPTION << "Cannot get Resample layer " << node->getName();
|
|
||||||
|
|
||||||
return node->getChildEdges().size() == 1 && resampleLayer->GetParamAsString("type") == "caffe.ResampleParameter.NEAREST";
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
auto isSutableChildNode = [](MKLDNNNodePtr node) {
|
|
||||||
if (!node->getCnnLayer())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (node->getType() == Quantize) {
|
|
||||||
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
|
|
||||||
if (quantizeNode == nullptr)
|
|
||||||
THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
|
|
||||||
return !quantizeNode->isBinarization();
|
|
||||||
} else if (node->getType() == Eltwise) {
|
|
||||||
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
|
|
||||||
if (eltwiseNode == nullptr)
|
|
||||||
THROW_IE_EXCEPTION << "Cannot get Eltwise node " << node->getName();
|
|
||||||
return eltwiseNode->getOpType() == Relu ||
|
|
||||||
eltwiseNode->getOpType() == MulAdd;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
};
|
|
||||||
|
|
||||||
auto parent = graphNodes.begin();
|
|
||||||
while (parent != graphNodes.end()) {
|
|
||||||
auto parentNode = *parent;
|
|
||||||
if (!isSutableParentNode(parentNode)) {
|
|
||||||
parent++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
|
|
||||||
if (!isSutableChildNode(childNode)) {
|
|
||||||
parent++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
parentNode->fuseWith(childNode);
|
|
||||||
|
|
||||||
if (childNode->getType() == Quantize || childNode->getType() == Eltwise) {
|
|
||||||
auto parentEdges = childNode->parentEdges;
|
|
||||||
for (auto &parentEdge : parentEdges) {
|
|
||||||
auto p_edge = parentEdge.lock();
|
|
||||||
if (p_edge->getParent()->getType() == Resample)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
removeEdge(graph, p_edge);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
graph.DropNode(childNode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph) {
|
void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph) {
|
||||||
auto& graphNodes = graph.GetNodes();
|
auto& graphNodes = graph.GetNodes();
|
||||||
|
|
||||||
|
@ -37,7 +37,6 @@ private:
|
|||||||
void FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph);
|
void FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph);
|
||||||
#endif
|
#endif
|
||||||
void FuseMVNAndSimpleOperation(MKLDNNGraph &graph);
|
void FuseMVNAndSimpleOperation(MKLDNNGraph &graph);
|
||||||
void FuseResampleAndSimpleOperation(MKLDNNGraph &graph);
|
|
||||||
void FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph);
|
void FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph);
|
||||||
void FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph);
|
void FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph);
|
||||||
void RemoveIdentityOperator(MKLDNNGraph& graph);
|
void RemoveIdentityOperator(MKLDNNGraph& graph);
|
||||||
|
@ -39,7 +39,6 @@
|
|||||||
#include <nodes/mkldnn_bin_conv_node.h>
|
#include <nodes/mkldnn_bin_conv_node.h>
|
||||||
#include <nodes/mkldnn_def_conv_node.h>
|
#include <nodes/mkldnn_def_conv_node.h>
|
||||||
#include <nodes/mkldnn_mvn_node.h>
|
#include <nodes/mkldnn_mvn_node.h>
|
||||||
#include <nodes/mkldnn_resample_node.h>
|
|
||||||
#include <nodes/mkldnn_normalize_node.h>
|
#include <nodes/mkldnn_normalize_node.h>
|
||||||
#include <nodes/mkldnn_reduce_node.h>
|
#include <nodes/mkldnn_reduce_node.h>
|
||||||
#include <nodes/mkldnn_tensoriterator_node.h>
|
#include <nodes/mkldnn_tensoriterator_node.h>
|
||||||
@ -123,7 +122,6 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
|
|||||||
{ "Memory", MemoryOutput }, // for construction from layer ctor
|
{ "Memory", MemoryOutput }, // for construction from layer ctor
|
||||||
{ "Convert", Convert },
|
{ "Convert", Convert },
|
||||||
{ "MVN", MVN},
|
{ "MVN", MVN},
|
||||||
{ "Resample", Resample},
|
|
||||||
{ "Normalize", Normalize},
|
{ "Normalize", Normalize},
|
||||||
{ "ScatterUpdate", ScatterUpdate},
|
{ "ScatterUpdate", ScatterUpdate},
|
||||||
{ "ScatterElementsUpdate", ScatterElementsUpdate},
|
{ "ScatterElementsUpdate", ScatterElementsUpdate},
|
||||||
|
@ -66,7 +66,6 @@ enum Type {
|
|||||||
TensorIterator,
|
TensorIterator,
|
||||||
Convert,
|
Convert,
|
||||||
MVN,
|
MVN,
|
||||||
Resample,
|
|
||||||
Normalize,
|
Normalize,
|
||||||
ScatterUpdate,
|
ScatterUpdate,
|
||||||
ScatterElementsUpdate,
|
ScatterElementsUpdate,
|
||||||
@ -162,8 +161,6 @@ static std::string NameFromType(Type type) {
|
|||||||
return "TensorIterator";
|
return "TensorIterator";
|
||||||
case Convert:
|
case Convert:
|
||||||
return "Convert";
|
return "Convert";
|
||||||
case Resample:
|
|
||||||
return "Resample";
|
|
||||||
case Normalize:
|
case Normalize:
|
||||||
return "Normalize";
|
return "Normalize";
|
||||||
case ScatterUpdate:
|
case ScatterUpdate:
|
||||||
|
@ -26,6 +26,7 @@
|
|||||||
#include <legacy/transformations/convert_opset1_to_legacy/convert_prior_to_ie_prior.hpp>
|
#include <legacy/transformations/convert_opset1_to_legacy/convert_prior_to_ie_prior.hpp>
|
||||||
#include <legacy/transformations/convert_opset1_to_legacy/reshape_fully_connected.hpp>
|
#include <legacy/transformations/convert_opset1_to_legacy/reshape_fully_connected.hpp>
|
||||||
#include <legacy/transformations/convert_opset1_to_legacy/convert_nms_5_to_legacy.hpp>
|
#include <legacy/transformations/convert_opset1_to_legacy/convert_nms_5_to_legacy.hpp>
|
||||||
|
#include <legacy/transformations/convert_opset1_to_legacy/convert_interpolate_to_interp_or_resample.hpp>
|
||||||
#include <legacy/ngraph_ops/fully_connected.hpp>
|
#include <legacy/ngraph_ops/fully_connected.hpp>
|
||||||
|
|
||||||
#include <transformations/opset_conversions/convert_opset3_to_opset2.hpp>
|
#include <transformations/opset_conversions/convert_opset3_to_opset2.hpp>
|
||||||
@ -52,6 +53,7 @@
|
|||||||
#include <transformations/op_conversions/rnn_cell_decomposition.hpp>
|
#include <transformations/op_conversions/rnn_cell_decomposition.hpp>
|
||||||
#include <transformations/op_conversions/gru_cell_decomposition.hpp>
|
#include <transformations/op_conversions/gru_cell_decomposition.hpp>
|
||||||
#include <transformations/op_conversions/log_softmax_decomposition.hpp>
|
#include <transformations/op_conversions/log_softmax_decomposition.hpp>
|
||||||
|
#include <transformations/op_conversions/convert_interpolate1_to_interpolate4.hpp>
|
||||||
#include <transformations/convert_precision.hpp>
|
#include <transformations/convert_precision.hpp>
|
||||||
#include <transformations/init_node_info.hpp>
|
#include <transformations/init_node_info.hpp>
|
||||||
#include <transformations/rt_info/fused_names_attribute.hpp>
|
#include <transformations/rt_info/fused_names_attribute.hpp>
|
||||||
@ -200,8 +202,10 @@ static void Transformation(ICNNNetwork::Ptr& clonedNetwork, const Config& conf)
|
|||||||
pass_config->disable<ngraph::pass::HSigmoidDecomposition>();
|
pass_config->disable<ngraph::pass::HSigmoidDecomposition>();
|
||||||
pass_config->disable<ngraph::pass::ConvertMod>();
|
pass_config->disable<ngraph::pass::ConvertMod>();
|
||||||
pass_config->disable<ngraph::pass::LogSoftmaxDecomposition>();
|
pass_config->disable<ngraph::pass::LogSoftmaxDecomposition>();
|
||||||
|
pass_config->disable<ngraph::pass::ConvertInterpolateToInterpOrResampleMatcher>();
|
||||||
|
|
||||||
pass_config->enable<ngraph::pass::ConvertPadToGroupConvolution>();
|
pass_config->enable<ngraph::pass::ConvertPadToGroupConvolution>();
|
||||||
|
pass_config->enable<ngraph::pass::ConvertInterpolate1ToInterpolate4>();
|
||||||
|
|
||||||
manager.run_passes(nGraphFunc);
|
manager.run_passes(nGraphFunc);
|
||||||
|
|
||||||
|
@ -1,432 +0,0 @@
|
|||||||
// Copyright (C) 2018-2020 Intel Corporation
|
|
||||||
// SPDX-License-Identifier: Apache-2.0
|
|
||||||
//
|
|
||||||
|
|
||||||
#include "base.hpp"
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <limits>
|
|
||||||
#include <memory>
|
|
||||||
#include "ie_parallel.hpp"
|
|
||||||
#include "jit_generator.hpp"
|
|
||||||
|
|
||||||
using namespace mkldnn::impl::cpu;
|
|
||||||
using namespace mkldnn::impl::utils;
|
|
||||||
|
|
||||||
namespace InferenceEngine {
|
|
||||||
namespace Extensions {
|
|
||||||
namespace Cpu {
|
|
||||||
|
|
||||||
#define GET_OFF(field) offsetof(jit_args_interp, field)
|
|
||||||
|
|
||||||
struct jit_args_interp {
|
|
||||||
const float *src00;
|
|
||||||
const float *src01;
|
|
||||||
const float *src10;
|
|
||||||
const float *src11;
|
|
||||||
float *dst;
|
|
||||||
float *h_lambda0;
|
|
||||||
float *h_lambda1;
|
|
||||||
float *w_lambda0;
|
|
||||||
float *w_lambda1;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct jit_uni_interp_kernel {
|
|
||||||
void (*ker_)(const jit_args_interp *);
|
|
||||||
|
|
||||||
void operator()(const jit_args_interp *args) { assert(ker_); ker_(args); }
|
|
||||||
|
|
||||||
jit_uni_interp_kernel() : ker_(nullptr) {}
|
|
||||||
virtual ~jit_uni_interp_kernel() {}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <cpu_isa_t isa>
|
|
||||||
struct jit_uni_interp_kernel_f32 : public jit_uni_interp_kernel, public jit_generator {
|
|
||||||
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_interp_kernel_f32)
|
|
||||||
|
|
||||||
jit_uni_interp_kernel_f32() : jit_uni_interp_kernel(), jit_generator() {
|
|
||||||
this->preamble();
|
|
||||||
|
|
||||||
mov(reg_src00, ptr[reg_params + GET_OFF(src00)]);
|
|
||||||
mov(reg_src01, ptr[reg_params + GET_OFF(src01)]);
|
|
||||||
mov(reg_src10, ptr[reg_params + GET_OFF(src10)]);
|
|
||||||
mov(reg_src11, ptr[reg_params + GET_OFF(src11)]);
|
|
||||||
mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);
|
|
||||||
mov(reg_h_lambda0, ptr[reg_params + GET_OFF(h_lambda0)]);
|
|
||||||
mov(reg_h_lambda1, ptr[reg_params + GET_OFF(h_lambda1)]);
|
|
||||||
mov(reg_w_lambda0, ptr[reg_params + GET_OFF(w_lambda0)]);
|
|
||||||
mov(reg_w_lambda1, ptr[reg_params + GET_OFF(w_lambda1)]);
|
|
||||||
|
|
||||||
uni_vmovups(vmm_src00, ptr[reg_src00]);
|
|
||||||
uni_vmovups(vmm_src01, ptr[reg_src01]);
|
|
||||||
uni_vmovups(vmm_src10, ptr[reg_src10]);
|
|
||||||
uni_vmovups(vmm_src11, ptr[reg_src11]);
|
|
||||||
|
|
||||||
uni_vbroadcastss(vmm_h_lambda0, ptr[reg_h_lambda0]);
|
|
||||||
uni_vbroadcastss(vmm_h_lambda1, ptr[reg_h_lambda1]);
|
|
||||||
uni_vbroadcastss(vmm_w_lambda0, ptr[reg_w_lambda0]);
|
|
||||||
uni_vbroadcastss(vmm_w_lambda1, ptr[reg_w_lambda1]);
|
|
||||||
|
|
||||||
if (isa != sse42) {
|
|
||||||
uni_vmulps(vmm_src01, vmm_src01, vmm_w_lambda0);
|
|
||||||
uni_vmulps(vmm_src11, vmm_src11, vmm_w_lambda0);
|
|
||||||
uni_vfmadd231ps(vmm_src01, vmm_w_lambda1, vmm_src00);
|
|
||||||
uni_vfmadd231ps(vmm_src11, vmm_w_lambda1, vmm_src10);
|
|
||||||
uni_vmulps(vmm_src01, vmm_src01, vmm_h_lambda1);
|
|
||||||
uni_vfmadd231ps(vmm_src01, vmm_h_lambda0, vmm_src11);
|
|
||||||
uni_vmovups(ptr[reg_dst], vmm_src01);
|
|
||||||
} else {
|
|
||||||
uni_vmulps(vmm_src01, vmm_src01, vmm_w_lambda0);
|
|
||||||
uni_vmulps(vmm_src11, vmm_src11, vmm_w_lambda0);
|
|
||||||
uni_vfmadd231ps(vmm_src01, vmm_w_lambda1, vmm_src00);
|
|
||||||
// uni_vfmadd231ps affects XMM (vmm_w_lambda1) register. Need to initialize again.
|
|
||||||
uni_vbroadcastss(vmm_w_lambda1, ptr[reg_w_lambda1]);
|
|
||||||
uni_vfmadd231ps(vmm_src11, vmm_w_lambda1, vmm_src10);
|
|
||||||
uni_vmulps(vmm_src01, vmm_src01, vmm_h_lambda1);
|
|
||||||
uni_vfmadd231ps(vmm_src01, vmm_h_lambda0, vmm_src11);
|
|
||||||
uni_vmovups(ptr[reg_dst], vmm_src01);
|
|
||||||
|
|
||||||
// Next 4 elements
|
|
||||||
size_t stride = 4 * sizeof(float);
|
|
||||||
|
|
||||||
add(reg_src00, stride);
|
|
||||||
add(reg_src01, stride);
|
|
||||||
add(reg_src10, stride);
|
|
||||||
add(reg_src11, stride);
|
|
||||||
add(reg_dst, stride);
|
|
||||||
|
|
||||||
uni_vmovups(vmm_src00, ptr[reg_src00]);
|
|
||||||
uni_vmovups(vmm_src01, ptr[reg_src01]);
|
|
||||||
uni_vmovups(vmm_src10, ptr[reg_src10]);
|
|
||||||
uni_vmovups(vmm_src11, ptr[reg_src11]);
|
|
||||||
|
|
||||||
uni_vbroadcastss(vmm_h_lambda0, ptr[reg_h_lambda0]);
|
|
||||||
uni_vbroadcastss(vmm_w_lambda1, ptr[reg_w_lambda1]);
|
|
||||||
|
|
||||||
uni_vmulps(vmm_src01, vmm_src01, vmm_w_lambda0);
|
|
||||||
uni_vmulps(vmm_src11, vmm_src11, vmm_w_lambda0);
|
|
||||||
uni_vfmadd231ps(vmm_src01, vmm_w_lambda1, vmm_src00);
|
|
||||||
uni_vbroadcastss(vmm_w_lambda1, ptr[reg_w_lambda1]);
|
|
||||||
uni_vfmadd231ps(vmm_src11, vmm_w_lambda1, vmm_src10);
|
|
||||||
uni_vmulps(vmm_src01, vmm_src01, vmm_h_lambda1);
|
|
||||||
uni_vfmadd231ps(vmm_src01, vmm_h_lambda0, vmm_src11);
|
|
||||||
uni_vmovups(ptr[reg_dst], vmm_src01);
|
|
||||||
}
|
|
||||||
|
|
||||||
this->postamble();
|
|
||||||
ker_ = (decltype(ker_))this->getCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
using Vmm = typename conditional3<isa == sse42, Xbyak::Xmm, isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
|
|
||||||
size_t vlen = cpu_isa_traits<isa>::vlen;
|
|
||||||
|
|
||||||
Xbyak::Reg64 reg_src00 = r8;
|
|
||||||
Xbyak::Reg64 reg_src01 = r9;
|
|
||||||
Xbyak::Reg64 reg_src10 = r10;
|
|
||||||
Xbyak::Reg64 reg_src11 = r11;
|
|
||||||
Xbyak::Reg64 reg_dst = rbp;
|
|
||||||
Xbyak::Reg64 reg_h_lambda0 = r12;
|
|
||||||
Xbyak::Reg64 reg_h_lambda1 = r13;
|
|
||||||
Xbyak::Reg64 reg_w_lambda0 = r14;
|
|
||||||
Xbyak::Reg64 reg_w_lambda1 = r15;
|
|
||||||
Xbyak::Reg64 reg_params = abi_param1;
|
|
||||||
|
|
||||||
Vmm vmm_src00 = Vmm(0);
|
|
||||||
Vmm vmm_src01 = Vmm(1);
|
|
||||||
Vmm vmm_src10 = Vmm(2);
|
|
||||||
Vmm vmm_src11 = Vmm(3);
|
|
||||||
Vmm vmm_h_lambda0 = Vmm(4);
|
|
||||||
Vmm vmm_h_lambda1 = Vmm(5);
|
|
||||||
Vmm vmm_w_lambda0 = Vmm(6);
|
|
||||||
Vmm vmm_w_lambda1 = Vmm(7);
|
|
||||||
Vmm vmm_dst = Vmm(8);
|
|
||||||
};
|
|
||||||
|
|
||||||
class InterpImpl: public ExtLayerBase {
|
|
||||||
public:
|
|
||||||
explicit InterpImpl(const CNNLayer* layer) {
|
|
||||||
try {
|
|
||||||
if (layer->insData.size() != 1 || layer->outData.empty())
|
|
||||||
THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
|
|
||||||
|
|
||||||
auto inData = layer->insData[0].lock();
|
|
||||||
if (inData == nullptr) {
|
|
||||||
THROW_IE_EXCEPTION << "Layer '" << layer->name << "' has nullable input data.";
|
|
||||||
}
|
|
||||||
if (inData->getTensorDesc().getDims().size() != 4)
|
|
||||||
THROW_IE_EXCEPTION << "Interp supports only 4d blobs!";
|
|
||||||
|
|
||||||
// We don't read other parameters since they are needed only for dst reshape in caffe
|
|
||||||
pad_beg = layer->GetParamAsInt("pad_beg");
|
|
||||||
pad_end = layer->GetParamAsInt("pad_end");
|
|
||||||
align_corners = layer->GetParamAsBool("align_corners", true);
|
|
||||||
|
|
||||||
ConfLayout blk_layout;
|
|
||||||
if (inData->getTensorDesc().getPrecision() == Precision::U8) {
|
|
||||||
LayerConfig config;
|
|
||||||
DataConfig dataConfigDct;
|
|
||||||
dataConfigDct.desc = TensorDesc(Precision::U8, inData->getTensorDesc().getDims(), Layout::NCHW);
|
|
||||||
config.inConfs.push_back(dataConfigDct);
|
|
||||||
|
|
||||||
DataConfig dataConfigOut;
|
|
||||||
const SizeVector& out_dims = layer->outData[0]->getTensorDesc().getDims();
|
|
||||||
SizeVector blocks = out_dims;
|
|
||||||
SizeVector order(blocks.size());
|
|
||||||
SizeVector dimOffsets(blocks.size());
|
|
||||||
SizeVector strides(blocks.size());
|
|
||||||
size_t offset((std::numeric_limits<size_t>::max)());
|
|
||||||
for (size_t i = 0; i < order.size(); i++) {
|
|
||||||
strides[i] = (std::numeric_limits<size_t>::max)();
|
|
||||||
dimOffsets[i] = 0;
|
|
||||||
order[i] = i;
|
|
||||||
}
|
|
||||||
dataConfigOut.desc = TensorDesc(Precision::FP32, out_dims, { blocks, order, offset, dimOffsets, strides });
|
|
||||||
config.outConfs.push_back(dataConfigOut);
|
|
||||||
config.dynBatchSupport = false;
|
|
||||||
confs.push_back(config);
|
|
||||||
} else {
|
|
||||||
if (mayiuse(avx512_common)) {
|
|
||||||
blk_layout = ConfLayout::BLK16;
|
|
||||||
interp_kernel.reset(new jit_uni_interp_kernel_f32<avx512_common>());
|
|
||||||
addConfig(layer, { DataConfigurator(blk_layout, Precision::FP32) }, { DataConfigurator(blk_layout, Precision::FP32) });
|
|
||||||
} else if (mayiuse(avx2)) {
|
|
||||||
blk_layout = ConfLayout::BLK8;
|
|
||||||
interp_kernel.reset(new jit_uni_interp_kernel_f32<avx2>());
|
|
||||||
addConfig(layer, { DataConfigurator(blk_layout, Precision::FP32) }, { DataConfigurator(blk_layout, Precision::FP32) });
|
|
||||||
} else {
|
|
||||||
blk_layout = ConfLayout::BLK8;
|
|
||||||
interp_kernel.reset(new jit_uni_interp_kernel_f32<sse42>());
|
|
||||||
addConfig(layer, { DataConfigurator(blk_layout, Precision::FP32) }, { DataConfigurator(blk_layout, Precision::FP32) });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (InferenceEngine::details::InferenceEngineException &ex) {
|
|
||||||
errorMsg = ex.what();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
StatusCode init(LayerConfig& config, ResponseDesc *resp) noexcept override {
|
|
||||||
if (config.inConfs.size() != 1 || config.outConfs.size() != 1) {
|
|
||||||
strncpy(resp->msg, "Interp layer has invalid configs", sizeof(resp->msg));
|
|
||||||
return GENERAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (config.inConfs[0].desc.getDims().size() != 4) {
|
|
||||||
std::ostringstream result;
|
|
||||||
result << "Interp layer has invalid layout: " << config.inConfs[0].desc.getLayout();
|
|
||||||
strncpy(resp->msg, result.str().c_str(), sizeof(resp->msg) - 1);
|
|
||||||
return GENERAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto inPrecision = config.inConfs[0].desc.getPrecision();
|
|
||||||
if (inPrecision != Precision::U8 && inPrecision != Precision::FP32) {
|
|
||||||
strncpy(resp->msg, "Interp layer has unsupported input precision", sizeof(resp->msg));
|
|
||||||
return GENERAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (config.outConfs[0].desc.getPrecision() != Precision::FP32) {
|
|
||||||
strncpy(resp->msg, "Interp layer has unsupported output precision", sizeof(resp->msg));
|
|
||||||
return GENERAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return OK;
|
|
||||||
}
|
|
||||||
|
|
||||||
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,
|
|
||||||
ResponseDesc *resp) noexcept override {
|
|
||||||
#ifdef WIN32
|
|
||||||
#undef IN
|
|
||||||
#endif
|
|
||||||
size_t IN = inputs[0]->getTensorDesc().getDims()[0];
|
|
||||||
size_t IH = inputs[0]->getTensorDesc().getDims()[2];
|
|
||||||
size_t IW = inputs[0]->getTensorDesc().getDims()[3];
|
|
||||||
size_t OH = outputs[0]->getTensorDesc().getDims()[2];
|
|
||||||
size_t OW = outputs[0]->getTensorDesc().getDims()[3];
|
|
||||||
|
|
||||||
size_t IH_pad = IH + pad_beg + pad_end;
|
|
||||||
size_t IW_pad = IW + pad_beg + pad_end;
|
|
||||||
|
|
||||||
auto *dst_data = outputs[0]->buffer().as<float *>() + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
|
||||||
|
|
||||||
switch (inputs[0]->getTensorDesc().getPrecision()) {
|
|
||||||
case Precision::FP32:
|
|
||||||
{
|
|
||||||
const float* src_data = inputs[0]->cbuffer().as<const float *>() + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
|
||||||
size_t IC = (inputs[0]->getTensorDesc().getLayout() == Layout::BLOCKED)
|
|
||||||
? inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[1] *
|
|
||||||
inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[4]
|
|
||||||
: IC = inputs[0]->getTensorDesc().getDims()[1];
|
|
||||||
interpolate(IN, IC, src_data,
|
|
||||||
-pad_beg, -pad_beg, IH_pad, IW_pad, IH, IW, dst_data, 0, 0, OH, OW, OH, OW);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case Precision::U8:
|
|
||||||
{
|
|
||||||
const uint8_t* src_data = inputs[0]->cbuffer().as<const uint8_t *>() + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
|
|
||||||
size_t IC = inputs[0]->getTensorDesc().getDims()[1];
|
|
||||||
interpolate_8u(inputs[0]->getTensorDesc().getLayout(), IN, IC, src_data,
|
|
||||||
-pad_beg, -pad_beg, IH_pad, IW_pad, IH, IW, dst_data, 0, 0, OH, OW, OH, OW);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
if (resp) {
|
|
||||||
std::string errorMsg = "Incorrect input precision. Only U8 or FP32 are supported!";
|
|
||||||
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
|
|
||||||
}
|
|
||||||
return GENERAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return OK;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
int pad_beg;
|
|
||||||
int pad_end;
|
|
||||||
bool align_corners;
|
|
||||||
std::shared_ptr<jit_uni_interp_kernel> interp_kernel;
|
|
||||||
|
|
||||||
void interpolate(const size_t N, const size_t C,
|
|
||||||
const float *src, const int x1, const int y1,
|
|
||||||
const int IH_pad, const int IW_pad, const size_t IH, const size_t IW,
|
|
||||||
float *dst, const int x2, const int y2,
|
|
||||||
const int OH_pad, const int OW_pad, const size_t OH, const size_t OW) {
|
|
||||||
if (IH_pad == OH_pad && IW_pad == OW_pad) {
|
|
||||||
for (size_t i = 0; i < N * C * OH * OW; i++) {
|
|
||||||
dst[i] = src[i];
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
float rh;
|
|
||||||
float rw;
|
|
||||||
if (align_corners) {
|
|
||||||
rh = (OH_pad > 1) ? static_cast<float>(IH_pad - 1) / (OH_pad - 1) : 0.0f;
|
|
||||||
rw = (OW_pad > 1) ? static_cast<float>(IW_pad - 1) / (OW_pad - 1) : 0.0f;
|
|
||||||
} else {
|
|
||||||
rh = static_cast<float>(IH_pad) / (OH_pad);
|
|
||||||
rw = static_cast<float>(IW_pad) / (OW_pad);
|
|
||||||
}
|
|
||||||
|
|
||||||
int block_size = 1;
|
|
||||||
if (interp_kernel) {
|
|
||||||
if (mayiuse(avx512_common)) {
|
|
||||||
block_size = 16;
|
|
||||||
} else {
|
|
||||||
block_size = 8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Align channel number to block size to deal with channels padding in IE with multiple blobs
|
|
||||||
size_t CB = (C + block_size - 1) & (-block_size);
|
|
||||||
|
|
||||||
size_t CH = (C + block_size - 1) / block_size;
|
|
||||||
|
|
||||||
parallel_for3d(N, CH, OH_pad, [&](size_t n, size_t cb, size_t h) {
|
|
||||||
const float *psrc_n_cb = src + n * CB * IH * IW + cb * block_size * IW * IH; // n+cb src address
|
|
||||||
|
|
||||||
// h is output h
|
|
||||||
float fh = rh * h;
|
|
||||||
// ih0 is higher input h position
|
|
||||||
int ih0 = static_cast<int>(fh);
|
|
||||||
// ih1 is lower input h position
|
|
||||||
int ih1 = (ih0 < IH_pad - 1) ? ih0 + 1 : ih0;
|
|
||||||
|
|
||||||
float h_lambda0 = fh - ih0; // for lower input h weight
|
|
||||||
float h_lambda1 = 1.0f - h_lambda0; // for higher input h weight
|
|
||||||
|
|
||||||
const float *psrc_h0 = psrc_n_cb + (y1 + ih0) * IW * block_size + x1 * block_size;
|
|
||||||
const float *psrc_h1 = psrc_n_cb + (y1 + ih1) * IW * block_size + x1 * block_size;
|
|
||||||
float *pdst_h = dst + n * CB * OH * OW + cb * block_size * OW * OH + (y2 + h) * OW * block_size + x2 * block_size;
|
|
||||||
|
|
||||||
auto arg = jit_args_interp();
|
|
||||||
arg.h_lambda0 = static_cast<float*>(&h_lambda0);
|
|
||||||
arg.h_lambda1 = static_cast<float*>(&h_lambda1);
|
|
||||||
for (int w = 0; w < OW_pad; ++w) {
|
|
||||||
float fw = rw * w;
|
|
||||||
int iw0 = static_cast<int>(fw);
|
|
||||||
int iw1 = (iw0 < IW_pad - 1) ? iw0 + 1 : iw0;
|
|
||||||
|
|
||||||
float w_lambda0 = fw - iw0; // for right input w weight
|
|
||||||
float w_lambda1 = 1.0f - w_lambda0; // for left input w weight
|
|
||||||
|
|
||||||
const float *psrc00 = psrc_h0 + iw0 * block_size;
|
|
||||||
const float *psrc01 = psrc_h0 + iw1 * block_size;
|
|
||||||
const float *psrc10 = psrc_h1 + iw0 * block_size;
|
|
||||||
const float *psrc11 = psrc_h1 + iw1 * block_size;
|
|
||||||
|
|
||||||
float *pdst = pdst_h + w * block_size;
|
|
||||||
|
|
||||||
if (interp_kernel) {
|
|
||||||
arg.src00 = psrc00;
|
|
||||||
arg.src01 = psrc01;
|
|
||||||
arg.src10 = psrc10;
|
|
||||||
arg.src11 = psrc11;
|
|
||||||
arg.dst = pdst;
|
|
||||||
arg.w_lambda0 = static_cast<float*>(&w_lambda0);
|
|
||||||
arg.w_lambda1 = static_cast<float*>(&w_lambda1);
|
|
||||||
(*interp_kernel)(&arg);
|
|
||||||
} else {
|
|
||||||
for (int c = 0; c < block_size; ++c) {
|
|
||||||
pdst[c] = h_lambda1 * (w_lambda1 * psrc00[c] + w_lambda0 * psrc01[c]) +
|
|
||||||
h_lambda0 * (w_lambda1 * psrc10[c] + w_lambda0 * psrc11[c]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
void interpolate_8u(Layout layout, const size_t N, const size_t C,
|
|
||||||
const uint8_t *src, const int x1, const int y1,
|
|
||||||
const int IH_pad, const int IW_pad, const size_t IH, const size_t IW,
|
|
||||||
float *dst, const int x2, const int y2,
|
|
||||||
const int OH_pad, const int OW_pad, const size_t OH, const size_t OW) {
|
|
||||||
if (IH_pad == OH_pad && IW_pad == OW_pad) {
|
|
||||||
for (size_t i = 0; i < N * C * OH * OW; i++) {
|
|
||||||
dst[i] = static_cast<float>(src[i]);
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
float rh;
|
|
||||||
float rw;
|
|
||||||
if (align_corners) {
|
|
||||||
rh = (OH_pad > 1) ? static_cast<float>(IH_pad - 1) / (OH_pad - 1) : 0.0f;
|
|
||||||
rw = (OW_pad > 1) ? static_cast<float>(IW_pad - 1) / (OW_pad - 1) : 0.0f;
|
|
||||||
} else {
|
|
||||||
rh = static_cast<float>(IH_pad) / (OH_pad);
|
|
||||||
rw = static_cast<float>(IW_pad) / (OW_pad);
|
|
||||||
}
|
|
||||||
|
|
||||||
parallel_for3d(N, C, OH_pad, [&](size_t n, size_t cb, size_t h) {
|
|
||||||
const uint8_t *psrc = src + n * C * IH * IW;
|
|
||||||
|
|
||||||
float fh = rh * h;
|
|
||||||
int ih0 = static_cast<int>(fh);
|
|
||||||
int ih1 = (ih0 < IH_pad - 1) ? ih0 + 1 : ih0;
|
|
||||||
|
|
||||||
float h_lambda0 = fh - ih0;
|
|
||||||
float h_lambda1 = 1.0f - h_lambda0;
|
|
||||||
|
|
||||||
for (int w = 0; w < OW_pad; ++w) {
|
|
||||||
float fw = rw * w;
|
|
||||||
int iw0 = static_cast<int>(fw);
|
|
||||||
int iw1 = (iw0 < IW_pad - 1) ? iw0 + 1 : iw0;
|
|
||||||
|
|
||||||
float w_lambda0 = fw - iw0;
|
|
||||||
float w_lambda1 = 1.0f - w_lambda0;
|
|
||||||
|
|
||||||
dst[n * C * OH * OW + cb * OW * OH + (y2 + h) * OW + (x2 + w)] =
|
|
||||||
h_lambda1 * (w_lambda1 * static_cast<float>(psrc[cb * IW * IH + (y1 + ih0) * IW + (x1 + iw0)]) +
|
|
||||||
w_lambda0 * static_cast<float>(psrc[cb * IW * IH + (y1 + ih0) * IW + (x1 + iw1)])) +
|
|
||||||
h_lambda0 * (w_lambda1 * static_cast<float>(psrc[cb * IW * IH + (y1 + ih1) * IW + (x1 + iw0)]) +
|
|
||||||
w_lambda0 * static_cast<float>(psrc[cb * IW * IH + (y1 + ih1) * IW + (x1 + iw1)]));
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
REG_FACTORY_FOR(InterpImpl, Interp);
|
|
||||||
|
|
||||||
} // namespace Cpu
|
|
||||||
} // namespace Extensions
|
|
||||||
} // namespace InferenceEngine
|
|
@ -64,7 +64,6 @@ MKLDNN_EXTENSION_NODE(TopKImpl, TopK);
|
|||||||
MKLDNN_EXTENSION_NODE(ShuffleChannelsImpl, ShuffleChannels);
|
MKLDNN_EXTENSION_NODE(ShuffleChannelsImpl, ShuffleChannels);
|
||||||
MKLDNN_EXTENSION_NODE(SpaceToDepthImpl, SpaceToDepth);
|
MKLDNN_EXTENSION_NODE(SpaceToDepthImpl, SpaceToDepth);
|
||||||
MKLDNN_EXTENSION_NODE(PowerFileImpl, PowerFile);
|
MKLDNN_EXTENSION_NODE(PowerFileImpl, PowerFile);
|
||||||
MKLDNN_EXTENSION_NODE(InterpImpl, Interp);
|
|
||||||
MKLDNN_EXTENSION_NODE(BatchToSpaceImpl, BatchToSpace);
|
MKLDNN_EXTENSION_NODE(BatchToSpaceImpl, BatchToSpace);
|
||||||
MKLDNN_EXTENSION_NODE(ExperimentalDetectronPriorGridGeneratorImpl, ExperimentalDetectronPriorGridGenerator);
|
MKLDNN_EXTENSION_NODE(ExperimentalDetectronPriorGridGeneratorImpl, ExperimentalDetectronPriorGridGenerator);
|
||||||
MKLDNN_EXTENSION_NODE(SimplerNMSImpl, SimplerNMS);
|
MKLDNN_EXTENSION_NODE(SimplerNMSImpl, SimplerNMS);
|
||||||
|
@ -1872,7 +1872,6 @@ void MKLDNNInterpolateNode::buildTblLinearOnnx(SizeVector& srcDimPad5d, SizeVect
|
|||||||
size_t scratchLen = rnd_up(OW + OW + OH + OH, 16);
|
size_t scratchLen = rnd_up(OW + OW + OH + OH, 16);
|
||||||
int idxType = 2;
|
int idxType = 2;
|
||||||
indexTable.resize(idxType * scratchLen);
|
indexTable.resize(idxType * scratchLen);
|
||||||
std::vector<int> index(scratchLen, 0);
|
|
||||||
int *indexLeft = static_cast<int*>(&indexTable[0]);
|
int *indexLeft = static_cast<int*>(&indexTable[0]);
|
||||||
int *indexRight = static_cast<int*>(&indexTable[OW]);
|
int *indexRight = static_cast<int*>(&indexTable[OW]);
|
||||||
int *indexTop = static_cast<int*>(&indexTable[2 * OW]);
|
int *indexTop = static_cast<int*>(&indexTable[2 * OW]);
|
||||||
@ -2320,7 +2319,7 @@ void MKLDNNInterpolateNode::NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr
|
|||||||
arg.src_ptr[0] = in_ptr_cbd + blk_size * IW * index_h[h] * srcDataSize;
|
arg.src_ptr[0] = in_ptr_cbd + blk_size * IW * index_h[h] * srcDataSize;
|
||||||
arg.index = static_cast<int*>(&(index_w_kernel[0]));
|
arg.index = static_cast<int*>(&(index_w_kernel[0]));
|
||||||
arg.work_amount = static_cast<size_t>(OW);
|
arg.work_amount = static_cast<size_t>(OW);
|
||||||
arg.oc_off = cb * blk_size;
|
arg.oc_off = cb * blk_size * sizeof(float);
|
||||||
(*interpolateKernel)(&arg);
|
(*interpolateKernel)(&arg);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -2351,7 +2350,7 @@ void MKLDNNInterpolateNode::NNPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_,
|
|||||||
arg.src_ptr[0] = in_ptr;
|
arg.src_ptr[0] = in_ptr;
|
||||||
arg.dst = out_ptr;
|
arg.dst = out_ptr;
|
||||||
arg.index = static_cast<int*>(&index_kernel[0]); // need index_h and index_w in kernel, it's in continous memory so one param
|
arg.index = static_cast<int*>(&index_kernel[0]); // need index_h and index_w in kernel, it's in continous memory so one param
|
||||||
arg.oc_off = static_cast<size_t>(c);
|
arg.oc_off = static_cast<size_t>(c * sizeof(float));
|
||||||
// work_amount is OH(out loop) and OW(inner loop), can get in kernel from jcp.
|
// work_amount is OH(out loop) and OW(inner loop), can get in kernel from jcp.
|
||||||
(*interpolateKernel)(&arg);
|
(*interpolateKernel)(&arg);
|
||||||
});
|
});
|
||||||
@ -2391,7 +2390,7 @@ void MKLDNNInterpolateNode::linearOnnxPlanar(const uint8_t *in_ptr_, uint8_t *ou
|
|||||||
arg.weight_ptr[0] = static_cast<float*>(&weight[0]);
|
arg.weight_ptr[0] = static_cast<float*>(&weight[0]);
|
||||||
arg.dst = out_ptr_nc;
|
arg.dst = out_ptr_nc;
|
||||||
arg.work_amount = OW * OH;
|
arg.work_amount = OW * OH;
|
||||||
arg.oc_off = c;
|
arg.oc_off = static_cast<size_t>(c * sizeof(float));
|
||||||
(*interpolateKernel)(&arg);
|
(*interpolateKernel)(&arg);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -2666,7 +2665,7 @@ void MKLDNNInterpolateNode::cubicPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr
|
|||||||
arg.weight_ptr[0] = xFactor;
|
arg.weight_ptr[0] = xFactor;
|
||||||
arg.weight_ptr[1] = yFactor;
|
arg.weight_ptr[1] = yFactor;
|
||||||
arg.work_amount = static_cast<size_t>(OW * OH);
|
arg.work_amount = static_cast<size_t>(OW * OH);
|
||||||
arg.oc_off = static_cast<size_t>(C);
|
arg.oc_off = static_cast<size_t>(c * sizeof(float));
|
||||||
(*interpolateKernel)(&arg);
|
(*interpolateKernel)(&arg);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -2788,7 +2787,7 @@ inline float MKLDNNInterpolateNode::coordTransToInput(int outCoord, float scale,
|
|||||||
}
|
}
|
||||||
case InterpolateCoordTransMode::align_corners: {
|
case InterpolateCoordTransMode::align_corners: {
|
||||||
if (outShape > 1)
|
if (outShape > 1)
|
||||||
return outCoord * static_cast<float>(inShape - 1) / static_cast<float>(outShape - 1);
|
return outCoord * (static_cast<float>(inShape - 1) / static_cast<float>(outShape - 1));
|
||||||
else
|
else
|
||||||
return 0;
|
return 0;
|
||||||
break;
|
break;
|
||||||
@ -2844,10 +2843,9 @@ bool MKLDNNInterpolateNode::canFuse(const MKLDNNNodePtr& node) const {
|
|||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!mayiuse(cpu::sse42))
|
if (!mayiuse(cpu::sse42) || mode == InterpolateMode::linear) {
|
||||||
return false;
|
|
||||||
if (mode == InterpolateMode::linear || mode == InterpolateMode::cubic)
|
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
if (node->getType() == Quantize) {
|
if (node->getType() == Quantize) {
|
||||||
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
|
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
|
||||||
@ -2858,10 +2856,9 @@ bool MKLDNNInterpolateNode::canFuse(const MKLDNNNodePtr& node) const {
|
|||||||
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
|
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
|
||||||
if (eltwiseNode == nullptr)
|
if (eltwiseNode == nullptr)
|
||||||
THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
|
THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
|
||||||
return isOneOf(eltwiseNode->getOpType(), {MulAdd, Prelu, Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp,
|
return isOneOf(eltwiseNode->getOpType(), {Prelu, Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp,
|
||||||
Tanh, Swish, Hswish, Mish, Hsigmoid, Round, Linear, Abs, Square, Sqrt}) ||
|
Tanh, Swish, Hswish, Mish, Hsigmoid, Round, Linear, Abs, Square, Sqrt}) ||
|
||||||
((eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() == 2) ||
|
(eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() == 2);
|
||||||
(eltwiseNode->getOpType() == Prelu));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
@ -1,922 +0,0 @@
|
|||||||
// Copyright (C) 2018-2020 Intel Corporation
|
|
||||||
// SPDX-License-Identifier: Apache-2.0
|
|
||||||
//
|
|
||||||
|
|
||||||
#include "mkldnn_resample_node.h"
|
|
||||||
#include "desc_iterator.hpp"
|
|
||||||
#include "mkldnn_quantize_node.h"
|
|
||||||
#include <legacy/ie_layers.h>
|
|
||||||
#include "mkldnn_eltwise_node.h"
|
|
||||||
#include <mkldnn.hpp>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <mkldnn_types.h>
|
|
||||||
#include <mkldnn_extension_utils.h>
|
|
||||||
#include "utils/bfloat16.hpp"
|
|
||||||
#include <legacy/ie_layers_internal.hpp>
|
|
||||||
#include "ie_parallel.hpp"
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
#include "jit_generator.hpp"
|
|
||||||
#include "jit_uni_eltwise.hpp"
|
|
||||||
#include "jit_uni_depthwise.hpp"
|
|
||||||
#include "jit_uni_quantization.hpp"
|
|
||||||
#include "common/cpu_memcpy.h"
|
|
||||||
|
|
||||||
using namespace mkldnn;
|
|
||||||
using namespace MKLDNNPlugin;
|
|
||||||
using namespace InferenceEngine;
|
|
||||||
using namespace mkldnn::impl;
|
|
||||||
using namespace mkldnn::impl::cpu;
|
|
||||||
using namespace mkldnn::impl::utils;
|
|
||||||
using namespace Xbyak;
|
|
||||||
|
|
||||||
|
|
||||||
#define GET_OFF(field) offsetof(jit_resample_call_args, field)
|
|
||||||
|
|
||||||
static inline bool isFloatCompatible(Precision prc) {
|
|
||||||
return Precision::FP32 == prc || Precision::BF16 == prc;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline bool isFloatCompatible(memory::data_type type) {
|
|
||||||
return memory::f32 == type || memory::bf16 == type;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <cpu_isa_t isa>
|
|
||||||
struct jit_uni_resample_nearest_kernel_f32 : public jit_uni_resample_nearest_kernel, public jit_generator {
|
|
||||||
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_resample_nearest_kernel_f32)
|
|
||||||
|
|
||||||
explicit jit_uni_resample_nearest_kernel_f32(jit_resample_config_params jcp, const mkldnn_primitive_attr &attr)
|
|
||||||
: jit_uni_resample_nearest_kernel(jcp, attr), jit_generator() {
|
|
||||||
const auto &p = attr_.post_ops_;
|
|
||||||
for (int i = 0; i < p.len_; i++) {
|
|
||||||
auto &post_op = p.entry_[i];
|
|
||||||
if (post_op.is_eltwise()) {
|
|
||||||
eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
|
|
||||||
this,
|
|
||||||
post_op.eltwise.alg,
|
|
||||||
post_op.eltwise.alpha,
|
|
||||||
post_op.eltwise.beta));
|
|
||||||
} else if (post_op.is_depthwise()) {
|
|
||||||
depthwise_injectors.push_back(std::make_shared<jit_uni_depthwise_injector_f32<isa>>(
|
|
||||||
this,
|
|
||||||
post_op.depthwise.alg));
|
|
||||||
} else if (post_op.is_quantization()) {
|
|
||||||
quantization_injectors.push_back(std::make_shared<jit_uni_quantization_injector_f32<isa>>(
|
|
||||||
this, post_op, vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
this->preamble();
|
|
||||||
|
|
||||||
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
|
|
||||||
mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);
|
|
||||||
mov(reg_index, ptr[reg_params + GET_OFF(index)]);
|
|
||||||
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
|
|
||||||
mov(reg_src_stride, ptr[reg_params + GET_OFF(src_stride)]);
|
|
||||||
mov(reg_index_stride, ptr[reg_params + GET_OFF(index_stride)]);
|
|
||||||
mov(reg_dst_stride, ptr[reg_params + GET_OFF(dst_stride)]);
|
|
||||||
if (attr_.post_ops_.len_ != 0)
|
|
||||||
mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
|
|
||||||
|
|
||||||
if (isa == cpu::avx512_common)
|
|
||||||
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
|
|
||||||
|
|
||||||
int blk_size = jcp_.src_dt == memory::bf16 ? 16 : (vlen / sizeof(float));
|
|
||||||
if (isa == cpu::sse42)
|
|
||||||
blk_size *= 2;
|
|
||||||
|
|
||||||
Xbyak::Label resample_nearest_loop_label;
|
|
||||||
Xbyak::Label resample_nearest_loop_end_label;
|
|
||||||
L(resample_nearest_loop_label);
|
|
||||||
{
|
|
||||||
cmp(reg_work_amount, 0);
|
|
||||||
jle(resample_nearest_loop_end_label, T_NEAR);
|
|
||||||
|
|
||||||
if (jcp_.planar_layout) {
|
|
||||||
uni_vmovdqu(vmm_index, ptr[reg_index]);
|
|
||||||
uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask);
|
|
||||||
vgatherdps(vmm_val, ptr[reg_src + vmm_index * jcp.src_data_size], vmm_mask);
|
|
||||||
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
|
|
||||||
|
|
||||||
add(reg_dst, reg_dst_stride);
|
|
||||||
add(reg_index, reg_index_stride);
|
|
||||||
sub(reg_work_amount, 1);
|
|
||||||
} else if (jcp_.nhwc_format) { // support int8 and fusion for this format
|
|
||||||
load_vector(vmm_val, ptr[reg_src], jcp_.src_dt);
|
|
||||||
if (attr_.post_ops_.len_ != 0)
|
|
||||||
apply_post_ops(jcp_.dst_dt);
|
|
||||||
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
|
|
||||||
|
|
||||||
if (isa == cpu::sse42) {
|
|
||||||
int sse42_offset = 4;
|
|
||||||
load_vector(vmm_val, ptr[reg_src + sse42_offset * jcp_.src_data_size], jcp_.src_dt);
|
|
||||||
if (attr_.post_ops_.len_ != 0) {
|
|
||||||
add(reg_oc_off, sse42_offset * sizeof(float));
|
|
||||||
apply_post_ops(jcp_.dst_dt);
|
|
||||||
sub(reg_oc_off, sse42_offset * sizeof(float));
|
|
||||||
}
|
|
||||||
store_vector(ptr[reg_dst + sse42_offset * jcp_.dst_data_size], vmm_val, jcp_.dst_dt);
|
|
||||||
}
|
|
||||||
|
|
||||||
add(reg_dst, reg_dst_stride);
|
|
||||||
add(reg_src, reg_src_stride);
|
|
||||||
add(reg_oc_off, blk_size * sizeof(float));
|
|
||||||
sub(reg_work_amount, 1);
|
|
||||||
} else { // for blk
|
|
||||||
mov(reg_src_aux, reg_src);
|
|
||||||
mov(reg_index_oc, dword[reg_index]);
|
|
||||||
add(reg_src_aux, reg_index_oc);
|
|
||||||
|
|
||||||
load_vector(vmm_val, ptr[reg_src_aux], jcp_.src_dt);
|
|
||||||
if (attr_.post_ops_.len_ != 0)
|
|
||||||
apply_post_ops(jcp_.dst_dt);
|
|
||||||
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
|
|
||||||
|
|
||||||
if (isa == cpu::sse42) {
|
|
||||||
int sse42_offset = 4;
|
|
||||||
add(reg_src_aux, sse42_offset * jcp_.src_data_size);
|
|
||||||
load_vector(vmm_val, ptr[reg_src_aux], jcp_.src_dt);
|
|
||||||
if (attr_.post_ops_.len_ != 0) {
|
|
||||||
add(reg_oc_off, sse42_offset * sizeof(float));
|
|
||||||
apply_post_ops(jcp_.dst_dt);
|
|
||||||
sub(reg_oc_off, sse42_offset * sizeof(float));
|
|
||||||
}
|
|
||||||
store_vector(ptr[reg_dst + sse42_offset * jcp_.dst_data_size], vmm_val, jcp_.dst_dt);
|
|
||||||
}
|
|
||||||
|
|
||||||
add(reg_dst, reg_dst_stride);
|
|
||||||
add(reg_index, reg_index_stride);
|
|
||||||
sub(reg_work_amount, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
jmp(resample_nearest_loop_label, T_NEAR);
|
|
||||||
}
|
|
||||||
L(resample_nearest_loop_end_label);
|
|
||||||
|
|
||||||
this->postamble();
|
|
||||||
|
|
||||||
for (auto& inj : eltwise_injectors)
|
|
||||||
inj->prepare_table();
|
|
||||||
|
|
||||||
ker_ = (decltype(ker_)) this->getCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
|
|
||||||
Xbyak::Ymm, Xbyak::Zmm>::type;
|
|
||||||
|
|
||||||
const int vlen = cpu_isa_traits<isa>::vlen;
|
|
||||||
|
|
||||||
Xbyak::Reg64 reg_src = r8;
|
|
||||||
Xbyak::Reg64 reg_dst = r9;
|
|
||||||
Xbyak::Reg64 reg_src_stride = r10;
|
|
||||||
Xbyak::Reg64 reg_dst_stride = r11;
|
|
||||||
Xbyak::Reg64 reg_index_stride = r12;
|
|
||||||
Xbyak::Reg64 reg_work_amount = r13;
|
|
||||||
Xbyak::Reg64 reg_index = r14;
|
|
||||||
Xbyak::Reg64 reg_src_aux = r15;
|
|
||||||
Xbyak::Reg64 reg_params = abi_param1;
|
|
||||||
|
|
||||||
Xbyak::Reg64 reg_oc_off = rax;
|
|
||||||
Xbyak::Reg64 reg_d_weights = rbx;
|
|
||||||
Xbyak::Reg64 reg_d_bias = rcx;
|
|
||||||
Xbyak::Reg32 reg_index_oc = edx;
|
|
||||||
|
|
||||||
Vmm vmm_val = Vmm(0);
|
|
||||||
Vmm vmm_index = Vmm(1);
|
|
||||||
Vmm vmm_zero = Vmm(2);
|
|
||||||
Vmm vmm_mask = Vmm(3);
|
|
||||||
Vmm vmm_d_weights = Vmm(4);
|
|
||||||
Vmm vmm_d_bias = Vmm(5);
|
|
||||||
|
|
||||||
std::vector<std::shared_ptr<jit_uni_eltwise_injector_f32<isa>>> eltwise_injectors;
|
|
||||||
std::vector<std::shared_ptr<jit_uni_depthwise_injector_f32<isa>>> depthwise_injectors;
|
|
||||||
std::vector<std::shared_ptr<jit_uni_quantization_injector_f32<isa>>> quantization_injectors;
|
|
||||||
|
|
||||||
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
|
|
||||||
switch (src_dt) {
|
|
||||||
case memory::f32:
|
|
||||||
case memory::s32:
|
|
||||||
uni_vmovups(vmm_src, op);
|
|
||||||
break;
|
|
||||||
case memory::s8:
|
|
||||||
uni_vpmovsxbd(vmm_src, op);
|
|
||||||
break;
|
|
||||||
case memory::u8:
|
|
||||||
uni_vpmovzxbd(vmm_src, op);
|
|
||||||
break;
|
|
||||||
case memory::bf16:
|
|
||||||
uni_vpmovzxwd(vmm_src, op);
|
|
||||||
uni_vpslld(vmm_src, vmm_src, 16);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
assert(!"unknown dst_dt");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isFloatCompatible(src_dt))
|
|
||||||
uni_vcvtdq2ps(vmm_src, vmm_src);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) {
|
|
||||||
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
|
|
||||||
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
|
|
||||||
|
|
||||||
if (dst_dt == memory::f32) {
|
|
||||||
uni_vmovups(op, vmm_dst);
|
|
||||||
} else if (dst_dt == memory::bf16) {
|
|
||||||
vcvtneps2bf16(ymm_dst, vmm_dst);
|
|
||||||
vmovdqu16(op, ymm_dst);
|
|
||||||
} else if (dst_dt == memory::u8) {
|
|
||||||
uni_vcvtps2dq(vmm_dst, vmm_dst);
|
|
||||||
if (isa == cpu::avx512_common) {
|
|
||||||
vpmaxsd(vmm_dst, vmm_dst, vmm_zero);
|
|
||||||
vpmovusdb(op, vmm_dst);
|
|
||||||
} else {
|
|
||||||
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
|
|
||||||
if (isa != cpu::sse42)
|
|
||||||
vpermq(ymm_dst, ymm_dst, 0x08);
|
|
||||||
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
|
|
||||||
if (isa != cpu::sse42)
|
|
||||||
vmovq(op, xmm_dst);
|
|
||||||
else
|
|
||||||
movd(op, xmm_dst);
|
|
||||||
}
|
|
||||||
} else if (dst_dt == memory::s8) {
|
|
||||||
uni_vcvtps2dq(vmm_dst, vmm_dst);
|
|
||||||
if (isa == cpu::avx512_common) {
|
|
||||||
vpmovsdb(op, vmm_dst);
|
|
||||||
} else {
|
|
||||||
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
|
|
||||||
if (isa != cpu::sse42)
|
|
||||||
vpermq(ymm_dst, ymm_dst, 0x08);
|
|
||||||
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
|
|
||||||
if (isa != cpu::sse42)
|
|
||||||
vmovq(op, xmm_dst);
|
|
||||||
else
|
|
||||||
movd(op, xmm_dst);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void apply_post_ops(memory::data_type dst_dt) {
|
|
||||||
const auto &p = attr_.post_ops_;
|
|
||||||
int eltwise_inj_idx = 0;
|
|
||||||
int depthwise_inj_idx = 0;
|
|
||||||
int quantization_inj_idx = 0;
|
|
||||||
for (int i = 0; i < p.len_; i++) {
|
|
||||||
auto& post_op = p.entry_[i];
|
|
||||||
if (post_op.is_eltwise()) {
|
|
||||||
eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1);
|
|
||||||
eltwise_inj_idx++;
|
|
||||||
} else if (post_op.is_depthwise()) {
|
|
||||||
mov(reg_d_weights, reinterpret_cast<size_t>(post_op.depthwise.weights_data));
|
|
||||||
mov(reg_d_bias, reinterpret_cast<size_t>(post_op.depthwise.biases_data));
|
|
||||||
add(reg_d_weights, reg_oc_off);
|
|
||||||
add(reg_d_bias, reg_oc_off);
|
|
||||||
depthwise_injectors[depthwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1, reg_d_weights, reg_d_bias);
|
|
||||||
depthwise_inj_idx++;
|
|
||||||
} else if (post_op.is_quantization()) {
|
|
||||||
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
|
|
||||||
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1;
|
|
||||||
int s_idx = vmm_val.getIdx();
|
|
||||||
|
|
||||||
quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off);
|
|
||||||
quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0);
|
|
||||||
|
|
||||||
quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_oc_off);
|
|
||||||
quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding);
|
|
||||||
|
|
||||||
quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_oc_off);
|
|
||||||
quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0);
|
|
||||||
|
|
||||||
quantization_inj_idx++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
MKLDNNResampleNode::MKLDNNResampleNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
|
|
||||||
: MKLDNNNode(layer, eng, cache) {}
|
|
||||||
|
|
||||||
void MKLDNNResampleNode::getSupportedDescriptors() {
|
|
||||||
if (!descs.empty())
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (getParentEdges().size() != 1)
|
|
||||||
THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
|
|
||||||
if (getChildEdges().empty())
|
|
||||||
THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
|
|
||||||
|
|
||||||
auto *layer = getCnnLayer().get();
|
|
||||||
type = layer->GetParamAsString("type");
|
|
||||||
antialias = layer->GetParamAsBool("antialias", false);
|
|
||||||
factor = layer->GetParamAsFloat("factor");
|
|
||||||
}
|
|
||||||
|
|
||||||
void MKLDNNResampleNode::initSupportedPrimitiveDescriptors() {
|
|
||||||
if (!supportedPrimitiveDescriptors.empty())
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (getParentEdgeAt(0)->getDims().ndims() < 4 || getParentEdgeAt(0)->getDims().ndims() > 5) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
setPostOps(attr, true);
|
|
||||||
|
|
||||||
Precision inputPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
|
|
||||||
Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision();
|
|
||||||
|
|
||||||
if (!fusedWith.empty()) {
|
|
||||||
auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
|
|
||||||
if (lastFusedLayer) {
|
|
||||||
outputPrecision = lastFusedLayer->outData[0]->getPrecision();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (inputPrecision == Precision::BF16 || outputPrecision == Precision::BF16) {
|
|
||||||
if (!mayiuse(avx512_core_bf16))
|
|
||||||
inputPrecision = outputPrecision = Precision::FP32;
|
|
||||||
else
|
|
||||||
inputPrecision = outputPrecision = Precision::BF16;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision);
|
|
||||||
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision);
|
|
||||||
|
|
||||||
input_prec = inputPrecision;
|
|
||||||
output_prec = outputPrecision;
|
|
||||||
src_data_size = MKLDNNExtensionUtils::sizeOfDataType(inputDataType);
|
|
||||||
dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(outputDataType);
|
|
||||||
|
|
||||||
InferenceEngine::LayerConfig config;
|
|
||||||
config.dynBatchSupport = false;
|
|
||||||
config.inConfs.resize(1);
|
|
||||||
config.outConfs.resize(1);
|
|
||||||
config.inConfs[0].constant = false;
|
|
||||||
config.outConfs[0].constant = false;
|
|
||||||
config.inConfs[0].inPlace = -1;
|
|
||||||
config.outConfs[0].inPlace = -1;
|
|
||||||
|
|
||||||
auto pushDesc = [&](memory::format format) {
|
|
||||||
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format);
|
|
||||||
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, format);
|
|
||||||
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, format});
|
|
||||||
};
|
|
||||||
|
|
||||||
if (type == "caffe.ResampleParameter.NEAREST") {
|
|
||||||
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
|
|
||||||
pushDesc(memory::nhwc);
|
|
||||||
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
|
|
||||||
pushDesc(memory::ndhwc);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isFloatCompatible(inputPrecision) && isFloatCompatible(outputPrecision)) {
|
|
||||||
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
|
|
||||||
if (mayiuse(cpu::avx512_common)) {
|
|
||||||
pushDesc(memory::nChw16c);
|
|
||||||
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
|
|
||||||
pushDesc(memory::nChw8c);
|
|
||||||
}
|
|
||||||
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
|
|
||||||
if (mayiuse(cpu::avx512_common)) {
|
|
||||||
pushDesc(memory::nCdhw16c);
|
|
||||||
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
|
|
||||||
pushDesc(memory::nCdhw8c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (fusedWith.empty()) {
|
|
||||||
pushDesc(MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (type == "caffe.ResampleParameter.LINEAR") {
|
|
||||||
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
|
|
||||||
pushDesc(memory::nchw);
|
|
||||||
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
|
|
||||||
pushDesc(memory::ncdhw);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void MKLDNNResampleNode::createPrimitive() {
|
|
||||||
auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
|
|
||||||
auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
|
|
||||||
if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
|
|
||||||
THROW_IE_EXCEPTION << "Destination memory didn't allocate.";
|
|
||||||
if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr())
|
|
||||||
THROW_IE_EXCEPTION << "Input memory didn't allocate.";
|
|
||||||
if (getSelectedPrimitiveDescriptor() == nullptr)
|
|
||||||
THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
|
|
||||||
|
|
||||||
auto selectedPD = getSelectedPrimitiveDescriptor();
|
|
||||||
Layout selected_layout = selectedPD->getConfig().inConfs[0].desc.getLayout();
|
|
||||||
auto jcp = jit_resample_config_params();
|
|
||||||
jcp.src_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[0].desc.getPrecision());
|
|
||||||
jcp.dst_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().outConfs[0].desc.getPrecision());
|
|
||||||
jcp.src_data_size = MKLDNNExtensionUtils::sizeOfDataType(jcp.src_dt);
|
|
||||||
jcp.dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(jcp.dst_dt);
|
|
||||||
jcp.planar_layout = MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims()) == selected_layout;
|
|
||||||
jcp.nhwc_format = (selected_layout == NHWC) || (selected_layout == NDHWC);
|
|
||||||
|
|
||||||
if (type == "caffe.ResampleParameter.NEAREST") {
|
|
||||||
if (mayiuse(cpu::avx512_common)) {
|
|
||||||
if (jcp.planar_layout) {
|
|
||||||
resample_nearest_kernel.reset(new jit_uni_resample_nearest_kernel_f32<cpu::avx2>(jcp, *attr.get()));
|
|
||||||
blk_size = 8;
|
|
||||||
} else {
|
|
||||||
resample_nearest_kernel.reset(new jit_uni_resample_nearest_kernel_f32<cpu::avx512_common>(jcp, *attr.get()));
|
|
||||||
blk_size = 16;
|
|
||||||
}
|
|
||||||
} else if (mayiuse(cpu::avx2)) {
|
|
||||||
resample_nearest_kernel.reset(new jit_uni_resample_nearest_kernel_f32<cpu::avx2>(jcp, *attr.get()));
|
|
||||||
blk_size = 8;
|
|
||||||
} else if (mayiuse(cpu::sse42) && !jcp.planar_layout) {
|
|
||||||
resample_nearest_kernel.reset(new jit_uni_resample_nearest_kernel_f32<cpu::sse42>(jcp, *attr.get()));
|
|
||||||
blk_size = 8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
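// blk_size is the per-iteration element count of the JIT kernel: 16 for the AVX-512 blocked
// kernel, 8 otherwise; note that the planar path on AVX-512 machines falls back to the
// 8-wide AVX2 kernel.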
||||||
|
|
||||||
void MKLDNNResampleNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) {
|
|
||||||
int blob_idx = 0;
|
|
||||||
mkldnn::post_ops ops;
|
|
||||||
|
|
||||||
for (auto &node : fusedWith) {
|
|
||||||
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode *>(node.get());
|
|
||||||
if (quantizeNode) {
|
|
||||||
quantizeNode->appendPostOps(ops);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
|
|
||||||
if (eltwiseNode) {
|
|
||||||
eltwiseNode->appendPostOps(ops);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
THROW_IE_EXCEPTION << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented";
|
|
||||||
}
|
|
||||||
|
|
||||||
attr.set_post_ops(ops);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void MKLDNNResampleNode::execute(mkldnn::stream strm) {
|
|
||||||
auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
|
|
||||||
auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
|
|
||||||
|
|
||||||
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
|
|
||||||
|
|
||||||
SizeVector src_dim = getParentEdgeAt(0)->getDesc().getDims();
|
|
||||||
SizeVector dst_dim = getChildEdgeAt(0)->getDesc().getDims();
|
|
||||||
|
|
||||||
size_t dims_size = src_dim.size();
|
|
||||||
size_t N = src_dim[0];
|
|
||||||
size_t C = src_dim[1];
|
|
||||||
size_t ID = (dims_size == 5) ? src_dim[dims_size - 3] : 1lu;
|
|
||||||
size_t IH = src_dim[dims_size - 2];
|
|
||||||
size_t IW = src_dim[dims_size - 1];
|
|
||||||
|
|
||||||
size_t OD = (dims_size == 5) ? dst_dim[dims_size - 3] : 1lu;
|
|
||||||
size_t OH = dst_dim[dims_size - 2];
|
|
||||||
size_t OW = dst_dim[dims_size - 1];
|
|
||||||
|
|
||||||
float fx = static_cast<float>(IW) / static_cast<float>(OW);
|
|
||||||
float fy = static_cast<float>(IH) / static_cast<float>(OH);
|
|
||||||
float fz = static_cast<float>(ID) / static_cast<float>(OD);
|
|
||||||
|
|
||||||
if (type == "caffe.ResampleParameter.NEAREST") {
|
|
||||||
if (layout == NCHW || layout == NCDHW) {
|
|
||||||
if (output_prec == Precision::FP32) {
|
|
||||||
auto src_data = reinterpret_cast<const float*>(srcMemPtr->GetData());
|
|
||||||
auto dst_data = reinterpret_cast<float*>(dstMemPtr->GetData());
|
|
||||||
NearestNeighbor_PLN<float, float>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else if (output_prec == Precision::BF16) {
|
|
||||||
auto src_data = reinterpret_cast<const bfloat16_t*>(srcMemPtr->GetData());
|
|
||||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dstMemPtr->GetData());
|
|
||||||
NearestNeighbor_PLN<bfloat16_t, bfloat16_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else {
|
|
||||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (output_prec == Precision::U8) {
|
|
||||||
auto dst_data = reinterpret_cast<uint8_t *>(dstMemPtr->GetData());
|
|
||||||
if (input_prec == Precision::U8) {
|
|
||||||
auto src_data = reinterpret_cast<const uint8_t *>(srcMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<uint8_t, uint8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else if (input_prec == Precision::I8) {
|
|
||||||
auto src_data = reinterpret_cast<const int8_t *>(srcMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<int8_t, uint8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else if (input_prec == Precision::FP32) {
|
|
||||||
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<float, uint8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else {
|
|
||||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
|
||||||
}
|
|
||||||
} else if (output_prec == Precision::I8) {
|
|
||||||
auto dst_data = reinterpret_cast<int8_t *>(dstMemPtr->GetData());
|
|
||||||
if (input_prec == Precision::U8) {
|
|
||||||
auto src_data = reinterpret_cast<const uint8_t *>(srcMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<uint8_t, int8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else if (input_prec == Precision::I8) {
|
|
||||||
auto src_data = reinterpret_cast<const int8_t *>(srcMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<int8_t, int8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else if (input_prec == Precision::FP32) {
|
|
||||||
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<float, int8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else {
|
|
||||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
|
||||||
}
|
|
||||||
} else if (output_prec == Precision::FP32) {
|
|
||||||
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
|
|
||||||
if (input_prec == Precision::U8) {
|
|
||||||
auto src_data = reinterpret_cast<const uint8_t *>(srcMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<uint8_t, float>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else if (input_prec == Precision::I8) {
|
|
||||||
auto src_data = reinterpret_cast<const int8_t *>(srcMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<int8_t, float>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else if (input_prec == Precision::FP32) {
|
|
||||||
auto src_data = reinterpret_cast<float *>(srcMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<float, float>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else {
|
|
||||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
|
||||||
}
|
|
||||||
} else if (output_prec == Precision::BF16) {
|
|
||||||
auto src_data = reinterpret_cast<const bfloat16_t*>(srcMemPtr->GetData());
|
|
||||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dstMemPtr->GetData());
|
|
||||||
NearestNeighbor_BLK<bfloat16_t, bfloat16_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW);
|
|
||||||
} else {
|
|
||||||
THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (type == "caffe.ResampleParameter.LINEAR") {
|
|
||||||
// currently no fusion, the input and output precision is the same
|
|
||||||
bool isDownsample = (fx > 1) || (fy > 1) || (fz > 1);
|
|
||||||
int kernel_width = 2;
|
|
||||||
if (input_prec == Precision::U8) {
|
|
||||||
auto src_data = reinterpret_cast<const uint8_t *>(srcMemPtr->GetData());
|
|
||||||
auto dst_data = reinterpret_cast<uint8_t *>(dstMemPtr->GetData());
|
|
||||||
LinearInterpolation<uint8_t, uint8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, isDownsample && antialias);
|
|
||||||
} else if (input_prec == Precision::I8) {
|
|
||||||
auto src_data = reinterpret_cast<const int8_t *>(srcMemPtr->GetData());
|
|
||||||
auto dst_data = reinterpret_cast<int8_t *>(dstMemPtr->GetData());
|
|
||||||
LinearInterpolation<int8_t, int8_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, isDownsample && antialias);
|
|
||||||
} else if (input_prec == Precision::FP32) {
|
|
||||||
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
|
|
||||||
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
|
|
||||||
LinearInterpolation<float, float>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, isDownsample && antialias);
|
|
||||||
} else if (input_prec == Precision::BF16) {
|
|
||||||
auto src_data = reinterpret_cast<const bfloat16_t*>(srcMemPtr->GetData());
|
|
||||||
auto dst_data = reinterpret_cast<bfloat16_t*>(dstMemPtr->GetData());
|
|
||||||
LinearInterpolation<bfloat16_t, bfloat16_t>(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width,
|
|
||||||
isDownsample && antialias);
|
|
||||||
} else {
|
|
||||||
THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
THROW_IE_EXCEPTION << "Unsupported resample parameter type: " << type;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// f32 and no fused, f32->input is f32, no fuse->output is f32
|
|
||||||
template <typename in_data_t, typename out_data_t>
|
|
||||||
void MKLDNNResampleNode::NearestNeighbor_PLN(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW,
|
|
||||||
float fx, float fy, float fz, int OD, int OH, int OW) {
|
|
||||||
std::vector<int> index_buffer(OD * OH * OW);
|
|
||||||
for (int oz = 0; oz < OD; oz++) {
|
|
||||||
float iz = oz * fz;
|
|
||||||
int iz_offset = static_cast<int>(std::floor(iz)) * IH * IW;
|
|
||||||
int oz_offset = oz * OH * OW;
|
|
||||||
for (int oy = 0; oy < OH; oy++) {
|
|
||||||
float iy = oy * fy;
|
|
||||||
int iy_offset = static_cast<int>(std::floor(iy)) * IW + iz_offset;
|
|
||||||
int oy_offset = oy * OW + oz_offset;
|
|
||||||
for (int ox = 0; ox < OW; ox++) {
|
|
||||||
float ix = ox * fx;
|
|
||||||
int ix_index = static_cast<int>(std::floor(ix)) + iy_offset;
|
|
||||||
index_buffer[oy_offset + ox] = ix_index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (resample_nearest_kernel) {
|
|
||||||
parallel_for2d(B, C, [&](size_t b, size_t c) {
|
|
||||||
const in_data_t *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c;
|
|
||||||
out_data_t *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c;
|
|
||||||
|
|
||||||
// for OW*OH*OD
|
|
||||||
auto arg = jit_resample_call_args();
|
|
||||||
arg.src = in_ptr;
|
|
||||||
arg.dst = out_ptr;
|
|
||||||
arg.index = static_cast<int*>(&index_buffer[0]);
|
|
||||||
arg.index_stride = blk_size * sizeof(int);
|
|
||||||
arg.dst_stride = blk_size * dst_data_size;
|
|
||||||
arg.work_amount = OW * OH * OD / blk_size;
|
|
||||||
(*resample_nearest_kernel)(&arg);
|
|
||||||
|
|
||||||
int tail_start = (OW * OH * OD / blk_size) * blk_size;
|
|
||||||
for (int tail = tail_start; tail < OW * OH * OD; tail++) {
|
|
||||||
out_ptr[tail] = in_ptr[index_buffer[tail]];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
parallel_for2d(B, C, [&](size_t b, size_t c) {
|
|
||||||
const in_data_t *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c;
|
|
||||||
out_data_t *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c;
|
|
||||||
|
|
||||||
for (int i_dst = 0; i_dst < OW * OH * OD; i_dst++) {
|
|
||||||
out_ptr[i_dst] = in_ptr[index_buffer[i_dst]];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
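// Illustrative sketch only (not part of the node): the same nearest-neighbour index
// precomputation as above, reduced to one spatial dimension. Names are hypothetical;
// for each output position ox the source column is floor(ox * fx) with fx = IW / OW.
#include <cmath>
#include <vector>

static std::vector<int> precompute_nearest_indices(int IW, int OW) {
    std::vector<int> index(OW);
    const float fx = static_cast<float>(IW) / static_cast<float>(OW);
    for (int ox = 0; ox < OW; ox++) {
        // e.g. IW = 4, OW = 8 (fx = 0.5) gives 0, 0, 1, 1, 2, 2, 3, 3
        index[ox] = static_cast<int>(std::floor(ox * fx));
    }
    return index;
}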
||||||
|
|
||||||
// for ndhwc and nCdhw8/16d
|
|
||||||
// int8->input may be int8, fused->output may be int8
|
|
||||||
template <typename in_data_t, typename out_data_t>
|
|
||||||
void MKLDNNResampleNode::NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW,
|
|
||||||
float fx, float fy, float fz, int OD, int OH, int OW) {
|
|
||||||
std::vector<int> index_d(OD);
|
|
||||||
std::vector<int> index_h(OH);
|
|
||||||
std::vector<int> index_w(OW);
|
|
||||||
for (int oz = 0; oz < OD; oz++) {
|
|
||||||
float iz = oz * fz;
|
|
||||||
index_d[oz] = static_cast<int>(std::floor(iz));
|
|
||||||
}
|
|
||||||
for (int oy = 0; oy < OH; oy++) {
|
|
||||||
float iy = oy * fy;
|
|
||||||
index_h[oy] = static_cast<int>(std::floor(iy));
|
|
||||||
}
|
|
||||||
for (int ox = 0; ox < OW; ox++) {
|
|
||||||
float ix = ox * fx;
|
|
||||||
index_w[ox] = static_cast<int>(std::floor(ix));
|
|
||||||
}
|
|
||||||
|
|
||||||
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
|
|
||||||
bool is_nhwc = (layout == NHWC || layout == NDHWC) ? true : false;
|
|
||||||
|
|
||||||
for (int b = 0; b < B; b++) {
|
|
||||||
if (is_nhwc) {
|
|
||||||
const in_data_t *in_ptr = in_ptr_ + IW * IH * ID * C * b;
|
|
||||||
out_data_t *out_ptr = out_ptr_ + OW * OH * OD * C * b;
|
|
||||||
if (resample_nearest_kernel) {
|
|
||||||
int tail = (C / blk_size) * blk_size;
|
|
||||||
parallel_for2d(OD, OH, [&](size_t d, size_t h) {
|
|
||||||
// better that same core process continuous memory
|
|
||||||
out_data_t *out_ptr_dh = out_ptr + C * OW * OH * d + C * OW * h;
|
|
||||||
const in_data_t *in_ptr_dh = in_ptr + C * IW * IH * index_d[d] + C * IW * index_h[h];
|
|
||||||
auto arg = jit_resample_call_args();
|
|
||||||
for (int ox = 0; ox < OW; ox++) {
|
|
||||||
// kernel for OC
|
|
||||||
arg.dst = out_ptr_dh + C * ox;
|
|
||||||
arg.src = in_ptr_dh + C * index_w[ox];
|
|
||||||
arg.dst_stride = blk_size * sizeof(out_data_t);
|
|
||||||
arg.src_stride = blk_size * sizeof(in_data_t);
|
|
||||||
arg.work_amount = C / blk_size;
|
|
||||||
arg.oc_off = 0;
|
|
||||||
(*resample_nearest_kernel)(&arg);
|
|
||||||
}
|
|
||||||
// tail
|
|
||||||
if (tail != C) {
|
|
||||||
for (int ox = 0; ox < OW; ox++) {
|
|
||||||
out_data_t *out_ptr_dhw = out_ptr_dh + C * ox;
|
|
||||||
const in_data_t *in_ptr_dhw = in_ptr_dh + C * index_w[ox];
|
|
||||||
if (fusedWith.empty() && output_prec == input_prec) {
|
|
||||||
cpu_memcpy(out_ptr_dhw + tail, in_ptr_dhw + tail, (C - tail) * sizeof(in_data_t));
|
|
||||||
} else {
|
|
||||||
for (int c = tail; c < C; c++) {
|
|
||||||
float dst_value = static_cast<float>(in_ptr_dhw[c]);
|
|
||||||
apply_post_ops_scalar(dst_value, c);
|
|
||||||
if (isFloatCompatible(output_prec)) {
|
|
||||||
out_ptr_dhw[c] = dst_value;
|
|
||||||
} else if (output_prec == Precision::U8) {
|
|
||||||
out_ptr_dhw[c] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
|
||||||
} else if (output_prec == Precision::I8) {
|
|
||||||
out_ptr_dhw[c] = lroundf(dst_value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} else { // without kernel
|
|
||||||
parallel_for2d(OD, OH, [&](size_t d, size_t h) {
|
|
||||||
out_data_t *out_ptr_dh = out_ptr + C * OW * OH * d + C * OW * h;
|
|
||||||
const in_data_t *in_ptr_dh = in_ptr + C * IW * IH * index_d[d] + C * IW * index_h[h];
|
|
||||||
for (int ox = 0; ox < OW; ox++) {
|
|
||||||
out_data_t *out_ptr_dhw = out_ptr_dh + C * ox;
|
|
||||||
const in_data_t *in_ptr_dhw = in_ptr_dh + C * index_w[ox];
|
|
||||||
if (fusedWith.empty() && output_prec == input_prec) {
|
|
||||||
cpu_memcpy(out_ptr_dhw, in_ptr_dhw, C * sizeof(in_data_t));
|
|
||||||
} else {
|
|
||||||
for (int c = 0; c < C; c++) {
|
|
||||||
float dst_value = static_cast<float>(in_ptr_dhw[c]);
|
|
||||||
apply_post_ops_scalar(dst_value, c);
|
|
||||||
if (isFloatCompatible(output_prec)) {
|
|
||||||
out_ptr_dhw[c] = dst_value;
|
|
||||||
} else if (output_prec == Precision::U8) {
|
|
||||||
out_ptr_dhw[c] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
|
||||||
} else if (output_prec == Precision::I8) {
|
|
||||||
out_ptr_dhw[c] = lroundf(dst_value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} else { // for nC(d)hw8/16c
|
|
||||||
int CB = div_up(C, blk_size);
|
|
||||||
const in_data_t *in_ptr = in_ptr_ + IW * IH * ID * CB * blk_size * b;
|
|
||||||
out_data_t *out_ptr = out_ptr_ + OW * OH * OD * CB * blk_size * b;
|
|
||||||
if (resample_nearest_kernel) {
|
|
||||||
std::vector<int> index_w_kernel(OW);
|
|
||||||
for (int ox = 0; ox < OW; ox++) {
|
|
||||||
index_w_kernel[ox] = index_w[ox] * blk_size * sizeof(in_data_t);
|
|
||||||
}
|
|
||||||
parallel_for2d(CB, OD, [&](size_t cb, size_t d) {
|
|
||||||
out_data_t *out_ptr_cbd = out_ptr + blk_size * OW * OH * OD * cb + blk_size * OW * OH * d;
|
|
||||||
const in_data_t *in_ptr_cbd = in_ptr + blk_size * IW * IH * ID * cb + blk_size * IW * IH * index_d[d];
|
|
||||||
auto arg = jit_resample_call_args();
|
|
||||||
for (int h = 0; h < OH; h++) { // kernel for blk_size * OW
|
|
||||||
arg.dst = out_ptr_cbd + blk_size * OW * h;
|
|
||||||
arg.src = in_ptr_cbd + blk_size * IW * index_h[h];
|
|
||||||
arg.index = static_cast<int*>(&(index_w_kernel[0]));
|
|
||||||
arg.dst_stride = static_cast<size_t>(blk_size * sizeof(out_data_t));
|
|
||||||
arg.index_stride = static_cast<size_t>(1 * sizeof(int));
|
|
||||||
arg.work_amount = static_cast<size_t>(OW);
|
|
||||||
arg.oc_off = cb * blk_size;
|
|
||||||
(*resample_nearest_kernel)(&arg);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
parallel_for2d(CB, OD, [&](int cb, int d) {
|
|
||||||
out_data_t *out_ptr_cbd = out_ptr + blk_size * OW * OH * OD * cb + blk_size * OW * OH * d;
|
|
||||||
const in_data_t *in_ptr_cbd = in_ptr + blk_size * IW * IH * ID * cb + blk_size * IW * IH * index_d[d];
|
|
||||||
for (int h = 0; h < OH; h++) {
|
|
||||||
out_data_t *out_ptr_cbdh = out_ptr_cbd + blk_size * OW * h;
|
|
||||||
const in_data_t *in_ptr_cbdh = in_ptr_cbd + blk_size * IW * index_h[h];
|
|
||||||
for (int w = 0; w < OW; w++) {
|
|
||||||
out_data_t *out_ptr_cbdhw = out_ptr_cbdh + blk_size * w;
|
|
||||||
const in_data_t *in_ptr_cbdhw = in_ptr_cbdh + blk_size * index_w[w];
|
|
||||||
if (fusedWith.empty()) {
|
|
||||||
cpu_memcpy(out_ptr_cbdhw, in_ptr_cbdhw, blk_size * sizeof(in_data_t));
|
|
||||||
} else {
|
|
||||||
for (int blk = 0; blk < blk_size; blk++) {
|
|
||||||
float dst_value = static_cast<float>(in_ptr_cbdhw[blk]);
|
|
||||||
apply_post_ops_scalar(dst_value, cb * blk_size + blk);
|
|
||||||
if (isFloatCompatible(output_prec)) {
|
|
||||||
out_ptr_cbdhw[blk] = dst_value;
|
|
||||||
} else if (output_prec == Precision::U8) {
|
|
||||||
out_ptr_cbdhw[blk] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
|
||||||
} else if (output_prec == Precision::I8) {
|
|
||||||
out_ptr_cbdhw[blk] = lroundf(dst_value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // batch end
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline float triangleCoeff(float x) {
|
|
||||||
return (std::max)(0.0f, 1 - std::abs(x));
|
|
||||||
}
|
|
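// The weight used by LinearInterpolation below is the separable triangle (tent) filter:
//   w(x) = max(0, 1 - |x|),   weight = ax * w(ax * dx) * ay * w(ay * dy) * az * w(az * dz),
// where ax = 1 / fx (and likewise ay, az) only when antialiasing a downsample, and 1 otherwise.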
||||||
|
|
||||||
template <typename in_data_t, typename out_data_t>
|
|
||||||
void MKLDNNResampleNode::LinearInterpolation(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW,
|
|
||||||
float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias) {
|
|
||||||
if (IW == OW && IH == OH && ID == OD) {
|
|
||||||
size_t size = B * C * ID * IH * IW;
|
|
||||||
if (isFloatCompatible(input_prec)) {
|
|
||||||
size *= sizeof(in_data_t);
|
|
||||||
}
|
|
||||||
cpu_memcpy(out_ptr_, in_ptr_, size);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t b = 0; b < B; b++) {
|
|
||||||
const in_data_t *in_ptr_n = in_ptr_ + IW * IH * ID * C * b;
|
|
||||||
out_data_t *out_ptr_n = out_ptr_ + OW * OH * OD * C * b;
|
|
||||||
for (size_t c = 0; c < C; c++) {
|
|
||||||
const in_data_t *in_ptr_nc = in_ptr_n + IW * IH * ID * c;
|
|
||||||
out_data_t *out_ptr_nc = out_ptr_n + OW * OH * OD * c;
|
|
||||||
|
|
||||||
for (size_t oz = 0; oz < OD; oz++) {
|
|
||||||
out_data_t *out_ptr_ncd = out_ptr_nc + OW * OH * oz;
|
|
||||||
for (size_t oy = 0; oy < OH; oy++) {
|
|
||||||
out_data_t *out_ptr_ncdh = out_ptr_ncd + OW * oy;
|
|
||||||
for (size_t ox = 0; ox < OW; ox++) {
|
|
||||||
float ix = ox * fx + fx / 2.0f - 0.5f;
|
|
||||||
float iy = oy * fy + fy / 2.0f - 0.5f;
|
|
||||||
float iz = oz * fz + fz / 2.0f - 0.5f;
|
|
||||||
|
|
||||||
int ix_r = static_cast<int>(round(ix));
|
|
||||||
int iy_r = static_cast<int>(round(iy));
|
|
||||||
int iz_r = static_cast<int>(round(iz));
|
|
||||||
|
|
||||||
float sum = 0;
|
|
||||||
float wsum = 0;
|
|
||||||
|
|
||||||
float ax = 1.0f / (antialias ? fx : 1.0f);
|
|
||||||
float ay = 1.0f / (antialias ? fy : 1.0f);
|
|
||||||
float az = 1.0f / (antialias ? fz : 1.0f);
|
|
||||||
|
|
||||||
int rx = (fx < 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / ax));
|
|
||||||
int ry = (fy < 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / ay));
|
|
||||||
int rz = (fz < 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / az));
|
|
||||||
|
|
||||||
for (int z = iz_r - rz; z <= iz_r + rz; z++) {
|
|
||||||
for (int y = iy_r - ry; y <= iy_r + ry; y++) {
|
|
||||||
for (int x = ix_r - rx; x <= ix_r + rx; x++) {
|
|
||||||
bool is_continue = z < 0 ||
|
|
||||||
y < 0 ||
|
|
||||||
x < 0 ||
|
|
||||||
z >= static_cast<int>(ID) ||
|
|
||||||
y >= static_cast<int>(IH) ||
|
|
||||||
x >= static_cast<int>(IW);
|
|
||||||
if (is_continue)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
float dx = ix - x;
|
|
||||||
float dy = iy - y;
|
|
||||||
float dz = iz - z;
|
|
||||||
|
|
||||||
float w = ax * triangleCoeff(ax * dx) *
|
|
||||||
ay * triangleCoeff(ay * dy) *
|
|
||||||
az * triangleCoeff(az * dz);
|
|
||||||
|
|
||||||
sum += w * static_cast<float>(in_ptr_nc[z * IH * IW + y * IW + x]);
|
|
||||||
wsum += w;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!wsum) {
|
|
||||||
out_ptr_ncdh[ox] = 0;
|
|
||||||
} else {
|
|
||||||
float dst_value = sum / wsum;
|
|
||||||
if (isFloatCompatible(output_prec)) {
|
|
||||||
out_ptr_ncdh[ox] = dst_value;
|
|
||||||
} else if (output_prec == Precision::U8) {
|
|
||||||
out_ptr_ncdh[ox] = (dst_value >= 0) ? lroundf(dst_value) : 0;
|
|
||||||
} else if (output_prec == Precision::I8) {
|
|
||||||
out_ptr_ncdh[ox] = lroundf(dst_value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
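// Note on the coordinate mapping above: ix = ox * fx + fx / 2 - 0.5 samples at pixel centres
// (the "half-pixel" convention). For example, upsampling IW = 2 to OW = 4 gives fx = 0.5 and
// source positions -0.25, 0.25, 0.75, 1.25 for ox = 0..3.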
||||||
|
|
||||||
inline void MKLDNNResampleNode::apply_post_ops_scalar(float &dst_value, int index_c) {
|
|
||||||
const auto &p = (*attr.get()).post_ops_;
|
|
||||||
for (int i = 0; i < p.len_; i++) {
|
|
||||||
auto &post_op = p.entry_[i];
|
|
||||||
if (post_op.is_eltwise()) {
|
|
||||||
// only eltwise_relu supported
|
|
||||||
if (dst_value < 0) dst_value = 0;
|
|
||||||
} else if (post_op.is_depthwise()) {
|
|
||||||
// only ScaleShift supported
|
|
||||||
float scale = post_op.depthwise.weights_data[index_c];
|
|
||||||
float shift = post_op.depthwise.biases_data[index_c];
|
|
||||||
dst_value = dst_value * scale + shift;
|
|
||||||
} else if (post_op.is_quantization()) {
|
|
||||||
bool do_dequantization = post_op.quantization.alg ==
|
|
||||||
alg_kind::quantization_quantize_dequantize;
|
|
||||||
bool do_rounding = do_dequantization || isFloatCompatible(output_prec) ||
|
|
||||||
i != p.len_ - 1;
|
|
||||||
|
|
||||||
auto quant = post_op.quantization;
|
|
||||||
|
|
||||||
float crop_low = quant.crop_low_data->shifts_[quant.crop_low_data->count_ == 1 ? 0 : index_c];
|
|
||||||
float crop_high = quant.crop_high_data->shifts_[quant.crop_high_data->count_ == 1 ? 0 : index_c];
|
|
||||||
float input_scale = quant.input_scale_data->scales_[quant.input_scale_data->count_ == 1 ? 0 : index_c];
|
|
||||||
float input_shift = quant.input_shift_data->shifts_[quant.input_shift_data->count_ == 1 ? 0 : index_c];
|
|
||||||
|
|
||||||
dst_value = nstl::min(crop_high, nstl::max(crop_low, dst_value));
|
|
||||||
dst_value = dst_value * input_scale + input_shift;
|
|
||||||
|
|
||||||
if (do_rounding) {
|
|
||||||
dst_value = roundf(dst_value);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (do_dequantization) {
|
|
||||||
float output_scale = quant.output_scale_data->scales_[quant.output_scale_data->count_ == 1 ? 0 : index_c];
|
|
||||||
float output_shift = quant.output_shift_data->shifts_[quant.output_shift_data->count_ == 1 ? 0 : index_c];
|
|
||||||
dst_value = dst_value * output_scale + output_shift;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
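// Illustrative sketch only (hypothetical helper, not part of the node): the scalar
// quantize/dequantize path applied above for one channel, with the crop/scale/shift values
// taken from the fused Quantize node; the conditional rounding and per-channel lookup are
// omitted for brevity.
#include <algorithm>
#include <cmath>

static float fake_quantize_scalar(float x, float crop_low, float crop_high,
                                  float in_scale, float in_shift,
                                  float out_scale, float out_shift) {
    x = std::min(crop_high, std::max(crop_low, x));  // crop to the input range
    x = x * in_scale + in_shift;                     // quantize
    x = std::roundf(x);                              // round to the integer grid
    return x * out_scale + out_shift;                // dequantize back to float
}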
||||||
|
|
||||||
bool MKLDNNResampleNode::created() const {
|
|
||||||
return getType() == Resample;
|
|
||||||
}
|
|
||||||
|
|
||||||
REG_MKLDNN_PRIM_FOR(MKLDNNResampleNode, Resample);
|
|
@ -1,109 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ie_common.h>
#include <mkldnn_node.h>
#include <string>
#include <memory>
#include <vector>

namespace MKLDNNPlugin {

struct jit_resample_config_params {
    bool planar_layout;
    bool nhwc_format;
    mkldnn::memory::data_type src_dt;
    mkldnn::memory::data_type dst_dt;
    int src_data_size;
    int dst_data_size;
};

struct jit_resample_call_args {
    const void *src;
    const int *index;
    void *dst;
    size_t src_stride;
    size_t index_stride;
    size_t dst_stride;
    size_t work_amount;
    size_t oc_off;
};

struct jit_uni_resample_nearest_kernel {
    void (*ker_)(const jit_resample_call_args *);

    void operator()(const jit_resample_call_args *args) {
        assert(ker_);
        ker_(args);
    }

    explicit jit_uni_resample_nearest_kernel(jit_resample_config_params jcp, const mkldnn_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {}
    virtual ~jit_uni_resample_nearest_kernel() {}

    jit_resample_config_params jcp_;
    const mkldnn_primitive_attr &attr_;
};

struct jit_uni_resample_linear_kernel {
    void (*ker_)(const jit_resample_call_args *);

    void operator()(const jit_resample_call_args *args) {
        assert(ker_);
        ker_(args);
    }

    explicit jit_uni_resample_linear_kernel(jit_resample_config_params jcp, const mkldnn_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {}
    virtual ~jit_uni_resample_linear_kernel() {}

    jit_resample_config_params jcp_;
    const mkldnn_primitive_attr &attr_;
};


class MKLDNNResampleNode : public MKLDNNNode {
public:
    MKLDNNResampleNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
    ~MKLDNNResampleNode() override = default;

    void getSupportedDescriptors() override;
    void initSupportedPrimitiveDescriptors() override;
    void createPrimitive() override;
    bool created() const override;
    void execute(mkldnn::stream strm) override;
    bool canBeInPlace() const override {
        return false;
    }

private:
    template <typename in_data_t, typename out_data_t>
    void NearestNeighbor_PLN(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW,
                             float fx, float fy, float fz, int OD, int OH, int OW);
    template <typename in_data_t, typename out_data_t>
    void NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW,
                             float fx, float fy, float fz, int OD, int OH, int OW);
    template <typename in_data_t, typename out_data_t>
    void LinearInterpolation(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW,
                             float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias);
    void setPostOps(mkldnn::primitive_attr &attr, bool initWeights = false);
    inline void apply_post_ops_scalar(float &dst_value, int index_c);

    int blk_size;

    std::string type;
    bool antialias;
    float factor;

    mkldnn::primitive_attr attr;
    std::vector<MKLDNNMemoryPtr> PostOpsIntBlobMemory;

    InferenceEngine::Precision input_prec, output_prec;
    size_t src_data_size, dst_data_size;

    std::shared_ptr<jit_uni_resample_nearest_kernel> resample_nearest_kernel;
};

} // namespace MKLDNNPlugin

@ -55,6 +55,7 @@ protected:
 std::vector<int64_t> axes;
 std::vector<float> scales;
 std::tie(mode, shapeCalcMode, coordinateTransformMode, nearestMode, antialias, padBegin, padEnd, cubeCoef, axes, scales) = interpolateParams;
+inPrc = outPrc = netPrecision;

 using ShapeCalcMode = ngraph::op::v4::Interpolate::ShapeCalcMode;

@ -81,6 +82,8 @@ protected:
 interpolate->get_rt_info() = getCPUInfo();
 const ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(interpolate)};
 function = std::make_shared<ngraph::Function>(results, params, "interpolate");

+selectedType = getPrimitiveType() + "_" + inPrc.name();
 }
 };

@ -99,7 +102,6 @@ std::vector<CPUSpecificParams> filterCPUInfoForDevice() {
 if (with_cpu_x86_avx512f()) {
 resCPUParams.push_back(CPUSpecificParams{{nChw16c, x, x}, {nChw16c}, {"jit_avx512"}, "jit_avx512_FP32"});
 resCPUParams.push_back(CPUSpecificParams{{nhwc, x, x}, {nhwc}, {"jit_avx512"}, "jit_avx512_FP32"});
-resCPUParams.push_back(CPUSpecificParams{{nchw, x, x}, {nchw}, {"jit_avx2"}, "jit_avx2_FP32"});
 } else if (with_cpu_x86_avx2()) {
 resCPUParams.push_back(CPUSpecificParams{{nChw8c, x, x}, {nChw8c}, {"jit_avx2"}, "jit_avx2_FP32"});
 resCPUParams.push_back(CPUSpecificParams{{nhwc, x, x}, {nhwc}, {"jit_avx2"}, "jit_avx2_FP32"});
@ -115,7 +117,8 @@ std::vector<CPUSpecificParams> filterCPUInfoForDevice() {
 /* ========== */

 const std::vector<InferenceEngine::Precision> netPrecisions = {
-InferenceEngine::Precision::FP32
+InferenceEngine::Precision::FP32,
+InferenceEngine::Precision::BF16
 };

 const std::vector<ngraph::op::v4::Interpolate::CoordinateTransformMode> coordinateTransformModes = {

@ -1,254 +0,0 @@
|
|||||||
// Copyright (C) 2018-2020 Intel Corporation
|
|
||||||
// SPDX-License-Identifier: Apache-2.0
|
|
||||||
//
|
|
||||||
|
|
||||||
|
|
||||||
#include "test_graph.hpp"
|
|
||||||
|
|
||||||
#include "single_layer_common.hpp"
|
|
||||||
#include "tests_common.hpp"
|
|
||||||
#include <ie_core.hpp>
|
|
||||||
|
|
||||||
|
|
||||||
using namespace ::testing;
|
|
||||||
using namespace std;
|
|
||||||
using namespace mkldnn;
|
|
||||||
|
|
||||||
|
|
||||||
struct interp_test_params {
|
|
||||||
struct {
|
|
||||||
size_t n;
|
|
||||||
size_t c;
|
|
||||||
size_t h;
|
|
||||||
size_t w;
|
|
||||||
} in;
|
|
||||||
|
|
||||||
struct {
|
|
||||||
size_t h;
|
|
||||||
size_t w;
|
|
||||||
} out;
|
|
||||||
|
|
||||||
int pad_beg;
|
|
||||||
int pad_end;
|
|
||||||
|
|
||||||
size_t num_prim_desc;
|
|
||||||
|
|
||||||
int selectedType;
|
|
||||||
|
|
||||||
std::vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
|
|
||||||
};
|
|
||||||
|
|
||||||
void interpolate(const int N, const int C, const float *src, const int x1, const int y1, const int IH_pad, const int IW_pad,
|
|
||||||
const int IH, const int IW, float *dst, const int x2, const int y2, const int OH_pad, const int OW_pad, const int OH, const int OW) {
|
|
||||||
if (IH_pad == OH_pad && IW_pad == OW_pad) {
|
|
||||||
for (int i = 0; i < N * C * OH * OW; i++) {
|
|
||||||
dst[i] = src[i];
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const float rh = (OH_pad > 1) ? static_cast<float>(IH_pad - 1) / (OH_pad - 1) : 0.0f;
|
|
||||||
const float rw = (OW_pad > 1) ? static_cast<float>(IW_pad - 1) / (OW_pad - 1) : 0.0f;
|
|
||||||
|
|
||||||
const int block_size = 1;
|
|
||||||
|
|
||||||
// Align channel number to block size to deal with channels padding in IE with multiple blobs
|
|
||||||
int CB = (C + block_size - 1) & (-block_size); // CB=n*block_size, i.e.:c=15,(block_size=8), then CB=16, CH=2
|
|
||||||
|
|
||||||
int CH = (C + block_size - 1) / block_size; // number of block:(n)
|
|
||||||
|
|
||||||
for (int n = 0; n < N; n++) {
|
|
||||||
for (int cb = 0; cb < CH; ++cb) {
|
|
||||||
for (int h = 0; h < OH_pad; ++h) {
|
|
||||||
const float *psrc = src + n * CB * IH * IW; // should be nChw8c(16c) data format
|
|
||||||
|
|
||||||
float fh = rh * h;
|
|
||||||
int ih0 = static_cast<int>(fh);
|
|
||||||
int ih1 = (ih0 < IH_pad - 1) ? ih0 + 1 : ih0;
|
|
||||||
|
|
||||||
float h_lambda0 = fh - ih0;
|
|
||||||
float h_lambda1 = 1.0f - h_lambda0;
|
|
||||||
|
|
||||||
for (int w = 0; w < OW_pad; ++w) {
|
|
||||||
float fw = rw * w;
|
|
||||||
int iw0 = static_cast<int>(fw);
|
|
||||||
int iw1 = (iw0 < IW_pad - 1) ? iw0 + 1 : iw0;
|
|
||||||
|
|
||||||
float w_lambda0 = fw - iw0;
|
|
||||||
float w_lambda1 = 1.0f - w_lambda0;
|
|
||||||
|
|
||||||
const float *psrc00 =
|
|
||||||
psrc + cb * block_size * IW * IH + (y1 + ih0) * IW * block_size + (x1 + iw0) * block_size;
|
|
||||||
const float *psrc01 =
|
|
||||||
psrc + cb * block_size * IW * IH + (y1 + ih0) * IW * block_size + (x1 + iw1) * block_size;
|
|
||||||
const float *psrc10 =
|
|
||||||
psrc + cb * block_size * IW * IH + (y1 + ih1) * IW * block_size + (x1 + iw0) * block_size;
|
|
||||||
const float *psrc11 =
|
|
||||||
psrc + cb * block_size * IW * IH + (y1 + ih1) * IW * block_size + (x1 + iw1) * block_size;
|
|
||||||
|
|
||||||
float *pdst = dst + n * CB * OH * OW + cb * block_size * OW * OH + (y2 + h) * OW * block_size +
|
|
||||||
(x2 + w) * block_size;
|
|
||||||
|
|
||||||
for (int c = 0; c < block_size; ++c) {
|
|
||||||
pdst[c] = h_lambda1 * (w_lambda1 * psrc00[c] + w_lambda0 * psrc01[c]) +
|
|
||||||
h_lambda0 * (w_lambda1 * psrc10[c] + w_lambda0 * psrc11[c]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename data_t>
|
|
||||||
void ref_interp(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<data_t> &dst, interp_test_params prm) {
|
|
||||||
int IB = static_cast<int>(src.getTensorDesc().getDims()[0]);
|
|
||||||
int IC = static_cast<int>(src.getTensorDesc().getDims()[1]);
|
|
||||||
int IH = static_cast<int>(src.getTensorDesc().getDims()[2]);
|
|
||||||
int IW = static_cast<int>(src.getTensorDesc().getDims()[3]);
|
|
||||||
|
|
||||||
int OH = static_cast<int>(dst.getTensorDesc().getDims()[2]);
|
|
||||||
int OW = static_cast<int>(dst.getTensorDesc().getDims()[3]);
|
|
||||||
|
|
||||||
int IH_pad = IH + prm.pad_beg + prm.pad_end;
|
|
||||||
int IW_pad = IW + prm.pad_beg + prm.pad_end;
|
|
||||||
|
|
||||||
const data_t *src_data = src.readOnly();
|
|
||||||
data_t *dst_data = dst.data();
|
|
||||||
|
|
||||||
interpolate(IB, IC, src_data, -prm.pad_beg, -prm.pad_beg, IH_pad, IW_pad, IH, IW, dst_data, 0, 0, OH, OW, OH, OW);
|
|
||||||
}
|
|
||||||
|
|
||||||
class MKLDNNCPUExtInterpTests: public TestsCommon, public WithParamInterface<interp_test_params> {
|
|
||||||
std::string model_t = R"V0G0N(
|
|
||||||
<Net Name="Convolution_Only" version="2" precision="FP32" batch="1">
|
|
||||||
<layers>
|
|
||||||
<layer name="in1" type="Input" precision="FP32" id="0">
|
|
||||||
<output>
|
|
||||||
<port id="0">
|
|
||||||
<dim>_IN_</dim>
|
|
||||||
<dim>_IC_</dim>
|
|
||||||
<dim>_IH_</dim>
|
|
||||||
<dim>_IW_</dim>
|
|
||||||
</port>
|
|
||||||
</output>
|
|
||||||
</layer>
|
|
||||||
<layer name="interp1" id="1" type="Interp" precision="FP32">
|
|
||||||
<data pad_beg="_PB_" pad_end="_PE_" height="_OH_" width="_OW_"/>
|
|
||||||
|
|
||||||
<input>
|
|
||||||
<port id="1">
|
|
||||||
<dim>_IN_</dim>
|
|
||||||
<dim>_IC_</dim>
|
|
||||||
<dim>_IH_</dim>
|
|
||||||
<dim>_IW_</dim>
|
|
||||||
</port>
|
|
||||||
</input>
|
|
||||||
<output>
|
|
||||||
<port id="2">
|
|
||||||
<dim>_IN_</dim>
|
|
||||||
<dim>_IC_</dim>
|
|
||||||
<dim>_OH_</dim>
|
|
||||||
<dim>_OW_</dim>
|
|
||||||
</port>
|
|
||||||
</output>
|
|
||||||
</layer>
|
|
||||||
</layers>
|
|
||||||
<edges>
|
|
||||||
<edge from-layer="0" from-port="0" to-layer="1" to-port="1"/>
|
|
||||||
</edges>
|
|
||||||
</Net>
|
|
||||||
)V0G0N";
|
|
||||||
|
|
||||||
std::string getModel(interp_test_params p) {
|
|
||||||
std::string model = model_t;
|
|
||||||
REPLACE_WITH_NUM(model, "_IW_", p.in.w);
|
|
||||||
REPLACE_WITH_NUM(model, "_IH_", p.in.h);
|
|
||||||
REPLACE_WITH_NUM(model, "_IC_", p.in.c);
|
|
||||||
REPLACE_WITH_NUM(model, "_IN_", p.in.n);
|
|
||||||
|
|
||||||
REPLACE_WITH_NUM(model, "_OH_", p.out.h);
|
|
||||||
REPLACE_WITH_NUM(model, "_OW_", p.out.w);
|
|
||||||
|
|
||||||
REPLACE_WITH_NUM(model, "_PB_", p.pad_beg);
|
|
||||||
REPLACE_WITH_NUM(model, "_PE_", p.pad_end);
|
|
||||||
return model;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual void TearDown() {
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void SetUp() {
|
|
||||||
try {
|
|
||||||
TestsCommon::SetUp();
|
|
||||||
interp_test_params p = ::testing::WithParamInterface<interp_test_params>::GetParam();
|
|
||||||
std::string model = getModel(p);
|
|
||||||
|
|
||||||
InferenceEngine::Core core;
|
|
||||||
InferenceEngine::CNNNetwork network;
|
|
||||||
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
|
|
||||||
|
|
||||||
MKLDNNGraphTestClass graph;
|
|
||||||
graph.CreateGraph(network);
|
|
||||||
|
|
||||||
auto& nodes = graph.getNodes();
|
|
||||||
nodes = graph.getNodes();
|
|
||||||
for (auto &node : nodes) {
|
|
||||||
if (node->getName() == "interp1") {
|
|
||||||
ASSERT_LE(p.num_prim_desc, node->getSupportedPrimitiveDescriptors().size());
|
|
||||||
for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
|
|
||||||
p.comp.at(j)(node->getSupportedPrimitiveDescriptors().at(j));
|
|
||||||
}
|
|
||||||
ASSERT_NE(nullptr, node->getSelectedPrimitiveDescriptor());
|
|
||||||
ASSERT_EQ(p.selectedType,
|
|
||||||
node->getSelectedPrimitiveDescriptor()->getImplementationType() & p.selectedType);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ASSERT_LE(4, nodes.size());
|
|
||||||
|
|
||||||
InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
|
|
||||||
|
|
||||||
InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src, InferenceEngine::NCHW});
|
|
||||||
src->allocate();
|
|
||||||
fill_data(src->buffer(), src->size());
|
|
||||||
|
|
||||||
auto * srcPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(src.get());
|
|
||||||
|
|
||||||
if (srcPtr == nullptr)
|
|
||||||
FAIL() << "Cannot cast blob to TBlob<float>.";
|
|
||||||
|
|
||||||
InferenceEngine::BlobMap srcs;
|
|
||||||
srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src));
|
|
||||||
|
|
||||||
InferenceEngine::OutputsDataMap out;
|
|
||||||
out = network.getOutputsInfo();
|
|
||||||
InferenceEngine::BlobMap outputBlobs;
|
|
||||||
|
|
||||||
std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
|
|
||||||
|
|
||||||
InferenceEngine::TBlob<float>::Ptr output;
|
|
||||||
output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
|
|
||||||
output->allocate();
|
|
||||||
outputBlobs[item.first] = output;
|
|
||||||
|
|
||||||
graph.Infer(srcs, outputBlobs);
|
|
||||||
|
|
||||||
|
|
||||||
InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
|
|
||||||
dst_ref.allocate();
|
|
||||||
ref_interp(*srcPtr, dst_ref, p);
|
|
||||||
compare(*output, dst_ref);
|
|
||||||
} catch (const InferenceEngine::details::InferenceEngineException &e) {
|
|
||||||
FAIL() << e.what();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
TEST_P(MKLDNNCPUExtInterpTests, TestsInterp) {}
|
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(
|
|
||||||
TestsInterp, MKLDNNCPUExtInterpTests,
|
|
||||||
::testing::Values(
|
|
||||||
interp_test_params{{1, 256, 1, 1}, {33, 65}, 0, 0, 1, MKLDNNPlugin::impl_desc_type::unknown },
|
|
||||||
interp_test_params{{6, 128, 320, 320}, {23, 38}, 0, 0, 1, MKLDNNPlugin::impl_desc_type::unknown },
|
|
||||||
interp_test_params{{1, 2, 33, 65}, {33, 65}, 0, 0, 1, MKLDNNPlugin::impl_desc_type::unknown }));
|
|
@ -1,367 +0,0 @@
|
|||||||
// Copyright (C) 2018-2020 Intel Corporation
|
|
||||||
// SPDX-License-Identifier: Apache-2.0
|
|
||||||
//
|
|
||||||
|
|
||||||
#include "test_graph.hpp"
|
|
||||||
|
|
||||||
#include "single_layer_common.hpp"
|
|
||||||
#include "tests_common.hpp"
|
|
||||||
#include "ir_gen_helper.hpp"
|
|
||||||
#include <ie_core.hpp>
|
|
||||||
|
|
||||||
#include <nodes/base.hpp>
|
|
||||||
|
|
||||||
using namespace InferenceEngine;
|
|
||||||
using namespace ::testing;
|
|
||||||
using namespace std;
|
|
||||||
using namespace single_layer_tests;
|
|
||||||
|
|
||||||
using namespace Extensions;
|
|
||||||
using namespace ::Cpu;
|
|
||||||
|
|
||||||
struct resample_test_params {
|
|
||||||
std::vector<size_t> in_dims;
|
|
||||||
|
|
||||||
float factor;
|
|
||||||
int antialias;
|
|
||||||
std::string type;
|
|
||||||
|
|
||||||
size_t num_prim_desc;
|
|
||||||
bool isBlockedFormat;
|
|
||||||
int selectedType;
|
|
||||||
|
|
||||||
std::vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
static inline float triangleCoeff(float x) {
|
|
||||||
return max(0.0f, 1 - std::abs(x));
|
|
||||||
}
|
|
||||||
|
|
||||||
extern InferenceEngine::IExtensionPtr make_FakeExtensions();
|
|
||||||
|
|
||||||
template <typename data_t>
|
|
||||||
void ref_resample(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<data_t> &dst, resample_test_params prm) {
|
|
||||||
const data_t *src_data = src.readOnly();
|
|
||||||
data_t *dst_data = dst.data();
|
|
||||||
|
|
||||||
size_t ndims = prm.in_dims.size();
|
|
||||||
|
|
||||||
size_t N = prm.in_dims[0];
|
|
||||||
size_t C = prm.in_dims[1];
|
|
||||||
size_t ID = ndims == 5 ? prm.in_dims[ndims - 3] : 1;
|
|
||||||
size_t IH = prm.in_dims[ndims - 2];
|
|
||||||
size_t IW = prm.in_dims[ndims - 1];
|
|
||||||
size_t OD = ndims == 5 ? ID / prm.factor : 1;
|
|
||||||
size_t OH = IH / prm.factor;
|
|
||||||
size_t OW = IW / prm.factor;
|
|
||||||
|
|
||||||
float fx = static_cast<float>(IW) / static_cast<float>(OW);
|
|
||||||
float fy = static_cast<float>(IH) / static_cast<float>(OH);
|
|
||||||
float fz = static_cast<float>(ID) / static_cast<float>(OD);
|
|
||||||
|
|
||||||
if (prm.type == "caffe.ResampleParameter.NEAREST") {
|
|
||||||
for (size_t b = 0; b < N; b++) {
|
|
||||||
for (size_t c = 0; c < C; c++) {
|
|
||||||
const float *in_ptr = src_data + IW * IH * ID * C * b + IW * IH * ID * c;
|
|
||||||
float *out_ptr = dst_data + OW * OH * OD * C * b + OW * OH * OD * c;
|
|
||||||
|
|
||||||
for (size_t oz = 0; oz < OD; oz++) {
|
|
||||||
for (size_t oy = 0; oy < OH; oy++) {
|
|
||||||
for (size_t ox = 0; ox < OW; ox++) {
|
|
||||||
float ix = ox * fx;
|
|
||||||
float iy = oy * fy;
|
|
||||||
float iz = oz * fz;
|
|
||||||
|
|
||||||
size_t ix_r = static_cast<size_t>(std::floor(ix));
|
|
||||||
size_t iy_r = static_cast<size_t>(std::floor(iy));
|
|
||||||
size_t iz_r = static_cast<size_t>(std::floor(iz));
|
|
||||||
|
|
||||||
out_ptr[oz * OH * OW + oy * OW + ox] = in_ptr[iz_r * IH * IW + iy_r * IW + ix_r];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (prm.type == "caffe.ResampleParameter.LINEAR") {
|
|
||||||
size_t kernel_width = 2;
|
|
||||||
bool isDownsample = (fx > 1) || (fy > 1) || (fz > 1);
|
|
||||||
bool antialias = isDownsample && prm.antialias;
|
|
||||||
|
|
||||||
for (size_t b = 0; b < N; b++) {
|
|
||||||
for (size_t c = 0; c < C; c++) {
|
|
||||||
const float *in_ptr = src_data + IW * IH * ID * C * b + IW * IH * ID * c;
|
|
||||||
float *out_ptr = dst_data + OW * OH * OD * C * b + OW * OH * OD * c;
|
|
||||||
|
|
||||||
for (size_t oz = 0; oz < OD; oz++) {
|
|
||||||
for (size_t oy = 0; oy < OH; oy++) {
|
|
||||||
for (size_t ox = 0; ox < OW; ox++) {
|
|
||||||
float ix = ox * fx + fx / 2.0f - 0.5f;
|
|
||||||
float iy = oy * fy + fy / 2.0f - 0.5f;
|
|
||||||
float iz = oz * fz + fz / 2.0f - 0.5f;
|
|
||||||
|
|
||||||
int ix_r = static_cast<int>(round(ix));
|
|
||||||
int iy_r = static_cast<int>(round(iy));
|
|
||||||
int iz_r = static_cast<int>(round(iz));
|
|
||||||
|
|
||||||
float sum = 0;
|
|
||||||
float wsum = 0;
|
|
||||||
|
|
||||||
float ax = 1.0f / (antialias ? fx : 1.0f);
|
|
||||||
float ay = 1.0f / (antialias ? fy : 1.0f);
|
|
||||||
float az = 1.0f / (antialias ? fz : 1.0f);
|
|
||||||
|
|
||||||
int rx = (fx < 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / ax));
|
|
||||||
int ry = (fy < 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / ay));
|
|
||||||
int rz = (fz < 1.0f) ? 2 : static_cast<int>(ceil(static_cast<float>(kernel_width) / az));
|
|
||||||
|
|
||||||
for (int z = iz_r - rz; z <= iz_r + rz; z++) {
|
|
||||||
for (int y = iy_r - ry; y <= iy_r + ry; y++) {
|
|
||||||
for (int x = ix_r - rx; x <= ix_r + rx; x++) {
|
|
||||||
if (z < 0 || y < 0 || x < 0 || z >= static_cast<int>(ID) ||y >= static_cast<int>(IH) || x >= static_cast<int>(IW))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
float dx = ix - x;
|
|
||||||
float dy = iy - y;
|
|
||||||
float dz = iz - z;
|
|
||||||
|
|
||||||
float w = ax * triangleCoeff(ax * dx) * ay * triangleCoeff(ay * dy) * az * triangleCoeff(az * dz);
|
|
||||||
|
|
||||||
sum += w * in_ptr[z * IH * IW + y * IW + x];
|
|
||||||
wsum += w;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out_ptr[oz * OH * OW + oy * OW + ox] = (!wsum) ? 0 : (sum / wsum);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
assert(!"Unsupported resample operation type");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class MKLDNNCPUExtResampleTests: public TestsCommon, public WithParamInterface<resample_test_params> {
|
|
||||||
std::string model_t = R"V0G0N(
|
|
||||||
<Net Name="Resample_net" version="2" precision="FP32" batch="1">
|
|
||||||
<layers>
|
|
||||||
<layer name="in1" type="Input" precision="FP32" id="0">
|
|
||||||
<output>
|
|
||||||
<port id="0">
|
|
||||||
<dim>_IN_</dim>
|
|
||||||
<dim>_IC_</dim>
|
|
||||||
<dim>_ID_</dim>
|
|
||||||
<dim>_IH_</dim>
|
|
||||||
<dim>_IW_</dim>
|
|
||||||
</port>
|
|
||||||
</output>
|
|
||||||
</layer>
|
|
||||||
<layer name="fakeLayer" id="1" type="_FL_" precision="FP32">
|
|
||||||
<input>
|
|
||||||
<port id="1">
|
|
||||||
<dim>_IN_</dim>
|
|
||||||
<dim>_IC_</dim>
|
|
||||||
<dim>_ID_</dim>
|
|
||||||
<dim>_IH_</dim>
|
|
||||||
<dim>_IW_</dim>
|
|
||||||
</port>
|
|
||||||
</input>
|
|
||||||
<output>
|
|
||||||
<port id="2">
|
|
||||||
<dim>_IN_</dim>
|
|
||||||
<dim>_IC_</dim>
|
|
||||||
<dim>_ID_</dim>
|
|
||||||
<dim>_IH_</dim>
|
|
||||||
<dim>_IW_</dim>
|
|
||||||
</port>
|
|
||||||
</output>
|
|
||||||
</layer>
|
|
||||||
<layer name="resample" id="2" type="Resample" precision="FP32">
|
|
||||||
<data antialias="_AN_" factor="_F_" type="_T_"/>
|
|
||||||
<input>
|
|
||||||
<port id="3">
|
|
||||||
<dim>_IN_</dim>
|
|
||||||
<dim>_IC_</dim>
|
|
||||||
<dim>_ID_</dim>
|
|
||||||
<dim>_IH_</dim>
|
|
||||||
<dim>_IW_</dim>
|
|
||||||
</port>
|
|
||||||
</input>
|
|
||||||
<output>
|
|
||||||
<port id="4">
|
|
||||||
<dim>_IN_</dim>
|
|
||||||
<dim>_IC_</dim>
|
|
||||||
<dim>_OD_</dim>
|
|
||||||
<dim>_OH_</dim>
|
|
||||||
<dim>_OW_</dim>
|
|
||||||
</port>
|
|
||||||
</output>
|
|
||||||
</layer>
|
|
||||||
</layers>
|
|
||||||
<edges>
|
|
||||||
<edge from-layer="0" from-port="0" to-layer="1" to-port="1"/>
|
|
||||||
<edge from-layer="1" from-port="2" to-layer="2" to-port="3"/>
|
|
||||||
</edges>
|
|
||||||
</Net>
|
|
||||||
)V0G0N";
|
|
||||||
|
|
||||||
std::string getModel(resample_test_params p) {
|
|
||||||
std::string model = model_t;
|
|
||||||
|
|
||||||
auto dims_size = p.in_dims.size();
|
|
||||||
if (dims_size == 4) {
|
|
||||||
REMOVE_LINE(model, "<dim>_ID_</dim>");
|
|
||||||
REMOVE_LINE(model, "<dim>_OD_</dim>");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (p.isBlockedFormat)
|
|
||||||
REPLACE_WITH_STR(model, "_FL_", "FakeLayerBLK");
|
|
||||||
else
|
|
||||||
REPLACE_WITH_STR(model, "_FL_", "FakeLayerPLN");
|
|
||||||
|
|
||||||
REPLACE_WITH_NUM(model, "_IN_", p.in_dims[0]);
|
|
||||||
REPLACE_WITH_NUM(model, "_IC_", p.in_dims[1]);
|
|
||||||
if (dims_size == 5)
|
|
||||||
REPLACE_WITH_NUM(model, "_ID_", p.in_dims[dims_size - 3]);
|
|
||||||
REPLACE_WITH_NUM(model, "_IH_", p.in_dims[dims_size - 2]);
|
|
||||||
REPLACE_WITH_NUM(model, "_IW_", p.in_dims[dims_size - 1]);
|
|
||||||
|
|
||||||
if (dims_size == 5)
|
|
||||||
REPLACE_WITH_NUM(model, "_OD_", (int)(p.in_dims[dims_size - 3] / p.factor));
|
|
||||||
REPLACE_WITH_NUM(model, "_OH_", (int)(p.in_dims[dims_size - 2] / p.factor));
|
|
||||||
REPLACE_WITH_NUM(model, "_OW_", (int)(p.in_dims[dims_size - 1] / p.factor));
|
|
||||||
|
|
||||||
REPLACE_WITH_NUM(model, "_AN_", p.antialias);
|
|
||||||
REPLACE_WITH_NUM(model, "_F_", p.factor);
|
|
||||||
REPLACE_WITH_STR(model, "_T_", p.type);
|
|
||||||
|
|
||||||
return model;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual void TearDown() {
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void SetUp() {
|
|
||||||
try {
|
|
||||||
TestsCommon::SetUp();
|
|
||||||
resample_test_params p = ::testing::WithParamInterface<resample_test_params>::GetParam();
|
|
||||||
std::string model = getModel(p);
|
|
||||||
|
|
||||||
MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
|
|
||||||
auto defaultExtensions = std::make_shared<InferenceEngine::Extensions::Cpu::MKLDNNExtensions>();
|
|
||||||
extMgr->AddExtension(defaultExtensions);
|
|
||||||
extMgr->AddExtension(make_FakeExtensions());
|
|
||||||
|
|
||||||
InferenceEngine::Core core;
|
|
||||||
InferenceEngine::CNNNetwork network;
|
|
||||||
ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()));
|
|
||||||
|
|
||||||
MKLDNNGraphTestClass graph;
|
|
||||||
graph.CreateGraph(network, extMgr);
|
|
||||||
|
|
||||||
auto& nodes = graph.getNodes();
|
|
||||||
nodes = graph.getNodes();
|
|
||||||
|
|
||||||
for (auto &node : nodes) {
|
|
||||||
if (node->getName() == "resample") {
|
|
||||||
ASSERT_EQ(p.num_prim_desc, node->getSupportedPrimitiveDescriptors().size());
|
|
||||||
for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
|
|
||||||
p.comp.at(j)(node->getSupportedPrimitiveDescriptors().at(j));
|
|
||||||
}
|
|
||||||
ASSERT_NE(nullptr, node->getSelectedPrimitiveDescriptor());
|
|
||||||
ASSERT_EQ(p.selectedType,
|
|
||||||
node->getSelectedPrimitiveDescriptor()->getImplementationType() & p.selectedType);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
InferenceEngine::SizeVector dims_src = p.in_dims;
|
|
||||||
|
|
||||||
InferenceEngine::Layout layout = InferenceEngine::ANY;
|
|
||||||
switch (p.in_dims.size()) {
|
|
||||||
case 4: layout = InferenceEngine::NCHW; break;
|
|
||||||
case 5: layout = InferenceEngine::NCDHW; break;
|
|
||||||
}
|
|
||||||
|
|
||||||
InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float>({InferenceEngine::Precision::FP32, dims_src, layout});
|
|
||||||
src->allocate();
|
|
||||||
fill_data(src->buffer(), src->size());
|
|
||||||
|
|
||||||
auto * srcPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(src.get());
|
|
||||||
|
|
||||||
if (srcPtr == nullptr)
|
|
||||||
FAIL() << "Cannot cast blob to TBlob<float>.";
|
|
||||||
|
|
||||||
InferenceEngine::BlobMap srcs;
|
|
||||||
srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src));
|
|
||||||
|
|
||||||
InferenceEngine::OutputsDataMap out;
|
|
||||||
out = network.getOutputsInfo();
|
        InferenceEngine::BlobMap outputBlobs;

        std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();

        InferenceEngine::TBlob<float>::Ptr output;
        output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
        output->allocate();
        outputBlobs[item.first] = output;

        graph.Infer(srcs, outputBlobs);

        InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
        dst_ref.allocate();
        ref_resample(*srcPtr, dst_ref, p);
        compare(*output, dst_ref);
    } catch (const InferenceEngine::details::InferenceEngineException &e) {
        FAIL() << e.what();
    }
}
};

TEST_P(MKLDNNCPUExtResampleTests, TestsResample) {}

INSTANTIATE_TEST_CASE_P(
        TestsResample, MKLDNNCPUExtResampleTests,
        ::testing::Values(
                resample_test_params{{2, 64, 15, 25}, 1.f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 15, 25}, 1.f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 15, 25}, 1.f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 10, 20}, 0.25f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 10, 20}, 0.25f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 10, 20}, 0.25f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 10, 20}, 4.f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 10, 20}, 4.f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 10, 20}, 4.f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 15, 25}, 1.f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 15, 25}, 1.f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 15, 25}, 1.f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 10, 20}, 0.25f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 10, 20}, 0.25f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 10, 20}, 0.25f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 10, 20}, 4.f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 10, 20}, 4.f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 10, 20}, 4.f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                // 5D nearest
                resample_test_params{{2, 64, 20, 15, 25}, 1.f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 20, 15, 25}, 1.f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 15, 10, 20}, 0.25f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 15, 10, 20}, 0.25f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 15, 10, 20}, 4.f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 64, 15, 10, 20}, 4.f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 20, 15, 25}, 1.f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 20, 15, 25}, 1.f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 15, 10, 20}, 0.25f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 15, 10, 20}, 0.25f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 15, 10, 20}, 4.f, 0, "caffe.ResampleParameter.NEAREST", 3, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 3, 15, 10, 20}, 4.f, 0, "caffe.ResampleParameter.NEAREST", 3, true, MKLDNNPlugin::impl_desc_type::unknown },
                // 5D linear
                resample_test_params{{2, 15, 15, 10, 20}, 9.f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 15, 15, 10, 20}, 1.f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 15, 15, 10, 20}, 4.f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 2, 15, 10, 20}, 0.25f, 1, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 15, 15, 10, 20}, 9.f, 0, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 15, 15, 10, 20}, 1.f, 0, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 8, 15, 10, 20}, 4.f, 0, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown },
                resample_test_params{{2, 2, 15, 10, 20}, 0.25f, 0, "caffe.ResampleParameter.LINEAR", 1, false, MKLDNNPlugin::impl_desc_type::unknown }));
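The deleted harness above follows the plugin's usual pattern for these extension tests: infer the built MKLDNN graph on the prepared inputs (srcs), compute a reference result on the host (ref_resample), and compare the two blobs. As a rough illustration of what a NEAREST reference computes, here is a minimal standalone sketch; the function name, single-plane layout, and floor-based index mapping are assumptions for illustration and are not the deleted ref_resample helper itself.

// Minimal sketch of a nearest-neighbor reference resample for one HxW plane.
// Names and the index mapping are illustrative assumptions only.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

std::vector<float> nearest_resample_plane(const std::vector<float>& src, int h, int w, float factor) {
    const int oh = static_cast<int>(h * factor);
    const int ow = static_cast<int>(w * factor);
    std::vector<float> dst(static_cast<size_t>(oh) * ow);
    for (int oy = 0; oy < oh; ++oy) {
        const int iy = std::min(static_cast<int>(std::floor(oy / factor)), h - 1);
        for (int ox = 0; ox < ow; ++ox) {
            const int ix = std::min(static_cast<int>(std::floor(ox / factor)), w - 1);
            dst[static_cast<size_t>(oy) * ow + ox] = src[static_cast<size_t>(iy) * w + ix];
        }
    }
    return dst;
}

int main() {
    // Upscaling a 2x2 plane by factor 2 replicates every source pixel into a 2x2 block;
    // the NEAREST cases above exercise the same behaviour at larger sizes and factors.
    const std::vector<float> src = {1.f, 2.f,
                                    3.f, 4.f};
    const std::vector<float> dst = nearest_resample_plane(src, 2, 2, 2.f);
    assert(dst.size() == 16);
    assert(dst[0] == 1.f && dst[3] == 2.f && dst[15] == 4.f);
    return 0;
}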
@ -222,8 +222,8 @@ void op::v4::Interpolate::validate_and_infer_types()
     element::Type input_et = get_input_element_type(0);
     NODE_VALIDATION_CHECK(this,
                           input_et == element::Type_t::f32 || input_et == element::Type_t::f16 ||
-                              input_et == element::Type_t::i8,
-                          "Input element type must be f32, f16, or i8");
+                              input_et == element::Type_t::i8 || input_et == element::Type_t::bf16,
+                          "Input element type must be f32, f16, bf16 or i8");

     PartialShape input_shape = PartialShape(get_input_partial_shape(0));
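The relaxed check only grows the set of accepted input precisions. Below is a minimal sketch of the resulting predicate, using a local stand-in enum rather than ngraph's real element::Type, purely to show which inputs now pass validation.

// Illustrative stand-in for the accepted-precision check; Type_t here is a local enum,
// not ngraph's element::Type_t.
#include <cassert>

enum class Type_t { f32, f16, bf16, i8, i32 };

static bool interpolate_input_type_ok(Type_t t) {
    // bf16 joins the previously accepted set {f32, f16, i8}.
    return t == Type_t::f32 || t == Type_t::f16 || t == Type_t::i8 || t == Type_t::bf16;
}

int main() {
    assert(interpolate_input_type_ok(Type_t::bf16));   // newly accepted by this change
    assert(!interpolate_input_type_ok(Type_t::i32));   // still rejected
    return 0;
}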